q-bio0402046/sublin.tex
1: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2: \documentclass[a4paper,11pt,leqno]{article}
3: \setlength\oddsidemargin{0.70in}
4: %\usepackage{isolatin1}
5: \usepackage{amsfonts}     
6: \usepackage{amsmath}
7: %\usepackage{amssymb}
8: \usepackage{amstext}
9: \usepackage{amsthm}             
10: \usepackage{xspace}
11: \usepackage[dvips]{graphicx}
12: %\usepackage{showkeys}
13: 
14: %Definizioni utili...:)
15: %--------------------------
16: \newcommand\ac{\`a\xspace}
17: \newcommand\ec{\`e\xspace}
18: \newcommand\ic{\`\i\xspace}
19: \newcommand\oc{\`o\xspace}
20: \newcommand\uc{\`u\xspace}
21: \newcommand\eg{\'e\xspace}
22: %--------------------------
23: %--------------------------
24: \newcommand\hr{\hfill\break}
25: \newcommand\name{\bfseries}
26: \newcommand\chaptit{\bfseries\itshape}
27: \newcommand\bls{\rightline{$ \blacksquare$}}
28: \newcommand\ovln{\overline}
29: \newcommand\unln{\underline}
30: %--------------------------
31: 
32: %-----------------------------------------
33: % La R dei reali, la C dei complessi, etc.
34: \newcommand{\ok} {\qed}
35: \newcommand{\C}{\mathbb C}
36: \newcommand{\R}{\mathbb R}
37: \newcommand{\Rn}{{\mathbb R}^{n}}
38: \newcommand{\N}{\mathbb N}
39: \newcommand{\Q}{\mathbb Q}
40: \newcommand{\Z}{\mathbb Z}
41: \newcommand{\E}{\mathbb E}
42: \newcommand{\eps}{\varepsilon}
43: \newcommand{\Graf}{\mathrm{Graf}}
44: \newcommand{\Dom}{\mathrm{Dom}}
45: \newcommand{\Int}{\mathrm{Int}}
46: \newcommand{\Imm}{\mathrm{Imm}}
47: \newcommand{\grap}{\left\{}
48: \newcommand{\grch}{\right\}}
49: %-----------------------------------------
50: % Teoremi, Lemmi, etc.
51: \newtheorem{theorem} {Theorem}%[chapter]
52: \newtheorem{theorem*}{Theorem}
53: \newtheorem{prop*} {Proposition} 
54: \newtheorem{lemma*}{Lemma}
55: %\newtheorem{guess}{Osservazione}[chapter]
56: \newtheorem{lemma}{Lemma}%[chapter]
57: \newtheorem{prop} {Proposition}% [chapter]
58: %-----------------------------------------
59: 
60: % Cambiamo stile di scrittura... :))
61: \theoremstyle{definition}
62: \newtheorem{definition}{Definition}%[chapter]
63: \newtheorem{definition*}{Definition}
64: \newtheorem{cor}{Corollary}%[chapter]
65: \newtheorem{cor*}{Corollary}
66: \newtheorem{rem}{Remark}%[chapter]
67: \newtheorem{rem*}{Remark}
68: %-----------------------------------------
69: \theoremstyle{remark}
70: \newtheorem{nota}{Notazione}%[chapter]
71: \newtheorem{es}{Esempio}%[chapter]
72: %-----------------------------------------
73: 
74: %-----------------------------------------
75: %Stile delle pagine...:)                 
76: \pagestyle{plain}                        
77: %----------------------------------------------------------------------
78: 
79: %----------------------------------------------------------------------
80: % nuovi ambienti molto carini per dimostrazioni e osservazioni :)
81: % basta fare \begin{prf}  \end{prf} per le dimostrazioni e
82: % \begin{guess}   \end{guess} per le osservazioni e il gioco e' fatto!!!
83: \newtheorem{dim*}{\bf Proof}%[chapter]
84: \newenvironment{prf}{\begin{dim*}\begin{rm}} {\end{rm}\qed\end{dim*}}   
85: \newtheorem{guess*}{\bf Osservazione}%[chapter]
86: \newenvironment{guess}{\begin{guess*}\begin{rm}}{\end{rm}\end{guess*}}  
87: %-----------------------------------------------------------------------
88: 
89: 
90: \newcommand{\no}{\noindent}
91: \newcommand{\mk}{\medskip}
92: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
93: \input psfig.sty
94: \begin{document}
95: \title{Sublinear growth of Information in DNA sequences}
96: \author{Giulia Menconi\\ \small {Dipartimento di Matematica
97: Applicata}\\ \small{and}\\ \small {C.I.S.S.C. Centro
98: Interdisciplinare} \\ \small {per lo Studio dei Sistemi Complessi}\\
99: \small {Universit\ac di Pisa}\\ \small {Via Bonanno Pisano 25/b 56126
100: PISA - Italy}\\ \small{menconi@mail.dm.unipi.it}\\October 23, 2003} \date{} \maketitle
101: \vskip 11truecm
102: \centerline{Running title: Sublinear Information in DNA}
103: \vskip 0.2truecm 
104: {Keywords: Information Content, compression
105: algorithm, DNA, repetitive sequences}
106: \newpage
107: \begin{abstract}
108: We introduce a novel method to analyse complete genomes and recognise
109: some distinctive features by means of an adaptive compression
110: algorithm, which is not DNA-oriented. We study the Information Content
111: as a function of the number of symbols encoded by the
112: algorithm. Preliminary results are shown concerning regions having a
113: sublinear type of information growth, which is strictly connected to
114: the presence of highly repetitive subregions that might be supposed to
115: have a regulatory function within the genome.
116: \end{abstract}
117: %\tableofcontents
118: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
119: \section{Introduction}
120: We shall analyse the genome sequences from the point of view of data
121: compression in order to exploit a linguistic analysis. As the context
122: suggests, the genomes are interpreted as symbol sequences of finite
123: length, drawn by an Information Source (the Nature) that remains
124: mainly unknown and emits symbols taken from the alphabet of the four
125: nucleotides $\{A,\ C,\ G,\ T\}$. Each genome identifies a living
126: organism and we assume that it may be considered as the unique
127: realisation produced by the Source relative to that organism.
128: 
129: We shall not give here a formal definition of Information
130: Source. Intuitively, it is a device emitting a sequence of symbols
131: $\dots x_1x_2x_3\dots$ where each $x_i$ is an element of a finite
132: alphabet $\mathcal A$. The rigorous definition \cite{billingsley} relies
133: on the notion of sequence space $\Omega_\mathcal A$, that is the space
134: of one-sided infinite sequences (also called strings)
135: $\omega=(\omega_0,\omega_1,\dots)$ whose symbols are drawn from the
136: alphabet.
137: % $(\Omega_{\mathcal A}, \sigma,
138: %\mu_{\mathcal A})$. Let $\mathcal A$ is a finite alphabet with $N$
139: %symbols $\{a_0,a_1,\dots,a_{N-1}\}$, then the sequence space
140: %$\Omega_{\mathcal A}$ is the space of one-sided infinite sequences
141: %(also called strings) $\omega=(\omega_0,\omega_1,\dots)$ whose symbols
142: %are drawn from the alphabet. The sequence space may be equipped with a
143: %probabiblity invariant measure $\mu_{\mathcal A}$. The dynamical law
144: %is the shift transformation $\sigma: \Omega _{\mathcal A}
145: %\longrightarrow \Omega _{\mathcal A}$ defined as
146: %$\sigma(\omega_0,\omega_1,\omega_2,\dots)=(\omega_1,\omega_2,\dots)\.$
147: Even if an Information Source is rigorously defined as a stochastic
148: process $\mathbb X=(\mathbb X_n)_{n\in\N}$ acting on a sequence space,
149: we may consider the symbolic source $\Omega _{\mathcal A}$ as the
150: subset of the sequence space containing all the realizations of the
151: process $\mathbb X$. This shall motivate the use of the term
152: Information Source when referring to a sequence space. We shall denote
153: by $\mathcal{A}^*$ the set of finite symbolic sequences on the
154: alphabet $\mathcal{A}$. If $s\in\mathcal{A}^*$ its length will be
155: denoted by $|s|$.
156: 
157: DNA sequences are special quaternary symbol sequences. As only a small
158: fraction of DNA nucleotides results in a viable organism, the
159: sequences belonging to a living organism are expected to be nonrandom
160: and have some constraints. Therefore, DNA sequences should be
161: compressible, at least locally.
162: 
163: In our approach to symbol sequences, the crucial notion is the
164: \textit{Information Content}. Given a finite string $s$ in $\mathcal{A}^*$,
165: the meaning of \textit{ quantity of information} $I(s)$ contained in
166: $s$ has the following natural connotation:
167: 
168: \begin{center}
169: $I(s)$ \textit{is the length of the smallest binary message from which you
170: can reconstruct} $s$.
171: \end{center}
172: 
173: In his pioneering work, Shannon defined the quantity of information as
174: a statistical notion using the tools of probability theory
175: (\cite{kin}). Thus, in Shannon's framework, the quantity of information
176: which is contained in a string depends on its context. For example the
177: string $^{\prime }pane^{\prime }$ contains a certain information when
178: it is considered as a string coming from the English language. The
179: same string $^{\prime }pane^{\prime }$ contains much less Shannon
180: information when it is considered as a string coming from the Italian
181: language because it is more frequent in the Italian language (in
182: Italian it means ``bread'' and, of course, it is very
183: frequent). Roughly speaking, the Shannon information of a string is
184: the absolute value of the logarithm of the probability of its
185: occurrence.
186: 
187: However, there are measures of information which depend intrinsically
188: on the string and not on its probability within a given context. We
189: will adopt this point of view. An example of these measures of
190: information is the Algorithmic Information Content ($AIC$). We will
191: not formally define it (see \cite{kin} and \cite{Ch} for rigorous
192: definitions and properties). We limit ourselves to give an intuitive
193: idea which is very close to the formal definition. We can consider a
194: partial recursive function as a computer $C$ which takes a program $p$
195: (namely a binary string) as an input, performs some computations and
196: gives a string $s=C(p)$, written in the given alphabet, as an output.
197: The $AIC$ of a string $s$ is defined as the length of the shortest
198: binary program $p$ which gives $s$ as its output, namely
199: $$I_{AIC}(s,C)=\min \{|p|:C(p)=s\}, $$ where $|p|$ means the length in
200: bit of the string which the program $p$ consists of. A theorem due to
201: A. N. Kolmogorov (\cite{kolmogorov}) implies that the information
202: content ${AIC}$ of $s$ with respect to $C$ depends only on $s$ up to a
203: fixed constant, therefore its asymptotic behaviour does not depend on
204: the choice of $C$. The shortest program $p$ which outputs the string
205: $s$ is a sort of optimal encoding of $s$. The information that is
206: necessary to reconstruct the string is contained in the
207: program. Unfortunately, this coding procedure cannot be performed by
208: any algorithm. This is a very deep statement and, in some sense, it is
209: equivalent to the Turing halting problem or to the G\"{o}del
210: incompleteness theorem. Then the Algorithmic Information Content is
211: not computable by any algorithm.
212: 
213: Our method is focused on another measure: the information content of a
214: finite string can also be defined by a lossless data compression
215: algorithm $Z$ (\cite{Ch}, \cite{cleary}). This turns out to be a
216: Computable Information Content (CIC). In reference \cite{licatone}
217: quantitative relations among Shannon entropy of the source, the AIC
218: and the CIC of sequences are provided.
219: 
220: The ``classical'' studies in compression algorithms answer the
221: question about the com\-pres\-si\-bi\-li\-ty of DNA with the
222: additional advantage of using compression techniques to capture the
223: properties of DNA. It is known that DNA sequences have two linguistic
224: characteristic structures: {\it reverse complements} and {\it
225: approximate repeats}. The reverse complement $\sigma ^c$ of a sequence
226: $\sigma$ is obtained by reversing $\sigma$ and replacing each symbol
227: by its complementary one. That is, reading the reverse
228: complement of a subsequence from a single strand of DNA is the same as
229: reading the corresponding complementary subsequence in the other
230: strand. The approximate repeats are repeats that contain
231: errors. Approximate repeats are due to the local variability that is a
232: common feature within genomes.
233: 
234: Several special-purpose compression algorithms have been developed
235: for DNA sequences (for instance, see \cite{cleary}, \cite{jiang},
236: \cite{chen}, \cite{tahi}). These
237: algorithms are called DNA-oriented because they use the
238: aforementioned characteristic structures of ge\-no\-mes together with a
239: sort of statistical compression to achieve a compression ratio lower
240: than two bits per symbol. This is a great improvement since the
241: standard text compression algorithms such as {\it compress} or {\it
242: gzip} cannot compress DNA sequences but only expand the file with more
243: than two bits per symbol. The reason for text compression to fail on
244: DNA sequences is that the regularities in genomes are much
245: subtler than in English texts, for which those algorithms have been
246: designed.
247: 
248: Our analysis makes reference to a different approach. We aim at using
249: the compression algorithm CASToRe, which has been created without any
250: biological purpose and {\it a priori} linguistic knowledge, to
251: understand whether there exist low information regions within a
252: genome, whether they have a functional type in common, whether they
253: are extended or have short length and what kind of growth the
254: information content shows in those regions.
255: %The results shown in table \ref{cssh1} confirm that the
256: %algorithm CASToRe allows the compression ratio of complete genomes to
257: %be well under the threshold of two bits per symbol and this is the
258: %crucial point that convinced us to exploit the analysis of genomes by
259: %means of CASToRe.
260: Finally, as the algorithm CASToRe belongs to the class of algorithms
261: that adaptively create a dictionary relative to a parsing of the input
262: sequence, we shall study dictionaries after compression, in order to
263: investigate the relations between patterns and biological functions.
264: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
265: \section{Computable Information Content}
266: \begin{definition}[Compression Algorithm]
267: A lossless data compression algorithm is any injective function
268: $Z:\mathcal{A}^*\rightarrow\{0,1\}^*$.
269: \end{definition}
270: Therefore, a compression algorithm is a reversible coding such that
271: the original string $s$ may be recovered from the encoded string
272: $Z(s)$. Since the coded string contains all the information that is
273: necessary to reconstruct and describe the structural features of the
274: original string, we can consider the length of the coded string as an
275: approximate measure of the quantity of information that is contained
276: in the original string.
277: \begin{definition}[Computable Information Content]
278: The information content of a finite string $s\in\mathcal{A}^*$ with
279: respect to a compression algorithm $Z$ is defined as
280: \begin{equation}
281: CIC_{Z}(s)=|Z(s)|\ .
282: \end{equation}
283: The CIC of a string $s$ is the length (in bit units) of the coded
284: string $Z(s)$.
285: \end{definition}
286: The advantage of using a compression algorithm lies in the fact that
287: the information content $CIC_{Z}\left( s\right) $ is a
288: computable function over the space of finite strings. For this reason
289: we named it Computable Information Content.
290: 
291: Moreover, we define another quantity, the complexity of a finite
292: sequence, providing an estimate for the rate of information content
293: contained in it.
294: 
295: \begin{definition}[Computable Complexity of a finite string]
296: The complexity of $s$ with respect to $Z$ is the compression ratio
297: \begin{equation}
298: K_{Z}(s)=\frac{CIC_Z(s)}{|s|}\ .
299: \end{equation}
300: \end{definition}
301: 
302: \begin{rem*}
303: Under suitable optimality assumptions on the compression algorithm
304: $Z$, we can extend this definition to infinite symbolic sequences
305: belonging to $\Omega_\mathcal A$ and asymptotically obtain the Shannon
306: entropy of the Information Source from which the sequence has been
307: drawn (\cite{gal4},\cite{gal3}). The theoretical work
308: has been extended also to trajectories coming from general dynamical
309: systems and it is supported by application to several complex systems,
310: as to turbulent or intermittent regimes (\cite{CSF02}, \cite{giuliauno},
311: \cite{bonanno}, \cite{cristalli}, \cite{jacopogiulia}) and to weakly
312: chaotic dynamical systems (\cite{menconi},\cite{licatone}).
313: \end{rem*}
314: 
315: \section{Dictionaries, words and phrases}
316: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
317: Let us describe the sort of linguistic analysis we shall perform on
318:  genetic sequences. We shall use the CIC method to extract the
319:  functional regions whose information content is low and grows
320:  sublinearly. We aim at understanding whether those regions show
321:  peculiar features such as specific highly repeated patterns of
322:  nucleotides (they are usually called {\it motifs}). Finally, we shall
323:  scan other genomes, both coming from the same domain of life and from
324:  different domains, looking for the presence of low information
325:  regions and comparing the motifs to each other. These regions are
326:  called {\it atypical}, as surprisingly they are highly compressible
327:  in comparison with the other regions. The dictionaries of some
328:  atypical regions will be studied and related to some known biological
329:  functions (e.g.\ being a promoter region). Finally, a preliminary result on
330:  potential application of this method to gene finding will be
331:  introduced.
332: 
333: \subsection{The algorithm CASToRe}\label{castore}
334: We have created and implemented a particular compression algorithm we
335: called CASToRe which is a modification of the Lempel-Ziv compression
336: schemes $LZ77$ and $LZ78$ (\cite{lz77}, \cite{lz78}) and it has been
337: introduced and studied in references \cite{CSF02} and
338: \cite{menconi}. Its theoretical advantages with respect to LZ78 showed
339: that this algorithm is a sensitive measure of the Information content
340: of low entropy sequences. This is the reason that motivates the choice
341: of the acronym \textbf{CASToRe} to name the new algorithm: its meaning
342: is \textbf{C}ompression \textbf{A}lgorithm, \textbf{ S}ensitive
343: \textbf{To} \textbf{Re}gularity. As it has been proved in
344: \cite{menconi}, the Information content $I_Z$ of a constant sequence
345: $s^n$, originally with length $n$, is $\Psi(n)=4+2\log (n+1)[\log (\log
346: (n+1))-1]$, if the algorithm $Z$ is CASToRe. The theory predicts that
347: the best possible information content for a constant sequence of
348: length $n$ is $AIC(s^n) =\log (n) + $constant. It may be shown that
349: the algorithm $LZ78$ encodes a constant $n$-digits long sequence to a
350: string with length about $const\ +\ n^{\frac 1 2}$ bits; so, we cannot
351: expect that $LZ78$ is able to distinguish a sequence whose information
352: content grows like $n^{\alpha}$ ($\alpha < \frac 1 2$) from a constant
353: or periodic string. Furthermore, the running time of CASToRe is also
354: considerably shorter than that of $LZ77$ (with infinite window), so any
355: implementation is more efficient. These are the main reasons that
356: motivate the choice of using CASToRe also for numerical experiments.
357: 
358: Now we briefly describe the internal running of CASToRe.
359: 
360: Like the Lempel-Ziv schemes, the algorithm CASToRe is based on an
361: adaptive dictionary (\cite{bell}). One of the basic differences in the
362: coding procedure is that the algorithm $LZ77$ splits the input strings
363: in overlapping phrases, while the algorithm CASToRe (as well as the
364: algorithm $LZ78$) parses the input string in non-overlapping
365: phrases. Moreover, CASToRe differs from $LZ78$ because the new phrase
366: is a pair of two already parsed phrases, while $LZ78$ couples one
367: already parsed phrase and one symbol from the alphabet.
368:  
369: At the beginning of encoding procedure, the dictionary contains only
370: the alphabet. In order to explain the main rules of the encoding, let
371: us consider a step $h$ within the encoding process, when the dictionary
372: already contains $h$ phrases $\{e_1,\dots,e_h\}$.
373:  
374: The new phrase is defined as a pair ({\it prefix pointer},{\it suffix
375: pointer}). The two pointers refer to two (not necessarily
376: different) phrases $\rho_p$ and $\rho_s$ chosen among the ones
377: contained in the current dictionary as follows. First, the algorithm
378: reads the input stream starting from the current position of the front
379: end, looking for the longest phrase $\rho_p$ matching the
380: stream. Then, the algorithm looks for the longest phrase $\rho_s$ such
381: that the joint word $\rho_p+ \rho_s$ matches the stream. The new
382: phrase $e_{h+1}$ that will be added to the dictionary is then
383: $e_{h+1}=\rho_p+ \rho_s$.
384:  
385: The output file contains an ordered sequence of the binary encoding of 
386: the pairs $(i_p,i_s)$ such that $i_p$ and $i_s$ are the dictionary 
387: index numbers corresponding to the prefix word $\rho _p$ and to the 
388: suffix word $\rho_s$, respectively.  The pair $(i_p,i_s)$ refers
389: to the new encoded phrase $e_{h+1}$ and has its own index number 
390: $i_{h+1}$. 
391: 
392: \subsubsection{Example}
393:  
394: The following example shows how the algorithm CASToRe encodes the 
395: input stream 
396: \begin{equation*} 
397: \omega =(abcababccabb\dots) . 
398: \end{equation*} 
399:  
400: Let the source alphabet be $\mathcal{A}=\{a,b,c\}$. 
401:  
402: The output file corresponds to the binary encoding of the following
403: pairs contained in the second column. The first column is the
404: dictionary index number of the encoded phrase in the dictionary which
405: is shown in the same line, second column. For an easier reading, we
406: add a third column which shows each encoded phrase in the original
407: stream $\omega$, but which is not contained in the output file:
408: $$ 
409: \begin{array}{lll} 
410: &\mbox{First, the alphabet is loaded}&\\
411: 1 & (0,\ ^{\prime}a\ ^{\prime}\ ) & [a] \\  
412: 2 & (0,\ ^{\prime}b\ ^{\prime}\ ) & [b] \\  
413: 3 & (0,\ ^{\prime}c\ ^{\prime}\ ) & [c] \\  
414: &\mbox{Then, the encoding procedure starts}&\\
415: 4 & (1,2) & [ab] \\  
416: 5 & (3,4) & [cab] \\  
417: 6 & (4,3) & [abc]\\
418: 7 & (5,2) & [cabb]
419: \end{array} 
420: $$ 
421: and so on. 
422: 
423: %The main difference between CASToRe and $LZ78$ is that the new phrase in
424: %CASToRe encoding is composed by two words, while in $LZ78$ encoding
425: %the new word is composed by one word and one symbol. However, there
426: %are sequences for which both algorithms give the same parsing of
427: %the input sequence. As an example, consider the string
428: %$$s=(abaaababbabb)$$ on the alphabet $\mathcal A=\{a,b\}$.
429: %We will show the two encodings at the same time.
430: %
431: %First, the alphabet is loaded by both algorithms in the same way:
432: % $$ 
433: %\begin{array}{lll} 
434: %1 & (0,\ ^{\prime}a\ ^{\prime}\ )  \\  
435: %2 & (0,\ ^{\prime}b\ ^{\prime}\ )  
436: %\end{array}
437: %$$
438: %Then, the encoding procedures start:
439: %$$
440: %\begin{array}{lcc}
441: %& \mbox{CASToRe}& \quad LZ78\\
442: %3 & (1,2) & \qquad(1,\ ^{\prime}b\ ^{\prime}\ ) \\  
443: %4 & (1,1) & \qquad(1,\ ^{\prime}a\ ^{\prime}\ )  \\  
444: %5 & (3,1) & \qquad(3,\ ^{\prime}a\ ^{\prime}\ ) \\
445: %6 & (2,2) & \qquad(2,\ ^{\prime}b\ ^{\prime}\ ) \\
446: %7 & (3,2) & \qquad(3,\ ^{\prime}b\ ^{\prime}\ )
447: %\end{array} 
448: %$$ The resulting parsing of $s$ is $\{ab,aa,aba,bb,abb\}$ and the
449: %information contents are comparable. Of course, it is reasonable that
450: %if the input sequence has length sensibly larger than the number of
451: %characters in the alphabet, the compression via the algorithm CASToRe
452: %is definitely better than that via the algorithm $LZ78$.
453: %\begin{rem*}
454: %\begin{enumerate}
455: %The CASToRe coding procedure, which pairs words already in the 
456: %dictionary to create a new phrase, is similar to the procedure that can 
457: %be found in the recent work \cite{grass}, which seems to be able to 
458: %give a very precise entropy estimation, detecting very long range 
459: %correlations in the English language. 
460: %\item To our knowledge, optimality properties of the algorithm
461: %CASToRe are still unproved. Nevertheless, there is experimental
462: %evidence that the information content calculated via the algorithm
463: %CASToRe is numerically and qualitatively analogous to the information
464: %content calculated via the algorithm $LZ77$ (see Chapter \ref{chsix}). 
465: %\end{enumerate}
466: %\end{rem*}
467: \subsection{Reading the dictionary}
468: The dictionary built by the algorithm CASToRe is an ordered collection
469: of phrases, that is, of pairs of words. Thus, a phrase is
470: composed by a prefix-word and a suffix-word. By construction, phrases
471: are different from each other, since the algorithm exploits a parsing
472: on the input string. Furthermore, each phrase may become a word, if it
473: appears as prefix or suffix of other phrases in the following
474: dictionary.
475: 
476: In the following, we shall look at the most frequent words, at the
477: longest phrases and in some cases we shall compare the results to the
478: same analysis performed by means of the algorithm $LZ77$ and exploited
479: in collaboration with a group of physicists from the University of
480: Rome (see their previous work \cite{loreto} by V. Loreto et al. for
481: details on the methodology). We shall show that recurrent subsequences
482: occur especially along the regions with lowest information
483: content. Notice that we refer to exact repeats.
484: 
485: Among recurrent subsequences, we shall distinguish between {\bf
486: motifs} and {\bf patterns}. A {\it motif} is a recurrent word in the
487: dictionary, whereas a {\it pattern} is a recurrent subsequence that
488: does not match any word of the dictionary, but is contained in some of
489: them. If a motif is found, we shall follow its {\it descent}, that is
490: the set of phrases of which the motif is either a prefix or a suffix or
491: both. Moreover, we shall search for the motif to be a {\bf sliding
492: pattern}, in the sense that it is contained in other phrases
493: without being either their prefix or their suffix. Furthermore, if only a
494: sliding pattern is to be found, then we shall recover its {\bf root},
495: that is the longest word of the dictionary matching part of the
496: pattern.
497: \section{The Information Content of DNA sequences}
498: We have analysed the computable complexity of 12 complete
499: genomes\footnote{The genomes have been downloaded by means of
500: the GenBank sequence libraries
501: http://www.ncbi.nlm.nih.gov/Genbank/index.html} of some Archaea,
502: Bacteria and Eukaryotes, together with chromosomes II and IV of
503: \textit{ Arabidopsis thaliana}. The complete list is shown on the
504: following Table \ref{cssh1}.
505: 
506: In order to take into account the biological
507: functional constraints actually existing among the bases within the
508: genome and to highlight new features of coding and noncoding regions, we
509: have exploited a {\it fragment analysis}.
510: \begin{definition}
511: We say that any exon, intron or intergenic region is a functional
512: {\it fragment} of the genome sequence, following the prediction as it
513: has been identified via biological databases and statistical tools
514: (\cite{myers}).
515: \end{definition}
516: {\bf Notation. }In prokaryotic genomes there are two functional types,
517: therefore we shall denote by $Coding\_\#$ and $Inter\_\#$ the coding
518: and the noncoding fragments, respectively, where $\#$ is an index to
519: order fragments. In eukaryotic genomes there are three different types
520: of regions: we shall denote by $Exon\_\#$ the coding fragments and by
521: $Intron\_\#$ and $Inter\_\#$ the noncoding intragenic fragments and the
522: noncoding intergenic fragments, respectively.
523: 
524: Thus, we shall consider the Computable Complexity $K(f)$
525: of each fragment and study the Information Content growth $CIC(f)$ within a
526: fragment.
527: 
528: First, we have considered how the Information Content varies along
529: some complete DNA sequences: that is, we have studied the behaviour of
530: the CIC of a genome as a function of the number of encoded symbols. As
531: a result, we remark that the function $CIC(\sigma_n)$ grows linearly for all
532: the complete genomes $\sigma$ we have analysed and the asymptotic
533: slope is the value of their Computable complexity $K(\sigma)$:
534: $$CIC(\sigma_n)\ \sim\ K(\sigma)\cdot n \ ,$$ where $\sigma_n$
535: indicates the first $n$ bases in the complete genome $\sigma$. However, we can
536: enhance some regions of the genome and we will see that the $CIC$-line
537: is locally no longer straight. This characteristic feature is shared by
538: all the genomes we have analysed, both Prokaryotes and
539: Eukaryotes and confirms the intuitive idea that the Information
540: Content growth should be slower in the parts of the genome where some
541: regularity prevails.
542: 
543: \begin{figure}[hb]
544: \begin{tabular}{lr}
545: \raggedright{(a)\psfig{figure=globus_tot.ps,width=6.5cm,angle=270}} &
546: \raggedleft {(b)\psfig{figure=globus_totIngra.ps,width=6.5cm,angle=270}}
547: \end{tabular}
548: \caption{\it (a) complete $CIC(n)$ graph for {\it Archaeoglobus
549: fulgidus} complete genome; (b) local enhancement of the region from 380000 to
550:  410000 bp. The behaviour of $CIC(n)$ is no longer linear.}\label{cfrTot}
551: \end{figure}
552: 
553: For instance, see the results about the genome of {\it Archaeoglobus
554: fulgidus} (Prokaryote) which are pictured on figure \ref{cfrTot}. For
555: the sake of brevity, we shall not show analogous pictures coming from
556: other genomes. 
557: 
558: \begin{table}\begin{center}
559: \begin{tabular}{|c|c|c|}
560: \hline
561: \textbf{Genome} & \it{CSS}&{\bf$H_1$} \\ \hline\hline
562: \textit{Methanococcus jannaschii} & 1.794&1.887 \\ \hline%11
563: \textit{Archaeoglobus fulgidus} & 1.909&1.987 \\ \hline%39
564: \textit{Methanobacterium thermoautotrophicum} & 1.907 &1.986\\ \hline%07
565: \textit{Pyrococcus abyssi} & 1.901&1.979 \\ \hline\hline%68
566: \textit{Aquifex aeolicus} & 1.883 &1.976\\ \hline%82
567: \textit{Escherichia coli} & 1.893 &1.987\\ \hline%03
568: \textit{Bacillus subtilis} & 1.870 &1.975\\ \hline%38
569: \textit{Haemophilus influenzae} & 1.866 &1.947\\ \hline%89
570: \textit{Mycoplasma genitalium} & 1.848 &1.959\\ \hline%45
571: %\textit{Rickettsia prowazekii} & 1.823 &1.795\\ \hline%46
572: \textit{Thermotoga maritima} & 1.893 &1.984\\ \hline\hline%42
573: \textit{Arabidopsis thaliana} (chr. II and IV) & 1.892&1.938 \\ \hline%76
574: \textit{Saccharomyces cerevisiae} & 1.889 &1.949\\ \hline%27
575: \textit{Caenorhabditis elegans} & 1.777&1.936 \\ \hline%81
576: \end{tabular}
577: \caption{\it complete genomes. Comparison CSS
578: vs. $H_1$.}\label{cssh1}\end{center}
579: \end{table}
580: 
581: For what concerns the values of computable complexity $K$ for the
582: complete genomes we have analysed, the results are shown on Table
583: \ref{cssh1} . We have indicated the complexity $K$ as $CSS$, meaning
584: {\it complexity as a single string}, to distinguish it from the
585: fragment complexity, which is the value of the computable complexity
586: of the functional fragments within the complete genome and which will
587: be denoted by $FC$ in the following. The final column in Table
588: \ref{cssh1} shows the first order entropy $H_1$ of the sequence. If
589: $p_A,\ p_C,\ p_G,\ p_T$ are the nucleotide frequencies over a genome
590: $\sigma$ (the frequency is calculated as the number of occurrences of
591: a specific nucleotide over the total number of nucleotides), then the
592: first order entropy is $H_1=-\sum_{i=A,C,G,T}p_i\log_2 p_i$. We recall
593: that, when the symbols are drawn uniformly at random from the source
594: and all the positions in the sequence are independent from each other,
595: an optimal coding procedure will devote $\log _{2}(\#\mathcal{A})$
596: bits per symbol to represent each character (\cite{coverthomas}),
597: where $\#\mathcal{A}$ is the number of symbols in the alphabet
598: $\mathcal{A}$. In this case the asymptotically maximal complexity
599: equals the $H_1$ value for those values of nucleotide frequencies. For
600: quaternary sequences, like the genomes, this maximal mean first order
601: entropy is 2 bits per symbol. Since the $H_1$ value represents a
602: quantity of information of a single string which is dependent on the
603: probability measure on the space of sequences, at first sight the
604: genomes cannot be considered randomly distributed (from a statistical
605: point of view), because for all of them the $H_1$ values are different
606: from 2 bits per symbol. First, we notice that the values of the
607: complexity $CSS$ are significantly different from 2 and lower than the
608: $H_1$ entropy values. Again, this is in complete agreement with the
609: fact that the randomness of the genomes has strong constraints. It is
610: also possible to clearly recognise that some genomes have very low
611: computable complexity (smaller than 1.90 bits per symbol), which means
612: that their internal structure presents mid-range and long-range
613: correlations.
614: 
615: The compression of complete genomes does not satisfy the quest for
616: local structures along a genome. The presence of local nonlinearities
617: in the Information Content function for complete genomes suggests the
618: existence of specific functional fragments whose Information Content
619: function grows sublinearly. We recall that we named those regions
620: atypical. Consequently, we shall investigate in this direction by
621: means of the fragment analysis.
622: 
623: \subsection{A sublinearity index}
624: In order to identify the regions where the growth of the function
625: $CIC(\sigma_n)$ is sublinear, we define a sublinearity index, that allows us
626: to determine whether a functional region is atypical. 
627: 
628: In the following, $\sigma$ shall denote any fragment within a
629: genome. The sublinearity index may be defined by means of any
630: adaptive compression algorithm $Z$, although the experimental results are
631: referred to the algorithm CASToRe.
632: 
633: Let $N=|\sigma|$ be the length of the input sequence $\sigma$. Let
634: $\mathcal{P}(\sigma,Z)$ be the parsing of $\sigma$ with respect to the
635: algorithm $Z$:
636: $\mathcal{P}(\sigma,Z)=\{\phi_1,\phi_2,\dots,\phi_t\}$. Therefore, the
637: input string $\sigma$ is the ordered juxtaposition of phrases $\phi
638: _j$'s. We use the symbol $n_k$ to indicate the current total number of
639: encoded symbols up to step $k$ of the encoding procedure:
640: $n_k=\Sigma_{j=1}^{k}|\phi _j|$. Due to the fact that
641: $|\phi_k|=n_k-n_{k-1}$, we say that $n_k$ is the parsing index
642: corresponding to the phrase $\phi _k$. The Information Content after
643: $k$ steps is then the quantity
644: $I(n_k)=\Sigma_{j=1}^{k}I(\phi_j)$. Obviously, it holds that
645: $n_t=\Sigma_{j=1}^{t}|\phi _j|=N$ and
646: $I(\sigma)=I(N)=\Sigma_{j=1}^{t}I(\phi_j)$. Since the encoding
647: procedure might be not precise in the early steps as well as in the
648: final steps, we fix two bounds defining the restriction of the
649: potential integer value $n_j$. Let $T_{inf}=20\% |\sigma|$ be the
650: lower bound and $T_{sup}=90\%|\sigma|$ be the upper bound. The choice
651: of the bounds will be such that there exist two parsing indexes
652: $n_{inf}$ and $n_{sup}$ such that $T_{inf}\leq n_{inf}<n_{sup}\leq
653: T_{sup}$. Moreover, since the algorithm $Z$ requires that the input
654: sequence is sufficiently long to make the compression reliable and
655: efficient, we shall not analyse sequences whose length $N$ is lower
than $200$ symbols. Thus, for the set $\{n_j \mid j=1,\dots,t\}$
coming from the parsing of $\sigma$ via the algorithm $Z$, we define
the domain $\mathcal{D}=\{n_k \mid n_{inf}\leq n_k\leq n_{sup}\ ,\
n_t\geq 200\}$.
660: 
661: \begin{definition}[Sublinearity index of a finite symbol sequence]\label{sublinind}
662: 
663: $\qquad$\\{\it Let $q_{min}$, $q_{max}$ and $q_Z(\sigma)$ be defined
664: as follows:
665: \begin{equation*}
666: q_{min}=\min\limits_{n_k\in\mathcal{D}}\left\{\frac{I(n_k)}{n_k}\right\}\ ,
667: \end{equation*}
668: \begin{equation*}
669: q_{max}=\max\limits_{n_k\in\mathcal{D}}\left\{\frac{I(n_k)}{n_k}\right\}
670: \end{equation*}
671: and
672: \begin{equation*}
673: q_{_Z}(\sigma)=\frac{q_{min}}{q_{max}}\ .
674: \end{equation*}
675: The sublinearity index $\mathcal{G}_{_Z}(\sigma)$ of the input sequence
676: $\sigma$ with respect to the parsing defined via the algorithm $Z$ is
677: the quantity
678: \begin{equation}\label{gi}\index{$\mathcal{G}_{_Z}$, sublinearity index}
679: \mathcal{G}_{_Z}(\sigma)=\frac{\log(q_{_Z}(\sigma))}{\log(\frac
680: {n_{sup}}{n_{inf}})}+1\ .
681: \end{equation}}
682: \end{definition}
683: 
684: The definition of this index $\mathcal{G}_{_Z}$ deserves some
685: comments. Its main characteristic is that it allows a criterion to
686: identify atypical regions to be established.
687: 
688: First of all, it is known that the behaviour of the Information
689: Content of a finite sequence $\sigma$ is an increasing function
690: $I(\sigma^n)$ that grows at most linearly with the number $n$ of
691: encoded symbols. Therefore, the indexes $q_{min}$ and $q_{max}$ can be
692: easily calculated by:
693: $$q_{min}=\frac{I(n_{sup})}{n_{sup}}\ \ \mbox{and}\ \
694: q_{max}=\frac{I(n_{inf})}{n_{inf}}\ .$$
695: Hence, it is straightforward that the value of the sublinearity index is 
696: \begin{equation}\label{utileG}
697: \mathcal{G}_{_Z}(\sigma)=\frac{\log(I(n_{sup}))-\log(I(n_{inf}))}
698: {\log(n_{sup})-\log(n_{inf})}\ .
699: \end{equation}
700: 
We notice that the fragments we have analysed are not periodic,
702: otherwise the phrases found in the parsing by the algorithm CASToRe
703: would definitely show length doubling, which is absent in the
704: dictionaries of all the fragments. Furthermore, the Information
705: Content growth of any functional fragment $\sigma$ can not be a
706: logarithmic function $\Psi(n)$ (see Section \ref{castore}), but we
707: might assume that it can be read ($\forall\ 1\leq n\leq |\sigma|$) as
708: \begin{equation}
709: CIC(\sigma_n)=\mathcal O(Cn^{\gamma})\ ,\mbox{ with
710: exponent $0<\gamma\leq 1$ and constant $C>0$}\ .
711: \label{infoPo}
712: \end{equation} 
713: 
714: Note that this formula is relative to a finite sequence, therefore the
writing $\mathcal O(Cn^{\gamma})$ is not referring to an asymptotic
716: behaviour (as $n\leq |\sigma|$), but it means that the integer
717: function $CIC(\sigma_n)$ is fitted by a function whose do\-mi\-nant term is
718: a power law with exponent smaller than 1. Since we have excluded any
719: pure periodicity, hypothesis (\ref{infoPo}) is doubtless
720: plausible. 
721: 
722: %As we have already pointed out in the previous chapters, the behaviour
723: %of the information content is a discriminant characteristic of
724: %different dynamical behaviours: if the symbol sequence is a symbolic
725: %orbit drawn from a chaotic dynamical system whose entropy is positive,
726: %then the Information content grows linearly (i.e. following equation
727: %(\ref{infoPo}) with $\alpha=1$). Moreover, an example of dynamical
728: %systems whose symbolic orbits have mean Information content growing as
729: %a power law (following equation (\ref{infoPo}) with $0<\alpha<1$) is
730: %given by the Manneville-Pomeau family with the driving parameter
731: %$z>2$. In that case, if the compression algorithm is $LZ77$, Theorem
732: %\ref{tman77} states that the exepctation value of the Information
733: %content of an orbit grows as $I(n)=n^{\frac 1 {z-1}}$. Of course, the
734: %relationship between the kind of growth of Information content and the
735: %dynamical type of the system generating the sequence is not
736: %one-to-one. For instance, both when the dynamics is periodic and when
737: %the orbit is a trajectory coming from the logistic map at the
738: %Feigenbaum point, the information grows logarithmically (see Chapter
739: %\ref{chsix}, Section \ref{sequa}); nevertheless, the two types of
740: %dynamics are far from being similar, because the one is ordered, the
741: %latter is weakly chaotic. 
742: 
743: The two following main points are definitely true. First, a
744: sublinear growth of Information Content is an indicator of the
745: presence of some regularity in the input sequence and this is much
746: more evident when the index $\mathcal{G}_{_Z}$ is significantly
747: smaller than 1. Second, small values of the index $\mathcal{G}_{_Z}$
748: may correspond to different sublinear information growths $-$ also
749: other than power-law-like $-$ that consequently might be a signal of
750: different underlying dynamics generating the symbol sequences.
751: 
752: In the following Lemma, the sublinearity index $\mathcal{G}_{_Z}$ in
753: the case of Information Content growing exactly as a power law is evaluated.
754: \begin{lemma}
755: If $CIC(n_k)=C {n_k} ^\gamma$ with $0<\gamma\leq 1$, then
756: $\mathcal{G}_{_Z}=\gamma$.\end{lemma}
757: \begin{proof}
758:  Consider the formula (\ref{utileG}).  In this case, it holds that
$$\mathcal{G}_{_Z}=\frac{\log (C)+\gamma\log(n_{sup})-\log
(C)-\gamma\log(n_{inf})}{\log(n_{sup})-\log (n_{inf})}=\gamma\ .$$ Therefore,
761: the conclusion is straightforward.\end{proof}
762: 
763: Thus, according to formula (\ref{infoPo}), the sublinearity index
764: $\mathcal{G}_{_Z}$ is a reliable quantity that allows the degree of
765: sublinearity of the information content growth to be estimated. In
766: order to evaluate the precision of the index $\mathcal{G}_{_Z}$ with
767: respect to the {\it true} actual exponent $\gamma$, we have compared
768: the values of $\mathcal{G}_{_Z}$ with the values of $\gamma$ as they
769: are given by a numerical fit on the integer function $I(n)$. The
770: results are definitely satisfactory. Some examples are shown on
771: Table \ref{tabellalfa} and are referred to several fragments from the
772: genomes of {\it Archaeoglobus fulgidus}, {\it Escherichia coli} and
773: {\it Arabidopsis thaliana}.
774: 
775: \begin{table}\begin{center}
776: \begin{tabular}{|c|c|c|c|}
777: \hline 
778: \mbox{Genome}&\mbox{Sequence}&\mbox{value of }$\mathcal{G}_{_Z}$
779: &\mbox{fit-value of }$\gamma$\\ 
780: \hline\hline
781: $Archaeoglobus\ fulgidus$&$Coding\_685495$&0.965&1.000\\
782: \hline
783: $Archaeoglobus\ fulgidus$&$Inter\_1143603$&0.949&0.949\\
784: \hline
785: $Archaeoglobus\ fulgidus$&$Inter\_393196$&0.832&0.831\\
786: \hline\hline 
787: $Escherichia\  coli$&$Inter\_2302612$&0.768&0.747\\
788: \hline 
789: $Escherichia\  coli$&$Inter\_4293752$&0.728&0.730\\ 
790: \hline
791: $Escherichia\  coli$&$Coding\_91419$&0.986&0.986\\
792: \hline\hline 
793: $Arabidopsis\ thaliana$&$Exon\_23950656$&0.614&0.585\\
794: \hline
795: $Arabidopsis\ thaliana$&$Intron\_5063613$&0.767&0.738\\
796: \hline
797: $Arabidopsis\ thaliana$&$Inter\_19660110$&0.887&0.886\\
798: \hline
799: \end{tabular}
800: \caption{\it reliability of the sublinearity index $\mathcal{G}_{_Z}$ in the
801: case of several functional regions from different genomes.}\label{tabellalfa}
802: \end{center}
803: \end{table}
804: 
805: The following definition will be used to extract the atypical
806: functional regions. The threshold has been fixed according to the
807: empirical principle that the kind of growth $n^\gamma$ where $\gamma$
808: lies in $[0.9,1]$ is, on a general basis, equivalent to a linear
809: growth, due to the finiteness of the sequences under analysis.
810: 
811: \begin{definition}[Atypical region]\label{atypical}
812: An atypical region within a genome is any functional region whose
813: sublinearity index $\mathcal{G}_{_Z}$ is smaller than 0.9.
814: \end{definition}
815: 
816: \begin{figure}
817: \centerline{\psfig{figure=globusUPS_393196.ps,width=8cm,angle=270}}
818: \caption{\it Archaeoglobus fulgidus genome. The behaviour of the
819: information content of region $Inter\_393196$ is a power law whose
820: exponent is 0.832. The picture is in linear scale.}\label{regLow}
821: \end{figure}
822: \begin{figure}
823: \centerline{\psfig{figure=GKglobus.ps,width=8cm,angle=270}}
824: \caption{\it Archaeoglobus fulgidus genome. Comparison between the
825: values of sublinearity index and fragment complexity of all functional
826: regions with length greater than 200 bp. The crosses ($+$) are referred
827: to coding regions, while the diamonds ($\diamond$) are referred to
828: intergenic regions. The vertical line is the threshold for the
829: sublinearity index, under which the region is atypical.}\label{cfrGK}
830: \end{figure}
831: 
832: The connection between sublinearity index and fragment complexity is
833: not precise, even if in the extreme cases where both values are either
834: high or low a sort of clusters are detected. For instance, Figure
835: \ref{cfrGK} illustrates what the relation is between the sublinearity
836: index (horizontal axis) and the fragment complexity (vertical axis) in
837: the case of the genome of {\it Archaeoglobus fulgidus}. Atypical
838: regions are indicated by means of a vertical line that represents the
839: threshold for the sublinearity index as introduced in Definition
840: \ref{atypical}. It is clear that, both in the case of coding regions
841: (depicted by a cross) and of noncoding regions (depicted by a
842: diamond), the higher the fragment complexity is, the higher the
843: sublinearity index is.  Furthermore, the detection of atypical regions
844: with high fragment complexity suggests that the sublinearity index may
845: be more meaningful in identifying regularity of sequences than the
846: fragment complexity.
847: \section{Experimental results}
In the following, we shall introduce some preliminary examples of
application of the $CIC$ method. We shall analyse the dictionary of
850: some long atypical regions within the genomes of {\it Archaeoglobus
851: fulgidus}, {\it Methanococcus jannaschii} and {\it Arabidopsis
852: thaliana}. We shall discover peculiar properties and propose
853: some biological motivations to those features. This part of the work
has been developed in collaboration with the Animal Biology and Genetics
855: Department of the University of Florence.
856: 
857: \subsection{Archaeoglobus fulgidus}
858: 
859: {\it Archaeoglobus fulgidus} is a sulphur-metabolizing anaerobic
860: organism. It belongs to the Archaeoglobales, archaeal sulfate reducers
861: unrelated to other sulfate reducers. They grow at extremely high
temperatures. Archaeoglobus species cause corrosion of iron and steel
863: in oil and gas processing systems by the production of iron
864: sulphide. This organism has one circular chromosome.
865: \begin{figure}
866: \centerline{\psfig{figure=LGglobus.ps,width=8cm,angle=270}}
867: \caption{\it Archaeoglobus fulgidus genome. Of each functional region,
868:   its length and the corresponding sublinearity index are
869:   plotted. The crosses ($+$) are referred to coding regions,
870:   while the squares ($\square$) are referred to intergenic regions.
871:   The horizontal line is the threshold for the sublinearity index, under
872:   which the region is atypical.}\label{cfrLGglobus}
873: \end{figure}
874: 
Looking at Figure \ref{cfrLGglobus}, we have extracted three regions:
one atypical region, which is noncoding, and two non-atypical regions,
one coding and one noncoding. This choice is aimed at comparing the
dictionaries of regions with sublinear growth of information to the
879: dictionaries of regions with li\-near growth of information.
880: 
881: The exemplified regions are
882: \begin{itemize}
883: \item $Coding\_685495$: non-atypical region, length $L=2300\ bp$,
884:   sublinearity index $\mathcal{G}_{_Z}=0.965$, fragment complexity $K=
885:   2.108$;
886: \item $Inter\_1143603$: non-atypical region, length $L=2219\ bp$,
887:  sublinearity index $\mathcal{G}_{_Z}=0.949$, fragment complexity $K= 2.117$;
888: \item $Inter\_393196$: atypical region, length $L=2629\ bp$,
889:   sublinearity index \linebreak$\mathcal{G}_{_Z}=0.832$, fragment
890:   complexity $K= 1.494$.
891: \end{itemize}
We start by analysing the non-atypical regions.
893: \begin{figure}
894: \begin{tabular}{lr}
895: \raggedright{(a)\psfig{figure=globusCOD_685495len.ps,width=6.5cm,angle=270}}
896: &\raggedleft{(b)\psfig{figure=globusUPS_1143603len.ps,width=6.5cm,angle=270}}
897: \end{tabular}
898: \begin{tabular}{lr}
899: \raggedright{(c)\psfig{figure=globusCOD_685495_stalen.ps,width=6.5cm,angle=270}}
900: &\raggedleft{(d)\psfig{figure=globusUPS_1143603_stalen.ps,width=6.5cm,angle=270}}
901: \end{tabular}
902: \caption{\it Archaeoglobus fulgidus genome. Plots (a) and (b) show the
903:   location and length of the phrases in the parsing by the algorithm
904:   CASToRe, in non-atypical regions $Coding\_685495$ and
905:   $Inter\_1143603$, respectively. Graphs (c) and (d) illustrate the
906:   distribution of phrase length in the same
907:   regions.}\label{globNonatyp}
908: \end{figure}
909: First, we have plotted the length of the phrases in the dictionary
910: together with their position in the input sequence (see Figure
911: \ref{globNonatyp} $(a)$ and $(b)$). In both non-atypical regions, the
912: phrases are short and the maximal length is 11 bp. The Gaussian
913: distribution of phrase length confirms that these regions are not
914: regular, but highly variable (see Figure \ref{globNonatyp} $(c)$ and
915: $(d)$). The extent of the dictionary is great in both non-atypical
916: regions: 415 phrases in the dictionary of region $Coding\_685495$ and
917: 393 phrases in the dictionary of region $Inter\_1143603$.
918: 
919: However, in the case of region $Coding\_685495$, the algorithm CASToRe
920: recognised 31 codons as phrases that are also used as prefix or suffix
921: words quite frequently. Table \ref{globcodon} illustrates the details
922: of this feature that has been found only in coding regions; in fact,
in non-atypical noncoding regions the codons that are recognised as
phrases are always few.
925: \begin{table}\begin{center}
926: \begin{tabular}{|c|c|c||c|c|c|}
927: \hline 
928: \mbox{Codon}&\mbox{$\#$ prefix}&\mbox{$\#$ suffix}&\mbox{Codon}&\mbox{$\#$ prefix}&\mbox{$\#$ suffix}\\
929: \hline\hline
930: AAA&10&4&CTG&8&7\\
931: \hline
932: AAG&1&8&GCA&3&2\\
933: \hline
934: AAT&4&1&GCC&8&6\\
935: \hline
936: ACA&10&4&GCT&5&3\\
937: \hline
938: ACC&4&7&GGA&2&4\\
939: \hline
940: ACG&4&1&GGT&2&2\\
941: \hline
942: ACT&7&6&TAA&5&8\\
943: \hline
944: ATG&3&0&TAG&0&1\\
945: \hline
946: ATT&4 &8&TAT&2&4\\
947: \hline
948: CAA&14&10&TCA&6&6\\
949: \hline
950: CAG&3&7&TCC&8&2\\
951: \hline
952: CAT&0&0&TCT&7&5\\
953: \hline
954: CCG&3&1&TGT&2&6\\
955: \hline
956: CCT&2&3&TTA&5&7\\
957: \hline
958: CGG&0&0&TTG&0&4\\
959: \hline
960: CGT&5&7& & &\\
961: \hline
962: \end{tabular}
963: \caption{\it 31 different codons have been recognised as phrases in the
964:   parsing by the algorithm CASToRe, in region $Coding\_685495$ of the
965:   genome of {\it Archaeoglobus fulgidus}. Some of them have been also
966:   used as prefix or suffix of other phrases. Columns named $\#\ 
967:   prefix$ and $\#\ suffix$ indicate how many times the phrase has been
968:   used as a prefix or suffix.}\label{globcodon}
969: \end{center}
970: \end{table}
971: \begin{figure}
972: \begin{tabular}{lr}
973: \raggedright{(a)\psfig{figure=globusUPS_393196len.ps,width=6.5cm,angle=270}}&
974: \raggedleft{(b)\psfig{figure=globusUPS_393196_stalen.ps,width=6.5cm,angle=270}}
975: \end{tabular}
976: \caption{\it Archaeoglobus fulgidus genome. Plot (a) shows
977:   the location and length of the phrases in the parsing by the
978:   algorithm CASToRe, in atypical region $Inter\_393196$. Graph (b) 
979:   illustrates the distribution of phrase length in the same
980:   region.}\label{globatyp}
981: \end{figure}
982: 
983: Conversely, the dictionary relative to fragment $Inter\_393196$, which
984: is atypical noncoding, shows completely different
985: characteristics. First of all, the dictionary contains 349
986: phrases. Moreover, Figure \ref{globatyp} $(a)$ shows that in this
987: sequence there should be recurrences of similar patterns, because of
988: the several long phrases (that is, longer than 25 bp) that are spread
989: along the whole sequence. Another feature, which will be paradigmatic
990: of atypical regions, is the anomalous (non-Gaussian) tail in the
991: distribution of phrase length (see Figure \ref{globatyp} $(b)$). The
992: distribution is no longer peaked at only one value, but there is a
993: significant occurrence of long words that could not be found in
994: non-atypical regions and is consistent with the presence of regularity
995: within any atypical region.
996: 
997: According to the dictionary obtained by means of algorithm CASToRe,
998: there is a dominant motif $\mathcal M$ of length 25 bp (phrase
999: nr. $109$ in the dictionary), that is also used 9 times as a prefix
1000: and 3 times as a suffix.  Table \ref{tabMglobus} illustrates what the
1001: dominant motif $\mathcal M$ and its descent are. We recall that the
1002: descent of a phrase $\phi$ is the set of other phrases in the
1003: dictionary such that $\phi$ is either their prefix or suffix or both.
1004: \begin{table}\begin{center}
1005: \begin{tabular}{c}
1006: $\mathcal M=$AATCCCATTTTGGTCTGATTTCAAC\\
1007: Descent of $\mathcal M\  :$\\
1008: {\bf AATCCCATTTTGGTCTGATTTCAAC}ACA\\
1009: {\bf AATCCCATTTTGGTCTGATTTCAAC}AG\\
1010: {\bf AATCCCATTTTGGTCTGATTTCAAC}CAA\\
1011: {\bf AATCCCATTTTGGTCTGATTTCAAC}CT\\
1012: {\bf AATCCCATTTTGGTCTGATTTCAAC}GA\\
1013: {\bf AATCCCATTTTGGTCTGATTTCAAC}GT\\
1014: {\bf AATCCCATTTTGGTCTGATTTCAAC}TATTT\\
1015: {\bf AATCCCATTTTGGTCTGATTTCAAC}TT\\
1016: {\bf AATCCCATTTTGGTCTGATTTCAAC}TTTC\\
1017: CCCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\
1018: CTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\
1019: TTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\
1020: \end{tabular}
1021: \caption{\it dominant motif $\mathcal M$ and its descent in atypical
1022:   region $Inter\_393196$ of the genome of Archaeoglobus fulgidus.}
1023: \label{tabMglobus}
1024: \end{center}
1025: \end{table}
1026: 
1027: The presence of a dominant motif partially motivates the many
1028: oscillations in the $CIC$ growth, as depicted in Figure \ref{regLow}.
1029: Furthermore, a complete explanation lays on the fact that the motif
1030: $\mathcal M$ is also a sliding pattern in many other phrases (see
1031: Table \ref{slidinglobus}). This is
1032: an irrefutable evidence of the fact that this atypical region shows a
1033: {\it variable periodicity} represented by the recurrence of the
1034: motif $\mathcal M$ sometimes slightly modified, as in the case of
1035: approximate repeats. 
1036: 
1037: Even if the biological usefulness of the motif $\mathcal M$ is still
1038: unknown, another hint to its peculiarity is provided by the
1039: compression of region $Inter\_393196$ by means of algorithm
1040: $LZ77$. The motif $\mathcal M$ is a motif also in the dictionary
1041: extracted by $LZ77$. Therefore, the idea that this motif should have a
1042: precise biological meaning is even more convincing. Furthermore, this
1043: example suggests that also approximate repeats generated by insertions may
1044: be identified via $CIC$ method.
1045: \begin{table}\begin{center}
1046: \begin{tabular}{c}
1047: $\mathcal M=$AATCCCATTTTGGTCTGATTTCAAC\\
1048: Motif as a sliding pattern in:\\
1049: TTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\
1050: {\bf AATCCCATTTTGGTCTGATTTCAAC}GAAG\\
1051: {\bf AATCCCATTTTGGTCTGATTTCAAC}CTCC\\
1052: {\bf AATCCCATTTTGGTCTGATTTCAAC}TATTT\\
1053: CTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\
1054: CCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\
1055: TCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\
1056: TTC{\bf AATCCCATTTTGGTCTGATTTCAAC}TCC\\
1057: TTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}CTT\\
1058: CGCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\
1059: {\bf AATCCCATTTTGGTCTGATTTCAAC}GAGGCGT\\
1060: CCCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\
1061: CTCCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\
1062: CCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}TA\\
1063: ACTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}AG\\
1064: TTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}TTTA\\
1065: CTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}ATC\\
1066: GTCTCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\
1067: CACGCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\
1068: ACCCCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}
1069: \end{tabular}
1070: \caption{\it phrases where the motif $\mathcal M$ is a sliding
1071:   pattern. The motif is written bold typed.}
1072: \label{slidinglobus}
1073: \end{center}
1074: \end{table}
1075: \subsection{Methanococcus jannaschii}
1076: \begin{figure}
1077: \centerline{\psfig{figure=LGmjann.ps,width=8cm,angle=270}}
1078: \caption{\it Methanococcus jannaschii genome. Of each functional
1079:   region, its length and the corresponding sublinearity index are
1080:   plotted. The crosses ($+$) are referred to coding regions, while the
1081:   squares ($\square$) are referred to intergenic regions.  The
1082:   horizontal line is the threshold for the sublinearity index, under
1083:   which the region is atypical. }\label{LGmjann}
1084: \end{figure}
1085: 
{\it Methanococcus jannaschii} is a thermophilic (48--94$^\circ$C),
1087: strict anaerobic Archaebacterium living at pressures of over 200
1088: atmospheres. It is an autotroph which gets its energy from hydrogen
1089: and carbon dioxide producing methane and it is capable of nitrogen
1090: fixation. Morphologically, it is characterized by having two bundles
1091: of flagella at the same cellular pole. The genome of {\it
1092: Methanococcus jannaschii} consists of the main circular chromosome and
1093: two circular extrachromosomal elements (ECE), one large and one
1094: small. We have analysed only the main chromosome.
1095: 
1096: In this genome we shall show one atypical region, whose sublinearity
1097: index is particularly low and having approximately the same extent as
1098: the other regions that have been already analysed. However, as it
1099: is shown on Figure \ref{LGmjann}, this genome presents many other long
1100: atypical regions, that will be studied in future work.
1101: 
1102: The atypical region we have analysed is
1103: \begin{itemize}
1104: \item $Inter\_236189$: atypical region, length $L=2112\ bp$,
1105:   sublinearity index \linebreak$\mathcal{G}_{_Z}=0.707$, fragment
1106:   complexity $K= 1.405$.
1107: \end{itemize}
1108: \begin{figure}
1109: \centerline{\psfig{figure=mjannUPS_236189.ps,width=8cm,angle=270}}
1110: \caption{\it Methanococcus jannaschii genome. The behaviour of the
1111:   information content of region $Inter\_236189$ grows sublinearly with
1112:   index 0.707. The picture is in linear scale.}\label{mjannLow}
1113: \end{figure}
1114: The behaviour of the information content in atypical region
1115: $Inter\_236189$ is twofold: until the first 1500 base pairs have been
1116: encoded, the growth is almost logarithmic, while in the final part the
1117: $CIC$ increase is faster (see Figure \ref{mjannLow}). Therefore, the
1118: first part of the sequence should be more regular than the second one.
1119: 
1120: \begin{figure}
1121: \begin{tabular}{lr}
1122: \raggedright{(a)\psfig{figure=mjannUPS_236189len.ps,width=6.5cm,angle=270}}&
1123: \raggedleft{(b)\psfig{figure=mjannUPS_236189_BISstalen.ps,width=6.5cm,angle=270}}
1124: \end{tabular}
1125: \caption{\it Methanococcus jannaschii genome. Plot $(a)$ shows 
1126:   location and length of the phrases in the parsing by the algorithm
1127:   CASToRe of region $Inter\_236189$. In graph (b) the corresponding
1128:   distribution of phrase length is pictured.}\label{mjannparole}
1129: \end{figure}
1130: 
1131: This aspect is well-represented in graph $(a)$ of Figure
1132: \ref{mjannparole}. The presence of longer and longer phrases before
1500 bp have been compressed is evidence for the existence of
1134: highly repetitive subsequences in the first half, whereas in the
1135: second half of the input sequence $Inter\_236189$ the previous
1136: regularity is broken and only brief repetitions can be
1137: found. Consequently, the extent of the dictionary is low: there are
1138: only 264 phrases. 
1139: 
1140: As in the case of the analysed atypical region of genome of {\it
1141:   Archaeoglobus fulgidus}, the distribution of phrase length has an
anomalous non-Gaussian tail that also includes a phrase that is 134
1143:   bp long (Figure \ref{mjannparole} $(b)$).
1144: 
1145: \begin{table}\begin{center}
1146: \begin{tabular}{|l|}
1147: \hline
1148: AATTAAAATCAGACCGTTTCGGAATGGAAAT\\
1149: \hline
1150: AGACCGTTTCGGAATGGAAAT\\
1151: \hline
1152: AGACCGTTTCGGAATGGAAATGAT\\
1153: \hline
1154: AGGGAACCCTAAAAAGGTTC\\
1155: \hline
1156: AGGGAACCCTAAAAAGGTTCCCTTGAGGGTT\\
1157: \hline
1158: AGGGAACCCTAAAAAGGTTCCCTTGAGGGTTCATTAAAATCAGACCGTT\\
1159:                                        TCGGAATGGAAATCTGTT\\
1160: \hline
1161: AGGGAACCCTAAAAAGGTTCCCTTGAGGGTTCATTAAAATCAGACCGTT\\
1162: TCGGAATGGAAATCTGTTAGGGAACCCTAAAAAGGTTCCCTTGAGGGTT\\
1163: CATTAAAATCAGACCGTTTCGGAATGGAAATCTGTT\\
1164: \hline
1165: ATTAAAATCAGACCGTTTCGGAATGGAAATGATT\\
1166: \hline
1167: CATTAAAATCAGACCGTTTCGGAATGGAAATTC\\
1168: \hline
1169: CATTAAAATCAGACCGTTTCGGAATGGAAATCTGTT\\
1170: \hline
1171: CCTTGAGGGTTCATTAAAATCAGACCGTTTCGGAATGGAAATCTGTT\\
1172: \hline
1173: GTATTAAAATCAGACCGTTTCGGAAT\\
1174: \hline
1175: GTTTCGGAATGGAAATCTGTT\\
1176: \hline
1177: GTTTCGGAATGGAAATGAAT\\
1178: \hline
1179: GTTTCGGAATGGAAATGATT\\
1180: \hline
1181: GTTTCGGAATGGAAATTTTT\\
1182: \hline
1183: TAAAATCAGACCGTTTCGGAAT\\
1184: \hline
1185: TAAAATCAGACCGTTTCGGAATGGAAAT\\
1186: \hline
1187: TAAAATCAGACCGTTTCGGAATGGG\\
1188: \hline
1189: \end{tabular}
1190: \caption{\it Methanococcus jannaschii genome. Phrases longer than 20
1191:   bp are listed, coming from the dictionary relative to atypical
1192:   region $Inter\_236189$.}\label{tabmjann}
1193: \end{center}
1194: \end{table}
1195: 
1196: For what concerns the analysis of recurrent phrases in the dictionary,
1197: it holds that only phrases that are shorter than 10 bp are used more
1198: than three times as prefix word or suffix word. As it is shown in
1199: Table \ref{tabmjann}, the phrases longer than 20 bp (that correspond
1200: to the high ``spikes'' of Figure \ref{mjannparole} $(a)$) do not allow a
1201: dominant motif to be determined in such a definite way as in the case
1202: of atypical region $Inter\_393196$ of {\it Archaeoglobus fulgidus}
genome. The increasingly longer phrases that have been detected in
Figure \ref{mjannparole} $(a)$ are not generated by coupling
the prefix word to itself (as it would have been if there were a precise
periodicity); rather, the prefix and suffix words are different from each
other, nor are they subsequent to each other. Again, the longest phrases
coincide with the longest ones found by means of the algorithm $LZ77$.
1209: 
1210: However, the main point of distinction of this atypical region is that
all long phrases are rich in T$^n$A$^m$-patterns. This fact, together
with the positive homology response, classifies this region as a promoter
1213: region containing a subregion known as {\it TATA box}. The promoter
1214: sequence could be located using program PROSCAN Version 1.7
1215: (\cite{promoter}).
1216: 
1217: The dictionary of this region provides another example of regularity
1218: in DNA sequences, different from the one coming from the genome of
1219: {\it Archaeoglobus fulgidus}.
1220: 
1221: \subsection{Arabidopsis thaliana}
1222: {\it Arabidopsis thaliana} is a small flowering plant that is widely
1223: used as a model organism in plant biology. {\it Arabidopsis} is a member of
1224: the mustard (Brassicaceae) family, which includes cultivated species
1225: such as cabbage and radish. {\it Arabidopsis thaliana} is the first
1226: plant for which the complete genome has been sequenced. Its genome
1227: consists of five chromosomes, but we have analysed only chromosomes II
1228: and IV. Since the research regarding this genome is
1229: still {\it in itinere}, here we shall present some very preliminary results
1230: concerning chromosome II.
1231: 
1232: The atypical regions we have analysed are
1233: \begin{itemize}
1234: \item $Coding\_8330271$: atypical region, length $L=309\ bp$,
1235:   sublinearity index $\mathcal{G}_{_Z}=0.166$, fragment
1236:   complexity $K= 1.113$.
1237: \item $Inter\_22564763$: atypical region, length $L=65849\ bp$,
1238:   sublinearity index $\mathcal{G}_{_Z}=0.589$, fragment
1239:   complexity $K= 0.911$.
1240: \end{itemize}
1241: These regions have been chosen as peculiar among the
1242: many atypical regions (see Figure \ref{lgarab}) belonging to this
1243: genome: a short and very regular coding region and a long intergenic
1244: region.
1245: \begin{figure}
1246: \begin{tabular}{lr}
1247: \raggedright{(a)\psfig{figure=LGCODINTRarab.ps,width=6.5cm,angle=270}}&
1248: \raggedleft{(b)\psfig{figure=LGUPSarab.ps,width=6.5cm,angle=270}}
1249: \end{tabular}
1250: \caption{\it Arabidopsis thaliana genome. For each functional region, the
1251:   length and the corresponding sublinearity index are plotted. In
1252:   picture (a), the crosses ($+$) refer to coding regions, while
1253:   the squares ($\square$) refer to introns. In picture (b), the
1254:   squares ($\square$) refer to intergenic regions. In both
1255:   plots, the horizontal line is the threshold for the sublinearity
1256:   index, below which the region is atypical.}\label{lgarab}
1257: \end{figure}
1258: \begin{figure}
1259: \centerline{(a)\psfig{figure=arabCOD_8330271.ps,width=7cm,angle=270}}
1260: \begin{tabular}{lr}
1261: \raggedright{(b)\psfig{figure=arabCOD_8330271len.ps,width=6.5cm,angle=270}}&
1262: \raggedleft{(c)\psfig{figure=arabCOD_8330271_stalen.ps,width=6.5cm,angle=270}}
1263: \end{tabular}
1264: \caption{\it Atypical region $Coding\_8330271$. (a) The Information
1265: Content growth is logarithmic for the main part of the sequence. The
1266: word length doubling is shown on plot (b) and the multimodal
1267: distribution of word length is illustrated in (c).}\label{proteinarab}
1268: \end{figure}
1269: 
1270: The atypical region $Coding\_8330271$ is characterized by a period
1271: `GA' that is repeated for most of the sequence
1272: (the first 200 bp). This is made evident both from the $I(n)$ plot in
1273: Figure \ref{proteinarab} $(a)$, which is definitely logarithmic in the
1274: first part, and from the word length doubling highlighted in Figure
1275: \ref{proteinarab} $(b)$. Also, the multimodal distribution of word
1276: length reflects the atypical nature of this region, while the maximal
1277: length is $12$ bp, which confirms that the characteristic maximal
1278: length in non-atypical coding regions is about $11$--$12$ bp (for
1279: instance, see Figure \ref{globNonatyp} (c)). The putative protein that may be
1280: obtained by translating this coding region is the following protein,
1281: Atg219370:
1282: $$
1283: \begin{array}{l}
1284: \mathrm{ERERGSERERERERERERERERERERERERERERERER}\\
1285: \mathrm{EREREREREREREREREREREREREKHKPATLAKNRRR}\\
1286: \mathrm{RFVKNRRRRDHRRRISIIDGYESQF*V}\\
1287: \end{array}
1288: $$ 
1289: 
1290: In the above notation, each letter corresponds to an amino acid, while
1291: the star indicates the end of the protein. This putative protein is
1292: very rich in Glutamate (E) and Arginine (R), but its function is still
1293: unknown. It should also be noted that the actual
1294: existence of this protein in the living organism has not yet been
1295: confirmed by biomolecular laboratory experiments: this
1296: fragment has been classified as coding only by means of statistical
1297: predictive methods.
1298: \begin{figure}
1299: \begin{tabular}{lr}
1300: \raggedright{(a)\psfig{figure=arabUPS_22564763.ps,width=6.5cm,angle=270}}&
1301: \raggedleft{(b)\psfig{figure=arabUPS_22564763len.ps,width=6.5cm,angle=270}}
1302: \end{tabular}
1303: \begin{tabular}{lr}
1304: \raggedright{(c)\psfig{figure=Zoomgeniarab.ps,width=6.5cm,angle=270}}&
1305: \raggedleft{(d)\psfig{figure=arabUPS_22564763_stalen.ps,width=6.5cm,angle=270}}
1306: \end{tabular}
1307: \caption{\it Arabidopsis thaliana genome (chromosome II). (a) The
1308: Information Content of atypical region $Inter\_22564763$
1309: grows in a very peculiar way. Its sublinearity index has been
1310: evaluated as 0.589. (b) The plot shows the location and length of the
1311: phrases in the parsing obtained by the algorithm CASToRe. (c) The plot
1312: is an enlargement of the final part of the atypical region
1313: $Inter\_22564763$. (d) The distribution of phrase length for the
1314: aforementioned parsing is pictured.}\label{arabgene}
1315: \end{figure}
1316: 
1317: The atypical region $Inter\_22564763$ was challenging to analyse, because
1318: not only does the Information Content growth show an abrupt change around
1319: $50000$ bp (Figure \ref{arabgene} $(a)$), but the word length also
1320: undergoes a sharp decrease when reaching that threshold, although at
1321: that point the dictionary already contained more than $1700$ phrases,
1322: most of them longer than 50 bp (Figure \ref{arabgene} $(b)$ and
1323: $(c)$).
1324: 
1325: It was this twofold behaviour of the region that suggested that the
1326: final part of this region (from $50000$ bp to $65849$ bp) might
1327: contain some coding sequences. This was also supported by the
1328: prevailing length of about $11$--$12$ bp, which, as already
1329: pointed out, may be considered as characteristic of coding regions.
1330: \begin{figure}
1331: \centerline{\psfig{figure=geniTrovatiarab.ps,width=12cm,angle=270}}
1332: \caption{\it Arabidopsis thaliana genome (chromosome II). Same part of
1333: atypical region $Inter\_22564763$ as plot (c) in Figure
1334: \ref{arabgene}. The boxes correspond to the locations of the
1335: four predicted genes (labelled as `G1', `G2',
1336: `G3', `G4'), as
1337: predicted by looking for similarities with known Arabidopsis thaliana
1338: genes.}\label{cfrarabgene}
1339: \end{figure}
1340: As a result, four putative genes G1, G2, G3 and G4 have
1341: been located by means of the Hidden Markov Model-based program
1342: FGENESH\footnote{This program is available at the website
1343: www.softberry.com, to which we refer concerning the reliability and
1344: efficiency of the algorithm.}, which has been created for predicting
1345: multiple genes and their structure in genomic DNA sequences. The
1346: analysis via FGENESH has been carried out with respect to known genes in {\it
1347: Arabidopsis thaliana}. The predicted positions of the four genes are illustrated in Figure
1348: \ref{cfrarabgene}.
1349: 
1350: %%%%%%%%%%%%questa parte va tolta
1351: %In Table \ref{putagene} we shall introduce a short
1352: %characterisation of each predicted gene and exon. Notice that all
1353: %these genes are to be read in the complementary strand.
1354: %
1355: %A precise biological screening of these data is still in progress and
1356: %more and more open questions are arising from them.
1357: %\begin{table}\label{putagene}
1358: %\begin{center}
1359: %\begin{tabular}{|c|c|c|c|c|}
1360: %\hline
1361: %Gene&Feature&start$-$end&Open Reading Frame&length\\
1362: %\hline
1363: %\hline
1364: %G1&PolA&52194 &&\\
1365: %&last exon&52373$-$52544&52373$-$52543&171\\
1366: %&internal exon&52583$-$52950&52585$-$52950&366\\
1367: %&internal exon &53142$-$54182&53142$-$54182&1041\\
1368: %&internal exon&54212$-$54316&54212$-$54316&105\\
1369: %&final exon&54413$-$54751&54413$-$54751&339\\
1370: %&promoter&54766&  &\\
1371: %\hline 
1372: %\hline
1373: %G2&PolA&54802 &&\\
1374: %&last exon&54836$-$55447&54836$-$55447&612\\
1375: %&final exon&55528$-$55983&55528$-$55983&456\\
1376: %&promoter&56054&  &\\
1377: %\hline
1378: %\hline
1379: %G3&PolA&62628 &&\\
1380: %&last exon&62648$-$62861&62648$-$62860&213\\
1381: %&internal exon&62899$-$63232&62901$-$63230&330\\
1382: %&internal exon &63382$-$63723&63383$-$63721&339\\
1383: %&internal exon&63826$-$64309&63827$-$64309&483\\
1384: %&final exon&64410$-$64490&&81\\
1385: %&promoter&64768&  &\\
1386: %\hline 
1387: %\hline
1388: %G4&PolA&64895 &&\\
1389: %&exon&64973$-$65278&64973$-$65278&306\\
1390: %\hline
1391: %\end{tabular}
1392: %\end{center}
1393: %\caption{\it The detailed location and internal structure of predicted
1394: %genes within the intergenic region $Inter\_22564763$ of {\it
1395: %Arabidopsis thaliana}. The analysis has been performed by means of
1396: % {\rm FGENESH}.}
1397: %\end{table}
1398: \section{Final remarks and future work}
1399: We have shown that complete genomes may be analysed in some of their
1400: distinctive features by means of the Computable Information Content
1401: obtained via compression algorithms. The Information Content may be
1402: used to extract regions having an atypical information growth, which
1403: is strictly connected to the presence of highly repetitive subregions
1404: that might be supposed to have a regulatory function within the
1405: genome. Different types of sublinearities have been associated with
1406: different biological features. These results shall pave the way for a
1407: more profound understanding of the local compressibility of genomes
1408: and for a more detailed identification of motifs and patterns that are
1409: significant to some biological function, in view of a joint use
1410: together with other predictive methods.
1411: \begin{thebibliography}{99}
1412: \bibitem{billingsley} Billingsley P., {\it Ergodic Theory and
1413: Information}, J. Wiley and Sons, New York (1965).
1414: \bibitem{kin} Khinchin A.I., {\it Mathematical foundations of information
1415: theory}, Dover Publications, New York (1957).
1416: \bibitem{Ch} Chaitin G.J., {\it Information, Randomness and
1417:     Incompleteness}, second edition, World Scientific, Singapore
1418:     (1990).
1419: \bibitem{kolmogorov} Kolmogorov A. N., ``On the entropy per time unit
1420:  as a metric invariant of automorphisms'', {\it Dokl. Acad. Nauk.},
1421: {\bf 124}: 754--755 (1959).
1422: \bibitem{cleary}  Bell T., Witten I. H., Cleary J. G., \textit{Modeling for 
1423: text compression}, ACM Computing Surveys, \textbf{21}, 557--591 (1989).
1424: \bibitem{licatone}Benci V., Bonanno C., Galatolo S., Menconi G.,
1425: Virgilio M., {\it Dynamical systems and computable information},
1426: to appear on {\it Disc. Cont. Dyn. Syst.- B}.
1427: \bibitem{jiang} Adebiyi E. F., Jiang T., Kaufmann M., ``An efficient
1428: algorithm for finding short approximate non-tandem repeats'', {\it
1429: Bioinformatics}, {\bf 17}, Suppl 1: S5--S12 (2001).
1430: \bibitem{chen} Li M., Badger J.H., Chen X., Kwong S., Kearney P.,
1431: Zhang H., ``An information based sequence distance and its application
1432: to whole mitochondrial genome phylogeny'', {\it Bioinformatics}, {\bf
1433: 17} (2): 149--154 (2001).
1434: \bibitem{tahi} Grumbach S., Tahi F., ``A new challenge for compression
1435: algorithms: genetic sequences'', {\it Information processing \&
1436: Management}, {\bf 30}: 875--886 (1994). 
1437: \bibitem{gal4} Galatolo S., ``Orbit complexity and data compression'',
1438: {\it Discrete and Continuous Dynamical Systems}, \textbf{7}, 477--486 (2001).
1439: \bibitem{gal3} Galatolo S., ``Complexity, initial data sensitivity,
1440: di\-men\-sion and weak chaos in dynamical systems'', {\it
1441: Nonlinearity}, {\bf 16}, 4, 1219 (2003).
1442: \bibitem{CSF02} Argenti F., Benci V., Cerrai P., Cordelli A., 
1443: Galatolo S., Menconi G., ``Information and dynamical systems: a 
1444: concrete measurement on sporadic dynamics'', {\it Chaos, Solitons and 
1445: Fractals}, \textbf{13}, 3, 461--469 (2002).
1446: \bibitem{giuliauno}P. Allegrini, V. Benci, P. Grigolini, P. Hamilton,
1447: M. Ignaccolo, G. Menconi, L. Palatella, G. Raffaelli, N. Scafetta,
1448: M. Virgilio, Y. Yang, ``Compression and Diffusion: A Joint Approach to
1449: Detect Complexity'', {\it Chaos, Solitons \& Fractals}
1450: {\bf 15}, 17 (2003).
1451: \bibitem{bonanno} Bonanno C., ``The Manneville map: topological,
1452: metric and computational approach'',
1453: http://arXiv.org/abs/math.DS/0107195 (2001).
1454: \bibitem{cristalli} Fronzoni L., Galeotti L., Menconi G., ``Measure of
1455: Diffusion Entropy of weak turbulence in sample of nematic liquid
1456: crystal'', in {\it Determinism, Holism and
1457: Complexity}, p.87, Atti dell'omonimo convegno tenutosi ad Arcidosso
1458: (GR), 2-8 Settembre 2001, Vieri Benci 
1459: et al. editors, Kluwer Academic/Plenum Publishers, NY (2003).
1460: \bibitem{jacopogiulia} Bellazzini J., Menconi G., Ignaccolo M., Buresti G.,
1461: Grigolini P., ``Vortex Dynamics in evolutive flows: a weakly chaotic
1462: phenomenon'', {\it Physical Review E}, {\bf 68}: 026126 (2003).
1463: \bibitem{menconi} Bonanno C., Menconi G., ``Computational information 
1464: for the logistic map at the chaos threshold'', {\it Disc. Cont. Dyn. Syst.- 
1465: B}, \textbf{2}, no.3, 415--431 (2002).
1466: \bibitem{lz77} Ziv J., Lempel A., ``A Universal Algorithm for 
1467: Sequential Data Compression'', {\it IEEE Transactions on Information Theory}, 
1468: \textbf{23}, 337--342 (1977).
1469: \bibitem{lz78} Ziv J., Lempel A., ``Compression of Individual 
1470: Sequences Via Variable-Rate Coding'', {\it IEEE Transactions on Information 
1471: Theory}, \textbf{24}, 530--536 (1978). 
1472: \bibitem{bell}  Bell T., Witten I. H., Cleary J. G., \textit{Modeling for 
1473: text compression}, ACM Computing Surveys, \textbf{21}, 557--591 (1989).
1474: \bibitem{loreto} Benedetto D., Caglioti E., Loreto V., ``Language trees
1475: and zipping'', {\it Phys. Rev. Lett.}, {\bf 88} (4): 048702 (2002).
1476: \bibitem{myers} Myers G., ``Whole-genome DNA sequencing'', {\it
1477: Computing in Science \& Engineering}, {\bf 1}, 3:33--43 (1999).
1478: \bibitem{coverthomas} Cover T. M., Thomas J. A., {\it Elements of
1479: Information Theory}, Wiley (1991).
1480: \bibitem{promoter} Prestridge D.S., ``Predicting Pol II Promoter
1481: Sequences Using Transcription Factor Binding Sites'', {\it
1482: J. Mol. Biol.}, {\bf 249}: 923--932 (1995).
1483: \end{thebibliography}
1484: \end{document}
1485: 
1486: