0402:q-bio0402046/sublin.tex

1: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

2: \documentclass[a4paper,11pt,leqno]{article}

3: \setlength\oddsidemargin{0.70in}

4: %\usepackage{isolatin1}

5: \usepackage{amsfonts}

6: \usepackage{amsmath}

7: %\usepackage{amssymb}

8: \usepackage{amstext}

9: \usepackage{amsthm}

10: \usepackage{xspace}

11: \usepackage[dvips]{graphicx}

12: %\usepackage{showkeys}

13:

14: % Useful definitions: shortcuts for accented letters

15: %--------------------------

16: \newcommand\ac{\`a\xspace}

17: \newcommand\ec{\`e\xspace}

18: \newcommand\ic{\`\i\xspace}

19: \newcommand\oc{\`o\xspace}

20: \newcommand\uc{\`u\xspace}

21: \newcommand\eg{\'e\xspace}

22: %--------------------------

23: %--------------------------

24: \newcommand\hr{\hfill\break}

25: \newcommand\name{\bfseries}

26: \newcommand\chaptit{\bfseries\itshape}

27: \newcommand\bls{\rightline{$ \blacksquare$}}

28: \newcommand\ovln{\overline}

29: \newcommand\unln{\underline}

30: %--------------------------

31:

32: %-----------------------------------------

33: % Number sets: R for the reals, C for the complex numbers, etc.

34: \newcommand{\ok} {\qed}

35: \newcommand{\C}{\mathbb C}

36: \newcommand{\R}{\mathbb R}

37: \newcommand{\Rn}{{\mathbb R}^{n}}

38: \newcommand{\N}{\mathbb N}

39: \newcommand{\Q}{\mathbb Q}

40: \newcommand{\Z}{\mathbb Z}

41: \newcommand{\E}{\mathbb E}

42: \newcommand{\eps}{\varepsilon}

43: \newcommand{\Graf}{\mathrm{Graf}}

44: \newcommand{\Dom}{\mathrm{Dom}}

45: \newcommand{\Int}{\mathrm{Int}}

46: \newcommand{\Imm}{\mathrm{Imm}}

47: \newcommand{\grap}{\left\{}

48: \newcommand{\grch}{\right\}}

49: %-----------------------------------------

50: % Theorems, lemmas, etc.

51: \newtheorem{theorem} {Theorem}%[chapter]

52: \newtheorem{theorem*}{Theorem}

53: \newtheorem{prop*} {Proposition}

54: \newtheorem{lemma*}{Lemma}

55: %\newtheorem{guess}{Osservazione}[chapter]

56: \newtheorem{lemma}{Lemma}%[chapter]

57: \newtheorem{prop} {Proposition}% [chapter]

58: %-----------------------------------------

59:

60: % Switch to a different theorem style

61: \theoremstyle{definition}

62: \newtheorem{definition}{Definition}%[chapter]

63: \newtheorem{definition*}{Definition}

64: \newtheorem{cor}{Corollary}%[chapter]

65: \newtheorem{cor*}{Corollary}

66: \newtheorem{rem}{Remark}%[chapter]

67: \newtheorem{rem*}{Remark}

68: %-----------------------------------------

69: \theoremstyle{remark}

70: \newtheorem{nota}{Notazione}%[chapter]

71: \newtheorem{es}{Esempio}%[chapter]

72: %-----------------------------------------

73:

74: %-----------------------------------------

75: % Page style

76: \pagestyle{plain}

77: %----------------------------------------------------------------------

78:

79: %----------------------------------------------------------------------

80: % convenient environments for proofs and remarks:

81: % use \begin{prf}  \end{prf} for proofs and

82: % \begin{guess}   \end{guess} for remarks.

83: \newtheorem{dim*}{\bf Proof}%[chapter]

84: \newenvironment{prf}{\begin{dim*}\begin{rm}} {\end{rm}\qed\end{dim*}}

85: \newtheorem{guess*}{\bf Osservazione}%[chapter]

86: \newenvironment{guess}{\begin{guess*}\begin{rm}}{\end{rm}\end{guess*}}

87: %-----------------------------------------------------------------------

88:

89:

90: \newcommand{\no}{\noindent}

91: \newcommand{\mk}{\medskip}

92: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

93: \input psfig.sty

94: \begin{document}

95: \title{Sublinear growth of Information in DNA sequences}

96: \author{Giulia Menconi\\ \small {Dipartimento di Matematica

97: Applicata}\\ \small{and}\\ \small {C.I.S.S.C. Centro

98: Interdisciplinare} \\ \small {per lo Studio dei Sistemi Complessi}\\

99: \small {Universit\ac di Pisa}\\ \small {Via Bonanno Pisano 25/b 56126

100: PISA - Italy}\\ \small{menconi@mail.dm.unipi.it}\\October 23, 2003} \date{} \maketitle

101: \vskip 11truecm

102: \centerline{Running title: Sublinear Information in DNA}

103: \vskip 0.2truecm

104: {Keywords: Information Content, compression

105: algorithm, DNA, repetitive sequences}

106: \newpage

107: \begin{abstract}

108: We introduce a novel method to analyse complete genomes and recognise

109: some distinctive features by means of an adaptive compression

110: algorithm, which is not DNA-oriented. We study the Information Content

111: as a function of the number of symbols encoded by the

112: algorithm. Preliminary results are shown concerning regions having a

113: sublinear type of information growth, which is strictly connected to

114: the presence of highly repetitive subregions that might be supposed to

115: have a regulatory function within the genome.

116: \end{abstract}

117: %\tableofcontents

118: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

119: \section{Introduction}

120: We shall analyse the genome sequences from the point of view of data

121: compression in order to exploit a linguistic analysis. As the context

122: suggests, the genomes are interpreted as symbol sequences of finite

123: length, drawn by an Information Source (the Nature) that remains

124: mainly unknown and emits symbols taken from the alphabet of the four

125: nucleotides $\{A,\ C,\ G,\ T\}$. Each genome identifies a living

126: organism and we assume that it may be considered as the unique

127: realisation produced by the Source relative to that organism.

128:

129: We shall not give here a formal definition of Information

130: Source. Intuitively, it is a device emitting a sequence of symbols

131: $\dots x_1x_2x_3\dots$ where each $x_i$ is an element of a finite

132: alphabet $\mathcal A$. The rigorous definition \cite{billingsley} relies

133: on the notion of sequence space $\Omega_\mathcal A$, that is the space

134: of one-sided infinite sequences (also called strings)

135: $\omega=(\omega_0,\omega_1,\dots)$ whose symbols are drawn from the

136: alphabet.

137: % $(\Omega_{\mathcal A}, \sigma,

138: %\mu_{\mathcal A})$. Let $\mathcal A$ is a finite alphabet with $N$

139: %symbols $\{a_0,a_1,\dots,a_{N-1}\}$, then the sequence space

140: %$\Omega_{\mathcal A}$ is the space of one-sided infinite sequences

141: %(also called strings) $\omega=(\omega_0,\omega_1,\dots)$ whose symbols

142: %are drawn from the alphabet. The sequence space may be equipped with a

143: %probabiblity invariant measure $\mu_{\mathcal A}$. The dynamical law

144: %is the shift transformation $\sigma: \Omega _{\mathcal A}

145: %\longrightarrow \Omega _{\mathcal A}$ defined as

146: %$\sigma(\omega_0,\omega_1,\omega_2,\dots)=(\omega_1,\omega_2,\dots)\.$

147: Even if an Information Source is rigorously defined as a stochastic

148: process $\mathbb X=(\mathbb X_n)_{n\in\N}$ acting on a sequence space,

149: we may consider the symbolic source $\Omega _{\mathcal A}$ as the

150: subset of the sequence space containing all the realizations of the

151: process $\mathbb X$. This shall motivate the use of the term

152: Information Source when referring to a sequence space. We shall denote

153: by $\mathcal{A}^*$ the set of finite symbolic sequences on the

154: alphabet $\mathcal{A}$. If $s\in\mathcal{A}^*$ its length will be

155: denoted by $|s|$.

156:

157: DNA sequences are special quaternary symbol sequences. As only a small

158: fraction of DNA nucleotides results in a viable organism, the

159: sequences belonging to a living organism are expected to be nonrandom

160: and have some constraints. Therefore, DNA sequences should be

161: compressible, at least locally.

162:

163: In our approach to symbol sequences, the crucial notion is the

164: \textit{Information Content}. Given a finite string $s$ in $\mathcal{A}^*$,

165: the meaning of \textit{ quantity of information} $I(s)$ contained in

166: $s$ has the following natural connotation:

167:

168: \begin{center}

169: $I(s)$ \textit{is the length of the smallest binary message from which you

170: can reconstruct} $s$.

171: \end{center}

172:

173: In his pioneering work, Shannon defined the quantity of information as

174: a statistical notion using the tools of probability theory

175: (\cite{kin}). Thus, in Shannon's framework, the quantity of information

176: which is contained in a string depends on its context. For example, the

177: string `pane' contains a certain amount of information when

178: it is considered as a string coming from the English language. The

179: same string `pane' contains much less Shannon

180: information when it is considered as a string coming from the Italian

181: language because it is more frequent in the Italian language (in

182: Italian it means ``bread'' and, of course, it is very

183: frequent). Roughly speaking, the Shannon information of a string is

184: the absolute value of the logarithm of the probability of its

185: occurrence.

186:

187: However, there are measures of information which depend intrinsically

188: on the string and not on its probability within a given context. We

189: will adopt this point of view. An example of these measures of

190: information is the Algorithmic Information Content ($AIC$). We will

191: not formally define it (see \cite{kin} and \cite{Ch} for rigorous

192: definitions and properties). We limit ourselves to give an intuitive

193: idea which is very close to the formal definition. We can consider a

194: partial recursive function as a computer $C$ which takes a program $p$

195: (namely a binary string) as an input, performs some computations and

196: gives a string $s=C(p)$, written in the given alphabet, as an output.

197: The $AIC$ of a string $s$ is defined as the length of the shortest

198: binary program $p$ which gives $s$ as its output, namely

199: $$I_{AIC}(s,C)=\min \{|p|:C(p)=s\}, $$ where $|p|$ means the length in

200: bit of the string which the program $p$ consists of. A theorem due to

201: A. N. Kolmogorov (\cite{kolmogorov}) implies that the information

202: content ${AIC}$ of $s$ with respect to $C$ depends only on $s$ up to a

203: fixed constant, therefore its asymptotic behaviour does not depend on

204: the choice of $C$. The shortest program $p$ which outputs the string

205: $s$ is a sort of optimal encoding of $s$. The information that is

206: necessary to reconstruct the string is contained in the

207: program. Unfortunately, this coding procedure cannot be performed by

208: any algorithm. This is a very deep statement and, in some sense, it is

209: equivalent to the Turing halting problem or to the G\"{o}del

210: incompleteness theorem. Then the Algorithmic Information Content is

211: not computable by any algorithm.

212:

213: Our method is focused on another measure: the information content of a

214: finite string can also be defined by a lossless data compression

215: algorithm $Z$ (\cite{Ch}, \cite{cleary}). This turns out to be a

216: Computable Information Content (CIC). In reference \cite{licatone}

217: quantitative relations among Shannon entropy of the source, the AIC

218: and the CIC of sequences are provided.

219:

220: The ``classical'' studies in compression algorithms answer the

221: question about the com\-pres\-si\-bi\-li\-ty of DNA with the

222: additional advantage of using compression techniques to capture the

223: properties of DNA. It is known that DNA sequences have two linguistic

224: characteristic structures: {\it reverse complements} and {\it

225: approximate repeats}. The reverse complement $\sigma ^c$ of a sequence

226: $\sigma$ is the sequence obtained by reversing $\sigma$ and replacing each

227: symbol by its complementary base ($A\leftrightarrow T$, $C\leftrightarrow G$). That is, reading the reverse

228: complement of a subsequence from a single strand of DNA is the same as

229: reading the corresponding complementary subsequence in the other

230: strand. The approximate repeats are repeats that contain

231: errors. Approximate repeats are due to the local variability that is a

232: common feature within genomes.

233:

234: Several special-purpose compression algorithms have been developed

235: for DNA sequences (for instance, see \cite{cleary}, \cite{jiang},

236: \cite{chen}, \cite{tahi}). These

237: algorithms are called DNA-oriented because they use the

238: aforementioned characteristic structures of ge\-no\-mes together with a

239: sort of statistical compression to achieve a compression ratio lower

240: than two bits per symbol. This is a great improvement since the

241: standard text compression algorithms such as {\it compress} or {\it

242: gzip} cannot compress DNA sequences but only expand the file with more

243: than two bits per symbol. The reason for text compression to fail on

244: DNA sequences is that the regularities in genomes are much

245: subtler than in English texts, for which those algorithms have been

246: designed.

247:

248: Our analysis makes reference to a different approach. We aim at using

249: the compression algorithm CASToRe, which has been created without any

250: biological purpose and {\it a priori} linguistic knowledge, to

251: understand whether there exist low information regions within a

252: genome, whether they have a functional type in common, whether they

253: are extended or have short length and what kind of growth the

254: information content shows in those regions.

255: %The results shown in table \ref{cssh1} confirm that the

256: %algorithm CASToRe allows the compression ratio of complete genomes to

257: %be well under the threshold of two bits per symbol and this is the

258: %crucial point that convinced us to exploit the analysis of genomes by

259: %means of CASToRe.

260: Finally, as the algorithm CASToRe belongs to the class of algorithms

261: that adaptively create a dictionary relative to a parsing of the input

262: sequence, we shall study dictionaries after compression, in order to

263: investigate the relations between patterns and biological functions.

264: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

265: \section{Computable Information Content}

266: \begin{definition}[Compression Algorithm]

267: A lossless data compression algorithm is any injective function

268: $Z:\mathcal{A}^*\rightarrow\{0,1\}^*$.

269: \end{definition}

270: Therefore, a compression algorithm is a reversible coding such that

271: the original string $s$ may be recovered from the encoded string

272: $Z(s)$. Since the coded string contains all the information that is

273: necessary to reconstruct and describe the structural features of the

274: original string, we can consider the length of the coded string as an

275: approximate measure of the quantity of information that is contained

276: in the original string.

277: \begin{definition}[Computable Information Content]

278: The information content of a finite string $s\in\mathcal{A}^*$ with

279: respect to a compression algorithm $Z$ is defined as

280: \begin{equation}

281: CIC_{Z}(s)=|Z(s)|\ .

282: \end{equation}

283: The CIC of a string $s$ is the length (in bit units) of the coded

284: string $Z(s)$.

285: \end{definition}

286: The advantage of using a compression algorithm lies in the fact that

287: the information content $CIC_{Z}\left( s\right) $ is a

288: computable function over the space of finite strings. For this reason

289: we named it Computable Information Content.

290:

291: Moreover, we define another quantity, the complexity of a finite

292: sequence, providing an estimate for the rate of information content

293: contained in it.

294:

295: \begin{definition}[Computable Complexity of a finite string]

296: The complexity of $s$ with respect to $Z$ is the compression ratio

297: \begin{equation}

298: K_{Z}(s)=\frac{CIC_Z(s)}{|s|}\ .

299: \end{equation}

300: \end{definition}

301:

302: \begin{rem*}

303: Under suitable optimality assumptions on the compression algorithm

304: $Z$, we can extend this definition to infinite symbolic sequences

305: belonging to $\Omega_\mathcal A$ and asymptotically obtain the Shannon

306: entropy of the Information Source from which the sequence has been

307: drawn (\cite{gal4},\cite{gal3}). The theoretical work

308: has been extended also to trajectories coming from general dynamical

309: systems and it is supported by application to several complex systems,

310: as to turbulent or intermittent regimes (\cite{CSF02}, \cite{giuliauno},

311: \cite{bonanno}, \cite{cristalli}, \cite{jacopogiulia}) and to weakly

312: chaotic dynamical systems (\cite{menconi},\cite{licatone}).

313: \end{rem*}

314:

315: \section{Dictionaries, words and phrases}

316: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%

317: Let us describe the sort of linguistic analysis we shall perform on

318:  genetic sequences. We shall use the CIC method to extract the

319:  functional regions whose information content is low and its growth is

320:  sublinear. We aim at understanding whether those regions show

321:  peculiar features such as specific highly repeated patterns of

322:  nucleotides (they are usually called {\it motifs}). Finally, we shall

323:  scan other genomes, both coming from the same domain of life and from

324:  different domains, looking for the presence of low information

325:  regions and comparing the motifs to each other. These regions are

326:  called {\it atypical}, as surprisingly they are highly compressible

327:  in comparison with the other regions. The dictionaries of some

328:  atypical regions will be studied and related to some known biological

329:  functions (e.g. being a promoter region). Finally, a preliminary result on

330:  potential application of this method to gene finding will be

331:  introduced.

332:

333: \subsection{The algorithm CASToRe}\label{castore}

334: We have created and implemented a particular compression algorithm we

335: called CASToRe which is a modification of the Lempel-Ziv compression

336: schemes $LZ77$ and $LZ78$ (\cite{lz77}, \cite{lz78}) and it has been

337: introduced and studied in references \cite{CSF02} and

338: \cite{menconi}. Its theoretical advantages with respect to LZ78 showed

339: that this algorithm is a sensitive measure of the Information content

340: of low entropy sequences. This is the reason that motivates the choice

341: of the acronym \textbf{CASToRe} to name the new algorithm: its meaning

342: is \textbf{C}ompression \textbf{A}lgorithm, \textbf{ S}ensitive

343: \textbf{To} \textbf{Re}gularity. As it has been proved in

344: \cite{menconi}, the Information content $I_Z$ of a constant sequence

345: $s^n$, originally with length $n$, is $\Psi(n)=4+2\log (n+1)[\log (\log

346: (n+1))-1]$, if the algorithm $Z$ is CASToRe. The theory predicts that

347: the best possible information content for a constant sequence of

348: length $n$ is $AIC(s^n) =\log (n) + $constant. It may be shown that

349: the algorithm $LZ78$ encodes a constant $n$-digits long sequence to a

350: string with length about $const\ +\ n^{\frac 1 2}$ bits; so, we cannot

351: expect that $LZ78$ is able to distinguish a sequence whose information

352: content grows like $n^{\alpha}$ ($\alpha < \frac 1 2$) from a constant

353: or periodic string. Furthermore, the running time of CASToRe is also

354: considerably shorter than that of $LZ77$ (with infinite window), so its

355: implementation is more efficient. These are the main reasons that

356: motivate the choice of using CASToRe also for numerical experiments.

357:

358: Now we briefly describe the internal running of CASToRe.

359:

360: As the Ziv-Lempel schemes, the algorithm CASToRe is based on an

361: adaptive dictionary (\cite{bell}). One of the basic differences in the

362: coding procedure is that the algorithm $LZ77$ splits the input strings

363: in overlapping phrases, while the algorithm CASToRe (as well as the

364: algorithm $LZ78$) parses the input string in non-overlapping

365: phrases. Moreover, CASToRe differs from $LZ78$ because the new phrase

366: is a pair of two already parsed phrases, while $LZ78$ couples one

367: already parsed phrase and one symbol from the alphabet.

368:

369: At the beginning of encoding procedure, the dictionary contains only

370: the alphabet. In order to explain the main rules of the encoding, let

371: us consider a step $h$ within the encoding process, when the dictionary

372: already contains $h$ phrases $\{e_1,\dots,e_h\}$.

373:

374: The new phrase is defined as a pair ({\it prefix pointer},{\it suffix

375: pointer}). The two pointers are referred to two (not necessarily

376: different) phrases $\rho_p$ and $\rho_s$ chosen among the ones

377: contained in the current dictionary as follows. First, the algorithm

378: reads the input stream starting from the current position of the front

379: end, looking for the longest phrase $\rho_p$ matching the

380: stream. Then, the algorithm looks for the longest phrase $\rho_s$ such

381: that the joint word $\rho_p+ \rho_s$ matches the stream. The new

382: phrase $e_{h+1}$ that will be added to the dictionary is then

383: $e_{h+1}=\rho_p+ \rho_s$.

384:

385: The output file contains an ordered sequence of the binary encoding of

386: the pairs $(i_p,i_s)$ such that $i_p$ and $i_s$ are the dictionary

387: index numbers corresponding to the prefix word $\rho _p$ and to the

388: suffix word $\rho_s$, respectively.  The pair $(i_p,i_s)$ is referred

389: to the new encoded phrase $e_{h+1}$ and has its own index number

390: $i_{h+1}$.

391:

392: \subsubsection{Example}

393:

394: The following example shows how the algorithm CASToRe encodes the

395: input stream

396: \begin{equation*}

397: \omega =(abcababccabb\dots) .

398: \end{equation*}

399:

400: Let the source alphabet be $\mathcal{A}=\{a,b,c\}$.

401:

402: The output file corresponds to the binary encoding of the following

403: pairs contained in the second column. The first column is the

404: dictionary index number of the encoded phrase in the dictionary which

405: is shown in the same line, second column. For an easier reading, we

406: add a third column which shows each encoded phrase in the original

407: stream $\omega$, but which is not contained in the output file:

408: $$

409: \begin{array}{lll}

410: &\mbox{First, the alphabet is loaded}&\\

411: 1 & (0,\ ^{\prime}a\ ^{\prime}\ ) & [a] \\

412: 2 & (0,\ ^{\prime}b\ ^{\prime}\ ) & [b] \\

413: 3 & (0,\ ^{\prime}c\ ^{\prime}\ ) & [c] \\

414: &\mbox{Then, the encoding procedure starts}&\\

415: 4 & (1,2) & [ab] \\

416: 5 & (3,4) & [cab] \\

417: 6 & (4,3) & [abc]\\

418: 7 & (5,2) & [cabb]

419: \end{array}

420: $$

421: and so on.

422:

423: %The main difference between CASToRe and $LZ78$ is that the new phrase in

424: %CASToRe encoding is composed by two words, while in $LZ78$ encoding

425: %the new word is composed by one word and one symbol. However, there

426: %are sequences for which both algorithms give the same parsing of

427: %the input sequence. As an example, consider the string

428: %$$s=(abaaababbabb)$$ on the alphabet $\mathcal A=\{a,b\}$.

429: %We will show the two encodings at the same time.

430: %

431: %First, the alphabet is loaded by both algorithms in the same way:

432: % $$

433: %\begin{array}{lll}

434: %1 & (0,\ ^{\prime}a\ ^{\prime}\ )  \\

435: %2 & (0,\ ^{\prime}b\ ^{\prime}\ )

436: %\end{array}

437: %$$

438: %Then, the encoding procedures start:

439: %$$

440: %\begin{array}{lcc}

441: %& \mbox{CASToRe}& \quad LZ78\\

442: %3 & (1,2) & \qquad(1,\ ^{\prime}b\ ^{\prime}\ ) \\

443: %4 & (1,1) & \qquad(1,\ ^{\prime}a\ ^{\prime}\ )  \\

444: %5 & (3,1) & \qquad(3,\ ^{\prime}a\ ^{\prime}\ ) \\

445: %6 & (2,2) & \qquad(2,\ ^{\prime}b\ ^{\prime}\ ) \\

446: %7 & (3,2) & \qquad(3,\ ^{\prime}b\ ^{\prime}\ )

447: %\end{array}

448: %$$ The resulting parsing of $s$ is $\{ab,aa,aba,bb,abb\}$ and the

449: %information contents are comparable. Of course, it is reasonable that

450: %if the input sequence has length sensibly larger than the number of

451: %characters in the alphabet, the compression via the algorithm CASToRe

452: %is definitely better than that via the algorithm $LZ78$.

453: %\begin{rem*}

454: %\begin{enumerate}

455: %The CASToRe coding procedure, which pairs words already in the

456: %dictionary to create a new phrase, is similar to the procedure that can

457: %be found in the recent work \cite{grass}, which seems to be able to

458: %give a very precise entropy estimation, detecting very long range

459: %correlations in the English language.

460: %\item To our knowledge, optimality properties of the algorithm

461: %CASToRe are still unproved. Nevertheless, there is experimental

462: %evidence that the information content calculated via the algorithm

463: %CASToRe is numerically and qualitatively analogous to the information

464: %content calculated via the algorithm $LZ77$ (see Chapter \ref{chsix}).

465: %\end{enumerate}

466: %\end{rem*}

467: \subsection{Reading the dictionary}

468: The dictionary built by the algorithm CASToRe is an ordered collection

469: of phrases, that is, of pairs of words. Thus, a phrase is

470: composed by a prefix-word and a suffix-word. By construction, phrases

471: are different from each other, since the algorithm exploits a parsing

472: on the input string. Furthermore, each phrase may become a word, if it

473: appears as prefix or suffix of other phrases in the following

474: dictionary.

475:

476: In the following, we shall look at the most frequent words, at the

477: longest phrases and in some cases we shall compare the results to the

478: same analysis performed by means of the algorithm $LZ77$ and exploited

479: in collaboration with a group of physicists from the University of

480: Rome (see their previous work \cite{loreto} by V. Loreto et al. for

481: details on the methodology). We shall show that recurrent subsequences

482: occur especially along the regions with lowest information

483: content. Notice that we refer to exact repeats.

484:

485: We shall distinguish among recurrent subsequences either {\bf

486: motifs} or {\bf patterns}. A {\it motif} is a recurrent word in the

487: dictionary, whereas a {\it pattern} is a recurrent subsequence that

488: does not match any word of the dictionary, but is contained in some of

489: them. If a motif is found, we shall follow its {\it descent}, that is

490: the set of phrases of which the motif is either a prefix or a suffix or

491: both. Moreover, we shall search for the motif to be a {\bf sliding

492: pattern}, in the sense that it is contained in other phrases

493: without being either their prefix or their suffix. Furthermore, if only a

494: sliding pattern is to be found, then we shall recover its {\bf root},

495: that is the longest word of the dictionary matching part of the

496: pattern.

497: \section{The Information Content of DNA sequences}

498: We have analysed the computable complexity of 12 complete

499: genomes\footnote{The genomes have been downloaded by means of

500: the GenBank sequence libraries

501: http://www.ncbi.nlm.nih.gov/Genbank/index.html} of some Archaea,

502: Bacteria and Eukaryotes, together with chromosomes II and IV of

503: \textit{ Arabidopsis thaliana}. The complete list is shown on the

504: following Table \ref{cssh1}.

505:

506: In order to take into account the biological

507: functional constraints actually existing among the bases within the

508: genome and to highlight new features of coding and noncoding regions, we

509: have exploited a {\it fragment analysis}.

510: \begin{definition}

511: We say that any exon, intron or intergenic region is a functional

512: {\it fragment} of the genome sequence, following the prediction as it

513: has been identified via biological databases and statistical tools

514: (\cite{myers}).

515: \end{definition}

516: {\bf Notation. }In prokaryotic genomes there are two functional types,

517: therefore we shall denote by $Coding\_\#$ and $Inter\_\#$ the coding

518: and the noncoding fragments, respectively, where $\#$ is an index to

519: order fragments. In eukaryotic genomes there are three different types

520: of regions: we shall denote by $Exon\_\#$ the coding fragments and by

521: $Intron\_\#$ and $Inter\_\#$ the noncoding intragenic fragments and the

522: noncoding intergenic fragments, respectively.

523:

524: Thus, we shall consider the Computable Complexity $K(f)$

525: of each fragment and study the Information Content growth $CIC(f)$ within a

526: fragment.

527:

528: First, we have considered how the Information Content varies along

529: some complete DNA sequences: that is, we have studied the behaviour of

530: the CIC of a genome as a function of the number of encoded symbols. As

531: a result, we remark that the function $CIC(\sigma_n)$ grows linearly for all

532: the complete genomes $\sigma$ we have analysed and the asymptotic

533: slope is the value of their Computable complexity $K(\sigma)$:

534: $$CIC(\sigma_n)\ \sim\ K(\sigma)\cdot n \ ,$$ where $\sigma_n$

535: indicates the first $n$ bases in the complete genome $\sigma$. However, we can

536: enhance some regions of the genome and we will see that the $CIC$-line

537: is locally no longer straight. This characteristic feature is shared by

538: all the genomes we have analysed, both Prokaryotes and

539: Eukaryotes and confirms the intuitive idea that the Information

540: Content growth should be slower in the parts of the genome where some

541: regularity prevails.

542:

543: \begin{figure}[hb]

544: \begin{tabular}{lr}

545: \raggedright{(a)\psfig{figure=globus_tot.ps,width=6.5cm,angle=270}} &

546: \raggedleft {(b)\psfig{figure=globus_totIngra.ps,width=6.5cm,angle=270}}

547: \end{tabular}

548: \caption{\it (a) complete $CIC(n)$ graph for {\it Archaeoglobus

549: fulgidus} complete genome; (b) local enhancement of the region from 380000 to

550:  410000 bp. The behaviour of $CIC(n)$ is no longer linear.}\label{cfrTot}

551: \end{figure}

552:

553: For instance, see the results about the genome of {\it Archaeoglobus

554: fulgidus} (Prokaryote) which are pictured on figure \ref{cfrTot}. For

555: the sake of brevity, we shall not show analogous pictures coming from

556: other genomes.

557:

558: \begin{table}\begin{center}

559: \begin{tabular}{|c|c|c|}

560: \hline

561: \textbf{Genome} & \it{CSS}&{\bf$H_1$} \\ \hline\hline

562: \textit{Methanococcus jannaschii} & 1.794&1.887 \\ \hline%11

563: \textit{Archaeoglobus fulgidus} & 1.909&1.987 \\ \hline%39

564: \textit{Methanobacterium thermoautotrophicum} & 1.907 &1.986\\ \hline%07

565: \textit{Pyrococcus abyssi} & 1.901&1.979 \\ \hline\hline%68

566: \textit{Aquifex aeolicus} & 1.883 &1.976\\ \hline%82

567: \textit{Escherichia coli} & 1.893 &1.987\\ \hline%03

568: \textit{Bacillus subtilis} & 1.870 &1.975\\ \hline%38

569: \textit{Haemophilus influenzae} & 1.866 &1.947\\ \hline%89

570: \textit{Mycoplasma genitalium} & 1.848 &1.959\\ \hline%45

571: %\textit{Rickettsia prowazekii} & 1.823 &1.795\\ \hline%46

572: \textit{Thermotoga maritima} & 1.893 &1.984\\ \hline\hline%42

573: \textit{Arabidopsis thaliana} (chr. II and IV) & 1.892&1.938 \\ \hline%76

574: \textit{Saccharomyces cerevisiae} & 1.889 &1.949\\ \hline%27

575: \textit{Caenorhabditis elegans} & 1.777&1.936 \\ \hline%81

576: \end{tabular}

577: \caption{\it complete genomes. Comparison CSS

578: vs. $H_1$.}\label{cssh1}\end{center}

579: \end{table}

580:

581: As for the values of computable complexity $K$ for the

582: complete genomes we have analysed, the results are shown in Table

583: \ref{cssh1}. We have indicated the complexity $K$ as $CSS$, meaning

584: {\it complexity as a single string}, to distinguish it from the

585: fragment complexity, which is the value of the computable complexity

586: of the functional fragments within the complete genome and which will

587: be denoted by $FC$ in the following. The final column in Table

588: \ref{cssh1} shows the first order entropy $H_1$ of the sequence. If

589: $p_A,\ p_C,\ p_G,\ p_T$ are the nucleotide frequencies over a genome

590: $\sigma$ (the frequency is calculated as the number of occurrences of

591: a specific nucleotide over the total number of nucleotides), then the

592: first order entropy is $H_1=-\sum_{i=A,C,G,T}p_i\log_2 p_i$. We recall

593: that, when the symbols are drawn uniformly at random from the source

594: and all the positions in the sequence are independent from each other,

595: an optimal coding procedure will devote $\log _{2}(\#\mathcal{A})$

596: bits per symbol to represent each character (\cite{coverthomas}),

597: where $\#\mathcal{A}$ is the number of symbols in the alphabet

598: $\mathcal{A}$. In this case the asymptotically maximal complexity

599: equals the $H_1$ value for those values of nucleotide frequencies. For

600: quaternary sequences, like the genomes, this maximal mean first order

601: entropy is 2 bits per symbol. Since the $H_1$ value represents a

602: quantity of information of a single string which is dependent on the

603: probability measure on the space of sequences, at first sight the

604: genomes cannot be considered randomly distributed (from a statistical

605: point of view), because for all of them the $H_1$ values are different

606: from 2 bits per symbol. First, we notice that the values of the

607: complexity $CSS$ are significantly different from 2 and lower than the

608: $H_1$ entropy values. Again, this is in complete agreement with the

609: fact that the randomness of the genomes has strong constraints. It is

610: also possible to clearly recognise that some genomes have very low

611: computable complexity (smaller than 1.90 bits per symbol), which means

612: that their internal structure presents mid-range and long-range

613: correlations.

614:

615: The compression of complete genomes does not satisfy the quest for

616: local structures along a genome. The presence of local nonlinearities

617: in the Information Content function for complete genomes suggests the

618: existence of specific functional fragments whose Information Content

619: function grows sublinearly. We recall that we named those regions

620: atypical. Consequently, we shall investigate in this direction by

621: means of the fragment analysis.

622:

623: \subsection{A sublinearity index}

624: In order to identify the regions where the growth of the function

625: $CIC(\sigma_n)$ is sublinear, we define a sublinearity index, that allows us

626: to determine whether a functional region is atypical.

627:

628: In the following, $\sigma$ shall denote any fragment within a

629: genome. The sublinearity index may be defined by means of any

630: adaptive compression algorithm $Z$, although the experimental results are

631: referred to the algorithm CASToRe.

632:

633: Let $N=|\sigma|$ be the length of the input sequence $\sigma$. Let

634: $\mathcal{P}(\sigma,Z)$ be the parsing of $\sigma$ with respect to the

635: algorithm $Z$:

636: $\mathcal{P}(\sigma,Z)=\{\phi_1,\phi_2,\dots,\phi_t\}$. Therefore, the

637: input string $\sigma$ is the ordered juxtaposition of phrases $\phi

638: _j$'s. We use the symbol $n_k$ to indicate the current total number of

639: encoded symbols up to step $k$ of the encoding procedure:

640: $n_k=\sum_{j=1}^{k}|\phi _j|$. Due to the fact that

641: $|\phi_k|=n_k-n_{k-1}$, we say that $n_k$ is the parsing index

642: corresponding to the phrase $\phi _k$. The Information Content after

643: $k$ steps is then the quantity

644: $I(n_k)=\sum_{j=1}^{k}I(\phi_j)$. Obviously, it holds that

645: $n_t=\sum_{j=1}^{t}|\phi _j|=N$ and

646: $I(\sigma)=I(N)=\sum_{j=1}^{t}I(\phi_j)$. Since the encoding

647: procedure might be not precise in the early steps as well as in the

648: final steps, we fix two bounds defining the restriction of the

649: potential integer value $n_j$. Let $T_{inf}=20\% |\sigma|$ be the

650: lower bound and $T_{sup}=90\%|\sigma|$ be the upper bound. The choice

651: of the bounds will be such that there exist two parsing indexes

652: $n_{inf}$ and $n_{sup}$ such that $T_{inf}\leq n_{inf}<n_{sup}\leq

653: T_{sup}$. Moreover, since the algorithm $Z$ requires that the input

654: sequence is sufficiently long to make the compression reliable and

655: efficient, we shall not analyse sequences whose length $N$ is lower

656: than $200$ symbols. Thus, for the set $\{n_j \mid j=1,\dots,t\}$

657: coming from the parsing of $\sigma$ via the algorithm $Z$, we define

658: the domain $\mathcal{D}=\{n_k \mid n_{inf}\leq n_k\leq n_{sup}\ ,\

659: n_t\geq 200\}$.

660:

661: \begin{definition}[Sublinearity index of a finite symbol sequence]\label{sublinind}

662:

663: $\qquad$\\{\it Let $q_{min}$, $q_{max}$ and $q_Z(\sigma)$ be defined

664: as follows:

665: \begin{equation*}

666: q_{min}=\min\limits_{n_k\in\mathcal{D}}\left\{\frac{I(n_k)}{n_k}\right\}\ ,

667: \end{equation*}

668: \begin{equation*}

669: q_{max}=\max\limits_{n_k\in\mathcal{D}}\left\{\frac{I(n_k)}{n_k}\right\}

670: \end{equation*}

671: and

672: \begin{equation*}

673: q_{_Z}(\sigma)=\frac{q_{min}}{q_{max}}\ .

674: \end{equation*}

675: The sublinearity index $\mathcal{G}_{_Z}(\sigma)$ of the input sequence

676: $\sigma$ with respect to the parsing defined via the algorithm $Z$ is

677: the quantity

678: \begin{equation}\label{gi}\index{$\mathcal{G}_{_Z}$, sublinearity index}

679: \mathcal{G}_{_Z}(\sigma)=\frac{\log(q_{_Z}(\sigma))}{\log(\frac

680: {n_{sup}}{n_{inf}})}+1\ .

681: \end{equation}}

682: \end{definition}

683:

684: The definition of this index $\mathcal{G}_{_Z}$ deserves some

685: comments. Its main characteristic is that it allows a criterion to

686: identify atypical regions to be established.

687:

688: First of all, it is known that the behaviour of the Information

689: Content of a finite sequence $\sigma$ is an increasing function

690: $I(\sigma^n)$ that grows at most linearly with the number $n$ of

691: encoded symbols. Therefore, the indexes $q_{min}$ and $q_{max}$ can be

692: easily calculated by:

693: $$q_{min}=\frac{I(n_{sup})}{n_{sup}}\ \ \mbox{and}\ \

694: q_{max}=\frac{I(n_{inf})}{n_{inf}}\ .$$

695: Hence, it is straightforward that the value of the sublinearity index is

696: \begin{equation}\label{utileG}

697: \mathcal{G}_{_Z}(\sigma)=\frac{\log(I(n_{sup}))-\log(I(n_{inf}))}

698: {\log(n_{sup})-\log(n_{inf})}\ .

699: \end{equation}

700:

701: We notice that the fragments we have analysed are not periodic,

702: otherwise the phrases found in the parsing by the algorithm CASToRe

703: would definitely show length doubling, which is absent in the

704: dictionaries of all the fragments. Furthermore, the Information

705: Content growth of any functional fragment $\sigma$ can not be a

706: logarithmic function $\Psi(n)$ (see Section \ref{castore}), but we

707: might assume that it can be read ($\forall\ 1\leq n\leq |\sigma|$) as

708: \begin{equation}

709: CIC(\sigma_n)=\mathcal O(Cn^{\gamma})\ ,\mbox{ with

710: exponent $0<\gamma\leq 1$ and constant $C>0$}\ .

711: \label{infoPo}

712: \end{equation}

713:

714: Note that this formula is relative to a finite sequence, therefore the

715: writing $\mathcal O(Cn^{\gamma})$ is not referring to an asymptotic

716: behaviour (as $n\leq |\sigma|$), but it means that the integer

717: function $CIC(\sigma_n)$ is fitted by a function whose do\-mi\-nant term is

718: a power law with exponent not greater than 1. Since we have excluded any

719: pure periodicity, hypothesis (\ref{infoPo}) is doubtless

720: plausible.

721:

722: %As we have already pointed out in the previous chapters, the behaviour

723: %of the information content is a discriminant characteristic of

724: %different dynamical behaviours: if the symbol sequence is a symbolic

725: %orbit drawn from a chaotic dynamical system whose entropy is positive,

726: %then the Information content grows linearly (i.e. following equation

727: %(\ref{infoPo}) with $\alpha=1$). Moreover, an example of dynamical

728: %systems whose symbolic orbits have mean Information content growing as

729: %a power law (following equation (\ref{infoPo}) with $0<\alpha<1$) is

730: %given by the Manneville-Pomeau family with the driving parameter

731: %$z>2$. In that case, if the compression algorithm is $LZ77$, Theorem

732: %\ref{tman77} states that the exepctation value of the Information

733: %content of an orbit grows as $I(n)=n^{\frac 1 {z-1}}$. Of course, the

734: %relationship between the kind of growth of Information content and the

735: %dynamical type of the system generating the sequence is not

736: %one-to-one. For instance, both when the dynamics is periodic and when

737: %the orbit is a trajectory coming from the logistic map at the

738: %Feigenbaum point, the information grows logarithmically (see Chapter

739: %\ref{chsix}, Section \ref{sequa}); nevertheless, the two types of

740: %dynamics are far from being similar, because the one is ordered, the

741: %latter is weakly chaotic.

742:

743: The two following main points are definitely true. First, a

744: sublinear growth of Information Content is an indicator of the

745: presence of some regularity in the input sequence and this is much

746: more evident when the index $\mathcal{G}_{_Z}$ is significantly

747: smaller than 1. Second, small values of the index $\mathcal{G}_{_Z}$

748: may correspond to different sublinear information growths $-$ also

749: other than power-law-like $-$ that consequently might be a signal of

750: different underlying dynamics generating the symbol sequences.

751:

752: In the following Lemma, the sublinearity index $\mathcal{G}_{_Z}$ in

753: the case of Information Content growing exactly as a power law is evaluated.

754: \begin{lemma}

755: If $CIC(n_k)=C {n_k} ^\gamma$ with $0<\gamma\leq 1$, then

756: $\mathcal{G}_{_Z}=\gamma$.\end{lemma}

757: \begin{proof}

758:  Consider the formula (\ref{utileG}).  In this case, it holds that

759: $$\mathcal{G}_{_Z}=\frac{\log (C)+\gamma\log(n_{sup})-\log

760: (C)-\gamma\log(n_{inf})}{\log(n_{sup})-\log (n_{inf})}\ .$$ Therefore,

761: the conclusion is straightforward.\end{proof}

762:

763: Thus, according to formula (\ref{infoPo}), the sublinearity index

764: $\mathcal{G}_{_Z}$ is a reliable quantity that allows the degree of

765: sublinearity of the information content growth to be estimated. In

766: order to evaluate the precision of the index $\mathcal{G}_{_Z}$ with

767: respect to the {\it true} actual exponent $\gamma$, we have compared

768: the values of $\mathcal{G}_{_Z}$ with the values of $\gamma$ as they

769: are given by a numerical fit on the integer function $I(n)$. The

770: results are definitely satisfactory. Some examples are shown on

771: Table \ref{tabellalfa} and are referred to several fragments from the

772: genomes of {\it Archaeoglobus fulgidus}, {\it Escherichia coli} and

773: {\it Arabidopsis thaliana}.

774:

775: \begin{table}\begin{center}

776: \begin{tabular}{|c|c|c|c|}

777: \hline

778: \mbox{Genome}&\mbox{Sequence}&\mbox{value of }$\mathcal{G}_{_Z}$

779: &\mbox{fit-value of }$\gamma$\\

780: \hline\hline

781: $Archaeoglobus\ fulgidus$&$Coding\_685495$&0.965&1.000\\

782: \hline

783: $Archaeoglobus\ fulgidus$&$Inter\_1143603$&0.949&0.949\\

784: \hline

785: $Archaeoglobus\ fulgidus$&$Inter\_393196$&0.832&0.831\\

786: \hline\hline

787: $Escherichia\  coli$&$Inter\_2302612$&0.768&0.747\\

788: \hline

789: $Escherichia\  coli$&$Inter\_4293752$&0.728&0.730\\

790: \hline

791: $Escherichia\  coli$&$Coding\_91419$&0.986&0.986\\

792: \hline\hline

793: $Arabidopsis\ thaliana$&$Exon\_23950656$&0.614&0.585\\

794: \hline

795: $Arabidopsis\ thaliana$&$Intron\_5063613$&0.767&0.738\\

796: \hline

797: $Arabidopsis\ thaliana$&$Inter\_19660110$&0.887&0.886\\

798: \hline

799: \end{tabular}

800: \caption{\it reliability of the sublinearity index $\mathcal{G}_{_Z}$ in the

801: case of several functional regions from different genomes.}\label{tabellalfa}

802: \end{center}

803: \end{table}

804:

805: The following definition will be used to extract the atypical

806: functional regions. The threshold has been fixed according to the

807: empirical principle that the kind of growth $n^\gamma$ where $\gamma$

808: lies in $[0.9,1]$ is, on a general basis, equivalent to a linear

809: growth, due to the finiteness of the sequences under analysis.

810:

811: \begin{definition}[Atypical region]\label{atypical}

812: An atypical region within a genome is any functional region whose

813: sublinearity index $\mathcal{G}_{_Z}$ is smaller than 0.9.

814: \end{definition}

815:

816: \begin{figure}

817: \centerline{\psfig{figure=globusUPS_393196.ps,width=8cm,angle=270}}

818: \caption{\it Archaeoglobus fulgidus genome. The behaviour of the

819: information content of region $Inter\_393196$ is a power law whose

820: exponent is 0.832. The picture is in linear scale.}\label{regLow}

821: \end{figure}

822: \begin{figure}

823: \centerline{\psfig{figure=GKglobus.ps,width=8cm,angle=270}}

824: \caption{\it Archaeoglobus fulgidus genome. Comparison between the

825: values of sublinearity index and fragment complexity of all functional

826: regions with length greater than 200 bp. The crosses ($+$) are referred

827: to coding regions, while the diamonds ($\diamond$) are referred to

828: intergenic regions. The vertical line is the threshold for the

829: sublinearity index, under which the region is atypical.}\label{cfrGK}

830: \end{figure}

831:

832: The connection between sublinearity index and fragment complexity is

833: not precise, even if in the extreme cases where both values are either

834: high or low a sort of clusters are detected. For instance, Figure

835: \ref{cfrGK} illustrates what the relation is between the sublinearity

836: index (horizontal axis) and the fragment complexity (vertical axis) in

837: the case of the genome of {\it Archaeoglobus fulgidus}. Atypical

838: regions are indicated by means of a vertical line that represents the

839: threshold for the sublinearity index as introduced in Definition

840: \ref{atypical}. It is clear that, both in the case of coding regions

841: (depicted by a cross) and of noncoding regions (depicted by a

842: diamond), the higher the fragment complexity is, the higher the

843: sublinearity index is.  Furthermore, the detection of atypical regions

844: with high fragment complexity suggests that the sublinearity index may

845: be more meaningful in identifying regularity of sequences than the

846: fragment complexity.

847: \section{Experimental results}

848: In the following, we shall introduce some preliminary examples of

849: application of the $CIC$ method. We shall analyse the dictionary of

850: some long atypical regions within the genomes of {\it Archaeoglobus

851: fulgidus}, {\it Methanococcus jannaschii} and {\it Arabidopsis

852: thaliana}. We shall discover peculiar properties and propose

853: some biological motivations to those features. This part of the work

854: has been developed in collaboration with the Animal Biology and Genetics

855: Department of the University of Florence.

856:

857: \subsection{Archaeoglobus fulgidus}

858:

859: {\it Archaeoglobus fulgidus} is a sulphur-metabolizing anaerobic

860: organism. It belongs to the Archaeoglobales, archaeal sulfate reducers

861: unrelated to other sulfate reducers. They grow at extremely high

862: temperatures. Archaeoglobus species cause corrosion of iron and steel

863: in oil and gas processing systems by the production of iron

864: sulphide. This organism has one circular chromosome.

865: \begin{figure}

866: \centerline{\psfig{figure=LGglobus.ps,width=8cm,angle=270}}

867: \caption{\it Archaeoglobus fulgidus genome. Of each functional region,

868:   its length and the corresponding sublinearity index are

869:   plotted. The crosses ($+$) are referred to coding regions,

870:   while the squares ($\square$) are referred to intergenic regions.

871:   The horizontal line is the threshold for the sublinearity index, under

872:   which the region is atypical.}\label{cfrLGglobus}

873: \end{figure}

874:

875: Looking at Figure \ref{cfrLGglobus}, we have extracted three regions:

876: one atypical region, which is noncoding, and two non-atypical regions,

877: one coding and one noncoding. This choice is aimed at comparing the

878: dictionaries of regions with sublinear growth of information to the

879: dictionaries of regions with li\-near growth of information.

880:

881: The exemplified regions are

882: \begin{itemize}

883: \item $Coding\_685495$: non-atypical region, length $L=2300\ bp$,

884:   sublinearity index $\mathcal{G}_{_Z}=0.965$, fragment complexity $K=

885:   2.108$;

886: \item $Inter\_1143603$: non-atypical region, length $L=2219\ bp$,

887:  sublinearity index $\mathcal{G}_{_Z}=0.949$, fragment complexity $K= 2.117$;

888: \item $Inter\_393196$: atypical region, length $L=2629\ bp$,

889:   sublinearity index \linebreak$\mathcal{G}_{_Z}=0.832$, fragment

890:   complexity $K= 1.494$.

891: \end{itemize}

892: We start by analysing the non-atypical regions.

893: \begin{figure}

894: \begin{tabular}{lr}

895: \raggedright{(a)\psfig{figure=globusCOD_685495len.ps,width=6.5cm,angle=270}}

896: &\raggedleft{(b)\psfig{figure=globusUPS_1143603len.ps,width=6.5cm,angle=270}}

897: \end{tabular}

898: \begin{tabular}{lr}

899: \raggedright{(c)\psfig{figure=globusCOD_685495_stalen.ps,width=6.5cm,angle=270}}

900: &\raggedleft{(d)\psfig{figure=globusUPS_1143603_stalen.ps,width=6.5cm,angle=270}}

901: \end{tabular}

902: \caption{\it Archaeoglobus fulgidus genome. Plots (a) and (b) show the

903:   location and length of the phrases in the parsing by the algorithm

904:   CASToRe, in non-atypical regions $Coding\_685495$ and

905:   $Inter\_1143603$, respectively. Graphs (c) and (d) illustrate the

906:   distribution of phrase length in the same

907:   regions.}\label{globNonatyp}

908: \end{figure}

909: First, we have plotted the length of the phrases in the dictionary

910: together with their position in the input sequence (see Figure

911: \ref{globNonatyp} $(a)$ and $(b)$). In both non-atypical regions, the

912: phrases are short and the maximal length is 11 bp. The Gaussian

913: distribution of phrase length confirms that these regions are not

914: regular, but highly variable (see Figure \ref{globNonatyp} $(c)$ and

915: $(d)$). The extent of the dictionary is great in both non-atypical

916: regions: 415 phrases in the dictionary of region $Coding\_685495$ and

917: 393 phrases in the dictionary of region $Inter\_1143603$.

918:

919: However, in the case of region $Coding\_685495$, the algorithm CASToRe

920: recognised 31 codons as phrases that are also used as prefix or suffix

921: words quite frequently. Table \ref{globcodon} illustrates the details

922: of this feature that has been found only in coding regions; in fact,

923: in non-atypical noncoding regions the codons that are recognised as

924: phrases are always a few.

925: \begin{table}\begin{center}

926: \begin{tabular}{|c|c|c||c|c|c|}

927: \hline

928: \mbox{Codon}&\mbox{$\#$ prefix}&\mbox{$\#$ suffix}&\mbox{Codon}&\mbox{$\#$ prefix}&\mbox{$\#$ suffix}\\

929: \hline\hline

930: AAA&10&4&CTG&8&7\\

931: \hline

932: AAG&1&8&GCA&3&2\\

933: \hline

934: AAT&4&1&GCC&8&6\\

935: \hline

936: ACA&10&4&GCT&5&3\\

937: \hline

938: ACC&4&7&GGA&2&4\\

939: \hline

940: ACG&4&1&GGT&2&2\\

941: \hline

942: ACT&7&6&TAA&5&8\\

943: \hline

944: ATG&3&0&TAG&0&1\\

945: \hline

946: ATT&4 &8&TAT&2&4\\

947: \hline

948: CAA&14&10&TCA&6&6\\

949: \hline

950: CAG&3&7&TCC&8&2\\

951: \hline

952: CAT&0&0&TCT&7&5\\

953: \hline

954: CCG&3&1&TGT&2&6\\

955: \hline

956: CCT&2&3&TTA&5&7\\

957: \hline

958: CGG&0&0&TTG&0&4\\

959: \hline

960: CGT&5&7& & &\\

961: \hline

962: \end{tabular}

963: \caption{\it 31 different codons have been recognised as phrases in the

964:   parsing by the algorithm CASToRe, in region $Coding\_685495$ of the

965:   genome of {\it Archaeoglobus fulgidus}. Some of them have been also

966:   used as prefix or suffix of other phrases. Columns named $\#\

967:   prefix$ and $\#\ suffix$ indicate how many times the phrase has been

968:   used as a prefix or suffix.}\label{globcodon}

969: \end{center}

970: \end{table}

971: \begin{figure}

972: \begin{tabular}{lr}

973: \raggedright{(a)\psfig{figure=globusUPS_393196len.ps,width=6.5cm,angle=270}}&

974: \raggedleft{(b)\psfig{figure=globusUPS_393196_stalen.ps,width=6.5cm,angle=270}}

975: \end{tabular}

976: \caption{\it Archaeoglobus fulgidus genome. Plot (a) shows

977:   the location and length of the phrases in the parsing by the

978:   algorithm CASToRe, in atypical region $Inter\_393196$. Graph (b)

979:   illustrates the distribution of phrase length in the same

980:   region.}\label{globatyp}

981: \end{figure}

982:

983: Conversely, the dictionary relative to fragment $Inter\_393196$, which

984: is atypical noncoding, shows completely different

985: characteristics. First of all, the dictionary contains 349

986: phrases. Moreover, Figure \ref{globatyp} $(a)$ shows that in this

987: sequence there should be recurrences of similar patterns, because of

988: the several long phrases (that is, longer than 25 bp) that are spread

989: along the whole sequence. Another feature, which will be paradigmatic

990: of atypical regions, is the anomalous (non-Gaussian) tail in the

991: distribution of phrase length (see Figure \ref{globatyp} $(b)$). The

992: distribution is no longer peaked at only one value, but there is a

993: significant occurrence of long words that could not be found in

994: non-atypical regions and is consistent with the presence of regularity

995: within any atypical region.

996:

997: According to the dictionary obtained by means of algorithm CASToRe,

998: there is a dominant motif $\mathcal M$ of length 25 bp (phrase

999: nr. $109$ in the dictionary), that is also used 9 times as a prefix

1000: and 3 times as a suffix.  Table \ref{tabMglobus} illustrates what the

1001: dominant motif $\mathcal M$ and its descent are. We recall that the

1002: descent of a phrase $\phi$ is the set of other phrases in the

1003: dictionary such that $\phi$ is either their prefix or suffix or both.

1004: \begin{table}\begin{center}

1005: \begin{tabular}{c}

1006: $\mathcal M=$AATCCCATTTTGGTCTGATTTCAAC\\

1007: Descent of $\mathcal M\  :$\\

1008: {\bf AATCCCATTTTGGTCTGATTTCAAC}ACA\\

1009: {\bf AATCCCATTTTGGTCTGATTTCAAC}AG\\

1010: {\bf AATCCCATTTTGGTCTGATTTCAAC}CAA\\

1011: {\bf AATCCCATTTTGGTCTGATTTCAAC}CT\\

1012: {\bf AATCCCATTTTGGTCTGATTTCAAC}GA\\

1013: {\bf AATCCCATTTTGGTCTGATTTCAAC}GT\\

1014: {\bf AATCCCATTTTGGTCTGATTTCAAC}TATTT\\

1015: {\bf AATCCCATTTTGGTCTGATTTCAAC}TT\\

1016: {\bf AATCCCATTTTGGTCTGATTTCAAC}TTTC\\

1017: CCCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\

1018: CTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\

1019: TTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\

1020: \end{tabular}

1021: \caption{\it dominant motif $\mathcal M$ and its descent in atypical

1022:   region $Inter\_393196$ of the genome of Archaeoglobus fulgidus.}

1023: \label{tabMglobus}

1024: \end{center}

1025: \end{table}

1026:

1027: The presence of a dominant motif partially motivates the many

1028: oscillations in the $CIC$ growth, as depicted in Figure \ref{regLow}.

1029: Furthermore, a complete explanation lies in the fact that the motif

1030: $\mathcal M$ is also a sliding pattern in many other phrases (see

1031: Table \ref{slidinglobus}). This is

1032: irrefutable evidence of the fact that this atypical region shows a

1033: {\it variable periodicity} represented by the recurrence of the

1034: motif $\mathcal M$ sometimes slightly modified, as in the case of

1035: approximate repeats.

1036:

1037: Even if the biological usefulness of the motif $\mathcal M$ is still

1038: unknown, another hint to its peculiarity is provided by the

1039: compression of region $Inter\_393196$ by means of algorithm

1040: $LZ77$. The motif $\mathcal M$ is a motif also in the dictionary

1041: extracted by $LZ77$. Therefore, the idea that this motif should have a

1042: precise biological meaning is even more convincing. Furthermore, this

1043: example suggests that also approximate repeats generated by insertions may

1044: be identified via the $CIC$ method.

1045: \begin{table}\begin{center}

1046: \begin{tabular}{c}

1047: $\mathcal M=$AATCCCATTTTGGTCTGATTTCAAC\\

1048: Motif as a sliding pattern in:\\

1049: TTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\

1050: {\bf AATCCCATTTTGGTCTGATTTCAAC}GAAG\\

1051: {\bf AATCCCATTTTGGTCTGATTTCAAC}CTCC\\

1052: {\bf AATCCCATTTTGGTCTGATTTCAAC}TATTT\\

1053: CTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\

1054: CCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\

1055: TCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\

1056: TTC{\bf AATCCCATTTTGGTCTGATTTCAAC}TCC\\

1057: TTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}CTT\\

1058: CGCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\

1059: {\bf AATCCCATTTTGGTCTGATTTCAAC}GAGGCGT\\

1060: CCCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\

1061: CTCCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\

1062: CCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}TA\\

1063: ACTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}AG\\

1064: TTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}TTTA\\

1065: CTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}ATC\\

1066: GTCTCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\

1067: CACGCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}\\

1068: ACCCCTTTC{\bf AATCCCATTTTGGTCTGATTTCAAC}

1069: \end{tabular}

1070: \caption{\it phrases where the motif $\mathcal M$ is a sliding

1071:   pattern. The motif is written bold typed.}

1072: \label{slidinglobus}

1073: \end{center}

1074: \end{table}

1075: \subsection{Methanococcus jannaschii}

1076: \begin{figure}

1077: \centerline{\psfig{figure=LGmjann.ps,width=8cm,angle=270}}

1078: \caption{\it Methanococcus jannaschii genome. Of each functional

1079:   region, its length and the corresponding sublinearity index are

1080:   plotted. The crosses ($+$) are referred to coding regions, while the

1081:   squares ($\square$) are referred to intergenic regions.  The

1082:   horizontal line is the threshold for the sublinearity index, under

1083:   which the region is atypical. }\label{LGmjann}

1084: \end{figure}

1085:

1086: {\it Methanococcus jannaschii} is a thermophilic (48--94\,$^\circ$C),

1087: strictly anaerobic Archaebacterium living at pressures of over 200

1088: atmospheres. It is an autotroph which gets its energy from hydrogen

1089: and carbon dioxide producing methane and it is capable of nitrogen

1090: fixation. Morphologically, it is characterized by having two bundles

1091: of flagella at the same cellular pole. The genome of {\it

1092: Methanococcus jannaschii} consists of the main circular chromosome and

1093: two circular extrachromosomal elements (ECE), one large and one

1094: small. We have analysed only the main chromosome.

1095:

1096: In this genome we shall show one atypical region, whose sublinearity

1097: index is particularly low and having approximately the same extent as

1098: the other regions that have been already analysed. However, as it

1099: is shown on Figure \ref{LGmjann}, this genome presents many other long

1100: atypical regions, that will be studied in future work.

1101:

1102: The atypical region we have analysed is

1103: \begin{itemize}

1104: \item $Inter\_236189$: atypical region, length $L=2112\ bp$,

1105:   sublinearity index \linebreak$\mathcal{G}_{_Z}=0.707$, fragment

1106:   complexity $K= 1.405$.

1107: \end{itemize}

1108: \begin{figure}

1109: \centerline{\psfig{figure=mjannUPS_236189.ps,width=8cm,angle=270}}

1110: \caption{\it Methanococcus jannaschii genome. The behaviour of the

1111:   information content of region $Inter\_236189$ grows sublinearly with

1112:   index 0.707. The picture is in linear scale.}\label{mjannLow}

1113: \end{figure}

1114: The behaviour of the information content in atypical region

1115: $Inter\_236189$ is twofold: until the first 1500 base pairs have been

1116: encoded, the growth is almost logarithmic, while in the final part the

1117: $CIC$ increase is faster (see Figure \ref{mjannLow}). Therefore, the

1118: first part of the sequence should be more regular than the second one.

1119:

1120: \begin{figure}

1121: \begin{tabular}{lr}

1122: \raggedright{(a)\psfig{figure=mjannUPS_236189len.ps,width=6.5cm,angle=270}}&

1123: \raggedleft{(b)\psfig{figure=mjannUPS_236189_BISstalen.ps,width=6.5cm,angle=270}}

1124: \end{tabular}

1125: \caption{\it Methanococcus jannaschii genome. Plot $(a)$ shows

1126:   location and length of the phrases in the parsing by the algorithm

1127:   CASToRe of region $Inter\_236189$. In graph (b) the corresponding

1128:   distribution of phrase length is pictured.}\label{mjannparole}

1129: \end{figure}

1130:

1131: This aspect is well-represented in graph $(a)$ of Figure

1132: \ref{mjannparole}. The presence of longer and longer phrases before

1133: 1500 bp have been compressed is evidence of the existence of

1134: highly repetitive subsequences in the first half, whereas in the

1135: second half of the input sequence $Inter\_236189$ the previous

1136: regularity is broken and only brief repetitions can be

1137: found. Consequently, the extent of the dictionary is low: there are

1138: only 264 phrases.

1139:

1140: As in the case of the analysed atypical region of genome of {\it

1141:   Archaeoglobus fulgidus}, the distribution of phrase length has an

1142: anomalous non-Gaussian tail that also includes a phrase that is 134

1143:   bp long (Figure \ref{mjannparole} $(b)$).

1144:

1145: \begin{table}\begin{center}

1146: \begin{tabular}{|l|}

1147: \hline

1148: AATTAAAATCAGACCGTTTCGGAATGGAAAT\\

1149: \hline

1150: AGACCGTTTCGGAATGGAAAT\\

1151: \hline

1152: AGACCGTTTCGGAATGGAAATGAT\\

1153: \hline

1154: AGGGAACCCTAAAAAGGTTC\\

1155: \hline

1156: AGGGAACCCTAAAAAGGTTCCCTTGAGGGTT\\

1157: \hline

1158: AGGGAACCCTAAAAAGGTTCCCTTGAGGGTTCATTAAAATCAGACCGTT\\

1159:                                        TCGGAATGGAAATCTGTT\\

1160: \hline

1161: AGGGAACCCTAAAAAGGTTCCCTTGAGGGTTCATTAAAATCAGACCGTT\\

1162: TCGGAATGGAAATCTGTTAGGGAACCCTAAAAAGGTTCCCTTGAGGGTT\\

1163: CATTAAAATCAGACCGTTTCGGAATGGAAATCTGTT\\

1164: \hline

1165: ATTAAAATCAGACCGTTTCGGAATGGAAATGATT\\

1166: \hline

1167: CATTAAAATCAGACCGTTTCGGAATGGAAATTC\\

1168: \hline

1169: CATTAAAATCAGACCGTTTCGGAATGGAAATCTGTT\\

1170: \hline

1171: CCTTGAGGGTTCATTAAAATCAGACCGTTTCGGAATGGAAATCTGTT\\

1172: \hline

1173: GTATTAAAATCAGACCGTTTCGGAAT\\

1174: \hline

1175: GTTTCGGAATGGAAATCTGTT\\

1176: \hline

1177: GTTTCGGAATGGAAATGAAT\\

1178: \hline

1179: GTTTCGGAATGGAAATGATT\\

1180: \hline

1181: GTTTCGGAATGGAAATTTTT\\

1182: \hline

1183: TAAAATCAGACCGTTTCGGAAT\\

1184: \hline

1185: TAAAATCAGACCGTTTCGGAATGGAAAT\\

1186: \hline

1187: TAAAATCAGACCGTTTCGGAATGGG\\

1188: \hline

1189: \end{tabular}

1190: \caption{\it Methanococcus jannaschii genome. Phrases longer than 20

1191:   bp are listed, coming from the dictionary relative to atypical

1192:   region $Inter\_236189$.}\label{tabmjann}

1193: \end{center}

1194: \end{table}

1195:

1196: For what concerns the analysis of recurrent phrases in the dictionary,

1197: it holds that only phrases that are shorter than 10 bp are used more

1198: than three times as prefix word or suffix word. As it is shown in

1199: Table \ref{tabmjann}, the phrases longer than 20 bp (that correspond

1200: to the high ``spikes'' of Figure \ref{mjannparole} $(a)$) do not allow a

1201: dominant motif to be determined in such a definite way as in the case

1202: of atypical region $Inter\_393196$ of {\it Archaeoglobus fulgidus}

1203: genome. The increasingly longer phrases that have been detected in

1204: Figure \ref{mjannparole} $(a)$ are not generated by coupling

1205: the prefix word to itself (as would have been the case if there were a precise

1206: periodicity); rather, prefix and suffix words are different from each

1207: other, nor are they consecutive. Again, the longest phrases

1208: coincide with the longest ones found by means of the algorithm $LZ77$.

1209:

1210: However, the main point of distinction of this atypical region is that

1211: all long phrases are rich in T$^n$A$^m$-patterns. This fact, together

1212: with the positive homology response, classifies this region as a promoter

1213: region containing a subregion known as {\it TATA box}. The promoter

1214: sequence could be located using the program PROSCAN Version 1.7

1215: (\cite{promoter}).

1216:

1217: The dictionary of this region provides another example of regularity

1218: in DNA sequences, different from the one coming from the genome of

1219: {\it Archaeoglobus fulgidus}.

1220:

1221: \subsection{Arabidopsis thaliana}

1222: {\it Arabidopsis thaliana} is a small flowering plant that is widely

1223: used as a model organism in plant biology. {\it Arabidopsis} is a member of

1224: the mustard (Brassicaceae) family, which includes cultivated species

1225: such as cabbage and radish. {\it Arabidopsis thaliana} is the first

1226: plant for which the complete genome has been sequenced. Its genome

1227: consists of five chromosomes, but we have analysed only chromosomes II

1228: and IV. Since the research regarding this genome is

1229: still {\it in itinere}, here we shall present some very preliminary results

1230: concerning chromosome II.

1231:

1232: The atypical regions we have analysed are

1233: \begin{itemize}

1234: \item $Coding\_8330271$: atypical region, length $L=309\ bp$,

1235:   sublinearity index $\mathcal{G}_{_Z}=0.166$, fragment

1236:   complexity $K= 1.113$.

1237: \item $Inter\_22564763$: atypical region, length $L=65849\ bp$,

1238:   sublinearity index $\mathcal{G}_{_Z}=0.589$, fragment

1239:   complexity $K= 0.911$.

1240: \end{itemize}

1241: These regions have been chosen as peculiar among the

1242: many atypical regions (see Figure \ref{lgarab}) belonging to this

1243: genome: a short and very regular coding region and a long intergenic

1244: region.

1245: \begin{figure}

1246: \begin{tabular}{lr}

1247: \raggedright{(a)\psfig{figure=LGCODINTRarab.ps,width=6.5cm,angle=270}}&

1248: \raggedleft{(b)\psfig{figure=LGUPSarab.ps,width=6.5cm,angle=270}}

1249: \end{tabular}

1250: \caption{\it Arabidopsis thaliana genome. For each functional region, its

1251:   length and the corresponding sublinearity index are plotted. In

1252:   picture (a), the crosses ($+$) refer to coding regions, while

1253:   the squares ($\square$) refer to introns. In picture (b), the

1254:   squares ($\square$) refer to intergenic regions. In both

1255:   plots, the horizontal line is the threshold for the sublinearity

1256:   index, under which the region is atypical. }\label{lgarab}

1257: \end{figure}

1258: \begin{figure}

1259: \centerline{(a)\psfig{figure=arabCOD_8330271.ps,width=7cm,angle=270}}

1260: \begin{tabular}{lr}

1261: \raggedright{(b)\psfig{figure=arabCOD_8330271len.ps,width=6.5cm,angle=270}}&

1262: \raggedleft{(c)\psfig{figure=arabCOD_8330271_stalen.ps,width=6.5cm,angle=270}}

1263: \end{tabular}

1264: \caption{\it Atypical region $Coding\_8330271$. (a) The Information

1265: Content growth is logarithmic for the main part of the sequence. The

1266: word length doubling is shown on plot (b) and the multimodal

1267: distribution of word length is illustrated in (c).}\label{proteinarab}

1268: \end{figure}

1269:

1270: The atypical region $Coding\_8330271$ is characterized by a period

1271: $^\prime GA^{\ \prime}$ that is repeated for most part of the sequence

1272: (the first 200 bp). This is made evident both from the $I(n)$ plot on

1273: Figure \ref{proteinarab} $(a)$, which is definitely logarithmic in the

1274: first part, and from the word length doubling highlighted in Figure

1275: \ref{proteinarab} $(b)$. Also, the multimodal distribution of word

1276: length reflects the atypical nature of this region, while the maximal

1277: length is $12$ bp, which confirms that the characteristic maximal

1278: length in non-atypical coding regions is about $11-12$ bp (for

1279: instance, see Figure \ref{globNonatyp} (c)). The putative protein that may be

1280: obtained by translating this coding region is the following protein

1281: Atg219370:

1282: $$

1283: \begin{array}{l}

1284: \mathrm{ERERGSERERERERERERERERERERERERERERERER}\\

1285: \mathrm{EREREREREREREREREREREREREKHKPATLAKNRRR}\\

1286: \mathrm{RFVKNRRRRDHRRRISIIDGYESQF*V}\\

1287: \end{array}

1288: $$

1289:

1290: In the above notation, each letter corresponds to an amino acid, while

1291: the star indicates the end of the protein. This putative protein is

1292: very rich in Glutamate (E) and Arginine (R), but its function is still

1293: unknown and consideration should be given to the fact that the actual

1294: existence of this protein in the living organism has not yet been

1295: confirmed by biomolecular laboratory experiments; therefore this

1296: fragment has been classified as coding only by means of statistical

1297: predictive methods.

1298: \begin{figure}

1299: \begin{tabular}{lr}

1300: \raggedright{(a)\psfig{figure=arabUPS_22564763.ps,width=6.5cm,angle=270}}&

1301: \raggedleft{(b)\psfig{figure=arabUPS_22564763len.ps,width=6.5cm,angle=270}}

1302: \end{tabular}

1303: \begin{tabular}{lr}

1304: \raggedright{(c)\psfig{figure=Zoomgeniarab.ps,width=6.5cm,angle=270}}&

1305: \raggedleft{(d)\psfig{figure=arabUPS_22564763_stalen.ps,width=6.5cm,angle=270}}

1306: \end{tabular}

1307: \caption{\it Arabidopsis thaliana genome (chromosome II). (a) The

1308: behaviour of Information Content of atypical region $Inter\_22564763$

1309: grows in a very peculiar way. Its sublinearity index has been

1310: evaluated as 0.589. (b) The plot shows location and length of the

1311: phrases in the parsing obtained by the algorithm CASToRe. (c) The plot

1312: is an enhancement of the final part of the atypical region

1313: $Inter\_22564763$. (d) The distribution of phrase length for the

1314: aforementioned parsing is pictured.}\label{arabgene}

1315: \end{figure}

1316:

1317: The atypical region $Inter\_22564763$ was challenging to analyse, because

1318: not only does the Information Content growth show an abrupt change around

1319: $50000$ bp (Figure \ref{arabgene} $(a)$), but also the word length

1320: undergoes a sharp decrease when reaching that threshold, although at

1321: that point the dictionary already contained more than $1700$ phrases,

1322: most of them longer than 50 bp (Figure \ref{arabgene} $(b)$ and

1323: $(c)$).

1324:

1325: It was this twofold look of the region that suggested that in the

1326: final part of this region (from $50000$ bp to $65849$ bp) there might

1327: have been some coding sequences. This was also supported by the

1328: prevailing length of about $11-12$ bp, which, as it was already

1329: pointed out, may be considered as characteristic of coding regions.

1330: \begin{figure}

1331: \centerline{\psfig{figure=geniTrovatiarab.ps,width=12cm,angle=270}}

1332: \caption{\it Arabidopsis thaliana genome (chromosome II). Same part of

1333: atypical region $Inter\_22564763$ as plot (c) in Figure

1334: \ref{arabgene}. The boxes correspond to the location of the

1335: four predicted genes (labelled as $^\prime G1^\prime,^\prime

1336: G2^\prime,^\prime G3^\prime,^\prime G4^\prime$) as they have been

1337: predicted looking for similarities with Arabidopsis thaliana known

1338: genes.}\label{cfrarabgene}

1339: \end{figure}

1340: As a result, four putative genes G1, G2, G3 and G4 have

1341: been located by means of the Hidden Markov Model-based program

1342: FGENESH\footnote{This program is available at the website

1343: www.softberry.com to which we refer concerning the reliability and

1344: efficiency of the algorithm.} that has been created for predicting

1345: multiple genes and their structure in genomic DNA sequences. The

1346: analysis via FGENESH has been carried out with respect to known genes in {\it

1347: Arabidopsis thaliana}. Their predicted position is illustrated in Figure

1348: \ref{cfrarabgene}.

1349:

1350: %%%%%%%%%%%%questa parte va tolta

1351: %In Table \ref{putagene} we shall introduce a short

1352: %characterisation of each predicted gene and exon. Notice that all

1353: %these genes are to be read in the complementary strand.

1354: %

1355: %A precise biological screening of these data is still in progress and

1356: %more and more open questions are arising from them.

1357: %\begin{table}\label{putagene}

1358: %\begin{center}

1359: %\begin{tabular}{|c|c|c|c|c|}

1360: %\hline

1361: %Gene&Feature&start$-$end&Open Reading Frame&length\\

1362: %\hline

1363: %\hline

1364: %G1&PolA&52194 &&\\

1365: %&last exon&52373$-$52544&52373$-$52543&171\\

1366: %&internal exon&52583$-$52950&52585$-$52950&366\\

1367: %&internal exon &53142$-$54182&53142$-$54182&1041\\

1368: %&internal exon&54212$-$54316&54212$-$54316&105\\

1369: %&final exon&54413$-$54751&54413$-$54751&339\\

1370: %&promoter&54766&  &\\

1371: %\hline

1372: %\hline

1373: %G2&PolA&54802 &&\\

1374: %&last exon&54836$-$55447&54836$-$55447&612\\

1375: %&final exon&55528$-$55983&55528$-$55983&456\\

1376: %&promoter&56054&  &\\

1377: %\hline

1378: %\hline

1379: %G3&PolA&62628 &&\\

1380: %&last exon&62648$-$62861&62648$-$62860&213\\

1381: %&internal exon&62899$-$63232&62901$-$63230&330\\

1382: %&internal exon &63382$-$63723&63383$-$63721&339\\

1383: %&internal exon&63826$-$64309&63827$-$64309&483\\

1384: %&final exon&64410$-$64490&&81\\

1385: %&promoter&64768&  &\\

1386: %\hline

1387: %\hline

1388: %G4&PolA&64895 &&\\

1389: %&exon&64973$-$65278&64973$-$65278&306\\

1390: %\hline

1391: %\end{tabular}

1392: %\end{center}

1393: %\caption{\it The detailed location and internal structure of predicted

1394: %genes within the intergenic region $Inter\_22564763$ of {\it

1395: %Arabidopsis thaliana}. The analysis has been performed by means of

1396: % {\rm FGENESH}.}

1397: %\end{table}

1398: \section{Final remarks and future work}

1399: We have shown that complete genomes may be analysed in some of their

1400: distinctive features by means of the Computable Information Content

1401: obtained via compression algorithms. The Information Content may be

1402: used to extract regions having an atypical information growth, which

1403: is strictly connected to the presence of highly repetitive subregions

1404: that might be supposed to have a regulatory function within the

1405: genome. Different types of sublinearities have been associated to

1406: different biological features. These results shall pave the way for a

1407: more profound understanding of the local compressibility of genomes

1408: and for a more detailed identification of motifs and patterns that are

1409: significant to some biological function, in view of a joint use

1410: together with other predictive methods.

1411: \begin{thebibliography}{99}

1412: \bibitem{billingsley} Billingsley P., {\it Ergodic Theory and

1413: Information}, J. Wiley and Sons, New York (1965).

1414: \bibitem{kin}Khinchin A.I., {\it Mathematical foundations of information

1415: theory}, Dover Publications, New York (1957).

1416: \bibitem{Ch}Chaitin G.J., {\it Information, Randomness and

1417:     incompleteness}, second edition, World Scientific, Singapore

1418:     (1990).

1419: \bibitem{kolmogorov} Kolmogorov A. N., ``On the entropy per time unit

1420:  as a metric invariant of automorphisms'', {\it Dokl. Acad. Nauk.},

1421: {\bf 124}: 754--755 (1959).

1422: \bibitem{cleary}  Bell T., Witten I. H., Cleary J. G., \textit{Modeling for

1423: text compression}, ACM Computing Surveys, \textbf{21}, 557--591 (1989).

1424: \bibitem{licatone}Benci V., Bonanno C., Galatolo S., Menconi G.,

1425: Virgilio M., {\it Dynamical systems and computable information},

1426: to appear on {\it Disc. Cont. Dyn. Syst.- B}.

1427: \bibitem{jiang} Adebiyi E. F., Jiang T., Kaufmann M., ``An efficient

1428: algorithm for finding short approximate non-tandem repeats'', {\it

1429: Bioinformatics}, {\bf 17}, Suppl 1: S5--S12 (2001).

1430: \bibitem{chen} Li M., Badger J.H., Chen X., Kwong S., Kearney P.,

1431: Zhang H., ``An information based sequence distance and its Application

1432: to whole mitochondrial genome phylogeny'', {\it Bioinformatics}, {\bf

1433: 17} (2): 149--154 (2001).

1434: \bibitem{tahi} Grumbach S., Tahi F., ``A new challenge for compression

1435: algorithms: genetic sequences'', {\it Information processing \&

1436: Management}, {\bf 30}: 875--886 (1994).

1437: \bibitem{gal4}  Galatolo S., \emph{``Orbit complexity and data compression''},

1438: Discrete and Continuous Dynamical Systems \textbf{7}, 477--486 (2001).

1439: \bibitem{gal3} Galatolo S., ``Complexity, initial data sensitivity,

1440: di\-men\-sion and weak chaos in dynamical systems'', {\it

1441: Nonlinearity}, {\bf 16}, 4, 1219 (2003).

1442: \bibitem{CSF02} Argenti F., Benci V., Cerrai P., Cordelli A.,

1443: Galatolo S., Menconi G., ``Information and dynamical systems: a

1444: concrete measurement on sporadic dynamics'', {\it Chaos, Solitons and

1445: Fractals}, \textbf{13}, 3, 461--469 (2002).

1446: \bibitem{giuliauno}P. Allegrini, V. Benci, P. Grigolini, P. Hamilton,

1447: M. Ignaccolo, G. Menconi, L. Palatella, G. Raffaelli, N. Scafetta,

1448: M. Virgilio, Y. Yang,``Compression and Diffusion: A Joint Approach to

1449: Detect Complexity'', {\it Chaos, Solitons \& Fractals}

1450: {\bf 15}, 17 (2003).

1451: \bibitem{bonanno} Bonanno C., ``The Manneville map: topological,

1452: metric and computational approach'',

1453: http://arXiv.org/abs/math.DS/0107195 (2001).

1454: \bibitem{cristalli} Fronzoni L., Galeotti L., Menconi G., ``Measure of

1455: Diffusion Entropy of weak turbulence in sample of nematic liquid

1456: crystal'', in {\it Determinism, Holism and

1457: Complexity}, p.87, Atti dell'omonimo convegno tenutosi ad Arcidosso

1458: (GR), 2-8 Settembre 2001, Vieri Benci

1459: et al. editors, Kluwer Academic/Plenum Publishers, NY (2003).

1460: \bibitem{jacopogiulia} Bellazzini J., Menconi G., Ignaccolo M., Buresti G.,

1461: Grigolini P., ``Vortex Dynamics in evolutive flows: a weakly chaotic

1462: phenomenon'', {\it Physical Review E}, {\bf 68}: 026126 (2003).

1463: \bibitem{menconi} Bonanno C., Menconi G., ``Computational information

1464: for the logistic map at the chaos threshold'', {\it Disc. Cont. Dyn. Syst.-

1465: B}, \textbf{2}, no.3, 415--431 (2002).

1466: \bibitem{lz77} Ziv J., Lempel A., ``A Universal Algorithm for

1467: Sequential Data Compression'', {\it IEEE Transactions on Information Theory},

1468: \textbf{23}, 337--342 (1977).

1469: \bibitem{lz78} Ziv J., Lempel A., ``Compression of Individual

1470: Sequences Via Variable-Rate Coding'', {\it IEEE Transactions on Information

1471: Theory}, \textbf{24}, 530--536 (1978).

1472: \bibitem{bell}  Bell T., Witten I. H., Cleary J. G., \textit{Modeling for

1473: text compression}, ACM Computing Surveys, \textbf{21}, 557--591 (1989).

1474: \bibitem{loreto} Benedetto D., Caglioti E., Loreto V., ``Language trees

1475: and zipping'', {\it Phys Rev Lett} {\bf 88}(4): 048702 (2002).

1476: \bibitem{myers} Myers G., ``Whole-genome DNA sequencing'', {\it

1477: Computing in Science \& Engineering}, {\bf 1}, 3:33--43 (1999).

1478: \bibitem{coverthomas} Cover T. M., Thomas J. A., {\it Elements of

1479: Information Theory}, Wiley (1991).

1480: \bibitem{promoter} Prestridge, D.S., ``Predicting Pol II Promoter

1481: Sequences Using Transcription Factor Binding Sites'', {\it

1482: J. Mol. Biol.}, {\bf 249}: 923--32 (1995).

1483: \end{thebibliography}

1484: \end{document}

1485:

1486: