1: \documentclass{article}
2: \usepackage{natbib}
3: \usepackage{graphicx}
4: \pagestyle{plain}
5:
6:
7: \def\A{{\tt A}}
8: \def\C{{\tt C}}
9: \def\G{{\tt G}}
10: \def\T{{\tt T}}
11: \def\ApA{{\tt ApA}}
12: \def\ApC{{\tt ApC}}
13: \def\ApG{{\tt ApG}}
14: \def\ApT{{\tt ApT}}
15: \def\CpA{{\tt CpA}}
16: \def\CpC{{\tt CpC}}
17: \def\CpG{{\tt CpG}}
18: \def\CpT{{\tt CpT}}
19: \def\GpA{{\tt GpA}}
20: \def\GpC{{\tt GpC}}
21: \def\GpG{{\tt GpG}}
22: \def\GpT{{\tt GpT}}
23: \def\TpA{{\tt TpA}}
24: \def\TpC{{\tt TpC}}
25: \def\TpG{{\tt TpG}}
26: \def\TpT{{\tt TpT}}
27: \def\ra{\rightarrow}
28: \def\statGC{stationary GC-content}
29: \def\rat{$\ra$}
30:
31: \def\tabi{\begin{table}[bth]
32: \begin{center}
33: \footnotesize
34: \begin{tabular}{r|c|c|c|c}
35: &6 parameter
36: &7 parameter
37: &8 parameter
38: &9 parameter
39: \\
40: &model
41: &model
42: &model
43: &model
44: \\
45: \hline
46: \A:\T\rat\C:\G
47: &0.012
48: &0.012
49: &0.011
50: &0.007
51: \\
52: \A:\T\rat\T:\A
53: &0.010
54: &0.011
55: &0.011
56: &0.011
57: \\
58: \C:\G\rat\G:\C
59: &0.016
60: &0.016
61: &0.012
62: &0.012
63: \\
64: \C:\G\rat\A:\T
65: &0.015
66: &0.014
67: &0.014
68: &0.014
69: \\
70: \A:\T\rat\G:\C
71: &0.036
72: &0.036
73: &0.036
74: &0.036
75: \\
76: \C:\G\rat\T:\A
77: &0.158
78: &0.059
79: &0.060
80: &0.060
81: \\
82: \hline
83: \CpG\rat\CpA/\TpG
84: &
85: &0.618
86: &0.627
87: &0.624
88: \\
89: \CpG\rat\CpC/\GpG
90: &
91: &
92: &0.029
93: &0.029
94: \\
95: \TpT/\ApA\rat\TpG/\CpA
96: &
97: &
98: &
99: &0.013
100: \\
101: \hline
102: \statGC
103: &0.213
104: &0.341
105: &0.340
106: &0.339
107: \\
108: \hline
109: $-2 \log\lambda$
110: &
111: &7.7$\cdot 10^6$
112: &1.3$\cdot 10^5$
113: &9.6$\cdot 10^4$
114: \end{tabular}
115: \caption{
116: \footnotesize
117: \label{tab1}Estimates for substitution frequencies for nested models
118: of nucleotide substitution in human AluSx repeats. Given are the substitution
119: frequencies per bp in the time span after the insertion of the AluSx repeats
120: into the human genome. In the last row we note the $-2\log\lambda$ where $\lambda$ is the
121: likelihood ratio of the model and the one with one less parameter in the column
122: to the left.}
123: \end{center}
124: \end{table}
125: }
126:
127: \def\tabii{\begin{table}[bht]
128: \begin{center}
129: \footnotesize
130: \begin{tabular}{r|c|c|c|c}
131: &6 parameter
132: &7 parameter
133: &8 parameter
134: &9 parameter
135: \\
136: &model
137: &model
138: &model
139: &model
140: \\
141: \hline
142: \A:\T\rat\C:\G
143: &0.024
144: &0.025
145: &0.026
146: &0.026
147: \\
148: \A:\T\rat\T:\A
149: &0.041
150: &0.041
151: &0.041
152: &0.041
153: \\
154: \C:\G\rat\G:\C
155: &0.037
156: &0.036
157: &0.036
158: &0.023
159: \\
160: \C:\G\rat\A:\T
161: &0.029
162: &0.029
163: &0.028
164: &0.028
165: \\
166: \A:\T\rat\G:\C
167: &0.073
168: &0.074
169: &0.046
170: &0.046
171: \\
172: \C:\G\rat\T:\A
173: &0.151
174: &0.111
175: &0.105
176: &0.107
177: \\
178: \hline
179: \CpG\rat\CpA/\TpG
180: &
181: &0.274
182: &0.331
183: &0.328
184: \\
185: \CpA/\TpG\rat\CpG
186: &
187: &
188: &0.100
189: &0.097
190: \\
191: \CpG\rat\CpC/\GpG
192: &
193: &
194: &
195: &0.096
196: \\
197: \hline
198: \statGC
199: &0.349
200: &0.374
201: &0.335
202: &0.337
203: \\
204: \hline
205: $-2 \log\lambda$
206: &
207: &2.9$\cdot 10^5$
208: &1.6$\cdot 10^5$
209: &1.1$\cdot 10^5$
210: \end{tabular}
211: \caption{
212: \footnotesize
213: \label{tab2}Estimates for substitution frequencies for nested models
214: of nucleotide substitution in DANA repeats from {\em Danio rerio}.}
215: \end{center}
216: \end{table}
217: }
218:
219:
220: \def\tabiii{\begin{table}[htb]
221: \begin{center}
222: \footnotesize
223: \begin{tabular}{r|c|c|c|c}
224: &6 parameter
225: &7 parameter
226: &8 parameter
227: &9 parameter
228: \\
229: &model
230: &model
231: &model
232: &model
233: \\
234: \hline
235: \A:\T\rat\C:\G
236: &0.038
237: &0.038
238: &0.038
239: &0.038
240: \\
241: \A:\T\rat\T:\A
242: &0.052
243: &0.045
244: &0.045
245: &0.045
246: \\
247: \C:\G\rat\G:\C
248: &0.034
249: &0.034
250: &0.034
251: &0.034
252: \\
253: \C:\G\rat\A:\T
254: &0.074
255: &0.074
256: &0.074
257: &0.074
258: \\
259: \A:\T\rat\G:\C
260: &0.052
261: &0.052
262: &0.052
263: &0.047
264: \\
265: \C:\G\rat\T:\A
266: &0.108
267: &0.108
268: &0.098
269: &0.098
270: \\
271: \hline
272: \TpA\rat\TpT/\ApA
273: &
274: &0.029
275: &0.028
276: &0.028
277: \\
278: \TpC/\GpA\rat\TpT/\ApA
279: &
280: &
281: &0.036
282: &0.035
283: \\
284: \GpT/\ApC\rat\GpC
285: &
286: &
287: &
288: &0.021
289: \\
290: \hline
291: \statGC
292: &0.330
293: &0.330
294: &0.328
295: &0.326
296: \\
297: \hline
298: $-2 \log\lambda$
299: &
300: &853
301: &592
302: &40
303: \end{tabular}
304: \caption{
305: \footnotesize
306: \label{tab3}Estimates for substitution frequencies for nested models of nucleotide
307: substitution in DNAREP1\_DM transposable element from {\em Drosophila melanogaster}.}
308: \end{center}
309: \end{table}
310: }
311:
312:
313:
314: \author{Peter F. Arndt${}^{1*}$ and Terence Hwa${}^2$\\[5mm]
315: ${}^1$ Max Planck Institute for Molecular Genetics, \\
316: Ihnestr. 73, 14195 Berlin, Germany\\[1mm]
317: ${}^2$ Center for Theoretical Biological Physics,
318: \\
319: UC San Diego,
320: 9500 Gilman Drive, La Jolla, CA 92093-0374
321: \\[3mm]
322: ${}^*$ To whom correspondence should be addressed.
323: }
324:
325: \title{Identification and Measurement of Neighbor Dependent Nucleotide Substitution Processes}
326: \begin{document}
327: \maketitle
328:
329: \begin{abstract}
330: \mbox{}\\\noindent
331: {\bf Motivation:}
332: The presence of neighbor dependencies generated a specific pattern of
333: dinucleotide frequencies in all organisms. Especially, the
334: CpG-methylation-deamination process is the predominant substitution process in
335: vertebrates and needs to be incorporated into a more realistic model for
336: nucleotide substitutions.
337: \\\noindent
338: {\bf Results:}
339: Based on a general framework of nucleotide substitutions we develop a method
340: that is able to identify the most relevant neighbor dependent substitution
341: processes, measure their strength, and judge their importance to be included
342: into the modeling. Starting from a model for neighbor independent nucleotide
343: substitution we successively add neighbor dependent substitution processes in
344: the order of their ability to increase the likelihood of the model describing
345: given data. The analysis of neighbor dependent nucleotide substitutions in
346: human, zebrafish and fruit fly is presented.
347: \\\noindent
348: {\bf Availability:} A web server to perform the presented analysis is
349: publicly available at:
350: http://evogen.molgen.mpg.de/server/substitution-analysis .
351: \\\noindent
352: {\bf Contact:} arndt@molgen.mpg.de
353: %\\[5mm]\noindent
354: %{\bf Running Head:}
355: %Neighbor Dependent Nucleotide Substitution
356:
357:
358:
359:
360: \end{abstract}
361:
362:
363: \section{Introduction}
364: The identity of the neighboring nucleotide can have a drastic influence on the
365: mutation rates of a nucleotide. A well-known and studied example of this fact
366: is the increased mutation of cytosine to thymine in \CpG\ dinucleotides in
367: vertebrates \citep{Co78, RR80}. This process is triggered by the methylation of
368: cytosine in \CpG\, followed by deamination, and mutation from \CpG\ to \TpG\ or
369: \CpA\ (on the reverse strand). Due to this process the number of \CpG\ is
370: decreased while the number of \TpG\ and \CpA\ is larger than expected from
371: independently evolving nucleotides. Most of the deviant dinucleotide odds
372: ratios (dinucleotide frequencies normalized for the base composition) in the
373: human genome can be explained by the presence of the \CpG\ methylation
374: deamination process \citep{ABH02}. Biochemical studies in the 1970s already
375: compared these odds ratios for different genomes and different fractions of
376: genomic DNA \citep{Ru76, RS77} and concluded that these ratios are a remarkably
377: stable property of genomes. In the following Karlin and coworkers \citep{CB95,
378: KM97, KMC97} elaborated and expanded these observations, showing that the
379: pattern of dinucleotide abundance constitutes a genomic signature in the sense
380: that it is stable across different parts of a genome and generally similar between
381: related organisms. Since this signature is also present in non-coding and
382: intergenic DNA it is very promising to study neighbor dependent mutation and
383: fixation processes (we refer to the effective process as the substitution
384: process) to understand the evolution of neutral DNA.
385: However, to pursue this track, new models for nucleotide
386: substitutions that extend those which only capture neighbor independent
387: nucleotide substitutions (see \citep{LioGoldman} for a review) have to be
388: formulated (see also \citep{ABH02, Haussler, LH04}).
389:
390: Recently a framework to include such neighbor dependent processes has been
391: introduced \citep{ABH02}. The framework itself is capable of including any type
392: of neighbor dependent process and was already successfully applied to model the
393: \CpG\ methylation deamination process in vertebrates \citep{APH03}. Although
394: these models are mathematically more complicated, they allow a
395: quantitative analysis of neighbor dependent processes and reliable
396: estimates of other properties, e.g.\ the stationary GC-content. Here we will
397: extend this framework and discuss the inclusion of more neighbor dependent
398: substitutions and how one can infer their relevance without prior knowledge on
399: the underlying biochemical processes. In vertebrates the \CpG\ methylation
400: deamination process is the predominant nucleotide substitution process. Its
401: rate is about 40 times higher than that of a transversion and its history can
402: actually be reconstructed for the last 250 Myr \citep{APH03}. One reason for this
403: substitution frequency being so high is that in vertebrates \CpG\ methylation is also
404: used in gene regulation,
405: as methylated regions of the genome are not transcribed.
406: Consequently, \CpG's in these regions often mutate. We
407: know already that also other vertebrates use methylation in the same way but do
408: not know the quantitative extent to which their genomes are methylated. The
409: situation is still rather unclear in other kingdoms of life. Although we
410: clearly see signatures of neighbor dependent substitution processes, we do not
411: know the responsible processes and their rates.
412:
413: To present our method we study neighbor dependent substitutions in human ({\em
414: Homo sapiens}), zebrafish ({\em Danio rerio}) and fruit fly ({\em Drosophila
415: melanogaster}). In all these studies we first try to model the observed
416: nucleotide substitutions with a model which does not include any neighbor
417: dependent nucleotide substitutions (12 free rate parameters) and then ask the
418: question which neighbor dependent substitution process one would have to
419: include to describe the observed data best. The idea is to capture most of
420: the observed substitutions by single nucleotide substitutions independent of
421: the neighboring bases and then to include neighbor dependent substitutions one
422: by one to generate a better model with the least number of parameters.
423: Processes are added in the order of their ability to describe the observed data
424: better. Naturally, the addition of any further process (together with one rate
425: parameter) into a model will increase the likelihood of this model to describe
426: the observed data. In order not to over-fit the data we use a likelihood ratio
427: test to judge whether the addition of a further process is justified. The
428: strength of our approach is to come up with a model with fewer parameters that
429: still captures the essential neighbor dependent nucleotide substitution
430: processes. This prevents over-fitting the model to given data and eases the
431: quantitative estimation of a smaller number of parameters.
432:
433:
434: The rest of the paper is organized as follows. In the next section we will
435: describe details of our method. There is no need to implement the described
436: procedure for readers who want to analyze their own sequences, since we are
437: running a public web server at
438: {http://evogen.molgen.mpg.de/server/substitution-analysis}. At this site one is
439: able to upload sequence data and perform the presented analysis. First
440: applications of such an analysis will be presented in the results section.
441:
442: \section{Method}
443:
444: \subsection{The substitution model}
445:
446: In total there are 12 distinct neighbor independent substitution processes of
447: a single nucleotide by another; four of them are so-called transitions that
448: interchange a purine with a purine or a pyrimidine with a pyrimidine. The
449: remaining eight processes are the so-called transversions that interchange a
450: purine with a pyrimidine and vice versa. The rates of these processes, $\alpha\ra\beta$, will be
451: denoted $r_{\alpha\beta}$, where $\alpha,\beta\in\{\A,\C,\G,\T\}$ denote a
452: nucleotide. On top of these 12 processes we want to consider also neighbor
453: dependent processes of the kind $\kappa\lambda\ra\kappa\sigma$ and
454: $\kappa\lambda\ra\sigma\lambda$
455: where the right or left base of a
456: di-nucleotide changes, respectively. There might be several of those processes
457: present in our model, their rates will be denoted by $r_{\kappa\lambda\kappa\sigma}$ or
458: $r_{\kappa\lambda\sigma\lambda}$ . We do not consider
459: processes where both nucleotides of a dinucleotide change at the same time. In
460: vertebrates, the most important neighbor dependent process to consider is the
461: substitution of cytosine in \CpG\ resulting in \TpG\ or \CpA. Its rate is
462: about 40 times higher than that of a transversion \citep{APH03}. This process is
463: triggered by the methylation and subsequent deamination of cytosine in \CpG\
464: pairs. It is commonly (and erroneously) assumed that this process only affects
465: \CpG\ dinucleotides. However, it has been shown that this is not the case
466: \citep{ABH02}.
467:
468: The model is parameterized by the substitution rates and the length of
469: the time span,~$dt$, during which the respective substitution processes acted upon the sequence,
470: which would in our case be the time between the observation of an ancestral
471: sequence and its daughter sequence,~$T$. We have the freedom to rescale time and
472: measure it in units of $T$. In this case, the time span is $dt=1$ and with
473: this choice the substitution
474: rates are equal to the substitution frequencies giving the number of nucleotide
475: substitutions per bp. In the simplest case our model includes
476: neighbor independent processes only and is parameterized by 12 substitution
477: frequencies. For each additional neighbor dependent process we gain one
478: additional parameter. The set of all these substitution frequencies will be
479: denoted by $\{r\}$. The number of parameters can actually be reduced by a factor of two
480: when one considers substitutions along neutrally evolving DNA. In this case we
481: cannot distinguish the two strands of the DNA and therefore the substitution
482: rates are reverse complement symmetric, e.g. the rate for the substitution \C\rat\A\ is
483: equal to the rate for the substitution \G\rat\T\ (in the following we will denote this
484: process by $\C:\G\ra\A:\T$, for the rates we have $r_{\C\A}=r_{\G\T}$).
485:
486: In order to facilitate the subsequent maximum likelihood analysis we need to
487: compute the probability,~$P_{\{r\}}(\cdot\beta\cdot|\alpha_1\alpha_2\alpha_3)$,
488: that the base $\alpha_2$ flanked by $\alpha_1$ to the left and by $\alpha_3$ to
489: the right, changes into the base $\beta$ for given substitution frequencies
490: $\{r\}$. This probability can easily be calculated by numerically solving the time
491: evolution of the probability to find three bases $p(\alpha\beta\gamma;t)$ at
492: time $t$, which is given by the Master equation and can be written as the
493: following set of differential equations:
494: %\begin{eqnarray}
495: %\frac\partial{\partial t}p(\alpha\beta\gamma;t)
496: %&=&
497: %\sum_{\epsilon\in\{\A,\C,\G,\T\}}
498: %\left[
499: %r_{\epsilon\ra\alpha}\;p(\epsilon\beta\gamma;t)
500: %+r_{\epsilon\ra\beta} \;p(\alpha\epsilon\gamma;t)
501: %+r_{\epsilon\ra\gamma} \;p(\alpha\beta\epsilon;t)
502: %\right]
503: %\nonumber\\
504: %&&+
505: %\sum_{\{\kappa\lambda\ra\kappa\sigma\}}
506: %r_{\kappa\lambda\ra\kappa\sigma}
507: %\left[
508: %\delta_{\kappa\sigma,\alpha\beta}\;p(\kappa\lambda\gamma;t)-
509: %\delta_{\kappa\lambda,\alpha\beta}\;p(\alpha\beta\gamma;t)
510: %\right]
511: %\nonumber\\
512: %&&+
513: %\sum_{\{\kappa\lambda\ra\sigma\lambda\}}
514: %r_{\kappa\lambda\ra\sigma\lambda}
515: %\left[
516: %\delta_{\sigma\lambda,\beta\gamma}\;p(\alpha\kappa\lambda;t)-
517: %\delta_{\kappa\lambda,\beta\gamma}\;p(\alpha\beta\gamma;t)
518: %\right]
519: %\end{eqnarray}
520: \begin{eqnarray}
521: \frac\partial{\partial t}p(\alpha\beta\gamma;t)
522: &=&
523: \sum_{\epsilon\in\{\A,\C,\G,\T\}}
524: \left[
525: r_{\epsilon\alpha}\;p(\epsilon\beta\gamma;t)
526: +r_{\epsilon\beta} \;p(\alpha\epsilon\gamma;t)
527: +r_{\epsilon\gamma} \;p(\alpha\beta\epsilon;t)
528: \right]
529: \nonumber\\
530: &&+
531: \sum_{\epsilon\epsilon'}
532: r_{\epsilon\epsilon'\alpha\beta}\;p(\epsilon\epsilon'\gamma;t)
533: %\nonumber\\
534: %&&
535: +\sum_{\epsilon\epsilon'}
536: r_{\epsilon\epsilon'\beta\gamma}\;p(\alpha\epsilon\epsilon';t),
537: \label{dgl}
538: \end{eqnarray}
539: where the rate parameters with the equal initial and final state,
540: $r_{\alpha\alpha}$ and $r_{\alpha\beta\alpha\beta}$,
541: are defined by
542: \begin{equation}
543: r_{\alpha\alpha}=-\sum_{\epsilon\neq\alpha}r_{\alpha\epsilon}
544: ,\quad
545: r_{\alpha\beta\alpha\beta}=-\sum_{(\epsilon\epsilon')\neq(\alpha\beta)}r_{\alpha\beta\epsilon\epsilon'},
546: \nonumber
547: \end{equation}
548: and rates of neighbor dependent substitution processes not included into the
549: model are taken to be zero. The above definitions guarantee the conservation of
550: the total probability,
551: $\sum_{\alpha\beta\gamma}
552: \frac\partial{\partial t}p(\alpha\beta\gamma;t)=0
553: $,
554: since the total influx is balanced by an appropriate outflux of probability.
555: The first three terms on the r.h.s.~in Eq.~(\ref{dgl}) describe single
556: nucleotide substitutions on the three sites whereas the last two sums (which
557: are summed over all pairs of nucleotides) represent the neighbor dependent
558: processes at the sites $(1,2)$ and $(2,3)$, respectively. To describe the
559: evolution of three nucleotides $\alpha_1\alpha_2\alpha_3$, these differential
560: equations have to be solved for initial conditions of the form
561: \begin{equation}
562: p(\alpha\beta\gamma;t=0)
563: =\left\{
564: \begin{array}{cl}
565: 1&\mbox{if }(\alpha\beta\gamma)=(\alpha_1\alpha_2\alpha_3)\\
566: 0&\mbox{otherwise.}
567: \end{array}
568: \right.
569: \end{equation}
570: After numerically iterating the above differential equations using
571: the Runge-Kutta algorithm \citep{Pr92} we get the above transition probability as
572: \begin{equation}
573: P_{\{r\}}(\cdot\beta_2\cdot|\alpha_1\alpha_2\alpha_3)=
574: \sum_{\beta_1\beta_3}p(\beta_1\beta_2\beta_3;t=1)
575: \;.
576: \end{equation}
577: The above iteration has to be carried out 64 times for all possible combinations of
578: initial bases $\alpha_1\alpha_2\alpha_3$. After each iteration
579: 4 of the transition probabilities
580: $P_{\{r\}}(\cdot\beta\cdot|\alpha_1\alpha_2\alpha_3)$
581: with $\beta=\A,\C,\G,$ or \T\
582: can be computed. Note, that the above
583: set of differential equations can easily be extended to describe systems of length
584: $N>3$. In this case one has to solve for $4^N$ functions
585: $p(\alpha_1\alpha_2\dots\alpha_N;t)$.
586:
587: \subsection{Estimation of substitution frequencies}
588:
589: One can estimate all the above mentioned substitution frequencies from real
590: sequence data by comparing a pair of ancestral
591: $\vec{\alpha}=\alpha_1\alpha_2\dots\alpha_N$ and daughter sequence
592: $\vec{\beta}=\beta_1\beta_2\dots\beta_N$, where the daughter sequence
593: represents the state of the ancestral sequence after the substitution processes
594: acted upon it for some time. Note that we do not assume any other properties
595: regarding the nucleotide or dinucleotide distributions of the sequences.
596: Especially, the two sequences do not need to be in their stationary state with
597: respect to the substitution model. [In practice, these pairs of ancestral and daughter
598: sequences can be obtained in various ways. One very fruitful approach is to
599: take alignments of repetitive sequences, which can be found in various genomes
600: due to the activity of retroviruses. Such repetitive elements have entered
601: these genomes during short periods in evolution. Hence all copies of such
602: elements in a genome have been subject to nucleotide substitutions for the same
603: time and accumulated corresponding amounts of changes. Various such repetitive
604: elements and their respective alignment to the once active master (which is
605: taken to be the ancestral sequence \citep{APH03}) can be identified using the
606: RepeatMasker, http://www.repeatmasker.org.]
607:
608: The log likelihood that a sequence
609: $\vec{\beta}$ evolved from a master sequence $\vec{\alpha}$ under a given
610: substitution model parameterized by the substitution frequencies $\{r\}$ is
611: given by
612: \begin{eqnarray}
613: \log L_{\{r\}}&=&
614: \log P_{\{r\}}(\vec{\beta}|\vec{\alpha})
615: \nonumber\\
616: &\approx&
617: \log \prod_{i=2}^{N-1}
618: %\sum_{i=2}^{N-1} \log
619: P_{\{r\}}(\cdot\beta_i\cdot|\alpha_{i-1}\alpha_i\alpha_{i+1})
620: \nonumber\\
621: &=&
622: \sum_{\alpha_1\alpha_2\alpha_3\beta_2}
623: N(\alpha_1\alpha_2\alpha_3\ra\cdot\beta_2\cdot)
624: \log
625: P_{\{r\}}(\cdot\beta_2\cdot|\alpha_1\alpha_2\alpha_3)
626: \;,
627: \label{eqll}
628: \end{eqnarray}
629: where $P_{\{r\}}(\vec{\beta}|\vec{\alpha})$ is the probability of the
630: evolution of the sequence $\vec{\alpha}$ into $\vec{\beta}$. This probability
631: can very well be approximated by the product in the second line.
632: This is due to the fact that the correlations induced by the substitutional
633: processes are very short ranged \citep{ABH02}. We therefore
634: take into account the identities of bases and the dynamics on
635: the nearest neighbors to the left and to the right, and neglect
636: those on the next nearest neighbors and beyond.
637: For most applications
638: this approximation turns out to be sufficient since estimated
639: substitution frequencies deviate less than 1\% from their actual
640: values (see below).
641: Note that this approximation is even exact in the absence of neighbor dependent
642: substitution processes. The numbers
643: $N(\alpha_1\alpha_2\alpha_3\ra\cdot\beta_2\cdot)$ denote the
644: counts of observations of a base substitution from $\alpha_2$
645: (flanked by $\alpha_1$ to the
646: left and $\alpha_3$ to the right) to $\beta_2$.
647:
648: To estimate the substitution frequencies $\{r^\star\}$
649: for a given pair of $\vec{\alpha}$ and
650: $\vec{\beta}$ or given numbers $N(\alpha_1\alpha_2\alpha_3\ra\cdot\beta_2\cdot)$ we
651: have to maximize the above likelihood by adjusting the
652: substitution frequencies. This can easily be done using Powell's method
653: \citep{Pr92} while taking care of boundary conditions \citep{Bo66}, i.e. the
654: positivity of the substitution frequencies.
655:
656: \begin{figure}[htb]
657: \bigskip
658: \bigskip
659: \begin{center}
660: \includegraphics[width=0.8\textwidth]{fig1-2panel}
661: \caption{\footnotesize
662: \label{fig1}Plot of the estimated frequencies and their standard
663: deviation (from 500 measurements) for randomly drawn sequences of various
664: length. The daughter sequences have been synthetically aged using the
665: following processes (with frequency as indicated by the dotted lines):
666: transversions (0.01), \A:\T\rat\G:\C\ (0.03), \G:\C\rat\A:\T\ (0.05), and
667: \CpG\rat\CpA/\TpG\ (0.4). The stationary GC-content for this model is $0.3474$.
668: }
669: \end{center}
670: \end{figure}
671:
672:
673:
674: \begin{figure}[htb]
675: \bigskip
676: \bigskip
677: \begin{center}
678: \includegraphics[width=0.8\textwidth]{fig2}
679: \caption{\footnotesize\label{fig2}Plot of the deviations of the estimated frequencies
680: $\{|\bar{r}^*-\hat{r}|\}$ (open symbols) and the standard deviation $\{\Delta
681: r^*\}$ (closed symbols) from 500 measurements for randomly drawn sequences of
682: various lengths. The daughter sequences have been synthetically aged using the
683: following processes (with frequency): transversions (0.0001), \A:\T\rat\G:\C\
684: (0.0003), \G:\C\rat\A:\T\ (0.0005), and \CpG\rat\CpA/\TpG\
685: (0.004).
686: }
687: \end{center}
688: \end{figure}
689:
690:
691: \begin{figure}[htb]
692: \bigskip
693: \bigskip
694: \begin{center}
695: \includegraphics[width=0.8\textwidth]{fig_vs_dt}
696: \caption{\footnotesize\label{fig3}A plot of the estimated frequencies for various
697: degrees of sequence divergence. The dotted lines give expected values of the
698: frequencies. The sequence length has been chosen to be $N=10^7$. }
699: \end{center}
700: \end{figure}
701:
702:
703: \subsection{Uncertainty of estimates for finite sequence length}
704:
705: Due to the stochastic nature of the substitution process and due to the fact
706: that always only a finite amount of sequence data is available to estimate
707: the substitution frequencies $\{r^\star\}$, estimated frequencies will show
708: deviations from the real substitution frequencies. In general we do not know
709: or cannot infer these real frequencies otherwise. In order to be able to
710: analyze the uncertainty of frequency estimates from finite sequences we
711: synthetically (in silico) generate pairs of ancestral and daughter sequences
712: using known substitution processes and rates $\{\hat{r}\}$. In the following
713: section we include just one neighbor dependent substitution process, namely the
714: \CpG-methylation deamination process, \CpG\rat\CpA/\TpG, which plays a
715: predominant role in the analysis of nucleotide substitutions in vertebrates.
716: The nucleotides of the ancestral sequences $\vec{\alpha}$ (of length $N$) have
717: been chosen randomly with equal probability from the 4 nucleotides.
718: Subsequently, the ancestral sequence was synthetically aged and we applied
719: substitutions using a Monte Carlo algorithm as described in \citep{ABH02}
720: yielding the sequence $\vec{\beta}$. The resulting pair of sequences is then
721: analyzed using the above procedure to get estimates of the rates $\{r^\star\}$.
722: We repeated this experiment 500 times and got estimates for the means
723: $\{\bar{r}^*\}$ and standard deviation $\{\Delta r^*\}$ of these measurements.
724: In addition we computed the stationary GC-content from each set of substitution
725: frequencies \citep{ABH02}. Results of this analysis are presented in
726: Figure~\ref{fig1} where we show the mean and standard deviation of estimated
727: rates for different length of sequences $N$. The transversion frequencies were
728: chosen to be 0.01, the frequency of the \A:\T\rat\G:\C\ transition to be 0.03,
729: that of the \G:\C\rat\A:\T\ transition to be 0.05, and that of the
730: \CpG\rat\CpA/\TpG\ transition to be 0.4, as indicated by the dotted lines in
731: Figure~\ref{fig1}. This choice of frequencies mimics the relative strength of
732: the substitution process as they are observed in the human genome. As can be
733: seen the uncertainty of observed substitution frequencies correlates positively
734: with the substitution frequencies and negatively with the length of the
735: sequences.
736:
737: To further quantify these uncertainties and discuss their dependence on various
738: quantities we plotted the deviations $\{|\bar{r}^*-\hat{r}|\}$ and the standard
739: deviations $\{\Delta r^*\}$ as a function of the sequence length $N$ in
740: Figure~\ref{fig2}. The standard deviations decrease with $1/\sqrt{N}$. In the
741: absence of neighbor dependent substitutions and for ancestral sequences with
742: equally probable nucleotides the standard deviation for reverse complement
743: symmetric frequencies can actually be calculated to be
744: %
745: \begin{equation}
746: \Delta r^*_{\alpha\beta}=
747: \left(\frac{2 r_{\alpha\beta}}{N}\right)^{1/2}
748: \label{dri}
749: \end{equation}
750: %
751: as long as all frequencies $r\ll 1$.
752: Corresponding lines are presented also in Figure~\ref{fig2} and fit the observed
753: deviations well. The deviation for neighbor dependent processes
754: such as the process \CpG\rat\CpA/\TpG\ can be computed to be of
755: the order of:
756: %
757: \begin{equation}
758: \Delta r^*_{\alpha\beta\gamma\delta}=
759: \left(\frac{8 r_{\alpha\beta\gamma\delta}}{N}\right)^{1/2}
760: \label{drii}
761: \end{equation}
762: %
763: Note, that for $r\ll 1$ these errors stem only from the stochastic nature of
764: the underlying substitutional process and are not due to approximations used
765: during our maximum likelihood analysis of the sequence pairs $\vec{\alpha}$ and
766: $\vec{\beta}$ as described in the previous section.
767:
768: The deviations of the observed from the real frequencies
769: $\{|\bar{r}^*-\hat{r}|\}$ (see Figure~\ref{fig2}) also decrease with
770: $1/\sqrt{N}$ and are always bounded from above by $\{\Delta r^*\}$. Note,
771: that the estimates of substitution frequencies are very precise, although we
772: used an approximation when deriving the likelihood in Eq. (\ref{eqll}). This
773: property does not hold true for neighbor dependent processes in general. For
774: instance, we observe small (below 1\%, data not shown) but systematic
775: deviations of the estimated substitution frequencies if we include the process
776: \ApA/\TpT\rat\CpA/\TpG. In this case, one should also take into account the
777: identity and dynamics of nucleotides on next nearest neighbor sites and the
778: associated neighbor dependent processes. One would have to introduce higher
779: order corrections in Eq. (\ref{eqll}). This is true because of overlapping
780: initial states of the neighbor dependent process, i.e. two \ApA's in a
781: triplet \A\A\A. However, such corrections do not have to be considered for the
782: \CpG\rat\CpA/\TpG\ process. For a given \CpG, the next nearest neighbor
783: dependent process might only occur on a neighboring \CpG, which in contrast to
784: \ApA's cannot overlap with the given \CpG. Hence correlations to the next
785: \CpG\ are even smaller, which makes the estimation of substitution frequencies
786: neglecting such correlations very precise. In the absence of any neighbor
787: dependent process there is no approximation involved to compute the likelihood
788: in Eq. (\ref{eqll}) and therefore estimates will be asymptotically exact for
789: $N\ra\infty$.
790:
791:
792:
793: The above formulas for the standard deviation, Eqs. (\ref{dri}) and
794: (\ref{drii}), lose their validity if any one of the frequencies is of the
795: order of one. However, the standard deviations are still decreasing with
796: increasing sequence length. In Figure~\ref{fig3} we present estimated
797: frequencies from sequences of various degrees of divergence. The substitution
798: rates have been chosen in the ratios 1:3:5:40 for the transversions, the
799: \A:\T\rat\G:\C\ transition, the \G:\C\rat\A:\T\ transition, and the
800: \CpG\rat\CpA/\TpG\ process. On the horizontal axis we plot the length of the
801: time interval over which the ancestral sequence (of length $N=10^7$) has been aged. The
802: dotted lines give the real substitution frequencies, which are the products of
803: the corresponding rates and the length of the time interval. As long as not
804: all substitution frequencies are greater than one (to the left of the dashed
805: vertical line in Figure~\ref{fig3}) the substitution frequencies can be
806: faithfully estimated, even if single frequencies exceed one (the dashed
807: horizontal line). If all substitution frequencies are of the order of or
808: larger than one, the estimation of substitution frequencies is not possible
809: anymore (to the right of the dashed vertical line). In this case, more or less
810: all nucleotides underwent one or more substitution processes making it
811: impossible to estimate the frequencies of the underlying processes.
812:
813:
814: In reality however, the nucleotides in the ancestral sequence will not be
815: randomly distributed with equal probability from the 4 nucleotides (as assumed
816: above). On top of that genomic sequences will show non-trivial dinucleotide
817: distributions, i.e. neighboring bases are not independent and the dinucleotide
818: frequencies $f_{\alpha\beta}$ will deviate from the product of nucleotide
819: frequencies $f_\alpha f_\beta$ \citep{CB95}. Both these factors will influence
820: the deviations between the observed and the real substitution frequencies and
821: in those cases the above formulas (\ref{dri}) and (\ref{drii}) do not hold
822: anymore. We also expect additional errors due to the presence of unaccounted
823: neighbor dependent processes. Depending on the magnitude of the rates for such
824: processes the errors can get quite significant as discussed below. To exclude
825: the latter type of errors one actually has to try to incorporate additional
826: neighbor dependent processes and judge whether their inclusion is actually
827: relevant (as discussed in the next subsection).
828:
829: For genomic applications, it is further not possible to repeat the measurements
830: of substitution frequencies for different sets of sequences to get an estimate
831: of the typical errors. However, one can still get estimates on the expected
832: standard deviation from bootstrapping the available data. One has to resample
833: the available data drawing randomly and with replacement $N$ pairs of aligned
834: ancestral and daughter nucleotides (keeping the information of the ancestral
835: base identity to the left and to the right) and generate a list of counts
836: $N(\alpha_1\alpha_2\alpha_3\ra\cdot\beta_2\cdot)$ which then will be used to
837: maximize the likelihood and estimate the substitution frequencies as described
838: above. One repeats this resampling procedure $M$ times and from the $M$
839: estimates of the substitution frequencies and stationary GC-content calculates
840: their standard deviation, which gives the statistical error due to the limited
841: amount of sequence data. We found that $M = 500$ samples are sufficient to
842: estimate those errors (data not shown).
843:
844: \subsection{Extending the model to include additional processes}
845:
846:
847: Next we address how one can extend
848: a given substitution model and
849: include additional neighbor dependent processes to maximize the potential of
850: such a model to describe the observed data.
851: With the inclusion of additional neighbor dependent processes the likelihood of
852: a model $\{r'\}$ will in any case be greater than the one of the original model $\{r\}$.
853: This is true because the models are nested and one has one more free parameter
854: to explain the given data.
855: To test whether the inclusion of a new parameter is justified we employ
856: the likelihood ratio test for nested models. Let
857: $\lambda=L_{\{r\}}/L_{\{r'\}}$ be the likelihood ratio, then $-2\log\lambda$ has
858: an asymptotic chi-square distribution with degrees of freedom equal to the
859: difference in the numbers of free parameters of the two models, which in our
860: case is one \citep{EG01}.
861:
862: In practice we extend a given substitution model in turn by one out of the
863: $4\times 4\times 3\times 2=96$ possible neighbor dependent processes. Out of
864: those extended models we choose the best one, i.e. the one with the highest
865: likelihood $L_{\{r'\}}$. Since the best is chosen out of a finite set of possibilities,
866: we have to account for multiple testing and use a Bonferroni
867: correction. Hence we require that $-2\log\lambda>15$ to have significance on
868: the 5\% level\footnote{Note that $\int_0^{15}
869: \chi^2_1(x)\,dx=0.99989>1-0.05/96$}. We confirmed this conservative threshold
870: also by simulations using sequences that have been synthetically mutated
871: according to a known model.
872:
873:
874: \section{Results}
875: \tabi
876: As a first test, we applied the described method to
877: identify and measure neighbor dependent substitution processes
878: to human genomic data. We
879: took the copies of the AluSx SINEs that have been found in a genome-wide search
880: of the human genome (release v20.34c.1 at ensembl.org from April 1st, 2004).
881: These elements are assumed to have evolved neutrally and therefore the
882: substitution process is reverse complement symmetric. Results are presented in
883: Table~\ref{tab1}.
884: In the first column of data we give estimations for the 6 neighbor independent
885: single nucleotide substitutions. We subsequently tested 48 possible
886: extensions of this simple substitution model by one additional neighbor
887: dependent substitution process together with its reverse complement symmetric process
888: (note that in this case only 48 extensions have to be considered).
889: As
890: expected (and shown in the second column in Table~\ref{tab1}) the \CpG\ methylation
891: deamination process (\CpG\rat\CpA/\TpG) turns out to give the best improvement
892: with $-2\log\lambda=7.7\cdot 10^6$, which is clearly above the threshold of
893: $15$. The substitution frequency of this process is about 45 times higher than
894: that of a transversion. Extending the model from 6 to 7 parameters and
895: including the \CpG\rat\CpA/\TpG\ process, mostly affects the estimate for the
896: \G:\C\rat\A:\T\ transition, which decreases by about a factor of three. Please also
897: note that subsequently the estimation of the stationary GC-content from those
898: rates rises from 21\% for the 6 parameter model to 34\% for the 7 parameter
899: model. This reveals that estimates of
900: substitution frequencies and
901: the stationary nucleotide composition are
902: very much affected by the underlying substitution model.
903: Substantial deviations can be observed when
904: the substitution model does not include all relevant processes, as is the case
905: for the 6 parameter model for nucleotide substitutions in the human lineage.
906: In principle there can be even more neighbor dependent processes, which we have to
907: account for. We therefore try to incorporate an additional process
908: besides the already found one.
909:
910: The second process that needs to be included to improve the model is the
911: substitution of \CpG\rat\CpC/\GpG\ ($-2 \log\lambda=1.3\cdot 10^5$). This is
912: another \CpG\ based process and probably also triggered by the methylation of
913: cytosine. However, the substitution frequency is about 30 times smaller than
914: that of the \CpG\rat\CpA/\TpG\ process. The third process is then the
915: substitution \TpT/\ApA\rat\TpG/\CpA\ ($-2\log\lambda=9.6\cdot 10^4$). The
916: instability of the \TpT\ dinucleotide does not come as a surprise here, since
917: two consecutive thymine nucleotides tend to form a thymine photodimer
918: $\T\!<>\!\T$. This process is one of the major lesions formed in DNA during
919: exposure to UV light \citep{DZC97}.
920:
921: \smallskip
922:
923: Next we turn to the analysis of the DANA repeats in zebrafish ({\em Danio rerio}).
924: Results are presented in Table 2. Again we start with a model just comprising
925: single nucleotide transversions and transitions. As observed in human the
926: transitions occur more often than transversions and there is a strong \A:\T\ bias
927: in the single nucleotide substitutions. Zebrafish being a vertebrate also
928: utilizes methylation as an additional process to regulate gene expression. As a
929: consequence we observe a higher mutability of the \CpG\ dinucleotide due to the
930: deamination process also in zebrafish. However the substitution frequency for
931: the \CpG\rat\CpA/\TpG\ process is in zebrafish only about 8 times higher than that of
932: a transversion suggesting that the degree of methylation is generally lower
933: than in human.
934:
935: \tabii
936: \smallskip
937:
938: We also investigated non-vertebrate sequence data. As an example we
939: present here the analysis of the DNAREP1\_DM repeat in {\em Drosophila melanogaster}
940: (Table 3). The case for including neighbor dependent processes is here clearly
941: not as strong as for vertebrate genomes. The values of $-2\log\lambda$ are 3 orders of
942: magnitude smaller but still above threshold for the first 3 processes which are
943: chosen by our procedure to be included into a model for nucleotide
944: substitutions in fly. The first such process is the substitution \TpA\rat\TpT/\ApA.
945: Although the corresponding substitution frequency is lower than all the single
946: nucleotide transitions and transversions, the dinucleotide frequencies in the
947: stationary state deviate up to 10\% from their neutral expectation under a
948: neighbor independent substitution model (data not shown). Therefore even processes with
949: a small contribution to the overall substitutions have a large influence on the
950: observed patterns of dinucleotide frequencies or genomic signatures and
951: therefore may very well be solely responsible for the generation of such
952: patterns in different species.
953:
954: \tabiii
955:
956: \section{Conclusion}
957:
958: We presented a framework to identify the existence and measure the rates of
959: neighbor dependent nucleotide substitution processes. We discussed the
960: extension of models of nucleotide substitutions in human and included more
961: neighbor dependent processes besides the well-known \CpG\ methylation
962: deamination process \citep{ABH02}. We could also show that the \CpG\
963: methylation deamination is the predominant substitution process in zebrafish,
964: while it does not play a role in fruit fly. We exemplified our method
965: using sequence data from one particular subfamily of repeats from these three
966: organisms. In the case of the human genome a much more thorough analysis of
967: various families of repeats has been presented in \citep{APH03}. A similar
968: study, which would also have to include neighbor dependent substitutions, for
969: other species will further broaden our knowledge about the molecular processes
970: that are responsible for nucleotide mutations and their fixation.
971:
972: {\bf Acknowledgment}
973: We thank Nadia Singh and Dmitri Petrov (Stanford) for kindly
974: providing sequence data on the DNAREP1\_DM repeat in {\em Drosophila
975: melanogaster}.
976:
977: \newpage
978: \def\etal{{\em et al.}}
979: \begin{thebibliography}{}
980:
981: \bibitem[Arndt \etal, 2002]{ABH02}
982: Arndt, P. F., Burge, C. B. and Hwa, T. (2002).
983: DNA Sequence Evolution with Neighbor-Dependent Mutation.
984: 6th Annual International Conference on Computational Biology RECOMB2002, Washington DC, ACM Press, KK.
985:
986: \bibitem[Arndt \etal, 2003]{APH03}
987: Arndt, P. F., Petrov, D. A. and Hwa, T. (2003).
988: Distinct changes of genomic biases in nucleotide substitution at the time of Mammalian radiation.
989: {\em Mol Biol Evol} {\bf 20}(11): 1887-96.
990:
991: \bibitem[Box, 1966]{Bo66}
992: Box, M. J. (1966).
993: A Comparison of Several Current Optimization Methods and Use of Transformations in Constrained Problems.
994: {\em Computer Journal} {\bf 9}(1): 67-77.
995:
996: \bibitem[Coulondre \etal, 1978]{Co78}
997: Coulondre, C., Miller, J. H., Farabaugh, P. J., et al. (1978).
998: Molecular basis of base substitution hotspots in Escherichia coli.
999: {\em Nature} {\bf 274}(5673): 775-80.
1000:
1001: \bibitem[Douki \etal, 1997]{DZC97}
1002: Douki, T., Zalizniak, T. and Cadet, J. (1997).
1003: Far-UV-induced dimeric photoproducts in short oligonucleotides: sequence effects.
1004: {\em Photochem Photobiol} {\bf 66}(2): 171-9.
1005:
1006: \bibitem[Ewens and Grant, 2001]{EG01}
1007: Ewens, W. J. and Grant, G. (2001).
1008: {\em Statistical methods in bioinformatics : an introduction.}
1009: New York, Springer.
1010:
1011: \bibitem[Karlin and Burge, 1995]{CB95}
1012: Karlin, S. and Burge, C. (1995).
1013: Dinucleotide relative abundance extremes: a genomic signature.
1014: {\em Trends Genet} {\bf 11}(7): 283-90.
1015:
1016: \bibitem[Karlin and Mr\'azek, 1997]{KM97}
1017: Karlin, S. and Mr\'azek, J. (1997).
1018: Compositional differences within and between eukaryotic genomes.
1019: {\em Proc Natl Acad Sci U S A} {\bf 94}(19): 10227-32.
1020:
1021: \bibitem[Karlin \etal, 1997]{KMC97}
1022: Karlin, S., Mr\'azek, J. and Campbell, A. M. (1997).
1023: Compositional biases of bacterial genomes and evolutionary implications.
1024: {\em J Bacteriol} {\bf 179}(12): 3899-913.
1025:
1026: \bibitem[Lio and Goldman, 1998]{LioGoldman}
1027: Lio, P. and Goldman, N. (1998).
1028: Models of molecular evolution and phylogeny.
1029: {\em Genome Res.}, {\bf 8}, 1233-1244.
1030:
1031: \bibitem[Lunter and Hein, 2004]{LH04}
1032: Lunter, G. and Hein, J. (2004).
1033: A nucleotide substitution model with nearest-neighbour interactions.
1034: {\em Bioinformatics} {\bf 20} Suppl 1:I216-I223.
1035:
1036: \bibitem[Press \etal, 1992]{Pr92}
1037: Press, W. H., Teukolsky, S. A., Vetterling, W. T., et al. (1992).
1038: {\em Numerical Recipes in C, The art of scientific computing.}
1039: Cambridge, Cambridge University Press.
1040:
1041: \bibitem[Razin and Riggs, 1980]{RR80}
1042: Razin, A. and Riggs, A. D. (1980).
1043: DNA methylation and gene function.
1044: {\em Science} {\bf 210}(4470): 604-10.
1045:
1046: \bibitem[Russell \etal, 1976]{Ru76}
1047: Russell, G. J., Walker, P. M., Elton, R. A., et al. (1976).
1048: Doublet frequency analysis of fractionated vertebrate nuclear DNA.
1049: {\em J Mol Biol} {\bf 108}(1): 1-23.
1050:
1051: \bibitem[Russell and Subak-Sharpe, 1977]{RS77}
1052: Russell, G. J. and Subak-Sharpe, J. H. (1977).
1053: Similarity of the general designs of protochordates and invertebrates.
1054: {\em Nature} {\bf 266}(5602): 533-6.
1055:
1056: \bibitem[Siepel and Haussler, 2004]{Haussler}
1057: Siepel, A. and Haussler, D. (2004).
1058: Phylogenetic estimation of context-dependent substitution rates by maximum likelihood.
1059: {\em Mol Biol Evol.} {\bf 21}(3):468-88.
1060:
1061:
1062:
1063:
1064: \end{thebibliography}
1065:
1066:
1067:
1068:
1069: \end{document}
1070:
1071:
1072:
1073: