0501:q-bio0501018/main.tex

1: \documentclass{article}

2: \usepackage{natbib}

3: \usepackage{graphicx}

4: \pagestyle{plain}

5:

6:

7: \def\A{{\tt A}}

8: \def\C{{\tt C}}

9: \def\G{{\tt G}}

10: \def\T{{\tt T}}

11: \def\ApA{{\tt ApA}}

12: \def\ApC{{\tt ApC}}

13: \def\ApG{{\tt ApG}}

14: \def\ApT{{\tt ApT}}

15: \def\CpA{{\tt CpA}}

16: \def\CpC{{\tt CpC}}

17: \def\CpG{{\tt CpG}}

18: \def\CpT{{\tt CpT}}

19: \def\GpA{{\tt GpA}}

20: \def\GpC{{\tt GpC}}

21: \def\GpG{{\tt GpG}}

22: \def\GpT{{\tt GpT}}

23: \def\TpA{{\tt TpA}}

24: \def\TpC{{\tt TpC}}

25: \def\TpG{{\tt TpG}}

26: \def\TpT{{\tt TpT}}

27: \def\ra{\rightarrow}

28: \def\statGC{stationary GC-content}

29: \def\rat{$\ra$}

30:

31: \def\tabi{\begin{table}[bth]

32: \begin{center}

33: \footnotesize

34: \begin{tabular}{r|c|c|c|c}

35: &6 parameter

36: &7 parameter

37: &8 parameter

38: &9 parameter

39: \\

40: &model

41: &model

42: &model

43: &model

44: \\

45: \hline

46: \A:\T\rat\C:\G

47: &0.012

48: &0.012

49: &0.011

50: &0.007

51: \\

52: \A:\T\rat\T:\A

53: &0.010

54: &0.011

55: &0.011

56: &0.011

57: \\

58: \C:\G\rat\G:\C

59: &0.016

60: &0.016

61: &0.012

62: &0.012

63: \\

64: \C:\G\rat\A:\T

65: &0.015

66: &0.014

67: &0.014

68: &0.014

69: \\

70: \A:\T\rat\G:\C

71: &0.036

72: &0.036

73: &0.036

74: &0.036

75: \\

76: \C:\G\rat\T:\A

77: &0.158

78: &0.059

79: &0.060

80: &0.060

81: \\

82: \hline

83: \CpG\rat\CpA/\TpG

84: &

85: &0.618

86: &0.627

87: &0.624

88: \\

89: \CpG\rat\CpC/\GpG

90: &

91: &

92: &0.029

93: &0.029

94: \\

95: \TpT/\ApA\rat\TpG/\CpA

96: &

97: &

98: &

99: &0.013

100: \\

101: \hline

102: \statGC

103: &0.213

104: &0.341

105: &0.340

106: &0.339

107: \\

108: \hline

109: $-2 \log\lambda$

110: &

111: &7.7$\cdot 10^6$

112: &1.3$\cdot 10^5$

113: &9.6$\cdot 10^4$

114: \end{tabular}

115: \caption{

116: \footnotesize

117: \label{tab1}Estimates for substitution frequencies for nested models

118: of nucleotide substitution in human AluSx repeats. Given are the substitution

119: frequencies per bp in the time span after the insertion of the AluSx repeats

120: into the human genome. In the last row we note the $-2\log\lambda$ where $\lambda$ is the

121: likelihood ratio of the model and the one with one less parameter in the column

122: to the left.}

123: \end{center}

124: \end{table}

125: }

126:

127: \def\tabii{\begin{table}[bht]

128: \begin{center}

129: \footnotesize

130: \begin{tabular}{r|c|c|c|c}

131: &6 parameter

132: &7 parameter

133: &8 parameter

134: &9 parameter

135: \\

136: &model

137: &model

138: &model

139: &model

140: \\

141: \hline

142: \A:\T\rat\C:\G

143: &0.024

144: &0.025

145: &0.026

146: &0.026

147: \\

148: \A:\T\rat\T:\A

149: &0.041

150: &0.041

151: &0.041

152: &0.041

153: \\

154: \C:\G\rat\G:\C

155: &0.037

156: &0.036

157: &0.036

158: &0.023

159: \\

160: \C:\G\rat\A:\T

161: &0.029

162: &0.029

163: &0.028

164: &0.028

165: \\

166: \A:\T\rat\G:\C

167: &0.073

168: &0.074

169: &0.046

170: &0.046

171: \\

172: \C:\G\rat\T:\A

173: &0.151

174: &0.111

175: &0.105

176: &0.107

177: \\

178: \hline

179: \CpG\rat\CpA/\TpG

180: &

181: &0.274

182: &0.331

183: &0.328

184: \\

185: \CpA/\TpG\rat\CpG

186: &

187: &

188: &0.100

189: &0.097

190: \\

191: \CpG\rat\CpC/\GpG

192: &

193: &

194: &

195: &0.096

196: \\

197: \hline

198: \statGC

199: &0.349

200: &0.374

201: &0.335

202: &0.337

203: \\

204: \hline

205: $-2 \log\lambda$

206: &

207: &2.9$\cdot 10^5$

208: &1.6$\cdot 10^5$

209: &1.1$\cdot 10^5$

210: \end{tabular}

211: \caption{

212: \footnotesize

213: \label{tab2}Estimates for substitution frequencies for nested models

214: of nucleotide substitution in DANA repeats from {\em Danio rerio}.}

215: \end{center}

216: \end{table}

217: }

218:

219:

220: \def\tabiii{\begin{table}[htb]

221: \begin{center}

222: \footnotesize

223: \begin{tabular}{r|c|c|c|c}

224: &6 parameter

225: &7 parameter

226: &8 parameter

227: &9 parameter

228: \\

229: &model

230: &model

231: &model

232: &model

233: \\

234: \hline

235: \A:\T\rat\C:\G

236: &0.038

237: &0.038

238: &0.038

239: &0.038

240: \\

241: \A:\T\rat\T:\A

242: &0.052

243: &0.045

244: &0.045

245: &0.045

246: \\

247: \C:\G\rat\G:\C

248: &0.034

249: &0.034

250: &0.034

251: &0.034

252: \\

253: \C:\G\rat\A:\T

254: &0.074

255: &0.074

256: &0.074

257: &0.074

258: \\

259: \A:\T\rat\G:\C

260: &0.052

261: &0.052

262: &0.052

263: &0.047

264: \\

265: \C:\G\rat\T:\A

266: &0.108

267: &0.108

268: &0.098

269: &0.098

270: \\

271: \hline

272: \TpA\rat\TpT/\ApA

273: &

274: &0.029

275: &0.028

276: &0.028

277: \\

278: \TpC/\GpA\rat\TpT/\ApA

279: &

280: &

281: &0.036

282: &0.035

283: \\

284: \GpT/\ApC\rat\GpC

285: &

286: &

287: &

288: &0.021

289: \\

290: \hline

291: \statGC

292: &0.330

293: &0.330

294: &0.328

295: &0.326

296: \\

297: \hline

298: $-2 \log\lambda$

299: &

300: &853

301: &592

302: &40

303: \end{tabular}

304: \caption{

305: \footnotesize

306: \label{tab3}Estimates for substitution frequencies for nested models of nucleotide

307: substitution in DNAREP1\_DM transposable element from {\em Drosophila melanogaster}.}

308: \end{center}

309: \end{table}

310: }

311:

312:

313:

314: \author{Peter F. Arndt${}^{1*}$ and Terence Hwa${}^2$\\[5mm]

315: ${}^1$ Max Planck Institute for Molecular Genetics, \\

316: Ihnestr. 73, 14195 Berlin, Germany\\[1mm]

317: ${}^2$ Center for Theoretical Biological Physics,

318: \\

319: UC San Diego,

320: 9500 Gilman Drive, La Jolla, CA 92093-0374

321: \\[3mm]

322: ${}^*$ To whom correspondence should be addressed.

323: }

324:

325: \title{Identification and Measurement of Neighbor Dependent Nucleotide Substitution Processes}

326: \begin{document}

327: \maketitle

328:

329: \begin{abstract}

330: \mbox{}\\\noindent

331: {\bf Motivation:}

332: The presence of neighbor dependencies generated a specific pattern of

333: dinucleotide frequencies in all organisms.  Especially, the

334: CpG-methylation-deamination process is the predominant substitution process in

335: vertebrates and needs to be incorporated into a more realistic model for

336: nucleotide substitutions.

337: \\\noindent

338: {\bf Results:}

339: Based on a general framework of nucleotide substitutions we develop a method

340: that is able to identify the most relevant neighbor dependent substitution

341: processes, measure their strength, and judge their importance to be included

342: into the modeling. Starting from a model for neighbor independent nucleotide

343: substitution we successively add neighbor dependent substitution processes in

344: the order of their ability to increase the likelihood of the model describing

345: given data. The analysis of neighbor dependent nucleotide substitutions in

346: human, zebrafish and fruit fly is presented.

347: \\\noindent

348: {\bf Availability:} A web server to perform the presented analysis is

349: publicly available at:

350: http://evogen.molgen.mpg.de/server/substitution-analysis .

351: \\\noindent

352: {\bf Contact:} arndt@molgen.mpg.de

353: %\\[5mm]\noindent

354: %{\bf Running Head:}

355: %Neighbor Dependent Nucleotide Substitution

356:

357:

358:

359:

360: \end{abstract}

361:

362:

363: \section{Introduction}

364: The identity of the neighboring nucleotide can have a drastic influence on the

365: mutation rates of a nucleotide. A well-known and studied example of this fact

366: is the increased mutation of cytosine to thymine in \CpG\ dinucleotides in

367: vertebrates \citep{Co78, RR80}. This process is triggered by the methylation of

368: cytosine in \CpG, followed by deamination, and mutation from \CpG\ to \TpG\ or

369: \CpA\ (on the reverse strand). Due to this process the number of \CpG\ is

370: decreased while the number of \TpG\ and \CpA\ is larger than expected from

371: independently evolving nucleotides. Most of the deviant dinucleotide odds

372: ratios (dinucleotide frequencies normalized for the base composition) in the

373: human genome can be explained by the presence of the \CpG\ methylation

374: deamination process \citep{ABH02}. Biochemical studies in the 1970s already

375: compared these odds ratios for different genomes and different fractions of

376: genomic DNA \citep{Ru76, RS77} and concluded that these ratios are a remarkably

377: stable property of genomes. Subsequently, Karlin and coworkers \citep{CB95,

378: KM97, KMC97} elaborated and expanded these observations, showing that the

379: pattern of dinucleotide abundance constitutes a genomic signature in the sense

380: that it stable across different parts of a genome and generally similar between

381: related organisms. Since this signature is also present in non-coding and

382: intergenic DNA it is very promising to study neighbor dependent mutation and

383: fixation processes (we refer to the effective process as the substitution

384: process) to understand the evolution of neutral DNA.

385: However, to pursue this track, new models for nucleotide

386: substitutions that extend those which only capture neighbor independent

387: nucleotide substitutions (see \citealp{LioGoldman} for a review) have to be

388: formulated (see also \citealp{ABH02, Haussler, LH04}).

389:

390: Recently a framework to include such neighbor dependent processes has been

391: introduced \citep{ABH02}.  The framework itself is capable of including any type

392: of neighbor dependent process and was already successfully applied to model the

393: \CpG\ methylation deamination process in vertebrates \citep{APH03}.  Although

394: these models are mathematically more complicated, they allow a

395: quantitative analysis of neighbor dependent processes and reliable

396: estimates of other properties, e.g.\ the stationary GC-content.  Here we will

397: extend this framework and discuss the inclusion of more neighbor dependent

398: substitutions and how one can infer their relevance without prior knowledge on

399: the underlying biochemical processes.  In vertebrates the \CpG\ methylation

400: deamination process is the predominant nucleotide substitution process. Its

401: rate is about 40 times higher than that of a transversion and its history can

402: actually be reconstructed for the last 250 Myr \citep{APH03}. One reason for this

403: substitution frequency being so high is that in vertebrates \CpG\ methylation is also

404: used in gene regulation,

405: as methylated regions of the genome are not transcribed.

406: Consequently, \CpG's in these regions often mutate. We

407: know already that also other vertebrates use methylation in the same way but do

408: not know the quantitative extent to which their genomes are methylated. The

409: situation is still rather unclear in other kingdoms of life. Although we

410: clearly see signatures of neighbor dependent substitution processes, we do not

411: know the responsible processes and their rates.

412:

413: To present our method we study neighbor dependent substitutions in human ({\em

414: Homo sapiens}), zebrafish ({\em Danio rerio}) and fruit fly ({\em Drosophila

415: melanogaster}). In all these studies we first try to model the observed

416: nucleotide substitutions with a model which does not include any neighbor

417: dependent nucleotide substitutions (12 free rate parameters) and then ask the

418: question which neighbor dependent substitution process one would have to

419: include to describe the observed data best. The idea is to capture the most of

420: the observed substitutions by single nucleotide substitutions independent of

421: the neighboring bases and then to include neighbor dependent substitutions one

422: by one to generate a better model with the least number of parameters.

423: Processes are added in the order of their ability to describe the observed data

424: better. Naturally, the addition of any further process (together with one rate

425: parameter) into a model will increase the likelihood of this model to describe

426: the observed data. In order not to over-fit the data we use a likelihood ratio

427: test to judge whether the addition of a further process is justified.  The

428: strength of our approach is to come up with a model with fewer parameters that

429: still captures the essential neighbor dependent nucleotide substitution

430: processes. This prevents over-fitting the model to given data and eases the

431: quantitative estimation of a smaller number of parameters.

432:

433:

434: The rest of the paper is organized as follows. In the next section we will

435: describe details of our method. There is no need to implement the described

436: procedure for readers who want to analyze their own sequences, since we are

437: running a public web server at

438: {http://evogen.molgen.mpg.de/server/substitution-analysis}.  At this site one is

439: able to upload sequence data and perform the presented analysis. First

440: applications of such an analysis will be presented in the results section.

441:

442: \section{Method}

443:

444: \subsection{The substitution model}

445:

446: In total there are 12 distinct neighbor independent substitution processes of

447: a single nucleotide by another; four of them are so-called transitions that

448: interchange a purine with a purine or a pyrimidine with a pyrimidine. The

449: remaining eight processes are the so-called transversions that interchange a

450: purine with a pyrimidine and vice versa. The rates of these processes, $\alpha\ra\beta$, will be

451: denoted $r_{\alpha\beta}$, where $\alpha,\beta\in\{\A,\C,\G,\T\}$ denote a

452: nucleotide. On top of these 12 processes we want to consider also neighbor

453: dependent processes of the kind $\kappa\lambda\ra\kappa\sigma$ and

454: $\kappa\lambda\ra\sigma\lambda$

455: where the right or left base of a

456: di-nucleotide changes, respectively. There might be several of those processes

457: present in our model, their rates will be denoted by $r_{\kappa\lambda\kappa\sigma}$ or

458: $r_{\kappa\lambda\sigma\lambda}$. We do not consider

459: processes where both nucleotides of a dinucleotide change at the same time. In

460: vertebrates, the most important neighbor dependent process to consider is the

461: substitution of cytosine in \CpG\ resulting in \TpG\ or \CpA. Its rate is

462: about 40 times higher than that of a transversion \citep{APH03}. This process is

463: triggered by the methylation and subsequent deamination of cytosine in \CpG\

464: pairs. It is commonly (and erroneously) assumed that this process only affects

465: \CpG\ dinucleotides. However, this is not the case, as has been shown

466: \citep{ABH02}.

467:

468: The model is parameterized by the substitution rates and the length of

469: the time span,~$dt$, the respective substitution processes acted upon the sequence,

470: which would in our case be the time between the observation of an ancestral

471: sequence and its daughter sequence,~$T$. We have the freedom to rescale time and

472: measure it in units of $T$. In this case, the time span is $dt=1$ and with

473: this choice  the substitution

474: rates are equal to the substitution frequencies giving the number of nucleotide

475: substitutions per bp. In the simplest case our model includes

476: neighbor independent processes only and is parameterized by 12 substitution

477: frequencies. For each additional neighbor dependent process we gain one

478: additional parameter. The set of all these substitution frequencies will be

479: denoted by $\{r\}$. The number of parameters can actually be reduced by a factor of two

480: when one considers substitutions along neutrally evolving DNA. In this case we

481: cannot distinguish the two strands of the DNA and therefore the substitution

482: rates are reverse complement symmetric, e.g. the rate for the substitution \C\rat\A\ is

483: equal to the rate for the substitution \G\rat\T\ (in the following we will denote this

484: process by $\C:\G\ra\A:\T$, for the rates we have $r_{\C\A}=r_{\G\T}$).

485:

486: In order to facilitate the subsequent maximum likelihood analysis we need to

487: compute the probability,~$P_{\{r\}}(\cdot\beta\cdot|\alpha_1\alpha_2\alpha_3)$,

488: that the base $\alpha_2$ flanked by $\alpha_1$ to the left and by $\alpha_3$ to

489: the right, changes into the base $\beta$ for given substitution frequencies

490: $\{r\}$. This probability can easily be calculated by numerically solving the time

491: evolution of the probability to find three bases $p(\alpha\beta\gamma;t)$ at

492: time $t$, which is given by the Master equation and can be written as the

493: following set of differential equations:

494: %\begin{eqnarray}

495: %\frac\partial{\partial t}p(\alpha\beta\gamma;t)

496: %&=&

497: %\sum_{\epsilon\in\{\A,\C,\G,\T\}}

498: %\left[

499: %r_{\epsilon\ra\alpha}\;p(\epsilon\beta\gamma;t)

500: %+r_{\epsilon\ra\beta} \;p(\alpha\epsilon\gamma;t)

501: %+r_{\epsilon\ra\gamma} \;p(\alpha\beta\epsilon;t)

502: %\right]

503: %\nonumber\\

504: %&&+

505: %\sum_{\{\kappa\lambda\ra\kappa\sigma\}}

506: %r_{\kappa\lambda\ra\kappa\sigma}

507: %\left[

508: %\delta_{\kappa\sigma,\alpha\beta}\;p(\kappa\lambda\gamma;t)-

509: %\delta_{\kappa\lambda,\alpha\beta}\;p(\alpha\beta\gamma;t)

510: %\right]

511: %\nonumber\\

512: %&&+

513: %\sum_{\{\kappa\lambda\ra\sigma\lambda\}}

514: %r_{\kappa\lambda\ra\sigma\lambda}

515: %\left[

516: %\delta_{\sigma\lambda,\beta\gamma}\;p(\alpha\kappa\lambda;t)-

517: %\delta_{\kappa\lambda,\beta\gamma}\;p(\alpha\beta\gamma;t)

518: %\right]

519: %\end{eqnarray}

520: \begin{eqnarray}

521: \frac\partial{\partial t}p(\alpha\beta\gamma;t)

522: &=&

523: \sum_{\epsilon\in\{\A,\C,\G,\T\}}

524: \left[

525: r_{\epsilon\alpha}\;p(\epsilon\beta\gamma;t)

526: +r_{\epsilon\beta} \;p(\alpha\epsilon\gamma;t)

527: +r_{\epsilon\gamma} \;p(\alpha\beta\epsilon;t)

528: \right]

529: \nonumber\\

530: &&+

531: \sum_{\epsilon\epsilon'}

532: r_{\epsilon\epsilon'\alpha\beta}\;p(\epsilon\epsilon'\gamma;t)

533: %\nonumber\\

534: %&&

535: +\sum_{\epsilon\epsilon'}

536: r_{\epsilon\epsilon'\beta\gamma}\;p(\alpha\epsilon\epsilon';t),

537: \label{dgl}

538: \end{eqnarray}

539: where the rate parameters with the equal initial and final state,

540: $r_{\alpha\alpha}$ and $r_{\alpha\beta\alpha\beta}$,

541: are defined by

542: \begin{equation}

543: r_{\alpha\alpha}=-\sum_{\epsilon\neq\alpha}r_{\alpha\epsilon}

544: ,\quad

545: r_{\alpha\beta\alpha\beta}=-\sum_{(\epsilon\epsilon')\neq(\alpha\beta)}r_{\alpha\beta\epsilon\epsilon'},

546: \nonumber

547: \end{equation}

548: and rates of neighbor dependent substitution processes not included into the

549: model are taken to be zero.  The above definitions guarantee the conservation of

550: the total probability,

551: $\sum_{\alpha\beta\gamma}

552: \frac\partial{\partial t}p(\alpha\beta\gamma;t)=0

553: $,

554: since the total influx is balanced by an appropriate outflux of probability.

555: The first three terms on the r.h.s.~in Eq.~(\ref{dgl}) describe single

556: nucleotide substitutions on the three sites whereas the last two sums (which

557: are summed over all pairs of nucleotides) represent the neighbor dependent

558: processes at the sites $(1,2)$ and $(2,3)$, respectively. To describe the

559: evolution of three nucleotides $\alpha_1\alpha_2\alpha_3$, these differential

560: equations have to be solved for initial conditions of the form

561: \begin{equation}

562: p(\alpha\beta\gamma;t=0)

563: =\left\{

564: \begin{array}{cl}

565: 1&\mbox{if }(\alpha\beta\gamma)=(\alpha_1\alpha_2\alpha_3)\\

566: 0&\mbox{otherwise.}

567: \end{array}

568: \right.

569: \end{equation}

570: After numerically iterating the above differential equations using

571: the Runge-Kutta algorithm \citep{Pr92} we get the above transition probability as

572: \begin{equation}

573: P_{\{r\}}(\cdot\beta_2\cdot|\alpha_1\alpha_2\alpha_3)=

574: \sum_{\beta_1\beta_3}p(\beta_1\beta_2\beta_3;t=1)

575: \;.

576: \end{equation}

577: The above iteration has to be carried out 64 times for all possible combinations of

578: initial bases $\alpha_1\alpha_2\alpha_3$. After each iteration

579: 4 of the transition probabilities

580: $P_{\{r\}}(\cdot\beta\cdot|\alpha_1\alpha_2\alpha_3)$

581: with $\beta=\A,\C,\G,$ or \T\

582:  can be computed.  Note, that the above

583: set of differential equations can easily be extended to describe systems of length

584: $N>3$.  In this case one has to solve for $4^N$ functions

585: $p(\alpha_1\alpha_2\dots\alpha_N;t)$.

586:

587: \subsection{Estimation of substitution frequencies}

588:

589: One can estimate all the above mentioned substitution frequencies from real

590: sequence data by comparing a pair of ancestral

591: $\vec{\alpha}=\alpha_1\alpha_2\dots\alpha_N$ and daughter sequence

592: $\vec{\beta}=\beta_1\beta_2\dots\beta_N$, where the daughter sequence

593: represents the state of the ancestral sequence after the substitution processes

594: acted upon it for some time. Note that we do not assume any other properties

595: regarding the nucleotide or dinucleotide distributions of the sequences.

596: Especially, the two sequences do not need to be in their stationary state with

597: respect to the substitution model. [In practice, these pairs of ancestral and daughter

598: sequences can be obtained in various ways. One very fruitful approach is to

599: take alignments of repetitive sequences, which can be found in various genomes

600: due to the activity of retroviruses.  Such repetitive elements have entered

601: these genomes during short periods in evolution.  Hence all copies of such

602: elements in a genome have been subject to nucleotide substitutions for the same

603: time and accumulated corresponding amounts of changes.  Various such repetitive

604: elements and their respective alignment to the once active master (which is

605: taken to be the ancestral sequence \citep{APH03}) can be identified using the

606: RepeatMasker, http://www.repeatmasker.org.]

607:

608: The log likelihood that a sequence

609: $\vec{\beta}$ evolved from a master sequence $\vec{\alpha}$ under a given

610: substitution model parameterized by the substitution frequencies $\{r\}$ is

611: given by

612: \begin{eqnarray}

613: \log L_{\{r\}}&=&

614: \log P_{\{r\}}(\vec{\beta}|\vec{\alpha})

615: \nonumber\\

616: &\approx&

617: \log \prod_{i=2}^{N-1}

618: %\sum_{i=2}^{L-1} \log

619: P_{\{r\}}(\cdot\beta_i\cdot|\alpha_{i-1}\alpha_i\alpha_{i+1})

620: \nonumber\\

621: &=&

622: \sum_{\alpha_1\alpha_2\alpha_3\beta_2}

623: N(\alpha_1\alpha_2\alpha_3\ra\cdot\beta_2\cdot)

624: \log

625: P_{\{r\}}(\cdot\beta_2\cdot|\alpha_1\alpha_2\alpha_3)

626: \;,

627: \label{eqll}

628: \end{eqnarray}

629: where  $P_{\{r\}}(\vec{\beta}|\vec{\alpha})$ is the probability of the

630: evolution of the sequence $\vec{\alpha}$ into $\vec{\beta}$.  This probability

631: can very well be approximated by the product in the second line.

632: This is due to the fact that the correlations induced by the substitutional

633: processes are very short ranged \citep{ABH02}. We therefore

634: take into account the identities of bases and the dynamics on

635: the nearest neighbors to the left and to the right, and neglect

636: those on the next nearest neighbors and beyond.

637: For most applications

638: this approximation turns out to be sufficient since estimated

639: substitution frequencies deviate less than 1\% from their actual

640: values (see below).

641: Note that this approximation is even exact in the absence of neighbor dependent

642: substitution processes. The numbers

643: $N(\alpha_1\alpha_2\alpha_3\ra\cdot\beta_2\cdot)$ denote the

644: counts of observations of a base substitution from $\alpha_2$

645: (flanked by $\alpha_1$ to the

646: left and $\alpha_3$ to the right) to $\beta_2$.

647:

648: To estimate the substitution frequencies $\{r^\star\}$

649: for a given pair of $\vec{\alpha}$ and

650: $\vec{\beta}$ or given numbers $N(\alpha_1\alpha_2\alpha_3\ra\cdot\beta_2\cdot)$ we

651: have to maximize the above likelihood by adjusting the

652: substitution frequencies.  This can easily be done using Powell's method

653: \citep{Pr92} while taking care of boundary conditions \citep{Bo66}, i.e. the

654: positivity of the substitution frequencies.

655:

656: \begin{figure}[htb]

657: \bigskip

658: \bigskip

659:   \begin{center}

660:     \includegraphics[width=0.8\textwidth]{fig1-2panel}

661: 	\caption{\footnotesize

662: \label{fig1}Plot of the estimated frequencies and their standard

663: deviation (from 500 measurements) for randomly drawn sequences of various

664: length.  The daughter sequences have been synthetically aged using the

665: following processes (with frequency as indicated by the dotted lines):

666: transversions (0.01), \A:\T\rat\G:\C\ (0.03), \G:\C\rat\A:\T\ (0.05), and

667: \CpG\rat\CpA/\TpG\ (0.4). The stationary GC-content for this model is $0.3474$.

668: }

669:   \end{center}

670: \end{figure}

671:

672:

673:

674: \begin{figure}[htb]

675: \bigskip

676: \bigskip

677:   \begin{center}

678:     \includegraphics[width=0.8\textwidth]{fig2}

679: 	\caption{\footnotesize\label{fig2}Plot of the deviations of the estimated frequencies

680: $\{|\bar{r}^*-\hat{r}|\}$ (open symbols) and the standard deviation $\{\Delta

681: r^*\}$ (closed symbols) from 500 measurements for randomly drawn sequences of

682: various lengths.  The daughter sequences have been synthetically aged using the

683: following processes (with frequency): transversions (0.0001), \A:\T\rat\G:\C\

684: (0.0003), \G:\C\rat\A:\T\ (0.0005), and \CpG\rat\CpA/\TpG\

685: (0.004).

686: }

687:   \end{center}

688: \end{figure}

689:

690:

691: \begin{figure}[htb]

692: \bigskip

693: \bigskip

694:   \begin{center}

695:     \includegraphics[width=0.8\textwidth]{fig_vs_dt}

696: 	\caption{\footnotesize\label{fig3}A plot of the estimated frequencies for various

697: degrees of sequence divergence. The dotted lines give expected values of the

698: frequencies. The sequence length has been chosen to be $N=10^7$. }

699:   \end{center}

700: \end{figure}

701:

702:

703: \subsection{Uncertainty of estimates for finite sequence length}

704:

705: Due to the stochastic nature of the substitution process and due to the fact

706: that always only a finite amount of sequence data is available to estimate

707: the substitution frequencies $\{r^\star\}$, estimated frequencies will show

708: deviations from the real substitution frequencies.  In general we do not know

709: or cannot infer these real frequencies otherwise.  In order to be able to

710: analyze the uncertainty of frequency estimates from finite sequences we

711: synthetically (in silico) generate pairs of ancestral and daughter sequences

712: using known substitution processes and rates $\{\hat{r}\}$.  In the following

713: section we include just one neighbor dependent substitution process, namely the

714: \CpG-methylation deamination process, \CpG\rat\CpA/\TpG, which plays a

715: predominant role in the analysis of nucleotide substitutions in vertebrates.

716: The nucleotides of the ancestral sequences $\vec{\alpha}$ (of length $N$) have

717: been chosen randomly with equal probability from the 4 nucleotides.

718: Subsequently, the ancestral sequence was synthetically aged and we applied

719: substitutions using a Monte Carlo algorithm as described in \citet{ABH02}

720: yielding the sequence $\vec{\beta}$.  The resulting pair of sequences is then

721: analyzed using the above procedure to get estimates of the rates $\{r^\star\}$.

722: We repeated this experiment 500 times and got estimates for the means

723: $\{\bar{r}^*\}$ and standard deviation $\{\Delta r^*\}$ of these measurements.

724: In addition we computed the stationary GC-content from each set of substitution

725: frequencies \citep{ABH02}. Results of this analysis are presented in

726: Figure~\ref{fig1} where we show the mean and standard deviation of estimated

727: rates for different sequence lengths $N$.  The transversion frequencies were

728: chosen to be 0.01, the frequency of the \A:\T\rat\G:\C\ transition to be 0.03,

729: that of the \G:\C\rat\A:\T\ transition to be 0.05, and that of the

730: \CpG\rat\CpA/\TpG\ transition to be 0.4, as indicated by the dotted lines in

731: Figure~\ref{fig1}. This choice of frequencies mimics the relative strength of

732: the substitution processes as they are observed in the human genome.  As can be

733: seen the uncertainty of observed substitution frequencies correlates positively

734: with the substitution frequencies and negatively with the length of the

735: sequences.

736:

737: To further quantify these uncertainties and discuss their dependence on various

738: quantities we plotted the deviations $\{|\bar{r}^*-\hat{r}|\}$ and the standard

739: deviations $\{\Delta r^*\}$ as a function of the sequence length $N$ in

740: Figure~\ref{fig2}.  The standard deviations decrease with $1/\sqrt{N}$. In the

741: absence of neighbor dependent substitutions and for ancestral sequences with

742: equally probable nucleotides the standard deviation for reverse complement

743: symmetric frequencies can actually be calculated to be

744: %

745: \begin{equation}

746: \Delta r^*_{\alpha\beta}=

747: \left(\frac{2 r_{\alpha\beta}}{N}\right)^{1/2}

748: \label{dri}

749: \end{equation}

750: %

751: as long as all frequencies $r\ll  1$.

752: Corresponding lines are presented also in Figure~\ref{fig2} and fit the observed

753: deviations well. The deviation for neighbor dependent processes

754: such as the process \CpG\rat\CpA/\TpG\ can be computed to be of

755: the order of:

756: %

757: \begin{equation}

758: \Delta r^*_{\alpha\beta\gamma\delta}=

759: \left(\frac{8 r_{\alpha\beta\gamma\delta}}{N}\right)^{1/2}

760: \label{drii}

761: \end{equation}

762: %

763: Note, that for $r\ll 1$ these errors stem only from the stochastic nature of

764: the underlying substitutional process and are not due to approximations used

765: during our maximum likelihood analysis of the sequence pairs $\vec{\alpha}$ and

766: $\vec{\beta}$ as described in the previous section.

767:

768: The deviations of the observed from the real frequencies

769: $\{|\bar{r}^*-\hat{r}|\}$ (see Figure~\ref{fig2}) also decrease with

770: $1/\sqrt{N}$ and are always bounded from above by  $\{\Delta r^*\}$.  Note,

771: that the estimates of substitution frequencies are very precise, although we

772: used an approximation when deriving the likelihood in Eq. (\ref{eqll}).  This

773: property does not hold true for neighbor dependent processes in general. For

774: instance, we observe small (below 1\%, data not shown) but systematic

775: deviations of the estimated substitution frequencies if we include the process

776: \ApA/\TpT\rat\CpA/\TpG. In this case, one should also take into account the

777: identity and dynamics of nucleotides on next nearest neighbor sites and the

778: associated neighbor dependent processes.  One would have to introduce higher

779: order corrections in Eq. (\ref{eqll}).  This is true because of overlapping

780: initial states of the  neighbor dependent process, i.e.  two \ApA's in a

781: triplet \A\A\A.  However, such corrections do not have to be considered for the

782: \CpG\rat\CpA/\TpG\ process.  For a given \CpG, the next nearest neighbor

783: dependent process might only occur on a neighboring \CpG, which in contrast to

784: \ApA's cannot overlap with the given \CpG.  Hence correlations to the next

785: \CpG\ are even smaller, which makes the estimation of substitution frequencies

786: neglecting such correlations very precise.  In the absence of any neighbor

787: dependent process there is no approximation involved in computing the likelihood

788: in Eq. (\ref{eqll}) and therefore estimates will be asymptotically exact for

789: $N\ra\infty$.

790:

791:

792:

793: The above formulas for the standard deviation, Eqs. (\ref{dri}) and

794: (\ref{drii}), lose their validity if any one of the frequencies is of the

795: order of one. However, the standard deviations are still decreasing with

796: increasing sequence length. In Figure~\ref{fig3} we present estimated

797: frequencies from sequences of various degrees of divergence.  The substitution

798: rates have been chosen in the ratios 1:3:5:40 for the transversions, the

799: \A:\T\rat\G:\C\ transition, the \G:\C\rat\A:\T\ transition, and the

800: \CpG\rat\CpA/\TpG\ process. On the horizontal axis we plot the length of the

801: time interval over which the ancestral sequence (of length $N=10^7$) has been aged. The

802: dotted lines give the real substitution frequencies, which are the products of

803: the corresponding rates and the length of the time interval.  As long as not

804: all substitution frequencies are greater than one (to the left of the dashed

805: vertical line in Figure~\ref{fig3}) the substitution frequencies can be

806: faithfully estimated, even if single frequencies exceed one (the dashed

807: horizontal line).  If all substitution frequencies are of the order of or

808: larger than one, the estimation of substitution frequencies is not possible

809: anymore (to the right of the dashed vertical line).  In this case, more or less

810: all nucleotides underwent one or more substitution processes making it

811: impossible to estimate the frequencies of the underlying processes.

812:

813:

814: In reality however, the nucleotides in the ancestral sequence will not be

815: randomly distributed with equal probability from the 4 nucleotides (as assumed

816: above). On top of that, genomic sequences will show non-trivial dinucleotide

817: distributions, i.e. neighboring bases are not independent and the dinucleotide

818: frequencies $f_{\alpha\beta}$ will deviate from the product of nucleotide

819: frequencies $f_\alpha f_\beta$ \citep{CB95}.  Both these factors will influence

820: the deviations between the observed and the real substitution frequencies and

821: in those cases the above formulas (\ref{dri}) and (\ref{drii}) do not hold

822: anymore.  We also expect additional errors due to the presence of unaccounted

823: neighbor dependent processes.  Depending on the magnitude of the rates for such

824: processes the errors can get quite significant as discussed below. To exclude

825: the latter type of errors one actually has to try to incorporate additional

826: neighbor dependent processes and judge whether their inclusion is actually

827: relevant (as discussed in the next subsection).

828:

829: For genomic applications, it is further not possible to repeat the measurements

830: of substitution frequencies for different sets of sequences to get an estimate

831: of the typical errors.  However, one can still get estimates on the expected

832: standard deviation from bootstrapping the available data. One has to resample

833: the available data drawing randomly and with replacement $N$ pairs of aligned

834: ancestral and daughter nucleotides (keeping the information of the ancestral

835: base identity to the left and to the right) and generate a list of counts

836: $N(\alpha_1\alpha_2\alpha_3\ra\cdot\beta_2\cdot)$ which then will be used to

837: maximize the likelihood and estimate the substitution frequencies as described

838: above.  One  repeats this resampling procedure $M$ times and from the $M$

839: estimates of the substitution frequencies and stationary GC-content calculates

840: their standard deviation, which gives the statistical error due to the limited

841: amount of sequence data. We found that $M = 500$ samples are sufficient to

842: estimate those errors (data not shown).

843:

844: \subsection{Extending the model to include additional processes}

845:

846:

847: Next we address how one can extend

848: a given substitution model and

849: include additional neighbor dependent processes to maximize the potential of

850: such a model to describe the observed data.

851: With the inclusion of additional neighbor dependent processes the likelihood of

852: a model $\{r'\}$ will in any case be greater than the one of the original model $\{r\}$.

853: This is true because the models are nested and one has one more free parameter

854: to explain the given data.

855: To test whether the inclusion of a new parameter is justified we employ

856: the likelihood ratio test for nested models. Let

857: $\lambda=L_{\{r\}}/L_{\{r'\}}$ be the likelihood ratio, then $-2\log\lambda$ has

858: an asymptotic chi-square distribution with degrees of freedom equal to the

859: difference in the numbers of free parameters of the two models, which in our

860: case is one \citep{EG01}.

861:

862: In practice we extend a given substitution model in turn by one out of the

863: $4\times 4\times 3\times 2=96$ possible neighbor dependent processes.  Out of

864: those extended models we choose the best one, i.e. the one with the highest

865: likelihood $L_{\{r'\}}$.  Since the best is chosen out of a finite set of possibilities,

866: we have to account for multiple testing and use a Bonferroni

867: correction.  Hence we require that $-2\log\lambda>15$ to have significance on

868: the 5\% level\footnote{Note that $\int_0^{15}

869: \chi^2_1(x)\,dx=0.99989>1-0.05/96$}.  We confirmed this conservative threshold

870: also by simulations using sequences that have been synthetically mutated

871: according to a known model.

872:

873:

874: \section{Results}

875: \tabi

876: As a first test, we applied the described method to

877: identify and measure neighbor dependent substitution processes

878: to human genomic data. We

879: took the copies of the AluSx SINEs that have been found in a genome-wide search

880: of the human genome (release v20.34c.1 at ensembl.org from April 1st, 2004).

881: These elements are assumed to have evolved neutrally and therefore the

882: substitution process is reverse complement symmetric. Results are presented in

883: Table~\ref{tab1}.

884: In the first column of data we give estimations for the 6 neighbor independent

885: single nucleotide substitutions. We subsequently tested 48 possible

886: extensions of this simple substitution model by one additional neighbor

887: dependent substitution process together with its reverse complement symmetric process

888: (Note that in this case only 48 extensions have to be considered).

889: As

890: expected (and shown in the second column in Table~\ref{tab1}) the \CpG\ methylation

891: deamination process (\CpG\rat\CpA/\TpG) turns out to give the best improvement

892: with $-2\log\lambda=7.7\cdot 10^6$, which is clearly above the threshold of

893: $15$.  The substitution frequency of this process is about 45 times higher than

894: that of a transversion.  Extending the model from 6 to 7 parameters and

895: including the \CpG\rat\CpA/\TpG\ process, mostly affects the estimate for the

896: \G:\C\rat\A:\T\ transition, which decreases by about a factor of three.  Please also

897: note that subsequently the estimation of the stationary GC-content from those

898: rates rises from 21\% for the 6 parameter model to 34\% for the 7 parameter

899: model. This reveals that estimates of

900: substitution frequencies and

901:  the stationary nucleotide composition are

902: very much affected by the underlying substitution model.

903: Substantial deviations can be observed when

904: the substitution model does not include all relevant processes, as is the case

905: for the 6 parameter model for nucleotide substitutions in the human lineage.

906: In principle there can be even more neighbor dependent processes, which we have to

907: account for. We therefore try to incorporate an additional process

908: besides the already found one.

909:

910: The second process that needs to be included to improve the model is the

911: substitution of \CpG\rat\CpC/\GpG\ ($-2 \log\lambda=1.3\cdot 10^5$). This is

912: another \CpG\ based process and probably also triggered by the methylation of

913: cytosine. However, the substitution frequency is about 30 times smaller than

914: that of the \CpG\rat\CpA/\TpG\ process. The third process is then the

915: substitution \TpT/\ApA\rat\TpG/\CpA\ ($-2\log\lambda=9.6\cdot 10^4$). The

916: instability of the \TpT\ dinucleotide does not come as a surprise here, since

917: two consecutive thymine nucleotides tend to form a thymine photodimer

918: $\T\!<>\!\T$. This process is one of the major lesions formed in DNA during

919: exposure to UV light \citep{DZC97}.

920:

921: \smallskip

922:

923: Next we turn to the analysis of the DANA repeats in zebrafish ({\em Danio rerio}).

924: Results are presented in Table 2. Again we start with a model just comprising

925: single nucleotide transversions and transitions. As observed in human the

926: transitions occur more often than transversions and there is a strong \A:\T\ bias

927: in the single nucleotide substitutions. Zebrafish being a vertebrate also

928: utilizes methylation as an additional process to regulate gene expression. As a

929: consequence we observe a higher mutability of the \CpG\ dinucleotide due to the

930: deamination process also in zebrafish. However the substitution frequency for

931: the \CpG\rat\CpA/\TpG\ process is in zebrafish only about 8 times higher than that of

932: a transversion suggesting that the degree of methylation is generally lower

933: than in human.

934:

935: \tabii

936: \smallskip

937:

938: We also investigated non-vertebrate sequence data. As an example we

939: present here the analysis of the DNAREP1\_DM repeat in {\em Drosophila melanogaster}

940: (Table 3). The case for including neighbor dependent processes is in this case clearly

941: not as strong as for vertebrate genomes. The values of $-2\log\lambda$ are 3 orders of

942: magnitude smaller but still above threshold for the first 3 processes which are

943: chosen by our procedure to be included into a model for nucleotide

944: substitutions in fly. The first such process is the substitution \TpA\rat\TpT/\ApA.

945: Although the corresponding substitution frequency is lower than all the single

946: nucleotide transitions and transversions, the dinucleotide frequencies in the

947: stationary state deviate up to 10\% from their neutral expectation under a

948: neighbor independent substitution model (data not shown). Therefore even processes with

949: a small contribution to the overall substitutions have a large influence on the

950: observed patterns of dinucleotide frequencies or genomic signatures and

951: therefore may very well be solely responsible for the generation of such

952: patterns in different species.

953:

954: \tabiii

955:

956: \section{Conclusion}

957:

958: We presented a framework to identify the existence and measure the rates of

959: neighbor dependent nucleotide substitution processes.  We discussed the

960: extension of models of nucleotide substitutions in human and included more

961: neighbor dependent processes besides the well-known \CpG\ methylation

962: deamination process \citep{ABH02}. We could also show that the \CpG\

963: methylation deamination is the predominant substitution process in zebrafish,

964: while it does not play a role in fruit fly. We exemplified our method

965: using sequence data from one particular subfamily of repeats from these three

966: organisms. In the case of the human genome a much more thorough analysis on

967: various families of repeats has been presented in \citep{APH03}.  A similar

968: study, which would also have to include neighbor dependent substitutions, for

969: other species will further broaden our knowledge about the molecular processes

970: that are responsible for nucleotide mutations and their fixation.

971:

972: {\bf Acknowledgment}

973: We thank Nadia Singh and Dmitri Petrov (Stanford) for kindly

974: providing sequence data on the DNAREP1\_DM repeat in {\em Drosophila

975: melanogaster}.

976:

977: \newpage

978: \def\etal{{\em et~al.}}

979: \begin{thebibliography}{}

980:

981: \bibitem[Arndt \etal, 2002]{ABH02}

982: Arndt, P. F., Burge, C. B. and Hwa, T. (2002).

983: DNA Sequence Evolution with Neighbor-Dependent Mutation.

984: 6th Annual International Conference on Computational Biology RECOMB2002, Washington DC, ACM Press, KK.

985:

986: \bibitem[Arndt \etal, 2003]{APH03}

987: Arndt, P. F., Petrov, D. A. and Hwa, T. (2003).

988: Distinct changes of genomic biases in nucleotide substitution at the time of Mammalian radiation.

989: {\em Mol Biol Evol} {\bf 20}(11): 1887-96.

990:

991: \bibitem[Box, 1966]{Bo66}

992: Box, M. J. (1966).

993: A Comparison of Several Current Optimization Methods and Use of Transformations in Constrained Problems.

994: {\em Computer Journal} {\bf 9}(1): 67-77.

995:

996: \bibitem[Coulondre \etal, 1978]{Co78}

997: Coulondre, C., Miller, J. H., Farabaugh, P. J., et al. (1978).

998: Molecular basis of base substitution hotspots in Escherichia coli.

999: {\em Nature} {\bf 274}(5673): 775-80.

1000:

1001: \bibitem[Douki \etal, 1997]{DZC97}

1002: Douki, T., Zalizniak, T. and Cadet, J. (1997).

1003: Far-UV-induced dimeric photoproducts in short oligonucleotides: sequence effects.

1004: {\em Photochem Photobiol} {\bf 66}(2): 171-9.

1005:

1006: \bibitem[Ewens and Grant, 2001]{EG01}

1007: Ewens, W. J. and Grant, G. (2001).

1008: {\em Statistical methods in bioinformatics : an introduction.}

1009: New York, Springer.

1010:

1011: \bibitem[Karlin and Burge, 1995]{CB95}

1012: Karlin, S. and Burge, C. (1995).

1013: Dinucleotide relative abundance extremes: a genomic signature.

1014: {\em Trends Genet} {\bf 11}(7): 283-90.

1015:

1016: \bibitem[Karlin and Mr\'azek, 1997]{KM97}

1017: Karlin, S. and Mr\'azek, J. (1997).

1018: Compositional differences within and between eukaryotic genomes.

1019: {\em Proc Natl Acad Sci U S A} {\bf 94}(19): 10227-32.

1020:

1021: \bibitem[Karlin \etal, 1997]{KMC97}

1022: Karlin, S., Mr\'azek, J. and Campbell, A. M. (1997).

1023: Compositional biases of bacterial genomes and evolutionary implications.

1024: {\em J Bacteriol} {\bf 179}(12): 3899-913.

1025:

1026: \bibitem[Lio and Goldman, 1998]{LioGoldman}

1027: Lio, P. and Goldman, N. (1998).

1028: Models of molecular evolution and phylogeny.

1029: {\em Genome Res.}, {\bf 8}, 1233-1244.

1030:

1031: \bibitem[Lunter and Hein, 2004]{LH04}

1032: Lunter, G. and Hein, J. (2004).

1033: A nucleotide substitution model with nearest-neighbour interactions.

1034: {\em Bioinformatics} {\bf 20} Suppl 1:I216-I223.

1035:

1036: \bibitem[Press \etal, 1992]{Pr92}

1037: Press, W. H., Teukolsky, S. A., Vetterling, W. T., et al. (1992).

1038: {\em Numerical Recipes in C, The art of scientific computing.}

1039: Cambridge, Cambridge University Press.

1040:

1041: \bibitem[Razin and Riggs, 1980]{RR80}

1042: Razin, A. and Riggs, A. D. (1980).

1043: DNA methylation and gene function.

1044: {\em Science} {\bf 210}(4470): 604-10.

1045:

1046: \bibitem[Russell \etal, 1976]{Ru76}

1047: Russell, G. J., Walker, P. M., Elton, R. A., et al. (1976).

1048: Doublet frequency analysis of fractionated vertebrate nuclear DNA.

1049: {\em J Mol Biol} {\bf 108}(1): 1-23.

1050:

1051: \bibitem[Russell and Subak-Sharpe, 1977]{RS77}

1052: Russell, G. J. and Subak-Sharpe, J. H. (1977).

1053: Similarity of the general designs of protochordates and invertebrates.

1054: {\em Nature} {\bf 266}(5602): 533-6.

1055:

1056: \bibitem[Siepel and Haussler, 2004]{Haussler}

1057: Siepel, A. and Haussler, D. (2004).

1058: Phylogenetic estimation of context-dependent substitution rates by maximum likelihood.

1059: {\em Mol Biol Evol.} {\bf 21}(3):468-88.

1060:

1061:

1062:

1063:

1064: \end{thebibliography}

1065:

1066:

1067:

1068:

1069: \end{document}

1070:

1071:

1072:

1073: