0606:q-bio0606016/arxiv.tex

1: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% LaTeX file using REVTex v4 (Manuscript)

2: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Use American English

3: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% (0,1) = (twocolumns, onecolumn)

4: \def \manuflag {0}

5:

6: \ifnum \manuflag = 0

7:  \documentclass[pre,twocolumn,showpacs,amsmath,amssymb]{revtex4}

8:  \usepackage{psfig}

9:  \usepackage{rotating}

10:  \usepackage{epsfig}

11:  \usepackage{delarray}

12:  \usepackage{graphicx}

13:  \usepackage{dcolumn}

14:  \usepackage{bm}

15:  \def \Title{

16: Large-scale Oscillation of

17: Structure-Related DNA Sequence Features in Human Chromosome 21

18: }

19:  \def \figsize{6.85cm}

20:  \def \figname{\footnotesize \sc FIG.}

21:  \def \tblname{\footnotesize \sc TABLE}

22:  \sloppy

23:  \newcommand{\SEC}{\section}

24:  \newcommand{\SUBSEC}{\subsection}

25: \else

26:  \documentclass[pre,onecolumn,showpacs,amsmath,amssymb]{revtex4}

27:  \usepackage{psfig}

28:  \usepackage{epsfig}

29:  \usepackage{delarray}

30:  \usepackage{graphicx}

31:  \usepackage{dcolumn}

32:  \usepackage{bm}

33:  \usepackage{rotating}

34:  \def \Title{

35: Large-scale Oscillation of

36: Structure-Related DNA Sequence Features in Human Chromosome 21 }

37:  \def \figsize{12.6cm}

38:  \renewcommand{\baselinestretch}{2.4}

39: \fi

40:

41: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% abstract

42: \def \Abstract{

43: Human chromosome 21 is the only chromosome in the human

44: genome that exhibits oscillation of (G+C)-content

45: with a cycle length of hundreds of kilobases (500 kb near

46: the right telomere).

47: We aim at establishing the existence of similar periodicity in

48: structure-related sequence features in order to

49: relate this (G+C)\% oscillation to other biological phenomena.

50: The following quantities are shown to oscillate with

51: the same 500kb periodicity in human chromosome 21:

52: binding energy calculated by two sets of dinucleotide-based

53: thermodynamic parameters,

54: AA/TT and AAA/TTT bi-/tri-nucleotide density,

55: 5'-TA-3' dinucleotide density, and signal for 10/11-base

56: periodicity of AA/TT or AAA/TTT.

57: These intrinsic  quantities are related to structural

58: features of the double helix of DNA molecules, such

59: as base-pair binding, untwisting/unwinding,  stiffness,

60: and a putative tendency for nucleosome formation.

61: }

62:

63:

64:

65: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% bulk of paper

66: \ifnum \manuflag = 0

67:  \begin{document}

68:  \title{\Title}

69:  \author{Wentian Li}

70:  \email{wli@nslij-genetics.org}

71:  \affiliation{The Robert S. Boas Center for Genomics and Human Genetics,

72: Feinstein Institute for Medical Research, North Shore LIJ Health System,

73: 350 Community Drive, Manhasset, NY, USA.

74: 	      	}

75:  \author{Pedro Miramontes}

76:  \email{pmv@fciencias.unam.mx}

77:  \affiliation{

78: Departamento de Matem\'{a}ticas,

79: Facultad de Ciencias, Universidad Nacional Aut\'{o}noma de M\'{e}xico,

80: Circuito Exterior, Ciudad Universitaria, 04510 M\'{e}xico, D.F. and \\

81: Departamento de Matem\'{a}ticas, Universidad de Sonora,

82: Encinas y Rosales, Hermosillo 83000 Sonora, M\'{e}xico.

83:  }

84:  \begin{abstract}

85:    \Abstract

86:  \end{abstract}

87:  \pacs{87.10.+e, 87.14.Gg, 87.15.Cc, 02.50.-r, 02.50.Tt, 89.75.Da, 89.75.Fb, 05.40.-a}

88:  %% \keywords{Suggested keywords if desired}

89:  \maketitle

90: \else

91:  \begin{document}

92:  \title{\Title}

93:  \author{Wentian Li}

94:  \email{...@...}

95:  \affiliation{...}

96:  \author{Pedro Miramontes}

97:  \email{pmv@fciencias.unam.mx}

98:  \affiliation{,

99:  }

100:  \begin{abstract}

101:    \Abstract

102:  \end{abstract}

103:  \pacs{87.10.+e, 02.50.-r, 05.40.-a  \hfill {\tt Thu Sep  5 15:36:39 EDT 2002}}

104:  %% \keywords{Suggested keywords if desired}

105: \maketitle

106: \fi

107:

108: \SEC{Introduction}

109:

110: DNA sequences are full of features at small, intermediate,

111: and large scales \citep{li97}. At short distances, there is strong

112: periodicity-of-three-nucleotide signal in protein-coding

113: regions (but absent in non-coding regions) \citep{period3},

114: and  a weaker but ubiquitous 10-11 bases signal in many genomes

115: \citep{period10}.  At intermediate length scale, there are

116: {\sl Alu} sequences of about 300 bases long \citep{alu},

117: and nucleosome-forming sequence of around 120-200 bases

118: \citep{nucleosome}. At large length scales, the most well known

119: features are the existence of alternating (G+C)\%-high and

120: (G+C)\%-low ``isochores" \citep{isochore}, and the distribution of sine

121: wave that prefers long-wavelength signals (the so-called

122: ``1/f" spectra when viewed in the spectral space

123: \citep{1f}).

124:

125: A recent survey of (G+C)\% fluctuation in all human

126: ({\sl Homo sapiens}) chromosomes

127: revealed that chromosome 21 exhibits a unique

128: 500 kilobases (kb) oscillation in (G+C)\% \citep{li-holste04}.

129: This oscillation starts around the position of 43.5 million

130: bases (Mb) and lasts five cycles (with five (G+C)\%-low

131: and six (G+C)\%-high peaks). No other human chromosomes exhibit

132: similar periodicities with such a long cycle length.

133:

134: Human chromosome 21 has other special properties as

135: compared to the rest of the human chromosomes. First, it is

136: the shortest human chromosome. Second, its (G+C)\% increases

137: stepwise from left (centromeric) to right (telomeric, i.e.,

138: close to the end of the chromosome),

139: with three distinct ``super" isochore regions (see, e.g. Fig.3

140: of \citep{isochore}(b)). The 500kb oscillation of

141: (G+C)\% described above appears in the third region

142: with the highest (G+C)\% and the highest gene content.

143: Third, the failure rate in segregating homologous chromosomes

144: during meiosis is, among surviving infants, higher

145: for human chromosome 21 than for any other human chromosome.

146: When this happens, the surviving infant typically carries

147: three copies of chromosome 21 (``trisomy 21") instead

148: of two copies \citep{trisomy}.

149: The resulting Down syndrome is the leading cause

150: of birth defects \citep{patterson}.

151:

152:

153: The uniqueness of the 500kb oscillation in (G+C)\% in

154: human chromosome 21 and highest trisomy rate in chromosome

155: 21 among surviving infants motivated us to speculate on the

156: possibility that this 500kb oscillation might be somewhat

157: related to the trisomy risk. An argument is that

158: the periodicity in (G+C)\% is a basis for certain structural

159: periodicity, which in turn might interfere with the

160: proper segregation of chromatids during meiosis.

161: One intriguing observation is that for younger mothers

162: with trisomy 21, the placement of meiotic exchange

163: tends to be telomeric \citep{sherman05}.

164:

165:

166: In this paper, we examine whether sequence-based

167: structure features oscillate with the 500kb

168: cycle length in the telomeric region of human chromosome 21.

169: The structural features we focus on include

170: helix binding energy, flexibility or

171: stiffness in secondary structure of DNA helix,

172: tendency for nucleosome formation based on periodicity

173: of 10-11 bases, and a tendency for anchoring DNA loops.

174:

175: Note that only the intrinsic quantities are

176: calculable here: chromatin structures that depend on

177: extrinsic protein factors require experimental

178: data, and this evidence is not yet conclusive. Also

179: note that the sequence-to-structure connections in

180: some models are based on simplified assumptions, and

181: our calculation may only give a partial picture of

182: DNA helix structure properties. Our hope is for this

183: work to contribute to the eventual establishment

184: of a sequence-function connection.

185:

186: \begin{table}[t]

187: \begin{center}

188: \begin{tabular}{lcccc}

189: 5' /\ 3'  & G & A & T & C \\

190: \hline

191: G & 2.75/1.84 & 1.41/1.30 & 1.13/1.44 & 2.82/2.24 \\

192: A &  (see CT) & 1.66/1.00 & 1.19/0.88 &  (see GT) \\

193: T &  (see CA)  & 0.76/0.58 &  (see AA) & (see GA) \\

194: C & 3.28/2.17 & 1.80/1.45 & 1.35/1.28 &  (see GG) \\

195: \hline

196: \end{tabular}

197: \end{center}

198: \caption{

199: \label{tab:01}

200: Free energy ($\Delta G$) of helix binding in

201: nearest neighbor models at 37$^\circ$C with

202: Breslauer/SantaLucia parameters (kcal/mol).

203: }

204: \end{table}

205:

206: \begin{figure}[!tpb]%figure1

207: \begin{turn}{-90}

208:         \resizebox{!}{8.9cm}{\includegraphics{fig1-energy.eps}}

209: \end{turn}

210: \caption{

211: (A) (G+C)\% calculated in non-overlapping windows

212: of size 2kb;

213: (B) free energy $\Delta G$ in nearest neighbor model

214: with Breslauer's parameter values;

215: (C) free energy $\Delta G$ in nearest neighbor model

216: with SantaLucia's parameter values.

217: The $x$-axis is the chromosome position, in Mb.

218: }\label{fig:01}

219: \end{figure}

220:

221:

222:

223: \SEC{DNA binding energy and stability}

224:

225: It has been well known that basepairs with strong bases (G-C)

226: are more stable than basepairs with weak bases (A-T),

227: due to the presence of three versus two hydrogen bonds.

228: This single-base model of binding energy has been extended

229: to dinucleotide models where a dinucleotide step (two

230: neighboring basepairs) contributes an amount to the

231: total binding energy \citep{tinoco}. There are two commonly used

232: parameter value sets in the dinucleotide model: one by Breslauer

233: and his colleagues \citep{breslauer86} and another

234: summarized by SantaLucia, also known as the unified

235: parameters \citep{santalucia98}. The nearest-neighbor

236: free energy $\Delta G$ parameter values at 37$^\circ$C are listed

237: in Table \ref{tab:01} for all 16 dinucleotide steps.

238:

239: A 3.9Mb sequence from the NCBI Build 35 (May'2004, hg17)

240: of human chromosome 21 is downloaded from the UCSC genome browser

241: \cite{ucsc}, starting from the position 43Mb and

242: ending at the right telomere, of position 46.944323Mb.

243:

244: Figure \ref{fig:01} shows the (G+C)\% and averaged binding

245: free energy $\Delta G$ calculated by the dinucleotide model

246: with Breslauer's and SantaLucia's parameters, using non-overlapping

247: windows of 2kb. It is clear that binding energy is higher in

248: (G+C)\%-high peak regions, thus also oscillates with the 500kb

249: periodicity. However, the magnitude of oscillation is larger

250: in the free energy based on Breslauer's parameters than that

251: using SantaLucia's parameters (range of (1.51-2.23) versus

252: (1.18-1.69)).

253:

254: Among the values of $\Delta G$ in Table \ref{tab:01}, the highest helix binding

255: energies are usually associated with two strong bases (G or C),

256: with the exception of 1.84 kcal/mol for GG/CC dinucleotide in

257: SantaLucia's parameters. The lowest binding energies tend to be

258: associated with two weak bases (A or T), but with the exceptions

259: of AA/TT (1.66 kcal/mol) and AT (1.19 kcal/mol)

260: dinucleotides in Breslauer's parameters.  The difference

261: between the two sets of parameters is the largest for

262: CG (1.11 kcal/mol, 40.7\% of the average between the two parameters),

263: GG/CC (0.91 kcal/mol, 39.7\%), and AA/TT (0.66 kcal/mol,

264: 49.6\%) dinucleotides. With these exceptions,

265: one may not automatically assume binding energy to fluctuate

266: the same way as (G+C)\%. What Figure \ref{fig:01}

267: has shown is that the difference between the single-base

268: model (counting the number of weak and strong bases) and the

269: dinucleotide models is not large enough to destroy the 500kb

270: oscillation in binding energy.

271:

272: The correlation coefficient between windowed energy values

273: and the (G+C)\% values was calculated (the first two lines in

274: Table \ref{tab:02}). These correlation values show that

275: SantaLucia parameters are more correlated with the GC\% than Breslauer's

276: parameters (correlation coefficient of 0.998 versus 0.981

277: using the 2kb window). By examining the two sets

278: of free energy parameters in Table \ref{tab:01} closely, it

279: is clear that the difference can be traced to the fact that

280: Breslauer's parameters assign a higher energy value for two

281: AT-rich dinucleotides than SantaLucia's parameters: 5'-AA-3'

282: and 5'-AT-3'.  It is still debatable whether Breslauer's

283: or SantaLucia's parameters reflect the {\sl in vivo}

284: situation of helix local thermodynamics \citep{mira03},

285: and the issue may not be settled soon \citep{melo05}.

286:

287:

288: \begin{figure}[!tpb]%figure2

289: \begin{turn}{-90}

290:         \resizebox{!}{8.5cm}{\includegraphics{fig2-dimer.eps}}

291: \end{turn}

292: \caption{(A): Density of AA/TT in non-overlapping windows of size 2kb;

293: (B) AAA/TTT density;

294: (C) 5'-YR-3' density;

295: (D) 5'-TA-3' density. }\label{fig:02}

296: \end{figure}

297:

298:

299: \SEC{DNA flexibility, stiffness, and untwisting}

300:

301: Without an actual measurement of the mechanical properties

302: of the DNA polymer, we rely on dinucleotides and trinucleotides

303: that are known to be related to the DNA flexibility, stiffness,

304: and untwisting to study the variation of these properties

305: along the chromosome.  For example, the AA..A/TT..T tract is

306: known to have a stiff configuration because of an

307: additional hydrogen bond between adjacent pairs along

308: two diagonally located bases \citep{nelson87}. This

309: hypothesis had been confirmed for AA/TT dinucleotide by

310: their limited range of roll and slide values \citep{el97}.

311: We use AA/TT dinucleotide and AAA/TTT trinucleotide density

312: in a moving window as an indicator for the intrinsic

313: stiffness of the double helix.

314:

315: Unlike A/T-tracts, 5'-pyrimidine-purine-3' (5'-YR-3')

316: steps can adopt two possible configurations, and thus

317: they are flexible \citep{call04}. In a simplified

318: approach, we use 5'-YR-3' density as an indicator for

319: flexibility of the DNA double helix.

320:

321: Among the four 5'-YR-3' steps (CA, CG, TA, TG),

322: 5'-TA-3' has the weakest basepair binding.

323: The biconfiguration nature and weak binding make 5'-TA-3'

324: one of the best candidates for untwisting initiation sites of

325: double helix \citep{call04}. We use the

326: 5'-YR-3' and 5'-TA-3' density in moving windows

327: as an indicator for an untwisting potential.

328:

329: Figure \ref{fig:02} shows densities of the above

330: mentioned di-/tri-nucleotide: AA/TT\%, AAA/TTT \%,

331: 5'-YR-3' \%, and 5'-TA-3' \%.  The 500kb oscillation

332: in the first two densities is clearly seen. The

333: 5'-YR-3' density does not exhibit any regular oscillation

334: of 500kb, whereas 5'-TA-3' density does oscillate

335: with the 500kb wavelength.

336:

337: Note that the signal we are measuring by the

338: di-/tri-nucleotide density is different from

339: that of CpG island \citep{cpg}. In detecting CpG islands,

340: the density of 5'-CG-3' dinucleotide is normalized

341: by the square of GC\% (the observed over expected, or O/E),

342: and the presence of a signal requires the 5'-CG-3'

343: density to be at least a quadratic function of GC\%.

344: In fact, it was known that the O/E signal increases

345: with the GC\%, indicating a cubic relationship between

346: 5'-CG-3' density and GC\% in CpG islands \citep{matsuo93}.

347: Here only the ``linear" signal was measured.

348:

349: \begin{figure}[!tpb]%figure3

350: \begin{turn}{-90}

351:         \resizebox{!}{8.5cm}{\includegraphics{fig3-p10.eps}}

352: \end{turn}

353: \caption{

354: (A) Density of AA-10b-AA/TT-10b-TT in non-overlapping window

355: of size 2kb;

356: (B) AAA-10b-AAA/TTT-10b-TTT density;

357: (C) VWG-10b-VWG density, where VWG indicates [not-T][A/T][G]

358: or its reverse complement triplet [C][A/T][not-A].

359: }\label{fig:03}

360: \end{figure}

361:

362:

363: \SEC{Periodicity-10-base signal and nucleosome forming

364: potential}

365:

366:

367: It has been known that almost all genomes contain an AA-10b-AA/TT-10b-TT

368: signal \citep{period10}, where the ``10b"

369: can be 10 or 11 bases for individual cases, but after

370: averaging becomes a real number between 10 and 11.

371: This periodic signal is also present in the aligned

372: nucleosome-forming sequences \citep{sat86}.

373: We count the number of occurrences of AA-10-AA,

374: TT-10-TT, AA-11-AA, and TT-11-TT in a moving window,

375: then convert to density (similar calculation for

376: AAA-10b-AAA/TTT-10b-TTT density is also carried out).

377: As a crude approximation, this density is used to

378: indicate the region's tendency for nucleosome

379: formation.

380:

381: Figure \ref{fig:03} (A)(B) show the AA-10b-AA/TT-10b-TT

382: and AAA-10b-AAA/TTT-10b-TTT density in a 2kb non-overlapping

383: moving window. The 500kb oscillation is clearly seen,

384: and may support the idea that the nucleosome forming

385: strength also oscillates with that wavelength in this region.

386:

387: However, it was suggested that the regular spacing of 10

388: bases of another triplet motif, [not-T][A/T][G],

389: can be considered as a nucleosome formation signal

390: (called ``VWG" signal) \citep{vwg}. We count the occurrence

391: of [not-T][A/T][G]-10/11-[not-T][A/T][G] and

392: [C][A/T][not-A]-10/11-[C][A/T][not-A] in a moving window,

393: whose density is plotted in Figure \ref{fig:03}(C). This

394: VWG signal does not exhibit a 500kb oscillation in this

395: region.

396:

397: In a more sophisticated study based on discriminant

398: analysis, a composite measure called ``nucleosome

399: formation potential" (NFP) was proposed \cite{nfp}.

400: As shown in Fig.1 of \citep{vino-nfp}, this NFP value

401: decreases with GC\%. Since AA-10b-AA/TT-10b-TT and

402: AAA-10b-AAA/TTT-10b-TTT density also decreases with

403: GC\%, the two measures are consistent.  The VWG signal,

404: however, does not have a simple relationship with

405: GC\%, though mostly it increases with GC\%.

406: Whether one can predict nucleosome forming potential

407: of a DNA sequence accurately, and whether such

408: an intrinsic potential really exists, seems still to be open

409: questions, and it is possible that neither the AA-10b-AA/TT-10b-TT

410: nor the VWG-10b-VWG signal presents the whole

411: picture on nucleosome formation.

412:

413:

414:

415:

416: \SEC{Discussion and Conclusion}

417:

418: Besides the helix structure related intrinsic features,

419: the scaffold/matrix-attached-regions (S/MARs) are another

420: pattern that can be determined from the DNA sequence.

421: S/MARs are the base/foundation of DNA loops \cite{mirk}, and

422: S/MAR sequences can be obtained from S/MAR databases

423: such as the one developed at the University of G\"{o}ttingen

424: \citep{liebich}.

425:

426: By examining the top 34 most frequent hexamers in S/MAR

427: sequences (Table 2 of \citep{liebich}(b)), it is clear

428: that S/MARs are AT-rich \citep{saitoh94}. In fact,

429: only 11 hexamers contain one G or C, ranked 10,

430: 16--18, 21, 22, 25--27, 29, 30 in the top34, and the rest

431: consist exclusively of A and T \citep{liebich}.

432: It is not surprising that S/MAR hexamer density

433: (percentage of hexamers that match the top 34

434: most frequent S/MAR hexamer motifs and their reverse complement)

435: also oscillates with a 500kb wavelength in this

436: region \citep{li-gene}.

437:

438: The existence of 500kb oscillation in most of the

439: quantities we have examined indicates that these

440: structure-related sequence features are correlated with GC\%.

441: To assess this correlation directly, Figure \ref{fig:05} shows the

442: scatter plot of ten quantities used in Figures 1-3 versus

443: GC\%,  and Table \ref{tab:02} lists correlation coefficients of

444: all pairs among these eleven quantities. Figure \ref{fig:05} and

445: Table \ref{tab:02} have confirmed that these structure-based

446: sequence features are highly correlated (test results of these

447: correlation coefficients are all significant with the exception

448: of a few pairs involving 5'-YR-3'), and GC\% can be

449: used as a good surrogate for these features (with

450: the exception of 5'-YR-3').

451:

452: Density of 5'-YR-3' is not correlated with other quantities

453: studied (4 correlation coefficients are not significant at

454: the $p$-value=0.01 level, and 5 other correlation coefficients,

455: though significant, are rather weak). The next group of

456: quantities that have weak correlation with others are

457: the AAA-10b-AAA/TTT-10b-TTT and VWG-10b-VWG densities,

458: with several correlation coefficients in the 0.4-0.5 range.

459:

460: \begin{table*}[!t]

461: \begin{center}

462: \begin{tabular}{lcccccccccc}

463:  & GC & Breslauer & SantaLucia & 5'YR3' & AA & AAA & 5'TA3' & AA10AA & AAA10AAA  & VWG10VWG   \\

464: \hline

465: Breslauer & 0.981&        &       &  & & & & & &  \\

466: SantaLucia& 0.998&  0.985&        & & & & & & &  \\

467: 5'YR3'   & -0.133& -0.195& -0.103&  & & & & & &  \\

468: AA       & -0.960& -0.896& -0.950& -0.042* &  & & & & &  \\

469: AAA      & -0.917& -0.844& -0.903& -0.044* & 0.974& &  & & &  \\

470: 5'TA3'   & -0.946& -0.915& -0.947&  0.183  & 0.912& 0.858&  & & &  \\

471: AA10AA   & -0.864& -0.791& -0.851& -0.043* & 0.922& 0.956& 0.810&  & &  \\

472: AAA10AAA & -0.610& -0.545& -0.595& -0.064  & 0.683& 0.789& 0.557& 0.866&  & \\

473: VWG10VWG & 0.526 & 0.398 & 0.514 & 0.279 & -0.657 & -0.637 & -0.574 & -0.601 & -0.458  &\\

474: S/MAR    & -0.881& -0.807& -0.868& -0.002* & 0.929& 0.967& 0.854& 0.947& 0.810 & -0.617\\

475: \hline

476: \end{tabular}

477: \end{center}

478: \caption{

479: \label{tab:02}

480: Correlation coefficients of eleven quantities obtained from non-overlapping

481: 2kb windows: GC\%, binding energy by Breslauer's model and SantaLucia's

482: model, densities of 5'-YR-3', AA/TT, AAA/TTT,

483: 5'-TA-3', AA-10b-AA/TT-10b-TT, AAA-10b-AAA/TTT-10b-TTT,

484: VWG-10b-VWG, and density of top S/MAR hexamers.

485: Testing of correlation coefficient equal to zero is significant

486: at $p$-value=0.01 level for all pairs except those marked by

487: the stars (YR-AA $p$=0.064, YR-AAA $p$=0.049, YR-AA10AA $p$=0.056,

488: and YR-SMAR $p$=0.93).

489: }

490: \end{table*}

491:

492: \begin{figure}[!tpb]%figure5

493: \begin{turn}{-90}

494:         \resizebox{8cm}{8cm}{\includegraphics{fig4-correlation.eps}}

495: \end{turn}

496: \caption{

497: Scatter plots of ten quantities versus GC\%:

498: (A) helix binding energy by Breslauer's model;

499: (B)  binding energy by SantaLucia's model;

500: (C) AA/TT (upper) and AAA/TTT (lower, using the symbol ') densities;

501: (D) 5'-YR-3' density;

502: (E) 5'-TA-3' densities;

503: (F) AA-10b-AA/TT-10b-TT (upper) and AAA-10b-AAA/TTT-10b-TTT

504: (lower, using the symbol ') densities;

505: (G) VWG-10b-VWG densities;

506: and (H) density of the top 34 hexamers in

507: known S/MAR sequences and their reverse complements.

508: The corresponding values for randomized sequences

509: are also shown (grey circles). The correlation coefficient

510: between these quantities and GC\% is indicated on the plot.

511: }\label{fig:05}

512: \end{figure}

513:

514: One may ask the question on whether the correlation between

515: these quantities and GC\% is ``trivial", because these

516: patterns are either dominated by GC-rich or AT-rich

517: di-/tri-nucleotides. This question can be addressed by

518: examining the GC\%-preserving random sequences. In

519: Figure \ref{fig:05} the ten structure-related quantities

520: for the random sequences are shown as a function of GC\%

521: (circles). Several interesting observations can be made.

522:

523: \begin{itemize}

524: \item

525: Binding energies calculated on real DNA sequences are very

526: close to those calculated on randomized sequences. However,

527: the binding energy of real DNA sequences is slightly

528: lower than that of random sequences at high GC\% values.

529: A similar observation was made in \citep{vino} (Fig.1(C)

530: of  \citep{vino}) on the ``relative" thermostability.

531: \item

532: The A/T-tract density is higher in real DNA sequences

533: than randomized sequences, mainly in the AT-rich ranges.

534: It indicates that DNA sequences are more rigid than

535: randomized sequences in general.

536: \item

537: The biconfigurational 5'-YR-3' dinucleotide density is lower

538: in real DNA sequences than randomized sequences

539: (with some exceptions for DNA segments with GC\%

540: around 50\%-60\%). It indicates DNA sequences are

541: less flexible than randomized sequences.

542: \item

543: The 5'-TA-3' density is lower in DNA sequences than

544: random sequences, making them less susceptible

545: for helix untwistings.

546: \item

547: The 10/11 bp periodicity signal for AA/TT, AAA/TTT,

548: and VWG triplet has a stronger presence in real DNA

549: sequences than random sequences, probably making them more

550: likely to form nucleosomes.

551: \item

552: The S/MAR potential is higher in DNA sequences than

553: randomized sequences.

554: \end{itemize}

555:

556:

557: From these observations, one may expect that the

558: binding energy faithfully follows the same variation

559: and oscillation as GC\%; A/T tract density, TA density,

560: AA-10b-AA signal, and S/MAR signal more or less

561: follow the same oscillation as GC\%; YR density,

562: AAA-10b-AAA signal, and VWG-10b-VWG signal may not

563: follow the same oscillation as GC\%.

564:

565:

566: It has been known that GC\% conveys biological information \citep{isochore}(c).

567: For example, the Giemsa-dark chromosome staining band, or G-band,

568: is AT-rich, whereas the Giemsa-light band or R-band is GC-rich

569: \citep{ikemura88}, or by a new hypothesis, AT-rich and

570: GC-rich relative to its neighboring bands \citep{gojobori02}.

571: Gene density is another example, with GC-rich regions being

572: relatively gene-rich  \citep{mouch91}.  Fluorescence microscopy

573: images show that chromosomes inside the nucleus are organized

574: in a radial order, called ``chromosome territories"

575: \citep{cremer}. The GC-rich, gene-rich regions

576: tend to be located towards the center of the nucleus \citep{saccone02}, and

577: the corresponding chromatin compartments are more

578: ``open" \citep{cremer}.

579:

580:

581: Without experimental evidence, it is difficult to speculate what

582: type of high-order chromatin structure this 500kb oscillation

583: might cause. According to the chromatin structure model

584: summarized in \citep{filipski90}, there could be multiple

585: levels of folding in the hierarchical structure of a chromatid:

586: Watson-Crick's double helix (10bp for one helix turn),

587: nucleosomes ($\sim$ 200bp per unit), solenoids (6

588: nucleosome units per helix turn, or 1.2kb) that twist

589: to form a loop of $\sim$ 50kb, rosettes that consist

590: of 6 loops ($\sim$ 300 kb), coils that consist of 30

591: rosettes ($\sim$ 9Mb), and finally the chromatids

592: consist of, for a medium sized human chromosome,

593: $\sim$ 10 coils. Within the framework of

594: this model, our 500kb oscillation matches roughly the size

595: of a rosette. However, we should caution that the

596: exact figure for the size of these hierarchical units

597: is illustrative, and the model itself may be too much

598: based on {\sl in vitro} experiments, and on inactive

599: cells \citep{vanholde95}.

600:

601: The unique large-scale oscillation of GC\% in human

602: chromosome 21 studied in this paper and in \citep{li-holste04}

603: can be further analyzed from several perspectives.

604: One is about its evolutionary preservation in other

605: species. Due to the high degree of similarity between

606: human and chimpanzee, it is natural to assume that the

607: same 500kb oscillation would also be present in chimpanzee

608: genome. Indeed, it was shown that 500kb oscillation

609: exists in chimpanzee chromosome 22 \cite{li-gene}. On

610: the other hand, no such 500kb oscillation was observed

611: in mouse genome. It would be interesting to check its

612: existence in species in-between mouse and human.

613:

614: It was suggested for the yeast genome \citep{filipski02}

615: that the transcription direction of open reading frame (ORF)

616: points from GC-rich to GC-poor regions. Combined with

617: the general picture that DNA loops are anchored in AT-rich

618: regions whereas the GC-rich part of the loop is exposed

619: to the outside, transcription likely starts from the

620: top of DNA loop to loop base. Although the length scale

621: between two GC-rich regions analyzed in the yeast genome

622: ($\sim$10kb) is much shorter than the GC\% oscillation length

623: studied here, there is some evidence

624: that gene density on the two opposite strands alternates

625: in this region (Fig.5(c) of \cite{li-holste04}). A more

626: careful analysis is needed to confirm the similarity

627: between human and yeast genome, and the regular oscillation

628: of GC\% discussed here provides an ideal test ground.

629:

630:

631: In conclusion, the 500kb oscillation in GC\% as reported in

632: \citep{li-holste04} was shown to lead to similar oscillation

633: of some intrinsic structure-related patterns. We

634: hypothesize that a regular oscillation in chromatin structure

635: with the same wavelength is also present in this

636: region.

637:

638:

639: \SEC*{Acknowledgements}

640:

641: W.L. acknowledges the financial support at

642: The Robert S. Boas Center for Genomics and Human Genetics.

643: P.M. thanks the support of DGAPA project IN111003.

644:

645:

646:

647: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% bibliography \bibitem [] {}

648: \begin{thebibliography}{99}

649:

650: \bibitem[Li, 1997]{li97}

651: W. Li,

652: Comp. \& Chem., {\bf 21}, 257-272 (1997).

653:

654: \bibitem[Fickett, 1982]{period3}

655: J.W. Fickett,

656: Nucl. Acids Res., {\bf 10}, 5303-5318 (1982);

657: V.R. Chechetkin, L.A. Knizhnikova, A.Y. Turygin,

658: J. Biomol. Struct. Dyn., {\bf 12}, 271-299 (1994);

659: G. Gutierrez and A. Marin,

660: J. Theo. Biol., {\bf 167}, 413-414 (1994);

661: S. Tiwari, S. Ramachandran, A. Bhattacharya, S. Bhattacharya, R. Ramaswamy,

662: Comp Appl. Biosc., {\bf 13},  263-270 (1997);

663: W. Lee and L. Luo,

664: Phys. Rev. E, {\bf 56}, 848-851 (1997).

665:

666: \bibitem[Widom, 1996]{period10}

667: E.N. Trifonov and J.L. Sussman,

668: Proc. Natl. Acad. Sci. USA , {\bf 77}, 3816-3820

669: (1980);

670: J. Widom,

671: J. Mol. Biol., {\bf 259}, 579-588 (1996);

672: V.R. Chechetkin and V.V. Lobzin,

673: J. Biomol. Struct. Dyn., {\bf 15}, 937-947 (1998);

674: H. Herzel, O. Weiss, E.N. Trifonov,

675: Bioinformatics, {\bf 15}, 187-193 (1999);

676: E. Larsabal and  A. Danchin,

677: BMC Bioinformatics, {\bf 6}, 206 (2005).

678:

679:

680:

681: \bibitem[Schmid and Jelinek, 1982]{alu}

682: C.W. Schmid and W.R. Jelinek,

683: Science, {\bf 216}, 1065-1070 (1982);

684: C. Willard, H.T. Nguyen, C.W. Schmid,

685: J. Mol. Evo. , {\bf 26}, 180-186 (1987);

686: M.A. Batzer and P.L. Deininger,

687: Nature Rev. Genet., {\bf 3}, 370-379 (2002).

688:

689: \bibitem[Hewish and Burgoyne, 1973]{nucleosome}

690: D. Hewish  and L. Burgoyne,

691: Bioch. Biophy. Res. Comm., {\bf 52}, 504-510 (1973);

692: H.R. Widlund, et al.,

693: J. Mol. Biol., {\bf 267}, 807-817 (1997);

694: J. Widom,

695: Q. Rev. Biophys., {\bf 34}, 269-324 (2001).

696:

697:

698: \bibitem[Macaya {\it et~al}., 1976]{isochore}

699: (a)

700: G. Macaya, J.P. Thiery, G. Bernardi,

701: J. Mol. Biol., {\bf 108}, 237-254 (1976);

702: (b) G. Bernardi,

703: Gene, {\bf 276}, 3-13 (2001);

704: (c) G. Bernardi,

705: {\sl Structural and Evolutionary Genomics}

706: (Elsevier, 2004).

707:

708: \bibitem[Li and Kaneko, 1992]{1f}

709: W. Li and K. Kaneko,

710: Europhys. Lett., {\bf 17}, 655-660 (1992);

711: R.F. Voss,

712: Phys. Rev. Lett., {\bf 68}, 3805-3808 (1992);

713: X. Lu, Z. Sun, H. Chen, Y. Li,

714: Phys. Rev. E, {\bf 58}, 3578-3584 (1998);

715: A. Fukushima, et al.,

716: Gene, {\bf 300}, 203-211 (2002);

717: W. Li and D. Holste,

718: Fluct. Noise Lett., {\bf 4}, L453-L464 (2004);

719: W. Li and D. Holste,

720: Phys. Rev. E, {\bf 71}, 041910 (2005).

721:

722: \bibitem[Li and Holste, 2004]{li-holste04}

723: W. Li and D. Holste,

724: Comput. Biol. and Chem., {\bf 28}, 393-399 (2004).

725:

726: \bibitem[Hassold and Hunt, 2001]{trisomy}

727: T. Hassold and P. Hunt,

728: Nature Rev. Genet., {\bf 2}, 280-291 (2001).

729:

730: \bibitem[Patterson and Costa, 2005]{patterson}

731: S.E. Antonarakis, et al.,

732: Nature Rev. Genet., {\bf 5}, 725-738 (2004);

733: D. Patterson  and A.C.S.  Costa,

734: {\sl ibid.}, {\bf 6}, 137-147 (2005).

735:

736: \bibitem[Lamb {\it et~al}., 2005]{sherman05}

737: N.E. Lamb, et al.,

738: Am. J. Hum. Genet., {\bf 76}, 91-99 (2005).

739:

740: \bibitem[DeVoe and Tinoco, 1962]{tinoco}

741: H. DeVoe and I. Tinoco Jr. ,

742: J. Mol. Biol., {\bf 4}, 500-517 (1962).

743:

744: \bibitem[Breslauer {\it et~al}., 1986]{breslauer86}

745: K.J. Breslauer, R. Frank, H. Bl\"{o}cker, L.A. Marky,

746: Proc. Natl. Acad. Sci. USA , {\bf 83}, 3746-3750 (1986).

747:

748: \bibitem[SantaLucia, 1998]{santalucia98}

749: J. SantaLucia Jr.,

750: Proc. Natl. Acad. Sci., {\bf 95}, 1460-1465 (1998).

751:

752: \bibitem[UCSC Genome Browser]{ucsc}

753: Genome browser from the University of

754: California at Santa Cruz (UCSC) Genome

755: Bioinformatics Site.

756: URL: {\sl http://genome.ucsc.edu/}.

757:

758: \bibitem[Miramontes and Cocho,  2003]{mira03}

759: P. Miramontes  and G. Cocho,

760: Physica A, {\bf 321}, 577-586 (2003).

761:

762: \bibitem[Panjkovich and Melo, 2005]{melo05}

763: A. Panjkovich and F. Melo,

764: Bioinformatics, {\bf 21}, 711-722 (2005).

765:

766: \bibitem[Nelson {\it et~al}.,  1987]{nelson87}

767: H.C.M. Nelson, J.T. Finch, B.F. Luisi, A. Klug ,

768: Nature, {\bf 330}, 221-226 (1987).

769:

770: \bibitem[El Hassan and Calladine, 1997]{el97}

771: M.A. El Hassan and C.R. Calladine,

772: Phil. Trans. Royal Soc. London A, {\bf 355}, 43-100 (1997).

773:

774: \bibitem[Calladine {\it et~al}., 2004]{call04}

775: C.R. Calladine, H.R. Drew, B.F. Luisi, A.A. Travers,

776: {\sl Understanding DNA -- The Molecule and How It Works}

777: 3rd edition (Elsevier, 2004).

778:

779: \bibitem[Gardiner-Garden and Frommer, 1987]{cpg}

780: M. Gardiner-Garden and M. Frommer,

781: J. Mol. Biol., {\bf 196}, 261-282 (1987);

782: F. Larsen, G. Gundersen, R. Lopez, H. Prydz,

783: Genomics, {\bf 13}, 1095-1107 (1992).

784:

785: \bibitem[Matsuo {\it et~al}., 1993 ]{matsuo93}

786: K. Matsuo, et al.,

787: Somatic Cell and Mol. Genet., {\bf 19}, 535-543 (1993).

788:

789: \bibitem[Satchwell {\it et~al}., 1986]{sat86}

790: S.C. Satchwell, H.R. Drew, A.A. Travers,

791: J. Mol. Biol., {\bf 191}, 659-675 (1986).

792:

793: \bibitem[Baldi {\it et~al}., 1996]{vwg}

794: P. Baldi, S. Brunak, Y. Chauvin, A. Krogh,

795: J.  Mol. Biol. , {\bf 263}, 503-510 (1996);

796: A. Stein and M. Bina,

797: Nucl. Acids Res. ,  {\bf 27}, 848-853 (1999).

798:

799: \bibitem[Levitsky {\it et~al}., 2001]{nfp}

800: V.G. Levitsky, O.A. Podkolodnaya, N.A. Kolchanov, N.L. Podkolodny,

801: Bioinformatics, {\bf 17}, 998-1010 (2001);

802: {\sl ibid.}, {\bf 17}, 1062-1064 (2001).

803:

804: \bibitem[Vinogradov, 2005]{vino-nfp}

805: A.E. Vinogradov,

806: Nucl. Acids Res. , {\bf 33}, 559-563 (2005).

807:

808:

809: \bibitem[Mirkovitch {\it et~al}.,  1984]{mirk}

810: J. Mirkovitch, M.E. Mirault, U.K. Laemmli,

811: Cell,  {\bf 39}, 223-232 (1984).

812:

813:

814: \bibitem[Liebich {\it et~al}., 2002a]{liebich}

815: (a) I. Liebich, J. Bode, M. Frisch, E. Wingender,

816: Nucl. Acids Res. ,  {\bf 30}, 307-309 (2002);

817: (b) I. Liebich, J. Bode, I. Reuter, E. Wingender,

818: {\sl ibid.}, {\bf 30}, 3433-3442 (2002).

819:

820: \bibitem[Saitoh and Laemmli, 1994]{saitoh94}

821: Y. Saitoh and U.K. Laemmli,

822: Cell, {\bf 76}, 609-622 (1994).

823:

824: \bibitem[Li, 2006]{li-gene}

825: W. Li, Gene, submitted (2006).

826:

827: \bibitem[Vinogradov, 2003]{vino}

828: A.E. Vinogradov,

829: Nucl. Acids Res., {\bf 31}, 1838-1844 (2003).

830:

831: \bibitem[Ikemura and Aota, 1988]{ikemura88}

832: D.E. Comings,

833: Ann. Rev. Genet., {\bf 12}, 25-46 (1978);

834: T. Ikemura  and S. Aota,

835: J. Mol. Biol., {\bf 203}, 1-13 (1988).

836:

837: \bibitem[Niimura and Gojobori, 2002]{gojobori02}

838: Y. Niimura  and T. Gojobori,

839: Proc. Natl. Acad. Sci. USA, {\bf 99}, 797-802 (2002).

840:

841: \bibitem[Mouchiroud {\it et~al}., 1991]{mouch91}

842: D. Mouchiroud, et al.,

843: Gene, {\bf 100}, 181-187 (1991);

844: S. Zoubak, O. Clay, G. Bernardi,

845: {\sl ibid.}, {\bf 174}, 95-102 (1996).

846:

847: \bibitem[Cremer {\it et~al}., 2000]{cremer}

848: N. Sadoni, et al.,

849: J. Cell Biol., {\bf 146}, 1211-1226 (1999);

850: T. Cremer,  et al.,

851: Critical Review in Euk. Gene Exp., {\bf 10}, 179-212 (2000);

852: R.R. Williams,

853: Trends in Genet., {\bf 19}, 298-302 (2003).

854:

855: \bibitem[Saccone {\it et~al}., 2002]{saccone02}

856: S. Saccone, C. Federico, G. Bernardi,

857: Gene, {\bf 300}, 169-178 (2002).

858:

859:

860: \bibitem[Filipski {\it et~al}., 1990]{filipski90}

861: J. Filipski,  et al.,

862: EMBO J., {\bf 9}, 1319-1327 (1990).

863:

864: \bibitem[van Holde and Zlatanova, 1995]{vanholde95}

865: K. Van Holde and J. Zlatanova,

866: J. Biol. Chem., {\bf 270}, 8373-8376 (1995).

867:

868:

869: \bibitem[Filipski and Mucha, 2002]{filipski02}

870: J. Filipski and M. Mucha,

871: Gene, {\bf 300}, 63-68 (2002);

872: A. Marin, M. Wang, G. Gutierrez,

873: Gene, {\bf 333}, 151-155 (2004).

874:

875:

876:

877: \end{thebibliography}

878:

879:

880: \end{document}

881:

882:

883: