0103:math0103007/TR.tex

1: \documentclass[11pt]{article}

2: \usepackage{latexsym}

3: \usepackage{amssymb}

4:

5: \setlength{\oddsidemargin}{-5mm}

6: \setlength{\evensidemargin}{-5mm}

7: \setlength{\topmargin}{-1cm}

8: \setlength{\textheight}{22.5cm}

9: \setlength{\textwidth}{17cm}

10:

11: % \renewcommand{\baselinestretch}{1.2}  % double space

12: % \input paper_defns.tex

13: % \input{/users/yiannis/latex/paper_defns}

14: % \input{../latex/paper_defns}

15: % \input{/users/yiannis/latex/paper_defns}

16: % \input{/home/mean/u21/yiannis/latex/paper_defns}

17: % \input{/v0/yiannis/latex/paper_defns}

18: % \input{/sccm0/yiannis/latex/paper_defns}

19: % \input{/tmp_mnt/home-georgep/yiannis/latex/paper_defns}

20:

21: \def\proof{{\sc Proof. }}

22: \def\qed{\hfill$\Box$}

23:

24: \newcommand{\Qp}{\mbox{\boldmath $Q$}}

25: \newcommand{\Xp}{\mbox{\boldmath $X$}}

26: \newcommand{\Xb}{\mbox{\bf X}}

27: \newcommand{\Xps}{\mbox{\scriptsize\boldmath $X$}}

28: \newcommand{\xp}{\mbox{\boldmath $x$}}

29: \newcommand{\Xtp}{\mbox{\boldmath $\tilde{X}$}}

30: \newcommand{\xinp}{\mbox{\boldmath $\xi_n$}}

31: \newcommand{\Yp}{\mbox{\boldmath $Y$}}

32: \newcommand{\yp}{\mbox{\boldmath $y$}}

33: \newcommand{\Zp}{\mbox{\boldmath $Z$}}

34: \newcommand{\orig}{\bf 0}

35: \newcommand{\Equiv}{\mbox{$\Leftrightarrow$}}

36: \newcommand{\weakly}{\mbox{$ \;\stackrel{\cal D}{\longrightarrow}\; $}}

37: \newcommand{\equald}{\mbox{$ \;\stackrel{\cal D}{=}\; $}}

38: \newcommand{\bydef}{\mbox{$ \;\stackrel{\triangle}{=}\; $}}

39: \newcommand{\home}{\mbox{$\!\mathtt{\sim}$}}

40: \newcommand{\eqexp}{\mbox{$ \;\stackrel{\cdot}{=}\; $}}

41: \newcommand{\leqa}{\mbox{$ \;\stackrel{(a)}{\leq}\; $}}

42: \newcommand{\leqb}{\mbox{$ \;\stackrel{(b)}{\leq}\; $}}

43: \newcommand{\leqc}{\mbox{$ \;\stackrel{(c)}{\leq}\; $}}

44: \newcommand{\leqd}{\mbox{$ \;\stackrel{(d)}{\leq}\; $}}

45: \newcommand{\leqe}{\mbox{$ \;\stackrel{(e)}{\leq}\; $}}

46: \newcommand{\geqa}{\mbox{$ \;\stackrel{(a)}{\geq}\; $}}

47: \newcommand{\geqb}{\mbox{$ \;\stackrel{(b)}{\geq}\; $}}

48: \newcommand{\geqc}{\mbox{$ \;\stackrel{(c)}{\geq}\; $}}

49: \newcommand{\geqd}{\mbox{$ \;\stackrel{(d)}{\geq}\; $}}

50: \newcommand{\geqe}{\mbox{$ \;\stackrel{(e)}{\geq}\; $}}

51: \newcommand{\eqa}{\mbox{$ \;\stackrel{(a)}{=}\; $}}

52: \newcommand{\eqb}{\mbox{$ \;\stackrel{(b)}{=}\; $}}

53: \newcommand{\eqc}{\mbox{$ \;\stackrel{(c)}{=}\; $}}

54: \newcommand{\eqd}{\mbox{$ \;\stackrel{(d)}{=}\; $}}

55: \newcommand{\eqe}{\mbox{$ \;\stackrel{(e)}{=}\; $}}

56: \newcommand{\eqf}{\mbox{$ \;\stackrel{(f)}{=}\; $}}

57: \newcommand{\eqg}{\mbox{$ \;\stackrel{(g)}{=}\; $}}

58: \newcommand{\eqh}{\mbox{$ \;\stackrel{(h)}{=}\; $}}

59: \newcommand{\eqi}{\mbox{$ \;\stackrel{(i)}{=}\; $}}

60: \newcommand{\eqj}{\mbox{$ \;\stackrel{(j)}{=}\; $}}

61: \newcommand{\eqk}{\mbox{$ \;\stackrel{(k)}{=}\; $}}

62: \newcommand{\eql}{\mbox{$ \;\stackrel{(\ell)}{=}\; $}}

63: \newcommand{\approxa}{\mbox{$ \;\stackrel{(a)}{\approx}\; $}}

64: \newcommand{\approxb}{\mbox{$ \;\stackrel{(b)}{\approx}\; $}}

65: \newcommand{\approxc}{\mbox{$ \;\stackrel{(c)}{\approx}\; $}}

66: \newcommand{\approxd}{\mbox{$ \;\stackrel{(d)}{\approx}\; $}}

67: \newcommand{\appeq}{\mbox{$ \stackrel{\cdot}{=} $}}

68: \newcommand{\appleq}{\mbox{$ \stackrel{\mathbf{\cdot}}{\leq} $}}

69: \newcommand{\subD}{_{{}_D}}

70: \newcommand{\subDi}{_{{}_{D_i}}}

71: \newcommand{\RL}{{\mathbb R}}

72: \newcommand{\NN}{{\mathbb N}}

73: \newcommand{\IN}{{\mathbb Z}}

74: \newcommand{\RN}{{\mathbb Q}}

75: \newcommand{\IND}{{\mathbb I}}

76: \newcommand{\BBP}{{\mathbb P}}

77: \newcommand{\BBQ}{{\mathbb Q}}

78: \newcommand{\BBM}{{\mathbb M}}

79: \newcommand{\Ind}{\mbox{\rm I$\!$I}}

80: \newcommand{\PB}{\mbox{\boldmath $P$}}

81: \newcommand{\QB}{\mbox{\boldmath $Q$}}

82: \newcommand{\QBn}{\mbox{\boldmath $Q_n$}}

83: \newcommand{\QTn}{\mbox{$\widetilde{Q}_n$}}

84: \newcommand{\PR}{\mbox{\rm Pr}}

85: \newcommand{\VAR}{\mbox{\rm Var}}

86: \newcommand{\COV}{\mbox{\rm Cov}}

87: \newcommand{\signs}{\mbox{\scriptsize sign}}

88: \newcommand{\essinf}{\mathop{\rm ess\, inf}}

89: \newcommand{\esssup}{\mathop{\rm ess\, sup}}

90: \newcommand{\Ahat}{\mbox{$\hat{A}$}}

91: \newcommand{\Ahatn}{\mbox{$\hat{A}^n$}}

92: \newcommand{\Ahatnsq}{\mbox{$\hat{A}^{n^2}$}}

93: \newcommand{\Ahatk}{\mbox{$\hat{A}^k$}}

94: \newcommand{\Ahatnd}{\mbox{$\hat{A}^{n^d}$}}

95: \newcommand{\ahat}{\mbox{$\hat{a}$}}

96: \newcommand{\Ahats}{\mbox{\scriptsize $\hat{A}$}}

97: \newcommand{\ahats}{\mbox{\scriptsize $\hat{a}$}}

98: \newcommand{\Nhat}{\hat{N}}

99: \newcommand{\hatN}{\hat{N}}

100: \newcommand{\Phatn}{\mbox{$\hat{P}_n$}}

101: \newcommand{\phatn}{\mbox{\scriptsize$\hat{P}_n$}}

102: \newcommand{\sphatn}{\mbox{\tiny$\hat{P}_n$}}

103: \newcommand{\xhat}{\mbox{$\hat{x}$}}

104: \newcommand{\Xhat}{\mbox{$\hat{X}$}}

105: \newcommand{\yhat}{\mbox{$\hat{y}$}}

106: \newcommand{\calH}{\mbox{${\cal H}$}}

107: \newcommand{\calLn}{\mbox{${\cal L}_n$}}

108: \newcommand{\calM}{\mbox{${\cal M}$}}

109: \newcommand{\calR}{\mbox{${\cal R}$}}

110: \newcommand{\calX}{\mbox{${\cal X}$}}

111: \newcommand{\calXhat}{\mbox{$\hat{\cal X}$}}

112: \newcommand{\Dmin}{\mbox{$D_{\rm min}$}}

113: \newcommand{\Dinf}{\mbox{$D_{\rm min}^{(\infty)}$}}

114: \newcommand{\rhomin}{\mbox{$\rho_{\rm min}$}}

115: \newcommand{\rhomax}{\mbox{$\rho_{\rm max}$}}

116: \newcommand{\Lmax}{\mbox{$L_{\rm max}$}}

117: \newcommand{\Dmax}{\mbox{$D_{\rm max}$}}

118: \newcommand{\Dbar}{\mbox{$\overline{D}$}}

119: \newcommand{\dmax}{\mbox{$d_{\rm max}$}}

120: \newcommand{\Dmaxs}{\mbox{\scriptsize $D_{\rm max}$}}

121: \newcommand{\Dav}{\mbox{$D_{\rm av}$}}

122: \newcommand{\Dminn}{\mbox{$D_{\rm min}^{(n)}$}}

123: \newcommand{\Dmink}{\mbox{$D_{\rm min}^{(k)}$}}

124: \newcommand{\Dminone}{\mbox{$D_{\rm min}^{(1)}$}}

125: \newcommand{\Dmaxn}{\mbox{$D_{\rm max}^{(n)}$}}

126: \newcommand{\Davn}{\mbox{$D_{\rm av}^{(n)}$}}

127: \newcommand{\DminP}{\mbox{$D_{\rm min}^{P,Q}$}}

128: \newcommand{\DmaxP}{\mbox{$D_{\rm max}^{P,Q}$}}

129: \newcommand{\Dminmu}{\mbox{$D_{\rm min}^{\mu,\nu}$}}

130: \newcommand{\Dmaxmu}{\mbox{$D_{\rm max}^{\mu,\nu}$}}

131: \newcommand{\Dminmun}{\mbox{$D_{\rm min}^{\mu_n,\nu_n}$}}

132: \newcommand{\Dmaxmun}{\mbox{$D_{\rm max}^{\mu_n,\nu_n}$}}

133: \newcommand{\Davmu}{\mbox{$D_{\rm av}^{\mu,\nu}$}}

134: \newcommand{\Rmin}{\mbox{$R_{\rm min}$}}

135: \newcommand{\LA}{\mbox{$\Lambda$}}

136: \newcommand{\Lbar}{\mbox{$\bar{\Lambda}$}}

137: % \newcommand{\la}{\mbox{$\lambda$}}

138: \newcommand{\lab}{\mbox{$\bar{\lambda}$}}

139: \newcommand{\las}{\mbox{\scriptsize$\lambda$}}

140: \newcommand{\iid}{\mbox{i.i.d.}\!}

141: \newcommand{\psipm}{\psi^{\pm}}

142: \newcommand{\limnd}{\lim_{

143: 	            \mbox{\scriptsize

144: 			 $\begin{array}{c}

145: 				n\to\infty\\

146: 				D\downarrow 0

147: 			  \end{array}$

148: 	     		 }

149: 		 	 }

150: 		   }

151: \newcommand{\limsupnd}{\limsup_{

152:                     \mbox{\scriptsize

153:                          $\begin{array}{c}

154:                                 n\to\infty\\

155:                                 D\downarrow 0

156:                           \end{array}$

157:                          }

158:                          }

159:                    }

160: \newcommand{\liminfnd}{\liminf_{

161:                     \mbox{\scriptsize

162:                          $\begin{array}{c}

163:                                 n\to\infty\\

164:                                 D\downarrow 0

165:                           \end{array}$

166:                          }

167:                          }

168:                    }

169: \newcommand{\argmin}{\mathop{\rm arg\, min}}

170:

171: \newcommand{\la}{\lambda}

172:

173: \def\be{\begin{eqnarray}}

174: \def\ee{\end{eqnarray}}

175: \def\ben{\begin{eqnarray*}}

176: \def\een{\end{eqnarray*}}

177:

178:

179: \input{epsf}

180:

181: \title{Source Coding, Large Deviations,\\

182: and Approximate Pattern Matching}

183:

184: \author{A. Dembo \and I. Kontoyiannis}

185:

186: \date{\today}

187:

188: \begin{document}

189: \bibliographystyle{plain}

190: \maketitle

191:

192: \thispagestyle{empty}

193: \setcounter{page}{-2}

194:

195: \footnotetext[1]{

196: A. Dembo is with

197: the Departments of

198: Mathematics and of

199: Statistics,

200: Stanford University,

201: Stanford, CA 94305.

202: Email: {\tt amir@stat.stanford.edu}

203: Web: {\tt www-stat.stanford.edu/\home amir}

204: }

205:

206: \footnotetext[2]{

207: I.\ Kontoyiannis is with the Division

208: of Applied Mathematics, Brown University,

209: Box F, 182 George St., Providence, RI 02912, USA.

210: Email: {\tt yiannis@dam.brown.edu}

211: Web: {\tt www.dam.brown.edu/people/yiannis/}

212: [Permanent address:

213: Department of Statistics,

214: Purdue University,

215: 1399 Mathematical Sciences Building,

216: W.~Lafayette, IN 47907-1399, USA.]

217: }

218:

219: \footnotetext[3]{

220: Amir Dembo was supported in part

221: by NSF grant \#DMS-0072331.

222: I. Kontoyiannis was supported in part

223: by NSF grant \#0073378-CCR.}

224:

225: \bigskip

226:

227: %Y changed dedication as we agreed

228: % \centerline{\it Dedicated to the memory of Aaron Wyner, a valuable

229: % friend and colleague.}

230:

231: % Dedicated to the memory of a dear

232: % friend and colleague, Aaron Wyner.

233:

234: % \bigskip

235:

236: \newpage

237:

238: {\bf Abstract --- }

239: We present a development of parts of rate-distortion

240: theory and pattern-matching algorithms for lossy data

241: compression, centered around a lossy version of the

242: Asymptotic Equipartition Property (AEP). This treatment

243: closely parallels the corresponding development in

244: lossless compression, a point of view that was advanced

245: in an important paper of Wyner and Ziv in 1989.

246: In the lossless case we review how the AEP underlies

247: the analysis of the Lempel-Ziv algorithm by viewing it

248: as a random code and reducing it to the idealized

249: Shannon code. This also provides

250: information about the redundancy of the Lempel-Ziv

251: algorithm and about the asymptotic behavior of

252: several relevant quantities.

253:

254: In the lossy case we

255: give various versions of the statement of the

256: generalized AEP and we outline the general

257: methodology of its proof via large deviations.

258: Its relationship with Barron and Orey's

259: generalized AEP is

260: also discussed.

261: The lossy AEP is applied to: (i)~prove strengthened

262: versions of Shannon's direct source coding theorem

263: and universal coding theorems;

264: (ii)~characterize the performance of

265: ``mismatched'' codebooks in lossy

266: data compression;

267: (iii)~analyze the performance of

268: pattern-matching algorithms for lossy

269: compression (including Lempel-Ziv schemes);

270: (iv)~determine the first order asymptotics

271: of waiting times (with distortion) between

272: stationary processes; (v)~characterize the

273: best achievable rate of ``weighted''

274: codebooks as an optimal sphere-covering

275: exponent. We then present a refinement to

276: the lossy AEP and use it to: (i)~prove second order

277: (direct and converse) lossy source coding theorems,

278: including universal coding theorems; (ii)~characterize

279: which sources are quantitatively easier to compress;

280: (iii)~determine the second order

281: asymptotics of waiting times between

282: stationary processes;

283: (iv)~determine the precise asymptotic

284: behavior of longest match-lengths

285: between stationary processes.

286: Extensions to random fields are also given.

287:

288: \medskip

289:

290: {\bf Index Terms --- } Rate-distortion theory,

291: pattern-matching, large deviations,

292: data compression.

293:

294: \bigskip

295:

296: %%% TOC COMMAND

297: % \newpage

298: \tableofcontents

299:

300: \newpage

301: \section{Introduction}

302: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

303:

304: \subsection{Lossless Data Compression}

305: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

306: It is probably only a slight exaggeration to

307: say that the central piece of mathematics

308: in the proof of almost any lossless coding

309: theorem is provided by the

310: Asymptotic Equipartition Property, or AEP.

311: Suppose we want to (losslessly)

312: compress a message

313: $X_1^n=(X_1,X_2,\ldots,X_n)$

314: generated by a stationary

315: memoryless source $\Xp=\{X_n\;;\;n\geq 1\}$

316: where each $X_i$ takes values in the

317: finite alphabet $A$

318: (much more general situations will be considered

319: later). For this source, the AEP states that

320: as $n\to\infty$

321: \be

322: -\frac{1}{n}\log_2 P^n(X_1^n)\to H

323: \;\;\;\;\mbox{in probability}

324: \label{eq:shannonAEP}

325: \ee

326: where $P$ is the common distribution of the

327: independent and identically distributed ($\iid$)

328: random variables $X_i$, $P^n$ denotes the

329: (product) joint distribution of $X_1^n$, and

330: $H=E[-\log_2 P(X_1)]$ is

331: the entropy rate of the source ---

332: see Shannon's original paper

333: \cite[Theorem~3]{shannon:48} or

334: Cover and Thomas' text \cite[Chapter~4]{cover:book}.

335: [Here and throughout the paper,

336: $\log_2$ denotes the logarithm taken

337: to base 2, and $\log$ denotes

338: the natural logarithm.]

339: From (\ref{eq:shannonAEP}) we can

340: immediately extract some useful

341: information: It implies that

342: when $n$ is large the message

343: $X_1^n$ will most likely have

344: probability at least as high

345: as $2^{-n(H+\epsilon)}$:

346: \be

347: P^n(X_1^n)\geq 2^{-n(H+\epsilon)}

348: 	\;\;\;\;\mbox{with high probability.}

349: \label{eq:tocompare}

350: \ee

351: But there cannot be many high-probability messages.

352: In fact, there can be at most $2^{n(H+\epsilon)}$

353: messages with $P^n(X_1^n)\geq 2^{-n(H+\epsilon)}$,

354: so we need approximately $2^{nH}$ representative

355: messages from the source $\Xp$ in order to cover

356: our bets (with high probability).

357: If we let ${\cal T}_n$ be the

358: set of high-probability

359: strings $x_1^n\in A^n$ having

360: $P^n(x_1^n)\geq 2^{-n(H+\epsilon)}$,

361: then with high probability we can

362: correctly represent the source output

363: $X_1^n$ by an element of ${\cal T}_n$.

364: Since there are no more than

365: $2^{n(H+\epsilon)}$ of them,

366: we need no more than $nH$ bits

367: to correctly encode $X_1^n$.

368:

369: \paragraph{Shannon's Random Code.}

370: Another way to extract information

371: from (\ref{eq:shannonAEP}) is as follows.

372: The fact that for large $n$ we typically have

373: $P^n(X_1^n)\approx 2^{-nH}$ also means that

374: if we independently generate another random

375: string, say $Y_1^n$, from the same distribution

376: as the source, the probability that $X_1^n$

377: is the same as $Y_1^n$ is about $2^{-nH}$.

378: Suppose that instead of using the strings in

379: ${\cal T}_n$ above as our representatives

380: for the source, we decided to independently

381: generate a collection of random strings

382: $Y_1^n$ from the distribution $P^n$; how

383: many would we need? Given a source string

384: $X_1^n$, the probability that any one

385: of the $Y_1^n$ matches it is

386: $\approx 2^{-nH}$, so in order to

387: have high probability of success

388: in representing $X_1^n$ without error

389: we should choose approximately

390: $2^{n(H+\epsilon)}$ random strings $Y_1^n$.

391: Therefore, whether we choose the set of

392: representatives systematically or

393: randomly, we always need about

394: $2^{nH}$ strings in order to be able

395: to encode $X_1^n$ losslessly with high

396: probability. Note that the randomly

397: generated set ${\cal T}_n$ is nothing

398: but Shannon's random codebook

399: \cite{shannon:59} specialized to the

400: case of lossless compression.

401:

402: \paragraph{Idealized Lempel-Ziv Coding.}

403: In 1989, in a very influential paper

404: \cite{wyner-ziv:1}, Wyner and Ziv took

405: the above argument several steps further.

406: Aiming to ``obtain

407: insight into the workings of [...] the

408: Lempel-Ziv data compression algorithm,''

409: they considered the following coding

410: scenario: Suppose that an encoder and a

411: decoder both have available to them

412: a long database, say an infinitely

413: long string $Y_1^\infty=(Y_1,Y_2,\ldots)$

414: that is independently generated from

415: the same distribution as the source.

416: Given a source string $X_1^n$ to

417: be transmitted, the

418: encoder looks for the

419: first appearance of $X_1^n$ in the

420: database (assuming, for now,

421: that it does appear somewhere).

422: Let $W$ denote the position of

423: this first appearance, that is,

424: let $W$ be the smallest integer

425: for which

426: $Y_W^{W+n-1}=(Y_W,Y_{W+1},\ldots,Y_{W+n-1})$

427: is equal to $X_1^n$.

428: Then all the encoder has to do is

429: to tell the decoder the value of

430: $W$; the decoder can read off the string

431: $Y_W^{W+n-1}$ and recover $X_1^n$

432: perfectly. This description can be

433: given using

434: (cf.\ \cite{elias}\cite{wyner-ziv:2})

435: no more than

436: \be

437: \ell(X_1^n)=\log_2 W + O(\log_2\log_2 W)

438: 	\;\;\;\;\mbox{bits.}

439: \label{eq:elias1}

440: \ee

441:

442: How good is this scheme?

443: First note that, for any given source

444: string $X_1^n$, the random variable

445: $W$ records the first ``success'' in a

446: sequence of trials (``Is $Y_1^n=X_1^n$?,''

447: ``Is $Y_2^{n+1}=X_1^n$?,''

448: and so on),

449: each of which has probability

450: of success $p=P^n(X_1^n)$. Although

451: these trials are not independent,

452: for large $n$ they are almost independent

453: (in a sense that

454: will be made precise below), so the

455: distribution of $W$ is close to

456: a geometric with parameter

457: $p=P^n(X_1^n)$.

458: For long strings $X_1^n$ (i.e., for

459: large $n$) $p$ is small,

460: and $W$ is typically close to its

461: expected value, which is approximately

462: equal to the mean of a geometric

463: random variable

464: with parameter $p$, namely $1/p$. But the

465: AEP tells us that, when $n$ is large,

466: $p=P^n(X_1^n)\approx 2^{-nH}$, so we

467: expect $W$ to  be typically around $2^{nH}$.

468: Hence, from (\ref{eq:elias1}) the

469: description length $\ell(X_1^n)$

470: of $X_1^n$ will be, to first order,

471: $$\ell(X_1^n)

472: \approx -\log_2 P^n(X_1^n)

473: \approx nH

474: 	\;\;\;\mbox{bits, with high probability.}$$

475: This shows that the above scheme is asymptotically

476: optimal, in that its limiting compression ratio

477: is equal to the entropy.

478:

479: \paragraph{Practical Lempel-Ziv Coding.}

480: The Lempel-Ziv algorithm

481: \cite{ziv-lempel:1}\cite{ziv-lempel:2} and

482: its many variants (see, e.g.,

483: \cite[Ch.~8]{bell:cleary:witten}) are some

484: of the most successful data compression

485: algorithms used in practice. Roughly speaking,

486: the main idea behind these algorithms is to

487: use the message's own past as a database

488: for future encoding. Instead of looking

489: for the first match in an infinitely long

490: database, in practice the encoder looks

491: for the longest match in a database of

492: fixed length. The analysis in

493: \cite{wyner-ziv:1} of the idealized

494: scheme described above was the first

495: step in providing a probabilistic

496: justification for the optimality

497: of the actual practical algorithms.

498: Subsequently, in \cite{wyner-ziv:3}

499: and \cite{wyner-ziv:2} Wyner and Ziv

500: established the asymptotic optimality

501: of the Sliding-Window (SWLZ) and

502: the Fixed-Database (FDLZ) versions

503: of the algorithm.

504:

505:

506: \subsection{Lossy Data Compression}

507: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

508:

509: A similar development to the one outlined

510: above can be given in the case of lossy

511: data compression, this time centered around

512: a lossy analog of the AEP \cite{my:thesis}.

513: To motivate this

514: discussion we look at Shannon's original

515: random coding proof of the (direct) lossy

516: source coding theorem \cite{shannon:59}.

517:

518: \paragraph{Shannon's Random Code.}

519: Suppose we want to describe the output

520: $X_1^n$ of a memoryless source,

521: with distortion $D$ or less with respect

522: to a family of single-letter distortion

523: measures $\{\rho_n\}$.

524: Let $Q_n^*$ be the optimum reproduction

525: distribution on $\Ahatn$, where

526: $\Ahat$ is the reproduction alphabet.

527: Shannon's random coding

528: argument says that we should

529: construct a codebook ${\cal T}_n$

530: of $2^{n(R(D)+\epsilon)}$ codewords

531: $Y_1^n$ generated $\iid$ from $Q_n^*$,

532: where $R(D)$ is the rate-distortion

533: function of the source (in bits).

534: The proof that $2^{n(R(D)+\epsilon)}$ codewords

535: indeed suffice is based on the following

536: result, Lemma~1 in \cite{shannon:59}.

537:

538: \medskip

539:

540: {\em Shannon's ``Lemma~1'':}

541: For $x_1^n\in A^n$ let $B(x_1^n,D)$ denote

542: the distortion-ball of radius $D$ around

543: $x_1^n$, i.e., the collection of all

544: reproduction strings $y_1^n\in\Ahatn$ with

545: $\rho_n(x_1^n,y_1^n)\leq D$.

546: When $n$ is large:\footnote{The

547: notation in Shannon's statement is

548: slightly different, and he considers

549: the more general case of ergodic sources.

550: For the sake of clarity we restrict

551: attention here to the $\iid$ case.}

552: \be

553: Q_n^*(B(X_1^n,D))\geq 2^{-n(R(D)+\epsilon)}

554: 	\;\;\;\;\mbox{with high probability.}

555: \label{eq:lemma1}

556: \ee

557:

558: \medskip

559:

560: In the proof of the coding theorem

561: this

562: lemma plays the same role that

563: the AEP played in the lossless case;

564: notice the similarity between

565: (\ref{eq:lemma1}) and its analog

566: (\ref{eq:tocompare}) in the lossless case.

567:

568: Let's fix a source string $X_1^n$ to

569: be encoded. The probability that $X_1^n$

570: matches any one of the codewords

571: $Y_1^n$ in ${\cal T}_n$ is

572: $$\Pr\{\rho_n(X_1^n,Y_1^n)\leq D\,|\,X_1^n\}

573: =

574: \Pr\{Y_1^n\in B(X_1^n,D)\,|\,X_1^n\}

575: =

576: Q_n^*(B(X_1^n,D))$$

577: and by the lemma this probability is

578: at least $2^{-n(R(D)+\epsilon)}$.

579: Therefore, with $2^{n(R(D)+\epsilon)}$

580: independent codewords to choose from,

581: we have a good chance of finding

582: a match with distortion $D$ or less.

583:

584: \paragraph{Generalized AEP and Applications.}

585: A stronger and more general version

586: of Lemma~1 will be our starting point

587: in this paper. In the following section

588: we will prove a {\em generalized AEP}:

589: For any product measure

590: $Q^n$ on $\Ahatn$

591: \be

592: -\frac{1}{n}\log Q^n(B(X_1^n,D)) \to R_1(P,Q,D)

593: 	\;\;\;\;\mbox{w.p.1}

594: \label{eq:firstDAEP}

595: \ee

596: where

597: $R_1(P,Q,D)$ is a (non-random) function

598: of the distributions $P$ and $Q$ and

599: of the distortion level $D$.

600: [We will later prove several

601: variants of (\ref{eq:firstDAEP})

602: under much weaker assumptions.]

603:

604: Like the AEP in the lossless case,

605: the generalized AEP and its refinements

606: find numerous applications in data

607: compression, universal data compression,

608: and in general pattern-matching questions.

609: Many of these applications were inspired

610: by the treatment in Wyner and Ziv's 1989

611: paper \cite{wyner-ziv:1}.  A (very

612: incomplete) sample of subsequent

613: work in the Wyner-Ziv spirit

614: includes the papers

615: 	\cite{steinberg-gutman}\cite{luczak-szpankowski}\cite{

616: 	yang-kieffer:1}\cite{kontoyiannis-lossy1-1}

617: 	on lossy data compression,

618: 	and

619: 	\cite{luczak-szpankowski}\cite{

620: 	dembo-kontoyiannis}\cite{yang-zhang:99c}

621: 	on pattern-matching.

622:

623: Aaron Wyner himself remained active in this

624: field for the following ten years, and his

625: last paper \cite{wyner-ziv-wyner}, co-written

626: with J.~Ziv and A.J.~Wyner,

627: was a review paper on this subject.

628: In the present paper we review the corresponding

629: developments in the lossy case, and in the process

630: we add new results (and some new proofs of

631: recent results) in an attempt to present a more

632: complete picture.

633:

634: \subsection{Central Themes, Paper Outline}

635: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

636:

637: In Section~2 we give an extensive discussion of

638: the generalized AEP. By now there are numerous

639: different proofs under different assumptions,

640: and we offer a streamlined approach to the most

641: general versions using techniques from large

642: deviation theory (cf.

643: \cite{yang-kieffer:1}\cite{dembo-kontoyiannis}%

644: \cite{chi-it:01}\cite{chi-AP:01}

645: and Bucklew's earlier work

646: \cite{bucklew:87}\cite{bucklew:88}). We also

647: discuss the relationship

648: of the generalized AEP

649: with the classical extensions

650: of the AEP (due to Barron \cite{barron:1}

651: and Orey \cite{orey:85})

652: to processes with densities.

653: We establish a formal connection

654: between these two by looking at the limit

655: of the distortion level $D\downarrow 0$.

656:

657: In Section~3 we develop applications

658: of the generalized AEP to a number

659: of related problems.

660: We show how the generalized AEP

661: can be used to determine the

662: asymptotic behavior of Shannon's

663: random coding scheme, and we

664: discuss the role of mismatch

665: in

666: lossy data compression.

667: We also determine the first order

668: asymptotic behavior of

669: waiting times and longest

670: match-lengths between stationary

671: processes.  The main ideas used

672: here are strong approximation

673: \cite{kontoyiannis-jtp} and

674: duality \cite{wyner-ziv:1}.

675: We present strengthened versions

676: of Shannon's direct lossy source coding

677: theorem (and of a corresponding universal

678: coding theorem), showing that {\em almost all}

679: random codebooks achieve essentially

680: the same compression performance.

681: A lossy version of the Lempel-Ziv

682: algorithm is recalled, which

683: achieves optimal compression

684: performance (asymptotically)

685: as well as polynomial

686: complexity at the encoder.

687: We also discuss how the classical

688: source coding problem

689: can be generalized to a question about

690: weighted sphere-covering. The answer

691: to this question gives, as

692: corollaries, Shannon's coding theorems,

693: Stein's lemma in hypothesis testing,

694: and some converse concentration inequalities.

695:

696: Section~4 is devoted to second order

697: refinements of the AEP and the generalized

698: AEP. It is shown, for example, that

699: under certain conditions

700: $-\log P^n(X_1^n)$ and $-\log Q^n(B(X_1^n,D))$

701: are asymptotically Gaussian.

702: These refinements are used in Section~5

703: to provide corresponding second order

704: results (such as central limit theorems)

705: for the applications considered in Section~3.

706: We prove second order asymptotic results

707: for waiting times

708: and longest match-lengths.

709: Precise redundancy rates are

710: given for Shannon's random code,

711: and converse coding theorems show

712: that the random code achieves the

713: optimal pointwise redundancy,

714: up to terms of order $(\log n)$.

715: For $\iid$ sources the pointwise

716: redundancy is typically of order

717: $\sigma\sqrt{n}$, where $\sigma$

718: is the minimal coding variance of

719: the source. When $\sigma=0$ these

720: fluctuations disappear, and the

721: best pointwise redundancy is of

722: order $(\log n)$. The question of

723: exactly when $\sigma$ can be equal

724: to zero is briefly discussed.

725:

726: Finally, Sections~6 and 7

727: contain generalizations of some

728: of the above results to

729: random fields. All the results

730: stated there

731: are new, although

732: most of them are straightforward

733: generalizations of corresponding

734: one-dimensional results.

735:

736: % \newpage

737: \section{The Generalized AEP}

738: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

739:

740: \subsection{Notation and Definitions}

741: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

742: We begin by introducing some basic definitions

743: and notation that will remain in effect for

744: the rest of the paper. We will consider a

745: stationary ergodic

746: process

747: $\Xp=\{X_n\;;\;n\in\IN\}$

748: taking values in a general alphabet

749: $A$.\footnote{To avoid

750: uninteresting technicalities,

751: we will assume throughout that

752: $A$ is a complete, separable metric space,

753: equipped with its associated Borel

754: $\sigma$-field ${\cal A}$.

755: Similarly we take $(\Ahat,\hat{\cal A})$

756: to be the Borel measurable space

757: corresponding to a complete,

758: separable metric space $\Ahat$.} When

759: talking about data compression, $\Xp$

760: will be our source and $A$ will be

761: called the source alphabet. We write

762: $X_i^j$ for the vector of random

763: variables $X_i^j=(X_i,X_{i+1},\ldots,X_j)$,

764: and similarly

765: $x_i^j=(x_i,x_{i+1},\ldots,x_j)\in

766: A^{j-i+1}$

767: for a realization of these random variables,

768: $-\infty\leq i\leq j\leq \infty$.

769: We let $P_n$ denote the marginal

770: distribution of $X_1^n$ on $A^n$

771: ($n\geq 1$), and write $\BBP$

772: for the distribution of the whole

773: process.

774: Similarly, we take

775: $\Yp=\{Y_n\;;\;n\in\IN\}$

776: to be a stationary ergodic

777: process taking values in the

778: (possibly different) alphabet

779: $\Ahat$.${}^2$

780: In the context of

781: data compression,

782: $\Ahat$ is the reproduction

783: alphabet and $\Yp$

784: has the ``codebook'' distribution.

785: We write $Q_n$ for the marginal

786: distribution of $Y_1^n$ on $\Ahatn$,

787: $n\geq 1$, and $\BBQ$ for the

788: distribution of the whole process $\Yp$.

789: We will always assume that the process

790: $\Yp$ is independent of $\Xp$.

791:

792: Let $\rho:A\times\Ahat\to[0,\infty)$

793: be an arbitrary nonnegative (measurable)

794: function, and define a sequence of

795: single-letter distortion measures

796: $\rho_n:A^n\times\Ahatn\to[0,\infty)$ by

797: \ben

798: \rho_n(x_1^n,y_1^n)\bydef\frac{1}{n}\sum_{i=1}^n\rho(x_i,y_i)

799: \;\;\;\;x_1^n\in A^n,\;y_1^n\in\Ahatn.

800: \een

801: Given $D\geq 0$ and $x_1^n\in A^n$,

802: we write

803: $B(x_1^n,D)$ for

804: the distortion-ball of radius $D$ around $x_1^n$:

805: $$B(x_1^n,D)=\{y_1^n\in\Ahatn\;:\;\rho_n(x_1^n,y_1^n)\leq D\}.$$

806:

807: Throughout the paper, $\log$ denotes

808: the natural logarithm and $\log_2$

809: the logarithm to base 2. Unless otherwise

810: mentioned, all familiar information-theoretic

811: quantities (such as the entropy,

812: mutual information, and so on)

813: are assumed to be defined in terms

814: of natural logarithms (and are

815: therefore given in nats).

816:

817: \subsection{Generalized AEP When $\Yp$ is I.I.D.}

818: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

819:

820: In the case when $A$ is finite,

821: the classical AEP, also known as the

822: Shannon-McMillan-Breiman theorem

823: (see \cite[Chapter~15]{cover:book}

824: or the original papers

825: \cite{shannon:48}\cite{mcmillan}\cite{breiman:57}\cite{breiman:60}),

826: states that as $n\to\infty$

827: \be

828: -\frac{1}{n}\log P_n(X_1^n) \to H(\BBP)

829: \;\;\;\;\mbox{w.p.1}

830: \label{eq:discreteAEP}

831: \ee

832: where

833: $$H(\BBP)\bydef\lim_{n\to\infty}\frac{1}{n}H(X_1^n)$$

834: is the entropy rate of the process $\Xp$

835: (in nats, since we are taking logarithms to

836: base $e$).

837: As we saw in the

838: Introduction,

839: in lossy data

840: compression the role of the AEP is taken

841: up by the result of Shannon's ``Lemma 1''

842: and, more generally, by statements of the form

843: $$-\frac{1}{n}\log Q_n(B(X_1^n,D))\to R(\BBP,\BBQ,D)

844: \;\;\;\;\mbox{w.p.1}$$

845: for some

846: non-random

847: ``rate-function'' $R(\BBP,\BBQ,D)$.

848:

849: First we consider the simplest case where $\Yp$

850: is assumed to be an $\iid$ process. We write

851: $Q=Q_1$ for its first order marginal,

852: so that $Q_n=Q^n$, for $n\geq 1.$ Similarly

853: we write $P=P_1$ for the first order marginal

854: of $\Xp$.

855: Let

856: \be

857: \Dmin & \bydef &  E_P[\essinf_{Y\sim Q} \;\rho(X,Y)]

858: 	\label{eq:Dmin} \\

859: \Dav & \bydef &  E_{P\times Q}[\rho(X,Y)].

860: 	\label{eq:Dav}

861: \ee

862: [Recall that the essential infimum of

863: a function $g(Y)$ of the random variable

864: $Y$ with distribution $Q$ is defined as

865: $\essinf_{Y\sim Q} g(Y) =

866: \sup\{t\in\RL\;:\;Q\{g(Y)>t\}=1\}.$]

867:

868: Clearly $0\leq\Dmin\leq\Dav$.

869: To avoid the trivial case when $\rho(x,y)$

870: is essentially constant for

871: ($\BBP$-almost) all

872: $x\in A$, we assume that with positive

873: $\BBP$-probability $\rho(x,y)$ is not

874: essentially constant in $y$, that is:

875: \be

876: \Dmin < \Dav.

877: \label{eq:nonconst}

878: \ee

879: Note also that

880: for $D$ greater than $\Dav$,

881: the probability

882: $Q^n(B(X_1^n,D))\to 1$

883: as $n\to\infty$

884: (this is easy to see

885: by the ergodic theorem),

886: so we restrict our attention

887: to distortion levels $D<\Dav$.

888:

889: \medskip

890:

891: {\em Theorem~1. Generalized AEP when $\Yp$ is $\iid$:}

892: Let $\Xp$ be a stationary ergodic process

893: and $\Yp$ be $\iid$ with marginal distribution

894: $Q$ on $\Ahat$.

895: Assume that $\Dav=E_{P\times Q}[\rho(X,Y)]$ is

896: finite. Then for any $D\in(\Dmin,\Dav)$

897: \ben

898: -\frac{1}{n}\log Q^n(B(X_1^n,D)) \to R_1(P,Q,D)

899:         \;\;\;\;\mbox{w.p.1}.

900: \een

901: The rate-function $R_1(P,Q,D)$

902: is defined as

903: \ben

904: R_1(P,Q,D) = \inf_W H(W\|P\times Q)

905: \een

906: where $H(W\|V)$ denotes the relative

907: entropy between two distributions

908: $W$ and $V$,

909: $$H(W\|V) \bydef \left\{ \begin{array}{ll}

910:    E_W[\log\frac{dW}{dV}]

911: 		   & \mbox{if the density $\frac{dW}{dV}$ exists}, \\

912:    \infty 	   & \mbox{otherwise}

913:  \end{array} \right.

914: $$

915: and the infimum is taken over all

916: joint distributions $W$ on

917: $A\times\Ahat$ such that

918: the first marginal of $W$ is $P$

919: and $E_W[\rho(X,Y)]\leq D.$

920:

921: \medskip

922:

923: {\em Example~1: The rate-function $R_1(P,Q,D)$

924: when $Q$ is Gaussian:}

925: Although in general the rate-function

926: $R_1(P,Q,D)$ cannot be evaluated explicitly,

927: here we show that it is possible to obtain

928: an exact expression for $R_1(P,Q,D)$ in the

929: special case when $\rho(x,y)=(x-y)^2$,

930: $\Xp$ is a real-valued process,

931: and $Q$ is a Gaussian measure

932: on $\RL.$ Specifically, assume

933: that $\Xp$ is a zero-mean,

934: stationary ergodic process

935: with finite variance

936: $\sigma^2=\VAR(X_1)<\infty$,

937: and take $Q$ to be

938: a zero-mean Gaussian measure

939: with variance $\tau^2$, i.e.,

940: $Q\sim N(0,\tau^2)$.

941: Under these assumptions, it is easy to see

942: that $\Dmin=0$ and $\Dav=\sigma^2+\tau^2$.

943: Moreover, with the help of Proposition~2 below,

944: $R_1(P,Q,D)$ can be explicitly

945: evaluated as:

946: $$R_1(P,Q,D) = \left\{ \begin{array}{ll}

947: 	\infty\,,	& \;\;\;\;D=0\\

948: 	\frac{1}{2}\log\left(\frac{v}{D}\right)

949: 	  -\frac{(v-D)(v-\sigma^2)}

950: 		{2v\tau^2}\,,

951: 	 		& \;\;\;\;0<D<\sigma^2+\tau^2\\

952: 	0\,, 		& \;\;\;\;D\geq \sigma^2+\tau^2

953:  \end{array} \right.

954: $$

955: where

956: $$v\bydef\frac{1}{2}\left[\tau^2+\sqrt{\tau^4+4D\sigma^2}\right].$$

957: We will come back to this example when considering

958: mismatched rate-distortion codebooks in Section~3.2.

959:

960: \medskip

961:

962: {\em Remark 1:}

963: In more familiar information-theoretic

964: terms, the rate-function $R_1(P,Q,D)$

965: can equivalently be defined as

966: (cf. \cite{yang-kieffer:1})

967: \ben

968: R_1(P,Q,D) = \inf_{(X,Y)}\,[I(X;Y)

969: 	+H(Q_Y\|Q)]

970: \een

971: where $I(X;Y)$ denotes the mutual

972: information (in nats) between the

973: random variables $X$ and $Y$,

974: and the infimum is over

975: all jointly distributed random variables

976: $(X,Y)$ with values in $A\times\Ahat$

977: such that $X$ has distribution $P$,

978: $E[\rho(X,Y)]\leq D$, and $Q_Y$ denotes

979: the distribution of $Y$.

980:

981: \medskip

982:

983: {\em Remark 2:}

984: The assumption that $\Yp$ is

985: $\iid$ is clearly restrictive

986: and it will be relaxed below.

987: On the other hand the assumptions

988: on the distortion measure

989: $\rho$ seem to be minimal;

990: we simply assume that $\rho$

991: has finite expectation (in

992: the more general results below

993: $\rho$ is assumed to be bounded).

994: In this form, the result of

995: Theorem~1 is new.

996:

997: \medskip

998:

999: {\em Discussion of Proof:}

1000: Let's fix a realization $x_1^\infty$

1001: of $\Xp$. The probability

1002: $Q^n(B(X_1^n,D))$ can be written as

1003: $$\PR\left\{Y_1^n\in B(X_1^n,D) \,|\,X_1^n=x_1^n\right\} \;=\;

1004: \PR\left\{\frac{1}{n}\sum_{i=1}^n\rho(x_i,Y_i)\leq D \right\}.$$

1005: Since the distortion level $D$ is taken smaller than the

1006: average value $\Dav$, this is a large deviations probability

1007: for the partial sums $(1/n)\sum_{i=1}^n Z_i$ of

1008: the independent (but not identically distributed)

1009: random variables $Z_i=\rho(x_i,Y_i)$. The proof

1010: is essentially an application of the

1011: G\"artner-Ellis theorem of large deviations

1012: to the random variables $\{Z_i\}$.

1013:

1014: \medskip

1015:

1016: {\em Proof Outline:}

1017: Choose and fix a realization $x_1^\infty$ of

1018: $\Xp$ and define the random variables

1019: $Z_i=\rho(x_i,Y_i)$. Let

1020: $$S_n=\frac{1}{n}\sum_{i=1}^nZ_i$$

1021: and define the log-moment generating

1022: functions of the normalized partial

1023: sums $S_n$ by

1024: $$\LA_n(\la) \bydef \log

1025: 	E_{Q^n}\left(e^{\lambda S_n}\right),

1026: 	\;\;\;\;\lambda\leq 0.$$

1027: Then for any $\la\leq 0$, by the ergodic theorem we have

1028: that

1029: \be

1030: \frac{1}{n}\LA_n(n\la)

1031: 	= \frac{1}{n}\sum_{i=1}^n\log

1032: 	E_Q\left(e^{\lambda\rho(x_i,Y_i)}\right)

1033: 	\to

1034: 	\LA(\la)\bydef E_P\left[\log

1035: 	E_Q\left(e^{\lambda\rho(X,Y)}\right)

1036: 	\right]

1037: \label{eq:GEcheck}

1038: \ee

1039: for $\BBP$-almost any realization $x_1^\infty$.

1040: Now we would like to apply the

1041: G\"artner-Ellis theorem, but first

1042: we need to check some simple properties

1043: of the function $\LA(\la)$.

1044: Note that

1045: $\LA(\la)\leq 0$

1046: and

1047: also (by Jensen's inequality)

1048: $\LA(\la)\geq \la\Dav>-\infty$,

1049: for all

1050: $\la\leq 0$.

1051: Moreover, $\LA(\la)$ is twice

1052: differentiable

1053: in $\la$ with

1054: $$\LA'(\la) = E_{P\times Q}\left(\rho(X,Y)

1055:     \frac

1056: 	{e^{\lambda\rho(X,Y)}}

1057: 	{E_Q[e^{\lambda\rho(X,Y')}]}

1058: 			   \right)

1059: $$

1060: and

1061: $$\LA''(\la) =

1062: E_P\left[

1063:     E_Q

1064:     \left\{\rho^2(X,Y)

1065:     \frac

1066:         {e^{\lambda\rho(X,Y)}}

1067:         {E_Q[e^{\lambda\rho(X,Y')}]}

1068:     \right\}

1069: 	\;-\;

1070:     \left(E_Q

1071:         \left\{

1072: 	\rho(X,Y)

1073:     	\frac

1074:         {e^{\lambda\rho(X,Y)}}

1075:         {E_Q[e^{\lambda\rho(X,Y')}]}

1076:         \right\}

1077:     \right)^2

1078: \right]$$

1079: (this differentiability is easily verified by

1080: an application of the dominated convergence

1081: theorem). By the Cauchy-Schwarz

1082: inequality $\LA''(\la)\geq 0$ for all $\la<0$,

1083: and in fact $\LA''(\la)$ is strictly positive

1084: due to assumption (\ref{eq:nonconst}).

1085: Also it is not hard to verify that

1086: $$\lim_{\lambda\uparrow 0}\LA'(\la)=\Dav$$

1087: and

1088: \be

1089: \lim_{\lambda\downarrow -\infty}\LA'(\la)=\Dmin.

1090: \label{eq:la-lim}

1091: \ee

1092: Since $D\in(\Dmin,\Dav)$,

1093: there exists a unique $\la^*<0$ with

1094: $\LA'(\la^*)=D$, and therefore

1095: the Fenchel-Legendre

1096: transform of $\LA(\la)$ evaluated at $D$ is

1097: $$\LA^*(D)\bydef\sup_{\la\leq 0}[\la D-\LA(\la)]

1098: 	\;=\;\la^*D-\LA(\la^*).$$

1099: Now we can apply the

1100: G\"artner-Ellis theorem

1101: \cite[Theorem~2.3.6]{dembo-zeitouni:book}

1102: to deduce from

1103: (\ref{eq:GEcheck})

1104: that with $\BBP$-probability one

1105: $$-\frac{1}{n}\log Q^n(B(X_1^n,D)) \to \LA^*(D).$$

1106: The proof is complete upon noticing

1107: that $\LA^*(D)$ is nothing but $R_1(P,Q,D)$.

1108: This is stated and proved in

1109: the following proposition.

1110: \qed

1111:

1112: \medskip

1113:

1114: {\em Proposition 2. Characterization of the Rate Function:}

1115: In the notation of the proof of Theorem~1,

1116: $\LA^*(D)=R_1(P,Q,D)$, for $D\in(\Dmin,\Dav)$.

1117:

1118: \medskip

1119:

1120: {\em Proof Outline:} Under additional

1121: assumptions on the distortion measure

1122: $\rho$ this has appeared in various papers

1123: (see, e.g., \cite{dembo-kontoyiannis}\cite{yang-zhang:99}).

1124: For completeness, we offer a proof sketch here.

1125:

1126: In the notation of the above proof, consider

1127: the measure $W$ on $A\times\Ahat$ defined by

1128: $$\frac{dW(x,y)}{dP\times Q} =

1129: \frac{e^{\las^*\rho(x,y)}}

1130: {E_Q[e^{

1131: \las^*

1132: \rho(x,Y)}]}.$$

1133: Obviously the first marginal of $W$

1134: is $P$ and it is easy to check that

1135: $E_W[\rho(X,Y)]=\LA'(\la^*)=D$.

1136: Therefore, by the definitions of

1137: $R_1(P,Q,D)$ and $W$, and by

1138: the choice of $\la^*$:

1139: \be

1140: R_1(P,Q,D)\leq H(W\|P\times Q)

1141: 	=\la^*D-\LA(\la^*)

1142: 	=\LA^*(D).

1143: \label{eq:propUBD}

1144: \ee

1145: To prove the corresponding lower

1146: bound we first claim that

1147: for any measurable function

1148: $\phi:\Ahat\to (-\infty,0]$,

1149: and any probability measure

1150: $Q'$ on $\Ahat$,

1151: \be

1152: H(Q'\|Q)\geq E_{Q'}(\phi(Y)) -\log E_{Q}(e^{\phi(Y)}).

1153: \label{eq:generalSV}

1154: \ee

1155: Let $Q_\phi$ denote the probability measure on $\Ahat$ such that

1156: $dQ_\phi/dQ=e^\phi/E_{Q}(e^{\phi(Y)})$. Clearly, it

1157: suffices to prove (\ref{eq:generalSV}) in case $dQ'/dQ$ exists,

1158: in which case the difference between the left and right hand sides is

1159: $$

1160:  E_{Q'}\left\{\log\frac{dQ'}{dQ}\right\} -

1161:  E_{Q'}\left\{\log\left(\frac{e^{\phi}}

1162: 	{

1163: 	E_{Q}(e^{\phi})

1164: 	}

1165: 	\right)\right\}

1166: \;=\;

1167: %

1168: %   E_{Q_\phi}\left\{\frac{dQ'}{dQ_\phi}\log\left(\frac{dQ'}{dQ_\phi}

1169: % 	\right)\right\}

1170: H(Q'\|

1171: Q_\phi)

1172: \;\geq\;0.

1173: $$

1174: % where the last inequality follows from

1175: % Jensen's inequality for the convex

1176: % function $t \log t$.

1177: Given an arbitrary

1178: candidate $W$ as in the definition of

1179: $R_1(P,Q,D)$ and any $x\in A$, we take

1180: $Q'=W(\cdot|x)$ and $\phi(y)=\la^*\rho(x,y)$

1181: in (\ref{eq:generalSV}) to get that

1182: $$H(W(\cdot|x)\|Q(\cdot))\geq \la^* E_{W(Y|x)}[\rho(x,Y)]

1183: 	- \log E_{Q}(e^{\lambda^*\rho(x,Y)}).$$

1184: Substituting $X$ for $x$,

1185: taking expectations of both sides

1186: with respect to $P$,

1187: and recalling that $\la^*<0$

1188: and $E_W[\rho(X,Y)]\leq D$, we get:

1189: $$H(W\|P\times Q)\geq \la^*D - \LA(\la^*) = \LA^*(D).$$

1190: Since $W$ was arbitrary it follows that

1191: $R_1(P,Q,D)\geq \LA^*(D)$, and together

1192: with (\ref{eq:propUBD}) this completes

1193: the proof.

1194: \qed

1195:

1196: \subsection{Generalized AEP When $\Yp$ is Not I.I.D.}

1197: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1198:

1199: Next we present two versions

1200: of the generalized AEP that hold

1201: when $\Yp$ is a stationary

1202: dependent process,

1203: under some additional conditions.

1204:

1205: Throughout this section we will

1206: assume that the distortion

1207: measure is essentially bounded

1208: \be

1209: \Dmax \bydef \esssup_{(X_1,Y_1)\sim P_1\times Q_1}

1210: 		\rho(X_1,Y_1)<\infty.

1211: \label{eq:Dmax}

1212: \ee

1213: We let $\Dav$ be defined as earlier,

1214: $\Dav = E_{P_1\times Q_1}[\rho(X_1,Y_1)]$,

1215: and for $n\geq 1$ we let

1216: \ben

1217: \Dminn

1218: \bydef

1219: 	E_{P_n}

1220: 	\left[

1221: 		\essinf_{Y_1^n\sim Q_n} \;\rho_n(X_1^n,Y_1^n)

1222: 	\right].

1223: \een

1224: It is easy to see that $n \Dminn$ is a finite, superadditive sequence,

1225: and therefore we can also define

1226: $$\Dmin = \lim_{n\to\infty} \Dminn = \sup_{n\geq 1} \Dminn.$$

1227: As before, we will assume that

1228: the distortion measure $\rho$ is

1229: not essentially constant,

1230: that is, $\Dmin<\Dav.$

1231:

1232: \medskip

1233:

1234: We first state a version

1235: of the generalized AEP that

1236: was recently proved by Chi

1237: \cite{chi-it:01}, for

1238: processes $\Yp$ satisfying

1239: a rather strong

1240: mixing condition: We say that

1241: the stationary process $\Yp$

1242: is {\em $\psipm$-mixing}, if

1243: for all $d$ large enough there is a

1244: finite constant $c_d$ such that

1245: $$

1246: c_d^{-1} \BBQ(A)\BBQ(B) < \BBQ(A\cap B) < c_d \BBQ(A)\BBQ(B)

1247: $$

1248: for all events $A\in\sigma(Y_{-\infty}^0)$

1249: and $B\in\sigma(Y_d^{\infty})$,

1250: where $\sigma(Y_i^j)$ denotes

1251: the $\sigma$-field generated by $Y_i^j$.

1252: Recall the usual definition

1253: according to which $\Yp$ is called

1254: {\em $\psi$-mixing} if in fact

1255: the constants $c_d \to 1$ as

1256: $d \to \infty$; see \cite{bradley}

1257: for more details.

1258: Clearly $\psipm$-mixing is weaker

1259: than {\em $\psi$-mixing}.

1260:

1261: \medskip

1262:

1263: %Y -------------- rephrased Theorems 3 and 4 and the discussion here

1264:

1265: {\em Theorem~3. Generalized AEP when $\Yp$ is $\psipm$-mixing

1266: \cite{chi-it:01}:}

1267: Let $\Xp$ and $\Yp$ be stationary ergodic

1268: processes. Assume that $\Yp$ is $\psipm$-mixing,

1269: and that the distortion measure $\rho$ is bounded.

1270: Then for all $D\in(\Dmin,\Dav)$

1271: \be

1272: -\frac{1}{n}\log Q_n(B(X_1^n,D)) \to R(\BBP,\BBQ,D)

1273:         \;\;\;\;\mbox{w.p.1}

1274: \label{eq:thm4}

1275: \ee

1276: where $R(\BBP,\BBQ,D)$ is the rate-function defined by

1277: \be

1278: \label{eq:thm4b}

1279: R(\BBP,\BBQ,D) = \lim_{n\to\infty} R_n(P_n,Q_n,D)

1280: \ee

1281: where, for $n \geq 1$,

1282: \ben

1283: R_n(P_n,Q_n,D) \bydef \inf_{V_n} n^{-1} H(V_n\|P_n\times Q_n)

1284: \een

1285: and the infimum is taken over all joint

1286: distributions $V_n$ on $A^n\times\Ahatn$

1287: such that the $A^n$-marginal of $V_n$

1288: is $P_n$ and $E_{V_n}[\rho_n(X_1^n,Y_1^n)]\leq D$.

1289:

1290: \medskip

1291:

1292: As we discussed in the previous section,

1293: the proof of most versions of the generalized

1294: AEP consists of two steps: First a

1295: ``conditional large deviations'' result

1296: is proved for the random variables

1297: $\{\rho_n(x_1^n,Y_1^n)\;;\;n\geq 1\}$,

1298: where $x_1^\infty$ is a fixed realization of the

1299: process $\Xp$. Second, the rate-function

1300: $R(\BBP,\BBQ,D)$ is characterized as the

1301: limit of a sequence of minimizations in

1302: terms of relative entropy.

1303:

1304: In a subsequent paper, Chi \cite{chi-AP:01}

1305: showed that the first of these steps

1306: (the large deviations part) remains

1307: valid under a condition

1308: weaker than $\psipm$-mixing,

1309: condition~$(S)$ of \cite{bryc-dembo:96}.

1310: In the following theorem we give a general

1311: version of the second step; we prove

1312: that the generalized AEP (\ref{eq:thm4})

1313: and the formula (\ref{eq:thm4b}) for the

1314: rate-function remain valid as long as

1315: the random variables

1316: $\{\rho_n(x_1^n,Y_1^n)\;;\;n\geq 1\}$

1317: satisfy a large deviations principle (LDP)

1318: with some {\it deterministic}, convex

1319: rate-function (see \cite{dembo-zeitouni:book}

1320: for the precise meaning of this statement).

1321:

1322: \medskip

1323:

1324: {\em Theorem~4.}

1325: Let $\Xp$ and $\Yp$ be stationary processes.

1326: Assume that $\rho$ is bounded, and that with

1327: $\BBP$-probability one, conditional on

1328: $X_1^\infty=x_1^\infty$, the random variables

1329: $\{\rho_n(x_1^n,Y_1^n)\;;\;n\geq 1\}$ satisfy a

1330: large deviations principle with some

1331: deterministic, convex rate-function.

1332: Then, both (\ref{eq:thm4}) and (\ref{eq:thm4b})

1333: hold for any $D\in(\Dmin,\Dav)$, except

1334: possibly at the point $D=\Dinf$, where

1335: \be

1336: \Dinf \bydef \inf \{ D \geq 0 :

1337: \sup_{n \geq 1} R_n(P_n,Q_n,D) < \infty \}.

1338: \label{eq:dinf}

1339: \ee

1340:

1341: \medskip

1342:

1343: Since Theorem~4 has an exact

1344: analog in the case of random fields,

1345: we postpone its proof until the

1346: proof of the corresponding result

1347: (Theorem~27) in Section~6.

1348:

1349: \medskip

1350:

1351: {\em Remark 3:}

1352: Suppose that the joint process $(\Xp,\Yp)$ is

1353: stationary, and that it satisfies a

1354: ``process-level large deviations principle''

1355: (see Remark~6 in Section~6 for a somewhat

1356: more detailed statement) on the space of

1357: % For each $n\geq 1$, given a $x_1^n\in A^n$

1358: % let $x^{(n)}$ denote the periodic

1359: % extension of the string $x_1^n$ to an

1360: % infinite realization in $A^\NN$.

1361: % Similarly define $X^{(n)}$ and $Y^{(n)}$

1362: % as the periodic extensions of $X_1^n$

1363: % and $Y_1^n$, respectively.

1364: % The process-level empirical

1365: % measure $\calLn$ induced

1366: % (by the stationary processes $\Xp$ and $\Yp$)

1367: stationary probability measures

1368: on $(A^\infty\times\hat{A}^\infty)$

1369: % is then defined as

1370: % $$\calLn\bydef\frac{1}{n}\sum_{i=1}^n

1371: 	% \delta_{(X^{(n)}_{i+\cdot},Y^{(n)}_{i+\cdot})}$$

1372: % where $\delta_{s,s'}$ denotes the measure

1373: % assigning unit mass to the joint sequence

1374: % $(s,s')\in A^\NN\times\hat{A}^\NN$

1375: % and $X^{(n)}_{i+\cdot}$ (or $Y^{(n)}_{i+\cdot}$)

1376: % denotes the infinite sequence $X^{(n)}$

1377: % (respectively, $Y^{(n)}$) shifted by

1378: % $i$ positions to the left.

1379: % Equipp the space of stationary

1380: % probability measures on

1381: % $(A^\NN\times\hat{A}^\NN)$ with the

1382: equipped with the

1383: topology of weak convergence.

1384: Assume, moreover,

1385: that this LDP holds with a convex,

1386: good rate-function $I(\cdot)$.

1387: [See \cite{dawson-gartner:87}\cite[Sec.~5.3,~5.4]{deuschel-stroock:book}%

1388: \cite[Sec.~6.5.3]{dembo-zeitouni:book}\cite{bryc-dembo:96}

1389: for a general discussion as well as specific examples

1390: of processes for which the above conditions

1391: hold. Apart from the $\iid$ case, these

1392: examples also include all ergodic finite-state

1393: Markov chains, among many others.]

1394:

1395: It is easy to check that, when

1396: $\rho$ is bounded and continuous on

1397: $A \times \hat{A}$, then with

1398: $\BBP$-probability one, conditional on

1399: $x_1^\infty$, the random variables

1400: $\{\rho_n(x_1^n,Y_1^n)\}$

1401: % satisfy a

1402: % large deviations principle with some

1403: % deterministic, convex rate-function.

1404: % for $\BBP$-a.e. $\Xp$,

1405: % conditional upon $\Xp$ the sequence

1406: % $\{\rho_n(X_1^n,Y_1^n)\}$

1407: satisfy the LDP upper bound with respect

1408: to the deterministic, convex rate-function

1409: $J(D)=\inf  I(\nu)$, where the infimum

1410: is over all stationary probability measures

1411: $\nu$ on $A^\infty \times \hat{A}^\infty$ such that

1412: the $A^\infty$-marginal of $\nu$ is $\BBP$ and

1413: $E_\nu[\rho(X_1,Y_1)] = D$.

1414: Indeed, Comets \cite{comets:89} provides

1415: such an argument when $\Xp$ and $\Yp$ are both $\iid$

1416: Moreover, he shows that in that case

1417: the corresponding LDP lower bound also holds,

1418: and hence Theorem 4 applies.

1419: Unfortunately, the conditional

1420: LDP lower bound has to be verified on a

1421: case-by-case basis.

1422:

1423: \medskip

1424:

1425: {\em Remark 4:}

1426: Although quite strong,

1427: the $\psipm$-mixing condition

1428: of Theorem~3, and the $(S)$-mixing

1429: condition of \cite{chi-AP:01},

1430: probably cannot be significantly

1431: relaxed: For example, in the special case

1432: when $\Xp$ is a constant process

1433: taking on just a single value,

1434: if Theorem~3 were to hold (for any bounded

1435: distortion measure) with a strictly

1436: monotone rate-function, then necessarily

1437: the empirical measures of $Y_1^n$

1438: would satisfy the LDP in the space

1439: ${\cal P}_a(\Ahat)$

1440: (see \cite{bryc-dembo:96} for details).

1441: But

1442: \cite[Example~1]{bryc-dembo:96} illustrates

1443: that this LDP

1444: may fail even when $\Yp$ is a stationary

1445: ergodic Markov chain with

1446: discrete alphabet $\Ahat$. In particular,

1447: the example in \cite{bryc-dembo:96}

1448: has an exponential $\phi$-mixing rate.

1449:

1450: %Y ------------------ end of changes

1451:

1452: \subsection{Generalized AEP for Optimal Lossy Compression}

1453: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1454:

1455: Here we present a version of the generalized

1456: AEP that is useful in proving direct coding

1457: theorems. Let $\Xp$ be a stationary

1458: ergodic process. For the distortion measure

1459: $\rho$ we adopt two simple regularity conditions.

1460: We assume the existence of a {\em reference

1461: letter}, i.e., an $\hat{a}\in\Ahat$ such that

1462: $$E_{P_1}[\rho(X_1,\hat{a})]<\infty.$$

1463: Also, following \cite{kieffer:91}, we

1464: require that for any distortion level

1465: $D>0$ there is a scalar quantizer

1466: for $\Xp$ with finite rate.

1467:

1468: \smallskip

1469:

1470: {\em Quantization Condition:}

1471: For each $D>0$

1472: there is a ``quantizer'' $q:A\to B$ for

1473: some countable (finite or infinite)

1474: subset $B\subset\Ahat$,

1475: such that:

1476: % \begin{enumerate}

1477: % \item[i.]

1478:

1479: i. $\;\;\rho(x,q(x))\leq D$ for all $x\in A$, and

1480:

1481: ii. $\;$

1482: % \item[ii.]

1483: the entropy $H(q(X_1))<\infty$.

1484: % \end{enumerate}

1485:

1486: \smallskip

1487:

1488: \noindent

1489: The following was implicitly proved

1490: in \cite{kieffer:91};

1491: % in the process

1492: % of proving a pointwise converse to the

1493: % source coding theorem;

1494: see also \cite{konto-zhang:00}

1495: for details.

1496:

1497: \smallskip

1498:

1499: {\em Theorem~5. Generalized AEP for Optimal Lossy Compression

1500: \cite{kieffer:91}:}

1501: Let $\Xp$ be a stationary ergodic process.

1502: Assume that the distortion measure $\rho$

1503: satisfies the quantization condition,

1504: that a reference letter exists, and

1505: that for each $n\geq 1$ the infimum of

1506: $$E_{P_n}[-\log Q_n(B(X_1^n,D))]$$

1507: over all probability measures $Q_n$

1508: on $\Ahatn$ is achieved by some

1509: $\widetilde{Q}_n$.

1510: Then for any $D>0$

1511: \be

1512: -\frac{1}{n}\log \widetilde{Q}_n(B(X_1^n,D)) \to R(D)

1513:         \;\;\;\;\mbox{w.p.1}

1514: \label{eq:chi}

1515: \ee

1516: where $R(D)$ is the rate-distortion

1517: function of the process $\Xp$.

1518:

1519: \medskip

1520:

1521: {\em Historical Remarks:}

1522: The relevance of the quantities

1523: $-\log Q_n(B(X_1^n,D))$ to

1524: information theory was first

1525: suggested implicitly

1526: by Kieffer \cite{kieffer:91}

1527: and more explicitly

1528: by {\L}uczak and Szpankowski

1529: \cite{luczak-szpankowski}.

1530: Since then, many papers have

1531: appeared proving the generalized AEP

1532: under different conditions;

1533: we mention here a subset

1534: of those proving some of

1535: the more general results.

1536: The case of finite alphabet

1537: processes was considered by Yang and Kieffer

1538: \cite{yang-kieffer:1}.

1539: The generalized AEP for

1540: processes with general

1541: alphabets and $\Yp$ $\iid$

1542: was proved by Dembo and

1543: Kontoyiannis

1544: \cite{dembo-kontoyiannis}

1545: and by Yang and Zhang

1546: \cite{yang-zhang:99}.

1547: Finally, the case when

1548: $\Yp$ is not $\iid$ was

1549: (Theorem~3) treated by

1550: Chi \cite{chi-it:01}\cite{chi-AP:01}.

1551: The observations of Theorem~4

1552: about the rate-function

1553: $R(\BBP,\BBQ,D)$ are new.

1554: Theorem~5 essentially

1555: comes from Kieffer's work

1556: \cite{kieffer:91};

1557: see also \cite{konto-zhang:00}.

1558:

1559: We should also mention

1560: that, in

1561: a somewhat different context,

1562: the intimate relationship

1563: between the AEP and large

1564: deviations is discussed in

1565: some detail by Orey in

1566: \cite{orey:85b}.

1567:

1568: \subsection{Densities vs. Balls}

1569: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1570: Let us recall the classical generalization

1571: of the AEP, due to Barron \cite{barron:1}

1572: and Orey \cite{orey:85}, to processes with

1573: values in general alphabets. Suppose $\Xp$

1574: as above is a general stationary ergodic process

1575: with marginals $\{P_n\}$ that are

1576: absolutely continuous with respect to

1577: the sequence of measures $\BBM=\{M_n\}$.

1578:

1579: \medskip

1580:

1581: {\em Theorem~6. AEP for Processes with Densities

1582: \cite{barron:1}\cite{orey:85}:}

1583: Let $\Xp$ be a stationary ergodic process whose

1584: marginals $P_n$ have densities $f_n=dP_n/dM_n$

1585: with respect to the $\sigma$-finite measures $M_n$,

1586: $n\geq 1$. Assume that the sequence $\BBM$

1587: of dominating measures is Markov of finite order,

1588: with a stationary transition measure, and that the

1589: relative entropies

1590: \ben

1591: H_n\bydef E_{P_n}\left[\log\frac{f_n(X_1^n)}

1592: 				  {f_{n-1}(X_1^{n-1})}

1593: 		\right],\;\;\;\;n\geq 2,

1594: \een

1595: have $H_n>-\infty$ eventually. Then

1596: \be

1597: -\frac{1}{n}\log \frac{dP_n}{dM_n}(X_1^n)\to -H(\BBP\|\BBM)

1598:         \;\;\;\;\mbox{w.p.1}

1599: \label{eq:BarronAEP}

1600: \ee

1601: where $H(\BBP\|\BBM)$ is the relative entropy

1602: rate defined as $H(\BBP\|\BBM)=\lim_n H_n=\inf_n H_n$.

1603:

1604: \medskip

1605:

1606: The AEP for processes with densities is

1607: also known to hold when the reference measures

1608: $M_n$ do not form a Markov

1609: sequence, under some additional

1610: mixing conditions (see \cite{orey:85} where

1611: $M_n$ are taken to be non-Markov measures

1612: satisfying an additional mixing condition,

1613: and the more recent extension

1614: in \cite{chazottesetal:98}

1615: where the $M_n$ are taken to be

1616: discrete Gibbs measures).

1617: %%% (with H\"{o}lder continuous potentials)

1618: Moreover, Kieffer

1619: \cite{kieffer:73}\cite{kieffer:73b}

1620: has given counterexamples

1621: illustrating that without some mixing

1622: conditions on $\{M_n\}$ the AEP

1623: (\ref{eq:BarronAEP}) fails to hold.

1624:

1625: There is a tempting analogy between

1626: the generalized AEP (\ref{eq:thm4})

1627: and the AEP for processes with

1628: densities (\ref{eq:BarronAEP}).

1629: The formal similarity between

1630: the two suggests that, if we identify

1631: the measures $Q_n$ with the reference

1632: measures $M_n$, corresponding results

1633: should hold in the two cases.

1634: Indeed, this does in general appear

1635: to be the case, as is illustrated

1636: by the various generalized AEPs

1637: stated above. Moreover, we can

1638: interpret the result of Theorem~5

1639: as the natural analog of the classical

1640: discrete AEP (\ref{eq:discreteAEP}) to

1641: the case of lossy data compression.

1642: As we argued in the

1643: introduction,

1644: the generalized AEPs of the previous

1645: sections play analogous roles in the proofs

1646: of the corresponding direct coding

1647: theorems.

1648:

1649: Taking this analogy further indicates

1650: that there might be a relationship

1651: between these two different

1652: generalizations.

1653: In particular, when $n$ is large and

1654: the distortion level $D$ is small,

1655: the following heuristic

1656: calculation seems compelling:

1657:

1658: \ben

1659: -H(\BBP\|\BBQ)

1660: &\approxa&

1661: -\frac{1}{n}\log \frac{dP_n}{dQ_n}(X_1^n)\\

1662: &\approxb&

1663: -\frac{1}{n}\log \frac{P_n(B(X_1^n,D))}{Q_n(B(X_1^n,D))}\\

1664: &=&

1665: -\frac{1}{n}\log P_n(B(X_1^n,D))

1666: +\frac{1}{n}\log Q_n(B(X_1^n,D))\\

1667: &\approxc&

1668: R(\BBP,\BBP,D)-R(\BBP,\BBQ,D)\\

1669: &\approxd&-H(\BBP\|\BBQ)

1670: \een

1671: where $(a)$ holds in the limit as $n\to\infty$

1672: by Theorem~6, $(b)$ should hold when $D$ is small

1673: by the assumption that $P_n$ has a density

1674: with respect to $Q_n$, $(c)$ would

1675: follow in the limit as $n\to\infty$ by

1676: an application of the generalized AEP,

1677: and it is natural to conjecture

1678: that $(d)$ holds in the limit

1679: as $D\downarrow 0$ by reading

1680: the above calculation backwards.

1681:

1682: In the following two sections we

1683: formalize the above heuristic

1684: argument in two special cases:

1685: First when $\Xp$ is a discrete

1686: process taking values in a finite

1687: alphabet, and second when $\Xp$ is

1688: a continuous process

1689: taking values in $\RL^d$.

1690:

1691:

1692: \subsubsection{Discrete Case}

1693: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1694: Here we take $\Xp$ to be a stationary ergodic

1695: process taking values in a finite alphabet $A$,

1696: and $\Yp$ to be $\iid$ with first order marginal

1697: distribution $Q=Q_1$ on the same alphabet $A=\Ahat$.

1698: Similarly we write $P=P_1$ for the first order

1699: marginal of $\Xp$.

1700: In Theorem~7 we justify the above

1701: calculation by showing that the

1702: limits as $D\downarrow 0$ and as $n\to\infty$

1703: can indeed be taken together in any

1704: fashion: We show that the double

1705: limit of the central expression

1706: \be

1707: r_n(X_1^n,D)

1708: \bydef

1709: \frac{1}{n}\log \frac{P_n(B(X_1^n,D))}{Q_n(B(X_1^n,D))}

1710: \label{eq:ratio}

1711: \ee

1712: is equal to $H(\BBP\|\BBQ)$ with probability 1,

1713: independently of how $n$ grows and

1714: $D$ decreases to zero. Its proof is

1715: given in Appendix~A.

1716:

1717: \medskip

1718:

1719: {\em Theorem~7. Densities vs. Balls in the Discrete Case:}

1720: Let $\Xp$ be a stationary ergodic process

1721: and $\Yp$ be $\iid$, both on the finite

1722: alphabet $A$. Assume that $\rho(x,y)=0$

1723: if and only if $x=y$, and $Q(x)>0$ for all $x$.

1724: Then the following

1725: double limit exists:

1726: $$\limnd

1727: \frac{1}{n}\log

1728:         \frac{P_n(B(X_1^n,D))}

1729:              {Q^n(B(X_1^n,D))}

1730: 	\;=\; H(\BBP\|\BBQ)

1731: \;\;\;\;\mbox{w.p.1}

1732: $$

1733: In particular, the repeated limit

1734: $\lim_{n}\lim_{D}$

1735: exists with probability one

1736: and is equal to $H(\BBP\|\BBQ)$.

1737:

1738: \subsubsection{Continuous Case}

1739: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1740:

1741: Here we state a weaker version of Theorem~7 in the

1742: case when

1743: $A=\Ahat=\RL^d$ for some $d\geq 1$, and

1744: when $\Xp$ is an $\RL^d$-valued,

1745: stationary ergodic process.

1746: Suppose that the marginals $\{P_n\}$ of

1747: $\Xp$ are absolutely continuous

1748: with respect to a sequence

1749: of reference measures $\{Q_n\}$. Throughout

1750: this section we take the $Q_n$

1751: to be product measures, $Q_n=Q^n,$

1752: for some fixed Borel probability

1753: measure $Q$ on $\RL^d$.

1754: A typical example to keep in mind

1755: is when $Q$ is a Gaussian measure on

1756: $\RL$ and $\Xp$ is a real-valued stationary

1757: ergodic process all of whose marginals

1758: $P_n$ have continuous densities

1759: with respect to Lebesgue measure.

1760:

1761: For simplicity, we take $\rho$ to be

1762: squared-error distortion,

1763: $\rho(x,y)=(x-y)^2$, although

1764: the proof of Theorem~8, given in

1765: Appendix~B, may easily be adapted to

1766: apply for somewhat more general

1767: difference distortion measures.

1768:

1769: \medskip

1770:

1771: {\em Theorem~8. Densities vs. Balls in the Continuous Case:}

1772: Let $\Xp$ be an $\RL^d$-valued stationary ergodic process,

1773: whose marginals $P_n$ have densities $f_n=dP_n/dQ_n$ with

1774: respect to a sequence of product measures $Q_n=Q^n$,

1775: $n\geq 1$, for a given probability measure $Q$ on $\RL^d$.

1776: Let $\rho(x,y)=(x-y)^2$ for any $x,y \in \RL^d$.

1777:

1778: (a) The following repeated limit holds:

1779: $$\lim_{n\to\infty}

1780:   \lim_{D\downarrow 0}\;

1781:         \frac{1}{n}\log

1782:         \frac{P_n(B(X_1^n,D))}

1783:              {Q_n(B(X_1^n,D))}

1784: 	= H(\BBP\|\BBQ)

1785: 	\;\;\;\;\mbox{w.p.1.}

1786: $$

1787:

1788: (b) Assume, moreover, that $\Xp$ is $\iid$

1789: with marginal distribution $P_1=P$ on $\RL^d$,

1790: and that the following conditions are satisfied:

1791: Both $E_{P\times Q}[\rho(X,Y)]$ and

1792: $E_{P\times P}[\rho(X,Y)]$ are finite

1793: and nonzero; the expectation

1794: $$E_P[-\log Q(B(X,D))]

1795: \;\;\;\;\mbox{is finite for all}\;D>0;$$

1796: and a $\delta>0$ exists for which

1797: \be

1798: E_P\left[\sup_{0<D<\delta} \left|

1799: 	\log \frac{P(B(X,D))}{Q(B(X,D))} \right|\right]<\infty.

1800: \label{eq:integrability}

1801: \ee

1802: Then, the reverse repeated limit also holds:

1803: $$\lim_{D\downarrow 0}

1804:   \lim_{n\to\infty}\;

1805:         \frac{1}{n}\log

1806:         \frac{P_n(B(X_1^n,D))}

1807:              {Q_n(B(X_1^n,D))}

1808: 	= H(\BBP\|\BBQ)

1809: 	\;\;\;\;\mbox{w.p.1.}

1810: $$

1811:

1812: \medskip

1813:

1814: It is easy to check that all conditions of the

1815: theorem hold when $Q$ is a Gaussian measure on $\RL$

1816: and $P$ has finite variance and a probability density

1817: function $g$ (with respect to Lebesgue measure)

1818: such that $E_P(\sup_{|y-X|<\delta} |\log g(y)|)<\infty$

1819: for some $\delta>0$.  For example, this is the case

1820: when both $P$ and $Q$ are Gaussian distributions on $\RL$.

1821:

1822: As will be seen from the proof of the theorem,

1823: although we are primarily interested in the

1824: case when the relative entropy rate $H(\BBP\|\BBQ)$

1825: is finite, the result remains true when

1826: $H(\BBP\|\BBQ)=\infty$, and in that case

1827: assumption (\ref{eq:integrability}) can be relaxed to

1828: $$E_P\left[\sup_{0<D<\delta} \log \frac{Q(B(X,D))}{P(B(X,D))}

1829: \right]<\infty.$$

1830:

1831: %Y added Feldman reference

1832:

1833: Finally we note that, in the context of ergodic

1834: theory, Feldman \cite{feldman:80} developed

1835: a different version of the generalized AEP,

1836: and also discussed the relationship between

1837: the two types of asymptotics (as $n\to\infty$,

1838: and as $D\downarrow 0$).

1839:

1840: % \newpage

1841: \section{Applications of the Generalized AEP}

1842: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1843: As outlined in the

1844: introduction, the generalized AEP can

1845: be applied to a number of problems in data compression

1846: and pattern matching. Following along the lines of the

1847: corresponding applications in the lossless case, below

1848: we present applications of the results of the previous

1849: section to: 1.~Shannon's random coding schemes;

1850: 2.~mismatched codebooks in lossy data compression;

1851: 3.~waiting times between stationary processes

1852: (corresponding to idealized Lempel-Ziv coding);

1853: 4.~practical lossy Lempel-Ziv coding for memoryless

1854: sources; and 5.~weighted codebooks in

1855: rate-distortion theory.

1856:

1857: \subsection{Shannon's Random Codes}

1858: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1859: Shannon's well-known construction of optimal

1860: codes for lossy data compression is based on

1861: the idea of generating a random codebook. We

1862: review here a slightly modified version of

1863: his construction \cite{shannon:59}

1864: and describe how the performance of the

1865: resulting random code can be analyzed

1866: using the generalized AEP.

1867:

1868: % \paragraph*{Shannon's Random Codebooks.}

1869: Given a sequence of probability distributions

1870: $Q_n$ on $\Ahatn$, $n\geq 1$, we generate a

1871: {\em random codebook according to the measures $Q_n$}

1872: as an infinite sequence of $\iid$ random vectors

1873: $$Y_1^n(i),\;\;\;\;i\geq 1$$

1874: with each $Y_1^n(i)$ having distribution

1875: $Q_n$ on $\Ahatn$. Suppose that, for a fixed $n$,

1876: this codebook is available to both the encoder and decoder.

1877: Given a source string $X_1^n$ to

1878: be described with distortion $D$ or less,

1879: the encoder looks for a $D$-close match of

1880: $X_1^n$ into the codebook $\{Y_1^n(i)\;;\;i\geq 1\}$.

1881: Let $i_n$ be the position of the first such match

1882: \ben

1883: i_n\bydef \inf \{i\geq 1\;:\;\rho_n(X_1^n,Y_1^n(i))\leq D\}

1884: % \label

1885: \een

1886: with the convention that the infimum of

1887: the empty set equals $+\infty$. If a

1888: match is found, then the encoder describes

1889: to the decoder the position $i_n$ using

1890: Elias' code for the integers

1891: \cite{elias}. This takes no more than

1892: \be

1893: \log_2 i_n + 2\log_2\log_2 i_n + \mbox{Const.}

1894: \;\;\;\;\mbox{bits}.

1895: \label{eq:elias}

1896: \ee

1897: If no match is found

1898: (something that asymptotically will

1899: {\em not} happen, with probability one),

1900: then the encoder describes $X_1^n$ with

1901: distortion $D$ or less using some other

1902: default scheme.

1903:

1904: Let $\ell_n(X_1^n)$

1905: denote the overall description

1906: length of the algorithm just

1907: described. In view of (\ref{eq:elias}),

1908: in order to understand its

1909: compression performance,

1910: that is, to understand the

1911: asymptotic behavior of

1912: $\ell_n(X_1^n)$, it suffices

1913: to understand the behavior of the quantity

1914: $$\log_2 i_n,\;\;\;\;\mbox{for large $n$.}$$

1915: Suppose that the probability

1916: $Q_n(B(X_1^n,D))$ of finding a $D$-close

1917: match for $X_1^n$ in the codebook is nonzero.

1918: Then, conditional on the source string $X_1^n$,

1919: the distribution of $i_n$ is geometric with

1920: parameter $Q_n(B(X_1^n,D))$. From this

1921: observation it is easy to deduce that

1922: the behavior of $i_n$ is closely

1923: related to the behavior of the quantity

1924: $1/Q_n(B(X_1^n,D))$. The next theorem is

1925: an easy consequence of this fact so it is

1926: stated here without proof; see the

1927: corresponding arguments in

1928: \cite{kontoyiannis-red:00}\cite{konto-zhang:00}.

1929:

1930: \medskip

1931:

1932: {\em Theorem~9. Strong Approximation:}

1933: Let $\Xp$ be an arbitrary process and

1934: let $\{Q_n\}$ be a given sequence of

1935: codebook distributions.

1936: If $Q_n(B(X_1^n,D))>0$ eventually with

1937: probability one,

1938: then for any $\epsilon>0$:

1939: \ben

1940: \log_2 i_n

1941: &\leq& -\log_2 Q_n(B(X_1^n,D)) + \log_2\log_2 n + 3

1942: 	\;\;\;\;\mbox{eventually, w.p.1}\\

1943: \mbox{and}\;\;

1944: 	\log_2 i_n

1945: &\geq&

1946: -\log_2 Q_n(B(X_1^n,D)) -

1947: \log_2 n - (1+\epsilon)\log_2\log_2 n

1948: \;\;\;\;\mbox{eventually, w.p.1.}

1949: \een

1950:

1951: \medskip

1952:

1953: The above estimates can now be combined

1954: with the results of the generalized AEP

1955: in the previous section to determine the

1956: performance of codes based on random

1957: codebooks with respect to the ``optimal''

1958: measures $Q_n$. To illustrate this

1959: approach we consider the special case

1960: of memoryless sources and finite

1961: reproduction alphabets, and show that

1962: the random code with respect to

1963: (almost) any random codebook realization

1964: is asymptotically optimal, with

1965: probability one. Note that corresponding

1966: results can be proved, in exactly the

1967: same way, under much more general

1968: assumptions. For example, utilizing

1969: Theorem~5 instead of Theorem~1 we

1970: can prove the analog of Theorem~10

1971: below for arbitrary stationary

1972: ergodic sources.

1973:

1974: Let $\Xp$ be an $\iid$ source with

1975: marginal distribution $P_1=P$

1976: on $A$, and take the reproduction

1977: alphabet $\Ahat$ to be finite.

1978: For simplicity we will

1979: assume that the distortion measure

1980: $\rho$ is bounded, i.e.,

1981: $\sup_{x,y}\rho(x,y)<\infty,$

1982: and we also make the customary

1983: assumption that

1984: \be

1985: \sup_{x\in A}\min_{y\in\hat{A}}\rho(x,y) = 0.

1986: \label{eq:maximin}

1987: \ee

1988: [See the remark at the end of Section~5.1.1 for

1989: a discussion of this condition and when it can

1990: be relaxed.]

1991: As usual, we define the rate-distortion

1992: function of the memoryless source

1993: $\Xp$ by

1994: $$R(D)=\inf_{(X,Y)}\,I(X;Y)$$

1995: where the infimum is over all jointly

1996: distributed random variables $(X,Y)$

1997: with values in $A\times\Ahat$, such

1998: that $X$ has distribution $P$

1999: and $E[\rho(X,Y)]\leq D$.

2000: Let

2001: \be

2002: \Dbar\bydef \min_{y\in\hat{A}}E_P[\rho(X,y)]

2003: \label{eq:Dbar}

2004: \ee

2005: and note that $R(D)=0$ for $D\geq\Dbar$.

2006: To avoid the trivial case when

2007: $R(D)=0$ for all $D,$ we assume

2008: that $\Dbar>0$ and we restrict

2009: our attention to the interesting

2010: range of values $D\in(0,\Dbar)$.

2011: Recall \cite{yang-zhang:99}\cite{kontoyiannis-red:00}

2012: that for any such $D$,

2013: $R(D)$ can alternatively be

2014: written as

2015: $$R(D)=\inf_Q R_1(P,Q,D)$$

2016: where the infimum is over all

2017: probability distributions $Q$ on $\Ahat$.

2018: Since we take $\Ahat$ to be finite,

2019: this infimum is always achieved

2020: (see \cite{kontoyiannis-red:00})

2021: by a probability distribution

2022: $Q=Q^*$.

2023: To avoid cumbersome notation in the statements

2024: of the coding theorems given next and also in

2025: later parts of the paper, we also write

2026: $\calR(D)$ for the rate-distortion

2027: function of the source $\Xp$ expressed

2028: in {\em bits} rather than in nats:

2029: $$\calR(D)\bydef (\log_2 e)R(D).$$

2030: Finally, we write $Q_n^*$ for the product

2031: measures $(Q^*)^n$ and call

2032: $\{Q_n^*\}$ the {\em optimal reproduction

2033: distributions at distortion level $D$.}

2034:

2035: Combining Theorem~9 with the

2036: generalized AEP of Theorem~1

2037: implies the following

2038: strengthened direct

2039: coding theorem.

2040:

2041: \medskip

2042:

2043: {\em Theorem~10. Pointwise Coding Theorem

2044: for I.I.D. Sources \cite{kontoyiannis-red:00}:}

2045: Let $\Xp$ be an $\iid$ source with distribution

2046: $P$ on $A$, and let $Q_n^*$ denote the optimal

2047: reproduction distributions at distortion level

2048: $D\in(0,\Dbar)$.

2049: Then the codes based on almost any realization

2050: of the Shannon random codebooks according

2051: to the measures $\{Q_n^*\}$ have codelengths

2052: $\ell_n(X_1^n)$

2053: satisfying:

2054: $$\lim_{n\to\infty}\frac{1}{n}\ell_n(X_1^n)

2055: = \calR(D)\;\;\;\;\mbox{bits per symbol, w.p.1.}$$

2056:

2057: \medskip

2058:

2059: A simple modification of the above scheme can

2060: be used to obtain {\em universal} codebooks

2061: that achieve optimal compression for any

2062: memoryless source:

2063: Given a fixed block-length $n$, we consider

2064: the collection of all $n$-types on $\Ahat$,

2065: namely, all distributions $Q$ of the form

2066: $Q(\hat{a})=j/n$, $0\leq j\leq n$, for

2067: $\hat{a}\in\Ahat$. Instead of generating

2068: a single random codebook according to the

2069: optimal distribution $Q_n^*$, we generate

2070: {\em multiple codebooks}, one for each

2071: product measure $Q^n$ corresponding to an

2072: $n$-type $Q$ on $\Ahat$. Then we (as the

2073: encoder) adopt a greedy coding strategy. We find

2074: the first $D$-close match for $X_1^n$ in

2075: each of the codebooks, and pick the one

2076: in which the match appears the earliest.

2077: To describe $X_1^n$ to the decoder with

2078: distortion $D$ or less we then describe

2079: two things: (a)~the index of the codebook

2080: in which the earliest match was found,

2081: and (b)~the position $i_n$ of this

2082: earliest match. Since there are at

2083: most polynomially many $n$-types

2084: (cf. \cite{csiszar:book}\cite{cover:book}),

2085: the rate of the description of (a) is

2086: asymptotically negligible. Moreover,

2087: since the set of $n$-types is

2088: asymptotically dense among probability

2089: measures on $\Ahat$, we eventually

2090: do as well as if we were using the

2091: optimum codebook distribution $Q_n^*$.

2092:

2093: \medskip

2094:

2095: {\em Theorem~11. Pointwise Universal Coding Theorem

2096: \cite{kontoyiannis-red:00}:}

2097: Let $\Xp$ be an arbitrary $\iid$ source with

2098: distribution $P$ on $A$, let $R(D)$

2099: be the rate-distortion function of this source

2100: at distortion level $D\in(0,\Dbar)$,

2101: and let $\calR(D)$ denote its

2102: rate-distortion function in bits.

2103: The codes

2104: based on almost any realization of the

2105: universal Shannon random codebooks have

2106: codelengths $\ell_n(X_1^n)$ satisfying:

2107: $$\lim_{n\to\infty}\frac{1}{n}\ell_n(X_1^n)

2108: = \calR(D)\;\;\;\;\mbox{bits per symbol, w.p.1.}$$

2109:

2110: \subsection{Mismatched Codebooks}

2111: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

2112: In the last section we described how,

2113: for memoryless sources, the Shannon

2114: random codebooks with respect to the

2115: optimal reproduction distributions can

2116: be used to achieve asymptotically

2117: optimal compression performance. In this

2118: section we briefly consider the question

2119: of determining the rate achieved

2120: when an arbitrary (stationary ergodic)

2121: source $\Xp$ is encoded using a

2122: random codebook according to the

2123: $\iid$ distributions $Q^n$,

2124: for an arbitrary distribution $Q$

2125: on $\Ahat$. For further discussion

2126: of the problem of mismatched

2127: codebooks see

2128: \cite{sakrison:69}\cite{sakrison:70}\cite{lapidoth:97}\cite{kanlis:phd}

2129: and the references therein.

2130:

2131: The following theorem is an immediate

2132: consequence of combining Theorem~1

2133: with Theorem~9 and the discussion in

2134: Section~3.1 (see also Example~1 in

2135: Section~2.2).

2136:

2137: \medskip

2138:

2139: {\em Theorem~12. Mismatched Coding Rate:}

2140: Let $\Xp$ be a stationary ergodic process

2141: with marginal distribution $P_1=P$ on $A$,

2142: let $Q$ be an arbitrary distribution

2143: on $\Ahat$, and define $\Dmin$ and

2144: $\Dav$ as in Section~2.2.

2145: \begin{itemize}

2146: \item[(a)]{\em Arbitrary I.I.D. Codebooks:}

2147:   For any distortion level $D\in(\Dmin,\Dav)$,

2148:   the codes based on almost any realization

2149:   of the Shannon random codebooks according

2150:   to the measures $\{Q^n\}$ have codelengths

2151: 	$\ell_n(X_1^n)$ satisfying:

2152:   $$\lim_{n\to\infty}\frac{1}{n}\ell_n(X_1^n)

2153:   = (\log_2 e)R_1(P,Q,D)\;\;\;\;\mbox{bits per symbol, w.p.1.}$$

2154: \item[(b)]{\em I.I.D. Gaussian Codebooks:}

2155:   Suppose

2156: $\rho(x,y)=(x-y)^2$ and

2157: $\Xp$ is a real-valued process with

2158:   finite variance $\sigma^2=\VAR(X_1)$.

2159:   Let $Q$ be the $N(0,\tau^2)$ distribution

2160:   on $\RL$. Then for any distortion level

2161:   $D\in(0,\sigma^2+\tau^2)$, the codes based on

2162:   almost any realization of the Gaussian

2163:   codebooks according to the measures $\{Q^n\}$

2164:   have codelengths

2165:   $\ell_n(X_1^n)$ satisfying:

2166:   $$\lim_{n\to\infty}\frac{1}{n}\ell_n(X_1^n)

2167:   = \frac{1}{2}\log_2\left(\frac{v}{D}\right)

2168: 	-(\log_2 e)\frac{(v-D)(v-\sigma^2)}

2169:                         {2v\tau^2}

2170: 	\;\;\;\;\mbox{bits per symbol, w.p.1,}$$

2171:   where

2172:   $$v\bydef\frac{1}{2}\left[\tau^2+\sqrt{\tau^4+4D\sigma^2}\right].$$

2173: \end{itemize}

2174:

2175: \medskip

2176:

2177: {\em Lossless vs. Lossy Mismatch:} Recall

2178: that, in the case of lossless data compression,

2179: if instead of the true source distribution

2180: $P$ a different coding distribution $Q$ is used,

2181: then the code-rate achieved is

2182: \be

2183: H(P)+H(P\|Q).

2184: \label{eq:penalty1}

2185: \ee

2186: Similarly in the current setting of lossy

2187: data compression, if instead of the optimal

2188: reproduction distribution $Q^*$ we use a

2189: different codebook distribution $Q$, the

2190: rate we achieve is $R_1(P,Q,D)$.

2191: An upper bound for $R_1(P,Q,D)$ is

2192: obtained by taking $(X,Y)$

2193: in the expression of Remark~1

2194: %-------

2195: to be the jointly distributed random

2196: variables that achieve the infimum

2197: in the definition of the rate-distortion

2198: function of $P$. Then the (mismatched) rate

2199: of the random code based on $Q$ instead

2200: of $Q^*$ is:

2201: \be

2202: R_1(P,Q,D)\leq R(D) + H(Q^*\|Q).

2203: \label{eq:penalty2}

2204: \ee

2205: Equations (\ref{eq:penalty1})

2206: and (\ref{eq:penalty2}) illustrate

2207: the analogy between the penalty terms

2208: in the lossless and lossy cases

2209: due to mismatch.

2210:

2211: \medskip

2212:

2213: Next we discuss two special cases

2214: of part~(b) of the theorem

2215: that are of particular interest.

2216:

2217: \medskip

2218:

2219: {\em Example~2: Gaussian codebook with

2220: mismatched distribution:}

2221: Consider the following coding scenario:

2222: We want to encode data generated by

2223: an $\iid$ Gaussian process

2224: with $N(0,\sigma^2)$ distribution,

2225: with squared-error distortion

2226: $D$ or less.

2227: In this case, it is well-known

2228: \cite{berger:book}\cite{cover:book} that

2229: for any $D\in(0,\sigma^2)$

2230: the optimal reproduction distribution $Q^*$

2231: is the $N(0,\sigma^2-D)$ distribution,

2232: so we construct random codebooks

2233: according to the $\iid$ distributions

2234: $Q_n^*=(Q^*)^n$.

2235:

2236: But suppose that, instead of an

2237: $\iid$ Gaussian, the source turns out

2238: to be some arbitrary stationary ergodic

2239: $\Xp$ with zero mean and variance $\sigma^2$.

2240: Theorem~12~(b) implies that the asymptotic

2241: rate achieved by our $\iid$ Gaussian

2242: codebook is equal to

2243: $$\frac{1}{2}\log_2\left(\frac{\sigma^2}{D}\right)

2244: \;\;\;\;\mbox{bits per symbol.}$$

2245: Since this is exactly the

2246: rate-distortion function of the

2247: $\iid$ $N(0,\sigma^2)$ source, we

2248: conclude that the rate achieved is

2249: the same as what we would have

2250: obtained on the Gaussian source we

2251: originally expected. This offers

2252: yet another justification of the

2253: folk theorem that the Gaussian

2254: source is the hardest one to compress,

2255: among sources with a fixed variance.

2256: In fact, the above result is

2257: a natural fixed-distortion

2258: analog of \cite[Theorem~3]{lapidoth:97}.

2259:

2260: \medskip

2261:

2262: {\em Example~3: Gaussian codebook with mismatched variance:}

2263: Here we consider a different type of mismatch.

2264: As before, we are prepared to encode an

2265: $\iid$ Gaussian source, but we have an

2266: incorrect estimate of its variance,

2267: say $\hat{\sigma}^2$ instead of the true

2268: variance $\sigma^2$. So we are using

2269: a random codebook with respect to the

2270: optimal reproduction distribution

2271: $Q_n^*=(Q^*)^n$, where $Q^*$ is the

2272: $N(0,\hat{\sigma}^2-D)$ distribution,

2273: but the actual source is $\iid$

2274: $N(0,\sigma^2)$. In this case,

2275: the rate achieved by

2276: the random codebooks according to

2277: the distributions $Q_n^*$ is given

2278: by the expression in Theorem~12~(b),

2279: with $\tau^2$ replaced by $\hat{\sigma}^2-D$.

2280: Although the resulting expression

2281: is somewhat long and not easy to

2282: manipulate analytically, it is

2283: straightforward to evaluate

2284: numerically. For example,

2285: Figure~1 shows the asymptotic

2286: rate achieved, as a function of the

2287: error $e=\sigma^2-\hat{\sigma}^2$

2288: in the estimate of the true variance.

2289: As expected, the best rate is

2290: achieved when the codebook distribution

2291: is matched to the source (corresponding to $e=0$),

2292: and it is equal to the rate-distortion function

2293: of the source. Moreover, as one might

2294: expect, it is more harmful to

2295: underestimate the variance

2296: than to overestimate it.

2297:

2298: \begin{figure}[ht]

2299: \centerline{\epsfxsize 3.2in \epsfbox{rate.eps}}

2300: \caption{This graph shows the rate achieved by an

2301: $\iid$ Gaussian codebook of variance $\hat{\sigma}^2-D$

2302: when applied to $\iid$ $N(0,\sigma^2)$ data.

2303: The rate is shown as a function of the error

2304: $e=\sigma^2-\hat{\sigma}^2$ in the variance estimate.

2305: In this particular example: $\sigma^2=2$, $D=1$,

2306: the error $e$ ranges from $-1/2$ to $1/2$,

2307: and the rate-distortion function of the source

2308: equals 0.5 bits/symbol.}

2309: \end{figure}

2310:

2311: \subsection{Waiting Times and Idealized Lempel-Ziv Coding}

2312: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

2313: Given $D\geq 0$ and two independent realizations

2314: from the stationary ergodic processes $\Xp$ and $\Yp$,

2315: our main quantity of interest here is the

2316: {\em waiting time} $W_n=W_n(D)$ until a $D$-close

2317: version of the initial string $X_1^n$ first appears

2318: in $Y_1^\infty$. Formally

2319: \be

2320: W_n\;=\;\inf\{i\geq 1\; :\;

2321: 	\rho_n(X_1^n,Y_i^{i+n-1})\leq D\}

2322: \label{eq:Wn-def}

2323: \ee

2324: with the convention, as before, that the infimum

2325: of the empty set equals $+\infty$.

2326:

2327: The motivation for studying the asymptotic behavior

2328: of $W_n$ for large $n$ is twofold.

2329:

2330: \medskip

2331:

2332: {\em Idealized Lempel-Ziv coding.}

2333: The natural extension of the idealized

2334: scenario described in the

2335: introduction

2336: is to consider a message $X_1^n$ that

2337: is to be encoded with the help of a

2338: database $Y_1^\infty$.

2339: The

2340: source and

2341: the database are assumed to be

2342: independent, and the database

2343: distribution may or may not be

2344: the same as that of the source.

2345: In order to communicate $X_1^n$

2346: to the decoder with distortion

2347: $D$ or less, the encoder simply

2348: describes $W_n$, using no more

2349: than

2350: $$\log_2 W_n + O(\log_2\log_2 W_n)\;\;\;\;\mbox{bits.}$$

2351: Therefore, the asymptotic performance

2352: of this idealized scheme can be

2353: completely understood in terms

2354: of the asymptotics of $\log W_n$,

2355: for large $n$.

2356:

2357: \medskip

2358:

2359: {\em DNA pattern matching.}

2360: Here we imagine that $X_1^n$ represents

2361: a DNA or protein ``template,'' and we

2362: want to see whether it appears, either

2363: exactly or approximately, as a contiguous

2364: substring of a database DNA sequence

2365: $Y_1^\infty$. We are interested in

2366: quantifying the ``degree of surprise''

2367: in the fact that a $D$-close match was

2368: found at position $W_n$. Specifically,

2369: was the match found ``atypically''

2370: early, or is the value of $W_n$

2371: consistent with the hypothesis

2372: that the template and the database

2373: are independent? For a detailed

2374: discussion, see, e.g.,

2375: \cite[Section~3.2]{dembo-zeitouni:book}\cite{karlin-ost:88}%

2376: \cite{agw:90}\cite{arratia-waterman}

2377: and the references therein.

2378:

2379: % \medskip

2380:

2381: \newpage

2382:

2383: If for a moment we consider

2384: the case when both $\Xp$ and

2385: $\Yp$ are $\iid$, we see that

2386: the waiting time $W_n$ is,

2387: at least intuitively, closely

2388: related to the index $i_n$ of

2389: Section~3.1.

2390: As the following result shows,

2391: although the distribution of

2392: $W_n$ is not exactly geometric,

2393: $W_n$ behaves very much

2394: like $i_n$, at least in the

2395: exponent. That is, the

2396: difference

2397: $$\log W_n -[-\log Q_n(B(X_1^n,D))]$$

2398: is ``small,'' eventually

2399: with probability one.

2400:

2401: Recall the definition of

2402: $\psi$-mixing from Section~2.3, and

2403: also the definition of the

2404: $\phi$-mixing coefficients of $\Yp$

2405: $$\phi(k)\;=\;\sup\{|\BBQ(B|A)-\BBQ(B)|\;:\;\;

2406: B\in\sigma(Y_{k}^{\infty}),\;

2407: A\in\sigma(Y_{-\infty}^0),\; \BBQ(A)>0\}$$

2408: where, as before, $\sigma(Y_i^j)$ denotes

2409: the $\sigma$-field generated by $Y_i^j$.

2410: The process $\Yp$ is called

2411: {\em $\phi$-mixing}

2412: if $\phi(k)\to 0$ as $k\to\infty$;

2413: see \cite{bradley} for an extensive

2414: discussion of $\phi$-mixing and related

2415: mixing conditions.

2416:

2417: \medskip

2418:

2419: {\em Theorem~13. Strong Approximation

2420: \cite{kontoyiannis-jtp}\cite{dembo-kontoyiannis}:}

2421: Let $\Xp$ and $\Yp$ be stationary ergodic processes,

2422: and assume that $\Yp$ is either $\psi$-mixing

2423: or $\phi$-mixing with summable $\phi$-mixing

2424: coefficients, $\sum_{k\geq 1} \phi(k)<\infty$.

2425: If $Q_n(B(X_1^n,D))>0$ eventually with

2426: probability one,

2427: then for any $\epsilon>0$:

2428: \ben

2429: -(1+\epsilon)\log n

2430: \;\leq\;

2431: \log [W_n Q_n(B(X_1^n,D))]

2432: \;\leq\;

2433: (2+\epsilon)\log n

2434: \;\;\;\;\mbox{eventually, w.p.1.}

2435: \een

2436:

2437: \medskip

2438:

2439: Theorem~13 of course implies that

2440: \be

2441: \log W_n = -\log Q_n(B(X_1^n,D)) + O(\log n)

2442: \;\;\;\;\mbox{w.p.1}

2443: \label{eq:strong}

2444: \ee

2445: and combining this with the generalized

2446: AEP statements of Theorems~1 and~4 we

2447: immediately obtain the first order

2448: (or strong-law-of-large-numbers, SLLN)

2449: asymptotic behavior of the waiting

2450: times $W_n$:

2451:

2452: \medskip

2453:

2454: {\em Theorem~14. SLLN for Waiting Times:}

2455: Let $\Xp$ and $\Yp$ be stationary ergodic processes.

2456:

2457: (a)~If $\Yp$ is $\iid$ and the

2458: average distortion $\Dav$ is finite,

2459: then for any $D\in(\Dmin,\Dav)$

2460: \be

2461: \frac{1}{n}\log W_n \to R_1(P_1,Q_1,D)

2462: \;\;\;\;\mbox{w.p.1.}

2463: \label{eq:w-slln}

2464: \ee

2465:

2466: (b)~If $\Yp$ is $\psi$-mixing and the distortion

2467: 	measure $\rho$ is bounded, then for any

2468: 	$D\in(\Dmin,\Dav)$

2469: \be

2470: \frac{1}{n}\log W_n \to R(\BBP,\BBQ,D)

2471: \;\;\;\;\mbox{w.p.1.}

2472: \label{eq:w-slln2}

2473: \ee

2474:

2475: \medskip

2476:

2477: Note that similar results can be

2478: obtained under different assumptions

2479: on the process $\Yp$, using Theorems~3

2480: and~5 in place of Theorems~1 and~4 as

2481: done above.

2482: When $\Xp$ is taken to be an

2483: arbitrary stationary ergodic process,

2484: it is natural to expect that the

2485: mixing conditions for $\Yp$ in

2486: Theorem~14~(b) cannot be

2487: substantially relaxed.

2488: In fact, even in the case of exact matching

2489: between finite-alphabet processes, Shields

2490: \cite{shields:3}

2491: has produced a counterexample demonstrating

2492: that the analog of Theorem~13 does not hold

2493: for arbitrary stationary ergodic $\Yp$.

2494:

2495: \medskip

2496:

2497: {\em Historical Remarks:}

2498: Waiting times in the context

2499: of lossy data compression were

2500: studied by Steinberg and Gutman

2501: \cite{steinberg-gutman} and {\L}uczak

2502: and Szpankowski \cite{luczak-szpankowski}.

2503: Yang and Kieffer \cite{yang-kieffer:1}

2504: identified the limiting rate-function

2505: for a wide range of finite alphabet

2506: sources, and Dembo and Kontoyiannis

2507: \cite{dembo-kontoyiannis} and Chi

2508: \cite{chi-it:01}

2509: generalized these results to processes with

2510: general alphabets.

2511:

2512: The strong approximation idea was

2513: introduced

2514: in \cite{kontoyiannis-jtp}

2515: in the case of exact matching.  For

2516: processes $\Yp$ with summable

2517: $\phi$-mixing coefficients, Theorem~13 was

2518: proved in \cite{dembo-kontoyiannis}, and when

2519: $\Yp$ is $\psi$-mixing it was proved, for the

2520: case of no distortion, in \cite{kontoyiannis-jtp}.

2521: Examining the latter proof, Chi \cite{chi-it:01}

2522: observed that it immediately generalizes to

2523: the statement of Theorem~13.

2524:

2525: Related results were obtained by

2526: Kanaya and Muramatsu \cite{kanaya-muramatsu:97},

2527: who extended some of the results of

2528: \cite{steinberg-gutman}

2529: to processes with general alphabets,

2530: and by Koga and Arimoto \cite{koga-arimoto:98}

2531: who considered {\em non-overlapping} waiting

2532: times between finite-alphabet processes

2533: and Gaussian processes.

2534: Finally, Shields \cite{shields:3}

2535: and Marton and Shields

2536: \cite{marton-shields:1}

2537: considered waiting times with

2538: respect to Hamming distortion

2539: and for $\Xp$ and $\Yp$ having

2540: the same distribution over a

2541: finite alphabet. For

2542: the case of small

2543: distortion they showed,

2544: under some conditions,

2545: that approximate matching

2546: results like (\ref{eq:w-slln})

2547: and (\ref{eq:w-slln2})

2548: reduce to their natural

2549: exact matching analogs as

2550: $D\to 0$.

2551:

2552: \subsection{Match-Lengths and Practical Lempel-Ziv Coding}

2553: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

2554: In the idealized coding scenario of

2555: the previous section we considered the

2556: case where a fixed-length message $X_1^n$

2557: is to be compressed using an infinitely

2558: long database $Y_1^\infty$. But, in practice,

2559: the reverse situation is much more common:

2560: We typically have a ``long'' message

2561: $(X_1,X_2,\dots)$ to be compressed, and

2562: only a finite-length database $Y_1^m$

2563: is available to the encoder and decoder.

2564: It is therefore natural (following

2565: the corresponding development in the

2566: case of lossless compression)

2567: to try and match ``as much as possible''

2568: from the message $(X_1,X_2,\dots)$ into the

2569: database $Y_1^m$.

2570: % \cite{wyner-ziv:3}, it is

2571: With this in mind we

2572: define the {\em match-length}

2573: $L_m$ as the length $\ell$ of

2574: the longest prefix $X_1^\ell$ that

2575: matches somewhere in the database

2576: with distortion $D$ or less:

2577: \be

2578: L_m=\sup\{\ell \geq 1\;:\;

2579: \rho_\ell(X_1^\ell,Y_{j}^{j+\ell-1})\leq D,

2580: \;\;\mbox{for some}\;\;j=1,2,\ldots,m\}.

2581: \label{eq:Lm-def}

2582: \ee

2583:

2584: Intuitively, there is a connection

2585: between match-lengths and waiting times.

2586: Long matches should mean short waiting times,

2587: and vice versa. In the case of exact matching

2588: this connection was precisely formalized by

2589: Wyner and Ziv \cite{wyner-ziv:1}, who observed

2590: that the following ``duality'' relationship

2591: always holds:

2592: \be

2593: W_n\leq m

2594: \;\;\;\;

2595: % \mbox{if and only if}

2596: \Leftrightarrow

2597: \;\;\;\;

2598: L_m\geq n.

2599: \label{eq:easy-dual}

2600: \ee

2601: This is almost identical to the

2602: standard relationship in renewal

2603: theory between the number of

2604: events by a certain time and

2605: the time of the $n$th event

2606: (see, e.g., \cite{fellerII:book}).

2607: Wyner and Ziv \cite{wyner-ziv:1}

2608: utilized (\ref{eq:easy-dual})

2609: to translate their first order

2610: asymptotic results about $W_n$

2611: to corresponding results about

2612: $L_m$.

2613:

2614: Unfortunately this simple relationship

2615: no longer holds in the case of

2616: {\em approximate} matching,

2617: when a distortion measure

2618: is introduced. Instead, the following

2619: modified duality was employed

2620: in \cite{dembo-kontoyiannis}

2621: to obtain corresponding

2622: results in approximate matching

2623: and lossy data compression:

2624: \be

2625: W_n\leq m\;\;\Rightarrow\;\;L_m\geq n

2626: \;\;\;\;

2627: \mbox{and}

2628: \;\;\;\;

2629: L_m\geq n\;\;\Rightarrow\;\;\inf_{k\geq n} W_k\leq m.

2630: \label{eq:duality}

2631: \ee

2632: In \cite{dembo-kontoyiannis} it is shown

2633: that (\ref{eq:duality}) can be used to

2634: deduce the asymptotic behavior of $L_m$

2635: from that of $W_n$,

2636: but this translation

2637: is not straightforward anymore.

2638: In fact, as we discuss in Section~5.2,

2639: a somewhat more

2640: delicate analysis is needed

2641: in this case.

2642: Nevertheless,

2643: once the

2644: behavior of the waiting

2645: times is understood,

2646: the first implication in

2647: (\ref{eq:duality}) immediately

2648: yields asymptotic {\em lower bounds}

2649: on the behavior of the match-lengths.

2650: This is significant for data compression

2651: since long match-lengths usually mean

2652: good compression performance.

2653: Indeed, this observation allowed

2654: \cite{kontoyiannis-lossy1-1} to introduce

2655: a new lossy version of the Lempel-Ziv algorithm

2656: that achieves asymptotically optimal

2657: compression performance for

2658: memoryless sources.

2659: The key characteristics of the

2660: algorithm are that it has

2661: polynomial implementation

2662: complexity, and that it

2663: achieves redundancy comparable

2664: to that of its lossless counterpart,

2665: the FDLZ \cite{wyner-ziv:3}.

2666:

2667: We also

2668: mention that, before

2669: \cite{kontoyiannis-lossy1-1},

2670: several practical (yet suboptimal)

2671: lossy versions of the Lempel-Ziv

2672: algorithm were introduced,

2673: perhaps most notably

2674: by Steinberg and Gutman

2675: \cite{steinberg-gutman} and {\L}uczak

2676: and Szpankowski \cite{luczak-szpankowski}.

2677: Roughly speaking, the reason for

2678: their suboptimal compression performance

2679: was that the coding was done with respect

2680: to a database that had the same

2681: distribution as the source. In view

2682: of the discussion in the previous

2683: section, it is clear that the asymptotic

2684: code-rate of these algorithms is

2685: $R_1(P,P,D)$, which is typically

2686: significantly larger than

2687: the optimal $R(D)=\inf_Q R_1(P,Q,D)$;

2688: see

2689: \cite{yang-kieffer:1} or

2690: \cite{kontoyiannis-lossy1-1}

2691: for

2692: more detailed discussions.

2693:

2694:

2695: \subsection{Weighted Codebooks and Sphere-Covering}

2696: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

2697: Here we describe a related question that was recently

2698: considered in \cite{covering-TR:99}. In the classical

2699: rate-distortion problem, one is interested in finding

2700: ``efficient'' codebooks for describing the output of some

2701: random source to within some tolerable distortion

2702: level. In terms of data compression, a codebook is

2703: ``efficient'' when it contains relatively few codewords,

2704: so that it yields a code with a low rate. Here we are

2705: interested in the more general problem of finding

2706: codebooks with small ``mass.''

2707:

2708: Let $\Xp$ be an

2709: $\iid$ process with marginal distribution

2710: $P$ on a finite alphabet $A$,

2711: and take $\Ahat = A$ and $\rho$

2712: a distortion measure with the property

2713: that $\rho(x,y)=0$ if and only if $x=y$.

2714: Let $M:A\to(0,\infty)$ be

2715: an arbitrary positive function

2716: assigning mass $M^n(C_n)$ to

2717: subsets $C_n$ of $A^n$:

2718: $$M^n(C_n)\bydef\sum_{y_1^n\in C_n} M^n(y_1^n)

2719: 	\bydef\sum_{y_1^n\in C_n}\prod_{i=1}^n M(y_i).$$

2720:

2721: The question of interest here can be

2722: stated as follows. Let $C_n$ be

2723: a subset of $A^n$ (we think of $C_n$

2724: as the codebook) that

2725: nearly $D$-covers all of $A^n$,

2726: i.e., with high probability,

2727: every string $X_1^n$ generated

2728: by the source will match at

2729: least one element of $C_n$

2730: with distortion $D$ or less:

2731: \be

2732: P^n\{\mbox{there is a $y_1^n\in C_n$ such that}\;

2733: 	\rho_n(X_1^n,y_1^n)\leq D\}\approx 1.

2734: \label{eq:cover}

2735: \ee

2736: If (\ref{eq:cover}) holds,

2737: how small can the mass of $C_n$ be?

2738:

2739: For example,

2740: taking $M$ identically equal to one,

2741: this problem reduces to the rate-distortion

2742: question. Taking $M$ to be a different

2743: probability measure $Q$, it reduces to

2744: the classical hypothesis testing question,

2745: whereas $M=P$ (the source distribution)

2746: yields ``converses''

2747: to some measure-concentration inequalities;

2748: see \cite{covering-TR:99}

2749: for a detailed treatment of

2750: these and more general cases.

2751:

2752: The next result characterizes the best growth

2753: exponent for the mass of an

2754: arbitrary codebook $C_n$.

2755:

2756: \medskip

2757:

2758: {\em Theorem~15: Weighted Codebooks \cite{covering-TR:99}:}

2759: Let $\Xp$ be an $\iid$ source on the finite

2760: alphabet $A=\Ahat$, and suppose that

2761: $\rho(x,y)=0$ if and only if $x=y.$

2762: \begin{itemize}

2763: \item[$(\Leftarrow)$] Let $C_n$ be an arbitrary subset of $A^n$,

2764: and write $D$ for  the expected distance of a source string

2765: $X_1^n$ from $C_n$:

2766: $$D=E_{P^n}[\min_{y_1^n\in C_n}

2767: 	\rho_n(X_1^n,y_1^n)].$$

2768: Then

2769: $$M^n(C_n)\geq e^{nr(D)}$$

2770: where the rate-function $r(D)=r(D;P,M)$ is defined by

2771: $$r(D)=r(D;P,M)=\inf_{(X,Y)}\{I(X;Y)+ E[\log M(Y)]\}$$

2772: and the infimum is taken over all jointly distributed

2773: random variables $(X,Y)$ with values in $A$, such that

2774: $X\sim P$ and $E[\rho(X,Y)]\leq D.$

2775: \item[$(\Rightarrow)$]

2776: For every $D\geq 0$

2777: there is a sequence

2778: of codebooks $\{C^*_n\}$ such that

2779: \ben

2780: &&

2781:         \limsup_{n\to\infty}\;

2782:         \frac{1}{n}\log M^n(C^*_n)\leq r(D)\\

2783: \mbox{and}&&

2784:         \limsup_{n\to\infty}\;

2785:         E_{P^n}[\min_{y_1^n\in C^*_n}

2786: 	\rho_n(X_1^n,y_1^n)]\leq D.

2787: \een

2788: \end{itemize}

2789:

2790: \medskip

2791:

2792: The main ingredient in the proof of the direct

2793: coding theorem in part~$(\Rightarrow)$ above

2794: is provided by yet another version

2795: of the generalized AEP. Let $(X^*,Y^*)$ be a pair

2796: of random variables achieving the infimum in the

2797: definition of $r(D)$, and let $Q^*$ be the

2798: distribution of $Y^*$. Now for $\delta>0$

2799: and $n\geq 1$ define the sets

2800: $${\cal G}_n=\{y_1^n\in A^n\;:\;

2801:         \hat{P}_{y_1^n}(b)\leq Q^*(b)+\delta,

2802:         \;\;\forall\, b\in A\}$$

2803: where $\hat{P}_{y_1^n}$ denotes the empirical

2804: distribution induced by $y_1^n$ on $A$.

2805: For each $n\geq 1$ define the ``conditioned''

2806: measure $Q^{(c)}_n$ on $A^n$ by conditioning

2807: the product measure $(Q^*)^n$ to the set

2808: ${\cal G}_n$. The next theorem provides

2809: the necessary version of the generalized

2810: AEP in this case.

2811:

2812: \medskip

2813:

2814: {\em Theorem~16: Generalized AEP

2815: for Conditioned Measures \cite{covering-TR:99}:}

2816: With the conditioned measures $Q^{(c)}_n$ defined

2817: as above, we have:

2818: $$\limsup_{n\to\infty} -\frac{1}{n}\log Q^{(c)}_n(B(X_1^n,D))

2819: 	\leq I(X^*;Y^*) \;\;\;\;\mbox{w.p.1.}$$

2820:

2821:

2822:

2823: \section{Refinements of the Generalized AEP}

2824: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

2825: As we saw in Section~3, the generalized AEP can

2826: be used to determine the first order asymptotic

2827: behavior of a number of interesting objects

2828: arising in applications. For example, the

2829: generalized AEP of Theorem~1

2830: $$-\frac{1}{n}\log Q^n(B(X_1^n,D))\to R_1(P,Q,D)

2831: 	 \;\;\;\;\mbox{w.p.1}$$

2832: immediately translated

2833: (via the strong approximation of

2834: Theorem~13)

2835: to a strong-law-of-large-numbers

2836: (SLLN) result for the waiting times:

2837: $$\frac{1}{n}\log W_n \to R_1(P,Q,D)

2838:          \;\;\;\;\mbox{w.p.1.}$$

2839:

2840: In this section we will prove refinements

2841: to the generalized AEP of Section~2.2,

2842: and in Section~5 we will revisit the applications

2843: of the previous section and use these refinements

2844: to prove corresponding second order asymptotic

2845: results.

2846:

2847: To get some motivation,

2848: let us consider for a moment

2849: the simplest version of the

2850: classical AEP, for an $\iid$

2851: process $\Xp$ with distribution

2852: $P$ on the finite alphabet $A$.

2853: The AEP here follows by a simple

2854: application of the law

2855: of large numbers,

2856: \be

2857: -\frac{1}{n}\log P^n(X_1^n)

2858: =

2859: \frac{1}{n}\sum_{i=1}^n[-\log P(X_i)]

2860: \to H

2861: \label{eq:oldPS}

2862: \ee

2863: where $H$ is the entropy of $P$.

2864: But (\ref{eq:oldPS}) contains

2865: more information than that:

2866: It says that $-\log P^n(X_1^n)$

2867: is in fact equal to the partial sum

2868: $S_n=\sum_{i=1}^n Z_i$ of the $\iid$

2869: random variables $Z_i=-\log P(X_i)$.

2870: Therefore we can apply the

2871: central limit theorem (CLT)

2872: or the law of the iterated

2873: logarithm (LIL) to get more

2874: precise information on the

2875: convergence of the AEP.

2876:

2877: The same strategy can be carried out

2878: for non-$\iid$ processes: Initially

2879: Ibragimov \cite{ibragimov:62}

2880: and then Philipp and Stout \cite{philipp-stout:book}

2881: showed that even when $\Xp$ is a Markov chain,

2882: or, more generally, a weakly dependent

2883: random process, the quantities $-\log P^n(X_1^n)$

2884: can be approximated by the partial sums of

2885: an associated weakly dependent process.

2886: These results have found a number of

2887: applications in lossless data

2888: compression and related areas

2889: \cite{kontoyiannis-jtp}\cite{kontoyiannis-97}.

2890:

2891: In this and the following section we will

2892: carry out a similar program in the lossy

2893: case. Throughout this section we will

2894: adopt the notation and assumptions

2895: of Section~2.2: Let

2896: $\Xp$ be a stationary ergodic

2897: process with first order marginal

2898: $P_1=P$ on $A$, and let $Q$ be

2899: an arbitrary probability measure

2900: on $\Ahat$. Define $\Dmin$ and $\Dav$,

2901: as before (as in equations (\ref{eq:Dmin})

2902: and (\ref{eq:Dav})), and

2903: assume that $\Dmin<\Dav$ so that

2904: the distortion measure $\rho(X,Y)$ is not

2905: essentially constant in $Y$ with positive

2906: probability. We also impose here the

2907: additional assumption that $\rho$ has

2908: a finite third moment:

2909: \be

2910: D_3\bydef

2911: E_{P\times Q}[\rho^3(X,Y)]<\infty.

2912: \label{eq:third}

2913: \ee

2914:

2915: The first result of this section

2916: refines Theorem~1 by giving a more

2917: precise asymptotic estimate of the

2918: quantity $-\log Q^n(B(X_1^n,D))$ in

2919: terms of the rate-function $R_1(P,Q,D)$

2920: and the empirical measure $\Phatn$

2921: induced by $X_1^n$ on $A$

2922: $$\Phatn\bydef\frac{1}{n}\sum_{i=1}^n\delta_{X_i}$$

2923: where $\delta_x$ denotes the measure assigning

2924: unit mass to $x\in A$.

2925:

2926: \medskip

2927:

2928: {\em Theorem~17: \cite{yang-zhang:99}:}

2929: Let $\Xp$ be a stationary ergodic process

2930: with marginal $P$ on $A$, and let $Q$ be

2931: an arbitrary probability measure on $\Ahat.$

2932: Assume that $D_3=E_{P\times Q}[\rho^3(X,Y)]$

2933: is finite. Then for any $D\in(\Dmin,\Dav)$:

2934: \be

2935: -\log Q^n(B(X_1^n,D))= nR_1(\hat{P}_n,Q,D)+\frac{1}{2}\log n + O(1)

2936: \;\;\;\;\mbox{w.p.1.}

2937: \label{eq:br}

2938: \ee

2939:

2940: \medskip

2941:

2942: Next we show that the most significant

2943: term in (\ref{eq:br}) can be approximated

2944: by the partial sum of a weakly dependent

2945: random process. Recall the definition of

2946: the $\alpha$-mixing coefficients of $\Xp$

2947: $$\alpha(k)\;=\;\sup\{|\BBP(A\cap B)-\BBP(A)\BBP(B)|\;:\;\;

2948: A\in\sigma(X_{-\infty}^0),\; B\in\sigma(X_{k}^{\infty})\}$$

2949: where $\sigma(X_i^j)$ is

2950: the $\sigma$-field generated by $X_i^j$.

2951: The process $\Xp$ is called {\em $\alpha$-mixing}

2952: if $\alpha(k)\to 0$ as $k\to\infty$;

2953: see \cite{bradley} for more details.

2954:

2955: We also need to recall some of the notation

2956: from the proof of Theorem~1 in Section~2.2.

2957: For $x\in A$ and $\la\in\RL$, let $\LA_x(\la)$

2958: denote the log-moment generating function

2959: of the random variable $\rho(x,Y)$

2960: $$\LA_x(\la)\bydef \log E_Q\left(e^{\lambda\rho(x,Y)}\right)$$

2961: and note that the function $\LA(\la)$ defined

2962: in (\ref{eq:GEcheck}) can be written

2963: as $\LA(\la)=E_P[\LA_X(\la)]$.

2964: Also recall that for any $D\in(\Dmin,\Dav)$ there

2965: exists a unique $\la^*<0$ such that

2966: $\LA'(\la^*)=D$.

2967:

2968: % \medskip

2969:

2970: \newpage

2971:

2972: {\em Theorem~18: \cite{dembo-kontoyiannis}:}

2973: Let $\Xp$ be a stationary $\alpha$-mixing process

2974: with marginal $P$ on $A$, and let $Q$ be

2975: an arbitrary probability measure on $\Ahat.$

2976: Assume that the $\alpha$-mixing coefficients

2977: of $\Xp$ satisfy

2978: \be

2979: \sum_{k=1}^\infty \alpha^t(k)<\infty,

2980: \;\;\;\;\mbox{for some $t\in(0,1/3)$}

2981: \label{eq:LIL-cond}

2982: \ee

2983: and that $D_3=E_{P\times Q}[\rho^3(X,Y)]$

2984: is finite. Then for any $D\in(\Dmin,\Dav)$:

2985: $$nR_1(\hat{P}_n,Q,D) = nR_1(P,Q,D) + \sum_{i=1}^n

2986: 	g(X_i) + O(\log\log n)

2987: \;\;\;\;\mbox{w.p.1}$$

2988: where

2989: \be

2990: g(x)\bydef \LA(\la^*) -\LA_x(\la^*),\;\;\;\;

2991: x\in A.

2992: \label{eq:functiong}

2993: \ee

2994:

2995: \medskip

2996:

2997: Theorem~18 is a small generalization

2998: of \cite[Theorem~3]{dembo-kontoyiannis}.

2999: Before giving its proof outline,

3000: we combine Theorems~17 and~18 to

3001: show that, as promised, $-\log Q^n(B(X_1^n,D))$

3002: can be accurately approximated as the

3003: partial sum of the weakly dependent

3004: random process $\{g(X_n)\}$.

3005:

3006: \medskip

3007:

3008: {\em Corollary~19: Second Order Generalized AEP:}

3009: Let $\Xp$ be a stationary $\alpha$-mixing process

3010: with marginal $P$ on $A$, and let $Q$ be

3011: an arbitrary probability measure on $\Ahat.$

3012: Assume that the $\alpha$-mixing coefficients

3013: of $\Xp$ satisfy

3014: (\ref{eq:LIL-cond})

3015: and that $D_3=E_{P\times Q}[\rho^3(X,Y)]$ is

3016: finite. Then for any $D\in(\Dmin,\Dav)$, and

3017: with $g(x)$ defined as in (\ref{eq:functiong}):

3018: $$

3019: -\log Q^n(B(X_1^n,D))= nR_1(P,Q,D) + \sum_{i=1}^ng(X_i)

3020: 	+ \frac{1}{2}\log n + O(\log\log n)

3021: \;\;\;\;\mbox{w.p.1.}$$

3022:

3023: \medskip

3024:

3025: {\em Proof Outline for Theorem~18:}

3026: Adapting the argument leading from

3027: (22) to (24) of \cite{dembo-kontoyiannis},

3028: one easily checks that the result of

3029: Theorem~18 holds as soon as

3030: \begin{eqnarray}

3031: \liminf_{n \to \infty} \inf_{|\theta|<\delta} B_n(\theta)

3032: &>&0

3033: 	\;\;\;\;\mbox{w.p.1}

3034: 	\label{Bncond}\\

3035: \mbox{and}\;\;\;\;

3036: \limsup_{n \to \infty} \frac{ n A_n^2}{\log \log n}

3037: &<&\infty

3038: 	\;\;\;\;\mbox{w.p.1}

3039: 	\label{Ancond}

3040: \end{eqnarray}

3041: where

3042: $A_n=

3043: n^{-1}

3044: \sum_{k=1}^n\zeta_k$ is

3045: the

3046: empirical mean of the centered

3047: random variables $\zeta_k=\Lambda_{X_k}'(\la^*)-D$,

3048: and $B_n(\theta)$ is the

3049: empirical mean

3050: of the non-negative random variables

3051: $\Lambda_{X_k}''(\la^*+\theta)$.

3052: By the ergodic theorem we have,

3053: with probability one,

3054: \begin{eqnarray*}

3055: \liminf_{n \to \infty} \inf_{|\theta|<\delta} B_n(\theta)

3056: &\geq&

3057: \liminf_{n \to \infty} \frac{1}{n} \sum_{k=1}^n

3058: \inf_{|\theta|<\delta} \Lambda_{X_k}''(\la^*+\theta) \\

3059: &=& E_P \left [ \inf_{|\theta|<\delta} \Lambda_{X}''(\la^*+\theta)

3060: 	\right]

3061: \end{eqnarray*}

3062: and by Fatou's lemma and the continuity of

3063: the map $\theta \mapsto \Lambda_{x}''(\la^*+\theta)$

3064: it follows that

3065: $$

3066: \liminf_{\delta \downarrow 0}

3067: E_P \left[

3068: 	\inf_{|\theta|<\delta} \Lambda_X''(\la^*+\theta)

3069: 	\right]

3070: \geq E_P [\Lambda_X''(\la^*)] = \Lambda''(\la^*) > 0.

3071: $$

3072: This implies that

3073: (\ref{Bncond}) holds once $\delta>0$

3074: is made small enough. [Note that the above

3075: argument also avoids an incorrect -- but

3076: also unnecessary -- application

3077: of the uniform ergodic theorem in the

3078: derivation of \cite[eq.~(26)]{dembo-kontoyiannis}.]

3079:

3080: Turning to (\ref{Ancond}), since $\la^*<0$,

3081: it follows by the convexity of $\Lambda_x(\la)$

3082: that for any $x\in A$:

3083: % and the non-negativity

3084: % of $\rho(\cdot,\cdot)$,

3085: $$

3086: 0 \leq \Lambda_x'(\la^*) \leq \Lambda_x'(0) = E_Q[\rho(x,Y)].

3087: $$

3088: Consequently, H\"older's inequality and assumption

3089: (\ref{eq:third}) imply that the random variable

3090: $$|\zeta_k| \leq E_Q[\rho(X_k,Y)|X_k]+D$$

3091: has a finite third moment.

3092: Recall  \cite{oodaira-yoshihara:71a}

3093: that the LIL holds for the partial sums $nA_n$ of a

3094: zero-mean, stationary process $\{\zeta_k\}$ with

3095: a finite third moment, as soon as

3096: the $\alpha$-mixing coefficients

3097: of $\{\zeta_k\}$ satisfy (\ref{eq:LIL-cond}).

3098: The observation

3099: that $\zeta_k$ is a deterministic

3100: function of $X_k$ for all $k$

3101: completes the proof. \qed

3102:

3103: \section{Applications -- Second Order Results}

3104: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

3105: Here we revisit the applications considered

3106: in Section~3, and using the

3107: ``second order generalized AEP''

3108: of Corollary~19 we prove second order

3109: refinements for many of the results from

3110: Section~3. In Section~5.1 we consider

3111: the problem of lossy data compression

3112: in the same setting as in Section~3.1.

3113: We use the second order AEP

3114: to determine the precise asymptotic

3115: behavior of the Shannon random codebooks,

3116: and show that, with probability one,

3117: they achieve optimal compression performance

3118: up to terms of order $(\log n)$ bits.

3119: Moreover, essentially the same compression

3120: performance can be achieved universally.

3121: For arbitrary variable-length codes

3122: operating at a fixed distortion level, we show

3123: that the rate at which they can achieve

3124: the optimal rate of $n\calR(D)$ bits is

3125: at best of order $O(\sqrt{n})$ bits.

3126: This is the

3127: best possible redundancy rate as

3128: long as the ``minimal coding variance''

3129: of the source is strictly positive.

3130: For discrete $\iid$ sources,

3131: a characterization is given of

3132: when this variance can be zero.

3133:

3134: In Section~5.2 we look at waiting times,

3135: and we prove a second order refinement to

3136: Theorem~14, and in Section~5.3 we

3137: consider the problem of determining

3138: the asymptotic behavior of longest

3139: match-lengths. As discussed briefly

3140: in Section~3.4, their asymptotics

3141: can be deduced from the corresponding

3142: waiting-times results via duality.

3143:

3144: \subsection{Lossy Data Compression}

3145: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

3146:

3147: \subsubsection{Random Codes and Second Order Converses}

3148: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

3149: Here we consider the exact same setup as in Section~3.1:

3150: An $\iid$ source $\Xp$ with distribution $P$ on $A$

3151: is to be compressed with distortion $D$ or less with

3152: respect to a bounded distortion measure

3153: $\rho$, satisfying, as before, the usual

3154: assumption (\ref{eq:maximin}) --

3155: see the remark at the end of this

3156: section for

3157: its implications.

3158: We take

3159: the reproduction alphabet $\Ahat$ to be

3160: finite, define $\Dbar$ as in (\ref{eq:Dbar}),

3161: and assume that $\Dbar>0$.

3162:

3163: For $D\in(0,\Dbar)$, let $Q_n^*$, $n\geq 1$,

3164: denote the optimal reproduction distributions

3165: at distortion level $D$. Combining the

3166: strong approximation Theorem~9 with the

3167: second order generalized AEP

3168: of Corollary~19 and the discussion in

3169: Section~3.1 yields:

3170:

3171: % \medskip

3172:

3173: \newpage

3174:

3175: {\em Theorem~20: Pointwise Redundancy for I.I.D. Sources

3176: \cite{kontoyiannis-red:00}:}

3177: Suppose $\Xp$ is an $\iid$ source with distribution

3178: $P$ on $A$, and with rate-distortion

3179: function $\calR(D)$ (in bits). Let $Q_n^*$

3180: denote the optimal reproduction distributions

3181: at distortion level $D\in(0,\Dbar)$,

3182: and define the function

3183: $h(x)=(\log_2 e)g(x)$, $x\in A$,

3184: with $g$ defined as in (\ref{eq:functiong}).

3185: Then:

3186: \begin{itemize}

3187: \item[(a)]

3188: The codes based on almost any realization

3189: of the Shannon random codebooks according

3190: to the measures $\{Q_n^*\}$ have codelengths

3191: $\ell_n(X_1^n)$

3192: satisfying

3193: $$

3194: \ell_n(X_1^n)\leq

3195: n\calR(D)

3196: +\sum_{i=1}^n h(X_i)

3197: +4\log n

3198: \;\;\;\;\mbox{bits, eventually, w.p.1.}$$

3199: \item[(b)]

3200: The codes based on almost any realization

3201: of the universal Shannon random codebooks

3202: have codelengths $\ell_n(X_1^n)$ satisfying

3203: $$

3204: \ell_n(X_1^n)\leq

3205: n\calR(D)

3206: +\sum_{i=1}^n h(X_i)

3207: +(4+|\Ahat|)\log n

3208: \;\;\;\;\mbox{bits, eventually, w.p.1.}$$

3209: \end{itemize}

3210:

3211: \medskip

3212:

3213: We remark that the coefficients of the

3214: $(\log n)$ terms in (a) and (b) above

3215: are not the best possible, and can be

3216: significantly improved; see

3217: \cite{konto-zhang:00} for more details.

3218:

3219: Perhaps somewhat surprisingly,

3220: it turns out that the performance

3221: of the above random codes is

3222: optimal up to terms of order

3223: $(\log n)$ bits.

3224: Recall that a {\em code $C_n$ operating

3225: at distortion level $D\geq 0$} is

3226: defined by a triplet $(B_n,\phi_n,\psi_n)$ where:

3227: \begin{itemize}

3228: \item[$(a)$]

3229: $B_n$ is a subset of $\Ahatn$, called the {\em codebook},

3230: \item[$(b)$]

3231: $\phi_n:A^n\to B_n$ is the {\em encoder},

3232: \item[$(c)$]

3233: $\psi_n:B_n\to \{0,1\}^*$ is a

3234: uniquely decodable map,

3235: \end{itemize}

3236: such that

3237: $$\rho_n(x_1^n,\phi_n(x_1^n))\leq D,

3238: \;\;\;\;\;\;\mbox{for all}\;\;x_1^n\in A^n.$$

3239: The codelengths $\ell_n(X_1^n)$ achieved by

3240: such a code are simply:

3241: $$\ell_n(x_1^n)=\;

3242: \mbox{length of}\;[\psi_n(\phi_n(x_1^n))]

3243: \;\;\;\;\mbox{bits}.$$

3244:

3245: \medskip

3246:

3247: {\em Theorem~21: Pointwise Converse for I.I.D. Sources

3248: \cite{kontoyiannis-red:00}:}

3249: Let $\Xp$ be an $\iid$ source with distribution

3250: $P$ on $A$, and let $\{C_n\}$ be an arbitrary

3251: sequence of codes operating at distortion

3252: level $D\in(0,\Dbar)$, with associated

3253: codelengths $\{\ell_n\}$. Then:

3254: $$

3255: \ell_n(X_1^n)\geq

3256: n\calR(D)

3257: +\sum_{i=1}^n h(X_i)

3258: -\log n

3259: \;\;\;\;\mbox{bits, eventually, w.p.1}$$

3260: where $h(x)$

3261: is defined as in Theorem~20.

3262:

3263: \medskip

3264:

3265: The proof of Theorem~21 in \cite{kontoyiannis-red:00}

3266: uses techniques quite different to those developed in

3267: this paper. In particular, the key step in the proof

3268: is established by an application of

3269: the generalized Kuhn-Tucker conditions of Bell and

3270: Cover \cite{bell-cover:88}.

3271:

3272: Theorems~20 and~21 are next combined to

3273: yield ``second order'' refinements to

3274: Shannon's classical source coding theorem.

3275: For a source $\Xp$ as in Theorem~21 and

3276: a $D\in(0,\Dbar)$, the {\em minimal coding

3277: variance $\sigma^2=\sigma^2(P,D)$ of

3278: source $P$ at distortion level $D$}

3279: is

3280: \be

3281: \sigma^2=\sigma^2(P,D)\bydef\VAR[h(X_1)]

3282: \label{eq:mincv}

3283: \ee

3284: with $h(x)$ as in Theorem~20.

3285:

3286: % \medskip

3287:

3288: \newpage

3289:

3290: {\em Theorem~22: Second Order Source Coding Theorems

3291: \cite{kontoyiannis-red:00}:}

3292: Let $\Xp$ be an $\iid$ source with

3293: distribution $P$ on $A$ and with

3294: rate-distortion function $\calR(D)$

3295: (in bits).

3296: For $D\in(0,\Dbar)$:

3297: \begin{itemize}

3298: \item[]{\bf (CLT)}

3299: There is a sequence of

3300: random variables $G_n=G_n(P,D)$ such that, for any

3301: sequence of codes $\{C_n,\ell_n\}$ operating

3302: at distortion level $D$, we have

3303: \be

3304: \ell_n(X_1^n)-

3305: n\calR(D)\geq \sqrt{n}G_n

3306:         \;\;\;\;\mbox{bits, eventually, w.p.1}

3307: \label{eq:clt}

3308: \ee

3309: and the $G_n$ converge in distribution

3310: to a Gaussian random variable

3311: $$G_n\weakly N(0,\sigma^2)$$

3312: where $\sigma^2=\sigma^2(P,D)$

3313: is the minimal coding variance.

3314: \item[]{\bf (LIL)}

3315: With $\sigma^2$ as above,

3316: for any sequence of codes

3317: $\{C_n,\ell_n\}$ operating

3318: at distortion level $D$:

3319: \ben

3320: \limsup_{n\to\infty}\;

3321: \frac{\ell_n(X_1^n)-n\calR(D)}{\sqrt{2n\log\log n}}

3322: &\geq& \sigma\;\;\;\;\mbox{w.p.1}\\

3323: \liminf_{n\to\infty}\;

3324: \frac{\ell_n(X_1^n)-n\calR(D)}{\sqrt{2n\log\log n}}

3325: &\geq& -\sigma\;\;\;\;\mbox{w.p.1.}

3326: \een

3327: \item[]{\bf (\boldmath$\Rightarrow$)}

3328: Moreover, there exist codes $\{C_n,\ell_n\}$

3329: operating at distortion level $D$, that

3330: asymptotically achieve equality

3331: {\em universally} in all these

3332: lower bounds.

3333: \end{itemize}

3334:

3335: \medskip

3336:

3337: {\em Remark on Assumption (\ref{eq:maximin}):}

3338: When the distortion measure does not satisfy

3339: assumption (\ref{eq:maximin}) [as, for example,

3340: when $\rho(x,y)=(x-y)^2$ with $A=\RL$ and $\Ahat$

3341: a finite subset of $\RL$], we can modify $\rho$

3342: to $\rho'(x,y)=\rho(x,y)-f(x)$, with

3343: $f(x)=\min_{y \in \hat{A}} \, \rho(x,y)$,

3344: so that $\rho'$ satisfies (\ref{eq:maximin}).

3345: Then, to generate codes operating at

3346: distortion level $D$ with respect to $\rho$,

3347: we can construct random codebooks

3348: as before but do the encoding with respect

3349: to $\rho'(x,y)$ at the {\it random}

3350: distortion level $D_n= D - E_{\hat{P}_n}(f(X))$.

3351: It is not hard to check that

3352: \cite[Theorem 2]{dembo-kontoyiannis}

3353: can be extended to apply when $D$ is

3354: replaced by the sequence $\{D_n\}$.

3355: Since $D_n \to D - E_P(f(X))$ as $n\to\infty$,

3356: this results in the first order

3357: approximation

3358: $$-\frac{1}{n}\log Q^*_n(B(X_1^n,D_n))\approx

3359: R_1^{\rho'}(\hat{P}_n,Q^*,D_n).$$

3360: Simple algebra then shows that

3361: $$

3362: R_1^{\rho'}(\hat{P}_n,Q^*,D_n)=R_1^\rho(\hat{P}_n,Q^*,D)

3363: $$

3364: implying that all the results of Section 5.1.1

3365: remain valid [despite the fact that $\rho$

3366: does not satisfy  (\ref{eq:maximin})], with

3367: the function $h(\cdot)$ taken in terms of

3368: the log-moment generating function

3369: $\Lambda_x(\la)$ of the {\it original}

3370: distortion measure $\rho$ (and not that of

3371: the modified $\rho'$).

3372:

3373:

3374: \subsubsection{Critical Behavior}

3375: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

3376: In view of Theorems~20 and~21 above,

3377: the codelengths $\ell_n^*(X_1^n)$ of

3378: the best code operating at distortion

3379: level $D$ have:

3380: $$\ell^*_n(X_1^n)\approx

3381: n\calR(D)

3382: +\sum_{i=1}^n h(X_i)

3383: +O(\log n)

3384: \;\;\;\;\mbox{bits.}$$

3385: This reveals an interesting

3386: dichotomy in the behavior of

3387: the ``pointwise'' redundancy of

3388: the best code:

3389: \begin{itemize}

3390: \item

3391: Either the minimal coding variance

3392: $\sigma^2$ (recall (\ref{eq:mincv}))

3393: is nonzero, in which case the best

3394: rate at which optimality can

3395: be achieved is of order $\sqrt{n}$

3396: bits by the CLT;

3397: \item

3398: or $\sigma^2=0$, and the best redundancy

3399: rate is of order $(\log n)$ bits

3400: (cf.~\cite{zhang-yang-wei:I}).

3401: \end{itemize}

3402: Under certain conditions, in this section

3403: we give a precise characterization of

3404: when each of these two cases can occur.

3405: Before stating it, we briefly discuss

3406: two examples to gain some intuition.

3407:

3408: \medskip

3409:

3410: {\em Example~4: Lossless Compression:}

3411: Lossless data compression

3412: can be considered as an extreme case

3413: of lossy compression, where $\Xp$ is

3414: an $\iid$ source with distribution $P$

3415: on a finite set $A=\Ahat$,

3416: and the distortion level $D$

3417: is set to zero. Here it is

3418: well-known that (ignoring the integer

3419: length constraints) the best code is

3420: given by the idealized Shannon code,

3421: $\ell_n(X_1^n)=-\log_2 P^n(X_1^n)$.

3422: Accordingly, the upper

3423: and lower bounds of Theorems~20

3424: and~21 say that the best code has

3425: codelengths

3426: $$\ell_n(X_1^n) = n\calH(P)

3427: 	+\sum_{i=1}^n h(X_i)$$

3428: where $\calH(P)$ is the entropy of $P$

3429: in bits, and with

3430: $$h(x)\bydef-\log_2 P(x) - \calH(P),

3431: 	\;\;\;\;x\in A.$$

3432: When is $\sigma^2=0$? By its

3433: definition (\ref{eq:mincv}),

3434: $\sigma^2$ is zero if and only if

3435: the function $h(x)$ is constant over $x$,

3436: which, in this case, can only happen if

3437: $P(x)$ is constant over $x\in A$.

3438: Therefore, here:

3439: {\em $\sigma^2=0$ if and only if

3440: the source has a uniform distribution

3441: over $A$.}

3442:

3443: \medskip

3444:

3445: {\em Example~5: Binary Source with Hamming Distortion:}

3446: Consider the simplest

3447: non-trivial lossy example:

3448: Let $\Xp$ be an $\iid$ source

3449: with Bernoulli($p$) distribution

3450: (for some $p\in(0,1/2]$),

3451: let $A=\Ahat=\{0,1\}$,

3452: and take $\rho$ to be Hamming

3453: distortion: $\rho(x,y)=|x-y|$.

3454: For $D\in(0,p)$ it is not

3455: hard to evaluate all the

3456: relevant quantities

3457: explicitly

3458: (see, e.g.,

3459: \cite[Example~2.7.1]{berger:book}

3460: or \cite[Theorem~13.3.1]{cover:book}).

3461: In particular,

3462: the optimal reproduction

3463: distribution $Q^*$ is

3464: Bernoulli($q$),

3465: with $q=(p-D)/(1-2D)$, and

3466: our function of interest is:

3467: $$h(x)=

3468: -\log_2\left(\frac{P(x)}{1-D}\right)

3469:         -E_P\left[

3470:                 -\log_2\left(\frac{P(X_1)}{1-D}\right)

3471:                 \right].$$

3472: Recalling that the minimal coding

3473: variance is zero if and only if

3474: $h(x)$ is constant, from the above

3475: expression we see that, similarly

3476: to the previous example, also

3477: here:

3478: {\em $\sigma^2=0$ if and only if

3479: the source has a uniform distribution}.

3480:

3481: \medskip

3482:

3483: For discrete sources, the next result gives

3484: conditions under which the characterization

3485: suggested by these two examples remains valid.

3486: Suppose $A=\Ahat=\{a_1,a_2,\ldots,a_k\}$

3487: is a finite set, write $\rho_{ij}$ for

3488: $\rho(a_i,a_j)$, and assume

3489: that $\rho$ is symmetric

3490: and that $\rho_{ij}=0$ if and only if

3491: $i=j$. We call $\rho$ a

3492: {\em permutation distortion measure},

3493: if all rows of the matrix

3494: $(\rho_{ij})_{i,j=1,\ldots,k}$

3495: are permutations of one another.

3496:

3497: \medskip

3498:

3499: {\em Theorem~23: Variance Characterization

3500: \cite{dembo-kontoyiannis:crit:01}:}

3501: Let $\Xp$ be a discrete source with

3502: distribution $P$ and rate-distortion

3503: function $R(D)$. Assume that $R(D)$

3504: is strictly convex over $(0,\Dbar)$.

3505: There are exactly two possibilities:

3506: \begin{itemize}

3507: \item[(a)]

3508: Either $\sigma^2=\sigma^2(P,D)$ is zero

3509: for only finitely many $D\in(0,\Dbar).$

3510: \item[(b)]

3511: Or $\sigma^2=\sigma^2(P,D)\equiv 0$

3512: for {\em all} $D\in(0,\Dbar)$, in which

3513: case $P$ is the uniform distribution

3514: on $A$ and $\rho$ is

3515: a permutation distortion measure.

3516: \end{itemize}

3517:

3518: \medskip

3519:

3520: A general discussion of this

3521: problem, including the case of continuous

3522: sources, is given in

3523: \cite{dembo-kontoyiannis:crit:01}.

3524: Also, in the lossless case,

3525: the problem of characterizing

3526: when $\sigma^2=0$ for sources

3527: with memory is dealt with

3528: in \cite{kontoyiannis-97}.

3529:

3530: Before moving on to waiting times and match-lengths

3531: we mention that, in a somewhat similar

3532: vein, the problem of understanding the best

3533: {\em expected}

3534: redundancy rate in lossy data

3535: compression has also been recently considered in

3536: \cite{zhang-yang-wei:I}\cite{yang-zhang:II}%

3537: \cite{yang-zhang:III}\cite{ishii-yamamoto:97}.

3538:

3539:

3540: \subsection{Waiting Times}

3541: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

3542: Next we turn to waiting times.

3543: Recall that, given $D\geq 0$

3544: and two independent realizations

3545: of the stationary ergodic

3546: processes $\Xp$ and $\Yp$,

3547: the waiting time $W_n$ was

3548: defined as the time of the

3549: first appearance of $X_1^n$

3550: in $\Yp$ with distortion $D$

3551: or less (see (\ref{eq:Wn-def})

3552: for the precise definition).

3553: In Theorem~14 we gave conditions

3554: that identified the first order

3555: limiting behavior of $W_n$.

3556: In particular, when $\Yp$ is

3557: $\iid$, it was shown in

3558: Theorem~14~(a)

3559: that

3560: \be

3561: \frac{\log W_n}{n}\to R_1(P,Q,D)

3562: \;\;\;\;\mbox{w.p.1}

3563: \label{eq:w-slln3}

3564: \ee

3565: where $P$ and $Q$ are the first

3566: order marginals of $\Xp$

3567: and $\Yp$, respectively.

3568:

3569: The next result gives conditions

3570: under which the SLLN-type

3571: statement of (\ref{eq:w-slln3})

3572: can be refined to a CLT and

3573: a LIL.

3574:

3575: \medskip

3576:

3577: {\em Theorem~24: CLT and LIL for Waiting Times:}

3578: Let $\Xp$ be a stationary $\alpha$-mixing process

3579: and $\Yp$ be an $\iid$ process, with marginal

3580: distributions $P$ and $Q$, on $A$ and $\Ahat$,

3581: respectively. Assume that the $\alpha$-mixing

3582: coefficients of $\Xp$ satisfy (\ref{eq:LIL-cond})

3583: and that $D_3=E_{P\times Q}[\rho^3(X,Y)]$ is

3584: finite. Then for any $D\in(\Dmin,\Dav)$ the

3585: following series converges

3586: \be

3587: \sigma^2\bydef E_P[g^2(X_1)]+2\sum_{k=2}^\infty E_P[g(X_1)g(X_k)]

3588: \label{eq:variance}

3589: \ee

3590: with $g(x)$ defined as in (\ref{eq:functiong}),

3591: and, moreover:

3592: \begin{itemize}

3593: \item[]{\bf (CLT)} With $R_1=R_1(P,Q,D)$:

3594: $$\frac{\log W_n \;-\; nR_1}{\sqrt{n}}

3595: 	\weakly N(0,\sigma^2).$$

3596: \item[]{\bf (LIL)}

3597: The set of limit points of the sequence

3598: $$\left\{

3599: 	\frac{\log W_n \;-\; nR_1}

3600: 	     {\sqrt{2n\log\log n}}

3601:   \right\},\quad n\geq 3$$

3602: coincides with $[-\sigma,\sigma]$, with

3603: probability one.

3604: \end{itemize}

3605:

3606: \medskip

3607:

3608: {\em Proof Outline:}

3609: For a bounded distortion measure

3610: $\rho$, Theorem~24 was proved in

3611: \cite{dembo-kontoyiannis}.

3612: To obtain the more general statement above

3613: combine the strong approximation

3614: of Theorem~13 with the second order

3615: AEP in Corollary~19 to get:

3616: \be

3617: \log W_n=

3618: nR_1(P,Q,D) + \sum_{i=1}^ng(X_i) + O(\log n)

3619: \;\;\;\;\mbox{w.p.1.}

3620: \label{eq:inter}

3621: \ee

3622: Since $\Xp$ satisfies the mixing

3623: assumption (\ref{eq:LIL-cond}),

3624: so does the process $\{g(X_n)\}$.

3625: Also, since $\la^*<0$, the function

3626: $\LA_x(\la^*)$ is bounded above by zero,

3627: and by Jensen's inequality it is

3628: bounded below by $\la^*E_Q[\rho(x,Y)].$

3629: Therefore,

3630: $$|\LA_x(\la^*)|\leq |\la^*|E_Q[\rho(x,Y)]$$

3631: and this, together with

3632: H\"older's inequality and

3633: the definition of $g(x),$ imply

3634: that $E_P[|g(X_1)|^3]<\infty$.

3635: Therefore we can apply the CLT

3636: of \cite[Theorem~1.7]{peligrad:86}

3637: to the process $\{g(X_n)\}$

3638: in order to deduce the CLT-part

3639: of the theorem from (\ref{eq:inter}).

3640: Similarly, applying the LIL of

3641: \cite{oodaira-yoshihara:71a}

3642: to $\{g(X_n)\}$, from (\ref{eq:inter})

3643: we get the LIL-part of the theorem.

3644: \qed

3645:

3646: \medskip

3647:

3648: {\em Remark 5:} When the variance

3649: $\sigma^2$

3650: in (\ref{eq:variance}) is positive,

3651: then the {\em functional} versions of

3652: the above CLT and LIL given in

3653: \cite{dembo-kontoyiannis} still hold,

3654: under exactly the conditions of Theorem~24.

3655: (This follows by

3656: applying the functional CLT of

3657: \cite[Theorem~1.7]{peligrad:86}

3658: and the functional LIL of

3659: \cite[Theorem~1~(IV)]{oodaira-yoshihara:71b}.)

3660:

3661: \subsection{Match-Lengths and Duality}

3662: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

3663:

3664: Finally we turn to our last application,

3665: match-lengths. Recall that, given a

3666: distortion level $D\geq 0$ and two

3667: independent realizations of the

3668: processes $\Xp$ and $\Yp$, the match-length

3669: $L_m$ is defined as the length $\ell$

3670: of the longest prefix $X_1^\ell$ that

3671: appears (with distortion $D$ or less)

3672: starting somewhere in the ``database''

3673: $Y_1^m.$ See (\ref{eq:Lm-def}) for the

3674: precise definition. As we briefly mentioned

3675: in Section~3.4, there is a duality

3676: relationship between match-lengths

3677: and waiting times: Roughly speaking,

3678: long matches mean short waiting times,

3679: and vice-versa;

3680: see (\ref{eq:duality}).

3681:

3682: Although the relation (\ref{eq:duality})

3683: is not as simple as the duality

3684: (\ref{eq:easy-dual}) for exact matching,

3685: it is still possible to use

3686: (\ref{eq:duality}) to translate

3687: the asymptotic results for $W_n$

3688: to corresponding results for $L_m$.

3689: These are given in Theorem~25 below.

3690: This translation, carried out

3691: in \cite{dembo-kontoyiannis}, is

3692: more delicate than in the case of

3693: exact matching. For example, in

3694: order to prove the CLT for the

3695: match-lengths $L_m$ one

3696: invokes

3697: the functional CLT for

3698: the waiting times (see Remark~5 above

3699: and the proof of Theorem~4 in

3700: \cite{dembo-kontoyiannis}).

3701:

3702: \medskip

3703:

3704: {\em Theorem~25: Match-Lengths Asymptotics:}

3705: Let $\Xp$ be a stationary process

3706: and $\Yp$ be an $\iid$ process, with marginal

3707: distributions $P$ and $Q$, on $A$ and $\Ahat$,

3708: respectively. Assume

3709: that $D_3=E_{P\times Q}[\rho^3(X,Y)]$ is

3710: finite. Then for any $D\in(\Dmin,\Dav)$ we have

3711: $$\mbox{\bf (LLN)}\hspace{1.65in}

3712: \frac{L_m}{\log m}\,\to\,\frac{1}{R_1}\;\;\;\;

3713: \mbox{w.p.1}\hspace{2.4in}$$

3714: where $R_1=R_1(P,Q,D)$.

3715: If, moreover,

3716: % $\Xp$ is $\alpha$-mixing and its

3717: the $\alpha$-mixing coefficients of $\Xp$

3718: satisfy (\ref{eq:LIL-cond}) and

3719: the variance $\sigma^2$ in (\ref{eq:variance})

3720: is nonzero, then, with $\tau^2\bydef \sigma^2R_1^{-3}$,

3721: we have,

3722: \ben

3723: &\mbox{\bf (CLT)}&

3724: 	\hspace{1.6in}

3725: 	\frac{L_m-\frac{\log m}{R_1}}{\sqrt{\log m}}

3726: 	\,\weakly\,N(0,\tau^2)

3727: 	\hspace{2.2in}\\

3728: &\mbox{\bf (LIL)}&

3729: 	\hspace{1.2in}

3730: 	\limsup_{

3731: 	m\to\infty}

3732: 	\,\frac{L_m-\frac{\log m}{

3733: R_1

3734: }}

3735: 	{\sqrt{2\log m\,\log\log\log m}}\,

3736: 	=\,\tau\;\;\;\;

3737: 	\mbox{w.p.1.}

3738: \een

3739:

3740: \medskip

3741:

3742: The results of Theorem~25 were

3743: proved in \cite{dembo-kontoyiannis}

3744: for any bounded distortion measure

3745: $\rho$.  The slightly

3746: more general version stated above

3747: is proved in exactly the same way,

3748: using the results of Section~4

3749: in place of Theorems~2 and~3

3750: of \cite{dembo-kontoyiannis}.

3751:

3752: \section{Random Fields -- First Order Results}

3753: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

3754: This and the following section are devoted

3755: to generalizations of the results of

3756: Sections~2--5 to the case of random fields.

3757: Specifically, the role of the processes $\Xp$

3758: and $\Yp$ will now be played by stationary

3759: ergodic random fields

3760: $\Xp=\{X_u\;;\;u\in\IN^d\}$

3761: and $\Yp=\{Y_u\;;\;u\in\IN^d\}$.

3762: As we will see, many of the problems

3763: that we considered have natural

3764: analogs in this case, and

3765: the overall theme

3766: carries over:

3767: The generalized AEP and its refinement

3768: can be extended to random fields,

3769: and the corresponding questions in

3770: data compression and pattern matching

3771: can be answered following

3772: the same path as before.

3773:

3774: \subsection{Notation and Definitions}

3775: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

3776: The following definitions and notation

3777: will remain in effect throughout Sections~6

3778: and~7.

3779:

3780: We consider two random fields

3781: $\Xp=\{X_u\;;\;u\in\IN^d\}$

3782: and $\Yp=\{Y_u\;;\;u\in\IN^d\}$,

3783: $d\geq 2$, taking values

3784: in $A$ and $\Ahat$,

3785: respectively, and indexed

3786: by points $u=(u_1,u_2,\ldots,u_d)$

3787: on the integer lattice $\IN^d$.

3788: As before, $A$ and $\Ahat$

3789: are complete, separable

3790: metric spaces, equipped with

3791: their Borel

3792: $\sigma$-fields ${\cal A}$

3793: and $\hat{\cal A}$,

3794: respectively.

3795: Let $\BBP$ and $\BBQ$

3796: denote the

3797: (infinite-dimensional)

3798: measures of the entire random

3799: fields $\Xp$ and $\Yp$.

3800: Unless explicitly stated

3801: otherwise, we always assume

3802: that $\Xp$ and $\Yp$ are

3803: independent of each other.

3804:

3805: Throughout the rest of the

3806: paper we will

3807: assume that $\Xp$ and $\Yp$

3808: are stationary and ergodic.

3809: To be precise, by that we mean

3810: that the Abelian group of

3811: translations

3812: $\{T_u\,:\,u\in\IN^d\}$

3813: acts on both

3814: $(A^{\IN^d},{\cal A}^{\IN^d},\BBP)$

3815: and

3816: $(\hat{A}^{\IN^d},\hat{\cal A}^{\IN^d},\BBQ)$

3817: in a measure-preserving,

3818: ergodic manner; see \cite{krengel:book}

3819: for a detailed exposition.

3820:

3821: For $v,w\in\IN^d$,

3822: the distance between $v$ and $w$

3823: is defined by

3824: $$d(v,w)\bydef\max_{1\leq i\leq d}|v_i-w_i|$$

3825: and the distance between two subsets

3826: $V,W\subset\IN^d$ is

3827: $$d(V,W)\bydef\inf_{v\in V,\;w\in W} d(v,w).$$

3828: Given $v,w\in\IN^d$, we let

3829: $[v,w]=\{u\in\IN^d\;:\;

3830: \mbox{$v_j\le u_j\leq w_j$ for all $j$}\}$,

3831: where $[v,w]$ is empty in case $v_j>w_j$ for some $j$.

3832:

3833: We write $C(n)$ for the

3834: $d$-dimensional cube of side $n\geq1$,

3835: \ben

3836: C(n)=

3837: 	\{u\in\IN^d\;:\;\mbox{$1\leq u_j\leq n$ for all $j$}\}

3838: \een

3839: and $[0,\infty)$ for the ``infinite cube''

3840: % and for $v\in\IN^d$,

3841: % $$X_v^\infty\bydef\{X_u\;:\;

3842: % \mbox{$u_j\geq v_j$ for all $j$}\}.$$

3843: % and by $v+U$ we denote the translate

3844: % $$\{v+u\in\IN^d\;:\;u\in U\}.$$

3845: % and similarly

3846: % $[v,\infty)=\{u\in\IN^d\;:\;

3847: % \mbox{$u_j\geq v_j$ for all $j$}\}$.

3848: % and denote the

3849: % and $C(\infty)$ denotes the ``infinite cube''

3850: $$[0,\infty)=\{u\in\IN^d\;:\;

3851: 	\mbox{$u_j\geq 0$ for all $j$}\}.$$

3852: For an arbitrary subset

3853: $U\subset\IN^d$ we let

3854: $|U|$ denote its size;

3855: for example, $|C(n)|=n^d$.

3856: Also for $U\subset\IN^d$ we write

3857: $$X_U\bydef\{X_u\;;\;u\in U\}$$

3858: so that, in particular,

3859: $X_{[0,\infty)} = \{X_u\;;\;

3860: 	\mbox{$u_j\geq 0$ for all $j$}\}.$

3861: For

3862: $V\subset\IN^d$ and $u\in\IN^d$ we

3863: let $u+V$ denote the translate

3864: $$u+V=\{u+v\;:\;v\in V\}.$$

3865:

3866: For each $n\geq 1$, let $P_n$ denote the

3867: marginal distribution of $X_{C(n)}$

3868: on $A^{n^d}$, and similarly write

3869: $Q_n$ for the distribution of $Y_{C(n)}$.

3870: Let $\rho:A\times\Ahat\to[0,\infty)$

3871: be an arbitrary nonnegative (measurable)

3872: function, and define a sequence of

3873: single-letter distortion measures

3874: $\rho_n:A^{n^d}\times\Ahatnd\to[0,\infty)$,

3875: $n\geq 1$, by

3876: \ben

3877: \rho_n(x_{C(n)},y_{C(n)})\bydef\frac{1}{n^d}

3878: 	\sum_{u\in C(n)}\rho(x_u,y_u)

3879: \;\;\;\;x_{C(n)}\in A^{n^d},\;y_{C(n)}\in\Ahatnd.

3880: \een

3881: Given $D\geq 0$ and $x_{C(n)}\in A^{n^d}$,

3882: we write

3883: $B(x_{C(n)},D)$ for

3884: the distortion-ball of radius $D$:

3885: $$B(x_{C(n)},D)=

3886: \left\{

3887: y_{C(n)}\in\Ahatnd\;:\;\rho_n(x_{C(n)},y_{C(n)})\leq D

3888: \right\}.$$

3889:

3890: \subsection{Generalized AEP}

3891: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

3892: It is well-known that the classical AEP

3893: \ben

3894: -\frac{1}{n}\log P_n(X_1^n) \to H(\BBP)

3895: \;\;\;\;\mbox{w.p.1}

3896: \een

3897: generalizes to the case of finite-alphabet

3898: random fields on $\IN^d$, as well

3899: as to other amenable group actions

3900: \cite{ornstein-weiss:83}. In this

3901: section we extend the two versions of the

3902: generalized AEP of Theorems~1 and~4

3903: to the case of random fields on $\IN^d$.

3904:

3905: \paragraph{$\Yp$ is i.i.d. }

3906: In the notation of Section~6.1,

3907: we take $\Xp$ to be a stationary

3908: ergodic random field with first

3909: order marginal $P_1=P,$ and

3910: $\Yp$ to be i.i.d. with first

3911: order marginal $Q_1=Q$.

3912: We define $\Dmin$ and $\Dav$

3913: as in the one-dimensional

3914: case (recall equations (\ref{eq:Dmin})

3915: and (\ref{eq:Dav})), and assume

3916: that $\rho(x,y)$ is not essentially

3917: constant for ($\BBP$-almost)

3918: all $x\in A$, that is, $\Dmin < \Dav.$

3919:

3920: A simple examination of the proof of

3921: Theorem~1 shows that it

3922: extends {\sl verbatim} to the

3923: case of random fields, with the

3924: only difference that instead of the

3925: usual ergodic theorem we now need

3926: to invoke the ergodic theorem

3927: for $\IN^d$ actions; see

3928: \cite[Chapter~6]{krengel:book}.

3929: We thus obtain:

3930:

3931: \medskip

3932:

3933: {\em Theorem~26. Generalized AEP when $\Yp$ is $\iid$:}

3934: Let $\Xp$ be a stationary ergodic random field on

3935: $\IN^d$ and $\Yp$ be $\iid$, with marginal distributions

3936: $P$ and $Q$ on $A$ and $\Ahat$, respectively.

3937: Assume that $\Dav=E_{P\times Q}[\rho(X,Y)]$ is

3938: finite. Then for any $D\in(\Dmin,\Dav)$

3939: \ben

3940: -\frac{1}{n^d}\log Q^{n^d}(B(X_{C(n)},D)) \to R_1(P,Q,D)

3941:         \;\;\;\;\mbox{w.p.1}

3942: \een

3943: with the (one-dimensional)

3944: rate-function $R_1(P,Q,D)$

3945: defined as in Theorem~1.

3946:

3947: %Y ----------------- rephrased the discussion-results below

3948:

3949: \paragraph{$\Yp$ is not i.i.d. }

3950: Let $\Xp$ and $\Yp$ be stationary random fields and

3951: define $\Dav$ and $\Dmax$ exactly as in the

3952: one-dimensional case (recall

3953: (\ref{eq:Dav}) and (\ref{eq:Dmax})).

3954: We assume that the distortion

3955: measure $\rho$ is essentially

3956: bounded, $\Dmax < \infty$,

3957: and define

3958: \be

3959: \Dmin \bydef

3960: \sup_{n \geq 1} \Dminn = \lim_{n\to\infty} \Dminn

3961: \label{eq:dmind}

3962: \ee

3963: where

3964: \be

3965: \label{eq:dminn}

3966: \Dminn \bydef

3967: E_{P_n}[\essinf_{Y_{C(n)}\sim Q_n} \;\rho_n(X_{C(n)},Y_{C(n)})].

3968: \ee

3969: To see that the limit in (\ref{eq:dmind})

3970: exists and equals the supremum, first

3971: note that $\{n^d \Dminn\}$ is an

3972: increasing sequence, and that

3973: $D_{\rm min}^{(nk)}\geq D_{\rm min}^{(k)}$

3974: for all $n,k\geq 1$.

3975: Now fix $k\geq 1$ arbitrary. Given $n\geq k$

3976: we write $n = mk + r$ for some $0\leq r\leq k-1$,

3977: so that

3978: $$ n^dD_{\rm min}^{(n)}\geq (mk)^dD_{\rm min}^{(mk)}

3979: \geq (mk)^dD_{\rm min}^{(k)}.$$

3980: Since $n/mk\to 1$ as $n\to\infty$, this implies that

3981: $$\liminf_{n\to\infty} D_{\rm min}^{(n)} \geq D_{\rm min}^{(k)}.$$

3982: Since $k$ was arbitrary we are done.

3983:

3984: Finally, we assume once again that

3985: the distortion measure $\rho$ is

3986: not essentially constant,

3987: that is, $\Dmin<\Dav$.

3988: Our next result is the

3989: random fields analog of Theorem~4;

3990: it is proved in Appendix~C.

3991:

3992: \medskip

3993:

3994: {\em Theorem~27. Generalized AEP rate function.}

3995: Let $\Xp$ and $\Yp$ be stationary random fields.

3996: Assume that $\rho$ is bounded, and that with

3997: $\BBP$-probability one, conditional on

3998: $X_{[0,\infty)}=

3999: x_{[0,\infty)}$, the random variables

4000: $\{\rho_n(x_{C(n)},Y_{C(n)} )\}$ satisfy a

4001: large deviations principle with some

4002: deterministic, convex rate-function.

4003: Then for all $D\in (\Dmin,\Dav)$,

4004: except possibly at $D = \Dinf$,

4005: \be

4006: \label{eq:ldp-27}

4007: \lim_{n \to \infty}

4008: -\frac{1}{n^d}\log Q_n(B(X_{C(n)},D)) = R(\BBP,\BBQ,D)

4009:         \;\;\;\;\mbox{w.p.1}

4010: \ee

4011: where $\Dinf$

4012: and the rate-function $R(\BBP,\BBQ,D)$

4013: are defined as in the one-dimensional case,

4014: by (\ref{eq:dinf}) and (\ref{eq:thm4b}),

4015: respectively, and the rate-functions

4016: $R_n(P_n,Q_n,D)$ are now defined as

4017: \ben

4018: R_n(P_n,Q_n,D) = \inf_{V_n} \frac{1}{n^d} H(V_n\|P_n\times Q_n)

4019: \een

4020: with the infimum taken over all joint distributions

4021: $V_n$ on $A^{n^d}\times\Ahatnd$ such that

4022: the $A^{n^d}$-marginal of $V_n$ is $P_n$

4023: and $E_{V_n}[\rho_n(X_{C(n)},Y_{C(n)})]\leq D$.

4024:

4025: \medskip

4026:

4027: {\em Remark 6:} Suppose that $(\Xp,\Yp)$

4028: is a stationary random field satisfying

4029: a ``process-level LDP'' with a convex, good

4030: rate-function. To be precise,

4031: given $x_{C(n)}\in A^{n^d}$,

4032: write $x^{(n)}$ for the periodic

4033: extension of $x_{C(n)}$ to an

4034: infinite realization in $A^{[0,\infty)}$

4035: and let $X^{(n)}$ and $Y^{(n)}$ denote

4036: the periodic extensions of $X_{C(n)}$

4037: and $Y_{C(n)}$, respectively.

4038: The process-level empirical

4039: measure $\calLn$  induced

4040: by $\Xp$ and $\Yp$ on

4041: $(A^{[0,\infty)}\times\hat{A}^{[0,\infty)})$ is

4042: defined by

4043: $$\calLn\bydef\frac{1}{n^d}\sum_{u\in C(n)}

4044: 	\delta_{(X^{(n)}_{u+[0,\infty)},Y^{(n)}_{u+[0,\infty)})}$$

4045: where $\delta_{(s,s')}$ denotes the measure

4046: assigning unit mass to the joint realization

4047: $(s,s')\in A^{[0,\infty)}\times\hat{A}^{[0,\infty)}$,

4048: and $X^{(n)}_{u+[0,\infty)}$

4049: (or $Y^{(n)}_{u+[0,\infty)}$)

4050: denotes $X^{(n)}$

4051: (respectively, $Y^{(n)}$)

4052: shifted by $u$ [i.e., the

4053: value of $X^{(n)}_{u+[0,\infty)}$

4054: at position $v$ is the same as

4055: the value of $X^{(n)}$ at position

4056: $u+v$; similarly for $Y^{(n)}_{u+[0,\infty)}$.]

4057: By assuming that $(\Xp,\Yp)$

4058: satisfy a ``process-level LDP''

4059: we mean that the sequence of measures

4060: $\{\calLn\}$ satisfies the LDP in

4061: the space of stationary

4062: probability measures on

4063: $(A^{[0,\infty)}\times\hat{A}^{[0,\infty)})$

4064: equipped with the topology of weak convergence,

4065: with some convex, good rate-function $I(\cdot)$.

4066: These assumptions are satisfied by many of

4067: the random field models used in applications,

4068: and in particular by a large class of Gibbs fields

4069: (see, e.g.,

4070: \cite{comets:86}\cite{folmer-orey}\cite{olla:88}

4071: for general theory and

4072: \cite{guyon:book}\cite{winkler:book} for

4073: examples in the areas of

4074: image processing and image analysis).

4075:

4076: As in the one-dimensional case, suppose

4077: that the process-level LDP condition

4078: holds, and that the

4079: distortion measure $\rho$ is

4080: bounded and continuous on $A\times\Ahat$.

4081: Then with $\BBP$-probability one,

4082: conditional on

4083: $X_{[0,\infty)}=x_{[0,\infty)}$,

4084: the sequence

4085: $\{\rho_n(x_{C(n)},Y_{C(n)})\}$ satisfies

4086: the LDP upper bound with respect to the

4087: deterministic, convex rate-function $J(\cdot)$

4088: as in Remark~3.

4089: Moreover, assuming sufficiently strong mixing

4090: properties for $\Yp$ one may also verify the

4091: corresponding lower bound (for example, by

4092: adapting the stochastic subadditivity approach of

4093: \cite{chi-AP:01}).

4094:

4095: %Y ------------------------- end of changes

4096:

4097: \subsection{Applications}

4098: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

4099: In Sections~6.3.1 and~6.3.2 below we

4100: consider the random field analogs of

4101: the problems discussed in Section~3

4102: in the context of one-dimensional

4103: processes.

4104: In the instances when our analysis

4105: was restricted to $\iid$ processes,

4106: the extension to random fields is

4107: trivial -- an $\iid$ random field

4108: is no different from an $\iid$ process.

4109: For that reason, we only give the

4110: full statements of corresponding

4111: random fields results when the

4112: generalization from $d=1$ to

4113: $d\geq 2$ does involve some

4114: modifications. Otherwise, only

4115: a brief description of the corresponding

4116: results is mentioned.

4117:

4118: \subsubsection{Lossy Data Compression}

4119: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

4120: Here we very briefly discuss the

4121: problem of data compression, when

4122: the data is in the form of a two-

4123: or more generally a $d$-dimensional

4124: array.

4125: In this case, the underlying data

4126: source is naturally modeled as

4127: a $d$-dimensional random field.

4128: Extensive discussions of the

4129: general information-theoretic

4130: problems on random fields are

4131: given

4132: % by Berger, Shen and Ye

4133: in \cite{berger-shen-ye:92}

4134: and

4135: % by Ye and Berger's

4136: the recent monograph \cite{ye-berger:book};

4137: see also

4138: \cite{follmer:73}.

4139:

4140: First we discuss the results

4141: given in Section~3.1. The construction

4142: of the random codebooks described there

4143: generalizes to random fields in an

4144: obvious fashion, and the statement

4145: as well as the proof of Theorem~9

4146: remain unchanged. Following the

4147: notation exactly as developed for

4148: $\iid$ sources, the strengthened

4149: coding theorems given in

4150: Theorems~10 and~11 follow by

4151: combining (the obvious

4152: generalization of) Theorem~9

4153: with the generalized

4154: AEP of Theorem~26.

4155:

4156: Similarly, the mismatched-codebook

4157: results of Section~3.2 only rely on

4158: Theorem~9 and the generalized AEP

4159: of Theorem~1, and therefore

4160: immediately generalize to the

4161: random field case. Finally

4162: Theorems~15 and~16 in Section~3.5

4163: are only stated for $\iid$ processes,

4164: hence, as mentioned above, they

4165: trivially extend to random

4166: fields.

4167:

4168: \subsubsection{Waiting Times}

4169: Here we consider the natural

4170: $d$-dimensional analogs of the

4171: waiting times questions considered

4172: in Section~3.3. Given two

4173: independent realizations

4174: of the random fields $\Xp$ and $\Yp$,

4175: our main quantity of interest here

4176: is how ``far'' we have to look

4177: in $\Yp$ until we find a match for

4178: the pattern $X_{C(n)}$ with distortion

4179: $D$ or less. Given $n\geq 1$ and

4180: a distortion level $D\geq 0$, we

4181: define the {\em waiting time} $W_n$

4182: as the smallest length $i$

4183: such that a copy of the pattern

4184: $X_{C(n)}$ appears somewhere in

4185: $Y_{C(i+n-1)}$, with distortion

4186: $D$ or less.

4187: Formally,

4188: \ben

4189: W_n\;=\;\inf\{i\geq 1\; :\;

4190: 	\rho_n(X_{C(n)},Y_{u+C(n)})\leq D

4191: 	\;\;\mbox{for some}\;u\in[0,i-1]^d\}

4192: % \label{eq:Wnd-def}

4193: \een

4194: with the convention that the infimum

4195: of the empty set equals $+\infty$.

4196:

4197: In the one-dimensional case our

4198: main tool in investigating

4199: the asymptotic behavior of the

4200: waiting times was the strong

4201: approximation in Theorem~13.

4202: Roughly speaking, Theorem~13

4203: stated that the waiting time

4204: $W_n$ for a $D$-close match

4205: of $X_1^n$ in $\Yp$ is

4206: inversely proportional

4207: to the probability $Q_n(B(X_1^n,D))$

4208: of such a match.

4209: In Theorem~28 below we generalize

4210: this result to the $d$-dimensional

4211: case by showing that the $d$-dimensional

4212: volume $(W_n)^d$ we have to search

4213: in $\Yp$ in order to find a $D$-close

4214: match for $X_{C(n)}$ is, roughly,

4215: inversely proportional

4216: to the probability $Q_n(B(X_{C(n)},D))$

4217: of finding such a match.

4218:

4219: Before stating

4220: Theorem~28 we need to recall

4221: the following definition.

4222: Dobrushin's {\em non-uniform

4223: $\phi$-mixing coefficients}

4224: of a stationary random field

4225: $\Yp$ are

4226: \ben

4227: \phi_\ell(k)\;=\;\sup\{|\BBQ(B|A)-\BBQ(B)|\;:

4228: & & \hspace{-0.2in}

4229: 	B\in\sigma(Y_{U}),\; A\in\sigma(Y_{V}),\; \BBQ(A)>0\\

4230: & & \quad\quad |U|\leq \ell,\; |V|<\infty,\; d(U,V)\geq k \}

4231: \een

4232: where $\sigma(Y_U)$ denotes

4233: the $\sigma$-field generated by

4234: the random variables $Y_U$, $U\subset\IN^d$.

4235: See \cite[Chapter~6]{lin-lu:book}

4236: or \cite{doukhan:book} for detailed

4237: discussions of the coefficients

4238: $\{\phi_\ell(k)\}$ and their properties.

4239:

4240: \medskip

4241:

4242: {\em Theorem~28. Strong Approximation:}

4243: Let $\Xp$ and $\Yp$ be stationary ergodic

4244: random fields, and assume that the non-uniform

4245: $\phi$-mixing  coefficients of $\Yp$ satisfy

4246: \be

4247: \limsup_{n\to\infty}\sum_{j=1}^\infty

4248: (j+1)^{d-1}\phi_n(jn)<\infty.

4249: \label{eq:dobrushin}

4250: \ee

4251: If $Q_n(B(X_{C(n)},D))>0$ eventually with

4252: probability one,

4253: then for any $\epsilon>0$:

4254: \ben

4255: -(1+\epsilon)\log n

4256: \;\leq\;

4257: \log [W^d_n Q_n(B(X_{C(n)},D))]

4258: \;\leq\;

4259: (d+1+\epsilon)\log n

4260: \;\;\;\;\mbox{eventually, w.p.1.}

4261: \een

4262:

4263: \medskip

4264:

4265: The proof of Theorem~28 is a

4266: straightforward modification of

4267: the corresponding one-dimensional argument in

4268: \cite{dembo-kontoyiannis}; it is given in

4269: Appendix~D.

4270:

4271: \medskip

4272:

4273: {\em Remark 7:} The mixing condition

4274: (\ref{eq:dobrushin}) is satisfied by

4275: a rather large class of

4276: stationary random fields. For

4277: example in the case of Markov

4278: random fields, it is easy to check

4279: that under Dobrushin's uniqueness

4280: condition the limsup in

4281: (\ref{eq:dobrushin}) is finite;

4282: see \cite[Section~8.2]{georgii:1}

4283: or \cite{doukhan:book} for

4284: more details.

4285:

4286: \medskip

4287:

4288: Next we combine the above strong

4289: approximation result with the

4290: generalized AEPs of Theorems~26

4291: and~27, to read off the first order

4292: asymptotic behavior of the

4293: waiting times. Theorem~29

4294: below generalizes Theorem~14

4295: to the random field case.

4296:

4297: \medskip

4298:

4299: {\em Theorem~29. SLLN for Waiting Times:}

4300: Let $\Xp$ and $\Yp$ be stationary ergodic

4301: random fields:

4302:

4303: (a)~If $\Yp$ is $\iid$ and the

4304: average distortion $\Dav$ is finite,

4305: then for any $D\in(\Dmin,\Dav)$

4306: \ben

4307: \frac{1}{n^d}\log W_n^d \to R_1(P_1,Q_1,D)

4308: \;\;\;\;\mbox{w.p.1.}

4309: % \label{eq:w-slln-d}

4310: \een

4311:

4312: (b)~Suppose that the

4313: % distortion measure

4314: % $\rho$ is bounded, that the

4315: conditions of Theorem~27 are satisfied,

4316: and that $\Yp$ also satisfies

4317: the mixing assumption (\ref{eq:dobrushin}).

4318: Then, for any $D\in(\Dinf,\Dav)$:

4319: \ben

4320: \frac{1}{n^d}\log W^d_n \to R(\BBP,\BBQ,D)

4321: \;\;\;\;\mbox{w.p.1.}

4322: \een

4323:

4324: \medskip

4325:

4326: %A modified.

4327: %Y removed!

4328: %Y we don't really know (and it's definitely not "not hard to verify")

4329: %Y that the assumptions of (b) -- including the conditional LDP -- are

4330: %Y satisfied by a class of Gibbs/MRFs.

4331: % Note that, although the assumptions

4332: % of part~(b) of the theorem appear to

4333: % be rather heavy, they are not hard to

4334: % verify for a class of Gibbs

4335: % or Markov random fields;

4336: % see the comments in

4337: % Remarks~6 and~7 above.

4338:

4339:

4340: \newpage

4341:

4342: \section{Random Fields -- Second Order Results}

4343: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

4344: Finally we turn to the random field extensions

4345: of the second order results of Sections~4 and~5.

4346: In Section~7.1 we state the random field analog

4347: of the second order generalized AEP, and in~7.2

4348: we discuss its application to the problems

4349: of lossy data compression and pattern matching.

4350:

4351: \subsection{Refinements of Generalized AEP}

4352: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

4353: Let $\Xp$ be a stationary ergodic random

4354: field with marginal distribution $P$ on $A$,

4355: and let $Q$ be a fixed probability measure

4356: on $\Ahat$. We will assume throughout that

4357: the distortion measure $\rho$ has a finite

4358: third moment,

4359: \be

4360: D_3\bydef

4361: E_{P\times Q}[\rho^3(X,Y)]<\infty

4362: \label{eq:third-d}

4363: \ee

4364: and that it is not essentially

4365: constant, i.e., $\Dmin<\Dav$,

4366: with $\Dmin$ and $\Dav$ defined

4367: as before (cf. (\ref{eq:Dmin})

4368: and (\ref{eq:Dav})).

4369:

4370: The goal of this section is

4371: to give the random field analogs of

4372: Theorems~17 and~18 and of Corollary~19

4373: from the one-dimensional case.

4374:

4375: An examination of the proof of Theorem~17 in

4376: \cite{yang-zhang:99} shows that it

4377: only depends on the ergodicity of $\Xp$

4378: and the $\iid$ structure of the product

4379: measures $Q^n$. Simply replacing the

4380: application of the ergodic theorem

4381: by the ergodic theorem

4382: for $\IN^d$ actions

4383: \cite[Chapter~6]{krengel:book}

4384: immediately yields the following

4385: generalization: As long as condition

4386: (\ref{eq:third-d}) is satisfied,

4387: for all $D\in(\Dmin,\Dav)$ we have

4388: \be

4389: -\log Q^{n^d}(B(X_{C(n)},D))= n^dR_1(\hat{P}_n,Q,D)+\frac{d}{2}\log n + O(1)

4390: \;\;\;\;\mbox{w.p.1}

4391: \label{eq:br-d}

4392: \ee

4393: where $\hat{P}_n$ is now the empirical measure

4394: induced by $X_{C(n)}$ on $A$.

4395:

4396: In order to generalize Theorem~18 to $\IN^d$

4397: we need to introduce a measure of dependence

4398: analogous to $\alpha$-mixing in the

4399: one-dimensional case. For a stationary

4400: random field $\Xp$ on $\IN^d$ we define

4401: the {\em uniform $\alpha$-mixing coefficients}

4402: of $\Xp$ by

4403: \ben

4404: \alpha(k)\;=\;\sup\{|\BBP(A\cap B)-\BBP(A)\BBP(B)|\;:

4405: & & \hspace{-0.2in}

4406:         A\in\sigma(X_{U}),\; B\in\sigma(X_{V}),\; d(U,V)\geq k \}

4407: \een

4408: where, as before, $\sigma(X_U)$ denotes

4409: the $\sigma$-field generated by

4410: the random variables $X_U$.

4411: See \cite{lin-lu:book,doukhan:book}

4412: for more details.

4413:

4414: Apart from ergodicity, the main technical

4415: ingredient in the proof of Theorem~18 above

4416: (see also the proof of

4417: \cite[Theorem~3]{dembo-kontoyiannis})

4418: is the LIL for $\Xp$.

4419: Similarly to the one-dimensional case,

4420: the LIL for a random field $\Xp$

4421: holds as soon as the following

4422: mixing condition is satisfied

4423: \be

4424: \alpha(k)\leq C\ k^{

4425: -

4426: 3d(1+\epsilon)},

4427: \quad\mbox{for some $\epsilon>0$ and $C<\infty.$}

4428: \label{eq:LIL-cond2}

4429: \ee

4430: [This follows from the

4431: almost sure invariance principle

4432: in \cite[Theorem~1]{berkes-morrow}.]

4433:

4434: Assuming that (\ref{eq:LIL-cond2})

4435: and the third moment condition

4436: (\ref{eq:third-d}) both hold,

4437: we get the following generalization

4438: of Theorem~18. For all $D\in(\Dmin,\Dav)$,

4439: \be

4440: n^dR_1(\hat{P}_n,Q,D) = n^dR_1(P,Q,D) + \sum_{u\in C(n)}

4441:         g(X_u) + O(\log\log n)

4442: \;\;\;\;\mbox{w.p.1}

4443: \label{eq:taylor-d}

4444: \ee

4445: with $g(x)$ defined exactly as in the

4446: one-dimensional case (\ref{eq:functiong}).

4447:

4448: Combining (\ref{eq:br-d}) and (\ref{eq:taylor-d})

4449: gives the following generalization of Corollary~19:

4450:

4451: \medskip

4452:

4453: {\em Theorem~30: Second Order Generalized AEP:}

4454: Let $\Xp$ be a stationary ergodic random field

4455: with marginal distribution $P$ on $A$, and let

4456: $Q$ be an arbitrary probability measure on $\Ahat.$

4457: Assume that the uniform $\alpha$-mixing coefficients

4458: of $\Xp$ satisfy

4459: (\ref{eq:LIL-cond2})

4460: and that $D_3=E_{P\times Q}[\rho^3(X,Y)]$ is

4461: finite. Then for any $D\in(\Dmin,\Dav)$, and

4462: with $g(x)$ defined as in (\ref{eq:functiong}),

4463: $$

4464: -\log Q^{n^d}(B(X_{C(n)},D))= n^dR_1(P,Q,D) + \sum_{u\in C(n)}g(X_u)

4465:         + \frac{d}{2}\log n + O(\log\log n)

4466: \;\;\;\;\mbox{w.p.1.}$$

4467:

4468: \subsection{Applications}

4469: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

4470: Next we discuss applications of the second order

4471: generalized AEP to the

4472: $d$-dimensional analogs of the

4473: data compression and pattern

4474: matching problems of Section~4.

4475: As in Section~6.3, the only

4476: results stated explicitly are

4477: those whose extensions to $\IN^d$

4478: require modifications.

4479:

4480: As mentioned in Section~6.3.1,

4481: the one-dimensional construction

4482: of the random codes,

4483: as well as the main tool used

4484: in their analysis, Theorem~9,

4485: immediately generalize to the

4486: random field case. And since

4487: all the second order results

4488: of Section~5.1 (Theorems~20--23)

4489: are stated for $\iid$ sources,

4490: their statements as well as

4491: proofs carry over {\sl verbatim}

4492: to this case.

4493:

4494: For the problem of waiting times,

4495: we can use the second order generalized

4496: AEP of Theorem~30 to refine the SLLN

4497: of Theorem~29

4498: \ben

4499: \frac{1}{n^d}\log W_n^d \to R_1(P,Q,D)

4500: \;\;\;\;\mbox{w.p.1}

4501: \een

4502: to a corresponding CLT and LIL

4503: as in the one-dimensional case.

4504: These refinements are stated in

4505: Theorem~31 below. Its proof is

4506: identical to that of Theorem~24

4507: in the one-dimensional case. The

4508: only difference here is that we

4509: need to invoke the CLT and LIL

4510: for the partial sums of the random

4511: field $\{g(X_u)\;;\;u\in\IN^d\}$.

4512: Under the conditions of the

4513: theorem, these follow from the

4514: almost sure invariance principle

4515: of \cite[Theorem~1]{berkes-morrow}.

4516:

4517: \medskip

4518:

4519: {\em Theorem~31:}

4520: Let $\Xp$ be a stationary ergodic random

4521: field and $\Yp$ be $\iid$, with marginal

4522: distributions $P$ and $Q$ on $A$ and $\Ahat$,

4523: respectively. Assume that the

4524: uniform $\alpha$-mixing

4525: coefficients of $\Xp$ satisfy (\ref{eq:LIL-cond2})

4526: and that $D_3=E_{P\times Q}[\rho^3(X,Y)]$ is

4527: finite. Then for any $D\in(\Dmin,\Dav)$ the

4528: following series is absolutely convergent

4529: \be

4530: \sigma^2\bydef

4531: % E_P[g^2(X_{\orig})]+2

4532: \sum_{u\in\IN^d} E_P[g(X_{\orig})g(X_u)]

4533: \label{eq:variance-d}

4534: \ee

4535: with $g(x)$ defined as in (\ref{eq:functiong}),

4536: and, moreover:

4537: \begin{itemize}

4538: \item[]{\bf (CLT)} With $R_1=R_1(P,Q,D)$:

4539: $$\frac{\log W^d_n \;-\; n^dR_1}{n^{d/2}}

4540:         \weakly N(0,\sigma^2).$$

4541: \item[]{\bf (LIL)}

4542: The set of limit points of the sequence

4543: $$\left\{

4544:         \frac{\log W^d_n \;-\; n^dR_1}

4545:              {\sqrt{2n^d\log\log n}}

4546:   \right\},\quad n\geq 3$$

4547: coincides with $[-\sigma,\sigma]$, with

4548: probability one.

4549: \end{itemize}

4550:

4551:

4552: % \medskip

4553: %

4554: % \subsubsection{Match-Lengths and Lossy LZ on Random Fields}

4555: % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

4556: %

4557: % -- Possibly the analog of Theorem 25

4558: %

4559: % -- Possibly a generalization of Steinberg \& Gutman's result

4560:

4561:

4562:

4563: % \newpage

4564: \section*{Acknowledgments}

4565: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

4566: We thank Tam\'{a}s Linder and Yuval Peres

4567: for useful discussions regarding Theorems 7 and 8.

4568:

4569: \appendix

4570: \section{Proof of Theorem~7}

4571: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

4572: We prove the upper and lower bounds separately.

4573: For the upper bound,

4574: recalling the definition

4575: of $r_n(X_1^n)$ in

4576: (\ref{eq:ratio})

4577: we observe that

4578: $$r_n(X_1^n,D)

4579: % \frac{1}{n}\log

4580: % \frac{P_n(B(X_1^n,D))}

4581: % {Q^n(B(X_1^n,D))}

4582: \leq \frac{1}{n}\log {P_n(B(X_1^n,D))} -\frac{1}{n}\log Q^n(X_1^n)$$

4583: where the second term converges to $H(P) + H(P\|Q)$

4584: as $n\to \infty$, by the ergodic theorem.

4585: Since the first term is increasing in $D$,

4586: for any fixed $D>0$ we have with

4587: $\BBP$-probability one:

4588: \be

4589: \limsupnd

4590: % \frac{1}{n}\log

4591: % \frac{P_n(B(X_1^n,D))}

4592: % {Q^n(B(X_1^n,D))}

4593: r_n(X_1^n,D)

4594: 	\leq

4595: 	H(P) + H(P\|Q) +

4596: 	\limsup_{n\to\infty}

4597: 	\frac{1}{n}\log P_n(B(X_1^n,D)).

4598: \label{eq:discUB1}

4599: \ee

4600: Now the pointwise source coding

4601: theorem (see \cite[Theorems~1 and~5]{konto-zhang:00})

4602: implies that

4603: \be

4604: \liminf_{n\to\infty}-\frac{1}{n}\log P_n(B(X_1^n,D))\geq R(D)

4605: \;\;\;\;\mbox{w.p.1}

4606: \label{eq:discUB2}

4607: \ee

4608: where $R(D)$ is the rate-distortion

4609: function of the source $\Xp$

4610: (in nats).

4611: % To see this, note

4612: % that that in the proof of

4613: % Theorem~6~$(ii)$ in

4614: % \cite{kontoyiannis-red:00}

4615: % it was shown that

4616: % $$\limsup_{n\to\infty}

4617: % 	\frac{1}{n}\log\frac{P_n(B(X_1^n,D))}

4618: % 	{\widetilde{Q}_n(B(X_1^n,D))}

4619: % 	\leq 0,

4620: % 	\;\;\;\;\mbox{w.p.1}

4621: % $$

4622: % (in fact this is shown to hold

4623: % for any sequence of probability

4624: % measures $\{Q'_n\}$ in place of

4625: % $\{P_n\}$),

4626: % where each measure $\widetilde{Q}_n$

4627: % minimizes $E_{P_n}[-\log Q_n(B(X_1^n,D))]$

4628: % over all probability measures $Q_n$.

4629: % But from Theorem~5 we also know that

4630: % $$\lim_{n\to\infty}

4631: % -\frac{1}{n}\log\widetilde{Q}_n(B(X_1^n,D))

4632: % = R(D)

4633: % \;\;\;\;\mbox{w.p.1.}

4634: % $$

4635: %

4636: From equations (\ref{eq:discUB1})

4637: and (\ref{eq:discUB2}) we get

4638: \ben

4639: \limsupnd

4640: r_n(X_1^n,D)

4641: & \leq &

4642: 	H(P) + H(P\|Q) -R(D)\\

4643: & \leq &

4644: 	H(P) + H(P\|Q) - H(\BBP) + H(P) - R_1(D)

4645: \;\;\;\;\mbox{w.p.1}

4646: \een

4647: where $R_1(D)$ denotes the first order

4648: rate-distortion function of $\Xp$,

4649: $H(\BBP)$ is the entropy rate of $\Xp$

4650: (both in nats), and

4651: the second inequality follows

4652: from the Wyner-Ziv bound;

4653: see \cite[Remark~4]{wyner-ziv:71}.

4654: The assumption that $\rho(x,y)=0$

4655: if and only if $x=y$ implies that

4656: $\lim_{D\to 0} R_1(D)=H(P)$,

4657: so letting $D\downarrow 0$

4658: the above right hand side becomes

4659: $H(P) + H(P\|Q) -H(\BBP)$

4660: and it is an easy calculation

4661: to verify that this is

4662: indeed the same

4663: as $H(\BBP\|\BBQ)$.

4664: This gives the required

4665: upper bound.

4666:

4667: For the lower bound we proceed

4668: similarly by noting that

4669: $$

4670: % \frac{1}{n}\log

4671: % \frac{P_n(B(X_1^n,D))}

4672: % {Q^n(B(X_1^n,D))}

4673: r_n(X_1^n,D)

4674: \geq \frac{1}{n}\log {P_n(X_1^n)}

4675: 	-\frac{1}{n}\log Q^n(B(X_1^n,D)),$$

4676: where the first term converges to $-H(\BBP)$

4677: by the classical AEP

4678: (as $n\to \infty$).

4679: Since the second term is decreasing

4680: in $D$, for any fixed $D>0$ small

4681: enough we have

4682: with probability one:

4683: \ben

4684: \liminfnd

4685: % \frac{1}{n}\log

4686: % \frac{P_n(B(X_1^n,D))}

4687: % {Q^n(B(X_1^n,D))}

4688: r_n(X_1^n,D)

4689: & \geq &

4690:         - H(\BBP) -

4691:         \limsup_{n\to\infty}

4692:         \frac{1}{n}\log Q^n(B(X_1^n,D))\\

4693: & = &

4694: 	- H(\BBP) + R_1(P,Q,D)

4695: \een

4696: where the last step follows from

4697: the generalized AEP in Theorem~1

4698: (note that $\Dmin=0$ here).

4699: By the characterization of the

4700: rate-function in Proposition~2

4701: we know that

4702: $$R_1(P,Q,D) = \sup_{\la'\leq 0} [\la' D-\LA(\la')]

4703: 	\geq [\la D-\LA(\la)]=

4704: 	-E_{P}\left[\log E_{Q}\left(

4705: 		e^{\lambda(\rho(X,Y)-D)}

4706: 	\right)\right]$$

4707: for any fixed $\la<0$.

4708: Therefore, for any

4709: $D$ small enough and

4710: $\la<0$ we have

4711: \ben

4712: \liminfnd

4713: % \frac{1}{n}\log

4714: % \frac{P_n(B(X_1^n,D))}

4715: % {Q^n(B(X_1^n,D))}

4716: r_n(X_1^n,D)

4717: \geq - H(\BBP)  -E_{P}\left[\log E_{Q}\left(

4718:                 e^{\lambda(\rho(X,Y)-D)}

4719:         \right)\right]

4720: \;\;\;\;\mbox{w.p.1.}

4721: \een

4722: Letting $D\to 0$ and then $\la\to-\infty$,

4723: by the dominated convergence theorem (and

4724: the assumption $\rho(x,y)=0$ iff $x=y$)

4725: the right hand side above converges

4726: to $- H(\BBP) + H(P\|Q) + H(P)

4727: = H(\BBP\|\BBQ),$ proving the

4728: lower bound.

4729:

4730: Finally, since for each fixed $n$

4731: the limit as $D\downarrow 0$ of

4732: $r_n(X_1^n,D)$

4733: exists,

4734: % $$\frac{1}{n}\log \frac{P_n(B(X_1^n,D))}{Q_n(B(X_1^n,D))}$$

4735: it follows that

4736: the repeated limit

4737: $\lim_{n}\lim_{D}$

4738: also exists and is equal

4739: to the double limit $H(\BBP\|\BBQ)$.

4740: \qed

4741:

4742: \section{Proof of Theorem~8}

4743: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

4744: Part~(a): Fixing $n$, let $f_n=dP_n/dQ_n$ and consider the set

4745: $$

4746: A_n \bydef \left\{ x_1^n : \; Q_n(B(x_1^n,D))>0 \;\; \forall D>0,

4747: f_n(x_1^n) = \limsup_{D\downarrow 0}\; \frac{P_n(B(x_1^n,D))}{Q_n(B(x_1^n,D))}

4748: = \liminf_{D\downarrow 0}\; \frac{P_n(B(x_1^n,D))}{Q_n(B(x_1^n,D))}

4749: \, \right\}.

4750: $$

4751: By the Radon-Nikodym theorem

4752: (cf. \cite[Theorems 1.6.1, 1.6.2]{evans-gariepy}),

4753: we know that $Q_n(A_n)=1$, hence also $P_n(A_n)=1$.

4754: With $\BBP(\cup_n A_n^c)=0$, we conclude the proof of part~(a)

4755: by applying Theorem~6 for $M_n=Q^n$ (in which case $H_n \geq 0$).

4756:

4757: Part~(b): As $Q(A_1)=1$, in particular

4758: $Q(B(x,D))>0$ for all $D>0$ and $Q$-almost

4759: every $x\in\RL^d$ (hence also for $P=P_1$-almost

4760: every $x\in\RL^d$), implying that $\Dmin$ of

4761: (\ref{eq:Dmin}) is zero. The same argument

4762: yields also that $P(B(x,D))>0$ for all $D>0$

4763: and $P$-almost every $x$, hence $\Dmin$ is

4764: still zero if we replace $Q$ by $P$. Thus,

4765: for all

4766: $D< \min\{E_{P\times Q}[\rho(X,Y)],E_{P\times P}[\rho(X,Y)]\}$,

4767: applying Theorem~1 twice we get

4768: $$

4769:   \lim_{n\to\infty}\;

4770: r_n(X_1^n,D)

4771: %         \frac{1}{n}\log

4772: %         \frac{P^n(B(X_1^n,D))}

4773: %              {Q^n(B(X_1^n,D))}

4774: 	= R_1(P,Q,D)-R_1(P,P,D)

4775: 	\;\;\;\;\mbox{w.p.1.}

4776: $$

4777: For any probability measure $\mu$ and any $\la \leq 0$, let

4778: $$

4779: \Lambda(\lambda;\mu) = \int  \left[

4780: \log \int e^{\lambda \rho(x,y)} d\mu(y)\right] dP(x).

4781: $$

4782:

4783: Fixing $D>0$ small enough,

4784: we have by Proposition~2

4785: that $R_1(P,P,D) = \lambda D -\Lambda(\lambda;P)$

4786: for the unique $\lambda=\lambda(D)<0$ such that $\Lambda'(\lambda;P)=D$,

4787: whereas $R_1(P,Q,D) \geq \lambda D - \Lambda(\lambda;Q)$.

4788: Since $E_{P\times P}[\rho(X,Y)]>0$, we have also that

4789: $\lambda(D) \downarrow -\infty$ as $D \downarrow 0$ (see (\ref{eq:la-lim})).

4790: Consequently,

4791: $$

4792: \liminf_{D\downarrow 0} \{ R_1(P,Q,D)-R_1(P,P,D) \} \geq

4793: \liminf_{\lambda \downarrow -\infty}

4794: \{ \Lambda(\lambda;P) - \Lambda(\lambda;Q) \}

4795: $$

4796: Similarly, by Proposition~2 we have

4797: $R_1(P,Q,D) = \widetilde{\lambda} D -\Lambda(\widetilde{\lambda};Q)$

4798: for $\widetilde{\lambda}<0$ such that $\Lambda'(\widetilde{\lambda};Q)=D$,

4799: $R_1(P,P,D) \geq \widetilde{\lambda}D -

4800: \Lambda(\widetilde{\lambda};P)$, and with $E_{P\times Q}[\rho(X,Y)]>0$,

4801: also $\widetilde{\lambda}\downarrow -\infty$ when $D \downarrow 0$.

4802: Therefore, it suffices to show that

4803: \be

4804: \label{eq:dn-lim}

4805: \lim_{\lambda \downarrow -\infty}

4806: \{ \Lambda(\lambda;P) - \Lambda(\lambda;Q) \} = H(P\|Q) \;.

4807: \ee

4808:

4809: To this end, for any $\lambda <0$ and $x \in \RL^d$, let

4810: $$

4811: h_\lambda(x) \bydef

4812:  \frac{E_P(e^{\lambda \rho(x,Y)})}{E_Q(e^{\lambda \rho(x,Y)})}

4813: $$

4814: noting that

4815: $$

4816: \Lambda(\lambda;P) - \Lambda(\lambda;Q) = \int \log h_\lambda(x) dP(x).

4817: $$

4818: Using the change of variable $U=\rho(x,Y) \geq 0$ followed

4819: by integration by parts, we see that

4820: $$

4821: h_\lambda(x) =

4822:  \frac{\int_0^\infty e^{\lambda u} g_{x}(u) du}{\int_0^\infty

4823: e^{\lambda u} k_{x}(u) du} \;,

4824: $$

4825: where $g_x (r)=P(B(x,r))$ and $k_x(r)=Q(B(x,r))$ are nonnegative,

4826: nondecreasing and bounded above by $1$. Considering separately

4827: $u \leq 2\eta$ and $u>2\eta$, it is easy to check that for any $\eta>0$,

4828: \be

4829: \sup_{0 <r \leq 2\eta} \frac{g_{x}(r)}{k_{x}(r)} + \psi_{\lambda,x} \geq

4830: h_\lambda(x) \geq \inf_{0 < r \leq 2\eta} \frac{g_{x}(r)}{k_{x}(r)}

4831: \: \frac{1}{1+\psi_{\lambda,x}}

4832: \label{eq:bd-dn}

4833: \ee

4834: where

4835: \be

4836: \psi_{\lambda,x} \bydef

4837: \frac{\int_{2\eta}^\infty e^{\lambda u} du}

4838: {\int_0^{2\eta} e^{\lambda u} k_{x}(u) du} \leq

4839: \frac{1}{\eta |\lambda| k_x(\eta)} \;.

4840: \label{eq:bd2-dn}

4841: \ee

4842: Fix $x \in A_1$ of part (a), in which case $k_x(r)>0$ for all $r>0$ and

4843: $g_x(r)/k_x(r) \to f_1(x)$ as $r \to 0$.

4844: Letting $\lambda \downarrow -\infty$ and then $\eta \to 0$, it

4845: follows by (\ref{eq:bd-dn}) and (\ref{eq:bd2-dn}) that

4846: $$

4847: \lim_{\lambda \downarrow -\infty} h_\lambda(x) = f_1(x) \,.

4848: $$

4849: Recall that $P(A_1)=1$ and our assumption that

4850: $\int \log k_x(\eta) dP(x) > -\infty$ for any $\eta >0$.

4851: By our integrability conditions, the function

4852: $\min\{0, \inf_{\lambda \leq -1} \log h_\lambda(x)\}$ is $P$-integrable,

4853: hence, by Fatou's lemma,

4854: $$

4855: \liminf_{\lambda \downarrow -\infty} \int \log h_\lambda(x) dP(x) \geq

4856: \int \log f_1(x) dP(x) = H(P\|Q) \,.

4857: $$

4858: Moreover, in case $H(P\|Q)<\infty$, our assumptions imply that

4859: $\sup_{\lambda \leq -1} |\log h_\lambda(x)|$ is $P$-integrable,

4860: hence by dominated convergence,

4861: $\int \log h_\lambda(x) dP(x) \to \int \log f_1(x) dP(x)$ for

4862: $\lambda \downarrow -\infty$,

4863: as required to complete the proof of

4864: (\ref{eq:dn-lim}).

4865: \qed

4866:

4867: \section{Proof of Theorem~27}

4868: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

4869:

4870: Recall our assumption that, for

4871: $\BBP$-a.e. $x_{[0,\infty)}$, conditional on

4872: $X_{[0,\infty)}=

4873: x_{[0,\infty)}$

4874: the random variables $\{\rho_n(x_{C(n)},Y_{C(n)})\}$ satisfy the LDP

4875: with a {\it deterministic} convex good rate-function

4876: denoted hereafter $R(\BBP,\BBQ,\cdot)$. Since

4877: $\rho$ is bounded, by Varadhan's lemma and convex duality,

4878: this implies that

4879: \be

4880: \label{eq:am-las}

4881: R(\BBP,\BBQ,D) =

4882: \sup_{\lambda \in \RL} [ \lambda D - \Lambda_\infty(\lambda) ]

4883: \bydef

4884: \Lambda_\infty^*(D)

4885: \ee

4886: where for any $\lambda \in \RL$, the finite, deterministic limit

4887: $$

4888: \Lambda_\infty(\lambda) \bydef \lim_{n \to \infty}

4889: \frac{1}{n^d} \log \int e^{\lambda \sum_{u \in C(n)}\rho(x_u,y_u)}

4890: dQ_n(y_{C(n)})

4891: $$

4892: exists for $\BBP$-a.e. $x_{[0,\infty)}$

4893: (cf. \cite[Theorem 4.5.10]{dembo-zeitouni:book}).

4894: By bounded convergence,

4895: $\Lambda_\infty(\lambda)$ is also the limit of

4896: $$

4897: \Lambda_n(\lambda) \bydef

4898: \frac{1}{n^d} \int \left[

4899: \log \int e^{\lambda \sum_{u \in C(n)}\rho(x_u,y_u)}

4900: dQ_n(y_{C(n)}) \right] dP_n(x_{C(n)}) \;.

4901: $$

4902:

4903: By stationarity,

4904: \be

4905: \label{eq:am-dav}

4906: \Dav=E_{P_n \times Q_n} (\rho_n(X_{C(n)},Y_{C(n)})),

4907: \;\;\;\forall n \geq 1

4908: \ee

4909: so replacing $P_1$, $Q_1$ and $\rho(x,y)$ of Proposition~2 by

4910: $P_n$, $Q_n$ and $n^d \rho_n(x_{C(n)},y_{C(n)})$,

4911: respectively, we see that

4912: \be

4913: \label{eq:am-lasn}

4914: R_n(P_n,Q_n,D) =

4915: \sup_{\lambda \in \RL} [ \lambda D - \Lambda_n(\lambda) ]

4916: \bydef

4917: \Lambda_n^*(D) \,.

4918: \ee

4919: %where for any $D \in (\Dmin,\Dav)$, the above supremum is

4920: %obtained at the unique $\lambda^*_n < 0$ such that $\Lambda_n'(\lambda^*_n)=D$.

4921: Note that

4922: $|\Lambda_n(\lambda)-\Lambda_n(\lambda')| \leq c |\lambda-\lambda'|$ for some

4923: $c<\infty$ and all $n$, $\lambda,\lambda' \in \RL$,

4924: % (by the boundedness of $\rho$),

4925: hence the convergence of $\Lambda_n(\cdot)$ to $\Lambda_\infty(\cdot)$ is

4926: uniform on compact subsets of $\RL$. In particular, the convex,

4927: continuous functions $\Lambda_n(\cdot)$ converge infimally to $\Lambda_\infty(\cdot)$,

4928: and consequently, by \cite[Theorem 5]{wijsman}, the convex functions

4929: $\Lambda_n^*(\cdot)$ converge infimally to $\Lambda_\infty^*(\cdot)$, that is

4930: \be

4931: \label{eq:inf-conv}

4932: \Lambda_\infty^*(D)

4933: =

4934: \lim_{\delta \to 0} \limsup_{n \to \infty} \inf_{|\hat{D}-D|<\delta} \Lambda_n^*(\hat{D})

4935: =

4936: \lim_{\delta \to 0} \liminf_{n \to \infty} \inf_{|\hat{D}-D|<\delta} \Lambda_n^*(\hat{D}) \,.

4937: \ee

4938:

4939: It follows from (\ref{eq:am-dav}) and Jensen's inequality

4940: that $\Lambda_n(\lambda) \geq \lambda \Dav$ for all $n$ and $\lambda$,

4941: hence, for $D \leq \Dav$ it suffices to consider $\lambda \leq 0$ in

4942: (\ref{eq:am-las}) and in (\ref{eq:am-lasn}). Thus, for $1 \leq n \leq \infty$,

4943: $\Lambda^*_n$ are non-negative, convex,

4944: and monotone non-increasing on $[0,\Dav]$, with

4945: $\Lambda^*_n(\Dav)=0$. For $1 \leq n \leq \infty$,

4946: let

4947: $$

4948: \Dminn \bydef \lim_{\lambda \downarrow -\infty} \frac{\Lambda_n(\lambda)}{\lambda} \,,

4949: $$

4950: so that $\Lambda_n^*(D)=\infty$ for $D < \Dminn$, while

4951: $\Lambda_n^*(D)<\infty$ for $D>\Dminn$.

4952: Note that for $n < \infty$ this coincides with the definition of $\Dminn$

4953: given in (\ref{eq:dminn}). It is easy to check then that (\ref{eq:inf-conv})

4954: implies the pointwise convergence of $\Lambda^*_n(\cdot)=R_n(\BBP,\BBQ,\cdot)$

4955: to $\Lambda^*_\infty(\cdot)=R(\BBP,\BBQ,\cdot)$ at any $D$ for which

4956: $\Lambda^*_\infty(D-\delta) \downarrow \Lambda^*_\infty (D)$, that is,

4957: for all $D \neq \Dinf$. In particular, necessarily $\Dinf \in [\Dmin,\Dav]$,

4958: and $\Dinf$ may also be defined via (\ref{eq:dinf}).

4959: The continuity of $R(\BBP,\BBQ,D)$ at $D \in (\Dmin,\Dav)$, $D \neq \Dinf$ implies

4960: the equality in (\ref{eq:ldp-27}) for such $D$, thus completing the proof of the

4961: theorem.

4962: \qed

4963:

4964: \section{Proof of Theorem~28}

4965: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

4966: For each $m\geq 1$, let $G_m$ be the

4967: collection of ``good'' realizations

4968: $x_{\IN^d}\in A^{\IN^d}$

4969: $$G_m= \left\{ x_{\IN^d}\in A^{\IN^d}

4970: : \;\;Q_n(B(x_{C(n)},D))>0 \;\mbox{for all}\; n\geq m \right\}$$

4971: so that the assumption that

4972: $Q_n(B(X_{C(n)},D))>0$ eventually, with probability one

4973: translates to

4974: \be

4975: \BBP\{\cup_{m\geq 1} G_m\}=1.

4976: \label{eq:eventuallyOK}

4977: \ee

4978:

4979: To prove the lower bound

4980: we choose and fix an $m\geq 1$

4981: and a realization $x_{\IN^d}\in G_m$.

4982: Then for any $K>1$:

4983: \ben

4984: \PR\{W_n^d<K\,|\,X_{C(n)}=x_{C(n)}\}

4985: &\leq& \sum_{u\in[0,\lfloor K^{1/d} \rfloor-1]^d}

4986: 	\,Q_n\{Y_{u+C(n)}\in B(x_{C(n)},D)\}\\

4987: &\leq& K\,Q_n(B(x_{C(n)},D)).

4988: \een

4989: Since, by its definition,

4990: $W_n^d$ is always greater than

4991: or equal to one, this

4992: inequality trivially holds

4993: also for $K\in(0,1]$.

4994: Setting

4995: $K=[n^{1+\epsilon}Q_n(B(x_{C(n)},D))]^{-1}$

4996: above gives,

4997: for all $n \geq m$,

4998: \ben

4999: \PR\{\log[W^d_nQ_n(B(X_{C(n)},D))]<-(1+\epsilon)\log n

5000: \,|\,X_{C(n)}=x_{C(n)}\}

5001: \leq \frac{1}{n^{1+\epsilon}}.

5002: \een

5003: Since this bound is uniform over

5004: $x_{\IN^d} \in G_m$ and summable, the Borel-Cantelli

5005: lemma and assumption (\ref{eq:eventuallyOK})

5006: imply that

5007: \be

5008: \log[W^d_nQ_n(B(X_{C(n)},D))]\;\geq\;-(1+\epsilon)\log n

5009: \;\;\;\;\mbox{eventually, w.p.1.}

5010: \label{eq:bc:2}

5011: \ee

5012:

5013: For the upper bound

5014: we choose and fix an $m\geq 1$

5015: and a realization $x_{\IN^d}\in G_m$,

5016: and take $K\geq (n+1)^d$.

5017: Note that

5018: $$\PR\{W_n^d>K\,|\,X_{C(n)}=x_{C(n)}\}

5019: \leq\Pr

5020:     \left\{

5021:       \sum_{u\in [0,M]^d}

5022:       \IND_{

5023: 	\{

5024: 	Y_{nu+C(n)}\in B(X_{C(n)},D)

5025: 	\}

5026: 	   }

5027: 		= 0

5028:     \right\}

5029: $$

5030: where

5031: the sum is over the $(M+1)^d$

5032: integer positions $u\in[0,M]^d\subset\IN^d$,

5033: $nu$ denotes the point

5034: $(nu_1,nu_2,\ldots,nu_d)\in\IN^d$,

5035: and

5036: $$M=M(K,n)\bydef

5037: \left\lfloor\frac{K^{1/d}-1}{n}

5038: \right\rfloor.$$

5039: Let $\Sigma_n$ denote the sum

5040: in the above probability,

5041: $$\Sigma_n=\sum_{u\in[0,M]^d}I_n(u)$$

5042: where $I_n(u)$ is the indicator function

5043: of the event $\{Y_{nu+C(n)}\in B(X_{C(n)},D)\}$.

5044: In this notation:

5045: \be

5046: \PR\{W_n^d>K\,|\,X_{C(n)}=x_{C(n)}\}

5047: \,\leq\,\BBQ\{\Sigma_n=0\}\,\leq\,\frac{\VAR_\BBQ(\Sigma_n)}

5048: 				{[E_\BBQ(\Sigma_n)]^2}.

5049: \label{eq:estimate1}

5050: \ee

5051: By stationarity

5052: \be

5053: E_\BBQ(\Sigma_n) = [M+1]^dQ_n(B(x_{C(n)},D))

5054: \label{eq:estimate2}

5055: \ee

5056: and by the definition of the

5057: $\phi$-mixing coefficients, if $u\neq v$,

5058: $$E_\BBQ\{I_n(u)I_n(v)\}\leq Q_n(B(x_{C(n)},D))

5059: [\phi_n(nd(u,v)-n+1)+Q_n(B(x_{C(n)},D))].$$

5060: Using the last two estimates

5061: we can bound the variance as

5062: \be

5063: \VAR_\BBQ\{\Sigma_n\}

5064: &=&\sum_{u,v\in[0,M]^d}

5065: \COV_\BBQ(I_n(u), I_n(v))

5066:         \nonumber\\

5067: &\leq&[M+1]^dQ_n(B(x_{C(n)},D))

5068: 	\nonumber\\

5069: & & \quad

5070: 	+\sum_{u,v\in[0,M]^d,\;u\neq v}

5071: 	\Big[Q_n(B(x_{C(n)},D))

5072: 	\phi_n(nd(u,v)-n+1)\Big]

5073: 	\nonumber\\

5074: &\leq&[M+1]^dQ_n(B(x_{C(n)},D))

5075: 	\left[1+\sum_{j=1}^{M}

5076: c_d j^{d-1}\phi_n(nj-n+1)\right]

5077:         \label{eq:estimate3}

5078: \ee

5079: where

5080: $c_d j^{d-1}$ bounds the number of possible points

5081: $u$ that can be at a distance exactly $j$ from a

5082: given point $v$ (for some constant $c_d$).

5083: By assumption (\ref{eq:dobrushin}) we can find

5084: a finite constant $\Phi$ such that the expression

5085: in square brackets in (\ref{eq:estimate3}) is bounded

5086: above by $\Phi$, uniformly in $n$.

5087: Substituting this bound, together with

5088: (\ref{eq:estimate2}) and (\ref{eq:estimate3}),

5089: in (\ref{eq:estimate1}), gives

5090: \be

5091: \PR\{W_n^d>K\,|\,X_{C(n)}=x_{C(n)}\}&\leq&\frac

5092:         {\Phi}

5093:         {[M+1]^dQ_n(B(x_{C(n)},D))}.

5094: \label{eq:estimate4}

5095: \ee

5096: Let $\epsilon>0$ be arbitrary,

5097: take $n$ large enough so

5098: that $n^{(1+\epsilon)/d}\geq 2$,

5099: and let $K=n^{d+1+\epsilon}/Q_n(B(x_{C(n)},D)).$

5100: Simple algebra shows that with this choice

5101: of $K$ we have

5102: $$[M+1]^dQ_n(B(x_{C(n)},D))\geq\frac{1}{2}n^{1+\epsilon}$$

5103: and substituting this in (\ref{eq:estimate4})

5104: yields

5105: \ben

5106: 	\PR\{\log[W_n^dQ_n(B(

5107: x_{

5108: C(n)},D))] >

5109: 	(d+1+\epsilon)\log n\,|\,X_{C(n)}=x_{C(n)}\}\leq

5110:         \frac{2\Phi}{n^{1+\epsilon}}.

5111: \een

5112: This bound is uniform

5113: over $x_{\IN^d} \in G_m$ and summable,

5114: so the Borel-Cantelli lemma

5115: and (\ref{eq:eventuallyOK})

5116: imply that

5117: \be

5118: \log[W_n^dQ_n(B(

5119: X_{

5120: C(n)},D))]\;\leq\;(d+1+\epsilon)\log n

5121: \;\;\;\;\mbox{eventually, w.p.1.}

5122: \label{eq:bc:1}

5123: \ee

5124:

5125: Combining (\ref{eq:bc:1}) and (\ref{eq:bc:2})

5126: completes the proof.

5127: \qed

5128:

5129: \newpage

5130:

5131: \begin{thebibliography}{10}

5132:

5133: \bibitem{agw:90}

5134: R.~Arratia, L.~Gordon, and M.S. Waterman.

5135: \newblock The {E}rd{\H{o}}s-{R}{\'{e}}nyi law in distribution for coin tossing

5136:   and sequence matching.

5137: \newblock {\em Ann. Stat.}, 18:539--570, 1990.

5138:

5139: \bibitem{arratia-waterman}

5140: R.~Arratia and M.S. Waterman.

5141: \newblock A phase transition for the score in matching random sequences

5142:   allowing deletions.

5143: \newblock {\em Ann. Appl. Probab.}, 4:200--225, 1994.

5144:

5145: \bibitem{barron:1}

5146: A.R. Barron.

5147: \newblock The strong ergodic theorem for densities: {G}eneralized

5148:   {S}hannon-{M}c{M}illan-{B}reiman theorem.

5149: \newblock {\em Ann. Probab.}, 13:1292--1303, 1985.

5150:

5151: \bibitem{bell:cleary:witten}

5152: T.C. Bell, J.G.~Cleary, and I.H. Witten.

5153: \newblock {\em Text Compression}.

5154: \newblock Prentice Hall, New Jersey, 1990.

5155:

5156: \bibitem{bell-cover:88}

5157: R.~Bell and T.M. Cover.

5158: \newblock Game-theoretic optimal portfolios.

5159: \newblock {\em Management Sci.}, 34(6):724--733, 1988.

5160:

5161: \bibitem{berger:book}

5162: T.~Berger.

5163: \newblock {\em Rate Distortion Theory: A Mathematical Basis for Data

5164:   Compression}.

5165: \newblock Prentice-Hall Inc., Englewood Cliffs, NJ, 1971.

5166:

5167: \bibitem{berger-shen-ye:92}

5168: T.~Berger, S.Y. Shen, and Z.X. Ye.

5169: \newblock Some communication problems of random fields.

5170: \newblock {\em Internat. J. Math. Statist. Sci.}, 1(1):47--77, 1992.

5171:

5172: \bibitem{berkes-morrow}

5173: I.~Berkes and G.J. Morrow.

5174: \newblock Strong invariance principles for mixing random fields.

5175: \newblock {\em Z. Wahrsch. Verw. Gebiete}, 57(1):15--37, 1981.

5176:

5177: \bibitem{bradley}

5178: R.~C. Bradley.

5179: \newblock Basic properties of strong mixing conditions.

5180: \newblock In E.~Eberlein and M.S. Taqqu, editors, {\em Dependence in

5181:   Probability and Statistics}, pages 165--192, 1986.

5182:

5183: \bibitem{breiman:57}

5184: L.~Breiman.

5185: \newblock The individual ergodic theorem for information theory.

5186: \newblock {\em Ann. Math. Stat.}, 28:809--811, 1957.

5187:

5188: \bibitem{breiman:60}

5189: L.~Breiman.

5190: \newblock Correction to ``{T}he individual ergodic theorem for information

5191:   theory''.

5192: \newblock {\em Ann. Math. Stat.}, 31:809--810, 1960.

5193:

5194: \bibitem{bryc-dembo:96}

5195: W.~Bryc and A.~Dembo.

5196: \newblock Large deviations and strong mixing.

5197: \newblock {\em Ann. Inst. H. Poincar\'e Probab. Statist.}, 32(4):549--569,

5198:   1996.

5199:

5200: \bibitem{bucklew:87}

5201: J.A. Bucklew.

5202: \newblock The source coding theorem via {S}anov's theorem.

5203: \newblock {\em IEEE Trans. Inform. Theory}, 33(6):907--909, 1987.

5204:

5205: \bibitem{bucklew:88}

5206: J.A. Bucklew.

5207: \newblock A large deviation theory proof of the abstract alphabet source coding

5208:   theorem.

5209: \newblock {\em IEEE Trans. Inform. Theory}, 34(5):1081--1083, 1988.

5210:

5211: \bibitem{chazottesetal:98}

5212: J.-R. Chazottes, E.~Floriani, and R.~Lima.

5213: \newblock Relative entropy and identification of {G}ibbs measures in dynamical

5214:   systems.

5215: \newblock {\em J. Statist. Phys.}, 90:697--725, 1998.

5216:

5217: \bibitem{chi-it:01}

5218: Z.~Chi.

5219: \newblock The first order asymptotics of waiting times with distortion between

5220:   stationary processes.

5221: \newblock {\em IEEE Trans. Inform. Theory}, 47(1):338--347, 2001.

5222:

5223: \bibitem{chi-AP:01}

5224: Z.~Chi.

5225: \newblock Stochastic sub-additivity approach to conditional large deviation

5226:   principle.

5227: \newblock {\em To appear, Ann. Probab.}, 2001.

5228:

5229: \bibitem{comets:86}

5230: F.~Comets.

5231: \newblock Grandes d\'eviations pour des champs de {G}ibbs sur {${\bf Z }\sp

5232:   d$}.

5233: \newblock {\em C. R. Acad. Sci. Paris S\'er. I Math.}, 303(11):511--513, 1986.

5234:

5235: \bibitem{comets:89}

5236: F.~Comets.

5237: \newblock Large deviation estimates for a conditional probability distribution.

5238:   {A}pplications to random interaction {G}ibbs measures.

5239: \newblock {\em Probab. Theory Related Fields}, 80:407--432, 1989.

5240:

5241: \bibitem{cover:book}

5242: T.M. Cover and J.A. Thomas.

5243: \newblock {\em Elements of Information Theory}.

5244: \newblock J. Wiley, New York, 1991.

5245:

5246: \bibitem{csiszar:book}

5247: I.~Csisz{\'{a}}r and J.~K{\"{o}}rner.

5248: \newblock {\em Information Theory: Coding Theorems for Discrete Memoryless

5249:   Systems}.

5250: \newblock Academic Press, New York, 1981.

5251:

5252: \bibitem{dawson-gartner:87}

5253: D.A. Dawson and J.~G{\"a}rtner.

5254: \newblock Large deviations from the {M}c{K}ean-{V}lasov limit for weakly

5255:   interacting diffusions.

5256: \newblock {\em Stochastics}, 20(4):247--308, 1987.

5257:

5258: \bibitem{dembo-kontoyiannis}

5259: A.~Dembo and I.~Kontoyiannis.

5260: \newblock The asymptotics of waiting times between stationary processes,

5261:   allowing distortion.

5262: \newblock {\em Ann. Appl. Probab.}, 9:413--429, 1999.

5263:

5264: \bibitem{dembo-kontoyiannis:crit:01}

5265: A.~Dembo and I.~Kontoyiannis.

5266: \newblock Critical behavior in lossy source coding.

5267: \newblock {\em To appear}, 2001.

5268: \newblock [Available from \texttt{www.dam.brown.edu/people/yiannis}].

5269:

5270: \bibitem{dembo-zeitouni:book}

5271: A.~Dembo and O.~Zeitouni.

5272: \newblock {\em Large Deviations Techniques And Applications}.

5273: \newblock Springer-Verlag, New York, second edition, 1998.

5274:

5275: \bibitem{deuschel-stroock:book}

5276: J.D. Deuschel and D.W. Stroock.

5277: \newblock {\em Large Deviations}.

5278: \newblock Academic Press, Boston, 1989.

5279:

5280: \bibitem{doukhan:book}

5281: P.~Doukhan.

5282: \newblock {\em Mixing: Properties and Examples}.

5283: \newblock Springer-Verlag, New York, 1994.

5284:

5285: \bibitem{elias}

5286: P.~Elias.

5287: \newblock Universal codeword sets and representations of the integers.

5288: \newblock {\em IEEE Trans. Inform. Theory}, 21:194--203, 1975.

5289:

5290: \bibitem{evans-gariepy}

5291: L.C. Evans and R.F. Gariepy.

5292: \newblock {\em Measure theory and fine properties of functions}.

5293: \newblock CRC Press, Boca Raton, FL, 1992.

5294:

5295: \bibitem{feldman:80}

5296: J.~Feldman.

5297: \newblock $r$-entropy, equipartition, and {O}rnstein's isomorphism theorem in

5298:   \mbox{${\bf R}^{n}$}.

5299: \newblock {\em Israel J. Math.}, 36(3--4):321--345, 1980.

5300:

5301: \bibitem{fellerII:book}

5302: W.~Feller.

5303: \newblock {\em An Introduction to Probability Theory and its Applications.

5304:   {V}ol. {I}{I}.}

5305: \newblock John Wiley \& Sons Inc., New York, second edition, 1971.

5306:

5307: \bibitem{folmer-orey}

5308: H.~F{\"o}llmer and S.~Orey.

5309: \newblock Large deviations for the empirical field of a {G}ibbs measure.

5310: \newblock {\em Ann. Probab.}, 16(3):961--977, 1988.

5311:

5312: \bibitem{follmer:73}

5313: Hans F{\"o}llmer.

5314: \newblock On entropy and information gain in random fields.

5315: \newblock {\em Z. Wahrsch. Verw. Gebiete}, 26:207--217, 1973.

5316:

5317: \bibitem{georgii:1}

5318: H.-O. Georgii.

5319: \newblock {\em {Gibbs} Measures and Phase Transitions}.

5320: \newblock W. de Gruyter: Berlin et al, 1989.

5321:

5322: \bibitem{guyon:book}

5323: X.~Guyon.

5324: \newblock {\em Random Fields on a Network: Modeling, Statistics, and

5325:   Applications}.

5326: \newblock Springer-Verlag, New York, 1995.

5327:

5328: \bibitem{ibragimov:62}

5329: I.A. Ibragimov.

5330: \newblock Some limit theorems for stationary processes.

5331: \newblock {\em Theory Probab. Appl.}, 7:349--382, 1962.

5332:

5333: \bibitem{ishii-yamamoto:97}

5334: D.~Ishii and H.~Yamamoto.

5335: \newblock The redundancy of universal coding with a fidelity criterion.

5336: \newblock {\em IEICE Trans. Fundamentals}, E80-A:2225--2231, 1997.

5337:

5338: \bibitem{kanaya-muramatsu:97}

5339: F.~Kanaya and J.~Muramatsu.

5340: \newblock An almost sure recurrence theorem with distortion for stationary

5341:   ergodic sources.

5342: \newblock {\em IEICE Trans. Fundamentals}, E80-A:2264--2267, 1997.

5343:

5344: \bibitem{kanlis:phd}

5345: A.~Kanlis.

5346: \newblock {\em Compression and Transmission of Information at Multiple

5347:   Resolutions}.

5348: \newblock PhD thesis, Dept. of Electrical and Computer Engineering, University

5349:   of Maryland at College Park, 1998.

5350:

5351: \bibitem{karlin-ost:88}

5352: S.~Karlin and F.~Ost.

5353: \newblock Maximal length of common words among random letter sequences.

5354: \newblock {\em Ann. Probab.}, 16:535--563, 1988.

5355:

5356: \bibitem{kieffer:73}

5357: J.C. Kieffer.

5358: \newblock A counterexample to {P}erez's generalization of the

5359:   {S}hannon-{M}c{M}illan theorem.

5360: \newblock {\em Ann. Probab.}, 1:362--364, 1973.

5361:

5362: \bibitem{kieffer:73b}

5363: J.C. Kieffer.

5364: \newblock Correction to: ``{A} counterexample to {P}erez's generalization of

5365:   the {S}hannon-{M}c{M}illan theorem'' ({A}nn. {P}robability {\bf 1} (1973),

5366:   362--364).

5367: \newblock {\em Ann. Probab.}, 4:153--154, 1976.

5368:

5369: \bibitem{kieffer:91}

5370: J.C. Kieffer.

5371: \newblock Sample converses in source coding theory.

5372: \newblock {\em IEEE Trans. Inform. Theory}, 37(2):263--268, 1991.

5373:

5374: \bibitem{koga-arimoto:98}

5375: H.~Koga and S.~Arimoto.

5376: \newblock On the asymptotic behaviors of the recurrence time with a fidelity

5377:   criterion for discrete memoryless sources and memoryless {G}aussian sources.

5378: \newblock {\em IEICE Trans. Fundamentals}, E81-A:981--986, 1998.

5379:

5380: \bibitem{kontoyiannis-97}

5381: I.~Kontoyiannis.

5382: \newblock Second-order noiseless source coding theorems.

5383: \newblock {\em IEEE Trans. Inform. Theory}, 43(4):1339--1341, July 1997.

5384:

5385: \bibitem{kontoyiannis-jtp}

5386: I.~Kontoyiannis.

5387: \newblock Asymptotic recurrence and waiting times for stationary processes.

5388: \newblock {\em J. Theoret. Probab.}, 11:795--811, 1998.

5389:

5390: \bibitem{my:thesis}

5391: I.~Kontoyiannis.

5392: \newblock {\em Recurrence and Waiting Times in Stationary Processes, and their

5393:   Applications in Data Compression}.

5394: \newblock PhD thesis, Dept. of Electrical Engineering, Stanford University, May

5395:   1998.

5396:

5397: \bibitem{covering-TR:99}

5398: I.~Kontoyiannis.

5399: \newblock Efficient sphere-covering and converse measure concentration via

5400:   generalized coding theorems.

5401: \newblock Technical Report 99-24, Department of Statistics, Purdue University,

5402:   October 1999.

5403: \newblock [Available from \texttt{www.dam.brown.edu/people/yiannis}].

5404:

5405: \bibitem{kontoyiannis-lossy1-1}

5406: I.~Kontoyiannis.

5407: \newblock An implementable lossy version of the {L}empel-{Z}iv algorithm --

5408:   {P}art~{I}: {O}ptimality for memoryless sources.

5409: \newblock {\em IEEE Trans. Inform. Theory}, 45(7):2293--2305, November 1999.

5410:

5411: \bibitem{kontoyiannis-red:00}

5412: I.~Kontoyiannis.

5413: \newblock Pointwise redundancy in lossy data compression and universal lossy

5414:   data compression.

5415: \newblock {\em IEEE Trans. Inform. Theory}, 46(1):136--152, January 2000.

5416:

5417: \bibitem{konto-zhang:00}

5418: I.~Kontoyiannis and J.~Zhang.

5419: \newblock Arbitrary source models and {B}ayesian codebooks in rate-distortion

5420:   theory.

5421: \newblock {\em Preprint}, 2000.

5422:

5423: \bibitem{krengel:book}

5424: U.~Krengel.

5425: \newblock {\em Ergodic Theorems}.

5426: \newblock Walter de Gruyter \& Co., Berlin, 1985.

5427:

5428: \bibitem{lapidoth:97}

5429: A.~Lapidoth.

5430: \newblock On the role of mismatch in rate distortion theory.

5431: \newblock {\em IEEE Trans. Inform. Theory}, 43(1):38--47, 1997.

5432:

5433: \bibitem{lin-lu:book}

5434: Z.~Lin and C.~Lu.

5435: \newblock {\em Limit Theory for Mixing Dependent Random Variables}.

5436: \newblock Kluwer Academic Publishers, Dordrecht, 1996.

5437:

5438: \bibitem{luczak-szpankowski}

5439: T.~{\L}uczak and W.~Szpankowski.

5440: \newblock A suboptimal lossy data compression algorithm based on approximate

5441:   pattern matching.

5442: \newblock {\em IEEE Trans. Inform. Theory}, 43(5):1439--1451, 1997.

5443:

5444: \bibitem{marton-shields:1}

5445: K.~Marton and P.C. Shields.

5446: \newblock Almost sure waiting time results for weak and very weak {B}ernoulli

5447:   processes.

5448: \newblock {\em Ergod. Th. \& Dynam. Sys.}, 15:951--960, 1995.

5449:

5450: \bibitem{mcmillan}

5451: B.~McMillan.

5452: \newblock The basic theorems of information theory.

5453: \newblock {\em Ann. Math. Stat.}, 24:196--219, 1953.

5454:

5455: \bibitem{olla:88}

5456: S.~Olla.

5457: \newblock Large deviations for {G}ibbs random fields.

5458: \newblock {\em Probab. Theory Related Fields}, 77(3):343--357, 1988.

5459:

5460: \bibitem{oodaira-yoshihara:71a}

5461: H.~Oodaira and K.I. Yoshihara.

5462: \newblock The law of the iterated logarithm for stationary processes satisfying

5463:   mixing conditions.

5464: \newblock {\em K\=odai Math. Sem. Rep.}, 23:311--334, 1971.

5465:

5466: \bibitem{oodaira-yoshihara:71b}

5467: H.~Oodaira and K.I. Yoshihara.

5468: \newblock Note on the law of the iterated logarithm for stationary processes

5469:   satisfying mixing conditions.

5470: \newblock {\em K\=odai Math. Sem. Rep.}, 23:335--342, 1971.

5471:

5472: \bibitem{orey:85}

5473: S.~Orey.

5474: \newblock On the {S}hannon-{P}erez-{M}oy theorem.

5475: \newblock In {\em Particle systems, random media and large deviations

5476:   (Brunswick, Maine, 1984)}, pages 319--327. Amer. Math. Soc., Providence,

5477:   R.I., 1985.

5478:

5479: \bibitem{orey:85b}

5480: S.~Orey.

5481: \newblock Large deviations in ergodic theory.

5482: \newblock In {\em Seminar on stochastic processes, 1984 (Evanston, Ill.,

5483:   1984)}, pages 195--249. Birkh\"auser Boston, Boston, Mass., 1986.

5484:

5485: \bibitem{ornstein-weiss:83}

5486: D.~Ornstein and B.~Weiss.

5487: \newblock The {S}hannon-{M}c{M}illan-{B}reiman theorem for a class of amenable

5488:   groups.

5489: \newblock {\em Israel J. Math.}, 44:53--60, 1983.

5490:

5491: \bibitem{peligrad:86}

5492: M.~Peligrad.

5493: \newblock Recent advances in the central limit theorem and its weak invariance

5494:   principle for mixing sequences of random variables (a survey).

5495: \newblock In E.~Eberlein and M.S. Taqqu, editors, {\em Dependence in

5496:   Probability and Statistics}, pages 193--223, 1986.

5497:

5498: \bibitem{philipp-stout:book}

5499: W.~Philipp and W.~Stout.

5500: \newblock {\em Almost Sure Invariance Principles for Partial Sums of Weakly

5501:   Dependent Random Variables}.

5502: \newblock Memoirs of the AMS, 1975.

5503: \newblock vol. 2, issue 2, no. 161.

5504:

5505: \bibitem{sakrison:69}

5506: D.J. Sakrison.

5507: \newblock The rate distortion function for a class of sources.

5508: \newblock {\em Information and Control}, 15:165--195, 1969.

5509:

5510: \bibitem{sakrison:70}

5511: D.J. Sakrison.

5512: \newblock The rate of a class of random processes.

5513: \newblock {\em IEEE Trans. Inform. Theory}, IT-16:10--16, 1970.

5514:

5515: \bibitem{shannon:48}

5516: C.E. Shannon.

5517: \newblock A mathematical theory of communication.

5518: \newblock {\em Bell System Tech. J.}, 27:379--423, 623--656, 1948.

5519:

5520: \bibitem{shannon:59}

5521: C.E. Shannon.

5522: \newblock Coding theorems for a discrete source with a fidelity criterion.

5523: \newblock {\em IRE Nat. Conv. Rec.}, part~4:142--163, 1959.

5524: \newblock Reprinted in D. Slepian (ed.), {\em Key Papers in the Development of

5525:   Information Theory}, IEEE Press, 1974.

5526:

5527: \bibitem{shields:3}

5528: P.C. Shields.

5529: \newblock Waiting times: {P}ositive and negative results on the {W}yner-{Z}iv

5530:   problem.

5531: \newblock {\em J. Theoret. Probab.}, 6(3):499--519, 1993.

5532:

5533: \bibitem{steinberg-gutman}

5534: Y.~Steinberg and M.~Gutman.

5535: \newblock An algorithm for source coding subject to a fidelity criterion, based

5536:   upon string matching.

5537: \newblock {\em IEEE Trans. Inform. Theory}, 39(3):877--886, 1993.

5538:

5539: \bibitem{wijsman}

5540: R.A. Wijsman.

5541: \newblock Convergence of sequences of convex sets, cones and functions.

5542: \newblock {\em Bull. Amer. Math. Soc.}, 70:186--188, 1964.

5543:

5544: \bibitem{winkler:book}

5545: G.~Winkler.

5546: \newblock {\em Image Analysis, Random Fields and Dynamic Monte Carlo Methods: A

5547:   Mathematical Introduction}.

5548: \newblock Springer-Verlag, Berlin, 1995.

5549:

5550: \bibitem{wyner-ziv:71}

5551: A.D. Wyner and J.~Ziv.

5552: \newblock Bounds on the rate-distortion function for stationary sources with

5553:   memory.

5554: \newblock {\em IEEE Trans. Inform. Theory}, IT-17:508--513, 1971.

5555:

5556: \bibitem{wyner-ziv:1}

5557: A.D. Wyner and J.~Ziv.

5558: \newblock Some asymptotic properties of the entropy of a stationary ergodic

5559:   data source with applications to data compression.

5560: \newblock {\em IEEE Trans. Inform. Theory}, 35(6):1250--1258, 1989.

5561:

5562: \bibitem{wyner-ziv:3}

5563: A.D. Wyner and J.~Ziv.

5564: \newblock Fixed data base version of the {L}empel-{Z}iv data compression

5565:   algorithm.

5566: \newblock {\em IEEE Trans. Inform. Theory}, 37(3):878--880, 1991.

5567:

5568: \bibitem{wyner-ziv:2}

5569: A.D. Wyner and J.~Ziv.

5570: \newblock The sliding-window {L}empel-{Z}iv algorithm is asymptotically

5571:   optimal.

5572: \newblock {\em Proc. IEEE}, 82(6):872--877, 1994.

5573:

5574: \bibitem{wyner-ziv-wyner}

5575: A.D. Wyner, J.~Ziv, and A.J. Wyner.

5576: \newblock On the role of pattern matching in information theory. ({I}nformation

5577:   theory: 1948--1998).

5578: \newblock {\em IEEE Trans. Inform. Theory}, 44(6):2045--2056, 1998.

5579:

5580: \bibitem{yang-kieffer:1}

5581: E.-h. Yang and J.C. Kieffer.

5582: \newblock On the performance of data compression algorithms based upon string

5583:   matching.

5584: \newblock {\em IEEE Trans. Inform. Theory}, 44(1):47--65, 1998.

5585:

5586: \bibitem{yang-zhang:III}

5587: E.-h. Yang and Z.~Zhang.

5588: \newblock The redundancy of source coding with a fidelity criterion --

5589:   {P}art~{III}: {C}oding at a fixed distortion level with unknown statistics.

5590: \newblock {\em Preprint}.

5591:

5592: \bibitem{yang-zhang:99}

5593: E.-h. Yang and Z.~Zhang.

5594: \newblock On the redundancy of lossy source coding with abstract alphabets.

5595: \newblock {\em IEEE Trans. Inform. Theory}, 45(4):1092--1110, 1999.

5596:

5597: \bibitem{yang-zhang:99c}

5598: E.-h. Yang and Z.~Zhang.

5599: \newblock The shortest common superstring problem: average case analysis for

5600:   both exact and approximate matching.

5601: \newblock {\em IEEE Trans. Inform. Theory}, 45(6):1867--1886, 1999.

5602:

5603: \bibitem{yang-zhang:II}

5604: E.-h. Yang and Z.~Zhang.

5605: \newblock The redundancy of source coding with a fidelity criterion --

5606:   {P}art~{II}: {C}oding at a fixed rate level with unknown statistics.

5607: \newblock {\em IEEE Trans. Inform. Theory}, 47(1):126--145, 2001.

5608:

5609: \bibitem{ye-berger:book}

5610: Z.~Ye and T.~Berger.

5611: \newblock {\em Information Measures for Discrete Random Fields}.

5612: \newblock Science Press, Beijing, 1998.

5613:

5614: \bibitem{zhang-yang-wei:I}

5615: Z.~Zhang, E.-h. Yang, and V.K. Wei.

5616: \newblock The redundancy of source coding with a fidelity criterion --

5617:   {P}art~{I}: {K}nown statistics.

5618: \newblock {\em IEEE Trans. Inform. Theory}, 43(1):71--91, 1997.

5619:

5620: \bibitem{ziv-lempel:1}

5621: J.~Ziv and A.~Lempel.

5622: \newblock A universal algorithm for sequential data compression.

5623: \newblock {\em IEEE Trans. Inform. Theory}, 23(3):337--343, 1977.

5624:

5625: \bibitem{ziv-lempel:2}

5626: J.~Ziv and A.~Lempel.

5627: \newblock Compression of individual sequences by variable rate coding.

5628: \newblock {\em IEEE Trans. Inform. Theory}, 24(5):530--536, 1978.

5629:

5630: \end{thebibliography}

5631:

5632: % \bibliography{ik}

5633: % \bibliography{/users/yiannis/latex/ik}

5634: % \bibliography{../latex/ik}

5635: % \bibliography{/users/yiannis/latex/ik}

5636: % \bibliography{/home/mean/u21/yiannis/latex/ik}

5637: % \bibliography{/v0/yiannis/latex/ik}

5638: % \bibliography{/sccm0/yiannis/latex/ik}

5639: % \bibliography{/tmp_mnt/home-georgep/yiannis/latex/ik}

5640:

5641: \end{document}

5642:

5643: