math0103007/TR.tex
1: \documentclass[11pt]{article}
2: \usepackage{latexsym}
3: \usepackage{amssymb}
4: 
5: \setlength{\oddsidemargin}{-5mm}   % --- page geometry: shift the text block 5mm to the left on odd pages
6: \setlength{\evensidemargin}{-5mm}  % same left shift on even pages
7: \setlength{\topmargin}{-1cm}       % move the top of the page body up by 1cm
8: \setlength{\textheight}{22.5cm}    % height of the text block
9: \setlength{\textwidth}{17cm}       % width of the text block
10: 
11: % \renewcommand{\baselinestretch}{1.2}  % double space
12: % \input paper_defns.tex
13: % \input{/users/yiannis/latex/paper_defns}
14: % \input{../latex/paper_defns}
15: % \input{/users/yiannis/latex/paper_defns}
16: % \input{/home/mean/u21/yiannis/latex/paper_defns}
17: % \input{/v0/yiannis/latex/paper_defns}
18: % \input{/sccm0/yiannis/latex/paper_defns}
19: % \input{/tmp_mnt/home-georgep/yiannis/latex/paper_defns}
20: 
21: \def\proof{{\sc Proof. }}  % proof heading: "Proof. " in small caps; the inner braces confine the font switch
22: \def\qed{\hfill$\Box$}     % end-of-proof mark: \hfill pushes a \Box symbol to the end of the line
23: 
24: \newcommand{\Qp}{\mbox{\boldmath $Q$}}  % --- bold math letters: \boldmath inside an \mbox, so each macro works in text or math mode
25: \newcommand{\Xp}{\mbox{\boldmath $X$}}  % the text writes the source process as \Xp
26: \newcommand{\Xb}{\mbox{\bf X}}  % upright bold text X (uses \bf, not \boldmath)
27: \newcommand{\Xps}{\mbox{\scriptsize\boldmath $X$}}  % \Xp at a fixed \scriptsize size
28: \newcommand{\xp}{\mbox{\boldmath $x$}}
29: \newcommand{\Xtp}{\mbox{\boldmath $\tilde{X}$}}  % bold X with a tilde accent
30: \newcommand{\xinp}{\mbox{\boldmath $\xi_n$}}  % bold xi with subscript n
31: \newcommand{\Yp}{\mbox{\boldmath $Y$}}  % the text writes the second ("codebook") process as \Yp
32: \newcommand{\yp}{\mbox{\boldmath $y$}}
33: \newcommand{\Zp}{\mbox{\boldmath $Z$}}
34: \newcommand{\orig}{{\bf 0}} % bold zero; the inner braces keep the \bf font switch local so it cannot leak into whatever follows \orig
35: \newcommand{\Equiv}{\mbox{$\Leftrightarrow$}}  % --- relation symbols, each boxed with \mbox so it can be used in text or math mode
36: \newcommand{\weakly}{\mbox{$ \;\stackrel{\cal D}{\longrightarrow}\; $}}  % long right arrow with a calligraphic D stacked on top
37: \newcommand{\equald}{\mbox{$ \;\stackrel{\cal D}{=}\; $}}  % equals sign with a calligraphic D stacked on top
38: \newcommand{\bydef}{\mbox{$ \;\stackrel{\triangle}{=}\; $}}  % equals sign with a triangle on top; the text uses it for "is defined as"
39: \newcommand{\home}{\mbox{$\!\mathtt{\sim}$}}  % a \sim preceded by a negative thin space; used in a web address in the author footnote
40: \newcommand{\eqexp}{\mbox{$ \;\stackrel{\cdot}{=}\; $}}  % equals sign with a centered dot stacked on top
41: \newcommand{\leqa}{\mbox{$ \;\stackrel{(a)}{\leq}\; $}}  % --- labelled relations: a relation sign with a step label "(a)", "(b)", ... stacked on top, padded by \; on both sides
42: \newcommand{\leqb}{\mbox{$ \;\stackrel{(b)}{\leq}\; $}}
43: \newcommand{\leqc}{\mbox{$ \;\stackrel{(c)}{\leq}\; $}}
44: \newcommand{\leqd}{\mbox{$ \;\stackrel{(d)}{\leq}\; $}}
45: \newcommand{\leqe}{\mbox{$ \;\stackrel{(e)}{\leq}\; $}}
46: \newcommand{\geqa}{\mbox{$ \;\stackrel{(a)}{\geq}\; $}}  % same pattern for \geq
47: \newcommand{\geqb}{\mbox{$ \;\stackrel{(b)}{\geq}\; $}}
48: \newcommand{\geqc}{\mbox{$ \;\stackrel{(c)}{\geq}\; $}}
49: \newcommand{\geqd}{\mbox{$ \;\stackrel{(d)}{\geq}\; $}}
50: \newcommand{\geqe}{\mbox{$ \;\stackrel{(e)}{\geq}\; $}}
51: \newcommand{\eqa}{\mbox{$ \;\stackrel{(a)}{=}\; $}}  % same pattern for =, labels (a) through (l)
52: \newcommand{\eqb}{\mbox{$ \;\stackrel{(b)}{=}\; $}}
53: \newcommand{\eqc}{\mbox{$ \;\stackrel{(c)}{=}\; $}}
54: \newcommand{\eqd}{\mbox{$ \;\stackrel{(d)}{=}\; $}}
55: \newcommand{\eqe}{\mbox{$ \;\stackrel{(e)}{=}\; $}}
56: \newcommand{\eqf}{\mbox{$ \;\stackrel{(f)}{=}\; $}}
57: \newcommand{\eqg}{\mbox{$ \;\stackrel{(g)}{=}\; $}}
58: \newcommand{\eqh}{\mbox{$ \;\stackrel{(h)}{=}\; $}}
59: \newcommand{\eqi}{\mbox{$ \;\stackrel{(i)}{=}\; $}}
60: \newcommand{\eqj}{\mbox{$ \;\stackrel{(j)}{=}\; $}}
61: \newcommand{\eqk}{\mbox{$ \;\stackrel{(k)}{=}\; $}}
62: \newcommand{\eql}{\mbox{$ \;\stackrel{(\ell)}{=}\; $}}  % label is typeset with \ell rather than a plain "l"
63: \newcommand{\approxa}{\mbox{$ \;\stackrel{(a)}{\approx}\; $}}  % same pattern for \approx
64: \newcommand{\approxb}{\mbox{$ \;\stackrel{(b)}{\approx}\; $}}
65: \newcommand{\approxc}{\mbox{$ \;\stackrel{(c)}{\approx}\; $}}
66: \newcommand{\approxd}{\mbox{$ \;\stackrel{(d)}{\approx}\; $}}
67: \newcommand{\appeq}{\mbox{$ \stackrel{\cdot}{=} $}}  % dotted equals; same symbol as \eqexp but without the \; padding
68: \newcommand{\appleq}{\mbox{$ \stackrel{\mathbf{\cdot}}{\leq} $}}  % \leq with a dot stacked on top
69: \newcommand{\subD}{_{{}_D}}  % subscript D nested as the subscript of an empty subscript, i.e. set one level smaller and lower
70: \newcommand{\subDi}{_{{}_{D_i}}}  % same construction with D_i
71: \newcommand{\RL}{{\mathbb R}}  % --- blackboard-bold letters (\mathbb comes from amssymb); the text uses \RL for the reals
72: \newcommand{\NN}{{\mathbb N}}
73: \newcommand{\IN}{{\mathbb Z}}  % note: \IN typesets blackboard Z; the text uses it as the index set of the processes
74: \newcommand{\RN}{{\mathbb Q}}  % note: \RN typesets blackboard Q, the same output as \BBQ below
75: \newcommand{\IND}{{\mathbb I}}
76: \newcommand{\BBP}{{\mathbb P}}  % the text uses \BBP for the distribution of the whole process \Xp
77: \newcommand{\BBQ}{{\mathbb Q}}  % the text uses \BBQ for the distribution of the whole process \Yp
78: \newcommand{\BBM}{{\mathbb M}}
79: \newcommand{\Ind}{\mbox{\rm I$\!$I}}  % two upright I's pulled together by a negative thin space
80: \newcommand{\PB}{\mbox{\boldmath $P$}} % --- bold P and Q symbols, then upright operator names
81: \newcommand{\QB}{\mbox{\boldmath $Q$}} % same output as \Qp
82: \newcommand{\QBn}{\mbox{\boldmath $Q_n$}} % bold Q with subscript n
83: \newcommand{\QTn}{\mbox{$\widetilde{Q}_n$}} % Q under a wide tilde, subscript n (not bold)
84: \newcommand{\PR}{\mbox{\rm Pr}} % upright "Pr"
85: \newcommand{\VAR}{\mbox{\rm Var}} % upright "Var"
86: \newcommand{\COV}{\mbox{\rm Cov}} % upright "Cov"
87: \newcommand{\signs}{\mbox{\scriptsize sign}}  % the word "sign" at script size
88: \newcommand{\essinf}{\mathop{\rm ess\, inf}}  % "ess inf" as a \mathop, so it takes limits like a large operator
89: \newcommand{\esssup}{\mathop{\rm ess\, sup}}  % "ess sup", same construction
90: \newcommand{\Ahat}{\mbox{$\hat{A}$}}  % --- hatted symbols; the text uses \Ahat for the reproduction alphabet
91: \newcommand{\Ahatn}{\mbox{$\hat{A}^n$}}  % n-th power of \Ahat
92: \newcommand{\Ahatnsq}{\mbox{$\hat{A}^{n^2}$}}
93: \newcommand{\Ahatk}{\mbox{$\hat{A}^k$}}
94: \newcommand{\Ahatnd}{\mbox{$\hat{A}^{n^d}$}}
95: \newcommand{\ahat}{\mbox{$\hat{a}$}}
96: \newcommand{\Ahats}{\mbox{\scriptsize $\hat{A}$}}  % \Ahat at a fixed \scriptsize size
97: \newcommand{\ahats}{\mbox{\scriptsize $\hat{a}$}}  % \ahat at a fixed \scriptsize size
98: \newcommand{\Nhat}{\hat{N}}  % math-mode only (no \mbox wrapper)
99: \newcommand{\hatN}{\hat{N}}  % second name for the same symbol as \Nhat
100: \newcommand{\Phatn}{\mbox{$\hat{P}_n$}}
101: \newcommand{\phatn}{\mbox{\scriptsize$\hat{P}_n$}}  % \Phatn at \scriptsize
102: \newcommand{\sphatn}{\mbox{\tiny$\hat{P}_n$}}  % \Phatn at \tiny
103: \newcommand{\xhat}{\mbox{$\hat{x}$}}
104: \newcommand{\Xhat}{\mbox{$\hat{X}$}}
105: \newcommand{\yhat}{\mbox{$\hat{y}$}}
106: \newcommand{\calH}{\mbox{${\cal H}$}}  % --- calligraphic letters, boxed for use in text or math mode
107: \newcommand{\calLn}{\mbox{${\cal L}_n$}}  % calligraphic L with subscript n
108: \newcommand{\calM}{\mbox{${\cal M}$}}
109: \newcommand{\calR}{\mbox{${\cal R}$}}
110: \newcommand{\calX}{\mbox{${\cal X}$}}
111: \newcommand{\calXhat}{\mbox{$\hat{\cal X}$}}  % calligraphic X under a hat
112: \newcommand{\Dmin}{\mbox{$D_{\rm min}$}}  % --- symbols with upright "min"/"max"/"av" subscripts; \Dmin is defined in the text at eq. (eq:Dmin)
113: \newcommand{\Dinf}{\mbox{$D_{\rm min}^{(\infty)}$}}  % D_min with superscript (infinity)
114: \newcommand{\rhomin}{\mbox{$\rho_{\rm min}$}}
115: \newcommand{\rhomax}{\mbox{$\rho_{\rm max}$}}
116: \newcommand{\Lmax}{\mbox{$L_{\rm max}$}}
117: \newcommand{\Dmax}{\mbox{$D_{\rm max}$}}
118: \newcommand{\Dbar}{\mbox{$\overline{D}$}}  % D under an overline
119: \newcommand{\dmax}{\mbox{$d_{\rm max}$}}
120: \newcommand{\Dmaxs}{\mbox{\scriptsize $D_{\rm max}$}}  % \Dmax at a fixed \scriptsize size
121: \newcommand{\Dav}{\mbox{$D_{\rm av}$}}  % defined in the text at eq. (eq:Dav)
122: \newcommand{\Dminn}{\mbox{$D_{\rm min}^{(n)}$}}  % variants carrying a superscript index (n), (k), (1)
123: \newcommand{\Dmink}{\mbox{$D_{\rm min}^{(k)}$}}
124: \newcommand{\Dminone}{\mbox{$D_{\rm min}^{(1)}$}}
125: \newcommand{\Dmaxn}{\mbox{$D_{\rm max}^{(n)}$}}
126: \newcommand{\Davn}{\mbox{$D_{\rm av}^{(n)}$}}
127: \newcommand{\DminP}{\mbox{$D_{\rm min}^{P,Q}$}}  % variants carrying the pair of measures as a superscript
128: \newcommand{\DmaxP}{\mbox{$D_{\rm max}^{P,Q}$}}
129: \newcommand{\Dminmu}{\mbox{$D_{\rm min}^{\mu,\nu}$}}
130: \newcommand{\Dmaxmu}{\mbox{$D_{\rm max}^{\mu,\nu}$}}
131: \newcommand{\Dminmun}{\mbox{$D_{\rm min}^{\mu_n,\nu_n}$}}
132: \newcommand{\Dmaxmun}{\mbox{$D_{\rm max}^{\mu_n,\nu_n}$}}
133: \newcommand{\Davmu}{\mbox{$D_{\rm av}^{\mu,\nu}$}}
134: \newcommand{\Rmin}{\mbox{$R_{\rm min}$}}
135: \newcommand{\LA}{\mbox{$\Lambda$}}  % --- capital/lower-case lambda variants
136: \newcommand{\Lbar}{\mbox{$\bar{\Lambda}$}}  % capital Lambda under a bar
137: % \newcommand{\la}{\mbox{$\lambda$}}
138: \newcommand{\lab}{\mbox{$\bar{\lambda}$}}  % lambda under a bar
139: \newcommand{\las}{\mbox{\scriptsize$\lambda$}}  % lambda at a fixed \scriptsize size
140: \newcommand{\iid}{\mbox{i.i.d.}\!}  % abbreviation "i.i.d." followed by a negative thin space; \! is math-mode spacing and the text writes $\iid$
141: \newcommand{\psipm}{\psi^{\pm}}  % psi with a plus-minus superscript (math mode only)
142: \newcommand{\limnd}{\lim_{ % limit with two stacked conditions underneath: n -> infinity over D decreasing to 0
143:   \mbox{\scriptsize
144:     $\begin{array}{c}
145:       n\to\infty\\
146:       D\downarrow 0
147:     \end{array}$
148:   }
149: }
150: }
151: \newcommand{\limsupnd}{\limsup_{ % limsup with the same two stacked conditions
152:   \mbox{\scriptsize
153:     $\begin{array}{c}
154:       n\to\infty\\
155:       D\downarrow 0
156:     \end{array}$
157:   }
158: }
159: }
160: \newcommand{\liminfnd}{\liminf_{ % liminf with the same two stacked conditions
161:   \mbox{\scriptsize
162:     $\begin{array}{c}
163:       n\to\infty\\
164:       D\downarrow 0
165:     \end{array}$
166:   }
167: }
168: }
169: \newcommand{\argmin}{\mathop{\rm arg\, min}}  % "arg min" as a \mathop, so it takes limits like a large operator
170: 
171: \newcommand{\la}{\lambda}  % plain math-mode lambda; an \mbox version of \la is commented out earlier in the preamble
172: 
173: \def\be{\begin{eqnarray}}   % --- shorthands: \be ... \ee wrap a numbered eqnarray display
174: \def\ee{\end{eqnarray}}
175: \def\ben{\begin{eqnarray*}} % \ben ... \een wrap the unnumbered (starred) eqnarray
176: \def\een{\end{eqnarray*}}
177: 
178: 
179: \input{epsf}
180: 
181: \title{Source Coding, Large Deviations,\\
182: and Approximate Pattern Matching}
183: 
184: \author{A. Dembo \and I. Kontoyiannis}
185: 
186: \date{\today}
187: 
188: \begin{document}
189: \bibliographystyle{plain}
190: \maketitle
191: 
192: \thispagestyle{empty}
193: \setcounter{page}{-2}
194: 
195: \footnotetext[1]{
196: A. Dembo is with
197: the Departments of
198: Mathematics and of
199: Statistics,
200: Stanford University,
201: Stanford, CA 94305.
202: Email: {\tt amir@stat.stanford.edu}
203: Web: {\tt www-stat.stanford.edu/\home amir}
204: }
205:  
206: \footnotetext[2]{
207: I.\ Kontoyiannis is with the Division
208: of Applied Mathematics, Brown University,
209: Box F, 182 George St., Providence, RI 02912, USA.
210: Email: {\tt yiannis@dam.brown.edu}
211: Web: {\tt www.dam.brown.edu/people/yiannis/}
212: [Permanent address:
213: Department of Statistics,
214: Purdue University,
215: 1399 Mathematical Sciences Building,
216: W.~Lafayette, IN 47907-1399, USA.]
217: }
218:  
219: \footnotetext[3]{
220: Amir Dembo was supported in part
221: by NSF grant \#DMS-0072331.
222: I. Kontoyiannis was supported in part
223: by NSF grant \#0073378-CCR.}
224: 
225: \bigskip
226: 
227: %Y changed dedication as we agreed
228: % \centerline{\it Dedicated to the memory of Aaron Wyner, a valuable
229: % friend and colleague.}
230: 
231: % Dedicated to the memory of a dear 
232: % friend and colleague, Aaron Wyner.
233: 
234: % \bigskip
235: 
236: \newpage
237: 
238: {\bf Abstract --- }
239: We present a development of parts of rate-distortion 
240: theory and pattern-matching algorithms for lossy data 
241: compression, centered around a lossy version of the 
242: Asymptotic Equipartition Property (AEP). This treatment 
243: closely parallels the corresponding development in 
244: lossless compression, a point of view that was advanced 
245: in an important paper of Wyner and Ziv in 1989. 
246: In the lossless case we review how the AEP underlies 
247: the analysis of the Lempel-Ziv algorithm by viewing it 
248: as a random code and reducing it to the idealized 
249: Shannon code. This also provides 
250: information about the redundancy of the Lempel-Ziv 
251: algorithm and about the asymptotic behavior of 
252: several relevant quantities. 
253: 
254: In the lossy case we 
255: give various versions of the statement of the 
256: generalized AEP and we outline the general 
257: methodology of its proof via large deviations. 
258: Its relationship with Barron and Orey's
259: generalized AEP is 
260: also discussed.
261: The lossy AEP is applied to: (i)~prove strengthened 
262: versions of Shannon's direct source coding theorem 
263: and universal coding theorems;
264: (ii)~characterize the performance of
265: ``mismatched'' codebooks in lossy
266: data compression;
267: (iii)~analyze the performance of 
268: pattern-matching algorithms for lossy 
269: compression (including Lempel-Ziv schemes); 
270: (iv)~determine the first order asymptotics 
271: of waiting times (with distortion) between 
272: stationary processes; (v)~characterize the 
273: best achievable rate of ``weighted'' 
274: codebooks as an optimal sphere-covering 
275: exponent. We then present a refinement to 
276: the lossy AEP and use it to: (i)~prove second order 
277: (direct and converse) lossy source coding theorems,
278: including universal coding theorems; (ii)~characterize 
279: which sources are quantitatively easier to compress;
280: (iii)~determine the second order
281: asymptotics of waiting times between
282: stationary processes;
283: (iv)~determine the precise asymptotic
284: behavior of longest match-lengths 
285: between stationary processes.
286: Extensions to random fields are also given.
287: 
288: \medskip
289: 
290: {\bf Index Terms --- } Rate-distortion theory, 
291: pattern-matching, large deviations, 
292: data compression.
293: 
294: \bigskip
295: 
296: %%% TOC COMMAND
297: % \newpage
298: \tableofcontents
299: 
300: \newpage 
301: \section{Introduction}
302: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
303: 
304: \subsection{Lossless Data Compression}
305: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
306: It is probably only a slight exaggeration to 
307: say that the central piece of mathematics
308: in the proof of almost any lossless coding
309: theorem is provided by the
310: Asymptotic Equipartition Property, or AEP.
311: Suppose we want to (losslessly) 
312: compress a message 
313: $X_1^n=(X_1,X_2,\ldots,X_n)$
314: generated by a stationary 
315: memoryless source $\Xp=\{X_n\;;\;n\geq 1\}$
316: where each $X_i$ takes values in the 
317: finite alphabet $A$
318: (much more general situations will be considered
319: later). For this source, the AEP states that
320: as $n\to\infty$
321: \be
322: -\frac{1}{n}\log_2 P^n(X_1^n)\to H
323: \;\;\;\;\mbox{in probability}
324: \label{eq:shannonAEP}
325: \ee
326: where $P$ is the common distribution of the 
327: independent and identically distributed ($\iid$) 
328: random variables $X_i$, $P^n$ denotes the 
329: (product) joint distribution of $X_1^n$, and 
330: $H=E[-\log_2 P(X_1)]$ is
331: the entropy rate of the source --
332: see Shannon's original paper
333: \cite[Theorem~3]{shannon:48} or 
334: Cover and Thomas' text \cite[Chapter~4]{cover:book}.
335: [Here and throughout the paper, 
336: $\log_2$ denotes the logarithm taken
337: to base 2, and $\log$ denotes
338: the natural logarithm.]
339: From (\ref{eq:shannonAEP}) we can 
340: immediately extract some useful 
341: information: It implies that 
342: when $n$ is large the message 
343: $X_1^n$ will most likely have 
344: probability at least as high 
345: as $2^{-n(H+\epsilon)}$:
346: \be
347: P^n(X_1^n)\geq 2^{-n(H+\epsilon)}
348: 	\;\;\;\;\mbox{with high probability.}
349: \label{eq:tocompare}
350: \ee
351: But there cannot be many high-probability messages.
352: In fact, there can be at most $2^{n(H+\epsilon)}$
353: messages with $P^n(X_1^n)\geq 2^{-n(H+\epsilon)}$,
354: so we need approximately $2^{nH}$ representative
355: messages from the source $\Xp$ in order to cover
356: our bets (with high probability). 
357: If we let ${\cal T}_n$ be the 
358: set of high-probability
359: strings $x_1^n\in A^n$ having 
360: $P^n(x_1^n)\geq 2^{-n(H+\epsilon)}$,
361: then with high probability we can
362: correctly represent the source output
363: $X_1^n$ by an element of ${\cal T}_n$.
364: Since there are no more than 
365: $2^{n(H+\epsilon)}$ of them, 
366: we need no more than $\lceil n(H+\epsilon)\rceil$ bits
367: to correctly encode $X_1^n$.
368: 
369: \paragraph{Shannon's Random Code.}
370: Another way to extract information
371: from (\ref{eq:shannonAEP}) is as follows. 
372: The fact that for large $n$ we typically have
373: $P^n(X_1^n)\approx 2^{-nH}$ also means that
374: if we independently generate another random
375: string, say $Y_1^n$, from the same distribution
376: as the source, the probability that $X_1^n$
377: is the same as $Y_1^n$ is about $2^{-nH}$.
378: Suppose that instead of using the strings in
379: ${\cal T}_n$ above as our representatives
380: for the source, we decided to independently
381: generate a collection of random strings 
382: $Y_1^n$ from the distribution $P^n$; how
383: many would we need? Given a source string
384: $X_1^n$, the probability that any one
385: of the $Y_1^n$ matches it is 
386: $\approx 2^{-nH}$, so in order to
387: have high probability of success
388: in representing $X_1^n$ without error
389: we should choose approximately
390: $2^{n(H+\epsilon)}$ random strings $Y_1^n$.
391: Therefore, whether we choose the set of
392: representatives systematically or
393: randomly, we always need about
394: $2^{nH}$ strings in order to be able
395: to encode $X_1^n$ losslessly with high
396: probability. Note that the randomly
397: generated set ${\cal T}_n$ is nothing
398: but Shannon's random codebook
399: \cite{shannon:59} specialized to the
400: case of lossless compression.
401: 
402: \paragraph{Idealized Lempel-Ziv Coding.}
403: In 1989, in a very influential paper 
404: \cite{wyner-ziv:1}, Wyner and Ziv took 
405: the above argument several steps further. 
406: Aiming to ``obtain 
407: insight into the workings of [...] the 
408: Lempel-Ziv data compression algorithm,'' 
409: they considered the following coding 
410: scenario: Suppose that an encoder and a
411: decoder both have available to them
412: a long database, say an infinitely 
413: long string $Y_1^\infty=(Y_1,Y_2,\ldots)$
414: that is independently generated from 
415: the same distribution as the source. 
416: Given a source string $X_1^n$ to 
417: be transmitted, the
418: encoder looks for the
419: first appearance of $X_1^n$ in the
420: database (assuming, for now,
421: that it does appear somewhere).
422: Let $W$ denote the position of
423: this first appearance, that is,
424: let $W$ be the smallest integer
425: for which 
426: $Y_W^{W+n-1}=(Y_W,Y_{W+1},\ldots,Y_{W+n-1})$
427: is equal to $X_1^n$.
428: Then all the encoder has to do is 
429: to tell the decoder the value of 
430: $W$; the decoder can read off the string 
431: $Y_W^{W+n-1}$ and recover $X_1^n$ 
432: perfectly. This description can be
433: given using
434: (cf. \cite{elias}\cite{wyner-ziv:2})
435: no more than 
436: \be
437: \ell(X_1^n)=\log_2 W + O(\log_2\log_2 W)
438: 	\;\;\;\;\mbox{bits.}
439: \label{eq:elias1}
440: \ee
441: 
442: How good is this scheme? 
443: First note that, for any given source
444: string $X_1^n$, the random variable
445: $W$ records the first ``success'' in a 
446: sequence of trials (``Is $Y_1^n=X_1^n$?,''
447: ``Is $Y_2^{n+1}=X_1^n$?,''
448: and so on),
449: each of which has probability
450: of success $p=P^n(X_1^n)$. Although
451: these trials are not independent,
452: for large $n$ they are almost independent
453: (in a sense that 
454: will be made precise below), so the 
455: distribution of $W$ is close to 
456: a geometric with parameter 
457: $p=P^n(X_1^n)$.
458: For long strings $X_1^n$ (i.e., for
459: large $n$) $p$ is small,
460: and $W$ is typically close to its 
461: expected value, which is approximately 
462: equal to the mean of a geometric
463: random variable
464: with parameter $p$, namely $1/p$. But the 
465: AEP tells us that, when $n$ is large,
466: $p=P^n(X_1^n)\approx 2^{-nH}$, so we 
467: expect $W$ to  be typically around $2^{nH}$.
468: Hence, from (\ref{eq:elias1}) the 
469: description length $\ell(X_1^n)$
470: of $X_1^n$ will be, to first order,
471: $$\ell(X_1^n)
472: \approx -\log_2 P^n(X_1^n) 
473: \approx nH 
474: 	\;\;\;\mbox{bits, with high probability.}$$
475: This shows that the above scheme is asymptotically
476: optimal, in that its limiting compression ratio 
477: is equal to the entropy.
478: 
479: \paragraph{Practical Lempel-Ziv Coding.}
480: The Lempel-Ziv algorithm 
481: \cite{ziv-lempel:1}\cite{ziv-lempel:2} and
482: its many variants (see, e.g.,  
483: \cite[Ch.~8]{bell:cleary:witten}) are some
484: of the most successful data compression
485: algorithms used in practice. Roughly speaking, 
486: the main idea behind these algorithms is to 
487: use the message's own past as a database 
488: for future encoding. Instead of looking
489: for the first match in an infinitely long
490: database, in practice the encoder looks
491: for the longest match in a database of
492: fixed length. The analysis in 
493: \cite{wyner-ziv:1} of the idealized 
494: scheme described above was the first 
495: step in providing a probabilistic 
496: justification for the optimality
497: of the actual practical algorithms. 
498: Subsequently, in \cite{wyner-ziv:3} 
499: and \cite{wyner-ziv:2} Wyner and Ziv
500: established the asymptotic optimality
501: of the Sliding-Window (SWLZ) and
502: the Fixed-Database (FDLZ) versions
503: of the algorithm.
504: 
505: 
506: \subsection{Lossy Data Compression}
507: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
508: 
509: A similar development to the one outlined
510: above can be given in the case of lossy
511: data compression, this time centered around
512: a lossy analog of the AEP \cite{my:thesis}. 
513: To motivate this
514: discussion we look at Shannon's original 
515: random coding proof of the (direct) lossy 
516: source coding theorem \cite{shannon:59}. 
517: 
518: \paragraph{Shannon's Random Code.}
519: Suppose we want to describe the output 
520: $X_1^n$ of a memoryless source,
521: with distortion $D$ or less with respect 
522: to a family of single-letter distortion 
523: measures $\{\rho_n\}$. 
524: Let $Q_n^*$ be the optimum reproduction 
525: distribution on $\Ahatn$, where 
526: $\Ahat$ is the reproduction alphabet. 
527: Shannon's random coding 
528: argument says that we should 
529: construct a codebook ${\cal T}_n$ 
530: of $2^{n(R(D)+\epsilon)}$ codewords 
531: $Y_1^n$ generated $\iid$ from $Q_n^*$,
532: where $R(D)$ is the rate-distortion
533: function of the source (in bits).
534: The proof that $2^{n(R(D)+\epsilon)}$ codewords 
535: indeed suffice is based on the following 
536: result, Lemma~1 in \cite{shannon:59}.
537: 
538: \medskip
539: 
540: {\em Shannon's ``Lemma~1'':}
541: For $x_1^n\in A^n$ let $B(x_1^n,D)$ denote 
542: the distortion-ball of radius $D$ around
543: $x_1^n$, i.e., the collection of all
544: reproduction strings $y_1^n\in\Ahatn$ with
545: $\rho_n(x_1^n,y_1^n)\leq D$.
546: When $n$ is large:\footnote{The
547: notation in Shannon's statement is
548: slightly different, and he considers
549: the more general case of ergodic sources. 
550: For the sake of clarity we restrict
551: attention here to the $\iid$ case.}
552: \be
553: Q_n^*(B(X_1^n,D))\geq 2^{-n(R(D)+\epsilon)}
554: 	\;\;\;\;\mbox{with high probability.}
555: \label{eq:lemma1}
556: \ee
557: 
558: \medskip
559: 
560: In the proof of the coding theorem
561: this 
562: lemma plays the same role that
563: the AEP played in the lossless case;
564: notice the similarity between 
565: (\ref{eq:lemma1}) and its analog 
566: (\ref{eq:tocompare}) in the lossless case. 
567: 
568: Let's fix a source string $X_1^n$ to 
569: be encoded. The probability that $X_1^n$ 
570: matches any one of the codewords 
571: $Y_1^n$ in ${\cal T}_n$ is
572: $$\Pr\{\rho_n(X_1^n,Y_1^n)\leq D\,|\,X_1^n\}
573: =
574: \Pr\{Y_1^n\in B(X_1^n,D)\,|\,X_1^n\}
575: =
576: Q_n^*(B(X_1^n,D))$$
577: and by the lemma this probability is 
578: at least $2^{-n(R(D)+\epsilon)}$.
579: Therefore, with $2^{n(R(D)+\epsilon)}$
580: independent codewords to choose from,
581: we have a good chance for finding
582: a match with distortion $D$ or less.
583: 
584: \paragraph{Generalized AEP and Applications.}
585: A stronger and more general version
586: of Lemma~1 will be our starting point
587: in this paper. In the following section 
588: we will prove a {\em generalized AEP}:
589: For any product measure 
590: $Q^n$ on $\Ahatn$
591: \be
592: -\frac{1}{n}\log Q^n(B(X_1^n,D)) \to R_1(P,Q,D)
593: 	\;\;\;\;\mbox{w.p.1}
594: \label{eq:firstDAEP}
595: \ee
596: where 
597: $R_1(P,Q,D)$ is a (non-random) function
598: of the distributions $P$ and $Q$ and
599: of the distortion level $D$. 
600: [We will later prove several
601: variants of (\ref{eq:firstDAEP})
602: under much weaker assumptions.]
603: 
604: Like the AEP in the lossless case,
605: the generalized AEP and its refinements
606: find numerous applications in data 
607: compression, universal data compression, 
608: and in general pattern-matching questions.
609: Many of these applications were inspired
610: by the treatment in Wyner and Ziv's 1989
611: paper \cite{wyner-ziv:1}.  A (very
612: incomplete) sample of subsequent
613: work in the Wyner-Ziv spirit
614: includes the papers
615: 	\cite{steinberg-gutman}\cite{luczak-szpankowski}\cite{
616: 	yang-kieffer:1}\cite{kontoyiannis-lossy1-1}
617: 	on lossy data compression, 
618: 	and
619: 	\cite{luczak-szpankowski}\cite{
620: 	dembo-kontoyiannis}\cite{yang-zhang:99c}
621: 	on pattern-matching.
622: 
623: Aaron Wyner himself remained active in this
624: field for the following ten years, and his
625: last paper \cite{wyner-ziv-wyner}, co-written
626: with J.~Ziv and A.J.~Wyner,
627: was a review paper on this subject.
628: In the present paper we review the corresponding
629: developments in the lossy case, and in the process
630: we add new results (and some new proofs of 
631: recent results) in an attempt to present a more
632: complete picture.
633: 
634: \subsection{Central Themes, Paper Outline}
635: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
636: 
637: In Section~2 we give an extensive discussion of
638: the generalized AEP. By now there are numerous
639: different proofs under different assumptions, 
640: and we offer a streamlined approach to the most 
641: general versions using techniques from large 
642: deviation theory (cf.
643: \cite{yang-kieffer:1}\cite{dembo-kontoyiannis}%
644: \cite{chi-it:01}\cite{chi-AP:01}
645: and Bucklew's earlier work 
646: \cite{bucklew:87}\cite{bucklew:88}). We also
647: discuss the relationship 
648: of the generalized AEP
649: with the classical extensions
650: of the AEP (due to Barron \cite{barron:1}
651: and Orey \cite{orey:85})
652: to processes with densities.
653: We establish a formal connection
654: between these two by looking at the limit
655: of the distortion level $D\downarrow 0$.
656: 
657: In Section~3 we develop applications 
658: of the generalized AEP to a number
659: of related problems. 
660: We show how the generalized AEP
661: can be used to determine the
662: asymptotic behavior of Shannon's
663: random coding scheme, and we
664: discuss the role of mismatch
665: in
666: lossy data compression.
667: We also determine the first order 
668: asymptotic behavior of 
669: waiting times and longest 
670: match-lengths between stationary 
671: processes.  The main ideas used 
672: here are strong approximation 
673: \cite{kontoyiannis-jtp} and 
674: duality \cite{wyner-ziv:1}.
675: We present strengthened versions
676: of Shannon's direct lossy source coding
677: theorem (and of a corresponding universal
678: coding theorem), showing that {\em almost all}
679: random codebooks achieve essentially 
680: the same compression performance. 
681: A lossy version of the Lempel-Ziv 
682: algorithm is recalled, which 
683: achieves optimal compression 
684: performance (asymptotically)
685: as well as polynomial 
686: complexity at the encoder.
687: We also discuss how the classical 
688: source coding problem
689: can be generalized to a question about
690: weighted sphere-covering. The answer 
691: to this question gives, as
692: corollaries, Shannon's coding theorems,
693: Stein's lemma in hypothesis testing, 
694: and some converse concentration inequalities.
695: 
696: Section~4 is devoted to second order
697: refinements of the AEP and the generalized
698: AEP. It is shown, for example, that 
699: under certain conditions
700: $-\log P^n(X_1^n)$ and $-\log Q^n(B(X_1^n,D))$ 
701: are asymptotically Gaussian.
702: These refinements are used in Section~5
703: to provide corresponding second order 
704: results (such as central limit theorems) 
705: for the applications considered in Section~3. 
706: We prove second order asymptotic results 
707: for waiting times
708: and longest match-lengths. 
709: Precise redundancy rates are 
710: given for Shannon's random code,
711: and converse coding theorems show 
712: that the random code achieves the 
713: optimal pointwise redundancy, 
714: up to terms of order $(\log n)$. 
715: For $\iid$ sources the pointwise 
716: redundancy is typically of order 
717: $\sigma\sqrt{n}$, where $\sigma$
718: is the minimal coding variance of 
719: the source. When $\sigma=0$ these
720: fluctuations disappear, and the
721: best pointwise redundancy is of
722: order $(\log n)$. The question of 
723: exactly when $\sigma$ can be equal 
724: to zero is briefly discussed.
725: 
726: Finally, Sections~6 and 7 
727: contain generalizations of some
728: of the above results to 
729: random fields. All the results
730: stated there
731: are new, although
732: most of them are straightforward 
733: generalizations of corresponding
734: one-dimensional results.
735: 
736: % \newpage
737: \section{The Generalized AEP}
738: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
739: 
740: \subsection{Notation and Definitions}
741: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
742: We begin by introducing some basic definitions
743: and notation that will remain in effect for 
744: the rest of the paper. We will consider a
745: stationary ergodic 
746: process 
747: $\Xp=\{X_n\;;\;n\in\IN\}$
748: taking values in a general alphabet 
749: $A$.\footnote{To avoid 
750: uninteresting technicalities,
751: we will assume throughout that 
752: $A$ is a complete, separable metric space,
753: equipped with its associated Borel 
754: $\sigma$-field ${\cal A}$.
755: Similarly we take $(\Ahat,\hat{\cal A})$
756: to be the Borel measurable space
757: corresponding to a complete,
758: separable metric space $\Ahat$.} When
759: talking about data compression, $\Xp$
760: will be our source and $A$ will be 
761: called the source alphabet. We write
762: $X_i^j$ for the vector of random
763: variables $X_i^j=(X_i,X_{i+1},\ldots,X_j)$,
764: and similarly 
765: $x_i^j=(x_i,x_{i+1},\ldots,x_j)\in 
766: A^{j-i+1}$
767: for a realization of these random variables,
768: $-\infty\leq i\leq j\leq \infty$.
769: We let $P_n$ denote the marginal 
770: distribution of $X_1^n$ on $A^n$
771: ($n\geq 1$), and write $\BBP$
772: for the distribution of the whole
773: process.
774: Similarly, we take
775: $\Yp=\{Y_n\;;\;n\in\IN\}$
776: to be a stationary ergodic
777: process taking values in the
778: (possibly different) alphabet
779: $\Ahat$.${}^2$ 
780: In the context of
781: data compression,
782: $\Ahat$ is the reproduction
783: alphabet and $\Yp$
784: has the ``codebook'' distribution.
785: We write $Q_n$ for the marginal
786: distribution of $Y_1^n$ on $\Ahatn$,
787: $n\geq 1$, and $\BBQ$ for the
788: distribution of the whole process $\Yp$.
789: We will always assume that the process
790: $\Yp$ is independent of $\Xp$.
791: 
792: Let $\rho:A\times\Ahat\to[0,\infty)$
793: be an arbitrary nonnegative (measurable)
794: function, and define a sequence of
795: single-letter distortion measures 
796: $\rho_n:A^n\times\Ahatn\to[0,\infty)$ by
797: \ben
798: \rho_n(x_1^n,y_1^n)\bydef\frac{1}{n}\sum_{i=1}^n\rho(x_i,y_i)
799: \;\;\;\;x_1^n\in A^n,\;y_1^n\in\Ahatn.
800: \een
801: Given $D\geq 0$ and $x_1^n\in A^n$, 
802: we write 
803: $B(x_1^n,D)$ for
804: the distortion-ball of radius $D$ around $x_1^n$:
805: $$B(x_1^n,D)=\{y_1^n\in\Ahatn\;:\;\rho_n(x_1^n,y_1^n)\leq D\}.$$
806: 
807: Throughout the paper, $\log$ denotes
808: the natural logarithm and $\log_2$
809: the logarithm to base 2. Unless otherwise
810: mentioned, all familiar information-theoretic
811: quantities (such as the entropy,
812: mutual information, and so on)
813: are assumed to be defined in terms
814: of natural logarithms (and are 
815: therefore given in nats).
816: 
817: \subsection{Generalized AEP When $\Yp$ is I.I.D.}
818: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
819: 
820: In the case when $A$ is finite,
821: the classical AEP, also known as the 
822: Shannon-McMillan-Breiman theorem
823: (see \cite[Chapter~15]{cover:book} 
824: or the original papers
825: \cite{shannon:48}\cite{mcmillan}\cite{breiman:57}\cite{breiman:60}),
826: states that as $n\to\infty$
827: \be
828: -\frac{1}{n}\log P_n(X_1^n) \to H(\BBP)
829: \;\;\;\;\mbox{w.p.1}
830: \label{eq:discreteAEP}
831: \ee
832: where 
833: $$H(\BBP)\bydef\lim_{n\to\infty}\frac{1}{n}H(X_1^n)$$
834: is the entropy rate of the process $\Xp$
835: (in nats, since we are taking logarithms to
836: base $e$).
837: As we saw in the
838: Introduction,
839: in lossy data
840: compression the role of the AEP is taken 
841: up by the result of Shannon's ``Lemma 1'' 
842: and, more generally, by statements of the form
843: $$-\frac{1}{n}\log Q_n(B(X_1^n,D))\to R(\BBP,\BBQ,D)
844: \;\;\;\;\mbox{w.p.1}$$
845: for some 
846: non-random
847: ``rate-function'' $R(\BBP,\BBQ,D)$.
848: 
849: First we consider the simplest case where $\Yp$ 
850: is assumed to be an $\iid$ process. We write 
851: $Q=Q_1$ for its first order marginal,
852: so that $Q_n=Q^n$, for $n\geq 1.$ Similarly
853: we write $P=P_1$ for the first order marginal 
854: of $\Xp$. 
855: Let 
856: \be
857: \Dmin & \bydef &  E_P[\essinf_{Y\sim Q} \;\rho(X,Y)]
858: 	\label{eq:Dmin} \\
859: \Dav & \bydef &  E_{P\times Q}[\rho(X,Y)].
860: 	\label{eq:Dav}
861: \ee
862: [Recall that the essential infimum of
863: a function $g(Y)$ of the random variable
864: $Y$ with distribution $Q$ is defined as
865: $\essinf_{Y\sim Q} g(Y) = 
866: \sup\{t\in\RL\;:\;Q\{g(Y)>t\}=1\}.$]
867: 
868: Clearly $0\leq\Dmin\leq\Dav$.
869: To avoid the trivial case when $\rho(x,y)$
870: is essentially constant for
871: ($\BBP$-almost) all
872: $x\in A$, we assume that with positive
873: $\BBP$-probability $\rho(x,y)$ is not
874: essentially constant in $y$, that is:
875: \be
876: \Dmin < \Dav.
877: \label{eq:nonconst}
878: \ee
879: Note also that
880: for $D$ greater than $\Dav$,
881: the probability 
882: $Q^n(B(X_1^n,D))\to 1$
883: as $n\to\infty$
884: (this is easy to see 
885: by the ergodic theorem),
886: so we restrict our attention
887: to distortion levels $D<\Dav$.
888: 
889: \medskip 
890: 
891: {\em Theorem~1. Generalized AEP when $\Yp$ is $\iid$:}
892: Let $\Xp$ be a stationary ergodic process
893: and $\Yp$ be $\iid$ with marginal distribution 
894: $Q$ on $\Ahat$.
895: Assume that $\Dav=E_{P\times Q}[\rho(X,Y)]$ is
896: finite. Then for any $D\in(\Dmin,\Dav)$
897: \ben
898: -\frac{1}{n}\log Q^n(B(X_1^n,D)) \to R_1(P,Q,D)
899:         \;\;\;\;\mbox{w.p.1}.
900: \een
901: The rate-function $R_1(P,Q,D)$ 
902: is defined as
903: \ben
904: R_1(P,Q,D) = \inf_W H(W\|P\times Q)
905: \een
906: where $H(W\|V)$ denotes the relative
907: entropy between two distributions
908: $W$ and $V$,
909: $$H(W\|V) \bydef \left\{ \begin{array}{ll}
910:    E_W[\log\frac{dW}{dV}]
911: 		   & \mbox{if the density $\frac{dW}{dV}$ exists}, \\
912:    \infty 	   & \mbox{otherwise}
913:  \end{array} \right.
914: $$
915: and the infimum is taken over all 
916: joint distributions $W$ on
917: $A\times\Ahat$ such that 
918: the first marginal of $W$ is $P$
919: and $E_W[\rho(X,Y)]\leq D.$
920: 
921: \medskip
922: 
923: {\em Example~1: The rate-function $R_1(P,Q,D)$ 
924: when $Q$ is Gaussian:} 
925: Although in general the rate-function
926: $R_1(P,Q,D)$ cannot be evaluated explicitly,
927: here we show that it is possible to obtain 
928: an exact expression for $R_1(P,Q,D)$ in the 
929: special case when $\rho(x,y)=(x-y)^2$,
930: $\Xp$ is a real-valued process, 
931: and $Q$ is a Gaussian measure 
932: on $\RL.$ Specifically, assume
933: that $\Xp$ is a zero-mean,
934: stationary ergodic process
935: with finite variance 
936: $\sigma^2=\VAR(X_1)<\infty$,
937: and take $Q$ to be 
938: a zero-mean Gaussian measure 
939: with variance $\tau^2$, i.e.,
940: $Q\sim N(0,\tau^2)$.
941: Under these assumptions, it is easy to see
942: that $\Dmin=0$ and $\Dav=\sigma^2+\tau^2$.
943: Moreover, with the help of Proposition~2 below,
944: $R_1(P,Q,D)$ can be explicitly
945: evaluated as:
946: $$R_1(P,Q,D) = \left\{ \begin{array}{ll}
947: 	\infty\,,	& \;\;\;\;D=0\\
948: 	\frac{1}{2}\log\left(\frac{v}{D}\right)
949: 	  -\frac{(v-D)(v-\sigma^2)}
950: 		{2v\tau^2}\,,
951: 	 		& \;\;\;\;0<D<\sigma^2+\tau^2\\
952: 	0\,, 		& \;\;\;\;D\geq \sigma^2+\tau^2
953:  \end{array} \right.
954: $$
955: where 
956: $$v\bydef\frac{1}{2}\left[\tau^2+\sqrt{\tau^4+4D\sigma^2}\right].$$
957: We will come back to this example when considering
958: mismatched rate-distortion codebooks in Section~3.2.
959: 
960: \medskip
961:  
962: {\em Remark 1:}
963: In more familiar information-theoretic
964: terms, the rate-function $R_1(P,Q,D)$
965: can equivalently be defined as 
966: (cf. \cite{yang-kieffer:1})
967: \ben
968: R_1(P,Q,D) = \inf_{(X,Y)}\,[I(X;Y)
969: 	+H(Q_Y\|Q)]
970: \een
971: where $I(X;Y)$ denotes the mutual
972: information (in nats) between the 
973: random variables $X$ and $Y$,
974: and the infimum is over
975: all jointly distributed random variables
976: $(X,Y)$ with values in $A\times\Ahat$
977: such that $X$ has distribution $P$,
978: $E[\rho(X,Y)]\leq D$, and $Q_Y$ denotes
979: the distribution of $Y$.
980: 
981: \medskip
982: 
983: {\em Remark 2:}
984: The assumption that $\Yp$ is
985: $\iid$ is clearly restrictive
986: and it will be relaxed below.
987: On the other hand the assumptions
988: on the distortion measure 
989: $\rho$ seem to be minimal;
990: we simply assume that $\rho$
991: has finite expectation (in
992: the more general results below
993: $\rho$ is assumed to be bounded).
994: In this form, the result of 
995: Theorem~1 is new.
996: 
997: \medskip
998: 
999: {\em Discussion of Proof:}
1000: Let's fix a realization $x_1^\infty$
1001: of $\Xp$. The probability
1002: $Q^n(B(X_1^n,D))$ can be written as
1003: $$\PR\left\{Y_1^n\in B(X_1^n,D) \,|\,X_1^n=x_1^n\right\} \;=\;
1004: \PR\left\{\frac{1}{n}\sum_{i=1}^n\rho(x_i,Y_i)\leq D \right\}.$$
1005: Since the distortion level $D$ is taken smaller than the
1006: average value $\Dav$, this is a large deviations probability
1007: for the partial sums $(1/n)\sum_{i=1}^n Z_i$ of
1008: the independent (but not identically distributed)
1009: random variables $Z_i=\rho(x_i,Y_i)$. The proof
1010: is essentially an application of the 
1011: G\"artner-Ellis theorem of large deviations
1012: to the random variables $\{Z_i\}$.
1013: 
1014: \medskip
1015: 
1016: {\em Proof Outline:}
1017: Choose and fix a realization $x_1^\infty$ of
1018: $\Xp$ and define the random variables 
1019: $Z_i=\rho(x_i,Y_i)$. Let 
1020: $$S_n=\frac{1}{n}\sum_{i=1}^nZ_i$$
1021: and define the log-moment generating
1022: functions of the normalized partial 
1023: sums $S_n$ by
1024: $$\LA_n(\la) \bydef \log
1025: 	E_{Q^n}\left(e^{\lambda S_n}\right),
1026: 	\;\;\;\;\lambda\leq 0.$$
1027: Then for any $\la\leq 0$, by the ergodic theorem we have
1028: that
1029: \be
1030: \frac{1}{n}\LA_n(n\la)
1031: 	= \frac{1}{n}\sum_{i=1}^n\log
1032: 	E_Q\left(e^{\lambda\rho(x_i,Y_i)}\right)
1033: 	\to 
1034: 	\LA(\la)\bydef E_P\left[\log
1035: 	E_Q\left(e^{\lambda\rho(X,Y)}\right)
1036: 	\right]
1037: \label{eq:GEcheck}
1038: \ee
1039: for $\BBP$-almost any realization $x_1^\infty$.
1040: Now we would like to apply the 
1041: G\"artner-Ellis theorem, but first
1042: we need to check some simple properties
1043: of the function $\LA(\la)$. 
1044: Note that 
1045: $\LA(\la)\leq 0$ 
1046: and
1047: also (by Jensen's inequality)
1048: $\LA(\la)\geq \la\Dav>-\infty$,
1049: for all 
1050: $\la\leq 0$.
1051: Moreover, $\LA(\la)$ is twice
1052: differentiable
1053: in $\la$ with
1054: $$\LA'(\la) = E_{P\times Q}\left(\rho(X,Y)
1055:     \frac
1056: 	{e^{\lambda\rho(X,Y)}}
1057: 	{E_Q[e^{\lambda\rho(X,Y')}]}
1058: 			   \right)
1059: $$
1060: and 
1061: $$\LA''(\la) = 
1062: E_P\left[
1063:     E_Q
1064:     \left\{\rho^2(X,Y)
1065:     \frac
1066:         {e^{\lambda\rho(X,Y)}}
1067:         {E_Q[e^{\lambda\rho(X,Y')}]}
1068:     \right\}
1069: 	\;-\;
1070:     \left(E_Q
1071:         \left\{
1072: 	\rho(X,Y)
1073:     	\frac
1074:         {e^{\lambda\rho(X,Y)}}
1075:         {E_Q[e^{\lambda\rho(X,Y')}]}
1076:         \right\}
1077:     \right)^2
1078: \right]$$
1079: (this differentiability is easily verified by
1080: an application of the dominated convergence
1081: theorem). By the Cauchy-Schwarz
1082: inequality $\LA''(\la)\geq 0$ for all $\la<0$,
1083: and in fact $\LA''(\la)$ is strictly positive
1084: due to assumption (\ref{eq:nonconst}).
1085: Also it is not hard to verify that 
1086: $$\lim_{\lambda\uparrow 0}\LA'(\la)=\Dav$$
1087: and
1088: \be
1089: \lim_{\lambda\downarrow -\infty}\LA'(\la)=\Dmin.
1090: \label{eq:la-lim}
1091: \ee
1092: Since $D\in(\Dmin,\Dav)$,
1093: there exists a unique $\la^*<0$ with
1094: $\LA'(\la^*)=D$, and therefore
1095: the Fenchel-Legendre
1096: transform of $\LA(\la)$ evaluated at $D$ is
1097: $$\LA^*(D)\bydef\sup_{\la\leq 0}[\la D-\LA(\la)]
1098: 	\;=\;\la^*D-\LA(\la^*).$$
1099: Now we can apply the
1100: G\"artner-Ellis theorem
1101: \cite[Theorem~2.3.6]{dembo-zeitouni:book}
1102: to deduce from
1103: (\ref{eq:GEcheck})
1104: that with $\BBP$-probability one
1105: $$-\frac{1}{n}\log Q^n(B(X_1^n,D)) \to \LA^*(D).$$
1106: The proof is complete upon noticing 
1107: that $\LA^*(D)$ is nothing but $R_1(P,Q,D)$.
1108: This is stated and proved in 
1109: the following proposition.
1110: \qed
1111: 
1112: \medskip
1113: 
1114: {\em Proposition 2. Characterization of the Rate Function:}
1115: In the notation of the proof of Theorem~1,
1116: $\LA^*(D)=R_1(P,Q,D)$, for $D\in(\Dmin,\Dav)$.
1117: 
1118: \medskip
1119: 
1120: {\em Proof Outline:} Under additional 
1121: assumptions on the distortion measure 
1122: $\rho$ this has appeared in various papers
1123: (see, e.g., \cite{dembo-kontoyiannis}\cite{yang-zhang:99}).
1124: For completeness, we offer a proof sketch here.
1125: 
1126: In the notation of the above proof, consider
1127: the measure $W$ on $A\times\Ahat$ defined by
1128: $$\frac{dW(x,y)}{dP\times Q} = 
1129: \frac{e^{\las^*\rho(x,y)}}
1130: {E_Q[e^{
1131: \las^* 
1132: \rho(x,Y)}]}.$$
1133: Obviously the first marginal of $W$
1134: is $P$ and it is easy to check that 
1135: $E_W[\rho(X,Y)]=\LA'(\la^*)=D$.
1136: Therefore, by the definitions of
1137: $R_1(P,Q,D)$ and $W$, and by
1138: the choice of $\la^*$:
1139: \be
1140: R_1(P,Q,D)\leq H(W\|P\times Q)
1141: 	=\la^*D-\LA(\la^*)
1142: 	=\LA^*(D).
1143: \label{eq:propUBD}
1144: \ee
1145: To prove the corresponding lower
1146: bound we first claim that 
1147: for any measurable function
1148: $\phi:\Ahat\to (-\infty,0]$,
1149: and any probability measure 
1150: $Q'$ on $\Ahat$,
1151: \be
1152: H(Q'\|Q)\geq E_{Q'}(\phi(Y)) -\log E_{Q}(e^{\phi(Y)}).
1153: \label{eq:generalSV}
1154: \ee
1155: Let $Q_\phi$ denote the probability measure on $\Ahat$ such that
1156: $dQ_\phi/dQ=e^\phi/E_{Q}(e^{\phi(Y)})$. Clearly, it 
1157: suffices to prove (\ref{eq:generalSV}) in case $dQ'/dQ$ exists,
1158: in which case the difference between the left and right hand sides is 
1159: $$
1160:  E_{Q'}\left\{\log\frac{dQ'}{dQ}\right\} -
1161:  E_{Q'}\left\{\log\left(\frac{e^{\phi}}
1162: 	{
1163: 	E_{Q}(e^{\phi})
1164: 	}
1165: 	\right)\right\}
1166: \;=\; 
1167: %
1168: %   E_{Q_\phi}\left\{\frac{dQ'}{dQ_\phi}\log\left(\frac{dQ'}{dQ_\phi}
1169: % 	\right)\right\}
1170: H(Q'\|
1171: Q_\phi)
1172: \;\geq\;0.
1173: $$
1174: % where the last inequality follows from 
1175: % Jensen's inequality for the convex 
1176: % function $t \log t$. 
1177: Given an arbitrary
1178: candidate $W$ as in the definition of
1179: $R_1(P,Q,D)$ and any $x\in A$, we take
1180: $Q'=W(\cdot|x)$ and $\phi(y)=\la^*\rho(x,y)$
1181: in (\ref{eq:generalSV}) to get that 
1182: $$H(W(\cdot|x)\|Q(\cdot))\geq \la^* E_{W(Y|x)}[\rho(x,Y)]
1183: 	- \log E_{Q}(e^{\lambda^*\rho(x,Y)}).$$
1184: Substituting $X$ for $x$,
1185: taking expectations of both sides 
1186: with respect to $P$, 
1187: and recalling that $\la^*<0$
1188: and $E_W[\rho(X,Y)]\leq D$, we get:
1189: $$H(W\|P\times Q)\geq \la^*D - \LA(\la^*) = \LA^*(D).$$
1190: Since $W$ was arbitrary it follows that
1191: $R_1(P,Q,D)\geq \LA^*(D)$, and together
1192: with (\ref{eq:propUBD}) this completes 
1193: the proof.
1194: \qed
1195: 
1196: \subsection{Generalized AEP When $\Yp$ is Not I.I.D.}
1197: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1198: 
1199: Next we present two versions 
1200: of the generalized AEP that hold
1201: when $\Yp$ is a stationary 
1202: dependent process,
1203: under some additional conditions.
1204: 
1205: Throughout this section we will
1206: assume that the distortion
1207: measure is essentially bounded
1208: \be
1209: \Dmax \bydef \esssup_{(X_1,Y_1)\sim P_1\times Q_1} 
1210: 		\rho(X_1,Y_1)<\infty.
1211: \label{eq:Dmax}
1212: \ee
1213: We let $\Dav$ be defined as earlier,
1214: $\Dav = E_{P_1\times Q_1}[\rho(X_1,Y_1)]$,
1215: and for $n\geq 1$ we let
1216: \ben
1217: \Dminn 
1218: \bydef
1219: 	E_{P_n}
1220: 	\left[
1221: 		\essinf_{Y_1^n\sim Q_n} \;\rho_n(X_1^n,Y_1^n)
1222: 	\right].
1223: \een
1224: It is easy to see that $n \Dminn$ is a finite, superadditive sequence, 
1225: and therefore we can also define
1226: $$\Dmin = \lim_{n\to\infty} \Dminn = \sup_{n\geq 1} \Dminn.$$
1227: As before, we will assume that 
1228: the distortion measure $\rho$ is 
1229: not essentially constant,
1230: that is, $\Dmin<\Dav.$
1231: 
1232: \medskip
1233: 
1234: We first state a version 
1235: of the generalized AEP that 
1236: was recently proved by Chi 
1237: \cite{chi-it:01}, for 
1238: processes $\Yp$ satisfying 
1239: a rather strong
1240: mixing condition: We say that 
1241: the stationary process $\Yp$ 
1242: is {\em $\psipm$-mixing}, if 
1243: for all $d$ large enough there is a
1244: finite constant $c_d$ such that
1245: $$
1246: c_d^{-1} \BBQ(A)\BBQ(B) < \BBQ(A\cap B) < c_d \BBQ(A)\BBQ(B)
1247: $$
1248: for all events $A\in\sigma(Y_{-\infty}^0)$
1249: and $B\in\sigma(Y_d^{\infty})$,
1250: where $\sigma(Y_i^j)$ denotes
1251: the $\sigma$-field generated by $Y_i^j$. 
1252: Recall the usual definition 
1253: according to which $\Yp$ is called 
1254: {\em $\psi$-mixing} if in fact 
1255: the constants $c_d \to 1$ as 
1256: $d \to \infty$; see \cite{bradley} 
1257: for more details.
1258: Clearly $\psipm$-mixing is weaker 
1259: than {\em $\psi$-mixing}.
1260: 
1261: \medskip
1262: 
1263: %Y -------------- rephrased Theorems 3 and 4 and the discussion here 
1264: 
1265: {\em Theorem~3. Generalized AEP when $\Yp$ is $\psipm$-mixing
1266: \cite{chi-it:01}:}
1267: Let $\Xp$ and $\Yp$ be stationary ergodic
1268: processes. Assume that $\Yp$ is $\psipm$-mixing,
1269: and that the distortion measure $\rho$ is bounded.
1270: Then for all $D\in(\Dmin,\Dav)$
1271: \be
1272: -\frac{1}{n}\log Q_n(B(X_1^n,D)) \to R(\BBP,\BBQ,D)
1273:         \;\;\;\;\mbox{w.p.1}
1274: \label{eq:thm4}
1275: \ee
1276: where $R(\BBP,\BBQ,D)$ is the rate-function defined by
1277: \be
1278: \label{eq:thm4b}
1279: R(\BBP,\BBQ,D) = \lim_{n\to\infty} R_n(P_n,Q_n,D) 
1280: \ee
1281: where, for $n \geq 1$, 
1282: \ben
1283: R_n(P_n,Q_n,D) \bydef \inf_{V_n} n^{-1} H(V_n\|P_n\times Q_n)
1284: \een
1285: and the infimum is taken over all joint 
1286: distributions $V_n$ on $A^n\times\Ahatn$ 
1287: such that the $A^n$-marginal of $V_n$ 
1288: is $P_n$ and $E_{V_n}[\rho_n(X_1^n,Y_1^n)]\leq D$.
1289: 
1290: \medskip
1291: 
1292: As we discussed in the previous section,
1293: the proof of most versions of the generalized
1294: AEP consists of two steps: First a 
1295: ``conditional large deviations'' result
1296: is proved for the random variables 
1297: $\{\rho_n(x_1^n,Y_1^n)\;;\;n\geq 1\}$,
1298: where $x_1^\infty$ is a fixed realization of the
1299: process $\Xp$. Second, the rate-function
1300: $R(\BBP,\BBQ,D)$ is characterized as the 
1301: limit of a sequence of minimizations in 
1302: terms of relative entropy.
1303: 
1304: In a subsequent paper, Chi \cite{chi-AP:01}
1305: showed that the first of these steps 
1306: (the large deviations part) remains
1307: valid under a condition 
1308: weaker than $\psipm$-mixing, 
1309: condition~$(S)$ of \cite{bryc-dembo:96}.
1310: In the following theorem we give a general
1311: version of the second step; we prove
1312: that the generalized AEP (\ref{eq:thm4}) 
1313: and the formula (\ref{eq:thm4b}) for the 
1314: rate-function remain valid as long as 
1315: the random variables 
1316: $\{\rho_n(x_1^n,Y_1^n)\;;\;n\geq 1\}$
1317: satisfy a large deviations principle (LDP)
1318: with some {\it deterministic}, convex 
1319: rate-function (see \cite{dembo-zeitouni:book}
1320: for the precise meaning of this statement).
1321: 
1322: \medskip
1323: 
1324: {\em Theorem~4.} 
1325: Let $\Xp$ and $\Yp$ be stationary processes.
1326: Assume that $\rho$ is bounded, and that with
1327: $\BBP$-probability one, conditional on 
1328: $X_1^\infty=x_1^\infty$, the random variables
1329: $\{\rho_n(x_1^n,Y_1^n)\;;\;n\geq 1\}$ satisfy a
1330: large deviations principle with some
1331: deterministic, convex rate-function.
1332: Then, both (\ref{eq:thm4}) and (\ref{eq:thm4b})
1333: hold for any $D\in(\Dmin,\Dav)$, except 
1334: possibly at the point $D=\Dinf$, where
1335: \be
1336: \Dinf \bydef \inf \{ D \geq 0 : 
1337: \sup_{n \geq 1} R_n(P_n,Q_n,D) < \infty \}.
1338: \label{eq:dinf}
1339: \ee
1340: 
1341: \medskip
1342: 
1343: Since Theorem~4 has an exact 
1344: analog in the case of random fields, 
1345: we postpone its proof until the
1346: proof of the corresponding result
1347: (Theorem~27) in Section~6.
1348: 
1349: \medskip
1350: 
1351: {\em Remark 3:} 
1352: Suppose that the joint process $(\Xp,\Yp)$ is
1353: stationary, and that it satisfies a
1354: ``process-level large deviations principle''
1355: (see Remark~6 in Section~6 for a somewhat 
1356: more detailed statement) on the space of
1357: % For each $n\geq 1$, given a $x_1^n\in A^n$
1358: % let $x^{(n)}$ denote the periodic
1359: % extension of the string $x_1^n$ to an
1360: % infinite realization in $A^\NN$.
1361: % Similarly define $X^{(n)}$ and $Y^{(n)}$
1362: % as the periodic extensions of $X_1^n$ 
1363: % and $Y_1^n$, respectively. 
1364: % The process-level empirical 
1365: % measure $\calLn$ induced
1366: % (by the stationary processes $\Xp$ and $\Yp$)
1367: stationary probability measures
1368: on $(A^\infty\times\hat{A}^\infty)$
1369: % is then defined as
1370: % $$\calLn\bydef\frac{1}{n}\sum_{i=1}^n
1371: 	% \delta_{(X^{(n)}_{i+\cdot},Y^{(n)}_{i+\cdot})}$$
1372: % where $\delta_{s,s'}$ denotes the measure 
1373: % assigning unit mass to the joint sequence 
1374: % $(s,s')\in A^\NN\times\hat{A}^\NN$ 
1375: % and $X^{(n)}_{i+\cdot}$ (or $Y^{(n)}_{i+\cdot}$) 
1376: % denotes the infinite sequence $X^{(n)}$
1377: % (respectively, $Y^{(n)}$) shifted by
1378: % $i$ positions to the left. 
1379: % Equipp the space of stationary
1380: % probability measures on
1381: % $(A^\NN\times\hat{A}^\NN)$ with the 
1382: equipped with the
1383: topology of weak convergence.
1384: Assume, moreover,
1385: that this LDP holds with a convex,
1386: good rate-function $I(\cdot)$.
1387: [See \cite{dawson-gartner:87}\cite[Sec.~5.3,~5.4]{deuschel-stroock:book}%
1388: \cite[Sec.~6.5.3]{dembo-zeitouni:book}\cite{bryc-dembo:96} 
1389: for a general discussion as well as specific examples 
1390: of processes for which the above conditions
1391: hold. Apart from the $\iid$ case, these
1392: examples also include all ergodic finite-state
1393: Markov chains, among many others.]
1394: 
1395: It is easy to check that, when 
1396: $\rho$ is bounded and continuous on 
1397: $A \times \hat{A}$, then with
1398: $\BBP$-probability one, conditional on 
1399: $x_1^\infty$, the random variables
1400: $\{\rho_n(x_1^n,Y_1^n)\}$ 
1401: % satisfy a
1402: % large deviations principle with some
1403: % deterministic, convex rate-function.
1404: % for $\BBP$-a.e. $\Xp$, 
1405: % conditional upon $\Xp$ the sequence 
1406: % $\{\rho_n(X_1^n,Y_1^n)\}$ 
1407: satisfy the LDP upper bound with respect 
1408: to the deterministic, convex rate-function 
1409: $J(D)=\inf  I(\nu)$, where the infimum
1410: is over all stationary probability measures
1411: $\nu$ on $A^\infty \times \hat{A}^\infty$ such that
1412: the $A^\infty$-marginal of $\nu$ is $\BBP$ and
1413: $E_\nu[\rho(X_1,Y_1)] = D$.
1414: Indeed, Comets \cite{comets:89} provides
1415: such an argument when $\Xp$ and $\Yp$ are both $\iid$;
1416: moreover, he shows that in that case 
1417: the corresponding LDP lower bound also holds, 
1418: and hence Theorem 4 applies.
1419: Unfortunately, the conditional
1420: LDP lower bound has to be verified on a 
1421: case-by-case basis.
1422: 
1423: \medskip
1424: 
1425: {\em Remark 4:} 
1426: Although quite strong,
1427: the $\psipm$-mixing condition
1428: of Theorem~3, and the $(S)$-mixing
1429: condition of \cite{chi-AP:01},
1430: probably cannot be significantly 
1431: relaxed: For example, in the special case 
1432: when $\Xp$ is a constant process 
1433: taking on just a single value,
1434: if Theorem~3 were to hold (for any bounded 
1435: distortion measure) with a strictly 
1436: monotone rate-function, then necessarily 
1437: the empirical measures of $Y_1^n$ 
1438: would satisfy the LDP in the space 
1439: ${\cal P}_a(\Ahat)$ 
1440: (see \cite{bryc-dembo:96} for details).
1441: But
1442: \cite[Example~1]{bryc-dembo:96} illustrates
1443: that this LDP
1444: may fail even when $\Yp$ is a stationary 
1445: ergodic Markov chain with 
1446: discrete alphabet $\Ahat$. In particular,
1447: the example in \cite{bryc-dembo:96}
1448: has an exponential $\phi$-mixing rate. 
1449: 
1450: %Y ------------------ end of changes
1451: 
1452: \subsection{Generalized AEP for Optimal Lossy Compression}
1453: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1454: 
1455: Here we present a version of the generalized
1456: AEP that is useful in proving direct coding
1457: theorems. Let $\Xp$ be a stationary
1458: ergodic process. For the distortion measure
1459: $\rho$ we adopt two simple regularity conditions.
1460: We assume the existence of a {\em reference 
1461: letter}, i.e., an $\hat{a}\in\Ahat$ such that 
1462: $$E_{P_1}[\rho(X_1,\hat{a})]<\infty.$$
1463: Also, following \cite{kieffer:91}, we 
1464: require that for any distortion level 
1465: $D>0$ there is a scalar quantizer 
1466: for $\Xp$ with finite rate.
1467: 
1468: \smallskip
1469: 
1470: {\em Quantization Condition:} 
1471: For each $D>0$
1472: there is a ``quantizer'' $q:A\to B$ for
1473: some countable (finite or infinite)
1474: subset $B\subset\Ahat$, 
1475: such that:
1476: % \begin{enumerate}
1477: % \item[i.] 
1478: 
1479: i. $\;\;\rho(x,q(x))\leq D$ for all $x\in A$, and
1480: 
1481: ii. $\;$
1482: % \item[ii.] 
1483: the entropy $H(q(X_1))<\infty$.
1484: % \end{enumerate}
1485: 
1486: \smallskip
1487: 
1488: \noindent
1489: The following was implicitly proved
1490: in \cite{kieffer:91}; 
1491: % in the process
1492: % of proving a pointwise converse to the
1493: % source coding theorem; 
1494: see also \cite{konto-zhang:00} 
1495: for details.
1496: 
1497: \smallskip
1498:  
1499: {\em Theorem~5. Generalized AEP for Optimal Lossy Compression
1500: \cite{kieffer:91}:}
1501: Let $\Xp$ be a stationary ergodic process.
1502: Assume that the distortion measure $\rho$
1503: satisfies the quantization condition,
1504: that a reference letter exists, and 
1505: that for each $n\geq 1$ the infimum of
1506: $$E_{P_n}[-\log Q_n(B(X_1^n,D))]$$
1507: over all probability measures $Q_n$
1508: on $\Ahatn$ is achieved by some 
1509: $\widetilde{Q}_n$.
1510: Then for any $D>0$
1511: \be
1512: -\frac{1}{n}\log \widetilde{Q}_n(B(X_1^n,D)) \to R(D)
1513:         \;\;\;\;\mbox{w.p.1}
1514: \label{eq:chi}
1515: \ee
1516: where $R(D)$ is the rate-distortion 
1517: function of the process $\Xp$.
1518:  
1519: \medskip
1520: 
1521: {\em Historical Remarks:}
1522: The relevance of the quantities 
1523: $-\log Q_n(B(X_1^n,D))$ to 
1524: information theory was first 
1525: suggested implicitly
1526: by Kieffer \cite{kieffer:91}
1527: and more explicitly 
1528: by {\L}uczak and Szpankowski 
1529: \cite{luczak-szpankowski}.
1530: Since then, many papers have 
1531: appeared proving the generalized AEP 
1532: under different conditions;
1533: we mention here a subset
1534: of those proving some of
1535: the more general results.
1536: The case of finite alphabet
1537: processes was considered by Yang and Kieffer
1538: \cite{yang-kieffer:1}.
1539: The generalized AEP for 
1540: processes with general 
1541: alphabets and $\Yp$ $\iid$ 
1542: was proved by Dembo and
1543: Kontoyiannis
1544: \cite{dembo-kontoyiannis}
1545: and by Yang and Zhang 
1546: \cite{yang-zhang:99}.
1547: Finally, the case when
1548: $\Yp$ is not $\iid$
1549: (Theorem~3) was treated by
1550: Chi \cite{chi-it:01}\cite{chi-AP:01}.
1551: The observations of Theorem~4 
1552: about the rate-function 
1553: $R(\BBP,\BBQ,D)$ are new.
1554: Theorem~5 essentially
1555: comes from Kieffer's work
1556: \cite{kieffer:91};
1557: see also \cite{konto-zhang:00}.
1558: 
1559: We should also mention 
1560: that, in 
1561: a somewhat different context,
1562: the intimate relationship
1563: between the AEP and large
1564: deviations is discussed in
1565: some detail by Orey in
1566: \cite{orey:85b}.
1567: 
1568: \subsection{Densities vs. Balls}
1569: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1570: Let us recall the classical generalization
1571: of the AEP, due to Barron \cite{barron:1}
1572: and Orey \cite{orey:85}, to processes with
1573: values in general alphabets. Suppose $\Xp$
1574: as above is a general stationary ergodic process
1575: with marginals $\{P_n\}$ that are
1576: absolutely continuous with respect to
1577: the sequence of measures $\BBM=\{M_n\}$.
1578: 
1579: \medskip
1580: 
1581: {\em Theorem~6. AEP for Processes with Densities
1582: \cite{barron:1}\cite{orey:85}:}
1583: Let $\Xp$ be a stationary ergodic process whose
1584: marginals $P_n$ have densities $f_n=dP_n/dM_n$ 
1585: with respect to the $\sigma$-finite measures $M_n$,
1586: $n\geq 1$. Assume that the sequence $\BBM$
1587: of dominating measures is Markov of finite order, 
1588: with a stationary transition measure, and that the 
1589: relative entropies
1590: \ben
1591: H_n\bydef E_{P_n}\left[\log\frac{f_n(X_1^n)}
1592: 				  {f_{n-1}(X_1^{n-1})}
1593: 		\right],\;\;\;\;n\geq 2,
1594: \een
1595: have $H_n>-\infty$ eventually. Then
1596: \be
1597: -\frac{1}{n}\log \frac{dP_n}{dM_n}(X_1^n)\to -H(\BBP\|\BBM)
1598:         \;\;\;\;\mbox{w.p.1}
1599: \label{eq:BarronAEP}
1600: \ee
1601: where $H(\BBP\|\BBM)$ is the relative entropy
1602: rate defined as $H(\BBP\|\BBM)=\lim_n H_n=\inf_n H_n$.
1603: 
1604: \medskip
1605: 
1606: The AEP for processes with densities is
1607: also know to hold when the reference measures
1608: $M_n$ do not form a Markov
1609: sequence, under some additional
1610: mixing conditions (see \cite{orey:85} where
1611: $M_n$ are taken to be non-Markov measures
1612: satisfying an additional mixing condition,
1613: and the more recent extension 
1614: in \cite{chazottesetal:98}
1615: where the $M_n$ are taken to be
1616: discrete Gibbs measures).
1617: %%% (with H\"{o}lder continuous potentials)
1618: Moreover, Kieffer 
1619: \cite{kieffer:73}\cite{kieffer:73b}
1620: has given counterexamples
1621: illustrating that without some mixing
1622: conditions on $\{M_n\}$ the AEP
1623: (\ref{eq:BarronAEP}) fails to hold.
1624: 
1625: There is a tempting analogy between
1626: the generalized AEP (\ref{eq:thm4})
1627: and the AEP for processes with 
1628: densities (\ref{eq:BarronAEP}).
1629: The formal similarity between
1630: the two suggests that, if we identify 
1631: the measures $Q_n$ with the reference
1632: measures $M_n$, corresponding results
1633: should hold in the two cases. 
1634: Indeed, this does in general appear
1635: to be the case, as is illustrated 
1636: by the various generalized AEPs
1637: stated above. Moreover, we can
1638: interpret the result of Theorem~5
1639: as the natural analog of the classical 
1640: discrete AEP (\ref{eq:discreteAEP}) to
1641: the case of lossy data compression.
1642: As we argued in the
1643: introduction,
1644: the generalized AEPs of the previous
1645: sections play analogous roles in the proofs
1646: of the corresponding direct coding
1647: theorems.
1648: 
1649: Taking this analogy further indicates
1650: that there might be a relationship
1651: between these two different 
1652: generalizations.
1653: In particular, when $n$ is large and
1654: the distortion level $D$ is small,
1655: the following heuristic
1656: calculation seems compelling:
1657: 
1658: \ben
1659: -H(\BBP\|\BBQ)
1660: &\approxa&
1661: -\frac{1}{n}\log \frac{dP_n}{dQ_n}(X_1^n)\\
1662: &\approxb&
1663: -\frac{1}{n}\log \frac{P_n(B(X_1^n,D))}{Q_n(B(X_1^n,D))}\\
1664: &=&
1665: -\frac{1}{n}\log P_n(B(X_1^n,D))
1666: +\frac{1}{n}\log Q_n(B(X_1^n,D))\\
1667: &\approxc& 
1668: R(\BBP,\BBP,D)-R(\BBP,\BBQ,D)\\
1669: &\approxd&-H(\BBP\|\BBQ)
1670: \een
1671: where $(a)$ holds in the limit as $n\to\infty$
1672: by Theorem~6, $(b)$ should hold when $D$ is small
1673: by the assumption that $P_n$ has a density
1674: with respect to $Q_n$, $(c)$ would 
1675: follow in the limit as $n\to\infty$ by 
1676: an application of the generalized AEP, 
1677: and it is natural to conjecture
1678: that $(d)$ holds in the limit
1679: as $D\downarrow 0$ by reading
1680: the above calculation backwards.
1681: 
1682: In the following two sections we
1683: formalize the above heuristic 
1684: argument in two special cases: 
1685: First when $\Xp$ is a discrete 
1686: process taking values in a finite 
1687: alphabet, and second when $\Xp$ is
1688: a continuous process
1689: taking values in $\RL^d$.
1690: 
1691: 
1692: \subsubsection{Discrete Case}
1693: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1694: Here we take $\Xp$ to be a stationary ergodic
1695: process taking values in a finite alphabet $A$,
1696: and $\Yp$ to be $\iid$ with first order marginal 
1697: distribution $Q=Q_1$ on the same alphabet $A=\Ahat$.
1698: Similarly we write $P=P_1$ for the first order
1699: marginal of $\Xp$.
1700: In Theorem~7 we justify the above
1701: calculation by showing that the 
1702: limits as $D\downarrow 0$ and as $n\to\infty$
1703: can indeed be taken together in any 
1704: fashion: We show that the double 
1705: limit of the central expression 
1706: \be
1707: r_n(X_1^n,D)
1708: \bydef
1709: \frac{1}{n}\log \frac{P_n(B(X_1^n,D))}{Q_n(B(X_1^n,D))}
1710: \label{eq:ratio}
1711: \ee
1712: is equal to $H(\BBP\|\BBQ)$ with probability 1,
1713: independently of how $n$ grows and 
1714: $D$ decreases to zero. Its proof is 
1715: given in Appendix~A.
1716: 
1717: \medskip
1718: 
1719: {\em Theorem~7. Densities vs. Balls in the Discrete Case:}
1720: Let $\Xp$ be a stationary ergodic process
1721: and $\Yp$ be $\iid$, both on the finite 
1722: alphabet $A$. Assume that $\rho(x,y)=0$
1723: if and only if $x=y$, and $Q(x)>0$ for all $x$. 
1724: Then the following
1725: double limit exists:
1726: $$\limnd
1727: \frac{1}{n}\log
1728:         \frac{P_n(B(X_1^n,D))}
1729:              {Q^n(B(X_1^n,D))}
1730: 	\;=\; H(\BBP\|\BBQ)
1731: \;\;\;\;\mbox{w.p.1.}
1732: $$
1733: In particular, the repeated limit
1734: $\lim_{n}\lim_{D}$
1735: exists with probability one
1736: and is equal to $H(\BBP\|\BBQ)$.
1737: 
1738: \subsubsection{Continuous Case}
1739: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1740: 
1741: Here we state a weaker version of Theorem~7 in the 
1742: case when
1743: $A=\Ahat=\RL^d$ for some $d\geq 1$, and
1744: when $\Xp$ is an $\RL^d$-valued, 
1745: stationary ergodic process.
1746: Suppose that the marginals $\{P_n\}$ of 
1747: $\Xp$ are absolutely continuous 
1748: with respect to a sequence
1749: of reference measures $\{Q_n\}$. Throughout 
1750: this section we take the $Q_n$
1751: to be product measures, $Q_n=Q^n,$
1752: for some fixed Borel probability
1753: measure $Q$ on $\RL^d$.
1754: A typical example to keep in mind 
1755: is when $Q$ is a Gaussian measure on 
1756: $\RL$ and $\Xp$ is a real-valued stationary 
1757: ergodic process all of whose marginals 
1758: $P_n$ have continuous densities
1759: with respect to Lebesgue measure.
1760: 
1761: For simplicity, we take $\rho$ to be 
1762: squared-error distortion,
1763: $\rho(x,y)=(x-y)^2$, although
1764: the proof of Theorem~8, given in 
1765: Appendix~B, may easily be adapted to 
1766: apply for somewhat more general
1767: difference distortion measures.
1768: 
1769: \medskip
1770: 
1771: {\em Theorem~8. Densities vs. Balls in the Continuous Case:}
1772: Let $\Xp$ be an $\RL^d$-valued stationary ergodic process,
1773: whose marginals $P_n$ have densities $f_n=dP_n/dQ_n$ with 
1774: respect to a sequence of product measures $Q_n=Q^n$, 
1775: $n\geq 1$, for a given probability measure $Q$ on $\RL^d$.
1776: Let $\rho(x,y)=(x-y)^2$ for any $x,y \in \RL^d$.
1777: 
1778: (a) The following repeated limit holds:
1779: $$\lim_{n\to\infty}
1780:   \lim_{D\downarrow 0}\;
1781:         \frac{1}{n}\log
1782:         \frac{P_n(B(X_1^n,D))}
1783:              {Q_n(B(X_1^n,D))}
1784: 	= H(\BBP\|\BBQ)
1785: 	\;\;\;\;\mbox{w.p.1.}
1786: $$
1787: 
1788: (b) Assume, moreover, that $\Xp$ is $\iid$ 
1789: with marginal distribution $P_1=P$ on $\RL^d$, 
1790: and that the following conditions are satisfied:
1791: Both $E_{P\times Q}[\rho(X,Y)]$ and
1792: $E_{P\times P}[\rho(X,Y)]$ are finite 
1793: and nonzero; the expectation
1794: $$E_P[-\log Q(B(X,D))] 
1795: \;\;\;\;\mbox{is finite for all}\;D>0;$$
1796: and a $\delta>0$ exists for which
1797: \be
1798: E_P\left[\sup_{0<D<\delta} \left|
1799: 	\log \frac{P(B(X,D))}{Q(B(X,D))} \right|\right]<\infty.
1800: \label{eq:integrability}
1801: \ee
1802: Then, the reverse repeated limit also holds:
1803: $$\lim_{D\downarrow 0}
1804:   \lim_{n\to\infty}\;
1805:         \frac{1}{n}\log
1806:         \frac{P_n(B(X_1^n,D))}
1807:              {Q_n(B(X_1^n,D))}
1808: 	= H(\BBP\|\BBQ)
1809: 	\;\;\;\;\mbox{w.p.1.}
1810: $$
1811: 
1812: \medskip
1813: 
1814: It is easy to check that all conditions of the
1815: theorem hold when $Q$ is a Gaussian measure on $\RL$
1816: and $P$ has finite variance and a probability density 
1817: function $g$ (with respect to Lebesgue measure)
1818: such that $E_P(\sup_{|y-X|<\delta} |\log g(y)|)<\infty$ 
1819: for some $\delta>0$.  For example, this is the case 
1820: when both $P$ and $Q$ are Gaussian distributions on $\RL$.
1821: 
1822: As will be seen from the proof of the theorem,
1823: although we are primarily interested in the 
1824: case when the relative entropy rate $H(\BBP\|\BBQ)$
1825: is finite, the result remains true when 
1826: $H(\BBP\|\BBQ)=\infty$, and in that case
1827: assumption (\ref{eq:integrability}) can be relaxed to
1828: $$E_P\left[\sup_{0<D<\delta} \log \frac{Q(B(X,D))}{P(B(X,D))} 
1829: \right]<\infty.$$
1830: 
1831: %Y added Feldman reference
1832: 
1833: Finally we note that, in the context of ergodic
1834: theory, Feldman \cite{feldman:80} developed
1835: a different version of the generalized AEP,
1836: and also discussed the relationship between 
1837: the two types of asymptotics (as $n\to\infty$, 
1838: and as $D\downarrow 0$).
1839: 
1840: % \newpage
1841: \section{Applications of the Generalized AEP}
1842: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1843: As outlined in the
1844: introduction, the generalized AEP can
1845: be applied to a number of problems in data compression
1846: and pattern matching. Following along the lines of the
1847: corresponding applications in the lossless case, below
1848: we present applications of the results of the previous
1849: section to: 1.~Shannon's random coding schemes;
1850: 2.~mismatched codebooks in lossy data compression;
1851: 3.~waiting times between stationary processes
1852: (corresponding to idealized Lempel-Ziv coding);
1853: 4.~practical lossy Lempel-Ziv coding for memoryless
1854: sources; and 5.~weighted codebooks in 
1855: rate-distortion theory.
1856:  
1857: \subsection{Shannon's Random Codes}
1858: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1859: Shannon's well-known construction of optimal
1860: codes for lossy data compression is based on
1861: the idea of generating a random codebook. We
1862: review here a slightly modified version of
1863: his construction \cite{shannon:59}
1864: and describe how the performance of the
1865: resulting random code can be analyzed 
1866: using the generalized AEP.
1867: 
1868: % \paragraph*{Shannon's Random Codebooks.}
1869: Given a sequence of probability distributions
1870: $Q_n$ on $\Ahatn$, $n\geq 1$, we generate a 
1871: {\em random codebook according to the measures $Q_n$}
1872: as an infinite sequence of $\iid$ random vectors
1873: $$Y_1^n(i),\;\;\;\;i\geq 1$$
1874: with each $Y_1^n(i)$ having distribution
1875: $Q_n$ on $\Ahatn$. Suppose that, for a fixed $n$,
1876: this codebook is available to both the encoder and decoder. 
1877: Given a source string $X_1^n$ to
1878: be described with distortion $D$ or less,
1879: the encoder looks for a $D$-close match of
1880: $X_1^n$ in the codebook $\{Y_1^n(i)\;;\;i\geq 1\}$.
1881: Let $i_n$ be the position of the first such match
1882: \ben
1883: i_n\bydef \inf \{i\geq 1\;:\;\rho_n(X_1^n,Y_1^n(i))\leq D\}
1884: % \label
1885: \een
1886: with the convention that the infimum of
1887: the empty set equals $+\infty$. If a
1888: match is found, then the encoder describes
1889: to the decoder the position $i_n$ using
1890: Elias' code for the integers 
1891: \cite{elias}. This takes no more than
1892: \be
1893: \log_2 i_n + 2\log_2\log_2 i_n + \mbox{Const.}
1894: \;\;\;\;\mbox{bits}.
1895: \label{eq:elias}
1896: \ee
1897: If no match is found 
1898: (something that asymptotically will 
1899: {\em not} happen, with probability one),
1900: then the encoder describes $X_1^n$ with
1901: distortion $D$ or less using some other
1902: default scheme. 
1903: 
1904: Let $\ell_n(X_1^n)$
1905: denote the overall description 
1906: length of the algorithm just 
1907: described. In view of (\ref{eq:elias}),
1908: in order to understand its 
1909: compression performance,
1910: that is, to understand the 
1911: asymptotic behavior of 
1912: $\ell_n(X_1^n)$, it suffices 
1913: to understand the behavior of the quantity
1914: $$\log_2 i_n,\;\;\;\;\mbox{for large $n$.}$$
1915: Suppose that the probability
1916: $Q_n(B(X_1^n,D))$ of finding a $D$-close
1917: match for $X_1^n$ in the codebook is nonzero.
1918: Then, conditional on the source string $X_1^n$, 
1919: the distribution of $i_n$ is geometric with
1920: parameter $Q_n(B(X_1^n,D))$. From this 
1921: observation it is easy to deduce that 
1922: the behavior of $i_n$ is closely
1923: related to the behavior of the quantity
1924: $1/Q_n(B(X_1^n,D))$. The next theorem is
1925: an easy consequence of this fact so it is
1926: stated here without proof; see the 
1927: corresponding arguments in 
1928: \cite{kontoyiannis-red:00}\cite{konto-zhang:00}.
1929: 
1930: \medskip
1931: 
1932: {\em Theorem~9. Strong Approximation:}
1933: Let $\Xp$ be an arbitrary process and
1934: let $\{Q_n\}$ be a given sequence of 
1935: codebook distributions. 
1936: If $Q_n(B(X_1^n,D))>0$ eventually with 
1937: probability one,
1938: then for any $\epsilon>0$:
1939: \ben
1940: \log_2 i_n 
1941: &\leq& -\log_2 Q_n(B(X_1^n,D)) + \log_2\log_2 n + 3
1942: 	\;\;\;\;\mbox{eventually, w.p.1}\\
1943: \mbox{and}\;\;
1944: 	\log_2 i_n 
1945: &\geq&
1946: -\log_2 Q_n(B(X_1^n,D)) -
1947: \log_2 n - (1+\epsilon)\log_2\log_2 n
1948: \;\;\;\;\mbox{eventually, w.p.1.}
1949: \een
1950: 
1951: \medskip
1952: 
1953: The above estimates can now be combined 
1954: with the results of the generalized AEP 
1955: in the previous section to determine the
1956: performance of codes based on random 
1957: codebooks with respect to the ``optimal''
1958: measures $Q_n$. To illustrate this 
1959: approach we consider the special case
1960: of memoryless sources and finite
1961: reproduction alphabets, and show that
1962: the random code with respect to 
1963: (almost) any random codebook realization 
1964: is asymptotically optimal, with 
1965: probability one. Note that corresponding
1966: results can be proved, in exactly the
1967: same way, under much more general 
1968: assumptions. For example, utilizing
1969: Theorem~5 instead of Theorem~1 we
1970: can prove the analog of Theorem~10
1971: below for arbitrary stationary 
1972: ergodic sources.
1973: 
1974: Let $\Xp$ be an $\iid$ source with 
1975: marginal distribution $P_1=P$ 
1976: on $A$, and take the reproduction
1977: alphabet $\Ahat$ to be finite.
1978: For simplicity we will
1979: assume that the distortion measure 
1980: $\rho$ is bounded, i.e., 
1981: $\sup_{x,y}\rho(x,y)<\infty,$
1982: and we also make the customary 
1983: assumption that 
1984: \be
1985: \sup_{x\in A}\min_{y\in\hat{A}}\rho(x,y) = 0.
1986: \label{eq:maximin}
1987: \ee
1988: [See the remark at the end of Section~5.1.1 for
1989: a discussion of this condition and when it can 
1990: be relaxed.]
1991: As usual, we define the rate-distortion
1992: function of the memoryless source
1993: $\Xp$ by
1994: $$R(D)=\inf_{(X,Y)}\,I(X;Y)$$
1995: where the infimum is over all jointly 
1996: distributed random variables $(X,Y)$ 
1997: with values in $A\times\Ahat$, such 
1998: that $X$ has distribution $P$
1999: and $E[\rho(X,Y)]\leq D$.
2000: Let 
2001: \be
2002: \Dbar\bydef \min_{y\in\hat{A}}E_P[\rho(X,y)]
2003: \label{eq:Dbar}
2004: \ee
2005: and note that $R(D)=0$ for $D\geq\Dbar$.
2006: To avoid the trivial case when
2007: $R(D)=0$ for all $D,$ we assume
2008: that $\Dbar>0$ and we restrict 
2009: our attention to the interesting
2010: range of values $D\in(0,\Dbar)$.
2011: Recall \cite{yang-zhang:99}\cite{kontoyiannis-red:00}
2012: that for any such $D$,
2013: $R(D)$ can alternatively be 
2014: written as
2015: $$R(D)=\inf_Q R_1(P,Q,D)$$
2016: where the infimum is over all 
2017: probability distributions $Q$ on $\Ahat$.
2018: Since we take $\Ahat$ to be finite,
2019: this infimum is always achieved
2020: (see \cite{kontoyiannis-red:00})
2021: by a probability distribution
2022: $Q=Q^*$. 
2023: To avoid cumbersome notation in the statements
2024: of the coding theorems given next and also in 
2025: later parts of the paper, we also write
2026: $\calR(D)$ for the rate-distortion 
2027: function of the source $\Xp$ expressed
2028: in {\em bits} rather than in nats:
2029: $$\calR(D)\bydef (\log_2 e)R(D).$$
2030: Finally, we write $Q_n^*$ for the product
2031: measures $(Q^*)^n$ and call
2032: $\{Q_n^*\}$ the {\em optimal reproduction 
2033: distributions at distortion level $D$.}
2034: 
2035: Combining Theorem~9 with the 
2036: generalized AEP of Theorem~1 
2037: implies the following 
2038: strengthened direct
2039: coding theorem.
2040: 
2041: \medskip
2042:  
2043: {\em Theorem~10. Pointwise Coding Theorem 
2044: for I.I.D. Sources \cite{kontoyiannis-red:00}:}
2045: Let $\Xp$ be an $\iid$ source with distribution
2046: $P$ on $A$, and let $Q_n^*$ denote the optimal
2047: reproduction distributions at distortion level
2048: $D\in(0,\Dbar)$.
2049: Then the codes based on almost any realization 
2050: of the Shannon random codebooks according
2051: to the measures $\{Q_n^*\}$ have codelengths 
2052: $\ell_n(X_1^n)$
2053: satisfying:
2054: $$\lim_{n\to\infty}\frac{1}{n}\ell_n(X_1^n)
2055: = \calR(D)\;\;\;\;\mbox{bits per symbol, w.p.1.}$$
2056: 
2057: \medskip
2058: 
2059: A simple modification of the above scheme can
2060: be used to obtain {\em universal} codebooks
2061: that achieve optimal compression for any 
2062: memoryless source:
2063: Given a fixed block-length $n$, we consider
2064: the collection of all $n$-types on $\Ahat$,
2065: namely, all distributions $Q$ of the form
2066: $Q(\hat{a})=j/n$, $0\leq j\leq n$, for 
2067: $\hat{a}\in\Ahat$. Instead of generating 
2068: a single random codebook according to the 
2069: optimal distribution $Q_n^*$, we generate 
2070: {\em multiple codebooks}, one for each
2071: product measure $Q^n$ corresponding to an
2072: $n$-type $Q$ on $\Ahat$. Then we (as the 
2073: encoder) adopt a greedy coding strategy. We find 
2074: the first $D$-close match for $X_1^n$ in 
2075: each of the codebooks, and pick the one 
2076: in which the match appears the earliest.
2077: To describe $X_1^n$ to the decoder with
2078: distortion $D$ or less we then describe
2079: two things: (a)~the index of the codebook 
2080: in which the earliest match was found, 
2081: and (b)~the position $i_n$ of this 
2082: earliest match. Since there are at
2083: most polynomially many $n$-types
2084: (cf.~\cite{csiszar:book}\cite{cover:book}),
2085: the rate of the description of (a) is 
2086: asymptotically negligible. Moreover,
2087: since the set of $n$-types is 
2088: asymptotically dense among probability
2089: measures on $\Ahat$, we eventually
2090: do as well as if we were using the 
2091: optimum codebook distribution $Q_n^*$.
2092: 
2093: \medskip
2094:  
2095: {\em Theorem~11. Pointwise Universal Coding Theorem
2096: \cite{kontoyiannis-red:00}:}
2097: Let $\Xp$ be an arbitrary $\iid$ source with 
2098: distribution $P$ on $A$, let $R(D)$
2099: be the rate-distortion function of this source
2100: at distortion level $D\in(0,\Dbar)$,
2101: and let $\calR(D)$ denote its 
2102: rate-distortion function in bits.
2103: The codes 
2104: based on almost any realization of the 
2105: universal Shannon random codebooks have 
2106: codelengths $\ell_n(X_1^n)$ satisfying:
2107: $$\lim_{n\to\infty}\frac{1}{n}\ell_n(X_1^n)
2108: = \calR(D)\;\;\;\;\mbox{bits per symbol, w.p.1.}$$
2109: 
2110: \subsection{Mismatched Codebooks}
2111: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2112: In the last section we described how,
2113: for memoryless sources, the Shannon 
2114: random codebooks with respect to the
2115: optimal reproduction distributions can
2116: be used to achieve asymptotically 
2117: optimal compression performance. In this 
2118: section we briefly consider the question
2119: of determining the rate achieved
2120: when an arbitrary (stationary ergodic)
2121: source $\Xp$ is encoded using a 
2122: random codebook according to the 
2123: $\iid$ distributions $Q^n$, 
2124: for an arbitrary distribution $Q$ 
2125: on $\Ahat$. For further discussion 
2126: of the problem of mismatched
2127: codebooks see 
2128: \cite{sakrison:69}\cite{sakrison:70}\cite{lapidoth:97}\cite{kanlis:phd}
2129: and the references therein.
2130: 
2131: The following theorem is an immediate
2132: consequence of combining Theorem~1  
2133: with Theorem~9 and the discussion in 
2134: Section~3.1 (see also Example~1 in 
2135: Section~2.2).
2136: 
2137: \medskip
2138: 
2139: {\em Theorem~12. Mismatched Coding Rate:}
2140: Let $\Xp$ be a stationary ergodic process
2141: with marginal distribution $P_1=P$ on $A$,
2142: let $Q$ be an arbitrary distribution
2143: on $\Ahat$, and define $\Dmin$ and
2144: $\Dav$ as in Section~2.2.
2145: \begin{itemize}
2146: \item[(a)]{\em Arbitrary I.I.D. Codebooks:}
2147:   For any distortion level $D\in(\Dmin,\Dav)$,
2148:   the codes based on almost any realization
2149:   of the Shannon random codebooks according
2150:   to the measures $\{Q^n\}$ have codelengths 
2151: 	$\ell_n(X_1^n)$ satisfying:
2152:   $$\lim_{n\to\infty}\frac{1}{n}\ell_n(X_1^n)
2153:   = (\log_2 e)R_1(P,Q,D)\;\;\;\;\mbox{bits per symbol, w.p.1.}$$
2154: \item[(b)]{\em I.I.D. Gaussian Codebooks:}
2155:   Suppose
2156: $\rho(x,y)=(x-y)^2$ and
2157: $\Xp$ is a real-valued process with
2158:   finite variance $\sigma^2=\VAR(X_1)$.
2159:   Let $Q$ be the $N(0,\tau^2)$ distribution 
2160:   on $\RL$. Then for any distortion level 
2161:   $D\in(0,\sigma^2+\tau^2)$, the codes based on 
2162:   almost any realization of the Gaussian
2163:   codebooks according to the measures $\{Q^n\}$ 
2164:   have codelengths 
2165:   $\ell_n(X_1^n)$ satisfying:
2166:   $$\lim_{n\to\infty}\frac{1}{n}\ell_n(X_1^n)
2167:   = \frac{1}{2}\log_2\left(\frac{v}{D}\right)
2168: 	-(\log_2 e)\frac{(v-D)(v-\sigma^2)}
2169:                         {2v\tau^2}
2170: 	\;\;\;\;\mbox{bits per symbol, w.p.1,}$$
2171:   where
2172:   $$v\bydef\frac{1}{2}\left[\tau^2+\sqrt{\tau^4+4D\sigma^2}\right].$$
2173: \end{itemize}
2174: 
2175: \medskip
2176: 
2177: {\em Lossless vs. Lossy Mismatch:} Recall
2178: that, in the case of lossless data compression,
2179: if instead of the true source distribution
2180: $P$ a different coding distribution $Q$ is used,
2181: then the code-rate achieved is 
2182: \be
2183: H(P)+H(P\|Q).
2184: \label{eq:penalty1}
2185: \ee
2186: Similarly in the current setting of lossy 
2187: data compression, if instead of the optimal
2188: reproduction distribution $Q^*$ we use a
2189: different codebook distribution $Q$, the
2190: rate we achieve is $R_1(P,Q,D)$. 
2191: An upper bound for $R_1(P,Q,D)$ is
2192: obtained by taking $(X,Y)$
2193: in the expression of Remark~1 
2194: %-------
2195: to be the jointly distributed random 
2196: variables that achieve the infimum 
2197: in the definition of the rate-distortion
2198: function of $P$. Then the (mismatched) rate
2199: of the random code based on $Q$ instead 
2200: of $Q^*$ is:
2201: \be
2202: R_1(P,Q,D)\leq R(D) + H(Q^*\|Q).
2203: \label{eq:penalty2}
2204: \ee
2205: Equations (\ref{eq:penalty1})
2206: and (\ref{eq:penalty2}) illustrate
2207: the analogy between the penalty terms 
2208: in the lossless and lossy case
2209: due to mismatch.
2210: 
2211: \medskip
2212: 
2213: Next we discuss two special cases 
2214: of part~(b) of the theorem
2215: that are of particular interest.
2216: 
2217: \medskip
2218: 
2219: {\em Example~2: Gaussian codebook with 
2220: mismatched distribution:}
2221: Consider the following coding scenario:
2222: We want to encode data generated by 
2223: an $\iid$ Gaussian process 
2224: with $N(0,\sigma^2)$ distribution,
2225: with squared-error distortion
2226: $D$ or less.
2227: In this case, it is well-known 
2228: \cite{berger:book}\cite{cover:book} that
2229: for any $D\in(0,\sigma^2)$ 
2230: the optimal reproduction distribution $Q^*$
2231: is the $N(0,\sigma^2-D)$ distribution,
2232: so we construct random codebooks 
2233: according to the $\iid$ distributions
2234: $Q_n^*=(Q^*)^n$.
2235: 
2236: But suppose that, instead of an
2237: $\iid$ Gaussian, the source turns out
2238: to be some arbitrary stationary ergodic 
2239: $\Xp$ with zero mean and variance $\sigma^2$.
2240: Theorem~12~(b) implies that the asymptotic
2241: rate achieved by our $\iid$ Gaussian
2242: codebook is equal to
2243: $$\frac{1}{2}\log_2\left(\frac{\sigma^2}{D}\right)
2244: \;\;\;\;\mbox{bits per symbol.}$$
2245: Since this is exactly the
2246: rate-distortion function of the 
2247: $\iid$ $N(0,\sigma^2)$ source, we
2248: conclude that the rate achieved is
2249: the same as what we would have 
2250: obtained on the Gaussian source we
2251: originally expected. This offers
2252: yet another justification of the
2253: folk theorem that the Gaussian 
2254: source is the hardest one to compress,
2255: among sources with a fixed variance. 
2256: In fact, the above result is 
2257: a natural fixed-distortion
2258: analog of \cite[Theorem~3]{lapidoth:97}.
2259: 
2260: \medskip
2261: 
2262: {\em Example~3: Gaussian codebook with mismatched variance:}
2263: Here we consider a different type of mismatch.
2264: As before, we are prepared to encode an 
2265: $\iid$ Gaussian source, but we have an 
2266: incorrect estimate of its variance,
2267: say $\hat{\sigma}^2$ instead of the true
2268: variance $\sigma^2$. So we are using
2269: a random codebook with respect to the
2270: optimal reproduction distribution
2271: $Q_n^*=(Q^*)^n$, where $Q^*$ is the
2272: $N(0,\hat{\sigma}^2-D)$ distribution,
2273: but the actual source is $\iid$ 
2274: $N(0,\sigma^2)$. In this case, 
2275: the rate achieved by 
2276: the random codebooks according to 
2277: the distributions $Q_n^*$ is given
2278: by the expression in Theorem~12~(b),
2279: with $\tau^2$ replaced by $\hat{\sigma}^2-D$.
2280: Although the resulting expression 
2281: is somewhat long and not easy to 
2282: manipulate analytically, it is 
2283: straightforward to evaluate 
2284: numerically. For example, 
2285: Figure~1 shows the asymptotic 
2286: rate achieved, as a function of the 
2287: error $e=\sigma^2-\hat{\sigma}^2$
2288: in the estimate of the true variance.
2289: As expected, the best rate is 
2290: achieved when the codebook distribution 
2291: is matched to the source (corresponding to $e=0$),
2292: and it is equal to the rate-distortion function
2293: of the source. Moreover, as one might 
2294: expect, it is more harmful to 
2295: underestimate the variance 
2296: than to overestimate it.
2297: 
2298: \begin{figure}[ht]
2299: \centerline{\epsfxsize 3.2in \epsfbox{rate.eps}}
2300: \caption{This graph shows the rate achieved by an
2301: $\iid$ Gaussian codebook of variance $\hat{\sigma}^2-D$
2302: when applied to $\iid$ $N(0,\sigma^2)$ data.
2303: The rate is shown as a function of the error
2304: $e=\sigma^2-\hat{\sigma}^2$ in the variance estimate.
2305: In this particular example: $\sigma^2=2$, $D=1$, 
2306: the error $e$ ranges from $-1/2$ to $1/2$,
2307: and the rate-distortion function of the source
2308: equals 0.5 bits/symbol.}
2309: \end{figure}
2310: 
2311: \subsection{Waiting Times and Idealized Lempel-Ziv Coding}
2312: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2313: Given $D\geq 0$ and two independent realizations
2314: from the stationary ergodic processes $\Xp$ and $\Yp$, 
2315: our main quantity of interest here is the 
2316: {\em waiting time} $W_n=W_n(D)$ until a $D$-close 
2317: version of the initial string $X_1^n$ first appears
2318: in $Y_1^\infty$. Formally,
2319: \be
2320: W_n\;=\;\inf\{i\geq 1\; :\; 
2321: 	\rho_n(X_1^n,Y_i^{i+n-1})\leq D\}
2322: \label{eq:Wn-def}
2323: \ee
2324: with the convention, as before, that the infimum 
2325: of the empty set equals $+\infty$.
2326: 
2327: The motivation for studying the asymptotic behavior
2328: of $W_n$ for large $n$ is twofold.
2329: 
2330: \medskip
2331: 
2332: {\em Idealized Lempel-Ziv coding.}
2333: The natural extension of the idealized
2334: scenario described in the
2335: introduction
2336: is to consider a message $X_1^n$ that
2337: is to be encoded with the help of a
2338: database $Y_1^\infty$.
2339: The
2340: source and
2341: the database are assumed to be 
2342: independent, and the database
2343: distribution may or may not be
2344: the same as that of the source.
2345: In order to communicate $X_1^n$
2346: to the decoder with distortion
2347: $D$ or less, the encoder simply
2348: describes $W_n$, using no more
2349: than
2350: $$\log_2 W_n + O(\log_2\log_2 W_n)\;\;\;\;\mbox{bits.}$$
2351: Therefore, the asymptotic performance 
2352: of this idealized scheme can be 
2353: completely understood in terms
2354: of the asymptotics of $\log W_n$,
2355: for large $n$.
2356: 
2357: \medskip
2358: 
2359: {\em DNA pattern matching.}
2360: Here we imagine that $X_1^n$ represents 
2361: a DNA or protein ``template,'' and we 
2362: want to see whether it appears, either
2363: exactly or approximately, as a contiguous 
2364: substring of a database DNA sequence 
2365: $Y_1^\infty$. We are interested in
2366: quantifying the ``degree of surprise''
2367: in the fact that a $D$-close match was
2368: found at position $W_n$. Specifically,
2369: was the match found ``atypically''
2370: early, or is the value of $W_n$ 
2371: consistent with the hypothesis
2372: that the template and the database
2373: are independent? For a detailed
2374: discussion, see, e.g.,
2375: \cite[Section~3.2]{dembo-zeitouni:book}\cite{karlin-ost:88}%
2376: \cite{agw:90}\cite{arratia-waterman}
2377: and the references therein.
2378: 
2379: % \medskip
2380: 
2381: \newpage
2382: 
2383: If for a moment we consider
2384: the case when both $\Xp$ and
2385: $\Yp$ are $\iid$, we see that 
2386: the waiting time $W_n$ is,
2387: at least intuitively, closely
2388: related to the index $i_n$ of 
2389: Section~3.1.
2390: As the following result shows,
2391: although the distribution of
2392: $W_n$ is not exactly geometric,
2393: $W_n$ behaves very much 
2394: like $i_n$, at least in the 
2395: exponent. That is, the
2396: difference
2397: $$\log W_n -[-\log Q_n(B(X_1^n,D))]$$
2398: is ``small,'' eventually
2399: with probability one.
2400: 
2401: Recall the definition of
2402: $\psi$-mixing from Section~2.3, and
2403: also the definition of the
2404: $\phi$-mixing coefficients of $\Yp$
2405: $$\phi(k)\;=\;\sup\{|\BBQ(B|A)-\BBQ(B)|\;:\;\;
2406: B\in\sigma(Y_{k}^{\infty}),\;
2407: A\in\sigma(Y_{-\infty}^0),\; \BBQ(A)>0\}$$
2408: where, as before, $\sigma(Y_i^j)$ denotes
2409: the $\sigma$-field generated by $Y_i^j$. 
2410: The process $\Yp$ is called
2411: {\em $\phi$-mixing}
2412: if $\phi(k)\to 0$ as $k\to\infty$;
2413: see \cite{bradley} for an extensive 
2414: discussion of $\phi$-mixing and related
2415: mixing conditions.
2416:  
2417: \medskip
2418: 
2419: {\em Theorem~13. Strong Approximation
2420: \cite{kontoyiannis-jtp}\cite{dembo-kontoyiannis}:}
2421: Let $\Xp$ and $\Yp$ be stationary ergodic processes,
2422: and assume that $\Yp$ is either $\psi$-mixing
2423: or $\phi$-mixing with summable $\phi$-mixing
2424: coefficients, $\sum_{k\geq 1} \phi(k)<\infty$.
2425: If $Q_n(B(X_1^n,D))>0$ eventually with 
2426: probability one,
2427: then for any $\epsilon>0$:
2428: \ben
2429: -(1+\epsilon)\log n
2430: \;\leq\;
2431: \log [W_n Q_n(B(X_1^n,D))]
2432: \;\leq\;
2433: (2+\epsilon)\log n
2434: \;\;\;\;\mbox{eventually, w.p.1.}
2435: \een
2436: 
2437: \medskip
2438: 
2439: Theorem~13 of course implies that
2440: \be
2441: \log W_n = -\log Q_n(B(X_1^n,D)) + O(\log n)
2442: \;\;\;\;\mbox{w.p.1}
2443: \label{eq:strong}
2444: \ee
2445: and combining this with the generalized
2446: AEP statements of Theorems~1 and~4 we
2447: immediately obtain the first order 
2448: (or strong-law-of-large-numbers, SLLN)
2449: asymptotic behavior of the waiting
2450: times $W_n$:
2451: 
2452: \medskip
2453: 
2454: {\em Theorem~14. SLLN for Waiting Times:}
2455: Let $\Xp$ and $\Yp$ be stationary ergodic processes.
2456: 
2457: (a)~If $\Yp$ is $\iid$ and the 
2458: average distortion $\Dav$ is finite,
2459: then for any $D\in(\Dmin,\Dav)$
2460: \be
2461: \frac{1}{n}\log W_n \to R_1(P_1,Q_1,D)
2462: \;\;\;\;\mbox{w.p.1.}
2463: \label{eq:w-slln}
2464: \ee
2465: 
2466: (b)~If $\Yp$ is $\psi$-mixing and the distortion
2467: 	measure $\rho$ is bounded, then for any
2468: 	$D\in(\Dmin,\Dav)$
2469: \be
2470: \frac{1}{n}\log W_n \to R(\BBP,\BBQ,D)
2471: \;\;\;\;\mbox{w.p.1.}
2472: \label{eq:w-slln2}
2473: \ee
2474: 
2475: \medskip
2476: 
2477: Note that similar results can be
2478: obtained under different assumptions
2479: on the process $\Yp$, using Theorems~3 
2480: and~5 in place of Theorems~1 and~4 as
2481: done above.
2482: When $\Xp$ is taken to be an
2483: arbitrary stationary ergodic process,
2484: it is natural to expect that the
2485: mixing conditions for $\Yp$ in
2486: Theorem~14~(b) cannot be 
2487: substantially relaxed.
2488: In fact, even in the case of exact matching 
2489: between finite-alphabet processes, Shields 
2490: \cite{shields:3}
2491: has produced a counterexample demonstrating
2492: that the analog of Theorem~13 does not hold 
2493: for arbitrary stationary ergodic $\Yp$. 
2494: 
2495: \medskip
2496: 
2497: {\em Historical Remarks:}
2498: Waiting times in the context
2499: of lossy data compression were
2500: studied by Steinberg and Gutman 
2501: \cite{steinberg-gutman} and {\L}uczak 
2502: and Szpankowski \cite{luczak-szpankowski}.
2503: Yang and Kieffer \cite{yang-kieffer:1}
2504: identified the limiting rate-function
2505: for a wide range of finite alphabet
2506: sources, and Dembo and Kontoyiannis 
2507: \cite{dembo-kontoyiannis} and Chi 
2508: \cite{chi-it:01}
2509: generalized these results to processes with
2510: general alphabets.
2511: 
2512: The strong approximation idea was 
2513: introduced
2514: in \cite{kontoyiannis-jtp}
2515: in the case of exact matching.  For
2516: processes $\Yp$ with summable
2517: $\phi$-mixing coefficients, Theorem~13 was 
2518: proved in \cite{dembo-kontoyiannis}, and when 
2519: $\Yp$ is $\psi$-mixing it was proved, for the 
2520: case of no distortion, in \cite{kontoyiannis-jtp}.
2521: Examining the latter proof, \cite{chi-it:01}
2522: observed that it immediately generalizes to
2523: the statement of Theorem~13.
2524: 
2525: Related results were obtained by
2526: Kanaya and Muramatsu \cite{kanaya-muramatsu:97},
2527: who extended some of the results of
2528: \cite{steinberg-gutman}
2529: to processes with general alphabets,
2530: and by Koga and Arimoto \cite{koga-arimoto:98}
2531: who considered {\em non-overlapping} waiting 
2532: times between finite-alphabet processes
2533: and Gaussian processes.
2534: Finally, Shields \cite{shields:3}
2535: and Marton and Shields 
2536: \cite{marton-shields:1}
2537: considered waiting times with
2538: respect to Hamming distortion
2539: and for $\Xp$ and $\Yp$ having
2540: the same distribution over a
2541: finite alphabet. For
2542: the case of small
2543: distortion they showed,
2544: under some conditions, 
2545: that approximate matching 
2546: results like (\ref{eq:w-slln}) 
2547: and (\ref{eq:w-slln2}) 
2548: reduce to their natural 
2549: exact matching analogs as
2550: $D\to 0$.
2551: 
2552: \subsection{Match-Lengths and Practical Lempel-Ziv Coding}
2553: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2554: In the idealized coding scenario of
2555: the previous section we considered the
2556: case where a fixed-length message $X_1^n$
2557: is to be compressed using an infinitely 
2558: long database $Y_1^\infty$. But, in practice, 
2559: the reverse situation is much more common:
2560: We typically have a ``long'' message 
2561: $(X_1,X_2,\dots)$ to be compressed, and 
2562: only a finite-length database $Y_1^m$
2563: is available to the encoder and decoder.
2564: It is therefore natural (following
2565: the corresponding development in the
2566: case of lossless compression)
2567: to try and match ``as much as possible'' 
2568: from the message $(X_1,X_2,\dots)$ into the 
2569: database $Y_1^m$. 
2570: % \cite{wyner-ziv:3}, it is
2571: With this in mind we 
2572: define the {\em match-length}
2573: $L_m$ as the length $\ell$ of
2574: the longest prefix $X_1^\ell$ that
2575: matches somewhere in the database
2576: with distortion $D$ or less:
2577: \be
2578: L_m=\sup\{\ell \geq 1\;:\;
2579: \rho_\ell(X_1^\ell,Y_{j}^{j+\ell-1})\leq D,
2580: \;\;\mbox{for some}\;\;j=1,2,\ldots,m\}.
2581: \label{eq:Lm-def}
2582: \ee
2583: 
2584: Intuitively, there is a connection 
2585: between match-lengths and waiting times. 
2586: Long matches should mean short waiting times, 
2587: and vice versa. In the case of exact matching
2588: this connection was precisely formalized by
2589: Wyner and Ziv \cite{wyner-ziv:1}, who observed 
2590: that the following ``duality'' relationship 
2591: always holds:
2592: \be
2593: W_n\leq m
2594: \;\;\;\;
2595: % \mbox{if and only if}
2596: \Leftrightarrow
2597: \;\;\;\;
2598: L_m\geq n.
2599: \label{eq:easy-dual}
2600: \ee
2601: This is almost identical to the
2602: standard relationship in renewal
2603: theory between the number of
2604: events by a certain time and
2605: the time of the $n$th event
2606: (see, e.g., \cite{fellerII:book}).
2607: Wyner and Ziv \cite{wyner-ziv:1}
2608: utilized (\ref{eq:easy-dual})
2609: to translate their first order
2610: asymptotic results about $W_n$ 
2611: to corresponding results about
2612: $L_m$. 
2613: 
2614: Unfortunately this simple relationship 
2615: no longer holds in the case of
2616: {\em approximate} matching,
2617: when a distortion measure
2618: is introduced. Instead, the following 
2619: modified duality was employed
2620: in \cite{dembo-kontoyiannis} 
2621: to obtain corresponding 
2622: results in approximate matching 
2623: and lossy data compression:
2624: \be
2625: W_n\leq m\;\;\Rightarrow\;\;L_m\geq n
2626: \;\;\;\;
2627: \mbox{and}
2628: \;\;\;\;
2629: L_m\geq n\;\;\Rightarrow\;\;\inf_{k\geq n} W_k\leq m.
2630: \label{eq:duality}
2631: \ee
2632: In \cite{dembo-kontoyiannis} it is shown
2633: that (\ref{eq:duality}) can be used to
2634: deduce the asymptotic behavior of $L_m$
2635: from that of $W_n$,
2636: but this translation
2637: is not straightforward anymore.
2638: In fact, as we discuss in Section~5.2, 
2639: a somewhat more
2640: delicate analysis is needed
2641: in this case.
2642: Nevertheless,
2643: once the
2644: behavior of the waiting
2645: times is understood, 
2646: the first implication in 
2647: (\ref{eq:duality}) immediately 
2648: yields asymptotic {\em lower bounds} 
2649: on the behavior of the match-lengths.
2650: This is significant for data compression
2651: since long match-lengths usually mean
2652: good compression performance.
2653: Indeed, this observation allowed
2654: \cite{kontoyiannis-lossy1-1} to introduce
2655: a new lossy version of the Lempel-Ziv algorithm
2656: that achieves asymptotically optimal 
2657: compression performance for
2658: memoryless sources.
2659: The key characteristics of the 
2660: algorithm are that it has
2661: polynomial implementation 
2662: complexity, and that it
2663: achieves redundancy comparable
2664: to that of its lossless counterpart, 
2665: the FDLZ \cite{wyner-ziv:3}.
2666: 
2667: We also
2668: mention that, before
2669: \cite{kontoyiannis-lossy1-1},
2670: several practical (yet suboptimal)
2671: lossy versions of the Lempel-Ziv
2672: algorithm were introduced,
2673: perhaps most notably 
2674: by Steinberg and Gutman
2675: \cite{steinberg-gutman} and {\L}uczak 
2676: and Szpankowski \cite{luczak-szpankowski}.
2677: Roughly speaking, the reason for
2678: their suboptimal compression performance 
2679: was that the coding was done with respect
2680: to a database that had the same 
2681: distribution as the source. In view
2682: of the discussion in the previous
2683: section, it is clear that the asymptotic
2684: code-rate of these algorithms is
2685: $R_1(P,P,D)$, which is typically
2686: significantly larger than
2687: the optimal $R(D)=\inf_Q R_1(P,Q,D)$;
2688: see
2689: \cite{yang-kieffer:1} or 
2690: \cite{kontoyiannis-lossy1-1}
2691: for
2692: more detailed discussions.
2693: 
2694: 
2695: \subsection{Weighted Codebooks and Sphere-Covering}
2696: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2697: Here we describe a related question that was recently 
2698: considered in \cite{covering-TR:99}. In the classical 
2699: rate-distortion problem, one is interested in finding 
2700: ``efficient'' codebooks for describing the output of some 
2701: random source to within some tolerable distortion 
2702: level. In terms of data compression, a codebook is
2703: ``efficient'' when it contains relatively few codewords,
2704: so that it yields a code with a low rate. Here we are 
2705: interested in the more general problem of finding 
2706: codebooks with small ``mass.'' 
2707: 
2708: Let $\Xp$ be an
2709: $\iid$ process with marginal distribution
2710: $P$ on a finite alphabet $A$, 
2711: and take $\Ahat = A$ and $\rho$
2712: a distortion measure with the property
2713: that $\rho(x,y)=0$ if and only if $x=y$.
2714: Let $M:A\to(0,\infty)$ be
2715: an arbitrary positive function
2716: assigning mass $M^n(C_n)$ to
2717: subsets $C_n$ of $A^n$:
2718: $$M^n(C_n)\bydef\sum_{y_1^n\in C_n} M^n(y_1^n)
2719: 	\bydef\sum_{y_1^n\in C_n}\prod_{i=1}^n M(y_i).$$
2720: 
2721: The question of interest here can be
2722: stated as follows. Let $C_n$ be 
2723: a subset of $A^n$ (we think of $C_n$ 
2724: as the codebook) that
2725: nearly $D$-covers all of $A^n$, 
2726: i.e., with high probability, 
2727: every string $X_1^n$ generated
2728: by the source will match at 
2729: least one element of $C_n$ 
2730: with distortion $D$ or less:
2731: \be
2732: P^n\{\mbox{there is a $y_1^n\in C_n$ such that}\;
2733: 	\rho_n(X_1^n,y_1^n)\leq D\}\approx 1.
2734: \label{eq:cover}
2735: \ee
2736: If (\ref{eq:cover}) holds,
2737: how small can the mass of $C_n$ be?
2738: 	
2739: For example,
2740: taking $M$ identically equal to one, 
2741: this problem reduces to the rate-distortion
2742: question. Taking $M$ to be a different
2743: probability measure $Q$, it reduces to
2744: the classical hypothesis testing question,
2745: whereas $M=P$ (the source distribution) 
2746: yields ``converses''
2747: to some measure-concentration inequalities;
2748: see \cite{covering-TR:99}
2749: for a detailed treatment of 
2750: these and more general cases.
2751: 
2752: The next result characterizes the best growth
2753: exponent for the mass of an
2754: arbitrary codebook $C_n$.
2755: 
2756: \medskip
2757: 
2758: {\em Theorem~15: Weighted Codebooks \cite{covering-TR:99}:}
2759: Let $\Xp$ be an $\iid$ source on the finite
2760: alphabet $A=\Ahat$, and suppose that 
2761: $\rho(x,y)=0$ if and only if $x=y.$
2762: \begin{itemize}
2763: \item[$(\Leftarrow)$] Let $C_n$ be an arbitrary subset of $A^n$,
2764: and write $D$ for  the expected distance of a source string 
2765: $X_1^n$ from $C_n$:
2766: $$D=E_{P^n}[\min_{y_1^n\in C_n}
2767: 	\rho_n(X_1^n,y_1^n)].$$
2768: Then
2769: $$M^n(C_n)\geq e^{nr(D)}$$
2770: where the rate-function $r(D)=r(D;P,M)$ is defined by
2771: $$r(D)=r(D;P,M)=\inf_{(X,Y)}\{I(X;Y)+ E[\log M(Y)]\}$$
2772: and the infimum is taken over all jointly distributed
2773: random variables $(X,Y)$ with values in $A$, such that
2774: $X\sim P$ and $E[\rho(X,Y)]\leq D.$
2775: \item[$(\Rightarrow)$] 
2776: For every $D\geq 0$ 
2777: there is a sequence
2778: of codebooks $\{C^*_n\}$ such that 
2779: \ben
2780: &&
2781:         \limsup_{n\to\infty}\;
2782:         \frac{1}{n}\log M^n(C^*_n)\leq r(D)\\
2783: \mbox{and}&&
2784:         \limsup_{n\to\infty}\;
2785:         E_{P^n}[\min_{y_1^n\in C^*_n}
2786: 	\rho_n(X_1^n,y_1^n)]\leq D.
2787: \een
2788: \end{itemize} 
2789: 
2790: \medskip
2791: 
2792: The main ingredient in the proof of the direct 
2793: coding theorem in part~$(\Rightarrow)$ above 
2794: is provided by yet another version
2795: of the generalized AEP. Let $(X^*,Y^*)$ be a pair
2796: of random variables achieving the infimum in the
2797: definition of $r(D)$, and let $Q^*$ be the 
2798: distribution of $Y^*$. Now for $\delta>0$ 
2799: and $n\geq 1$ define the sets
2800: $${\cal G}_n=\{y_1^n\in A^n\;:\;
2801:         \hat{P}_{y_1^n}(b)\leq Q^*(b)+\delta,
2802:         \;\;\forall\, b\in A\}$$
2803: where $\hat{P}_{y_1^n}$ denotes the empirical
2804: distribution induced by $y_1^n$ on $A$.
2805: For each $n\geq 1$ define the ``conditioned''
2806: measure $Q^{(c)}_n$ on $A^n$ by conditioning
2807: the product measure $(Q^*)^n$ to the set
2808: ${\cal G}_n$. The next theorem provides
2809: the necessary version of the generalized 
2810: AEP in this case.
2811: 
2812: \medskip
2813: 
2814: {\em Theorem~16: Generalized AEP 
2815: for Conditioned Measures \cite{covering-TR:99}:}
2816: With the conditioned measures $Q^{(c)}_n$ defined
2817: as above, we have:
2818: $$\limsup_{n\to\infty} -\frac{1}{n}\log Q^{(c)}_n(B(X_1^n,D))
2819: 	\leq I(X^*;Y^*) \;\;\;\;\mbox{w.p.1.}$$
2820: 
2821: 
2822: 
2823: \section{Refinements of the Generalized AEP}
2824: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2825: As we saw in Section~3, the generalized AEP can
2826: be used to determine the first order asymptotic
2827: behavior of a number of interesting objects 
2828: arising in applications. For example, the 
2829: generalized AEP of Theorem~1
2830: $$-\frac{1}{n}\log Q^n(B(X_1^n,D))\to R_1(P,Q,D)
2831: 	 \;\;\;\;\mbox{w.p.1}$$
2832: immediately translated 
2833: (via the strong approximation of
2834: Theorem~13)
2835: to a strong-law-of-large-numbers
2836: (SLLN) result for the waiting times:
2837: $$\frac{1}{n}\log W_n \to R_1(P,Q,D)
2838:          \;\;\;\;\mbox{w.p.1.}$$
2839: 
2840: In this section we will prove refinements
2841: to the generalized AEP of Section~2.2, 
2842: and in Section~5 we will revisit the applications
2843: of the previous section and use these refinements
2844: to prove corresponding second order asymptotic 
2845: results.
2846: 
2847: To get some motivation, 
2848: let us consider for a moment 
2849: the simplest version of the 
2850: classical AEP, for an $\iid$ 
2851: process $\Xp$ with distribution 
2852: $P$ on the finite alphabet $A$. 
2853: The AEP here follows by a simple 
2854: application of the law 
2855: of large numbers,
2856: \be
2857: -\frac{1}{n}\log P^n(X_1^n)
2858: =
2859: \frac{1}{n}\sum_{i=1}^n[-\log P(X_i)]
2860: \to H
2861: \label{eq:oldPS}
2862: \ee
2863: where $H$ is the entropy of $P$.
2864: But (\ref{eq:oldPS}) contains 
2865: more information than that:
2866: It says that $-\log P^n(X_1^n)$ 
2867: is in fact equal to the partial sum 
2868: $S_n=\sum_{i=1}^n Z_i$ of the $\iid$
2869: random variables $Z_i=-\log P(X_i)$.
2870: Therefore we can apply the
2871: central limit theorem (CLT)
2872: or the law of the iterated 
2873: logarithm (LIL) to get more 
2874: precise information on the 
2875: convergence of the AEP.
2876: 
2877: The same strategy can be carried out
2878: for non-$\iid$ processes: Initially
2879: Ibragimov \cite{ibragimov:62}
2880: and then Philipp and Stout \cite{philipp-stout:book}
2881: showed that even when $\Xp$ is a Markov chain,
2882: or, more generally, a weakly dependent 
2883: random process, the quantities $-\log P^n(X_1^n)$ 
2884: can be approximated by the partial sums of
2885: an associated weakly dependent process.
2886: These results have found a number of
2887: applications in lossless data 
2888: compression and related areas 
2889: \cite{kontoyiannis-jtp}\cite{kontoyiannis-97}.
2890: 
2891: In this and the following section we will
2892: carry out a similar program in the lossy
2893: case. Throughout this section we will 
2894: adopt the notation and assumptions 
2895: of Section~2.2: Let
2896: $\Xp$ be a stationary ergodic
2897: process with first order marginal
2898: $P_1=P$ on $A$, and let $Q$ be
2899: an arbitrary probability measure
2900: on $\Ahat$. Define $\Dmin$ and $\Dav$,
2901: as before (as in equations (\ref{eq:Dmin}) 
2902: and (\ref{eq:Dav})), and 
2903: assume that $\Dmin<\Dav$ so that
2904: the distortion measure $\rho(X,Y)$ is not
2905: essentially constant in $Y$ with positive
2906: probability. We also impose here the
2907: additional assumption that $\rho$ has
2908: a finite third moment:
2909: \be
2910: D_3\bydef
2911: E_{P\times Q}[\rho^3(X,Y)]<\infty.
2912: \label{eq:third}
2913: \ee
2914: 
2915: The first result of this section 
2916: refines Theorem~1 by giving a more 
2917: precise asymptotic estimate of the
2918: quantity $-\log Q^n(B(X_1^n,D))$ in
2919: terms of the rate-function $R_1(P,Q,D)$
2920: and the empirical measure $\Phatn$ 
2921: induced by $X_1^n$ on $A$
2922: $$\Phatn\bydef\frac{1}{n}\sum_{i=1}^n\delta_{X_i}$$
2923: where $\delta_x$ denotes the measure assigning
2924: unit mass to $x\in A$.
2925: 
2926: \medskip
2927: 
2928: {\em Theorem~17: \cite{yang-zhang:99}:}
2929: Let $\Xp$ be a stationary ergodic process
2930: with marginal $P$ on $A$, and let $Q$ be
2931: an arbitrary probability measure on $\Ahat.$
2932: Assume that $D_3=E_{P\times Q}[\rho^3(X,Y)]$ 
2933: is finite. Then for any $D\in(\Dmin,\Dav)$:
2934: \be
2935: -\log Q^n(B(X_1^n,D))= nR_1(\hat{P}_n,Q,D)+\frac{1}{2}\log n + O(1)
2936: \;\;\;\;\mbox{w.p.1.}
2937: \label{eq:br}
2938: \ee
2939: 
2940: \medskip
2941: 
2942: Next we show that the most significant
2943: term in (\ref{eq:br}) can be approximated 
2944: by the partial sum of a weakly dependent
2945: random process. Recall the definition of 
2946: the $\alpha$-mixing coefficients of $\Xp$
2947: $$\alpha(k)\;=\;\sup\{|\BBP(A\cap B)-\BBP(A)\BBP(B)|\;:\;\;
2948: A\in\sigma(X_{-\infty}^0),\; B\in\sigma(X_{k}^{\infty})\}$$
2949: where $\sigma(X_i^j)$ is
2950: the $\sigma$-field generated by $X_i^j$. 
2951: The process $\Xp$ is called {\em $\alpha$-mixing}
2952: if $\alpha(k)\to 0$ as $k\to\infty$;
2953: see \cite{bradley} for more details.
2954: 
2955: We also need to recall some of the notation 
2956: from the proof of Theorem~1 in Section~2.2.
2957: For $x\in A$ and $\la\in\RL$, let $\LA_x(\la)$
2958: denote the log-moment generating function
2959: of the random variable $\rho(x,Y)$
2960: $$\LA_x(\la)\bydef \log E_Q\left(e^{\lambda\rho(x,Y)}\right)$$
2961: and note that the function $\LA(\la)$ defined
2962: in (\ref{eq:GEcheck}) can be written
2963: as $\LA(\la)=E_P[\LA_X(\la)]$.
2964: Also recall that for any $D\in(\Dmin,\Dav)$ there
2965: exists a unique $\la^*<0$ such that 
2966: $\LA'(\la^*)=D$. 
2967: 
2968: % \medskip
2969: 
2970: \newpage
2971: 
2972: {\em Theorem~18: \cite{dembo-kontoyiannis}:}
2973: Let $\Xp$ be a stationary $\alpha$-mixing process
2974: with marginal $P$ on $A$, and let $Q$ be
2975: an arbitrary probability measure on $\Ahat.$
2976: Assume that the $\alpha$-mixing coefficients
2977: of $\Xp$ satisfy
2978: \be
2979: \sum_{k=1}^\infty \alpha^t(k)<\infty, 
2980: \;\;\;\;\mbox{for some $t\in(0,1/3)$}
2981: \label{eq:LIL-cond}
2982: \ee
2983: and that $D_3=E_{P\times Q}[\rho^3(X,Y)]$ 
2984: is finite. Then for any $D\in(\Dmin,\Dav)$:
2985: $$nR_1(\hat{P}_n,Q,D) = nR_1(P,Q,D) + \sum_{i=1}^n 
2986: 	g(X_i) + O(\log\log n)
2987: \;\;\;\;\mbox{w.p.1}$$
2988: where 
2989: \be
2990: g(x)\bydef \LA(\la^*) -\LA_x(\la^*),\;\;\;\;
2991: x\in A.
2992: \label{eq:functiong}
2993: \ee
2994: 
2995: \medskip
2996: 
2997: Theorem~18 is a small generalization 
2998: of \cite[Theorem~3]{dembo-kontoyiannis}.
2999: Before giving its proof outline,
3000: we combine Theorems~17 and~18 to 
3001: show that, as promised, $-\log Q^n(B(X_1^n,D))$
3002: can be accurately approximated as the 
3003: partial sum of the weakly dependent
3004: random process $\{g(X_n)\}$.
3005: 
3006: \medskip
3007: 
3008: {\em Corollary~19: Second Order Generalized AEP:}
3009: Let $\Xp$ be a stationary $\alpha$-mixing process
3010: with marginal $P$ on $A$, and let $Q$ be
3011: an arbitrary probability measure on $\Ahat.$
3012: Assume that the $\alpha$-mixing coefficients
3013: of $\Xp$ satisfy
3014: (\ref{eq:LIL-cond})
3015: and that $D_3=E_{P\times Q}[\rho^3(X,Y)]$ is
3016: finite. Then for any $D\in(\Dmin,\Dav)$, and
3017: with $g(x)$ defined as in (\ref{eq:functiong}):
3018: $$
3019: -\log Q^n(B(X_1^n,D))= nR_1(P,Q,D) + \sum_{i=1}^ng(X_i) 
3020: 	+ \frac{1}{2}\log n + O(\log\log n)
3021: \;\;\;\;\mbox{w.p.1.}$$
3022: 
3023: \medskip
3024: 
3025: {\em Proof Outline for Theorem~18:}
3026: Adapting the argument leading from
3027: (22) to (24) of \cite{dembo-kontoyiannis},
3028: one easily checks that the result of 
3029: Theorem~18 holds as soon as 
3030: \be
3031: \liminf_{n \to \infty} \inf_{|\theta|<\delta} B_n(\theta)
3032: &>&0
3033: 	\;\;\;\;\mbox{w.p.1}
3034: 	\label{Bncond}\\
3035: \mbox{and}\;\;\;\;
3036: \limsup_{n \to \infty} \frac{ n A_n^2}{\log \log n}
3037: &<&\infty 
3038: 	\;\;\;\;\mbox{w.p.1}
3039: 	\label{Ancond}
3040: \ee
3041: where
3042: $A_n=
3043: n^{-1}
3044: \sum_{k=1}^n\zeta_k$ is
3045: the
3046: empirical mean of the centered
3047: random variables $\zeta_k=\Lambda_{X_k}'(\la^*)-D$,
3048: and $B_n(\theta)$ is the
3049: empirical mean
3050: of the non-negative random variables
3051: $\Lambda_{X_k}''(\la^*+\theta)$. 
3052: By the ergodic theorem we have,
3053: with probability one,
3054: \begin{eqnarray*}
3055: \liminf_{n \to \infty} \inf_{|\theta|<\delta} B_n(\theta)
3056: &\geq&
3057: \liminf_{n \to \infty} \frac{1}{n} \sum_{k=1}^n
3058: \inf_{|\theta|<\delta} \Lambda_{X_k}''(\la^*+\theta) \\
3059: &=& E_P \left [ \inf_{|\theta|<\delta} \Lambda_{X}''(\la^*+\theta)
3060: 	\right]
3061: \end{eqnarray*}
3062: and by Fatou's lemma and the continuity of
3063: the map $\theta \mapsto \Lambda_{x}''(\la^*+\theta)$
3064: it follows that
3065: $$
3066: \liminf_{\delta \downarrow 0}
3067: E_P \left[
3068: 	\inf_{|\theta|<\delta} \Lambda_X''(\la^*+\theta)
3069: 	\right]
3070: \geq E_P [\Lambda_X''(\la^*)] = \Lambda''(\la^*) > 0.
3071: $$
3072: This implies that 
3073: (\ref{Bncond}) holds once $\delta>0$
3074: is made small enough. [Note that the above 
3075: argument also avoids an incorrect -- but 
3076: also unnecessary -- application
3077: of the uniform ergodic theorem in the 
3078: derivation of \cite[eq.~(26)]{dembo-kontoyiannis}.]
3079: 
3080: Turning to (\ref{Ancond}), since $\la^*<0$, 
3081: it follows by the convexity of $\Lambda_x(\la)$ 
3082: that for any $x\in A$:
3083: % and the non-negativity 
3084: % of $\rho(\cdot,\cdot)$, 
3085: $$
3086: 0 \leq \Lambda_x'(\la^*) \leq \Lambda_x'(0) = E_Q[\rho(x,Y)].
3087: $$
3088: Consequently, H\"older's inequality and assumption
3089: (\ref{eq:third}) imply that the random variable
3090: $$|\zeta_k| \leq E_Q[\rho(X_k,Y)|X_k]+D$$
3091: has a finite third moment.
3092: Recall \cite{oodaira-yoshihara:71a}
3093: that the LIL holds for the partial sums $nA_n$ of a
3094: zero-mean, stationary process $\{\zeta_k\}$ with
3095: a finite third moment, as soon as 
3096: the $\alpha$-mixing coefficients 
3097: of $\{\zeta_k\}$ satisfy (\ref{eq:LIL-cond}).
3098: The observation 
3099: that $\zeta_k$ is a deterministic
3100: function of $X_k$ for all $k$
3101: completes the proof. \qed
3102: 
3103: \section{Applications -- Second Order Results}
3104: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3105: Here we revisit the applications considered
3106: in Section~3, and using the 
3107: ``second order generalized AEP''
3108: of Corollary~19 we prove second order 
3109: refinements for many of the results from 
3110: Section~3. In Section~5.1 we consider
3111: the problem of lossy data compression
3112: in the same setting as in Section~3.1.
3113: We use the second order AEP
3114: to determine the precise asymptotic
3115: behavior of the Shannon random codebooks,
3116: and show that, with probability one,
3117: they achieve optimal compression performance 
3118: up to terms of order $(\log n)$ bits. 
3119: Moreover, essentially the same compression 
3120: performance can be achieved universally. 
3121: For arbitrary variable-length codes 
3122: operating at a fixed rate level, we show 
3123: that the rate at which they can achieve
3124: the optimal rate of $n\calR(D)$ bits is
3125: at best of order $O(\sqrt{n})$ bits. 
3126: This is the 
3127: best possible redundancy rate as 
3128: long as the ``minimal coding variance'' 
3129: of the source is strictly positive. 
3130: For discrete $\iid$ sources, 
3131: a characterization is given of 
3132: when this variance can be zero.
3133: 
3134: In Section~5.2 we look at waiting times,
3135: and we prove a second order refinement to
3136: Theorem~14, and in Section~5.3 we 
3137: consider the problem of determining
3138: the asymptotic behavior of longest
3139: match-lengths. As discussed briefly
3140: in Section~3.4, their asymptotics
3141: can be deduced from the corresponding
3142: waiting-times results via duality.
3143: 
3144: \subsection{Lossy Data Compression}
3145: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3146: 
3147: \subsubsection{Random Codes and Second Order Converses}
3148: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3149: Here we consider the exact same setup as in Section~3.1:
3150: An $\iid$ source $\Xp$ with distribution $P$ on $A$
3151: is to be compressed with distortion $D$ or less with
3152: respect to a bounded distortion measure
3153: $\rho$, satisfying, as before, the usual
3154: assumption (\ref{eq:maximin}) --
3155: see the remark at the end of this
3156: section for
3157: its implications.
3158: We take 
3159: the reproduction alphabet $\Ahat$ to be
3160: finite, define $\Dbar$ as in (\ref{eq:Dbar}),
3161: and assume that $\Dbar>0$.
3162: 
3163: For $D\in(0,\Dbar)$, let $Q_n^*$, $n\geq 1$,
3164: denote the optimal reproduction distributions
3165: at distortion level $D$. Combining the
3166: strong approximation Theorem~9 with the
3167: second order generalized AEP
3168: of Corollary~19 and the discussion in 
3169: Section~3.1 yields:
3170: 
3171: % \medskip
3172: 
3173: \newpage
3174: 
3175: {\em Theorem~20: Pointwise Redundancy for I.I.D. Sources
3176: \cite{kontoyiannis-red:00}:}
3177: Suppose $\Xp$ is an $\iid$ source with distribution
3178: $P$ on $A$, and with rate-distortion
3179: function $\calR(D)$ (in bits). Let $Q_n^*$ 
3180: denote the optimal reproduction distributions 
3181: at distortion level $D\in(0,\Dbar)$, 
3182: and define the function
3183: $h(x)=(\log_2 e)g(x)$, $x\in A$,
3184: with $g$ defined as in (\ref{eq:functiong}).
3185: Then:
3186: \begin{itemize}
3187: \item[(a)]
3188: The codes based on almost any realization
3189: of the Shannon random codebooks according
3190: to the measures $\{Q_n^*\}$ have codelengths
3191: $\ell_n(X_1^n)$
3192: satisfying
3193: $$
3194: \ell_n(X_1^n)\leq
3195: n\calR(D)
3196: +\sum_{i=1}^n h(X_i)
3197: +4\log n
3198: \;\;\;\;\mbox{bits, eventually, w.p.1.}$$
3199: \item[(b)]
3200: The codes based on almost any realization 
3201: of the universal Shannon random codebooks 
3202: have codelengths $\ell_n(X_1^n)$ satisfying
3203: $$
3204: \ell_n(X_1^n)\leq
3205: n\calR(D)
3206: +\sum_{i=1}^n h(X_i)
3207: +(4+|\Ahat|)\log n
3208: \;\;\;\;\mbox{bits, eventually, w.p.1.}$$
3209: \end{itemize}
3210: 
3211: \medskip
3212: 
3213: We remark that the coefficients of the 
3214: $(\log n)$ terms in (a) and (b) above 
3215: are not the best possible, and can be 
3216: significantly improved; see 
3217: \cite{konto-zhang:00} for more details.  
3218: 
3219: Perhaps somewhat surprisingly, 
3220: it turns out that the performance 
3221: of the above random codes is 
3222: optimal up to terms of order 
3223: $(\log n)$ bits. 
3224: Recall that a {\em code $C_n$ operating 
3225: at distortion level $D\geq 0$} is
3226: defined by a triplet $(B_n,\phi_n,\psi_n)$ where:
3227: \begin{itemize}
3228: \item[$(a)$]
3229: $B_n$ is a subset of $\Ahatn$, called the {\em codebook},
3230: \item[$(b)$]
3231: $\phi_n:A^n\to B_n$ is the {\em encoder},
3232: \item[$(c)$]
3233: $\psi_n:B_n\to \{0,1\}^*$ is a
3234: uniquely decodable map,
3235: \end{itemize}
3236: such that 
3237: $$\rho_n(x_1^n,\phi_n(x_1^n))\leq D,
3238: \;\;\;\;\;\;\mbox{for all}\;\;x_1^n\in A^n.$$
3239: The codelengths $\ell_n(X_1^n)$ achieved by
3240: such a code are simply:
3241: $$\ell_n(x_1^n)=\;
3242: \mbox{length of}\;[\psi_n(\phi_n(x_1^n))]
3243: \;\;\;\;\mbox{bits}.$$
3244: 
3245: \medskip
3246: 
3247: {\em Theorem~21: Pointwise Converse for I.I.D. Sources
3248: \cite{kontoyiannis-red:00}:}
3249: Let $\Xp$ be an $\iid$ source with distribution
3250: $P$ on $A$, and let $\{C_n\}$ be an arbitrary
3251: sequence of codes operating at distortion
3252: level $D\in(0,\Dbar)$, with associated
3253: codelengths $\{\ell_n\}$. Then:
3254: $$
3255: \ell_n(X_1^n)\geq
3256: n\calR(D)
3257: +\sum_{i=1}^n h(X_i)
3258: -\log n
3259: \;\;\;\;\mbox{bits, eventually, w.p.1}$$
3260: where $h(x)$
3261: is defined as in Theorem~20.
3262: 
3263: \medskip
3264: 
3265: The proof of Theorem~21 in \cite{kontoyiannis-red:00}
3266: uses techniques quite different to those developed in
3267: this paper. In particular, the key step in the proof 
3268: is established by an application of
3269: the generalized Kuhn-Tucker conditions of Bell and
3270: Cover \cite{bell-cover:88}.
3271: 
3272: Theorems~20 and~21 are next combined to 
3273: yield ``second order'' refinements to 
3274: Shannon's classical source coding theorem. 
3275: For a source $\Xp$ as in Theorem~21 and 
3276: a $D\in(0,\Dbar)$, the {\em minimal coding
3277: variance $\sigma^2=\sigma^2(P,D)$ of
3278: source $P$ at distortion level $D$}
3279: is
3280: \be
3281: \sigma^2=\sigma^2(P,D)\bydef\VAR[h(X_1)]
3282: \label{eq:mincv}
3283: \ee
3284: with $h(x)$ as in Theorem~20.
3285: 
3286: % \medskip
3287: 
3288: \newpage
3289: 
3290: {\em Theorem~22: Second Order Source Coding Theorems
3291: \cite{kontoyiannis-red:00}:}
3292: Let $\Xp$ be an $\iid$ source with 
3293: distribution $P$ on $A$ and with
3294: rate-distortion function $\calR(D)$
3295: (in bits).
3296: For $D\in(0,\Dbar)$:
3297: \begin{itemize}
3298: \item[]{\bf (CLT)}
3299: There is a sequence of
3300: random variables $G_n=G_n(P,D)$ such that, for any
3301: sequence of codes $\{C_n,\ell_n\}$ operating
3302: at distortion level $D$, we have
3303: \be
3304: \ell_n(X_1^n)-
3305: n\calR(D)\geq \sqrt{n}G_n
3306:         \;\;\;\;\mbox{bits, eventually, w.p.1}
3307: \label{eq:clt}
3308: \ee
3309: and the $G_n$ converge in distribution
3310: to a Gaussian random variable
3311: $$G_n\weakly N(0,\sigma^2)$$
3312: where $\sigma^2=\sigma^2(P,D)$ 
3313: is the minimal coding variance.
3314: \item[]{\bf (LIL)}
3315: With $\sigma^2$ as above,
3316: for any sequence of codes
3317: $\{C_n,\ell_n\}$ operating
3318: at distortion level $D$:
3319: \ben
3320: \limsup_{n\to\infty}\;
3321: \frac{\ell_n(X_1^n)-n\calR(D)}{\sqrt{2n\log\log n}}
3322: &\geq& \sigma\;\;\;\;\mbox{w.p.1}\\
3323: \liminf_{n\to\infty}\;
3324: \frac{\ell_n(X_1^n)-n\calR(D)}{\sqrt{2n\log\log n}}
3325: &\geq& -\sigma\;\;\;\;\mbox{w.p.1.}
3326: \een
3327: \item[]{\bf (\boldmath$\Rightarrow$)}
3328: Moreover, there exist codes $\{C_n,\ell_n\}$
3329: operating at distortion level $D$, that 
3330: asymptotically achieve equality
3331: {\em universally} in all these 
3332: lower bounds.
3333: \end{itemize}
3334: 
3335: \medskip
3336: 
3337: {\em Remark on Assumption (\ref{eq:maximin}):}
3338: When the distortion measure does not satisfy 
3339: assumption (\ref{eq:maximin}) [as, for example, 
3340: when $\rho(x,y)=(x-y)^2$ with $A=\RL$ and $\Ahat$
3341: a finite subset of $\RL$], we can modify $\rho$
3342: to $\rho'(x,y)=\rho(x,y)-f(x)$, with 
3343: $f(x)=\min_{y \in \hat{A}} \, \rho(x,y)$, 
3344: so that $\rho'$ satisfies (\ref{eq:maximin}).
3345: Then, to generate codes operating at 
3346: distortion level $D$ with respect to $\rho$, 
3347: we can construct random codebooks
3348: as before but do the encoding with respect
3349: to $\rho'(x,y)$ at the {\it random} 
3350: distortion level $D_n= D - E_{\hat{P}_n}(f(X))$. 
3351: It is not hard to check that
3352: \cite[Theorem 2]{dembo-kontoyiannis}
3353: can be extended to apply when $D$ is 
3354: replaced by the sequence $\{D_n\}$.
3355: Since $D_n \to D - E_P(f(X))$ as $n\to\infty$,
3356: this results in the first order 
3357: approximation 
3358: $$-\frac{1}{n}\log Q^*_n(B(X_1^n,D_n))\approx
3359: R_1^{\rho'}(\hat{P}_n,Q^*,D_n).$$
3360: Simple algebra then shows that 
3361: $$
3362: R_1^{\rho'}(\hat{P}_n,Q^*,D_n)=R_1^\rho(\hat{P}_n,Q^*,D)
3363: $$
3364: implying that all the results of Section 5.1.1
3365: remain valid [despite the fact that $\rho$
3366: does not satisfy  (\ref{eq:maximin})], with
3367: the function $h(\cdot)$ taken in terms of 
3368: the log-moment generating function 
3369: $\Lambda_x(\la)$ of the {\it original} 
3370: distortion measure $\rho$ (and not that of 
3371: the modified $\rho'$).
3372: 
3373: 
3374: \subsubsection{Critical Behavior}
3375: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3376: In view of Theorems~20 and~21 above,
3377: the codelengths $\ell_n^*(X_1^n)$ of 
3378: the best code operating at distortion 
3379: level $D$ have:
3380: $$\ell^*_n(X_1^n)\approx
3381: n\calR(D)
3382: +\sum_{i=1}^n h(X_i)
3383: +O(\log n)
3384: \;\;\;\;\mbox{bits.}$$
3385: This reveals an interesting 
3386: dichotomy in the behavior of 
3387: the ``pointwise'' redundancy of
3388: the best code:
3389: \begin{itemize}
3390: \item
3391: Either the minimal coding variance
3392: $\sigma^2$ (recall (\ref{eq:mincv})) 
3393: is nonzero, in which case the best
3394: rate at which optimality can
3395: be achieved is of order $\sqrt{n}$
3396: bits by the CLT;
3397: \item
3398: or $\sigma^2=0$, and the best redundancy 
3399: rate is of order $(\log n)$ bits
3400: (cf. \cite{zhang-yang-wei:I}).
3401: \end{itemize}
3402: Under certain conditions, in this section 
3403: we give a precise characterization of 
3404: when each of these two cases can occur. 
3405: Before stating it, we briefly discuss 
3406: two examples to gain some intuition.
3407:  
3408: \medskip
3409: 
3410: {\em Example~4: Lossless Compression:}
3411: Lossless data compression 
3412: can be considered as an extreme case 
3413: of lossy compression, where $\Xp$ is
3414: an $\iid$ source with distribution $P$
3415: on a finite set $A=\Ahat$,
3416: and the distortion level $D$ 
3417: is set to zero. Here it is 
3418: well-known that (ignoring the integer 
3419: length constraints) the best code is 
3420: given by the idealized Shannon code,
3421: $\ell_n(X_1^n)=-\log_2 P^n(X_1^n)$.
3422: Accordingly, the upper 
3423: and lower bounds of Theorems~20 
3424: and~21 say that the best code has 
3425: codelengths 
3426: $$\ell_n(X_1^n) = n\calH(P)
3427: 	+\sum_{i=1}^n h(X_i)$$
3428: where $\calH(P)$ is the entropy of $P$
3429: in bits, and with
3430: $$h(x)\bydef-\log_2 P(x) - \calH(P),
3431: 	\;\;\;\;x\in A.$$
3432: When is $\sigma^2=0$? By its
3433: definition (\ref{eq:mincv}),
3434: $\sigma^2$ is zero if and only if
3435: the function $h(x)$ is constant over $x$,
3436: which, in this case, can only happen if
3437: $P(x)$ is constant over $x\in A$. 
3438: Therefore, here:
3439: {\em $\sigma^2=0$ if and only if
3440: the source has a uniform distribution
3441: over $A$.} 
3442: 
3443: \medskip
3444:  
3445: {\em Example~5: Binary Source with Hamming Distortion:}
3446: Consider the simplest
3447: non-trivial lossy example: 
3448: Let $\Xp$ be an $\iid$ source
3449: with Bernoulli($p$) distribution 
3450: (for some $p\in(0,1/2]$),
3451: let $A=\Ahat=\{0,1\}$,
3452: and take $\rho$ to be Hamming
3453: distortion: $\rho(x,y)=|x-y|$.
3454: For $D\in(0,p)$ it is not
3455: hard to evaluate all the 
3456: relevant quantities 
3457: explicitly
3458: (see, e.g., 
3459: \cite[Example~2.7.1]{berger:book}
3460: or \cite[Theorem~13.3.1]{cover:book}).
3461: In particular,
3462: the optimal reproduction
3463: distribution $Q^*$ is
3464: Bernoulli($q$),
3465: with $q=(p-D)/(1-2D)$, and
3466: our function of interest is:
3467: $$h(x)=
3468: -\log_2\left(\frac{P(x)}{1-D}\right)
3469:         -E_P\left[
3470:                 -\log_2\left(\frac{P(X_1)}{1-D}\right)
3471:                 \right].$$
3472: Recalling that the minimal coding
3473: variance is zero if and only if
3474: $h(x)$ is constant, from the above
3475: expression we see that, similarly
3476: to the previous example, also 
3477: here:
3478: {\em $\sigma^2=0$ if and only if
3479: the source has a uniform distribution}.
3480: 
3481: \medskip
3482: 
3483: For discrete sources, the next result gives
3484: conditions under which the characterization 
3485: suggested by these two examples remains valid.
3486: Suppose $A=\Ahat=\{a_1,a_2,\ldots,a_k\}$
3487: is a finite set, write $\rho_{ij}$ for 
3488: $\rho(a_i,a_j)$, and assume
3489: that $\rho$ is symmetric
3490: and that $\rho_{ij}=0$ if and only if
3491: $i=j$. We call $\rho$ a 
3492: {\em permutation distortion measure}, 
3493: if all rows of the matrix 
3494: $(\rho_{ij})_{i,j=1,\ldots,k}$
3495: are permutations of one another.
3496: 
3497: \medskip
3498: 
3499: {\em Theorem~23: Variance Characterization
3500: \cite{dembo-kontoyiannis:crit:01}:}
3501: Let $\Xp$ be a discrete source with
3502: distribution $P$ and rate-distortion 
3503: function $R(D)$. Assume that $R(D)$ 
3504: is strictly convex over $(0,\Dbar)$. 
3505: There are exactly two possibilities:
3506: \begin{itemize}
3507: \item[(a)]
3508: Either $\sigma^2=\sigma^2(P,D)$ is only
3509: zero for finitely many $D\in(0,\Dbar).$
3510: \item[(b)]
3511: Or $\sigma^2=\sigma^2(P,D)\equiv 0$ 
3512: for {\em all} $D\in(0,\Dbar)$, in which 
3513: case $P$ is the uniform distribution
3514: on $A$ and $\rho$ is
3515: a permutation distortion measure.
3516: \end{itemize}
3517: 
3518: \medskip
3519: 
3520: A general discussion of this
3521: problem, including the case of continuous
3522: sources, is given in 
3523: \cite{dembo-kontoyiannis:crit:01}.
3524: Also, in the lossless case,
3525: the problem of characterizing 
3526: when $\sigma^2=0$ for sources 
3527: with memory is dealt with 
3528: in \cite{kontoyiannis-97}.
3529: 
3530: Before moving on to waiting times and match-lengths 
3531: we mention that, in a somewhat similar
3532: vein, the problem of understanding the best
3533: {\em expected}
3534: redundancy rate in lossy data
3535: compression has also been recently considered in 
3536: \cite{zhang-yang-wei:I}\cite{yang-zhang:II}%
3537: \cite{yang-zhang:III}\cite{ishii-yamamoto:97}.
3538: 
3539: 
3540: \subsection{Waiting Times}
3541: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3542: Next we turn to waiting times.
3543: Recall that, given $D\geq 0$ 
3544: and two independent realizations 
3545: of the stationary ergodic 
3546: processes $\Xp$ and $\Yp$,
3547: the waiting time $W_n$ was
3548: defined as the time of the
3549: first appearance of $X_1^n$
3550: in $\Yp$ with distortion $D$
3551: or less (see (\ref{eq:Wn-def})
3552: for the precise definition).
3553: In Theorem~14 we gave conditions
3554: that identified the first order
3555: limiting behavior of $W_n$.
3556: In particular, when $\Yp$ is
3557: $\iid$, it was shown in 
3558: Theorem~14~(a)
3559: that
3560: \be
3561: \frac{\log W_n}{n}\to R_1(P,Q,D)
3562: \;\;\;\;\mbox{w.p.1}
3563: \label{eq:w-slln3}
3564: \ee
3565: where $P$ and $Q$ are the first
3566: order marginals of $\Xp$ 
3567: and $\Yp$, respectively.
3568: 
3569: The next result gives conditions
3570: under which the SLLN-type
3571: statement of (\ref{eq:w-slln3})
3572: can be refined to a CLT and
3573: a LIL.
3574: 
3575: \medskip
3576: 
3577: {\em Theorem~24: CLT and LIL for Waiting Times:}
3578: Let $\Xp$ be a stationary $\alpha$-mixing process
3579: and $\Yp$ be an $\iid$ process, with marginal
3580: distributions $P$ and $Q$, on $A$ and $\Ahat$,
3581: respectively. Assume that the $\alpha$-mixing 
3582: coefficients of $\Xp$ satisfy (\ref{eq:LIL-cond})
3583: and that $D_3=E_{P\times Q}[\rho^3(X,Y)]$ is
3584: finite. Then for any $D\in(\Dmin,\Dav)$ the
3585: following series converges
3586: \be
3587: \sigma^2\bydef E_P[g^2(X_1)]+2\sum_{k=2}^\infty E_P[g(X_1)g(X_k)]
3588: \label{eq:variance}
3589: \ee
3590: with $g(x)$ defined as in (\ref{eq:functiong}),
3591: and, moreover:
3592: \begin{itemize}
3593: \item[]{\bf (CLT)} With $R_1=R_1(P,Q,D)$:
3594: $$\frac{\log W_n \;-\; nR_1}{\sqrt{n}}
3595: 	\weakly N(0,\sigma^2).$$
3596: \item[]{\bf (LIL)}
3597: The set of limit points of the sequence
3598: $$\left\{
3599: 	\frac{\log W_n \;-\; nR_1}
3600: 	     {\sqrt{2n\log\log n}}
3601:   \right\},\quad n\geq 3$$
3602: coincides with $[-\sigma,\sigma]$, with
3603: probability one.
3604: \end{itemize}
3605: 
3606: \medskip
3607: 
3608: {\em Proof Outline:}
3609: For a bounded distortion measure 
3610: $\rho$, Theorem~24 was proved in 
3611: \cite{dembo-kontoyiannis}. 
3612: To obtain the more general statement above
3613: combine the strong approximation
3614: of Theorem~13 with the second order
3615: AEP in Corollary~19 to get:
3616: \be
3617: \log W_n=
3618: nR_1(P,Q,D) + \sum_{i=1}^ng(X_i) + O(\log n)
3619: \;\;\;\;\mbox{w.p.1.}
3620: \label{eq:inter}
3621: \ee
3622: Since $\Xp$ satisfies the mixing
3623: assumption (\ref{eq:LIL-cond}),
3624: so does the process $\{g(X_n)\}$.
3625: Also, since $\la^*<0$, the function
3626: $\LA_x(\la^*)$ is bounded above by zero,
3627: and by Jensen's inequality it is
3628: bounded below by $\la^*E_Q[\rho(x,Y)].$
3629: Therefore,
3630: $$|\LA_x(\la^*)|\leq |\la^*|E_Q[\rho(x,Y)]$$
3631: and this, together with 
3632: H\"older's inequality and
3633: the definition of $g(x),$ implies
3634: that $E_P[|g(X_1)|^3]<\infty$.
3635: Therefore we can apply the CLT
3636: of \cite[Theorem~1.7]{peligrad:86} 
3637: to the process $\{g(X_n)\}$
3638: in order to deduce the CLT-part 
3639: of the theorem from (\ref{eq:inter}).
3640: Similarly, applying the LIL of
3641: \cite{oodaira-yoshihara:71a}
3642: to $\{g(X_n)\}$, from (\ref{eq:inter})
3643: we get the LIL-part of the theorem.
3644: \qed
3645: 
3646: \medskip
3647: 
3648: {\em Remark 5:} When the variance
3649: $\sigma^2$
3650: in (\ref{eq:variance}) is positive,
3651: then the {\em functional} versions of 
3652: the above CLT and LIL given in 
3653: \cite{dembo-kontoyiannis} still hold, 
3654: under exactly the conditions of Theorem~24.
3655: (This follows by
3656: applying the functional CLT of
3657: \cite[Theorem~1.7]{peligrad:86}
3658: and the functional LIL of
3659: \cite[Theorem~1~(IV)]{oodaira-yoshihara:71b}.)
3660: 
3661: \subsection{Match-Lengths and Duality}
3662: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3663: 
3664: Finally we turn to our last application,
3665: match-lengths. Recall that, given a 
3666: distortion level $D\geq 0$ and two 
3667: independent realizations of the 
3668: processes $\Xp$ and $\Yp$, the match-length 
3669: $L_m$ is defined as the length $\ell$ 
3670: of the longest prefix $X_1^\ell$ that 
3671: appears (with distortion $D$ or less) 
3672: starting somewhere in the ``database'' 
3673: $Y_1^m.$ See (\ref{eq:Lm-def}) for the 
3674: precise definition. As we briefly mentioned
3675: in Section~3.4, there is a duality
3676: relationship between match-lengths
3677: and waiting times: Roughly speaking,
3678: long matches mean short waiting times,
3679: and vice-versa;
3680: see (\ref{eq:duality}).
3681: 
3682: Although the relation (\ref{eq:duality}) 
3683: is not as simple as the duality 
3684: (\ref{eq:easy-dual}) for exact matching,
3685: it is still possible to use 
3686: (\ref{eq:duality}) to translate
3687: the asymptotic results for $W_n$
3688: to corresponding results for $L_m$.
3689: These are given in Theorem~25 below.
3690: This translation, carried out
3691: in \cite{dembo-kontoyiannis}, is 
3692: more delicate than in the case of
3693: exact matching. For example, in
3694: order to prove the CLT for the 
3695: match-lengths $L_m$ one
3696: invokes
3697: the functional CLT for
3698: the waiting times (see Remark~5 above
3699: and the proof of Theorem~4 in 
3700: \cite{dembo-kontoyiannis}).
3701: 
3702: \medskip
3703: 
3704: {\em Theorem~25: Match-Lengths Asymptotics:}
3705: Let $\Xp$ be a stationary process
3706: and $\Yp$ be an $\iid$ process, with marginal
3707: distributions $P$ and $Q$, on $A$ and $\Ahat$,
3708: respectively. Assume 
3709: that $D_3=E_{P\times Q}[\rho^3(X,Y)]$ is
3710: finite. Then for any $D\in(\Dmin,\Dav)$ we have
3711: $$\mbox{\bf (LLN)}\hspace{1.65in} 
3712: \frac{L_m}{\log m}\,\to\,\frac{1}{R_1}\;\;\;\;
3713: \mbox{w.p.1}\hspace{2.4in}$$
3714: where $R_1=R_1(P,Q,D)$.
3715: If, moreover, 
3716: % $\Xp$ is $\alpha$-mixing and its 
3717: the $\alpha$-mixing coefficients of $\Xp$
3718: satisfy (\ref{eq:LIL-cond}) and 
3719: the variance $\sigma^2$ in (\ref{eq:variance}) 
3720: is nonzero, then, with $\tau^2\bydef \sigma^2R_1^{-3}$,
3721: we have,
3722: \ben
3723: &\mbox{\bf (CLT)}&
3724: 	\hspace{1.6in}
3725: 	\frac{L_m-\frac{\log m}{R_1}}{\sqrt{\log m}}
3726: 	\,\weakly\,N(0,\tau^2)
3727: 	\hspace{2.2in}\\
3728: &\mbox{\bf (LIL)}&
3729: 	\hspace{1.2in} 
3730: 	\limsup_{
3731: 	m\to\infty}
3732: 	\,\frac{L_m-\frac{\log m}{
3733: R_1
3734: }}
3735: 	{\sqrt{2\log m\,\log\log\log m}}\,
3736: 	=\,\tau\;\;\;\;
3737: 	\mbox{w.p.1.}
3738: \een
3739: 
3740: \medskip
3741: 
3742: The results of Theorem~25 were
3743: proved in \cite{dembo-kontoyiannis}
3744: for any bounded distortion measure 
3745: $\rho$.  The slightly 
3746: more general version stated above
3747: is proved in exactly the same way, 
3748: using the results of Section~4 
3749: in place of Theorems~2 and~3 
3750: of \cite{dembo-kontoyiannis}.
3751: 
3752: \section{Random Fields -- First Order Results}
3753: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3754: This and the following section are devoted 
3755: to generalizations of the results of
3756: Sections~2--5 to the case of random fields.
3757: Specifically, the role of the processes $\Xp$
3758: and $\Yp$ will now be played by stationary
3759: ergodic random fields
3760: $\Xp=\{X_u\;;\;u\in\IN^d\}$
3761: and $\Yp=\{Y_u\;;\;u\in\IN^d\}$.
3762: As we will see, many of the problems 
3763: that we considered have natural 
3764: analogs in this case, and 
3765: the overall theme
3766: carries over:
3767: The generalized AEP and its refinement
3768: can be extended to random fields,
3769: and the corresponding questions in
3770: data compression and pattern matching
3771: can be answered following 
3772: the same path as before.
3773: 
3774: \subsection{Notation and Definitions}
3775: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3776: The following definitions and notation
3777: will remain in effect throughout Sections~6
3778: and~7.
3779: 
3780: We consider two random fields
3781: $\Xp=\{X_u\;;\;u\in\IN^d\}$
3782: and $\Yp=\{Y_u\;;\;u\in\IN^d\}$,
3783: $d\geq 2$, taking values 
3784: in $A$ and $\Ahat$, 
3785: respectively, and indexed
3786: by points $u=(u_1,u_2,\ldots,u_d)$
3787: on the integer lattice $\IN^d$.
3788: As before, $A$ and $\Ahat$
3789: are complete, separable 
3790: metric spaces, equipped with 
3791: their Borel
3792: $\sigma$-fields ${\cal A}$
3793: and $\hat{\cal A}$, 
3794: respectively.
3795: Let $\BBP$ and $\BBQ$
3796: denote the 
3797: (infinite-dimensional)
3798: measures of the entire random
3799: fields $\Xp$ and $\Yp$.
3800: Unless explicitly stated
3801: otherwise, we always assume
3802: that $\Xp$ and $\Yp$ are
3803: independent of each other.
3804: 
3805: Throughout the rest of the
3806: paper we will
3807: assume that $\Xp$ and $\Yp$
3808: are stationary and ergodic. 
3809: To be precise, by that we mean 
3810: that the Abelian group of 
3811: translations
3812: $\{T_u\,:\,u\in\IN^d\}$
3813: acts on both 
3814: $(A^{\IN^d},{\cal A}^{\IN^d},\BBP)$
3815: and
3816: $(\hat{A}^{\IN^d},\hat{\cal A}^{\IN^d},\BBQ)$
3817: in a measure-preserving,
3818: ergodic manner; see \cite{krengel:book}
3819: for a detailed exposition.
3820: 
3821: For $v,w\in\IN^d$,
3822: the distance between $v$ and $w$ 
3823: is defined by
3824: $$d(v,w)\bydef\max_{1\leq i\leq d}|v_i-w_i|$$
3825: and the distance between two subsets 
3826: $V,W\subset\IN^d$ is
3827: $$d(V,W)\bydef\inf_{v\in V,\;w\in W} d(v,w).$$
3828: Given $v,w\in\IN^d$, we let
3829: $[v,w]=\{u\in\IN^d\;:\;
3830: \mbox{$v_j\le u_j\leq w_j$ for all $j$}\}$,
3831: where $[v,w]$ is empty in case $v_j>w_j$ for some $j$.
3832: 
3833: We write $C(n)$ for the 
3834: $d$-dimensional cube of side $n\geq1$,
3835: \ben
3836: C(n)=
3837: 	\{u\in\IN^d\;:\;\mbox{$1\leq u_j\leq n$ for all $j$}\}
3838: \een
3839: and $[0,\infty)$ for the ``infinite cube''
3840: % and for $v\in\IN^d$, 
3841: % $$X_v^\infty\bydef\{X_u\;:\;
3842: % \mbox{$u_j\geq v_j$ for all $j$}\}.$$
3843: % and by $v+U$ we denote the translate
3844: % $$\{v+u\in\IN^d\;:\;u\in U\}.$$
3845: % and similarly
3846: % $[v,\infty)=\{u\in\IN^d\;:\;
3847: % \mbox{$u_j\geq v_j$ for all $j$}\}$.
3848: % and denote the 
3849: % and $C(\infty)$ denotes the ``infinite cube''
3850: $$[0,\infty)=\{u\in\IN^d\;:\;
3851: 	\mbox{$u_j\geq 0$ for all $j$}\}.$$
3852: For an arbitrary subset 
3853: $U\subset\IN^d$ we let
3854: $|U|$ denote its size;
3855: for example, $|C(n)|=n^d$.
3856: Also for $U\subset\IN^d$ we write
3857: $$X_U\bydef\{X_u\;;\;u\in U\}$$
3858: so that, in particular,
3859: $X_{[0,\infty)} = \{X_u\;;\;
3860: 	\mbox{$u_j\geq 0$ for all $j$}\}.$
3861: For 
3862: $V\subset\IN^d$ and $u\in\IN^d$ we
3863: let $u+V$ denote the translate
3864: $$u+V=\{u+v\;:\;v\in V\}.$$
3865: 
3866: For each $n\geq 1$, let $P_n$ denote the 
3867: marginal distribution of $X_{C(n)}$
3868: on $A^{n^d}$, and similarly write
3869: $Q_n$ for the distribution of $Y_{C(n)}$.
3870: Let $\rho:A\times\Ahat\to[0,\infty)$
3871: be an arbitrary nonnegative (measurable)
3872: function, and define a sequence of
3873: single-letter distortion measures 
3874: $\rho_n:A^{n^d}\times\Ahatnd\to[0,\infty)$,
3875: $n\geq 1$, by
3876: \ben
3877: \rho_n(x_{C(n)},y_{C(n)})\bydef\frac{1}{n^d}
3878: 	\sum_{u\in C(n)}\rho(x_u,y_u)
3879: \;\;\;\;x_{C(n)}\in A^{n^d},\;y_{C(n)}\in\Ahatnd.
3880: \een
3881: Given $D\geq 0$ and $x_{C(n)}\in A^{n^d}$,
3882: we write 
3883: $B(x_{C(n)},D)$ for
3884: the distortion-ball of radius $D$:
3885: $$B(x_{C(n)},D)=
3886: \left\{
3887: y_{C(n)}\in\Ahatnd\;:\;\rho_n(x_{C(n)},y_{C(n)})\leq D
3888: \right\}.$$
3889: 
3890: \subsection{Generalized AEP}
3891: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3892: It is well-known that the classical AEP
3893: \ben
3894: -\frac{1}{n}\log P_n(X_1^n) \to H(\BBP)
3895: \;\;\;\;\mbox{w.p.1}
3896: \een
3897: generalizes to the case of finite-alphabet 
3898: random fields on $\IN^d$, as well 
3899: as to other amenable group actions
3900: \cite{ornstein-weiss:83}. In this 
3901: section we extend both versions of the
3902: generalized AEP of Theorems~1 and~4
3903: to the case of random fields on $\IN^d$.
3904: 
3905: \paragraph{$\Yp$ is i.i.d. }
3906: In the notation of Section~6.1,
3907: we take $\Xp$ to be a stationary 
3908: ergodic random field with first 
3909: order marginal $P_1=P,$ and 
3910: $\Yp$ to be i.i.d. with first 
3911: order marginal $Q_1=Q$. 
3912: We define $\Dmin$ and $\Dav$ 
3913: as in the one-dimensional 
3914: case (recall equations (\ref{eq:Dmin})
3915: and (\ref{eq:Dav})), and assume
3916: that $\rho(x,y)$ is not essentially 
3917: constant for ($\BBP$-almost) 
3918: all $x\in A$, that is, $\Dmin < \Dav.$
3919: 
3920: A simple examination of the proof of 
3921: Theorem~1 shows that it
3922: extends {\sl verbatim} to the
3923: case of random fields, with the
3924: only difference that instead of the
3925: usual ergodic theorem we now need
3926: to invoke the ergodic theorem
3927: for $\IN^d$ actions; see
3928: \cite[Chapter~6]{krengel:book}.
3929: We thus obtain:
3930: 
3931: \medskip
3932: 
3933: {\em Theorem~26. Generalized AEP when $\Yp$ is $\iid$:}
3934: Let $\Xp$ be a stationary ergodic random field on
3935: $\IN^d$ and $\Yp$ be $\iid$, with marginal distributions
3936: $P$ and $Q$ on $A$ and $\Ahat$, respectively.
3937: Assume that $\Dav=E_{P\times Q}[\rho(X,Y)]$ is
3938: finite. Then for any $D\in(\Dmin,\Dav)$
3939: \ben
3940: -\frac{1}{n^d}\log Q^{n^d}(B(X_{C(n)},D)) \to R_1(P,Q,D)
3941:         \;\;\;\;\mbox{w.p.1}
3942: \een
3943: with the (one-dimensional)
3944: rate-function $R_1(P,Q,D)$
3945: defined as in Theorem~1.
3946: 
3947: %Y ----------------- rephrased the discussion-results below
3948: 
3949: \paragraph{$\Yp$ is not i.i.d. }
3950: Let $\Xp$ and $\Yp$ be stationary random fields and
3951: define $\Dav$ and $\Dmax$ exactly as in the
3952: one-dimensional case (recall 
3953: (\ref{eq:Dav}) and (\ref{eq:Dmax})).
3954: We assume that the distortion 
3955: measure $\rho$ is essentially
3956: bounded, $\Dmax < \infty$,
3957: and define
3958: \be
3959: \Dmin \bydef 
3960: \sup_{n \geq 1} \Dminn = \lim_{n\to\infty} \Dminn 
3961: \label{eq:dmind}
3962: \ee
3963: where 
3964: \be
3965: \label{eq:dminn}
3966: \Dminn \bydef 
3967: E_{P_n}[\essinf_{Y_{C(n)}\sim Q_n} \;\rho_n(X_{C(n)},Y_{C(n)})].
3968: \ee
3969: To see that the limit in (\ref{eq:dmind}) 
3970: exists and equals the supremum, first 
3971: note that $\{n^d \Dminn\}$ is an
3972: increasing sequence, and that 
3973: $D_{\rm min}^{(nk)}\geq D_{\rm min}^{(k)}$
3974: for all $n,k\geq 1$.
3975: Now fix $k\geq 1$ arbitrary. Given $n\geq k$
3976: we write $n = mk + r$ for some $0\leq r\leq k-1$,
3977: so that
3978: $$ n^dD_{\rm min}^{(n)}\geq (mk)^dD_{\rm min}^{(mk)}
3979: \geq (mk)^dD_{\rm min}^{(k)}.$$
3980: Since $n/mk\to 1$ as $n\to\infty$, this implies that
3981: $$\liminf_{n\to\infty} D_{\rm min}^{(n)} \geq D_{\rm min}^{(k)}.$$
3982: Since $k$ was arbitrary we are done.
3983: 
3984: Finally, we assume once again that
3985: the distortion measure $\rho$ is
3986: not essentially constant, 
3987: that is, $\Dmin<\Dav$.
3988: Our next result is the 
3989: random fields analog of Theorem 4; 
3990: it is proved in Appendix~C.
3991: 
3992: \medskip
3993: 
3994: {\em Theorem~27. Generalized AEP rate function.}
3995: Let $\Xp$ and $\Yp$ be stationary random fields.
3996: Assume that $\rho$ is bounded, and that with
3997: $\BBP$-probability one, conditional on 
3998: $X_{[0,\infty)}=
3999: x_{[0,\infty)}$, the random variables
4000: $\{\rho_n(x_{C(n)},Y_{C(n)} )\}$ satisfy a
4001: large deviations principle with some
4002: deterministic, convex rate-function.
4003: Then for all $D\in (\Dmin,\Dav)$, 
4004: except possibly at $D = \Dinf$, 
4005: \be
4006: \label{eq:ldp-27}
4007: \lim_{n \to \infty}
4008: -\frac{1}{n^d}\log Q_n(B(X_{C(n)},D)) = R(\BBP,\BBQ,D)
4009:         \;\;\;\;\mbox{w.p.1}
4010: \ee
4011: where $\Dinf$ 
4012: and the rate-function $R(\BBP,\BBQ,D)$ 
4013: are defined as in the one-dimensional case, 
4014: by (\ref{eq:dinf}) and (\ref{eq:thm4b}),
4015: respectively, and the rate-functions
4016: $R_n(P_n,Q_n,D)$ are now defined as
4017: \ben
4018: R_n(P_n,Q_n,D) = \inf_{V_n} \frac{1}{n^d} H(V_n\|P_n\times Q_n)
4019: \een
4020: with the infimum taken over all joint distributions 
4021: $V_n$ on $A^{n^d}\times\Ahatnd$ such that
4022: the $A^{n^d}$-marginal of $V_n$ is $P_n$
4023: and $E_{V_n}[\rho_n(X_{C(n)},Y_{C(n)})]\leq D$.
4024: 
4025: \medskip
4026: 
4027: {\em Remark 6:} Suppose that $(\Xp,\Yp)$
4028: is a stationary random field satisfying 
4029: a ``process-level LDP'' with a convex, good
4030: rate-function. To be precise,
4031: given $x_{C(n)}\in A^{n^d}$, 
4032: write $x^{(n)}$ for the periodic
4033: extension of $x_{C(n)}$ to an 
4034: infinite realization in $A^{[0,\infty)}$
4035: and let $X^{(n)}$ and $Y^{(n)}$ denote
4036: the periodic extensions of $X_{C(n)}$
4037: and $Y_{C(n)}$, respectively.
4038: The process-level empirical 
4039: measure $\calLn$  induced
4040: by $\Xp$ and $\Yp$ on 
4041: $(A^{[0,\infty)}\times\hat{A}^{[0,\infty)})$ is
4042: defined by
4043: $$\calLn\bydef\frac{1}{n^d}\sum_{u\in C(n)}
4044: 	\delta_{(X^{(n)}_{u+[0,\infty)},Y^{(n)}_{u+[0,\infty)})}$$
4045: where $\delta_{(s,s')}$ denotes the measure
4046: assigning unit mass to the joint realization
4047: $(s,s')\in A^{[0,\infty)}\times\hat{A}^{[0,\infty)}$,
4048: and $X^{(n)}_{u+[0,\infty)}$ 
4049: (or $Y^{(n)}_{u+[0,\infty)}$) 
4050: denotes $X^{(n)}$ 
4051: (respectively, $Y^{(n)}$) 
4052: shifted by $u$ [i.e., the
4053: value of $X^{(n)}_{u+[0,\infty)}$ 
4054: at position $v$ is the same as
4055: the value of $X^{(n)}$ at position
4056: $u+v$; similarly for $Y^{(n)}_{u+[0,\infty)}$.]
4057: By assuming that $(\Xp,\Yp)$
4058: satisfy a ``process-level LDP''
4059: we mean that the sequence of measures
4060: $\{\calLn\}$ satisfies the LDP in 
4061: the space of stationary
4062: probability measures on
4063: $(A^{[0,\infty)}\times\hat{A}^{[0,\infty)})$
4064: equipped with the topology of weak convergence,
4065: with some convex, good rate-function $I(\cdot)$.
4066: These assumptions are satisfied by many of 
4067: the random field models used in applications, 
4068: and in particular by a large class of Gibbs fields 
4069: (see, e.g., 
4070: \cite{comets:86}\cite{folmer-orey}\cite{olla:88} 
4071: for general theory and 
4072: \cite{guyon:book}\cite{winkler:book} for
4073: examples in the areas of
4074: image processing and image analysis).
4075: 
4076: As in the one-dimensional case, suppose
4077: that the process-level LDP condition 
4078: holds, and that the 
4079: distortion measure $\rho$ is
4080: bounded and continuous on $A\times\Ahat$.
4081: Then with $\BBP$-probability one,
4082: conditional on 
4083: $X_{[0,\infty)}=x_{[0,\infty)}$,
4084: the sequence 
4085: $\{\rho_n(x_{C(n)},Y_{C(n)})\}$ satisfies 
4086: the LDP upper bound with respect to the 
4087: deterministic, convex rate-function $J(\cdot)$
4088: as in Remark~3.
4089: Moreover, assuming sufficiently strong mixing 
4090: properties for $\Yp$ one may also verify the 
4091: corresponding lower bound (for example, by 
4092: adapting the stochastic subadditivity approach of 
4093: \cite{chi-AP:01}).
4094: 
4095: %Y ------------------------- end of changes
4096: 
4097: \subsection{Applications}
4098: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4099: In Sections~6.3.1 and~6.3.2 below we
4100: consider the random field analogs of 
4101: the problems discussed in Section~3
4102: in the context of one-dimensional 
4103: processes. 
4104: In the instances when our analysis 
4105: was restricted to $\iid$ processes, 
4106: the extension to random fields is 
4107: trivial -- an $\iid$ random field 
4108: is no different from an $\iid$ process. 
4109: For that reason, we only give the 
4110: full statements of corresponding 
4111: random fields results when the 
4112: generalization from $d=1$ to 
4113: $d\geq 2$ does involve some
4114: modifications. Otherwise, only 
4115: a brief description of the corresponding 
4116: results is mentioned.
4117: 
4118: \subsubsection{Lossy Data Compression}
4119: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4120: Here we very briefly discuss the 
4121: problem of data compression, when 
4122: the data is in the form of a two- 
4123: or more generally a $d$-dimensional 
4124: array.
4125: In this case, the underlying data 
4126: source is naturally modeled as 
4127: a $d$-dimensional random field.
4128: Extensive discussions of the
4129: general information-theoretic
4130: problems on random fields are
4131: given 
4132: % by Berger, Shen and Ye 
4133: in \cite{berger-shen-ye:92}
4134: and 
4135: % by Ye and Berger's 
4136: the recent monograph \cite{ye-berger:book};
4137: see also 
4138: \cite{follmer:73}.
4139: 
4140: First we discuss the results
4141: given in Section~3.1. The construction
4142: of the random codebooks described there
4143: generalizes to random fields in an 
4144: obvious fashion, and the statement 
4145: as well as the proof of Theorem~9
4146: remain unchanged. Following the
4147: notation exactly as developed for 
4148: $\iid$ sources, the strengthened 
4149: coding theorems given in 
4150: Theorems~10 and~11 follow by 
4151: combining (the obvious 
4152: generalization of) Theorem~9 
4153: with the generalized
4154: AEP of Theorem~26.
4155: 
4156: Similarly, the mismatched-codebook
4157: results of Section~3.2 only rely on
4158: Theorem~9 and the generalized AEP
4159: of Theorem~1, and therefore 
4160: immediately generalize to the
4161: random field case. Finally 
4162: Theorems~15 and~16 in Section~3.5
4163: are only stated for $\iid$ processes,
4164: hence, as mentioned above, they
4165: trivially extend to random 
4166: fields.
4167: 
4168: \subsubsection{Waiting Times}
4169: Here we consider the natural
4170: $d$-dimensional analogs of the
4171: waiting times questions considered
4172: in Section~3.3. Given two 
4173: independent realizations
4174: of the random fields $\Xp$ and $\Yp$,
4175: our main quantity of interest here
4176: is how ``far'' we have to look
4177: in $\Yp$ until we find a match for
4178: the pattern $X_{C(n)}$ with distortion
4179: $D$ or less. Given $n\geq 1$ and 
4180: a distortion level $D\geq 0$, we 
4181: define the {\em waiting time} $W_n$
4182: as the smallest length $i$ 
4183: such that a copy of the pattern 
4184: $X_{C(n)}$ appears somewhere in 
4185: $Y_{C(i+n-1)}$, with distortion
4186: $D$ or less.
4187: Formally,
4188: \ben
4189: W_n\;=\;\inf\{i\geq 1\; :\; 
4190: 	\rho_n(X_{C(n)},Y_{u+C(n)})\leq D
4191: 	\;\;\mbox{for some}\;u\in[0,i-1]^d\}
4192: % \label{eq:Wnd-def}
4193: \een
4194: with the convention that the infimum 
4195: of the empty set equals $+\infty$.
4196: 
4197: In the one-dimensional case our
4198: main tool in investigating
4199: the asymptotic behavior of the
4200: waiting times was the strong
4201: approximation in Theorem~13.
4202: Roughly speaking, Theorem~13
4203: stated that the waiting time
4204: $W_n$ for a $D$-close match
4205: of $X_1^n$ in $\Yp$ is 
4206: inversely proportional
4207: to the probability $Q_n(B(X_1^n,D))$
4208: of such a match.
4209: In Theorem~28 below we generalize
4210: this result to the $d$-dimensional
4211: case by showing that the $d$-dimensional
4212: volume $(W_n)^d$ we have to search 
4213: in $\Yp$ in order to find a $D$-close
4214: match for $X_{C(n)}$ is, roughly,
4215: inversely proportional
4216: to the probability $Q_n(B(X_{C(n)},D))$
4217: of finding such a match.
4218: 
4219: Before stating
4220: Theorem~28 we need to recall
4221: the following definition.
4222: Dobrushin's {\em non-uniform 
4223: $\phi$-mixing coefficients}
4224: of a stationary random field
4225: $\Yp$ are
4226: \ben
4227: \phi_\ell(k)\;=\;\sup\{|\BBQ(B|A)-\BBQ(B)|\;:
4228: & & \hspace{-0.2in}
4229: 	B\in\sigma(Y_{U}),\; A\in\sigma(Y_{V}),\; \BBQ(A)>0\\
4230: & & \quad\quad |U|\leq \ell,\; |V|<\infty,\; d(U,V)\geq k \}
4231: \een
4232: where $\sigma(Y_U)$ denotes
4233: the $\sigma$-field generated by 
4234: the random variables $Y_U$, $U\subset\IN^d$. 
4235: See \cite[Chapter~6]{lin-lu:book}
4236: or \cite{doukhan:book} for detailed
4237: discussions of the coefficients
4238: $\{\phi_\ell(k)\}$ and their properties.
4239: 
4240: \medskip
4241: 
4242: {\em Theorem~28. Strong Approximation:}
4243: Let $\Xp$ and $\Yp$ be stationary ergodic 
4244: random fields, and assume that the non-uniform
4245: $\phi$-mixing  coefficients of $\Yp$ satisfy
4246: \be
4247: \limsup_{n\to\infty}\sum_{j=1}^\infty
4248: (j+1)^{d-1}\phi_n(jn)<\infty.
4249: \label{eq:dobrushin}
4250: \ee
4251: If $Q_n(B(X_{C(n)},D))>0$ eventually with 
4252: probability one,
4253: then for any $\epsilon>0$:
4254: \ben
4255: -(1+\epsilon)\log n
4256: \;\leq\;
4257: \log [W^d_n Q_n(B(X_{C(n)},D))]
4258: \;\leq\;
4259: (d+1+\epsilon)\log n
4260: \;\;\;\;\mbox{eventually, w.p.1.}
4261: \een
4262: 
4263: \medskip
4264: 
4265: The proof of Theorem~28 is a 
4266: straightforward modification of
4267: the corresponding one-dimensional argument in
4268: \cite{dembo-kontoyiannis}; it is given in 
4269: Appendix~D.
4270: 
4271: \medskip
4272: 
4273: {\em Remark 7:} The mixing condition
4274: (\ref{eq:dobrushin}) is satisfied by
4275: a rather large class of 
4276: stationary random fields. For
4277: example in the case of Markov 
4278: random fields, it is easy to check 
4279: that under Dobrushin's uniqueness 
4280: condition the limit superior in 
4281: (\ref{eq:dobrushin}) is finite;
4282: see \cite[Section~8.2]{georgii:1}
4283: or \cite{doukhan:book} for 
4284: more details.
4285: 
4286: \medskip
4287: 
4288: Next we combine the above strong
4289: approximation result with the
4290: generalized AEPs of Theorems~26
4291: and~27, to read off the first order
4292: asymptotic behavior of the 
4293: waiting times. Theorem~29
4294: below generalizes Theorem~14
4295: to the random field case.
4296: 
4297: \medskip
4298:  
4299: {\em Theorem~29. SLLN for Waiting Times:}
4300: Let $\Xp$ and $\Yp$ be stationary ergodic 
4301: random fields:
4302:  
4303: (a)~If $\Yp$ is $\iid$ and the
4304: average distortion $\Dav$ is finite,
4305: then for any $D\in(\Dmin,\Dav)$
4306: \ben
4307: \frac{1}{n^d}\log W_n^d \to R_1(P_1,Q_1,D)
4308: \;\;\;\;\mbox{w.p.1.}
4309: % \label{eq:w-slln-d}
4310: \een
4311:  
4312: (b)~Suppose that the 
4313: % distortion measure
4314: % $\rho$ is bounded, that the 
4315: conditions of Theorem~27 are satisfied,
4316: and that $\Yp$ also satisfies 
4317: the mixing assumption (\ref{eq:dobrushin}).
4318: Then, for any $D\in(\Dinf,\Dav)$:
4319: \ben
4320: \frac{1}{n^d}\log W^d_n \to R(\BBP,\BBQ,D)
4321: \;\;\;\;\mbox{w.p.1.}
4322: \een
4323:  
4324: \medskip
4325: 
4326: %A modified.
4327: %Y removed! 
4328: %Y we don't really know (and it's definitely not "not hard to verify")
4329: %Y that the assumptions of (b) -- including the conditional LDP -- are 
4330: %Y satisfied by a class of Gibbs/MRFs.
4331: % Note that, although the assumptions
4332: % of part~(b) of the theorem appear to
4333: % be rather heavy, they are not hard to 
4334: % verify for a class of Gibbs
4335: % or Markov random fields;
4336: % see the comments in 
4337: % Remarks~6 and~7 above.
4338: 
4339: 
4340: \newpage
4341: 
4342: \section{Random Fields -- Second Order Results}
4343: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4344: Finally we turn to the random field extensions 
4345: of the second order results of Sections~4 and~5.
4346: In Section~7.1 we state the random field analog 
4347: of the second order generalized AEP, and in Section~7.2
4348: we discuss its application to the problems
4349: of lossy data compression and pattern matching.
4350: 
4351: \subsection{Refinements of Generalized AEP}
4352: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4353: Let $\Xp$ be a stationary ergodic random 
4354: field with marginal distribution $P$ on $A$,
4355: and let $Q$ be a fixed probability measure
4356: on $\Ahat$. We will assume throughout that
4357: the distortion measure $\rho$ has a finite
4358: third moment,
4359: \be
4360: D_3\bydef
4361: E_{P\times Q}[\rho^3(X,Y)]<\infty
4362: \label{eq:third-d}
4363: \ee
4364: and that it is not essentially
4365: constant, i.e., $\Dmin<\Dav$,
4366: with $\Dmin$ and $\Dav$ defined
4367: as before (cf. (\ref{eq:Dmin})
4368: and (\ref{eq:Dav})).
4369: 
4370: The goal of this section is
4371: to give the random field analogs of 
4372: Theorems~17 and~18 and of Corollary~19
4373: from the one-dimensional case.
4374: 
4375: An examination of the proof of Theorem~17 in
4376: \cite{yang-zhang:99} shows that its proof
4377: only depends on the ergodicity of $\Xp$
4378: and the $\iid$ structure of the product 
4379: measures $Q^n$. Simply replacing the
4380: application of the ergodic theorem
4381: by the ergodic theorem
4382: for $\IN^d$ actions
4383: \cite[Chapter~6]{krengel:book}
4384: immediately yields the following 
4385: generalization: As long as condition
4386: (\ref{eq:third-d}) is satisfied, 
4387: for all $D\in(\Dmin,\Dav)$ we have
4388: \be
4389: -\log Q^{n^d}(B(X_{C(n)},D))= n^dR_1(\hat{P}_n,Q,D)+\frac{d}{2}\log n + O(1)
4390: \;\;\;\;\mbox{w.p.1}
4391: \label{eq:br-d}
4392: \ee
4393: where $\hat{P}_n$ is now the empirical measure
4394: induced by $X_{C(n)}$ on $A$.
4395: 
4396: In order to generalize Theorem~18 to $\IN^d$
4397: we need to introduce a measure of dependence
4398: analogous to $\alpha$-mixing in the 
4399: one-dimensional case. For a stationary
4400: random field $\Xp$ on $\IN^d$ we define
4401: the {\em uniform $\alpha$-mixing coefficients}
4402: of $\Xp$ by
4403: \ben
4404: \alpha(k)\;=\;\sup\{|\BBP(A\cap B)-\BBP(A)\BBP(B)|\;:
4405: & & \hspace{-0.2in}
4406:         A\in\sigma(X_{U}),\; B\in\sigma(X_{V}),\; d(U,V)\geq k \}
4407: \een
4408: where, as before, $\sigma(X_U)$ denotes
4409: the $\sigma$-field generated by
4410: the random variables $X_U$.
4411: See \cite{lin-lu:book}\cite{doukhan:book} 
4412: for more details.
4413: 
4414: Apart from ergodicity, the main technical
4415: ingredient in the proof of Theorem~18 above
4416: (see also the proof of 
4417: \cite[Theorem~3]{dembo-kontoyiannis})
4418: is the LIL for $\Xp$. 
4419: Similarly to the one-dimensional case, 
4420: the LIL for a random field $\Xp$
4421: holds as soon as the following
4422: mixing condition is satisfied
4423: \be
4424: \alpha(k)\leq C\ k^{
4425: -
4426: 3d(1+\epsilon)},
4427: \quad\mbox{for some $\epsilon>0$ and $C<\infty.$}
4428: \label{eq:LIL-cond2}
4429: \ee
4430: [This follows from the 
4431: almost sure invariance principle
4432: in \cite[Theorem~1]{berkes-morrow}.]
4433: 
4434: Assuming that (\ref{eq:LIL-cond2})
4435: and the third moment condition
4436: (\ref{eq:third-d}) both hold,
4437: we get the following generalization
4438: of Theorem~18. For all $D\in(\Dmin,\Dav)$,
4439: \be
4440: n^dR_1(\hat{P}_n,Q,D) = n^dR_1(P,Q,D) + \sum_{u\in C(n)}
4441:         g(X_u) + O(\log\log n)
4442: \;\;\;\;\mbox{w.p.1}
4443: \label{eq:taylor-d}
4444: \ee
4445: with $g(x)$ defined exactly as in the
4446: one-dimensional case (\ref{eq:functiong}).
4447: 
4448: Combining (\ref{eq:br-d}) and (\ref{eq:taylor-d})
4449: gives the following generalization of Corollary~19:
4450: 
4451: \medskip
4452: 
4453: {\em Theorem~30: Second Order Generalized AEP:}
4454: Let $\Xp$ be a stationary ergodic random field
4455: with marginal distribution $P$ on $A$, and let 
4456: $Q$ be an arbitrary probability measure on $\Ahat.$
4457: Assume that the uniform $\alpha$-mixing coefficients
4458: of $\Xp$ satisfy 
4459: (\ref{eq:LIL-cond2})
4460: and that $D_3=E_{P\times Q}[\rho^3(X,Y)]$ is
4461: finite. Then for any $D\in(\Dmin,\Dav)$, and
4462: with $g(x)$ defined as in (\ref{eq:functiong}),
4463: $$
4464: -\log Q^{n^d}(B(X_{C(n)},D))= n^dR_1(P,Q,D) + \sum_{u\in C(n)}g(X_u)
4465:         + \frac{d}{2}\log n + O(\log\log n)
4466: \;\;\;\;\mbox{w.p.1.}$$
4467: 
4468: \subsection{Applications}
4469: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4470: Next we discuss applications of the second order 
4471: generalized AEP to the 
4472: $d$-dimensional analogs of the
4473: data compression and pattern 
4474: matching problems of Section~4.
4475: As in Section~6.3, the only 
4476: results stated explicitly are
4477: those whose extensions to $\IN^d$
4478: require modifications.
4479: 
4480: As mentioned in Section~6.3.1,
4481: the one-dimensional construction 
4482: of the random codes,
4483: as well as the main tool used 
4484: in their analysis, Theorem~9,
4485: immediately generalize to the 
4486: random field case. And since
4487: all the second order results 
4488: of Section~5.1 (Theorems~20--23)
4489: are stated for $\iid$ sources, 
4490: their statements as well as 
4491: proofs carry over {\sl verbatim} 
4492: to this case.
4493: 
4494: For the problem of waiting times,
4495: we can use the second order generalized 
4496: AEP of Theorem~30 to refine the SLLN
4497: of Theorem~29
4498: \ben
4499: \frac{1}{n^d}\log W_n^d \to R_1(P,Q,D)
4500: \;\;\;\;\mbox{w.p.1}
4501: \een
4502: to a corresponding CLT and LIL
4503: as in the one-dimensional case.
4504: These refinements are stated in
4505: Theorem~31 below. Its proof is
4506: identical to that of Theorem~24
4507: in the one-dimensional case. The
4508: only difference here is that we 
4509: need to invoke the CLT and LIL
4510: for the partial sums of the random 
4511: field $\{g(X_u)\;;\;u\in\IN^d\}$. 
4512: Under the conditions of the 
4513: theorem, these follow from the 
4514: almost sure invariance principle
4515: of \cite[Theorem~1]{berkes-morrow}.
4516: 
4517: \medskip
4518: 
4519: {\em Theorem~31:} 
4520: Let $\Xp$ be a stationary ergodic random 
4521: field and $\Yp$ be $\iid$, with marginal
4522: distributions $P$ and $Q$ on $A$ and $\Ahat$,
4523: respectively. Assume that the 
4524: uniform $\alpha$-mixing
4525: coefficients of $\Xp$ satisfy (\ref{eq:LIL-cond2})
4526: and that $D_3=E_{P\times Q}[\rho^3(X,Y)]$ is
4527: finite. Then for any $D\in(\Dmin,\Dav)$ the
4528: following series is absolutely convergent
4529: \be
4530: \sigma^2\bydef 
4531: % E_P[g^2(X_{\orig})]+2
4532: \sum_{u\in\IN^d} E_P[g(X_{\orig})g(X_u)]
4533: \label{eq:variance-d}
4534: \ee
4535: with $g(x)$ defined as in (\ref{eq:functiong}),
4536: and, moreover:
4537: \begin{itemize}
4538: \item[]{\bf (CLT)} With $R_1=R_1(P,Q,D)$:
4539: $$\frac{\log W^d_n \;-\; n^dR_1}{n^{d/2}}
4540:         \weakly N(0,\sigma^2).$$
4541: \item[]{\bf (LIL)}
4542: The set of limit points of the sequence
4543: $$\left\{
4544:         \frac{\log W^d_n \;-\; n^dR_1}
4545:              {\sqrt{2n^d\log\log n}}
4546:   \right\},\quad n\geq 3$$
4547: coincides with $[-\sigma,\sigma]$, with
4548: probability one.
4549: \end{itemize}
4550: 
4551: 
4552: % \medskip
4553: % 
4554: % \subsubsection{Match-Lengths and Lossy LZ on Random Fields}
4555: % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4556: % 
4557: % -- Possibly the analog of Theorem 25
4558: % 
4559: % -- Possibly a generalization of Steinberg \& Gutman's result
4560: 
4561: 
4562: 
4563: % \newpage
4564: \section*{Acknowledgments}
4565: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4566: We thank Tam\'{a}s Linder and Yuval Peres
4567: for useful discussions regarding Theorems 7 and 8.
4568: 
4569: \appendix
4570: \section{Proof of Theorem~7}
4571: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4572: We prove the upper and lower bounds separately.
4573: For the upper bound, 
4574: recalling the definition
4575: of $r_n(X_1^n,D)$ in
4576: (\ref{eq:ratio}),
4577: we observe that
4578: $$r_n(X_1^n,D)
4579: % \frac{1}{n}\log
4580: % \frac{P_n(B(X_1^n,D))}
4581: % {Q^n(B(X_1^n,D))}
4582: \leq \frac{1}{n}\log {P_n(B(X_1^n,D))} -\frac{1}{n}\log Q^n(X_1^n)$$
4583: where the second term converges to $H(P) + H(P\|Q)$ 
4584: as $n\to \infty$, by the ergodic theorem. 
4585: Since the first term is increasing in $D$,
4586: for any fixed $D>0$ we have with 
4587: $\BBP$-probability one:
4588: \be
4589: \limsupnd 
4590: % \frac{1}{n}\log
4591: % \frac{P_n(B(X_1^n,D))}
4592: % {Q^n(B(X_1^n,D))}
4593: r_n(X_1^n,D)
4594: 	\leq 
4595: 	H(P) + H(P\|Q) + 
4596: 	\limsup_{n\to\infty}
4597: 	\frac{1}{n}\log P_n(B(X_1^n,D)).
4598: \label{eq:discUB1}
4599: \ee
4600: Now the pointwise source coding
4601: theorem (see \cite[Theorems~1 and~5]{konto-zhang:00})
4602: implies that 
4603: \be
4604: \liminf_{n\to\infty}-\frac{1}{n}\log P_n(B(X_1^n,D))\geq R(D)
4605: \;\;\;\;\mbox{w.p.1}
4606: \label{eq:discUB2}
4607: \ee
4608: where $R(D)$ is the rate-distortion
4609: function of the source $\Xp$
4610: (in nats). 
4611: % To see this, note
4612: % that that in the proof of
4613: % Theorem~6~$(ii)$ in
4614: % \cite{kontoyiannis-red:00}
4615: % it was shown that 
4616: % $$\limsup_{n\to\infty}
4617: % 	\frac{1}{n}\log\frac{P_n(B(X_1^n,D))}
4618: % 	{\widetilde{Q}_n(B(X_1^n,D))}
4619: % 	\leq 0,
4620: % 	\;\;\;\;\mbox{w.p.1}
4621: % $$
4622: % (in fact this is shown to hold
4623: % for any sequence of probability 
4624: % measures $\{Q'_n\}$ in place of 
4625: % $\{P_n\}$),
4626: % where each measure $\widetilde{Q}_n$
4627: % minimizes $E_{P_n}[-\log Q_n(B(X_1^n,D))]$
4628: % over all probability measures $Q_n$.
4629: % But from Theorem~5 we also know that 
4630: % $$\lim_{n\to\infty} 
4631: % -\frac{1}{n}\log\widetilde{Q}_n(B(X_1^n,D))
4632: % = R(D)
4633: % \;\;\;\;\mbox{w.p.1.}
4634: % $$
4635: % 
4636: From equations (\ref{eq:discUB1}) 
4637: and (\ref{eq:discUB2}) we get
4638: \ben
4639: \limsupnd 
4640: r_n(X_1^n,D)
4641: & \leq & 
4642: 	H(P) + H(P\|Q) -R(D)\\
4643: & \leq & 
4644: 	H(P) + H(P\|Q) - H(\BBP) + H(P) - R_1(D)
4645: \;\;\;\;\mbox{w.p.1}
4646: \een
4647: where $R_1(D)$ denotes the first order 
4648: rate-distortion function of $\Xp$,
4649: $H(\BBP)$ is the entropy rate of $\Xp$
4650: (both in nats), and
4651: the second inequality follows
4652: from the Wyner-Ziv bound;
4653: see \cite[Remark~4]{wyner-ziv:71}.
4654: The assumption that $\rho(x,y)=0$
4655: if and only if $x=y$ implies that
4656: $\lim_{D\to 0} R_1(D)=H(P)$,
4657: so letting $D\downarrow 0$ 
4658: the above right hand side becomes
4659: $H(P) + H(P\|Q) -H(\BBP)$
4660: and it is an easy calculation 
4661: to verify that this is 
4662: indeed the same
4663: as $H(\BBP\|\BBQ)$.
4664: This gives the required
4665: upper bound.
4666: 
4667: For the lower bound we proceed
4668: similarly by noting that 
4669: $$
4670: % \frac{1}{n}\log 
4671: % \frac{P_n(B(X_1^n,D))} 
4672: % {Q^n(B(X_1^n,D))}
4673: r_n(X_1^n,D)
4674: \geq \frac{1}{n}\log {P_n(X_1^n)} 
4675: 	-\frac{1}{n}\log Q^n(B(X_1^n,D)),$$
4676: where the first term converges to $-H(\BBP)$
4677: by the classical AEP
4678: (as $n\to \infty$).
4679: Since the second term is decreasing 
4680: in $D$, for any fixed $D>0$ small
4681: enough we have
4682: with probability one:
4683: \ben
4684: \liminfnd 
4685: % \frac{1}{n}\log 
4686: % \frac{P_n(B(X_1^n,D))} 
4687: % {Q^n(B(X_1^n,D))}
4688: r_n(X_1^n,D)
4689: & \geq & 
4690:         - H(\BBP) -
4691:         \limsup_{n\to\infty}
4692:         \frac{1}{n}\log Q^n(B(X_1^n,D))\\
4693: & = & 
4694: 	- H(\BBP) + R_1(P,Q,D)
4695: \een
4696: where the last step follows from 
4697: the generalized AEP in Theorem~1
4698: (note that $\Dmin=0$ here).
4699: By the characterization of the
4700: rate-function in Proposition~2
4701: we know that 
4702: $$R_1(P,Q,D) = \sup_{\la'\leq 0} [\la' D-\LA(\la')]
4703: 	\geq [\la D-\LA(\la)]= 
4704: 	-E_{P}\left[\log E_{Q}\left(
4705: 		e^{\lambda(\rho(X,Y)-D)}
4706: 	\right)\right]$$
4707: for any fixed $\la<0$.
4708: Therefore, for any
4709: $D$ small enough and
4710: $\la<0$ we have
4711: \ben
4712: \liminfnd
4713: % \frac{1}{n}\log 
4714: % \frac{P_n(B(X_1^n,D))} 
4715: % {Q^n(B(X_1^n,D))}
4716: r_n(X_1^n,D)
4717: \geq - H(\BBP)  -E_{P}\left[\log E_{Q}\left(
4718:                 e^{\lambda(\rho(X,Y)-D)}
4719:         \right)\right]
4720: \;\;\;\;\mbox{w.p.1.}
4721: \een
4722: Letting $D\to 0$ and then $\la\to-\infty$,
4723: by the dominated convergence theorem (and
4724: the assumption $\rho(x,y)=0$ iff $x=y$)
4725: the right hand side above converges
4726: to $- H(\BBP) + H(P\|Q) + H(P)
4727: = H(\BBP\|\BBQ),$ proving the
4728: lower bound. 
4729: 
4730: Finally, since for each fixed $n$ 
4731: the limit as $D\downarrow 0$ of
4732: $r_n(X_1^n,D)$
4733: exists,
4734: % $$\frac{1}{n}\log \frac{P_n(B(X_1^n,D))}{Q_n(B(X_1^n,D))}$$
4735: it follows that 
4736: the repeated limit
4737: $\lim_{n}\lim_{D}$
4738: also exists and is equal
4739: to the double limit $H(\BBP\|\BBQ)$.
4740: \qed
4741: 
4742: \section{Proof of Theorem~8}
4743: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4744: Part~(a): Fixing $n$, let $f_n=dP_n/dQ_n$ and consider the set
4745: $$
4746: A_n \bydef \left\{ x_1^n : \; Q_n(B(x_1^n,D))>0 \;\; \forall D>0,
4747: f_n(x_1^n) = \limsup_{D\downarrow 0}\; \frac{P_n(B(x_1^n,D))}{Q_n(B(x_1^n,D))}
4748: = \liminf_{D\downarrow 0}\; \frac{P_n(B(x_1^n,D))}{Q_n(B(x_1^n,D))}
4749: \, \right\}.
4750: $$
4751: By the Radon-Nikodym theorem 
4752: (cf. \cite[Theorems 1.6.1, 1.6.2]{evans-gariepy}),
4753: we know that $Q_n(A_n)=1$, hence also $P_n(A_n)=1$.
4754: With $\BBP(\cup_n A_n^c)=0$, we conclude the proof of part~(a)
4755: by applying Theorem~6 for $M_n=Q^n$ (in which case $H_n \geq 0$).
4756: 
4757: Part~(b): As $Q(A_1)=1$, in particular
4758: $Q(B(x,D))>0$ for all $D>0$ and $Q$-almost
4759: every $x\in\RL^d$ (hence also for $P=P_1$-almost
4760: every $x\in\RL^d$), implying that $\Dmin$ of 
4761: (\ref{eq:Dmin}) is zero. The same argument 
4762: yields also that $P(B(x,D))>0$ for all $D>0$ 
4763: and $P$-almost every $x$, hence $\Dmin$ is
4764: still zero if we replace $Q$ by $P$. Thus, 
4765: for all 
4766: $D< \min\{E_{P\times Q}[\rho(X,Y)],E_{P\times P}[\rho(X,Y)]\}$,
4767: applying Theorem~1 twice we get 
4768: $$
4769:   \lim_{n\to\infty}\; 
4770: r_n(X_1^n,D)
4771: %         \frac{1}{n}\log
4772: %         \frac{P^n(B(X_1^n,D))}
4773: %              {Q^n(B(X_1^n,D))}
4774: 	= R_1(P,Q,D)-R_1(P,P,D)
4775: 	\;\;\;\;\mbox{w.p.1.}
4776: $$
4777: For any probability measure $\mu$ and any $\la \leq 0$, let
4778: $$
4779: \Lambda(\lambda;\mu) = \int  \left[ 
4780: \log \int e^{\lambda \rho(x,y)} d\mu(y)\right] dP(x).
4781: $$
4782: 
4783: Fixing $D>0$ small enough,
4784: we have by Proposition~2 
4785: that $R_1(P,P,D) = \lambda D -\Lambda(\lambda;P)$
4786: for the unique $\lambda=\lambda(D)<0$ such that $\Lambda'(\lambda;P)=D$, 
4787: whereas $R_1(P,Q,D) \geq \lambda D - \Lambda(\lambda;Q)$. 
4788: Since $E_{P\times P}[\rho(X,Y)]>0$, we have also that 
4789: $\lambda(D) \downarrow -\infty$ as $D \downarrow 0$ (see (\ref{eq:la-lim})).
4790: Consequently, 
4791: $$
4792: \liminf_{D\downarrow 0} \{ R_1(P,Q,D)-R_1(P,P,D) \} \geq 
4793: \liminf_{\lambda \downarrow -\infty}
4794: \{ \Lambda(\lambda;P) - \Lambda(\lambda;Q) \} \,.
4795: $$
4796: Similarly, by Proposition~2 we have
4797: $R_1(P,Q,D) = \widetilde{\lambda} D -\Lambda(\widetilde{\lambda};Q)$
4798: for $\widetilde{\lambda}<0$ such that $\Lambda'(\widetilde{\lambda};Q)=D$,
4799: $R_1(P,P,D) \geq \widetilde{\lambda}D -
4800: \Lambda(\widetilde{\lambda};P)$, and with $E_{P\times Q}[\rho(X,Y)]>0$,
4801: also $\widetilde{\lambda}\downarrow -\infty$ when $D \downarrow 0$.
4802: Therefore, it suffices to show that
4803: \be
4804: \label{eq:dn-lim}
4805: \lim_{\lambda \downarrow -\infty}
4806: \{ \Lambda(\lambda;P) - \Lambda(\lambda;Q) \} = H(P\|Q) \;.
4807: \ee
4808: 
4809: To this end, for any $\lambda <0$ and $x \in \RL^d$, let
4810: $$
4811: h_\lambda(x) \bydef
4812:  \frac{E_P(e^{\lambda \rho(x,Y)})}{E_Q(e^{\lambda \rho(x,Y)})}
4813: $$
4814: noting that
4815: $$
4816: \Lambda(\lambda;P) - \Lambda(\lambda;Q) = \int \log h_\lambda(x) dP(x).
4817: $$ 
4818: Using the change of variable $U=\rho(x,Y) \geq 0$ followed
4819: by integration by parts, we see that
4820: $$
4821: h_\lambda(x) =
4822:  \frac{\int_0^\infty e^{\lambda u} g_{x}(u) du}{\int_0^\infty
4823: e^{\lambda u} k_{x}(u) du} \;,
4824: $$
4825: where $g_x (r)=P(B(x,r))$ and $k_x(r)=Q(B(x,r))$ are nonnegative,
4826: nondecreasing and bounded above by $1$. Considering separately
4827: $u \leq 2\eta$ and $u>2\eta$, it is easy to check that for any $\eta>0$,
4828: \be
4829: \sup_{0 <r \leq 2\eta} \frac{g_{x}(r)}{k_{x}(r)} + \psi_{\lambda,x} \geq
4830: h_\lambda(x) \geq \inf_{0 < r \leq 2\eta} \frac{g_{x}(r)}{k_{x}(r)}
4831: \: \frac{1}{1+\psi_{\lambda,x}}
4832: \label{eq:bd-dn}
4833: \ee
4834: where
4835: \be
4836: \psi_{\lambda,x} \bydef
4837: \frac{\int_{2\eta}^\infty e^{\lambda u} du}
4838: {\int_0^{2\eta} e^{\lambda u} k_{x}(u) du} \leq
4839: \frac{1}{\eta |\lambda| k_x(\eta)} \;.
4840: \label{eq:bd2-dn}
4841: \ee
4842: Fix $x \in A_1$ of part (a), in which case $k_x(r)>0$ for all $r>0$ and
4843: $g_x(r)/k_x(r) \to f_1(x)$ as $r \to 0$.
4844: Letting $\lambda \downarrow -\infty$ and then $\eta \to 0$, it
4845: follows by (\ref{eq:bd-dn}) and (\ref{eq:bd2-dn}) that
4846: $$
4847: \lim_{\lambda \downarrow -\infty} h_\lambda(x) = f_1(x) \,.
4848: $$
4849: Recall that $P(A_1)=1$ and our assumption that
4850: $\int \log k_x(\eta) dP(x) > -\infty$ for any $\eta >0$.
4851: By our integrability conditions, the function
4852: $\min\{0, \inf_{\lambda \leq -1} \log h_\lambda(x)\}$ is $P$-integrable,
4853: hence, by Fatou's lemma,
4854: $$
4855: \liminf_{\lambda \downarrow -\infty} \int \log h_\lambda(x) dP(x) \geq 
4856: \int \log f_1(x) dP(x) = H(P\|Q) \,.
4857: $$
4858: Moreover, in case $H(P\|Q)<\infty$, our assumptions imply that
4859: $\sup_{\lambda \leq -1} |\log h_\lambda(x)|$ is $P$-integrable,
4860: hence by dominated convergence,
4861: $\int \log h_\lambda(x) dP(x) \to \int \log f_1(x) dP(x)$ for
4862: $\lambda \downarrow -\infty$,
4863: as required to complete the proof of
4864: (\ref{eq:dn-lim}).
4865: \qed
4866: 
4867: \section{Proof of Theorem~27}
4868: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4869: 
4870: Recall our assumption that, for 
4871: $\BBP$-a.e. $x_{[0,\infty)}$, conditional on 
4872: $X_{[0,\infty)}=
4873: x_{[0,\infty)}$ 
4874: the random variables $\{\rho_n(x_{C(n)},Y_{C(n)})\}$ satisfy the LDP
4875: with a {\it deterministic} convex good rate-function 
4876: denoted hereafter $R(\BBP,\BBQ,\cdot)$. Since 
4877: $\rho$ is bounded, by Varadhan's lemma and convex duality, 
4878: this implies that 
4879: \be
4880: \label{eq:am-las}
4881: R(\BBP,\BBQ,D) =
4882: \sup_{\lambda \in \RL} [ \lambda D - \Lambda_\infty(\lambda) ]
4883: \bydef
4884: \Lambda_\infty^*(D)
4885: \ee
4886: where for any $\lambda \in \RL$, the finite, deterministic limit
4887: $$
4888: \Lambda_\infty(\lambda) \bydef \lim_{n \to \infty} 
4889: \frac{1}{n^d} \log \int e^{\lambda \sum_{u \in C(n)}\rho(x_u,y_u)} 
4890: dQ_n(y_{C(n)})
4891: $$
4892: exists for $\BBP$-a.e. $x_{[0,\infty)}$ 
4893: (cf. \cite[Theorem 4.5.10]{dembo-zeitouni:book}).
4894: By bounded convergence, 
4895: $\Lambda_\infty(\lambda)$ is also the limit of 
4896: $$
4897: \Lambda_n(\lambda) \bydef
4898: \frac{1}{n^d} \int \left[
4899: \log \int e^{\lambda \sum_{u \in C(n)}\rho(x_u,y_u)} 
4900: dQ_n(y_{C(n)}) \right] dP_n(x_{C(n)}) \;.
4901: $$
4902: 
4903: By stationarity, 
4904: \be
4905: \label{eq:am-dav}
4906: \Dav=E_{P_n \times Q_n} (\rho_n(X_{C(n)},Y_{C(n)})), 
4907: \;\;\;\forall n \geq 1
4908: \ee
4909: so replacing $P_1$, $Q_1$ and $\rho(x,y)$ of Proposition~2 by
4910: $P_n$, $Q_n$ and $n^d \rho_n(x_{C(n)},y_{C(n)})$, 
4911: respectively, we see that
4912: \be
4913: \label{eq:am-lasn}
4914: R_n(P_n,Q_n,D) =
4915: \sup_{\lambda \in \RL} [ \lambda D - \Lambda_n(\lambda) ]
4916: \bydef
4917: \Lambda_n^*(D) \,.
4918: \ee
4919: %where for any $D \in (\Dmin,\Dav)$, the above supremum is
4920: %obtained at the unique $\lambda^*_n < 0$ such that $\Lambda_n'(\lambda^*_n)=D$.
4921: Note that 
4922: $|\Lambda_n(\lambda)-\Lambda_n(\lambda')| \leq c |\lambda-\lambda'|$ for some
4923: $c<\infty$ and all $n$, $\lambda,\lambda' \in \RL$,
4924: % (by the boundedness of $\rho$),
4925: hence the convergence of $\Lambda_n(\cdot)$ to $\Lambda_\infty(\cdot)$ is 
4926: uniform on compact subsets of $\RL$. In particular, the convex,
4927: continuous functions $\Lambda_n(\cdot)$ converge infimally to $\Lambda_\infty(\cdot)$,
4928: and consequently, by \cite[Theorem 5]{wijsman}, the convex functions
4929: $\Lambda_n^*(\cdot)$ converge infimally to $\Lambda_\infty^*(\cdot)$, that is
4930: \be
4931: \label{eq:inf-conv}
4932: \Lambda_\infty^*(D)
4933: =
4934: \lim_{\delta \to 0} \limsup_{n \to \infty} \inf_{|\hat{D}-D|<\delta} \Lambda_n^*(\hat{D}) 
4935: =
4936: \lim_{\delta \to 0} \liminf_{n \to \infty} \inf_{|\hat{D}-D|<\delta} \Lambda_n^*(\hat{D}) \,.
4937: \ee
4938: 
4939: It follows from (\ref{eq:am-dav}) and Jensen's inequality 
4940: that $\Lambda_n(\lambda) \geq \lambda \Dav$ for all $n$ and $\lambda$,
4941: hence, for $D \leq \Dav$ it suffices to consider $\lambda \leq 0$ in
4942: (\ref{eq:am-las}) and in (\ref{eq:am-lasn}). Thus, for $1 \leq n \leq \infty$,
4943: $\Lambda^*_n$ are non-negative, convex,
4944: and monotone non-increasing on $[0,\Dav]$, with 
4945: $\Lambda^*_n(\Dav)=0$. For $1 \leq n \leq \infty$,
4946: let
4947: $$
4948: \Dminn \bydef \lim_{\lambda \downarrow -\infty} \frac{\Lambda_n(\lambda)}{\lambda} \,,
4949: $$
4950: so that $\Lambda_n^*(D)=\infty$ for $D < \Dminn$, while
4951: $\Lambda_n^*(D)<\infty$ for $D>\Dminn$.
4952: Note that for $n < \infty$ this coincides with the definition of $\Dminn$ 
4953: given in (\ref{eq:dminn}). It is easy to check then that (\ref{eq:inf-conv})
4954: implies the pointwise convergence of $\Lambda^*_n(\cdot)=R_n(\BBP,\BBQ,\cdot)$
4955: to $\Lambda^*_\infty(\cdot)=R(\BBP,\BBQ,\cdot)$ at any $D$ for which
4956: $\Lambda^*_\infty(D-\delta) \downarrow \Lambda^*_\infty (D)$, that is,
4957: for all $D \neq \Dinf$. In particular, necessarily $\Dinf \in [\Dmin,\Dav]$,
4958: and $\Dinf$ may also be defined via (\ref{eq:dinf}). 
4959: The continuity of $R(\BBP,\BBQ,D)$ at $D \in (\Dmin,\Dav)$, $D \neq \Dinf$ implies
4960: the equality in (\ref{eq:ldp-27}) for such $D$, thus completing the proof of the 
4961: theorem.
4962: \qed
4963: 
4964: \section{Proof of Theorem~28}
4965: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4966: For each $m\geq 1$, let $G_m$ be the 
4967: collection of ``good'' realizations
4968: $x_{\IN^d}\in A^{\IN^d}$
4969: $$G_m= \left\{ x_{\IN^d}\in A^{\IN^d} 
4970: : \;\;Q_n(B(x_{C(n)},D))>0 \;\mbox{for all}\; n\geq m \right\}$$
4971: so that the assumption that 
4972: $Q_n(B(X_{C(n)},D))>0$ eventually, with probability one 
4973: translates to 
4974: \be
4975: \BBP\{\cup_{m\geq 1} G_m\}=1.
4976: \label{eq:eventuallyOK}
4977: \ee
4978: 
4979: To prove the lower bound
4980: we choose and fix an $m\geq 1$
4981: and a realization $x_{\IN^d}\in G_m$. 
4982: Then for any $K>1$:
4983: \ben
4984: \PR\{W_n^d<K\,|\,X_{C(n)}=x_{C(n)}\}
4985: &\leq& \sum_{u\in[0,\lfloor K^{1/d} \rfloor-1]^d}
4986: 	\,Q_n\{Y_{u+C(n)}\in B(x_{C(n)},D)\}\\
4987: &\leq& K\,Q_n(B(x_{C(n)},D)).
4988: \een
4989: Since, by its definition,
4990: $W_n$ is always greater than
4991: or equal to one, this 
4992: inequality trivially holds 
4993: also for $K\in(0,1]$.
4994: Setting 
4995: $K=[n^{1+\epsilon}Q_n(B(x_{C(n)},D))]^{-1}$ 
4996: above gives,
4997: for all $n \geq m$,
4998: \ben
4999: \PR\{\log[W^d_nQ_n(B(X_{C(n)},D))]<-(1+\epsilon)\log n
5000: \,|\,X_{C(n)}=x_{C(n)}\}
5001: \leq \frac{1}{n^{1+\epsilon}}.
5002: \een
5003: Since this bound is uniform over 
5004: $x_{\IN^d} \in G_m$ and summable, the Borel-Cantelli 
5005: lemma and assumption (\ref{eq:eventuallyOK})
5006: imply that
5007: \be
5008: \log[W^d_nQ_n(B(X_{C(n)},D))]\;\geq\;-(1+\epsilon)\log n
5009: \;\;\;\;\mbox{eventually, w.p.1.}
5010: \label{eq:bc:2}
5011: \ee
5012: 
5013: For the upper bound 
5014: we choose and fix an $m\geq 1$
5015: and a realization $x_{\IN^d}\in G_m$,
5016: and take $K\geq (n+1)^d$.
5017: Note that 
5018: $$\PR\{W_n^d>K\,|\,X_{C(n)}=x_{C(n)}\}
5019: \leq\Pr
5020:     \left\{
5021:       \sum_{u\in [0,M]^d}
5022:       \IND_{
5023: 	\{
5024: 	Y_{nu+C(n)}\in B(X_{C(n)},D)
5025: 	\}
5026: 	   }
5027: 		= 0 
5028:     \right\}
5029: $$
5030: where 
5031: the sum is over the $(M+1)^d$
5032: integer positions $u\in[0,M]^d\subset\IN^d$,
5033: $nu$ denotes the point 
5034: $(nu_1,nu_2,\ldots,nu_d)\in\IN^d$,
5035: and
5036: $$M=M(K,n)\bydef
5037: \left\lfloor\frac{K^{1/d}-1}{n}
5038: \right\rfloor.$$
5039: Let $\Sigma_n$ denote the sum 
5040: in the above probability,
5041: $$\Sigma_n=\sum_{u\in[0,M]^d}I_n(u)$$
5042: where $I_n(u)$ is the indicator function 
5043: of the event $\{Y_{nu+C(n)}\in B(X_{C(n)},D)\}$.
5044: In this notation:
5045: \be
5046: \PR\{W_n^d>K\,|\,X_{C(n)}=x_{C(n)}\}
5047: \,\leq\,\BBQ\{\Sigma_n=0\}\,\leq\,\frac{\VAR_\BBQ(\Sigma_n)}
5048: 				{[E_\BBQ(\Sigma_n)]^2}.
5049: \label{eq:estimate1}
5050: \ee
5051: By stationarity
5052: \be
5053: E_\BBQ(\Sigma_n) = [M+1]^dQ_n(B(x_{C(n)},D))
5054: \label{eq:estimate2}
5055: \ee
5056: and by the definition of the
5057: $\phi$-mixing coefficients, if $u\neq v$,
5058: $$E_\BBQ\{I_n(u)I_n(v)\}\leq Q_n(B(x_{C(n)},D))
5059: [\phi_n(nd(u,v)-n+1)+Q_n(B(x_{C(n)},D))].$$ 
5060: Using the last two estimates
5061: we can bound the variance as
5062: \be
5063: \VAR_\BBQ\{\Sigma_n\}
5064: &=&\sum_{u,v\in[0,M]^d}
5065: \COV_\BBQ(I_n(u), I_n(v))
5066:         \nonumber\\
5067: &\leq&[M+1]^dQ_n(B(x_{C(n)},D))
5068: 	\nonumber\\
5069: & & \quad
5070: 	+\sum_{u,v\in[0,M]^d,\;u\neq v}
5071: 	\Big[Q_n(B(x_{C(n)},D))
5072: 	\phi_n(nd(u,v)-n+1)\Big]
5073: 	\nonumber\\
5074: &\leq&[M+1]^dQ_n(B(x_{C(n)},D))
5075: 	\left[1+\sum_{j=1}^{M}
5076: c_d j^{d-1}\phi_n(nj-n+1)\right]
5077:         \label{eq:estimate3}
5078: \ee
5079: where
5080: $c_d j^{d-1}$ bounds the number of possible points
5081: $u$ that can be at a distance exactly $j$ from a 
5082: given point $v$ (for some constant $c_d$).
5083: By assumption (\ref{eq:dobrushin}) we can find
5084: a finite constant $\Phi$ such that the expression
5085: in square brackets in (\ref{eq:estimate3}) is bounded
5086: above by $\Phi$, uniformly in $n$.
5087: Substituting this bound, together with
5088: (\ref{eq:estimate2}) and (\ref{eq:estimate3}),
5089: in (\ref{eq:estimate1}), gives
5090: \be
5091: \PR\{W_n^d>K\,|\,X_{C(n)}=x_{C(n)}\}&\leq&\frac
5092:         {\Phi}
5093:         {[M+1]^dQ_n(B(x_{C(n)},D))}.
5094: \label{eq:estimate4}
5095: \ee
5096: Let $\epsilon>0$ be arbitrary, 
5097: take $n$ large enough so 
5098: that $n^{(1+\epsilon)/d}\geq 2$,
5099: and let $K=n^{d+1+\epsilon}/Q_n(B(x_{C(n)},D)).$
5100: Simple algebra shows that with this choice
5101: of $K$ we have
5102: $$[M+1]^dQ_n(B(x_{C(n)},D))\geq\frac{1}{2}n^{1+\epsilon}$$
5103: and substituting this in (\ref{eq:estimate4}) 
5104: yields
5105: \ben
5106: 	\PR\{\log[W_n^dQ_n(B(
5107: x_{
5108: C(n)},D))] >
5109: 	(d+1+\epsilon)\log n\,|\,X_{C(n)}=x_{C(n)}\}\leq
5110:         \frac{2\Phi}{n^{1+\epsilon}}.
5111: \een
5112: This bound is uniform 
5113: over $x_{\IN^d} \in G_m$ and summable, 
5114: so the Borel-Cantelli lemma 
5115: and (\ref{eq:eventuallyOK})
5116: imply that 
5117: \be
5118: \log[W_n^dQ_n(B(
5119: X_{
5120: C(n)},D))]\;\leq\;(d+1+\epsilon)\log n
5121: \;\;\;\;\mbox{eventually, w.p.1.}
5122: \label{eq:bc:1}
5123: \ee
5124: 
5125: Combining (\ref{eq:bc:1}) and (\ref{eq:bc:2}) 
5126: completes the proof.
5127: \qed
5128: 
5129: \newpage 
5130: 
5131: \begin{thebibliography}{10}
5132: 
5133: \bibitem{agw:90}
5134: R.~Arratia, L.~Gordon, and M.S. Waterman.
5135: \newblock The {E}rd{\"{o}}s-{R}{\'{e}}nyi law in distribution for coin tossing
5136:   and sequence matching.
5137: \newblock {\em Ann. Stat.}, 18:539--570, 1990.
5138: 
5139: \bibitem{arratia-waterman}
5140: R.~Arratia and M.S. Waterman.
5141: \newblock A phase transition for the score in matching random sequences
5142:   allowing deletions.
5143: \newblock {\em Ann. Appl. Probab.}, 4:200--225, 1994.
5144: 
5145: \bibitem{barron:1}
5146: A.R. Barron.
5147: \newblock The strong ergodic theorem for densities: {G}eneralized
5148:   {S}hannon-{M}c{M}illan-{B}reiman theorem.
5149: \newblock {\em Ann. Probab.}, 13:1292--1303, 1985.
5150: 
5151: \bibitem{bell:cleary:witten}
5152: T.C. Bell, J.G.~Cleary and I.H. Witten.
5153: \newblock {\em Text Compression}.
5154: \newblock Prentice Hall, New Jersey, 1990.
5155: 
5156: \bibitem{bell-cover:88}
5157: R.~Bell and T.M. Cover.
5158: \newblock Game-theoretic optimal portfolios.
5159: \newblock {\em Management Sci.}, 34(6):724--733, 1988.
5160: 
5161: \bibitem{berger:book}
5162: T.~Berger.
5163: \newblock {\em Rate Distortion Theory: A Mathematical Basis for Data
5164:   Compression}.
5165: \newblock Prentice-Hall Inc., Englewood Cliffs, NJ, 1971.
5166: 
5167: \bibitem{berger-shen-ye:92}
5168: T.~Berger, S.Y. Shen, and Z.X. Ye.
5169: \newblock Some communication problems of random fields.
5170: \newblock {\em Internat. J. Math. Statist. Sci.}, 1(1):47--77, 1992.
5171: 
5172: \bibitem{berkes-morrow}
5173: I.~Berkes and G.J. Morrow.
5174: \newblock Strong invariance principles for mixing random fields.
5175: \newblock {\em Z. Wahrsch. Verw. Gebiete}, 57(1):15--37, 1981.
5176: 
5177: \bibitem{bradley}
5178: R.~C. Bradley.
5179: \newblock Basic properties of strong mixing conditions.
5180: \newblock In E.~Eberlein and M.S. Taqqu, editors, {\em Dependence in
5181:   Probability and Statistics}, pages 165--192, 1986.
5182: 
5183: \bibitem{breiman:57}
5184: L.~Breiman.
5185: \newblock The individual ergodic theorem for information theory.
5186: \newblock {\em Ann. Math. Stat.}, 28:809--811, 1957.
5187: 
5188: \bibitem{breiman:60}
5189: L.~Breiman.
5190: \newblock Correction to ``{T}he individual ergodic theorem for information
5191:   theory''.
5192: \newblock {\em Ann. Math. Stat.}, 31:809--810, 1960.
5193: 
5194: \bibitem{bryc-dembo:96}
5195: W.~Bryc and A.~Dembo.
5196: \newblock Large deviations and strong mixing.
5197: \newblock {\em Ann. Inst. H. Poincar\'e Probab. Statist.}, 32(4):549--569,
5198:   1996.
5199: 
5200: \bibitem{bucklew:87}
5201: J.A. Bucklew.
5202: \newblock The source coding theorem via {S}anov's theorem.
5203: \newblock {\em IEEE Trans. Inform. Theory}, 33(6):907--909, 1987.
5204: 
5205: \bibitem{bucklew:88}
5206: J.A. Bucklew.
5207: \newblock A large deviation theory proof of the abstract alphabet source coding
5208:   theorem.
5209: \newblock {\em IEEE Trans. Inform. Theory}, 34(5):1081--1083, 1988.
5210: 
5211: \bibitem{chazottesetal:98}
5212: J.-R. Chazottes, E.~Floriani, and R.~Lima.
5213: \newblock Relative entropy and identification of {G}ibbs measures in dynamical
5214:   systems.
5215: \newblock {\em J. Statist. Phys.}, 90:697--725, 1998.
5216: 
5217: \bibitem{chi-it:01}
5218: Z.~Chi.
5219: \newblock The first order asymptotics of waiting times with distortion between
5220:   stationary processes.
5221: \newblock {\em IEEE Trans. Inform. Theory}, 47(1):338--347, 2001.
5222: 
5223: \bibitem{chi-AP:01}
5224: Z.~Chi.
5225: \newblock Stochastic sub-additivity approach to conditional large deviation
5226:   principle.
5227: \newblock {\em To appear, Ann. Probab.}, 2001.
5228: 
5229: \bibitem{comets:86}
5230: F.~Comets.
5231: \newblock Grandes d\'eviations pour des champs de {G}ibbs sur {${\bf Z }\sp
5232:   d$}.
5233: \newblock {\em C. R. Acad. Sci. Paris S\'er. I Math.}, 303(11):511--513, 1986.
5234: 
5235: \bibitem{comets:89}
5236: F.~Comets.
5237: \newblock Large deviation estimates for a conditional probability distribution.
5238:   {A}pplications to random interaction {G}ibbs measures.
5239: \newblock {\em Probab. Theory Related Fields}, 80:407--432, 1989.
5240: 
5241: \bibitem{cover:book}
5242: T.M. Cover and J.A. Thomas.
5243: \newblock {\em Elements of Information Theory}.
5244: \newblock J. Wiley, New York, 1991.
5245: 
5246: \bibitem{csiszar:book}
5247: I.~Csisz{\'{a}}r and J.~K{\"{o}}rner.
5248: \newblock {\em Information Theory: Coding Theorems for Discrete Memoryless
5249:   Systems}.
5250: \newblock Academic Press, New York, 1981.
5251: 
5252: \bibitem{dawson-gartner:87}
5253: D.A. Dawson and J.~G{\"a}rtner.
5254: \newblock Large deviations from the {M}c{K}ean-{V}lasov limit for weakly
5255:   interacting diffusions.
5256: \newblock {\em Stochastics}, 20(4):247--308, 1987.
5257: 
5258: \bibitem{dembo-kontoyiannis}
5259: A.~Dembo and I.~Kontoyiannis.
5260: \newblock The asymptotics of waiting times between stationary processes,
5261:   allowing distortion.
5262: \newblock {\em Ann. Appl. Probab.}, 9:413--429, 1999.
5263: 
5264: \bibitem{dembo-kontoyiannis:crit:01}
5265: A.~Dembo and I.~Kontoyiannis.
5266: \newblock Critical behavior in lossy source coding.
5267: \newblock {\em To appear}, 2001.
5268: \newblock [Available from \texttt{www.dam.brown.edu/people/yiannis}].
5269: 
5270: \bibitem{dembo-zeitouni:book}
5271: A.~Dembo and O.~Zeitouni.
5272: \newblock {\em Large Deviations Techniques And Applications}.
5273: \newblock Springer-Verlag, New York, second edition, 1998.
5274: 
5275: \bibitem{deuschel-stroock:book}
5276: J.D. Deuschel and D.W. Stroock.
5277: \newblock {\em Large Deviations}.
5278: \newblock Academic Press, Boston, 1989.
5279: 
5280: \bibitem{doukhan:book}
5281: P.~Doukhan.
5282: \newblock {\em Mixing: Properties and Examples}.
5283: \newblock Springer-Verlag, New York, 1994.
5284: 
5285: \bibitem{elias}
5286: P.~Elias.
5287: \newblock Universal codeword sets and representations of the integers.
5288: \newblock {\em IEEE Trans. Inform. Theory}, 21:194--203, 1975.
5289: 
5290: \bibitem{evans-gariepy}
5291: L.C. Evans and R.F. Gariepy.
5292: \newblock {\em Measure theory and fine properties of functions}.
5293: \newblock CRC Press, Boca Raton, FL, 1992.
5294: 
5295: \bibitem{feldman:80}
5296: J.~Feldman.
5297: \newblock $r$-entropy, equipartition, and {O}rnstein's isomorphism theorem in
5298:   \mbox{${\bf R}^{n}$}.
5299: \newblock {\em Israel J. Math.}, 36(3-4):321--345, 1980.
5300: 
5301: \bibitem{fellerII:book}
5302: W.~Feller.
5303: \newblock {\em An Introduction to Probability Theory and its Applications.
5304:   {V}ol. {I}{I}.}
5305: \newblock John Wiley \& Sons Inc., New York, second edition, 1971.
5306: 
5307: \bibitem{folmer-orey}
5308: H.~F{\"o}llmer and S.~Orey.
5309: \newblock Large deviations for the empirical field of a {G}ibbs measure.
5310: \newblock {\em Ann. Probab.}, 16(3):961--977, 1988.
5311: 
5312: \bibitem{follmer:73}
5313: Hans F{\"o}llmer.
5314: \newblock On entropy and information gain in random fields.
5315: \newblock {\em Z. Wahrsch. Verw. Gebiete}, 26:207--217, 1973.
5316: 
5317: \bibitem{georgii:1}
5318: H.-O. Georgii.
5319: \newblock {\em {Gibbs} Measures and Phase Transitions}.
5320: \newblock W. de Gruyter: Berlin et al, 1989.
5321: 
5322: \bibitem{guyon:book}
5323: X.~Guyon.
5324: \newblock {\em Random Fields on a Network: Modeling, Statistics, and
5325:   Applications}.
5326: \newblock Springer-Verlag, New York, 1995.
5327: 
5328: \bibitem{ibragimov:62}
5329: I.A. Ibragimov.
5330: \newblock Some limit theorems for stationary processes.
5331: \newblock {\em Theory Probab. Appl.}, 7:349--382, 1962.
5332: 
5333: \bibitem{ishii-yamamoto:97}
5334: D.~Ishii and H.~Yamamoto.
5335: \newblock The redundancy of universal coding with a fidelity criterion.
5336: \newblock {\em IEICE Trans. Fundamentals}, E80-A:2225--2231, 1997.
5337: 
5338: \bibitem{kanaya-muramatsu:97}
5339: F.~Kanaya and J.~Muramatsu.
5340: \newblock An almost sure recurrence theorem with distortion for stationary
5341:   ergodic sources.
5342: \newblock {\em IEICE Trans. Fundamentals}, E80-A:2264--2267, 1997.
5343: 
5344: \bibitem{kanlis:phd}
5345: A.~Kanlis.
5346: \newblock {\em Compression and Transmission of Information at Multiple
5347:   Resolutions}.
5348: \newblock PhD thesis, Dept. of Electrical and Computer Engineering, University
5349:   of Maryland at College Park, 1998.
5350: 
5351: \bibitem{karlin-ost:88}
5352: S.~Karlin and F.~Ost.
5353: \newblock Maximal length of common words among random letter sequences.
5354: \newblock {\em Ann. Probab.}, 16:535--563, 1988.
5355: 
5356: \bibitem{kieffer:73}
5357: J.C. Kieffer.
5358: \newblock A counterexample to {P}erez's generalization of the
5359:   {S}hannon-{M}c{M}illan theorem.
5360: \newblock {\em Ann. Probab.}, 1:362--364, 1973.
5361: 
5362: \bibitem{kieffer:73b}
5363: J.C. Kieffer.
5364: \newblock Correction to: ``{A} counterexample to {P}erez's generalization of
5365:   the {S}hannon-{M}c{M}illan theorem'' ({A}nn. {P}robability {\bf 1} (1973),
5366:   362--364).
5367: \newblock {\em Ann. Probab.}, 4:153--154, 1976.
5368: 
5369: \bibitem{kieffer:91}
5370: J.C. Kieffer.
5371: \newblock Sample converses in source coding theory.
5372: \newblock {\em IEEE Trans. Inform. Theory}, 37(2):263--268, 1991.
5373: 
5374: \bibitem{koga-arimoto:98}
5375: H.~Koga and S.~Arimoto.
5376: \newblock On the asymptotic behaviors of the recurrence time with a fidelity
5377:   criterion for discrete memoryless sources and memoryless {G}aussian sources.
5378: \newblock {\em IEICE Trans. Fundamentals}, E81-A:981--986, 1998.
5379: 
5380: \bibitem{kontoyiannis-97}
5381: I.~Kontoyiannis.
5382: \newblock Second-order noiseless source coding theorems.
5383: \newblock {\em IEEE Trans. Inform. Theory}, 43(4):1339--1341, July 1997.
5384: 
5385: \bibitem{kontoyiannis-jtp}
5386: I.~Kontoyiannis.
5387: \newblock Asymptotic recurrence and waiting times for stationary processes.
5388: \newblock {\em J. Theoret. Probab.}, 11:795--811, 1998.
5389: 
5390: \bibitem{my:thesis}
5391: I.~Kontoyiannis.
5392: \newblock {\em Recurrence and Waiting Times in Stationary Processes, and their
5393:   Applications in Data Compression}.
5394: \newblock PhD thesis, Dept. of Electrical Engineering, Stanford University, May
5395:   1998.
5396: 
5397: \bibitem{covering-TR:99}
5398: I.~Kontoyiannis.
5399: \newblock Efficient sphere-covering and converse measure concentration via
5400:   generalized coding theorems.
5401: \newblock Technical Report 99-24, Department of Statistics, Purdue University,
5402:   October 1999.
5403: \newblock [Available from \texttt{www.dam.brown.edu/people/yiannis}].
5404: 
5405: \bibitem{kontoyiannis-lossy1-1}
5406: I.~Kontoyiannis.
5407: \newblock An implementable lossy version of the {L}empel-{Z}iv algorithm --
5408:   {P}art~{I}: {O}ptimality for memoryless sources.
5409: \newblock {\em IEEE Trans. Inform. Theory}, 45(7):2293--2305, November 1999.
5410: 
5411: \bibitem{kontoyiannis-red:00}
5412: I.~Kontoyiannis.
5413: \newblock Pointwise redundancy in lossy data compression and universal lossy
5414:   data compression.
5415: \newblock {\em IEEE Trans. Inform. Theory}, 46(1):136--152, January 2000.
5416: 
5417: \bibitem{konto-zhang:00}
5418: I.~Kontoyiannis and J.~Zhang.
5419: \newblock Arbitrary source models and {B}ayesian codebooks in rate-distortion
5420:   theory.
5421: \newblock {\em Preprint}, 2000.
5422: 
5423: \bibitem{krengel:book}
5424: U.~Krengel.
5425: \newblock {\em Ergodic Theorems}.
5426: \newblock Walter de Gruyter \& Co., Berlin, 1985.
5427: 
5428: \bibitem{lapidoth:97}
5429: A.~Lapidoth.
5430: \newblock On the role of mismatch in rate distortion theory.
5431: \newblock {\em IEEE Trans. Inform. Theory}, 43(1):38--47, 1997.
5432: 
5433: \bibitem{lin-lu:book}
5434: Z.~Lin and C.~Lu.
5435: \newblock {\em Limit Theory for Mixing Dependent Random Variables}.
5436: \newblock Kluwer Academic Publishers, Dordrecht, 1996.
5437: 
5438: \bibitem{luczak-szpankowski}
5439: T.~{\L}uczak and W.~Szpankowski.
5440: \newblock A suboptimal lossy data compression algorithm based on approximate
5441:   pattern matching.
5442: \newblock {\em IEEE Trans. Inform. Theory}, 43(5):1439--1451, 1997.
5443: 
5444: \bibitem{marton-shields:1}
5445: K.~Marton and P.C. Shields.
5446: \newblock Almost sure waiting time results for weak and very weak {B}ernoulli
5447:   processes.
5448: \newblock {\em Ergod. Th. \& Dynam. Sys.}, 15:951--960, 1995.
5449: 
5450: \bibitem{mcmillan}
5451: B.~McMillan.
5452: \newblock The basic theorems of information theory.
5453: \newblock {\em Ann. Math. Stat.}, 24:196--219, 1953.
5454: 
5455: \bibitem{olla:88}
5456: S.~Olla.
5457: \newblock Large deviations for {G}ibbs random fields.
5458: \newblock {\em Probab. Theory Related Fields}, 77(3):343--357, 1988.
5459: 
5460: \bibitem{oodaira-yoshihara:71a}
5461: H.~Oodaira and K.I. Yoshihara.
5462: \newblock The law of the iterated logarithm for stationary processes satisfying
5463:   mixing conditions.
5464: \newblock {\em K\=odai Math. Sem. Rep.}, 23:311--334, 1971.
5465: 
5466: \bibitem{oodaira-yoshihara:71b}
5467: H.~Oodaira and K.I. Yoshihara.
5468: \newblock Note on the law of the iterated logarithm for stationary processes
5469:   satisfying mixing conditions.
5470: \newblock {\em K\=odai Math. Sem. Rep.}, 23:335--342, 1971.
5471: 
5472: \bibitem{orey:85}
5473: S.~Orey.
5474: \newblock On the {S}hannon-{P}erez-{M}oy theorem.
5475: \newblock In {\em Particle systems, random media and large deviations
5476:   (Brunswick, Maine, 1984)}, pages 319--327. Amer. Math. Soc., Providence,
5477:   R.I., 1985.
5478: 
5479: \bibitem{orey:85b}
5480: S.~Orey.
5481: \newblock Large deviations in ergodic theory.
5482: \newblock In {\em Seminar on stochastic processes, 1984 (Evanston, Ill.,
5483:   1984)}, pages 195--249. Birkh\"auser Boston, Boston, Mass., 1986.
5484: 
5485: \bibitem{ornstein-weiss:83}
5486: D.~Ornstein and B.~Weiss.
5487: \newblock The {S}hannon-{M}c{M}illan-{B}reiman theorem for a class of amenable
5488:   groups.
5489: \newblock {\em Israel J. Math.}, 44:53--60, 1983.
5490: 
5491: \bibitem{peligrad:86}
5492: M.~Peligrad.
5493: \newblock Recent advances in the central limit theorem and its weak invariance
5494:   principle for mixing sequences of random variables (a survey).
5495: \newblock In E.~Eberlein and M.S. Taqqu, editors, {\em Dependence in
5496:   Probability and Statistics}, pages 193--223, 1986.
5497: 
5498: \bibitem{philipp-stout:book}
5499: W.~Philipp and W.~Stout.
5500: \newblock {\em Almost Sure Invariance Principles for Partial Sums of Weakly
5501:   Dependent Random Variables}.
5502: \newblock Memoirs of the AMS, 1975.
5503: \newblock vol. 2, issue 2, no. 161.
5504: 
5505: \bibitem{sakrison:69}
5506: D.J. Sakrison.
5507: \newblock The rate distortion function for a class of sources.
5508: \newblock {\em Information and Control}, 15:165--195, 1969.
5509: 
5510: \bibitem{sakrison:70}
5511: D.J. Sakrison.
5512: \newblock The rate of a class of random processes.
5513: \newblock {\em IEEE Trans. Inform. Theory}, IT-16:10--16, 1970.
5514: 
5515: \bibitem{shannon:48}
5516: C.E. Shannon.
5517: \newblock A mathematical theory of communication.
5518: \newblock {\em Bell System Tech. J.}, 27:379--423, 623--656, 1948.
5519: 
5520: \bibitem{shannon:59}
5521: C.E. Shannon.
5522: \newblock Coding theorems for a discrete source with a fidelity criterion.
5523: \newblock {\em IRE Nat. Conv. Rec.}, part~4:142--163, 1959.
5524: \newblock Reprinted in D. Slepian (ed.), {\em Key Papers in the Development of
5525:   Information Theory}, IEEE Press, 1974.
5526: 
5527: \bibitem{shields:3}
5528: P.C. Shields.
5529: \newblock Waiting times: {P}ositive and negative results on the {W}yner-{Z}iv
5530:   problem.
5531: \newblock {\em J. Theoret. Probab.}, 6(3):499--519, 1993.
5532: 
5533: \bibitem{steinberg-gutman}
5534: Y.~Steinberg and M.~Gutman.
5535: \newblock An algorithm for source coding subject to a fidelity criterion, based
5536:   upon string matching.
5537: \newblock {\em IEEE Trans. Inform. Theory}, 39(3):877--886, 1993.
5538: 
5539: \bibitem{wijsman}
5540: R.A. Wijsman.
5541: \newblock Convergence of sequences of convex sets, cones and functions.
5542: \newblock {\em Bull. Amer. Math. Soc.}, 70:186--188, 1964.
5543: 
5544: \bibitem{winkler:book}
5545: G.~Winkler.
5546: \newblock {\em Image Analysis, Random Fields and Dynamic Monte Carlo Methods: A
5547:   Mathematical Introduction}.
5548: \newblock Springer-Verlag, Berlin, 1995.
5549: 
5550: \bibitem{wyner-ziv:71}
5551: A.D. Wyner and J.~Ziv.
5552: \newblock Bounds on the rate-distortion function for stationary sources with
5553:   memory.
5554: \newblock {\em IEEE Trans. Inform. Theory}, IT-17:508--513, 1971.
5555: 
5556: \bibitem{wyner-ziv:1}
5557: A.D. Wyner and J.~Ziv.
5558: \newblock Some asymptotic properties of the entropy of a stationary ergodic
5559:   data source with applications to data compression.
5560: \newblock {\em IEEE Trans. Inform. Theory}, 35(6):1250--1258, 1989.
5561: 
5562: \bibitem{wyner-ziv:3}
5563: A.D. Wyner and J.~Ziv.
5564: \newblock Fixed data base version of the {L}empel-{Z}iv data compression
5565:   algorithm.
5566: \newblock {\em IEEE Trans. Inform. Theory}, 37(3):878--880, 1991.
5567: 
5568: \bibitem{wyner-ziv:2}
5569: A.D. Wyner and J.~Ziv.
5570: \newblock The sliding-window {L}empel-{Z}iv algorithm is asymptotically
5571:   optimal.
5572: \newblock {\em Proc. IEEE}, 82(6):872--877, 1994.
5573: 
5574: \bibitem{wyner-ziv-wyner}
5575: A.D. Wyner, J.~Ziv, and A.J. Wyner.
5576: \newblock On the role of pattern matching in information theory. ({I}nformation
5577:   theory: 1948--1998).
5578: \newblock {\em IEEE Trans. Inform. Theory}, 44(6):2045--2056, 1998.
5579: 
5580: \bibitem{yang-kieffer:1}
5581: E.-h. Yang and J.C. Kieffer.
5582: \newblock On the performance of data compression algorithms based upon string
5583:   matching.
5584: \newblock {\em IEEE Trans. Inform. Theory}, 44(1):47--65, 1998.
5585: 
5586: \bibitem{yang-zhang:III}
5587: E.-h. Yang and Z.~Zhang.
5588: \newblock The redundancy of source coding with a fidelity criterion --
5589:   {P}art~{III}: {C}oding at a fixed distortion level with unknown statistics.
5590: \newblock {\em Preprint}.
5591: 
5592: \bibitem{yang-zhang:99}
5593: E.-h. Yang and Z.~Zhang.
5594: \newblock On the redundancy of lossy source coding with abstract alphabets.
5595: \newblock {\em IEEE Trans. Inform. Theory}, 45(4):1092--1110, 1999.
5596: 
5597: \bibitem{yang-zhang:99c}
5598: E.-h. Yang and Z.~Zhang.
5599: \newblock The shortest common superstring problem: average case analysis for
5600:   both exact and approximate matching.
5601: \newblock {\em IEEE Trans. Inform. Theory}, 45(6):1867--1886, 1999.
5602: 
5603: \bibitem{yang-zhang:II}
5604: E.-h. Yang and Z.~Zhang.
5605: \newblock The redundancy of source coding with a fidelity criterion --
5606:   {P}art~{II}: {C}oding at a fixed rate level with unknown statistics.
5607: \newblock {\em IEEE Trans. Inform. Theory}, 47(1):126--145, 2001.
5608: 
5609: \bibitem{ye-berger:book}
5610: Z.~Ye and T.~Berger.
5611: \newblock {\em Information Measures for Discrete Random Fields}.
5612: \newblock Science Press, Beijing, 1998.
5613: 
5614: \bibitem{zhang-yang-wei:I}
5615: Z.~Zhang, E.-h. Yang, and V.K. Wei.
5616: \newblock The redundancy of source coding with a fidelity criterion --
5617:   {P}art~{I}: {K}nown statistics.
5618: \newblock {\em IEEE Trans. Inform. Theory}, 43(1):71--91, 1997.
5619: 
5620: \bibitem{ziv-lempel:1}
5621: J.~Ziv and A.~Lempel.
5622: \newblock A universal algorithm for sequential data compression.
5623: \newblock {\em IEEE Trans. Inform. Theory}, 23(3):337--343, 1977.
5624: 
5625: \bibitem{ziv-lempel:2}
5626: J.~Ziv and A.~Lempel.
5627: \newblock Compression of individual sequences by variable rate coding.
5628: \newblock {\em IEEE Trans. Inform. Theory}, 24(5):530--536, 1978.
5629: 
5630: \end{thebibliography}
5631: 
5632: % \bibliography{ik}
5633: % \bibliography{/users/yiannis/latex/ik}
5634: % \bibliography{../latex/ik}
5635: % \bibliography{/users/yiannis/latex/ik}
5636: % \bibliography{/home/mean/u21/yiannis/latex/ik}
5637: % \bibliography{/v0/yiannis/latex/ik}
5638: % \bibliography{/sccm0/yiannis/latex/ik}
5639: % \bibliography{/tmp_mnt/home-georgep/yiannis/latex/ik}
5640: 
5641: \end{document}
5642: 
5643: