1: %\documentstyle[11pt,epsf]{article}
2: \documentclass[11pt,epsf]{article}
3: \usepackage{amsmath}
4: \usepackage{amsfonts}
5: \usepackage{amssymb}
6: \usepackage{graphicx}
7: \topmargin 0.25truein
8: \oddsidemargin -0.1truein
9: \evensidemargin -0.1truein
10: \textheight 8.5truein
11: \textwidth 6.5truein
12: %\footheight 0.15truein
13: \footskip 0.6truein
14: \headheight 0.0truein
15: \headsep 0.0truein
16: \parskip 4pt plus 1pt
17:
18: \newenvironment{define}{\begin{trivlist}\item[]{\bf Definition:}\rm}{\end{trivlist}}
19: \newenvironment{corol}{\begin{trivlist}\item[]{\bf Corollary:}\rm}{\end{trivlist}}
20: \newenvironment{discus}{\begin{trivlist}\item[]{\bf Discussion:}\rm}{\end{trivlist}}
21: \newtheorem{theorem}{Theorem}
22: \newtheorem{lemma}{Lemma}
23: \newcommand {\bbeta} {\mbox{\boldmath $\beta$}}
24: \newcommand {\hx} {\hat{x}}
25: \newcommand {\hX} {\hat{X}}
26: \newcommand {\dfn} {\stackrel{\Delta} {=}}
27: \newcommand {\exe} {\stackrel{\cdot} {=}}
28: \newcommand{\eqa}{\stackrel{\mbox{(a)}}{=}}
29: \newcommand{\eqb}{\stackrel{\mbox{(b)}}{=}}
30: \newcommand{\eqc}{\stackrel{\mbox{(c)}}{=}}
31: \newcommand{\eqd}{\stackrel{\mbox{(d)}}{=}}
32: \newcommand{\eqe}{\stackrel{\mbox{(e)}}{=}}
33: \newcommand{\eqf}{\stackrel{\mbox{(f)}}{=}}
34: \newcommand{\lea}{\stackrel{\mbox{(a)}}{\le}}
35: \newcommand{\leb}{\stackrel{\mbox{(b)}}{\le}}
36: \newcommand{\lec}{\stackrel{\mbox{(c)}}{\le}}
37: \newcommand{\led}{\stackrel{\mbox{(d)}}{\le}}
38: \newcommand{\lee}{\stackrel{\mbox{(e)}}{\le}}
39: \newcommand{\lef}{\stackrel{\mbox{(f)}}{\le}}
40: \newcommand{\gea}{\stackrel{\mbox{(a)}}{\ge}}
41: \newcommand{\geb}{\stackrel{\mbox{(b)}}{\ge}}
42: \newcommand{\gec}{\stackrel{\mbox{(c)}}{\ge}}
43: \newcommand{\ged}{\stackrel{\mbox{(d)}}{\ge}}
44: \newcommand{\gee}{\stackrel{\mbox{(e)}}{\ge}}
45: \newcommand{\gef}{\stackrel{\mbox{(f)}}{\ge}}
46: \newcommand {\reals} {{\rm I\!R}}
47: \newcommand {\ba} {\mbox{\boldmath $a$}}
48: \newcommand {\bb} {\mbox{\boldmath $b$}}
49: \newcommand {\bc} {\mbox{\boldmath $c$}}
50: \newcommand {\bd} {\mbox{\boldmath $d$}}
51: \newcommand {\be} {\mbox{\boldmath $e$}}
52: \newcommand {\Bf} {\mbox{\boldmath $f$}}
53: \newcommand {\bg} {\mbox{\boldmath $g$}}
54: \newcommand {\bh} {\mbox{\boldmath $h$}}
55: \newcommand {\bi} {\mbox{\boldmath $i$}}
56: \newcommand {\bj} {\mbox{\boldmath $j$}}
57: \newcommand {\bk} {\mbox{\boldmath $k$}}
58: \newcommand {\bl} {\mbox{\boldmath $l$}}
59: \newcommand {\bm} {\mbox{\boldmath $m$}}
60: \newcommand {\bn} {\mbox{\boldmath $n$}}
61: \newcommand {\bo} {\mbox{\boldmath $o$}}
62: \newcommand {\bp} {\mbox{\boldmath $p$}}
63: \newcommand {\bq} {\mbox{\boldmath $q$}}
64: \newcommand {\br} {\mbox{\boldmath $r$}}
65: \newcommand {\bs} {\mbox{\boldmath $s$}}
66: \newcommand {\bt} {\mbox{\boldmath $t$}}
67: \newcommand {\bu} {\mbox{\boldmath $u$}}
68: \newcommand {\bv} {\mbox{\boldmath $v$}}
69: \newcommand {\bw} {\mbox{\boldmath $w$}}
70: \newcommand {\bx} {\mbox{\boldmath $x$}}
71: \newcommand {\by} {\mbox{\boldmath $y$}}
72: \newcommand {\bz} {\mbox{\boldmath $z$}}
73: \newcommand {\bA} {\mbox{\boldmath $A$}}
74: \newcommand {\bB} {\mbox{\boldmath $B$}}
75: \newcommand {\bC} {\mbox{\boldmath $C$}}
76: \newcommand {\bD} {\mbox{\boldmath $D$}}
77: \newcommand {\bE} {\mbox{\boldmath $E$}}
78: \newcommand {\bF} {\mbox{\boldmath $F$}}
79: \newcommand {\bG} {\mbox{\boldmath $G$}}
80: \newcommand {\bH} {\mbox{\boldmath $H$}}
81: \newcommand {\bI} {\mbox{\boldmath $I$}}
82: \newcommand {\bJ} {\mbox{\boldmath $J$}}
83: \newcommand {\bK} {\mbox{\boldmath $K$}}
84: \newcommand {\bL} {\mbox{\boldmath $L$}}
85: \newcommand {\bM} {\mbox{\boldmath $M$}}
86: \newcommand {\bN} {\mbox{\boldmath $N$}}
87: \newcommand {\bO} {\mbox{\boldmath $O$}}
88: \newcommand {\bP} {\mbox{\boldmath $P$}}
89: \newcommand {\bQ} {\mbox{\boldmath $Q$}}
90: \newcommand {\bR} {\mbox{\boldmath $R$}}
91: \newcommand {\bS} {\mbox{\boldmath $S$}}
92: \newcommand {\bT} {\mbox{\boldmath $T$}}
93: \newcommand {\bU} {\mbox{\boldmath $U$}}
94: \newcommand {\bV} {\mbox{\boldmath $V$}}
95: \newcommand {\bW} {\mbox{\boldmath $W$}}
96: \newcommand {\bX} {\mbox{\boldmath $X$}}
97: \newcommand {\bY} {\mbox{\boldmath $Y$}}
98: \newcommand {\bZ} {\mbox{\boldmath $Z$}}
99: \newcommand{\calA}{{\cal A}}
100: \newcommand{\calB}{{\cal B}}
101: \newcommand{\calC}{{\cal C}}
102: \newcommand{\calD}{{\cal D}}
103: \newcommand{\calE}{{\cal E}}
104: \newcommand{\calF}{{\cal F}}
105: \newcommand{\calG}{{\cal G}}
106: \newcommand{\calH}{{\cal H}}
107: \newcommand{\calI}{{\cal I}}
108: \newcommand{\calJ}{{\cal J}}
109: \newcommand{\calK}{{\cal K}}
110: \newcommand{\calL}{{\cal L}}
111: \newcommand{\calM}{{\cal M}}
112: \newcommand{\calN}{{\cal N}}
113: \newcommand{\calO}{{\cal O}}
114: \newcommand{\calP}{{\cal P}}
115: \newcommand{\calQ}{{\cal Q}}
116: \newcommand{\calR}{{\cal R}}
117: \newcommand{\calS}{{\cal S}}
118: \newcommand{\calT}{{\cal T}}
119: \newcommand{\calU}{{\cal U}}
120: \newcommand{\calV}{{\cal V}}
121: \newcommand{\calW}{{\cal W}}
122: \newcommand{\calX}{{\cal X}}
123: \newcommand{\calY}{{\cal Y}}
124: \newcommand{\calZ}{{\cal Z}}
125: \begin{document}
126: \thispagestyle{empty}
127: \title{An Identity of Chernoff Bounds with an Interpretation
128: in Statistical Physics and Applications in Information Theory
129: %\thanks{This research was supported by my wife and kids.}
130: }
131: \author{Neri Merhav
132: %\thanks{
133: %Currently on sabbatical leave at HP Laboratories,
134: %1501 Page Mill Road, MS 3U-4, Palo Alto CA 94304, USA.}
135: }
136: %\date{}
137: \maketitle
138:
139: \begin{center}
140: Department of Electrical Engineering \\
141: Technion - Israel Institute of Technology \\
142: Haifa 32000, Israel \\
143: \end{center}
144: \vspace{1.5\baselineskip}
145: \setlength{\baselineskip}{1.5\baselineskip}
146:
147: \begin{abstract}
148:
149: An identity between two versions of the
150: Chernoff bound on the probability of a certain
151: large deviations event is established. This identity has an interpretation
152: in statistical physics, namely, an isothermal equilibrium of a composite system that
153: consists of multiple subsystems of particles.
154: Several information--theoretic application examples, where
155: the analysis of this large deviations probability naturally arises, are
156: then described from the viewpoint of this statistical mechanical interpretation.
157: This results in several relationships between
158: information theory and statistical physics, which,
159: we hope, the reader will find insightful.
160:
161: \vspace{0.5cm}
162:
163: \noindent
164: {\bf Index Terms:} Large deviations theory,
165: Chernoff bound, statistical physics, thermal equilibrium,
166: equipartition, thermodynamics, phase transitions.
167: \end{abstract}
168:
169:
170: \section{Introduction}
171:
172: Relationships between information theory and statistical physics have been
173: extensively recognized over the last few decades, and they are drawn from
174: many different aspects. We mention here only a few of them.
175:
176: One such aspect is characterized by identifying structures
177: of optimization problems pertaining to certain information--theoretic settings
178: as being analogous to parallel structures that arise in statistical physics,
179: and then borrowing statistical--mechanical
180: insights, as well as powerful analysis techniques (like the replica
181: method) from statistical physics to the dual information--theoretic setting
182: of interest. A very partial list of works along this line includes
183: \cite{AB01},
184: \cite{GV02},
185: \cite{HK05},
186: \cite{KH05},
187: \cite{KNM02},
188: \cite{KabS99},
189: \cite{KSNS01},
190: \cite{KanS99},
191: \cite{MM06} (and references therein),
192: \cite{MR06},
193: \cite{Murayama02},
194: \cite{PS99},
195: \cite{RC00},
196: \cite{Sourlas89},
197: \cite{Sourlas94},
198: \cite{Tanaka01},
199: \cite{Tanaka02},
200: and \cite{WSW05}.
201:
202: Another aspect pertains to the
203: philosophy and the application of the maximum entropy principle,
204: which emerged in statistical mechanics
205: in the nineteenth century and has been advocated during the
206: previous century in
207: a wide variety of more general contexts, by Jaynes
208: \cite{Jaynes57a},\cite{Jaynes57b},\cite{Jaynes82}, and by
209: Shore and Johnson \cite{SJ80}, as a general guiding principle
210: to problems in information theory
211: (see, e.g., \cite[Chap.\ 11]{CT91} and references therein)
212: and other areas, such as signal processing,
213: in particular, speech coding (see, e.g., \cite{GGRS81}),
214: spectrum estimation (see, e.g., \cite{Burg75}), and others.
215:
216: Yet another aspect is related to ideas and theories that
217: underlie the notion of `trading' between
218: information bits and energy, or heat. In particular,
219: Landauer's erasure principle
220: \cite{Landauer61} is argued to provide a powerful link between
221: information theory and physics and to
222: suggest a physical theory of information
223: (comprehensive overviews are included in,
224: e.g., \cite{Maroney04} and \cite{PV01}).
225: According to Landauer's principle, the erasure of
226: every bit of information increases the thermodynamic
227: entropy of the world by $k\ln 2$, where $k$ is Boltzmann's
228: constant, and so, information is actually physical.
229:
230: Finally, to shift gears more to the direction of this paper,
231: we should mention the aspect of the interface between statistical physics and
232: large deviations theory,
233: a line of research advocated most
234: prominently by Ellis \cite{Ellis85},\cite{Ellis06},
235: and developed also by Oono \cite{Oono89},
236: McAllester \cite{McAllester}, and others. The main
237: theme here revolves around the
238: identification of Chernoff bounds and more general large deviations
239: rate functions with free energies (along with
240: their related partition functions),
241: thermodynamical entropies, and the underlying maximum--entropy/equilibrium principle associated with them.
242: In particular, Ellis' book \cite{Ellis85}
243: is devoted largely to the application of large deviations theory
244: to the statistical physics pertaining to
245: models of ferromagnetic spin arrays, like
246: Ising spin glasses and others,
247: in order to explore
248: phase transitions phenomena of spontaneous
249: magnetization (see also \cite{MM06}).
250:
251: This paper, which is mostly expository in character,
252: lies in the intersection
253: of information theory, large deviations theory, and
254: statistical physics. In particular, we establish a simple identity between two
255: quantities as they can both be interpreted as the rate
256: function of a certain large deviations event
257: that involves multiple
258: distributions of sets of independent random variables (as opposed
259: to the usual, single set of i.i.d.\ random variables).
260: The analysis of this large deviations event is of a general form
261: that is frequently encountered in numerous applications in
262: information theory (cf.\ Section 4). Its informal description is as follows:
263: Let $v_1,\ldots,v_n$ be an arbitrary
264: (deterministic) sequence whose components take
265: on values in a finite set $\calV$, and let $U_1,\ldots,U_n$ be a sequence of
266: random variables where each component is generated independently
267: according to a distribution $q(u_i|v_i)$, $i=1,\ldots,n$.
268: For a given function $f$ and a constant $E$,
269: we are interested in the large deviations
270: analysis (Chernoff bound) of the probability of the event
271: \begin{equation}
272: \label{event}
273: \sum_{i=1}^n f(U_i,v_i)\le nE,
274: \end{equation}
275: assuming that the relative frequencies of the various symbols in
276: $(v_1,\ldots,v_n)$ stabilize as $n$ grows without bound, and assuming
277: that $E$ is sufficiently small to make this a rare event for large $n$.
278:
279: There are (at least) two ways to derive a Chernoff bound on the probability
280: of this event. The first is to treat the entire sequence of RV's,
281: $\{f(U_i,v_i)\}$ as a whole, and the second is to partition it
282: according to the various symbols $\{v_i\}$, i.e., to consider the separate
283: large deviations events of the
284: partial sums, $\sum_{i:v_i=v}f(U_i,v)$, $v\in\calV$, for all possible
285: allocations of the total `budget' $nE$ among the various $\{v\}$.
286: These two approaches lead to two
287: (seemingly) different expressions of Chernoff bounds,
288: but since they are both exponentially tight, they must agree.
289:
290: As will be described and discussed in Section 3,
291: the identity between these two Chernoff bounds has a natural
292: interpretation in statistical physics: it is viewed as
293: a situation of thermal equilibrium (maximum
294: entropy) in a system that consists of several
295: subsystems (which can be of different kinds), each of them with many particles.
296:
297: As will be shown in Section 4,
298: the above--described problem of large deviations analysis of the
299: event (\ref{event}) is encountered in many applications in information
300: theory, such as rate--distortion coding, channel capacity, hypothesis
301: testing (signal detection, in particular), and others. The
302: above mentioned statistical mechanical
303: interpretation then applies to all
304: of them. Accordingly, Section 4 is devoted to expository descriptions of
305: each of these applications, along with
306: the underlying physics that is inspired by
307: the proposed thermal equilibrium interpretation. The reader is assumed to have
308: a very elementary background in statistical physics.
309:
310: The remaining part of this paper is organized as follows. In Section 2,
311: we establish some notation conventions. In Section 3, we assert and prove
312: our main result, which is the identity between the above described
313: Chernoff bounds. Finally, in Section 4, we explore the application
314: examples.
315:
316: \section{Notation}
317:
318: Throughout this paper, scalar random
319: variables (RV's) will be denoted by the capital
320: letters, like $U$,$V$,$X$, and $Y$, their sample values will be denoted by
321: the respective lower case letters, and their alphabets will be denoted
322: by the respective calligraphic letters.
323: A similar convention will apply to
324: random vectors and their sample values,
325: which will be denoted with the same symbols superscripted by the dimension.
326: Thus, for example, $X^n$ will denote a random $n$-vector $(X_1,\ldots,X_n)$,
327: and $x^n=(x_1,\ldots,x_n)$ is a specific vector value in $\calX^n$,
328: the $n$-th Cartesian power of $\calX$. The
329: notations $x_i^j$ and $X_i^j$, where $i$
330: and $j$ are integers and $i\le j$, will designate segments $(x_i,\ldots,x_j)$
331: and $(X_i,\ldots,X_j)$, respectively,
332: where for $i=1$, the subscript will be omitted (as above).
333: Sequences without specifying indices are denoted by $ \{\cdot\} $.
334: Sources and channels will be denoted generically by the letter $P$ or $Q$.
335: Specific letter probabilities corresponding to a source $P$ will be
336: denoted by the corresponding lower case letter, e.g., $p(v)$ is the
337: probability of a letter $v\in\calV$. A similar convention will be applied
338: to a channel $Q$ and the corresponding transition probabilities, e.g., $q(u|v)$,
339: $u\in\calU$, $v\in\calV$.
340: The cardinality of a finite set $\calA$ will be denoted by $|\calA|$.
341: Information theoretic quantities like entropies, and mutual
342: informations will be denoted following the usual conventions
343: of the information theory literature.
344:
345: Notation pertaining to statistical physics
346: will also follow, wherever possible,
347: the customary conventions. I.e., $k$ will denote
348: Boltzmann's constant ($k=1.38065\times 10^{-23}$ Joules
349: per Kelvin degree), $T$ -- absolute temperature (in Kelvin
350: degrees), $\beta=1/(kT)$ -- the inverse temperature
351: (in units of $\mbox{Joule}^{-1}$ or $\mbox{erg}^{-1}$),
352: $E$ -- energy, the letter $Z$ will be used to
353: denote partition functions, etc.
354:
355: \section{Main Result}
356:
357: Let $\calU$ and $\calV$ be finite\footnote{The assumption that $\calU$ is
358: finite, is made mostly for the sake of convenience and simplicity. Most
359: of our results extend straightforwardly to the case of a continuous
360: alphabet $\calU$. The extension to a continuous alphabet $\calV$ is somewhat
361: more subtle, however.}
362: sets and let $f:\calU\times\calV\to\reals$
363: be a given function. Let
364: $P=\{p(v),~v\in\calV\}$
365: be a probability mass function on $\calV$ and
366: let $Q=\{q(u|v),~u\in\calU,~v\in\calV\}$ be a
367: matrix of conditional probabilities from $\calV$ to $\calU$.
368: %We are given a deterministic vector
369: %$v^n=(v_1,\dots,v_n)$ with components in $\calV$,
370: %where each letter $v\in\calV$ appears $n_v$ times, $\sum_{v\in\calV}n_v=n$. We
371: %will also denote $p(v)=n_v/n$.
372:
373: Next, let us define for each $v\in\calV$,
374: the partition function:
375: \begin{equation}
376: \label{zvb}
377: Z_v(\beta)=\sum_{u\in\calU}q(u|v)e^{-\beta f(u,v)},~~~~\beta\ge 0,
378: \end{equation}
379: and for a given $E_v$ in the range
380: \begin{equation}
381: \label{range}
382: \min_{u\in\calU}f(u,v) \le E_v \le
383: \sum_{u\in\calU}q(u|v)f(u,v),
384: \end{equation}
385: let
386: \begin{equation}
387: S_v(E_v)=\min_{\beta\ge 0}[\beta E_v+\ln Z_v(\beta)].
388: \end{equation}
389: Further, for a given constant $E$ in the range
390: $$\sum_{v\in\calV}p(v)\min_{u\in\calU}f(u,v) \le E \le
391: \sum_{u\in\calU}\sum_{v\in\calV}p(v)q(u|v)f(u,v),$$
392: let
393: \begin{equation}
394: \bar{S}(E)=\min_{\beta\ge 0}\left[\beta E+
395: \sum_{v\in\calV}p(v)\ln Z_v(\beta)\right].
396: \end{equation}
397: Let $\calH(E)$ denote the set of all $|\calV|$--dimensional vectors
398: $\bar{E}=\{E_v,~v\in\calV\}$, where each component $E_v$ satisfies
399: (\ref{range}),
400: and where $\sum_vp(v)E_v\le E$.
401: Our main result, in this section, is the following:
402:
403: \begin{theorem}
404: \begin{equation}
405: \label{identity}
406: \max_{\bar{E}\in\calH(E)}\sum_{v\in\calV}p(v)S_v(E_v)=\bar{S}(E).
407: \end{equation}
408: \end{theorem}
409:
410: The expression on the right--hand side is,
411: of course, more convenient to work with since
412: it involves minimization w.r.t.\ one parameter
413: only, as opposed to the left--hand side,
414: where there is a minimization over $\beta$
415: for every $v$, as well as a maximization
416: over the $|\calV|$--dimensional vector $\bar{E}$.
417:
418: While the proof of Theorem 1 below is fairly short,
419: in the Appendix (subsection A.1), we outline an alternative
420: proof which, although somewhat longer,
421: provides some additional insight, we believe.
422: As described briefly in the Introduction,
423: it is based on two different approaches to the analysis
424: of the rate function, $I(E)$, pertaining to
425: the probability of the event:
426: \begin{equation}
427: \label{ld}
428: \sum_{i=1}^n f(U_i,v_i)\le nE,
429: \end{equation}
430: where $\{U_i\}$ are RV's taking values in $\calU$ and drawn according to
431: $q(u^n|v^n)=\prod_{i=1}^nq(u_i|v_i)$, and
432: $v^n=(v_1,\dots,v_n)$ is a given deterministic vector whose
433: components are in $\calV$,
434: with each $v\in\calV$ appearing
435: $n_v$ times ($\sum_{v\in\calV}n_v=n$), and
436: the related relative frequency, $n_v/n$, is exactly $p(v)$.
437:
438: It should be noted that
439: the proof in the Appendix pertains to a
440: slightly different definition of the set
441: $\calH(E)$, where the individual upper bound to
442: each $E_v$ is enlarged to $\max_uf(u,v)$.
443: Thus, $\calH(E)$ is extended to a larger set,
444: which will be denoted by $\calH_0(E)$ in the Appendix. But the
445: maximum over $\calH_0(E)$ is
446: always attained within the original set $\calH(E)$
447: (as is actually shown in the proof below).
448:
449: \vspace{0.5cm}
450:
451: \noindent
452: {\it Proof.}
453: Here we prove the identity of Theorem 1
454: directly, without using large deviations analysis and Chernoff bounds.
455: We first prove that for every $\bar{E}\in\calH(E)$,
456: we have $\sum_{v\in\calV}p(v)S_v(E_v)\le \bar{S}(E)$
457: and then, of course,
458: $$\max_{\bar{E}\in\calH(E)}\sum_{v\in\calV}p(v)S_v(E_v)\le \bar{S}(E)$$
459: as well. This follows from the following chain of inequalities:
460: \begin{eqnarray}
461: \sum_{v\in\calV}p(v)S_v(E_v)&=&\sum_{v\in\calV}p(v)\cdot\min_{\beta\ge 0}[\beta E_v+\ln Z_v(\beta)]\nonumber\\
462: &=&\sum_{v\in\calV}\min_{\beta\ge 0}[\beta p(v)E_v+p(v)\ln Z_v(\beta)]\nonumber\\
463: &\le&\min_{\beta\ge 0}\left[\beta\sum_{v\in\calV}p(v)E_v+
464: \sum_{v\in\calV}p(v)\ln Z_v(\beta)\right]\nonumber\\
465: &\le&\min_{\beta\ge 0}\left[\beta E+\sum_{v\in\calV}p(v)\ln Z_v(\beta)\right]\nonumber\\
466: &=&\bar{S}(E),
467: \end{eqnarray}
468: where in the second inequality we used the postulate that
469: $\sum_vp(v)E_v\le E$.
470:
471: In the other direction, let $\beta^*$ be the achiever of $\bar{S}(E)$,
472: i.e., $\beta^*$ is the solution to the equation:
473: $$E=-\left[\frac{\partial}{\partial\beta}\sum_vp(v)
474: \ln Z_v(\beta)\right]_{\beta=\beta^*}.$$
475: For each $v\in\calV$,
476: let $E_v^*\in[\min_uf(u,v),\sum_uq(u|v)f(u,v)]$ be chosen such that
477: $\beta^*$ would be the achiever of $S_v(E_v^*)$, i.e., $E_v^*=-[\partial\ln Z_v(\beta)/\partial\beta]_{\beta=\beta^*}$.
478: Obviously, the vector $\{E_v^*,~v\in\calV\}$ lies in $\calH(E)$, and
479: \begin{eqnarray}
480: \sum_vp(v)E_v^*&=&-\sum_vp(v)\left[\frac{\partial\ln
481: Z_v(\beta)}{\partial\beta}\right]_{\beta=\beta^*}\nonumber\\
482: &=&-\left[\frac{\partial}{\partial\beta}\sum_vp(v)
483: \ln Z_v(\beta)\right]_{\beta=\beta^*}\nonumber\\
484: &=&E.
485: \end{eqnarray}
486: Thus,
487: \begin{eqnarray}
488: \max_{\bar{E}\in\calH(E)}\sum_{v\in\calV}p(v)S_v(E_v)
489: &\ge&\sum_{v\in\calV}p(v)S_v(E_v^*)\nonumber\\
490: &=&\sum_{v\in\calV}p(v)[\beta^* E_v^*+\ln Z_v(\beta^*)]\nonumber\\
491: &=&\beta^*\sum_{v\in\calV}p(v)E_v^*+\sum_vp(v)\ln Z_v(\beta^*)\nonumber\\
492: &=&\beta^*E+\sum_vp(v)\ln Z_v(\beta^*)\nonumber\\
493: &=&\bar{S}(E).
494: \end{eqnarray}
495: This completes the proof of Theorem 1.
496: $\Box$
497:
498: The function $Z_v(\beta)$ is similar to the well--known partition function
499: pertaining to the Boltzmann distribution w.r.t.\ the Hamiltonian (energy function)
500: $\calE_v(u)=f(u,v)$,
501: except that each exponential term
502: is weighted by $q(u|v)$, as opposed to the usual form,
503: which is just $\sum_{u\in\calU}e^{-\beta \calE_v(u)}$.
504: Before describing the statistical mechanical interpretation of eq.\ (\ref{identity}),
505: we should note that $Z_v(\beta)$ defined in (\ref{zvb}) can easily be related to
506: the ordinary partition function, without weighting, as follows:
507: Suppose that $\{q(u|v)\}$ are rational\footnote{Even
508: if not rational, they can always be approximated as such to an arbitrarily good precision.}
509: and hence can be represented as ratios
510: of two positive integers, $q(u|v)=M(u|v)/M$,
511: where $M \ge |\calU|$ is common to all $u\in\calU$ (and $v\in\calV$). Now,
512: imagine that every value of $u$ actually represents
513: a `quantization' of a more refined microstate (call it a
514: ``nanostate'') $w\in\calW$, $|\calW|=M$, so that $u=g_v(w)$,
515: where $g_v$ is a many--to--one function, for which the inverse image of every $u$ consists of
516: $M(u|v)$ many values of $w$. Suppose further that the Hamiltonian depends
517: on $w$ only via $g_v(w)$, i.e., $\calE_v'(w)=\calE_v(g_v(w))$. Then, the (ordinary) partition
518: function related to $w$ is given by
519: \begin{eqnarray}
520: \label{wpf}
521: \zeta_v(\beta)&=&\sum_{w\in\calW}e^{-\beta\calE_v'(w)}\nonumber\\
522: &=&\sum_{w\in\calW}e^{-\beta\calE_v(g_v(w))}\nonumber\\
523: &=&\sum_{u\in\calU}M(u|v)e^{-\beta\calE_v(u)}\nonumber\\
524: &=&M\sum_{u\in\calU}q(u|v)e^{-\beta\calE_v(u)}=MZ_v(\beta).
525: \end{eqnarray}
526: Thus, the weighted partition function is, within a constant factor $M$,
527: the same as the ordinary partition function of $w$. This factor
528: cancels out when probabilities are calculated since it appears both in the
529: numerator and the denominator. Moreover,
530: it affects neither the minimizing $\beta$
531: that achieves $S_v(E_v)$ or
532: $\bar{S}(E)$, nor the derivatives of the log--partition
533: function.
534:
535: We now move on to our interpretation of
536: eq.\ (\ref{identity}) from the viewpoint
537: of elementary statistical physics: Consider a physical system which consists of $|\calV|$ subsystems of
538: particles. The total number of particles in the system is $n$ and the total amount
539: of energy is $nE$ Joules. For each $v\in\calV$, the subsystem indexed by $v$
540: (subsystem $v$, for short) contains
541: $n_v=np(v)$ particles, each of which can lie in any microstate
542: within a finite set of microstates $\calU$ (or an underlying
543: nanostate in a set $\calW$),
544: and it is characterized by an additive Hamiltonian
545: $\calE_v(u_1,\ldots,u_{n_v})=\sum_{i=1}^{n_v}f(u_i,v)$. The total amount of
546: energy possessed by subsystem $v$ is given by $n_vE_v$ Joules. As long
547: as the subsystems are in thermal isolation from each other, each one of them
548: may have its own temperature $T_v=1/(k\beta_v)$, where $\beta_v$ is the achiever
549: of the normalized (per--particle) entropy
550: associated with an average per--particle energy $E_v$, i.e.,
551: $$S_v(E_v)=\min_{\beta\ge 0}[\beta E_v+\ln Z_v(\beta)].$$
552: The above--mentioned rate function $I(E)$ of $\mbox{Pr}\{\sum_{i=1}^nf(U_i,v_i)\le nE\}$
553: is then given by the negative maximum total per--particle entropy,
554: $\sum_vp(v)S_v(E_v)$, where the maximum is over all energy allocations $\{E_v\}$
555: such that the total energy is conserved, i.e., $\sum_vp(v)E_v=E$.
556: This maximum is attained by the expression of the
557: r.h.s.\ of eq.\ (\ref{identity}), where there
558: is {\it only one} temperature parameter, and hence
559: it corresponds to {\it thermal equilibrium}.
560: In other words, the whole system then lies in the same
561: temperature $T^*=1/(k\beta^*)$, where $\beta^*$
562: is the minimizer of
563: $\bar{S}(E)$. Thus, the energy allocation among
564: the various subsystems in
565: equilibrium is such that their temperatures are the same
566: (cf.\ the above proof of Theorem 1). Theorem 1 is then
567: interpreted as expressing the second law
568: of thermodynamics.
569:
570: At this point, a few comments are in order:
571: \begin{enumerate}
572: \item It should be pointed out that in the above physical interpretation, we have implicitly assumed that the
573: particles within each subsystem are distinguishable, and so the partition function corresponding to a set of $n_v$
574: particles is given by the partition function of a single particle raised to the power of $n_v$, without dividing
575: by $n_v!$. This differs then from the indistinguishable case only by a constant factor
576: (as long as $n_v$ is indeed constant)
577: and hence the difference between the distinguishable and the indistinguishable
578: cases is not essential for the most part of our discussion.
579: \item As mentioned in the above paragraph, our conclusion is that $I(E)=-\bar{S}(E)$. At first glance, this may
580: seem peculiar as it appears that $I(E)$ may be negative. However, one should keep in mind that $\bar{S}(E)$
581: is induced by a (convex) combination of weighted partition functions,
582: rather than ordinary partition functions, like $\zeta_v(\beta)$. Referring to eq.\ (\ref{wpf}), the ordinary
583: notion of entropy $\bar{\Sigma}(E)$
584: as the normalized log--number of (nano)states with normalized energy $E$, is
585: given by
586: \begin{eqnarray}
587: \bar{\Sigma}(E)&=&\min_{\beta\ge0}\left[\beta E+
588: \sum_vp(v)\ln \zeta_v(\beta)\right]\nonumber\\
589: &=&\min_{\beta\ge 0}\left[\beta E+\sum_vp(v)\ln Z_v(\beta)\right]+\ln M\nonumber\\
590: &=&\bar{S}(E)+\ln M.
591: \end{eqnarray}
592: Thus,
593: $$I(E)=\ln M - \bar{\Sigma}(E),$$
594: which is always non--negative.
595: \item The identity (\ref{identity})
596: can be thought
597: of as a generalized concavity property of the entropy:
598: Had all the entropy
599: functions $S_v(\cdot)$ been the same, this would have been the usual
600: concavity property. What makes this equality less trivial and more interesting
601: is that it continues to hold even when $S_v(\cdot)$, for
602: the various $v\in\calV$, are different from each
603: other.
604: \item On the more technical level, since this paper draws analogies with physics,
605: we should say a few words about physical units. The products $\beta E$, $\beta E_v$, $\beta f(u,v)$, etc.,
606: should all be pure numbers, of course. Since $\beta=1/(kT)$,
607: where $k$ is Boltzmann's constant and $T$ is absolute temperature,
608: and since $kT$ has units of energy (Joules or ergs, etc.),
609: it is understood that $E$, $E_v$, $f(u,v)$ and the like, should all have units of energy as well. In the applications
610: described below, whenever this is not the case, i.e., the latter quantities are pure numbers rather than physical energies,
611: we will sometimes reparametrize $\beta$ by $\beta\epsilon_0$, where $\epsilon_0$ is an arbitrary constant possessing
612: units of energy (e.g., $\epsilon_0=1$ Joule or $\epsilon_0=1$ erg),
613: and we absorb $\epsilon_0$ in the Hamiltonian, i.e.,
614: redefine $\calE_v(u)=\epsilon_0f(u,v)$. Thus, in this case, $S_v(E)$, where $E$ is now the energy in units
615: of $\epsilon_0$, is redefined as
616: $$S_v(E)=\min_{\beta\ge 0}
617: \left[\beta\cdot\epsilon_0 E+
618: \ln\left(\sum_uq(u|v)e^{-\beta\calE_v(u)}\right)\right].$$
619: This kind of modification is
620: not essential, but it may help to avoid confusion
621: about units when the picture
622: is viewed from the aspects of physics.
623: \end{enumerate}
624:
625: \section{Applications}
626:
627: Equipped with the main result of the previous section and its statistical mechanical
628: interpretation, we next introduce a few applications that fall within the
629: framework considered. In all these applications,
630: there is an underlying large deviations event of the type of eq.\ (\ref{ld}), whose rate function
631: is of interest. The above described viewpoint of statistical physics is then relevant in all
632: these applications.
633:
634: \subsection{The Rate--Distortion Function}
635:
636: Let $P=\{p(x),~x\in\calX\}$ designate the vector
637: of letter probabilities associated with a
638: given discrete memoryless source (DMS), and for a given reproduction
639: alphabet $\hat{\calX}$, let $d:\calX\times\hat{\calX}\to\reals^+$
640: denote a single--letter distortion measure. Let $R(D)$ denote the rate--distortion
641: function of the DMS $P$.
642:
643: One useful way to think of the rate--distortion function
644: is inspired by the classical random coding argument:
645: Let $(\hX_1,\ldots,\hX_n)$ be drawn i.i.d.\ from the
646: optimum random coding distribution $q^*(\hx_1,\ldots,\hx_n)=\prod_{i=1}^n
647: q^*(\hx_i)$ and
648: consider the event $\sum_{i=1}^n d(x_i,\hX_i)\le nD$,
649: where $x^n$ is a given source vector, typical to $P$, i.e.,
650: the composition of $x^n$ consists
651: of $n_x=np(x)$ occurrences of each $x\in\calX$. This is exactly an event of the type (\ref{ld}) with
652: $U_i=\hX_i$, $v_i=x_i$, $i=1,\ldots,n$, $q(u|v)=q(\hx|x)=q^*(\hx)$ independently of $x$,
653: $f(u,v)=f(\hx,x)=d(x,\hx)$, and $E=D$. I.e., the Hamiltonian $\calE_x(\hx)$ is given by
654: $\epsilon_0d(x,\hx)$ and the total energy is $nD$ in units of $\epsilon_0$.
655:
656: Suppose that this
657: probability is of the exponential order of $e^{-nI(D)}$. Then,
658: it takes about $M=e^{n[I(D)+\epsilon]}$ ($\epsilon > 0$, however small)
659: independent trials to `succeed' at least once
660: (with high probability) in having some realization of $\hX^n$
661: within distance $nD$ from $x^n$.
This is the well--known
classical random coding achievability argument that leads to $I(D)=R(D)$.
664: Thus, the large--deviations rate function of interest agrees exactly
665: with the rate--distortion function (cf.\ \cite[Sect.\ 3.4]{Berger71}), which is:
666: \begin{equation}
667: R(D)=-\min_{\beta\ge 0}\left[\beta\cdot\epsilon_0D+\sum_{x\in\calX}p(x)
668: \ln\left(\sum_{\hx\in\hat{\calX}}q^*(\hx)e^{-\beta\cdot\epsilon_0 d(x,\hx)}\right)\right].
669: \end{equation}
670: Interestingly,
in \cite[p.\ 90, Corollary 4.2.3]{Gray90}, the rate--distortion function
672: is shown, using completely different considerations,
673: to have a parametric representation which can be written exactly in this form.
674:
675: The fact that the rate--distortion function has an
676: interpretation of an isothermal equilibrium situation in
677: statistical thermodynamics is not quite new
678: (cf.\ e.g.\ \cite[Sect.\ 6.4]{Berger71}, \cite{Rose94}).
679: Here, however, we obtain it in a more explicit
680: manner and as a special case of a more
681: general principle.
682:
683: A simple example is that of the binary symmetric source with the Hamming distortion
684: measure. It is easy to see that, in this example,
685: the relationship between distortion and temperature is:
686: \begin{equation}
687: T=\frac{\epsilon_0}{k\ln[(1-D)/D]}~~\mbox{or, equivalently,}~~D=\frac{1}{1+e^{\epsilon_0/(kT)}}
688: \end{equation}
689: and, of course, $R(D)=1-h_2(D)$, where $h_2(D)$ is the binary entropy function.
690:
691: A slightly more involved example pertains to the regime
692: of high resolution (small distortion) and it turns out to
693: be related to (a generalized version of) the
694: law of equipartition of energy in statistical physics:
695: Consider the $L_\theta$ distortion measure, $d(x,\hx)=|x-\hx|^\theta$ (most
696: commonly encountered are the cases $\theta=1$ and $\theta=2$). Let
697: us assume that $D > 0$ is very small and consider the (continuous)
698: uniform random coding distribution $q(\hx)=\frac{1}{2A}$ in the interval
$[-A,A]$ and zero elsewhere. This random coding distribution is suboptimal, but
it corresponds to, and hence is well motivated by,
many results in high--resolution quantization using
uniform quantizers (see, e.g., \cite{GN98} and references therein).
703: For every $x\in\calX$, the partition function
704: is given by
705: $$Z_x(\beta)=\frac{1}{2A}\int_{-A}^A
706: \exp\{-\beta\epsilon_0|\hx-x|^\theta\} \mbox{d}\hx.$$
707: When $D$ is very small, $\beta$ is very large, and
708: then the finite--interval integral pertaining to $Z_x(\beta)$ can be
709: approximated\footnote{See the Appendix (subsection A.2)
710: for a more rigorous derivation.} by an infinite one,
711: provided that the support of $\{p(x)\}$
712: is included\footnote{An alternative, softer
713: condition is that the probability that $|x|\ge A$ is negligibly small.}
714: in the interval $[-A,A]$:
715: \begin{equation}
716: \label{approxz}
717: Z_x(\beta)\approx\frac{1}{2A}\int_{-\infty}^\infty
718: \exp\{-\beta\epsilon_0|\hx-x|^\theta\} \mbox{d}\hx,
719: \end{equation}
720: which then becomes independent of $x$. The average distortion
721: (internal energy) associated with this partition function can
722: be evaluated using the same technique as the one that leads to the
723: law of equipartition in statistical physics:
724: \begin{eqnarray}
725: \label{equipartition}
726: \epsilon_0 D&\approx&-\frac{\partial}{\partial \beta}\ln
727: \left[\int_{-\infty}^\infty
728: \exp\{-\beta\epsilon_0|\hx-x|^\theta\} \mbox{d}\hx\right]\nonumber\\
729: &=&-\frac{\partial}{\partial \beta}\ln\left[
730: \beta^{-1/\theta}\cdot\int_{-\infty}^\infty
731: \exp\{-\epsilon_0|\beta^{1/\theta}(\hx-x)|^\theta\}
732: \mbox{d}(\beta^{1/\theta}(\hx-x))\right]\nonumber\\
733: &=&-\frac{\partial}{\partial \beta}\ln\left[
734: \beta^{-1/\theta}\cdot\int_{-\infty}^\infty
735: \exp\{-\epsilon_0|z|^\theta\}
736: \mbox{d}z\right]\nonumber\\
737: &=&-\frac{\mbox{d}}{\mbox{d}\beta}\ln
738: \left(\beta^{-1/\theta}\right)-
739: \frac{\partial}{\partial \beta}\ln\left[
740: \int_{-\infty}^\infty
741: \exp\{-\epsilon_0|z|^\theta\}
742: \mbox{d}z\right]\nonumber\\
743: &=&\frac{1}{\beta \theta}-0
744: =\frac{kT}{\theta}
745: \end{eqnarray}
746: [Note that for $\theta=2$, where the Hamiltonian is
747: quadratic in the integration variable
748: $\hx$, this is exactly the law of equipartition.]
749: Thus, for low temperatures, the distortion
750: is given by $D=kT/(\epsilon_0\theta)$, i.e.,
751: distortion is linear in temperature in that regime,
752: and the constant of proportionality is related to the
753: heat capacity, $C=k/\theta$.
754: Since the temperature is proportional to the negative local slope
755: of the distortion--rate function (as the reciprocal, $\beta$, is proportional
756: to the negative local slope of the rate--distortion function), this means that the distortion
757: is proportional to its derivative w.r.t.\ $R$, which means an exponential relationship of the
758: form $D=D_0e^{-\theta R}$ ($D_0$ -- constant). For $\theta=2$ (mean square error),
this is recognized as the well--known characterization
of distortion as a function of rate in the high resolution regime.
Specifically, in this case, the factor of $2$ in the denominator
of $kT/2$, the universal expression of the
763: internal energy per degree of freedom according to
764: the equipartition theorem, has the same origin as the factor of $2$ that appears
765: in the exponent
766: of $D(R)=D_0e^{-2R}$ (decay of 6dB per bit).
767: Thus the law of equipartition in statistical physics is
768: related to the behavior of rate distortion codes in the high resolution regime.
769:
770: To compute the rate associated with this temperature more explicitly,
771: note that the minimizing $\beta^*$
772: is given by $1/(\theta\epsilon_0D)$, and so
773: \begin{eqnarray}
774: R&=&-\beta^*\epsilon_0D-
775: \ln\left[\frac{1}{2A}\int_{-\infty}^\infty
776: \exp\{-\beta^*\epsilon_0|\hx-x|^\theta\} \mbox{d}\hx\right]\nonumber\\
777: &=&-\frac{1}{\theta}-\ln\left[\frac{1}{2A}
778: \cdot\frac{2\Gamma(1/\theta)}{\theta(1/\theta D)^{1/\theta}}\right]\nonumber\\
779: &=&\ln\left[\frac{A\theta}{\Gamma(1/\theta)
780: (\theta eD)^{1/\theta}}\right]\nonumber\\
781: &=&\ln\left[\frac{A\theta}{\Gamma(1/\theta)}\right]-
782: \frac{1}{\theta}\ln(\theta eD).
783: \end{eqnarray}
784:
785: \subsection{Channel Capacity}
786:
787: In complete duality to the random coding argument that puts the
788: rate--distortion function in the framework discussed in Section 3,
789: a parallel argument can be made with regard to channel capacity.
790:
791: Given a discrete memoryless channel (DMC) with a finite input alphabet $\calX$,
792: and a finite output alphabet $\calY$, we can obtain capacity using the following
793: argument. Let $\{q^*(x),~x\in\calX\}$ be the optimum random coding distribution according
794: to which, each codeword $X^n$ is drawn independently. Let $y^n$ be a given channel
output sequence which is typical to the output distribution $p(y)=\sum_{x\in\calX}q^*(x)W(y|x)$,
796: where $\{W(y|x),~x\in\calX,~y\in\calY\}$ are the channel transition probabilities. That is,
797: each symbol $y$ appears $n_y=np(y)$ times in $y^n$. Consider now the large deviations event
798: \begin{equation}
799: \label{td}
800: \sum_{i=1}^n\log\frac{1}{W(y_i|X_i)}\le nH(Y|X),
801: \end{equation}
where $H(Y|X)=-\sum_{x\in\calX}\sum_{y\in\calY}q^*(x)W(y|x)\log W(y|x)$.
By the union bound, as long as the number of randomly chosen codewords is exponentially less
than $e^{nI}$, where $I$ is the rate function of the large--deviations event (\ref{td}),
805: the average error probability still vanishes as $n\to\infty$.\footnote{Here we apply the union
806: bound to a threshold decoder that seeks a unique codeword that satisfies (\ref{td}),
807: which although suboptimum, is still good enough to achieve capacity.}
Since this is exactly the achievability argument of the channel coding theorem, $I=C$, where $C$ is the channel
capacity.
810:
811: Again, this complies with our model setting with the assignments, $U_i=X_i$, $v_i=y_i$, $i=1,\ldots,n$,
812: $q(u|v)=q(x|y)=q^*(x)$ independently of $y$,
$f(u,v)=f(x,y)=-\log W(y|x)$ and $E=H(Y|X)$ in units of $\epsilon_0$.
814: In other words, channel capacity can be represented as
815: \begin{equation}
816: C=-\min_{\beta\ge 0}\left[\beta \cdot \epsilon_0 H(Y|X)+
817: \sum_{y\in\calY}p(y)\ln\left(\sum_{x\in\calX}q^*(x)e^{-\beta\cdot\epsilon_0[-\log W(y|x)]}\right)\right].
818: \end{equation}
819: It is easy to see that, in this case, the equilibrium
820: temperature always corresponds
821: to $\beta\epsilon_0=1$, namely, $T=\epsilon_0/k$.
822:
823: By the same token, one can derive an expression
824: of the random coding capacity pertaining to mismatched
825: decoding, where the decoder uses an additive metric $m(x,y)$
826: other than the optimum metric,
827: $-\log W(y|x)$ (see, e.g., \cite{Balakirsky95},
828: \cite{CN95},
829: \cite{Lapidoth94},
830: \cite{LS96-2},
831: \cite{MKLS94},
832: and references therein).
833: The only modifications to the above
834: expression would be to replace the Hamiltonian
835: by $\calE_y(x)=\epsilon_0m(x,y)$
836: and to replace $H(Y|X)$ by the expectation
837: of $m(X,Y)$ w.r.t.\ $q^*(x)W(y|x)$.
838: The new optimum random coding distribution
839: might change as well. Here, it
840: is no longer necessarily true that the equilibrium temperature
841: is $T=\epsilon_0/k$.
842:
843: \subsection{Signal Detection and Hypothesis Testing}
844:
845: Consider the following binary hypothesis testing problem:
Given a deterministic signal, which is represented by a sequence $x^n=(x_1,\ldots,x_n)$
847: with elements taking on values in a (finite) set $\calX$ and relative frequencies $\{p(x),~x\in\calX\}$,
848: and given an observation sequence
849: $Y^n=(Y_1,\ldots,Y_n)$, we are required to decide between two hypotheses:
850: \begin{itemize}
851: \item[$H_0:$] The observation vector $Y^n$ is ``pure noise,''
852: distributed according to some product measure $Q=\{q(y),~y\in\calY\}$, i.e., $q(y^n)=\prod_{i=1}^nq(y_i)$,
853: which is unrelated to $x^n$.
854: \item[$H_1:$] The observation vector $Y^n$ is a ``noisy version'' of $x^n$,
855: distributed according to $q(y^n|x^n)=\prod_{i=1}^nq(y_i|x_i)$.
856: \end{itemize}
The optimum detector (under both the Bayesian and the Neyman--Pearson criteria) compares
the log--likelihood ratio $\sum_{i=1}^n\ln [q(y_i)/q(y_i|x_i)]$ to a threshold $nE_0$, and decides
in favor of $H_0$ if this threshold is exceeded; otherwise, it decides in favor of $H_1$.
860:
861: The false--alarm probability then is the probability of the event
862: $$\sum_{i=1}^n\ln \left[\frac{q(Y_i)}{q(Y_i|x_i)}\right]\le nE_0$$
863: under $Q$. This, again, fits our scenario with the substitutions
864: $U_i=Y_i$, $v_i=x_i$, $i=1,\ldots,n$,
865: $q(u|v)=q(y)$, independently of $x=v$, $f(u,v)=f(y,x)=\ln[q(y)/q(y|x)]$, and $E=E_0$.
866: Similarly, the analysis of the missed--detection probability corresponds to
867: the assignments: $U_i=Y_i$, and $v_i=x_i$, $i=1,\ldots,n$, as before, but now
868: $q(u|v)=q(y|x)$, $f(u,v)=f(y,x)=\ln[q(y|x)/q(y)]$ and $E=-E_0$.
Note that when $\{q(y)\}$
is the uniform distribution over $\calY$, the missed--detection probability
can also be interpreted as the probability of excess code--length of
an arithmetic lossless source code w.r.t.\ $\{q(y|x)\}$.
873:
874: Another situation of hypothesis testing that is related to our study in a similar manner is
875: one where the signal $x^n$ is always underlying the observations, but the decision to be made
876: is associated with two hypotheses regarding
877: the noise level, or the temperature. In this case, there is a certain
878: Hamiltonian $\calE_x(y)$ for each $x\in\calX$, and we assume a Boltzmann--Gibbs distribution
879: parametrized by the temperature
880: $$q(y|x,\beta)=\frac{e^{-\beta\calE_x(y)}}{\zeta_x(\beta)}$$
881: where
882: $$\zeta_x(\beta)=\sum_ye^{-\beta\calE_x(y)}.$$
883: Note that here $\zeta_x(\beta)$ is an ordinary partition function, without
884: weighting (cf.\ (\ref{wpf})). We shall also denote
885: $$\bar{\Sigma}(E)=\min_{\beta\ge 0}\left[\beta E
886: +\sum_{x\in\calX}p(x)\ln\zeta_x(\beta)\right].$$
887: As $\bar{\Sigma}(E)$ is induced by a convex combination of non-weighted
888: partition functions, it has the significance of the normalized logarithm
889: of the number of microstates with energy about $nE$. Thus, $k\cdot\bar{\Sigma}(E)$,
890: where $k$ is Boltzmann's constant, is the thermodynamic entropy.
891:
892: Given two values $\beta_1$ and $\beta_2$ (say, $\beta_1 > \beta_2$),
893: the hypotheses now are the following:
894: \begin{itemize}
895: \item[$H_1:$] $Y^n$ is
896: distributed according to $q_1(y^n|x^n)=\prod_{i=1}^nq(y_i|x_i,\beta_1)$.
897: \item[$H_2:$] $Y^n$ is
898: distributed according to $q_2(y^n|x^n)=\prod_{i=1}^nq(y_i|x_i,\beta_2)$.
899: \end{itemize}
900: The likelihood ratio test compares
901: $\sum_{i=1}^n\calE_{x_i}(Y_i)$ to a threshold, $nE_0$, and
902: decides in favor of $H_2$ if the threshold
903: is exceeded, otherwise, it favors $H_1$.
904: Here, $E_0$ should lie in the interval $(E_1,E_2)$,
905: where
906: $$E_i\dfn-\sum_{x\in\calX}p(x)\cdot\left[\frac{\partial\ln \zeta_x(\beta)}{\partial
907: \beta}\right]_{\beta=\beta_i},~~~i=1,2.$$
908: For convenience, let us assume now that
909: $E_i$, $i=0,1,2,$ and $\calE_x(y)$ already have units of energy, so
910: there is no need to have the constant $\epsilon_0$. In this
911: situation, the exponent of the error probability under $H_2$ is given by
912: $-\bar{S}(E_0)$, where
913: \begin{eqnarray}
914: \bar{S}(E_0)&=&\min_{\beta\ge 0}\left[\beta E_0
915: +\sum_{x\in\calX}p(x)\ln\left(\sum_{y\in\calY}q(y|x,\beta_2)
916: e^{-\beta\calE_x(y)}\right)\right]\nonumber\\
917: &=&\min_{\beta\ge 0}\left[\beta E_0+
918: \sum_{x\in\calX}p(x)\ln\left(\frac{\zeta_x(\beta+\beta_2)}
919: {\zeta_x(\beta_2)}\right)
920: \right]\nonumber\\
921: &=&\min_{\beta\ge 0}\left[\beta E_0+
922: \sum_{x\in\calX}p(x)\ln \zeta_x(\beta+\beta_2)-
923: \sum_{x\in\calX}p(x)\ln \zeta_x(\beta_2)
924: \right]\nonumber\\
925: &=&\min_{\beta\ge 0}\left[(\beta+\beta_2)E_0+
926: \sum_{x\in\calX}p(x)\ln \zeta_x(\beta+\beta_2)\right]-\beta_2E_0-
927: \sum_{x\in\calX}p(x)\ln \zeta_x(\beta_2)\nonumber\\
928: &=&\min_{\beta\ge \beta_2}\left[\beta E_0+
929: \sum_{x\in\calX}p(x)\ln \zeta_x(\beta)\right]+\beta_2(E_2-E_0)
930: -\left[\beta_2E_2+\sum_{x\in\calX}p(x)\ln \zeta_x(\beta_2)\right]\nonumber\\
931: &=&\min_{\beta\ge \beta_2}\left[\beta E_0+
932: \sum_{x\in\calX}p(x)\ln \zeta_x(\beta)\right]+\beta_2(E_2-E_0)\nonumber\\
933: & &-\min_{\beta\ge 0}\left[\beta E_2+\sum_{x\in\calX}p(x)\ln \zeta_x(\beta)\right]\nonumber\\
934: &=&\bar{\Sigma}(E_0)-\bar{\Sigma}(E_2)+\beta_2(E_2-E_0),
935: \end{eqnarray}
where we have used the fact that the achiever $\beta(E)$
of $\bar{\Sigma}(E)$ is a monotonically non-increasing function of $E$,
thus, $E_0 < E_2$ implies $\beta(E_0) \ge \beta(E_2)=\beta_2$,
and so, the global minimum over $\beta\ge 0$ is attained
for $\beta\ge\beta_2$ anyway.
941:
942: It then follows that the error exponent $I_2$ under $H_2$ is given by
943: \begin{eqnarray}
944: I_2&=&\bar{\Sigma}(E_2)-\bar{\Sigma}(E_0)-\beta_2(E_2-E_0)\nonumber\\
945: &=&\frac{1}{k}\left[k\bar{\Sigma}(E_2)-k\bar{\Sigma}(E_0)-
946: \frac{E_2-E_0}{T_2}\right]\nonumber\\
947: &=&\frac{1}{k}\int_{E_0}^{E_2}\left[\frac{1}{T(E)}-\frac{1}{T_2}\right]\mbox{d}E
948: \nonumber\\
949: &=&\frac{1}{k}\int_{T_0}^{T_2}\left(\frac{1}{T}-\frac{1}{T_2}\right)
950: \bar{C}(T)\mbox{d}T,
951: \end{eqnarray}
952: where $T(E)=1/(k\beta(E))$ is the temperature corresponding to
953: energy $E$, $T_i=T(E_i)$, $i=0,1,2$, and $\bar{C}(T)=\mbox{d}E/\mbox{d}T$
954: is the average heat capacity per particle of the system, which is
955: the weighted average of heat capacities of all subsystems, i.e.,
956: $$\bar{C}(T)=\sum_{x\in\calX}p(x)C_x(T),$$
957: where
$$C_x(T)=\frac{\mbox{d}E_x}{\mbox{d}T}=
\frac{1}{kT^2}\left[\frac{\mbox{d}^2\ln \zeta_x(\beta)}{\mbox{d}\beta^2}\right]_{\beta=
1/(kT)}.$$
961: Thus,
962: $$I_2=\sum_{x\in\calX}p(x)\cdot
963: \frac{1}{k}\int_{T_0}^{T_2}\left(\frac{1}{T}-\frac{1}{T_2}\right)
964: C_x(T)\mbox{d}T,$$
965: which is interpreted as the weighted average of the relative contributions
966: of all subsystems, which all lie in the same temperature $T_0$.
967:
968: In a similar manner, the rate function $I_1$ of the probability
969: of error under $H_1$ is given by:
970: \begin{eqnarray}
971: I_1&=&\bar{\Sigma}(E_1)-\bar{\Sigma}(E_0)-\beta_1(E_1-E_0)\nonumber\\
972: &=&\frac{1}{k}\left[
973: k\bar{\Sigma}(E_1)-k\bar{\Sigma}(E_0)-\frac{E_1-E_0}{T_1}\right]\nonumber\\
974: &=&\frac{1}{k}\int_{E_1}^{E_0}\left[\frac{1}{T_1}-
975: \frac{1}{T(E)}\right]\mbox{d}E\nonumber\\
976: &=&\frac{1}{k}\int_{T_1}^{T_0}\left(\frac{1}{T_1}-
977: \frac{1}{T}\right)\bar{C}(T)\mbox{d}T.
978: \end{eqnarray}
979:
980: The expression in the square brackets
981: of the second line pertaining to $I_2$ has a simple graphical
interpretation (see Fig.~\ref{gen}): It is the vertical distance
983: (corresponding to the vertical line $E=E_0$) between the curve $\bar{\Sigma}(E)$
984: and the line tangent to that curve at $E=E_2$ (whose slope is $\beta_2=
985: \beta(E_2)$). The two other expressions of $I_2$, in the
986: last chain of equalities, describe the error exponent $I_2$ in terms
987: of slow heating from temperature $T_0$ to temperature $T_2$.
Similar comments apply to $I_1$ (cf.\ Fig.~\ref{gen}).
989: Thus, the error exponents
990: are linear functionals of the average heat capacity, $\bar{C}(T)$,
991: in the range of temperatures $[T_1,T_2]$.
992: The higher is the heat capacity, the better is the discrimination
993: between the hypotheses. This is related to the fact that Fisher information
994: of the parameter $\beta$ is given by
995: $$J(\beta)=\sum_{x\in\calX}p(x)\frac{\mbox{d}^2\ln \zeta_x(\beta)}{\mbox{d}\beta^2}=
996: kT^2\bar{C}(T),$$
997: namely, again, a linear function of $\bar{C}(T)$.
998: However, while the Fisher information
999: depends only on one local value of $\bar{C}(T)$
1000: (as it measures the sensitivity of the
1001: likelihood function to the parameter in a local manner), the error exponents
1002: depend on $\{\bar{C}(T): T_1\le T\le T_2\}$
1003: in a cumulative manner, via the above integrals.
1004: The tradeoff between $I_1$ and $I_2$ is also obvious: by enlarging the
1005: threshold $E_0$, or, correspondingly, $T_0$,
1006: the range of integration pertaining to
1007: $I_1$ increases at the expense of the one of $I_2$ and vice versa. In the
1008: extreme case, where $I_2=0$, we get
1009: $$I_1=D(P_2\|P_1)=
1010: \frac{1}{k}\int_{T_1}^{T_2}\left(\frac{1}{T_1}-
1011: \frac{1}{T}\right)\bar{C}(T)\mbox{d}T.$$
1012:
1013: \begin{figure}[ht]
1014: \hspace*{1cm}\input{graph2.pstex_t}
1015: \caption{Entropy as function of energy and a graphical representation
1016: of error exponents.}
1017: \label{gen}
1018: \end{figure}
1019:
1020: \subsection{Error Exponents of Time--Varying Scalar Quantizers}
1021:
1022: In this application example, we are back to the problem area
1023: of lossy data compression, but this time, it is about scalar (symbol--by--symbol)
1024: compression. This setup is motivated by earlier results about the optimality
1025: of time--shared scalar quantizers within the class of
1026: causal source codes for memoryless sources, both under
1027: the average rate/distortion criteria \cite{NG82} and large--deviations performance
1028: criteria \cite{MK03}. In particular, it was shown that
1029: under both criteria, optimum time--sharing
1030: between at most two (entropy coded) scalar quantizers
1031: is as good as any causal source code for memoryless sources.
1032: Here, we will focus on the large deviations performance criteria, namely,
1033: source coding exponents.
1034:
1035: Consider a time--varying scalar quantizer $\hX_i=f_i(X_i)$, acting on a DMS
1036: $X_1,X_2,\ldots$, $X_i\in\calX$, drawn from $q$,
1037: where $\{f_i\}$ is an arbitrary (deterministic) sequence
1038: of quantizers from a given finite set $\calF=\{F_1,\ldots,F_S\}$,
1039: where $F_s:\calX\to\hat{\calX}_s$, $\hat{\calX}_s$ being the reproduction alphabet
1040: corresponding to $F_s$, $s=1,\ldots,S$. In other words, for every $i=1,2,\ldots,n$,
1041: $f_i=F_{s_i}$, for a certain arbitrary sequence of `states',
1042: $s_1,s_2,\ldots$ (known to the decoder) with components in $\calS=\{1,2,\ldots,S\}$.
1043:
1044: The distortion incurred by such a time--varying scalar quantizer, over $n$ units of time, is
1045: $\sum_{i=1}^nd(X_i,f_i(X_i))=
1046: \sum_{i=1}^nd(X_i,F_{s_i}(X_i))$. The total code length is $\sum_{i=1}^n L_{s_i}(F_{s_i}(X_i))$,
1047: where the per--symbol length functions
1048: $L_{s}(\cdot)$ may correspond to either fixed--rate coding, where $L_s(\hx)=R_s\dfn
1049: \lceil\log|\hat{\calX}_s|\rceil$ for all $\hx$,
1050: or any other length function satisfying the Kraft
1051: inequality, $\sum_{\hx\in\hat{\calX}_s}2^{-L_s(\hx)}\le 1$.
1052: For the sake of simplicity of the exposition, let us assume fixed--rate coding.
1053: We will denote by $n_s$, $s\in\calS$, the number of times that $s_i=s$
1054: occurs in $s^n$, and $p(s)=n_s/n$ is the corresponding relative frequency.
1055:
1056: In \cite{MK03}, among other results, the rate function of the excess distortion event
1057: $$\sum_{i=1}^nd(X_i,F_{s_i}(X_i)) > nD,~~~~
1058: D> \sum_{(x,s)\in\calX\times\calS}q(x)p(s)d(x,F_s(x))$$
1059: was optimized across the class of all time--varying scalar quantizers (each one
1060: corresponding to a different sequence $s_1,\ldots,s_n$) subject to a code--length
1061: constraint $\sum_{i=1}^nR_{s_i}\le nR$, or equivalently, $\sum_{s\in\calS}n_sR_s\le nR$,
1062: for a given pair $(D,R)$.
1063:
1064: In the notation of our generic model, here we have $U_i=X_i$, $v_i=s_i$, $i=1,\ldots,n$,
1065: $q(u|v)=q(x|s)=q(x)$ independently of $s$,
$f(u,v)=f(x,s)=-d(x,F_s(x))$, and $E=-D$,\footnote{One
may prefer to redefine
$f(x,s)=D_{\max}-d(x,F_s(x))$ and $E=D_{\max}-D$,
where $D_{\max}\dfn\max_{x,s}d(x,F_s(x))$, in order to
work with non--negative quantities.} and the excess distortion exponent is of the same form
as before (see also \cite{MK03}).
1072: Here, however, unlike the previous application examples, we have a degree
1073: of freedom to select the relative frequency of usage, $p(s)$, of each member of $\calF$,
1074: i.e., the time--sharing protocol, but we also have the constraint $\sum_sp(s)R_s\le R$.
1075:
1076: From the statistical physics point of view, these additional ingredients mean that
1077: we have a freedom to select the number of particles in each subsystem
1078: (though the total number, $n$, is still fixed), and the additional
1079: constraint, $\sum_sp(s)R_s\le R$, which is actually equivalent to the equality constraint
1080: $\sum_sp(s)R_s= R$ (in the interesting region of $(R,D)$ pairs) can be viewed as an additional
1081: conservation law with respect to some other
1082: constant of motion, in addition to the energy (e.g., the momentum), where in
1083: subsystem $s$, the (average) value of the corresponding physical quantity
1084: per particle is $R_s$.
1085:
1086: While in \cite{MK03}, we have considered the problem of maximizing the rate function
1087: (the source coding exponent) of the excess distortion event
1088: $\sum_{i=1}^nd(X_i,F_{s_i}(X_i)) > nD$, a related objective (although somewhat
1089: less well motivated, but still interesting) is to minimize the rate function
1090: (or maximize the probability) of the small distortion event
1091: $$\sum_{i=1}^nd(X_i,F_{s_i}(X_i)) < nD,~~~
1092: D < \sum_{(x,s)\in\calX\times\calS}q(x)p(s)d(x,F_s(x)).$$
1093: In this case, the optimum performance is given by
1094: $$F(R,D)=\max_{P\in \calP(R)}\min_{\beta\ge 0}\left[\beta D+\sum_{s=1}^Sp(s)\ln
1095: \left(\sum_{x\in\calX}q(x)e^{-\beta d(x,F_s(x))}\right)\right],$$
1096: where $\calP(R)$ is the class of all probability distributions $P=\{p(s),~s\in\calS\}$ with
1097: $\sum_sp(s)R_s\le R$. From the viewpoint of statistical physics, this corresponds
1098: to a situation where the various subsystems are allowed to interact, not only thermally,
1099: but also chemically, i.e., an exchange of particles is enabled in addition to the exchange of
1100: energy, and the maximization over $\calP(R)$ (maximum entropy) is achieved when the
1101: chemical potentials of the various subsystems reach a balance. As the maximization over
1102: $P\in\calP(R)$ subject to the constraint $\sum_sp(s)R_s\le R$, for a given $\beta$,
1103: is a linear programming
1104: problem with one constraint (in addition to $\sum_sp(s)=1$), then as was shown in
1105: \cite{MK03}, for each distortion level (or energy) $D$, the optimum $P\in\calP(R)$ may be
1106: non--zero for at most two members of $\calS$ only, which means that at most two subsystems
1107: are populated by particles in thermal and chemical equilibrium under the two conservation
1108: laws (of $D$ and of $R$). However, the choice of these two
1109: members of $\calS$ depends, in general,
1110: on $D$, which in turn depends on the temperature. Thus, when
1111: the system is heated gradually, certain {\it phase transitions}
1112: may occur, whenever
1113: there is a change in the choice of the two populated subsystems.
1114:
1115: Finally, referring to comment no.\ 1 of Section 3, we should point out that here,
1116: in contrast to our discussion thus far, the difference between the ensemble of
1117: distinguishable particles and indistinguishable particles becomes critical since the
1118: factors $\{n_s!\}$ are no longer constant. Had we assumed indistinguishability, the
1119: normalized log--partition function
1120: would no longer be affine in $P$, thus the maximization over $P$
1121: would no longer be a linear programming problem, and the conclusion might have been
1122: different. In the source coding problem, the indistinguishable case corresponds to
1123: a situation where the sequence of states $s^n$ is chosen uniformly at random
1124: (with the decoder being informed of the result
1125: of the random selection, of course). In this case,
1126: the Chernoff bound corresponding to each composition $\{n_s,~s\in\calS\}$ of $s^n$
1127: should be weighed by the probability of this composition, which is
1128: $S^{-n}n!/\prod_sn_s!$. Now, each factor of $1/n_s!$ can be
1129: absorbed in the corresponding
1130: partition function $Z_s(\beta)$ of subsystem $s$, with the interpretation
1131: that in each subsystem the particles are now indistinguishable. The maximum over $P$ would
1132: now correspond to the dominant contribution in this weighted average of Chernoff bounds.
1133: One can, of course, extend the discussion to any i.i.d.\ distribution on $s^n$, thus
1134: introducing additional bias and preferring some compositions over others.
1135:
1136: \section*{Appendix}
1137: \renewcommand{\theequation}{A.\arabic{equation}}
1138: \setcounter{equation}{0}
1139:
1140: \subsection*{A.1. Sketch of an Alternative Proof of Theorem 1 via Chernoff Bounds}
1141:
1142: In this subsection,
1143: we outline another proof of Theorem 1
1144: using a large deviations analysis approach. In particular,
1145: consider the large deviations event $\sum_{i=1}^nf(U_i,v_i)\le nE$,
1146: as described in Section 2.
1147: Assuming that the relative frequencies $\{p(v)\}$ all stabilize
1148: as $n\to\infty$, let us compute the rate function $I(E)$
1149: of the probability of this event in two different methods, where one would yield
1150: the left--hand side of (\ref{identity}) and the other would give the right--hand
1151: side of (\ref{identity}).
1152:
1153: In the first method,
1154: we partition the sequence $v^n$ according to its different letters.
1155: Specifically, let
1156: $$E_v\dfn\frac{1}{n_v}\sum_{i:v_i=v}f(U_i,v),$$
1157: where $n_v$ is the number of occurrences of the symbol $v\in\calV$ along $v^n$.
1158: Let $\calG$ denote the set of all possible vector values that
1159: can be taken on by the vector $\bar{E}=\{E_v,~v\in\calV\}$.
1160: Now, obviously, $\sum_{i=1}^n f(U_i,v_i)\le nE$
1161: if and only if there exists a
1162: vector $\tilde{E}=\{\tilde{E}_v,~v\in\calV\}\in\calG$
1163: such that $E_v\le \tilde{E}_v$ for all $v\in\calV$ and
1164: $\sum_{v\in\calV}p(v)\tilde{E}_v\le E$.
1165: The ``if'' part follows from
1166: $$\sum_{i=1}^n f(U_i,v_i)=n\sum_{v\in\calV}p(v)E_v\le
1167: n\sum_{v\in\calV}p(v)\tilde{E}_v\le nE.$$
1168: The ``only if'' part follows by setting $\tilde{E}_v=E_v$ for all $v\in\calV$.
1169: Therefore, denoting
1170: $\calH_G(E)=\calH_0(E)\bigcap\calG$ (where $\calH_0(E)$ is defined as in Section 2), we have:
1171: \begin{eqnarray}
1172: \label{ub1}
1173: \mbox{Pr}\left\{\sum_{i=1}^n f(U_i,v_i)\le nE\right\}&=&
\mbox{Pr}\bigcup_{\tilde{E}\in\calH_G(E)}\left\{\sum_{i:v_i=v}
1175: f(U_i,v)\le n_v\tilde{E}_v,~~v\in\calV\right\}\nonumber\\
1176: &\le&\sum_{\tilde{E}\in\calH_G(E)}\mbox{Pr}\left\{\sum_{i:v_i=v}
1177: f(U_i,v)\le n_v\tilde{E}_v,~~v\in\calV\right\}\nonumber\\
1178: &=&\sum_{\tilde{E}\in\calH_G(E)}\prod_{v\in\calV}
1179: \mbox{Pr}\left\{\sum_{i:v_i=v} f(U_i,v)\le n_v\tilde{E}_v\right\}\nonumber\\
1180: &\le&|\calH_G(E)|\cdot\max_{\tilde{E}\in\calH_G(E)}\prod_{v\in\calV}
1181: \mbox{Pr}\left\{\sum_{i:v_i=v} f(U_i,v)\le n_v\tilde{E}_v\right\}\nonumber\\
1182: &\le&|\calG|\cdot\max_{\tilde{E}\in\calH_G(E)}\prod_{v\in\calV}
1183: \mbox{Pr}\left\{\sum_{i:v_i=v} f(U_i,v)\le n_v\tilde{E}_v\right\},
1184: \end{eqnarray}
1185: and on the other hand,
1186: \begin{eqnarray}
1187: \label{lb1}
1188: \mbox{Pr}\left\{\sum_{i=1}^n f(U_i,v_i)\le nE\right\}&=&
1189: \mbox{Pr}\bigcup_{\tilde{E}\in\calH_G(E)}\left\{\sum_{i:v_i=v}
1190: f(U_i,v)\le n_v\tilde{E}_v,~~v\in\calV\right\}\nonumber\\
1191: &\ge&\max_{\tilde{E}\in\calH_G(E)}\mbox{Pr}\left\{\sum_{i:v_i=v}
1192: f(U_i,v)\le n_v\tilde{E}_v,~~v\in\calV\right\}\nonumber\\
1193: &=&\max_{\tilde{E}\in\calH_G(E)}\prod_{v\in\calV}
1194: \mbox{Pr}\left\{\sum_{i:v_i=v} f(U_i,v)\le n_v\tilde{E}_v\right\}.
1195: \end{eqnarray}
1196: At this point, the only gap between the upper bound (\ref{ub1}) and
1197: the lower bound (\ref{lb1}) is the factor $|\calG|$. The number of different
1198: values that $\tilde{E}_v$ can take does not exceed the number of different
1199: type classes of sequences of length $n_v$ over the alphabet $\calU$,
1200: which is upper bounded by $(n_v+1)^{|\calU|-1}$.
1201: Thus,
1202: \begin{eqnarray}
1203: |\calG|&\le&\prod_{v\in\calV}[n_v+1]^{|\calU|-1}\nonumber\\
1204: &=&\exp\left\{(|\calU|-1)\sum_v\log(n_v+1)\right\}\nonumber\\
1205: &=&\exp\left\{|\calV|\cdot(|\calU|-1)\sum_v\frac{1}{|\calV|}
1206: \log(n_v+1)\right\}\nonumber\\
1207: &\le&\exp\left\{|\calV|\cdot(|\calU|-1)\log\left(\sum_v\frac{1}{|\calV|}
1208: [n_v+1]\right)\right\}\nonumber\\
1209: &=&\exp\left\{|\calV|\cdot(|\calU|-1)\log\left(
1210: \frac{n}{|\calV|}+1\right)\right\}\nonumber\\
1211: &=&\left(\frac{n}{|\calV|}+1\right)^{|\calV|\cdot(|\calU|-1)},
1212: \end{eqnarray}
1213: and therefore $|\calG|$ is only polynomial in $n$, and hence does not affect the
1214: exponential behavior. Now, each one of the terms
1215: $\mbox{Pr}
1216: \{\sum_{i:v_i=v} f(U_i,v)\le n_v\tilde{E}_v\}$ is bounded
1217: exponentially tightly by an individual Chernoff bound,
1218: $$\exp\left\{n_v\min_{\beta\ge 0}\left[\beta
1219: \tilde{E}_v+\ln\left(\sum_uq(u|v)e^{-\beta f(u,v)}
1220: \right)\right]\right\},$$
1221: and so, the dominant term of their product is of the exponential order of
$$\max_{\tilde{E}\in\calH_G(E)}\sum_vp(v)\cdot
\min_{\beta\ge 0}\left[\beta \tilde{E}_v+\ln\left(\sum_uq(u|v)e^{-\beta f(u,v)}
\right)\right]=\max_{\tilde{E}\in\calH_G(E)}\sum_vp(v)S_v(\tilde{E}_v).$$
1225: Finally, as $n_v\to\infty$, the set $\calH_G(E)$ becomes dense in the continuous set $\calH_0(E)$,
1226: and by simple continuity arguments, the maximum over $\calH_G(E)$ tends to the maximum over $\calH_0(E)$.
1227:
1228: The other method to evaluate the rate function $I(E)$
1229: is as follows. Let $\ell$ be a fixed positive integer that divides $n$,
1230: and denote $\ell_v=\ell p(v)$, $v\in\calV$ (assume that $\ell$ is chosen large enough that
1231: $\ell p(v)$ is well approximated by the closest integer with a very small relative error).
1232: Now, re--order the pairs $\{(U_i,v_i)\}$
1233: (periodically), according to the following rule:
1234: Assuming, without loss of generality, that
1235: $\calV=\{1,2,\ldots,|\calV|\}$, the first
1236: $\ell_1=\ell p(1)$ symbol pairs of each $\ell$--block of $(u^n,v^n)$
1237: are such that $v=1$, the next $\ell_2=\ell p(2)$ symbol pairs
1238: of each $\ell$--block are such that
1239: $v=2$, and so on. In other words,
1240: each $\ell$--block, $v_{(i-1)\ell+1}^{i\ell}=(v_{(i-1)\ell+1},
1241: v_{(i-1)\ell+2},\ldots,v_{i\ell})$, $i=1,2,\ldots,n/\ell$,
1242: consists of the same relative frequencies $\{p(v)\}$
1243: as the entire sequence, $v^n$. Now, for
1244: the re--ordered sequence of pairs, let us define
1245: $X_i=\sum_{t=(i-1)\ell+1}^{i\ell}f(U_t,v_t)$,
1246: $i=1,2,\ldots,n/\ell$.
1247: Obviously, $X_1,X_2,\ldots,X_{n/\ell}$ are i.i.d.\ and therefore
1248: the probability of the large deviations event $\{\sum_{i=1}^{n/\ell}X_i \le \frac{n}{\ell}\cdot
1249: \ell E\}$ can be assessed exponentially tightly by the Chernoff bound
1250: as follows:
1251: \begin{eqnarray}
1252: &&\exp\left\{\frac{n}{\ell}\cdot\min_{\beta\ge 0}\left[\beta\cdot\ell E+\ln\left(
1253: \sum_{u^\ell\in\calU^\ell}q(u^\ell|v^\ell)\exp\left\{-\beta
1254: \sum_{i=1}^{\ell}f(u_i,v_i)\right\}\right)\right]
1255: \right\}\nonumber\\
1256: &=&\exp\left\{\frac{n}{\ell}\cdot\min_{\beta\ge 0}
1257: \left[\beta\cdot\ell E+\ln\left(\prod_{v\in\calV}
1258: \sum_{u^{\ell_v}}q(u^{\ell_v}|v^{\ell_v})
1259: \exp\left\{-\beta\sum_{i=1}^{\ell_v}f(u_i,v)\right\}\right)\right]
1260: \right\}\nonumber\\
1261: &=&\exp\left\{\frac{n}{\ell}\cdot\min_{\beta\ge 0}
1262: \left[\beta\cdot\ell E+\ln\left(\prod_{v\in\calV}
1263: \left[\sum_{u\in\calU}q(u|v)e^{-\beta f(u,v)}\right]^{\ell_v}\right)\right]
1264: \right\}\nonumber\\
1265: &=&\exp\left\{\frac{n}{\ell}\cdot\min_{\beta\ge 0}
1266: \left[\beta\cdot\ell E+\ell\cdot\sum_{v\in\calV}
1267: p(v)\ln\left(\sum_{u\in\calU}q(u|v)e^{-\beta f(u,v)}\right)\right]
1268: \right\}\nonumber\\
1269: &=&\exp\left\{n\cdot\min_{\beta\ge 0}\left[\beta E+\sum_{v\in\calV}
1270: p(v)\ln\left(\sum_{u\in\calU}q(u|v)e^{-\beta f(u,v)}\right)\right]
1271: \right\}\nonumber\\
1272: &=&e^{n\bar{S}(E)}.
1273: \end{eqnarray}
1274: Since both approaches yield exponentially tight
1275: evaluations of $I(E)$, they must be equal.
1276:
1277: \subsection*{A.2. A More Rigorous Derivation of Eq.\ (\ref{equipartition})}
1278:
1279: The exact derivation of eq.\ (\ref{equipartition}), for integration over the finite
1280: interval $[-A,A]$, is as follows:
1281: \begin{eqnarray}
1282: \epsilon_0 D&=&-\frac{\partial}{\partial \beta}\ln
1283: \left[\int_{-A}^A
1284: \exp\{-\beta\epsilon_0|\hx-x|^\theta\} \mbox{d}\hx\right]\nonumber\\
1285: &=&-\frac{\partial}{\partial \beta}\ln\left[
1286: \beta^{-1/\theta}\cdot\int_{-\beta^{1/\theta}(A+x)}^{\beta^{1/\theta}(A-x)}
1287: \exp\{-\epsilon_0|\beta^{1/\theta}(\hx-x)|^\theta\}
1288: \mbox{d}(\beta^{1/\theta}(\hx-x))\right]\nonumber\\
1289: &=&-\frac{\partial}{\partial \beta}\ln\left[
1290: \beta^{-1/\theta}\cdot\int_{-\beta^{1/\theta}(A+x)}^{\beta^{1/\theta}(A-x)}
1291: \exp\{-\epsilon_0|z|^\theta\}
1292: \mbox{d}z\right]\nonumber\\
1293: &=&-\frac{\partial}{\partial \beta}\ln
1294: \left(\beta^{-1/\theta}\right)-\frac{\partial}{\partial \beta}\ln
1295: \left[\int_{-\beta^{1/\theta}(A+x)}^{\beta^{1/\theta}(A-x)}
1296: \exp\{-\epsilon_0|z|^\theta\}\mbox{d}z\right]\nonumber\\
1297: &=&\frac{1}{\beta\theta}\left\{1-\frac{\beta^{1/\theta}
1298: [(A-x)\exp\{-\beta\epsilon_0|A-x|^\theta\}
1299: +(A+x)\exp\{-\beta\epsilon_0|A+x|^\theta\}]}
1300: {\int_{-\beta^{1/\theta}(A+x)}^{\beta^{1/\theta}(A-x)}
1301: \exp\{-\epsilon_0|z|^\theta\}\mbox{d}z}\right\}.
1302: \end{eqnarray}
1303: When $\beta$ is very large, the denominator
1304: of the second term of the expression
1305: in the curly brackets of the right--most side, goes to
1306: $\int_{-\infty}^{\infty}
1307: \exp\{-\epsilon_0|z|^\theta\}\mbox{d}z$, which is a constant.
1308: Now if, in addition, $|x|<A$, then the numerator
1309: tends to zero as $\beta$ grows without bound.
1310: Thus, the dominant term, for low temperatures, is $1/(\beta\theta)=kT/\theta$.
1311:
1312: An exact closed--form expression, for every finite $\beta$, can be derived
1313: for the case $\theta=1$, since in this case, the integral in the denominator has a simple
1314: expression. For example, setting $\theta=1$, and $x=0$ in the above expression, yields:
1315: \begin{eqnarray}
1316: D&=&\frac{1}{\beta\epsilon_0}-\frac{A}{e^{\beta\epsilon_0A}-1}\nonumber\\
1317: &=&\frac{kT}{\epsilon_0}-\frac{A}{e^{\epsilon_0A/(kT)}-1}.
1318: \end{eqnarray}
1319: Note that this expression is valid only in the range where it is monotonically
1320: increasing in $T$. (Beyond this point, the minimizing $\beta$ is no longer the
1321: point of zero derivative).
1322:
1323: \begin{thebibliography}{AA}
1324:
1325: \bibitem{AB01}
1326: R.~Albert and A.-L.~Barab\'asi, ``Statistical mechanics of complex networks,'' %SPI-042
1327: arXiv:cond-mat/0106096, June 6, 2001.
1328:
1329: \bibitem{Balakirsky95}
1330: V.~B.~Balakirsky, ``A converse coding theorem
1331: for mismatched decoding at the output of binary-input memoryless channels,''
1332: {\it IEEE Trans.\ Inform.\ Theory}, vol.\ 41, no.\ 6, pp.\ 1889--1902,
1333: November 1995.
1334:
1335: \bibitem{Berger71}
1336: T.~Berger, {\sl Rate distortion theory: a mathematical basis for data compression},
1337: Prentice--Hall, Inc., Englewood Cliffs, NJ, 1971.
1338:
1339: \bibitem{Burg75}
1340: J.~P.~Burg, {\it Maximum entropy spectral analysis}, Ph.D.\ thesis, Department of
1341: Geophysics, Stanford University, Stanford, CA, 1975.
1342:
1343: \bibitem{CT91}
1344: T.~M.~Cover and J.~A.~Thomas, {\it Elements of Information Theory}, (first edition),
1345: John Wiley \& Sons, Inc., New York, 1991.
1346:
1347: \bibitem{CK81}
1348: I. Csisz\' ar and J.
1349: K\" orner, {\sl Information theory: coding theorems for discrete
1350: memoryless systems}, New York: Academic, 1981.
1351:
1352: \bibitem{CN95}
1353: I.~Csisz\'ar and P.~Narayan, ``Channel capacity for a given decoding
1354: metric,'' {\it IEEE Trans.\ Inform.\ Theory}, vol.\ 41, no.\ 1,
1355: pp.\ 35--43, January 1995.
1356:
1357: \bibitem{Ellis85}
1358: R.~S.~Ellis, {\it Entropy,
1359: large deviations, and statistical mechanics}, %SPI-054
1360: Springer--Verlag, NY, 1985.
1361:
1362: \bibitem{Ellis06}
1363: R.~S.~Ellis, ``The theory of large deviations and applications to statistical mechanics,''
1364: lectures for international
1365: seminar on extreme events in complex dynamics, October 2006.
1366: Available on--line at:
1367: [http://www.math.umass.edu/$\sim$rsellis/pdf-files/Dresden-lectures.pdf].
1368:
1369: \bibitem{Gray90}
1370: R.~M.~Gray, {\sl Source coding theory}, Kluwer Academic Publishers, 1990.
1371:
1372: \bibitem{GGRS81}
1373: R. M. Gray, A. H. Gray, G. Rebolledo, and J. E. Shore, ``Rate distortion %MDI-003
1374: speech coding with a minimum discrimination information distortion measure'',
1375: {\em IEEE Trans.~Inform.~Theory\/},
1376: vol.~IT--27, no.~6, pp.~708--721, November 1981.
1377:
1378: \bibitem{GN98}
1379: R.~M.~Gray and D.~L.~Neuhoff, ``Quantization,''
1380: {\em IEEE Trans.~Inform.~Theory\/}, vol.\ 44, no.\ 6, pp.\ 2325--2383, October 1998.
1381:
1382: \bibitem{GV02}
1383: D.~Guo and S.~Verd\'u, ``Multiuser detection and statistical physics,'' %SPI-006
1384: in {\it Communications, Information and Network Security},
1385: V.~Bhargava, H.~V.~Poor, V.~Tarokh, and S.~Yoon, Eds., Chap.\ 13, pp.\ 229--277,
1386: Kluwer Academic Publishers, Norwell, Mass, USA, 2002.
1387:
1388: \bibitem{HK05}
1389: T.~Hosaka and Y.~Kabashima, ``Statistical mechanical approach to error exponents %SPI-024
1390: of lossy data compression,'' {\it J.~Physical Society of Japan}, vol.\ 74, no.\ 1,
1391: pp.\ 488--497, January 2005.
1392:
1393: \bibitem{Jaynes57a}
1394: E.~T.~Jaynes, ``Information theory and statistical mechanics,'' %MDI-013
1395: {\it Phys.\ Rev.\ A}, vol.\ 106, pp.\ 620--630, May 1957.
1396:
1397: \bibitem{Jaynes57b}
1398: E.~T.~Jaynes, ``Information theory and statistical mechanics - II,'' %MDI-014
1399: {\it Phys.\ Rev.\ A}, vol.\ 108, pp.\ 171--190, October 1957.
1400:
1401: \bibitem{Jaynes82}
1402: E.~T.~Jaynes, ``On the rationale of maximum-entropy methods,'' %MDI-002
1403: {\em Proc. of the IEEE\/}, vol.~70, no.~9, pp.~939--952, September 1982.
1404:
1405: \bibitem{KH05}
1406: Y.~Kabashima and T.~Hosaka, ``Statistical mechanics for source coding with a fidelity %SPI-037
1407: criterion,'' {\it Progress of Theoretical Physics}, Supplement no.\ 157,
1408: pp.\ 197--204, 2005.
1409:
1410: \bibitem{KNM02}
1411: Y.~Kabashima, K.~Nakamura, and J.~van Mourik,
1412: ``Statistical mechanics of typical set decoding,'' %SPI-025
1413: {\it Physical Review E}, vol.\ 66, 2002.
1414:
1415: \bibitem{KabS99}
1416: Y.~Kabashima and D.~Saad, ``Statistical mechanics of error correcting codes,'' %SPI-005
1417: {\it Europhysics Letters}, vol.\ 45, no.\ 1, pp.\ 97--103, 1999.
1418:
1419: \bibitem{KSNS01}
1420: Y.~Kabashima, N.~Sazuka, K.~Nakamura, and D.~Saad, ``Tighter decoding %SPI-002
1421: reliability bound for Gallager's error--correcting code,'' {\it Physical Review E},
1422: vol.\ 64, pp.\ 046113-1--046113-4, 2001.
1423:
1424: \bibitem{KanS99}
1425: I.~Kanter and D.~Saad, ``Error--correcting codes that nearly saturate %SPI-001
1426: Shannon's bound,'' {\it Physical Review Letters}, vol.\ 83, no.\ 13,
1427: pp.\ 2660--2663, September 1999.
1428:
1429: \bibitem{Landauer61}
1430: R.~Landauer, ``Irreversibility and heat generation in the computing process,'' {\it IBM
1431: J.\ Res.\ Dev.}, vol.\ 5, pp.\ 183--191, 1961.
1432:
1433: \bibitem{Lapidoth94}
1434: A. Lapidoth, ``Mismatched decoding and the multiple access channel,'' %MAC-003
1435: Stanford Univ. Tech. Report, February 1994.
1436:
1437: \bibitem{LS96-2}
1438: A. Lapidoth and S. Shamai (Shitz), ``A lower bound on the bit-error %CCTT-022
1439: rate resulting from mismatched Viterbi decoding,''
1440: Technical Report, CC Pub No.~163,
1441: Department of Electrical Engineering, Technion -- I.I.T., August 1996.
1442:
1443: \bibitem{Maroney04}
1444: O.~J.~E.~Maroney, ``The (absence of a) relationship between thermodynamic and logical %SPI-044
1445: reversibility,'' arXiv:physics/0406137, June 27, 2004.
1446:
1447: \bibitem{McAllester}
1448: D.~McAllester, ``A statistical mechanics approach to large deviations theorems,'' %SPI-048
1449: preprint, 2006. Available on-line at: [http://citeseer.ist.psu.edu/443261.html].
1450:
1451: \bibitem{MKLS94}
1452: N. Merhav, G. Kaplan, A. Lapidoth, and S. Shamai (Shitz), ``On %CC-007
1453: information rates for mismatched decoders,''
1454: {\em IEEE Trans.~Inform.~Theory\/},
1455: vol.~IT--40, no.~6, pp.~1953--1967, November 1994.
1456:
1457: \bibitem{MK03}
1458: N.~Merhav and I.~Kontoyiannis, ``Source
1459: coding exponents for zero--delay coding with finite memory,''
1460: {\it IEEE Trans.\ Inform.\ Theory},
1461: vol.\ 49, no.\ 3, pp.\ 609--625,
1462: March 2003.
1463:
1464: \bibitem{MM06}
1465: M.~M\'ezard and A.~Montanari, {\it Constraint satisfaction networks in physics
1466: and computation}, draft, February 27, 2006.
1467: Available on--line at: [http://www.lptms.u-psud.fr/membres/mezard/].
1468:
1469: \bibitem{MR06}
1470: T.~Mora and O.~Rivoire, ``Statistical mechanics of error exponents for error--correcting %SPI-034
1471: codes,'' arXiv:cond-mat/0606696, June 2006.
1472:
1473: \bibitem{Murayama02}
1474: T.~Murayama, ``Statistical mechanics of the data compression theorem,'' %SPI-029
1475: {\it J.~Phys.\ A: Math.\ Gen.}, vol.\ 35, pp.\ L95--L100, 2002.
1476:
1477: \bibitem{NG82}
1478: D.~L.~Neuhoff and R.~K.~Gilbert, ``Causal source codes,'' %RDT-001
1479: {\em IEEE Trans.~Inform.~Theory\/},
1480: vol.~IT--28, no.~5, pp.~701--713, September 1982.
1481:
1482: \bibitem{Oono89}
1483: Y.~Oono, ``Large deviation and statistical physics,'' {\it Progress of Theoretical Physics %SPI-05
1484: Supplement}, no.\ 99, pp.\ 165--205, 1989.
1485:
1486: \bibitem{PV01}
1487: M.~B.~Plenio and V.~Vitelli,
1488: ``The physics of forgetting: Landauer's erasure principle and information %QIT-017
1489: theory,'' {\it Contemporary Physics}, vol.\ 42, no.\ 1, pp.\ 25--60, 2001.
1490:
1491: \bibitem{PS99}
1492: A.~Procacci and B.~Scoppola, ``Statistical mechanics approach to coding theory,'' %SPI-023
1493: {\it J.~of Statistical Physics}, vol.\ 96, nos.\ 3/4, pp.\ 907--912, 1999.
1494:
1495: \bibitem{RC00}
1496: I.~Rojdestvenski and M.~C.~Cottman, ``Mapping of statistical physics to information theory %SPI-030
1497: with application to biological systems,'' {\it J.~Theor.\ Biol.}, pp.\ 43--54, 2000.
1498:
1499: \bibitem{Rose94}
1500: K.~Rose, ``A mapping approach to rate-distortion computation and %RDT-010
1501: analysis,'' {\em IEEE Trans.~Inform.~Theory\/}, vol.\ 40, no.\ 6, pp.\ 1939--1952, November 1994.
1502:
1503: \bibitem{Shinzato}
1504: T.~Shinzato, ``Statistical physics and thermodynamics on large deviation,'' preprint. %SPI-052
1505: Available online at [http://www.sp.dis.titech.ac.jp/shinzato/LD.pdf].
1506:
1507: \bibitem{SJ80}
1508: J.~E.~Shore and R.~W.~Johnson, ``Axiomatic derivation of the principle of %MDI-008
1509: maximum entropy and the principle of minimum cross-entropy,''
1510: {\em IEEE Trans.~Inform.~Theory\/},
1511: vol.~IT--26, no.~1, pp.~26--37, January 1980.
1512:
1513: \bibitem{Sourlas89}
1514: N.~Sourlas, ``Spin--glass models as error--correcting codes,'' {\it Nature}, %SPI-003
1515: pp.\ 693--695, vol.\ 339, June 1989.
1516:
1517: \bibitem{Sourlas94}
1518: N.~Sourlas, ``Spin glasses, error--correcting codes and finite--temperature %SPI-004
1519: decoding,'' {\it Europhysics Letters}, vol.\ 25, pp.\ 159--164, 1994.
1520:
1521: \bibitem{Tanaka01}
1522: T.~Tanaka, ``Statistical mechanics of CDMA
1523: multiuser demodulation,'' {\it Europhysics Letters}, %SPI-016
1524: vol.\ 54, no.\ 4, pp.\ 540--546, 2001.
1525:
1526: \bibitem{Tanaka02}
1527: T.~Tanaka, ``A statistical--mechanics approach to large--system analysis of CDMA %SPI-014
1528: multiuser detectors,'' {\it IEEE Trans.\ Inform.\
1529: Theory}, vol.\ 48, no.\ 11, pp.\ 2888--2910, November 2002.
1530:
1531: \bibitem{WSW05}
1532: M.~J.~Wainwright, T.~S.~Jaakkola, and A.~S.~Willsky, ``A new class of upper bounds on %SPI-013
1533: the log partition function,''
1534: {\em IEEE Trans.~Inform.~Theory\/},
1535: vol.~51, no.~7, pp.~2313--2335, July 2005.
1536:
1537: \end{thebibliography}
1538: \end{document}
1539: