0702:cs0702101/p115.tex

1: %\documentstyle[11pt,epsf]{article}

2: \documentclass[11pt,epsf]{article}

3: \usepackage{amsmath}

4: \usepackage{amsfonts}

5: \usepackage{amssymb}

6: \usepackage{graphicx}

7: \topmargin      0.25truein

8: \oddsidemargin  -0.1truein

9: \evensidemargin -0.1truein

10: \textheight     8.5truein

11: \textwidth      6.5truein

12: %\footheight     0.15truein

13: \footskip       0.6truein

14: \headheight     0.0truein

15: \headsep        0.0truein

16: \parskip 4pt plus 1pt

17:

18: \newenvironment{define}{\begin{trivlist}\item[]{\bf Definition:}\rm}{\end{trivlist}}

19: \newenvironment{corol}{\begin{trivlist}\item[]{\bf Corollary:}\rm}{\end{trivlist}}

20: \newenvironment{discus}{\begin{trivlist}\item[]{\bf Discussion:}\rm}{\end{trivlist}}

21: \newtheorem{theorem}{Theorem}

22: \newtheorem{lemma}{Lemma}

23: \newcommand {\bbeta} {\mbox{\boldmath $\beta$}}

24: \newcommand {\hx} {\hat{x}}

25: \newcommand {\hX} {\hat{X}}

26: \newcommand {\dfn} {\stackrel{\Delta} {=}}

27: \newcommand {\exe} {\stackrel{\cdot} {=}}

28: \newcommand{\eqa}{\stackrel{\mbox{(a)}}{=}}

29: \newcommand{\eqb}{\stackrel{\mbox{(b)}}{=}}

30: \newcommand{\eqc}{\stackrel{\mbox{(c)}}{=}}

31: \newcommand{\eqd}{\stackrel{\mbox{(d)}}{=}}

32: \newcommand{\eqe}{\stackrel{\mbox{(e)}}{=}}

33: \newcommand{\eqf}{\stackrel{\mbox{(f)}}{=}}

34: \newcommand{\lea}{\stackrel{\mbox{(a)}}{\le}}

35: \newcommand{\leb}{\stackrel{\mbox{(b)}}{\le}}

36: \newcommand{\lec}{\stackrel{\mbox{(c)}}{\le}}

37: \newcommand{\led}{\stackrel{\mbox{(d)}}{\le}}

38: \newcommand{\lee}{\stackrel{\mbox{(e)}}{\le}}

39: \newcommand{\lef}{\stackrel{\mbox{(f)}}{\le}}

40: \newcommand{\gea}{\stackrel{\mbox{(a)}}{\ge}}

41: \newcommand{\geb}{\stackrel{\mbox{(b)}}{\ge}}

42: \newcommand{\gec}{\stackrel{\mbox{(c)}}{\ge}}

43: \newcommand{\ged}{\stackrel{\mbox{(d)}}{\ge}}

44: \newcommand{\gee}{\stackrel{\mbox{(e)}}{\ge}}

45: \newcommand{\gef}{\stackrel{\mbox{(f)}}{\ge}}

46: \newcommand {\reals} {{\rm I\!R}}

47: \newcommand {\ba} {\mbox{\boldmath $a$}}

48: \newcommand {\bb} {\mbox{\boldmath $b$}}

49: \newcommand {\bc} {\mbox{\boldmath $c$}}

50: \newcommand {\bd} {\mbox{\boldmath $d$}}

51: \newcommand {\be} {\mbox{\boldmath $e$}}

52: \newcommand {\Bf} {\mbox{\boldmath $f$}}

53: \newcommand {\bg} {\mbox{\boldmath $g$}}

54: \newcommand {\bh} {\mbox{\boldmath $h$}}

55: \newcommand {\bi} {\mbox{\boldmath $i$}}

56: \newcommand {\bj} {\mbox{\boldmath $j$}}

57: \newcommand {\bk} {\mbox{\boldmath $k$}}

58: \newcommand {\bl} {\mbox{\boldmath $l$}}

59: \newcommand {\bm} {\mbox{\boldmath $m$}}

60: \newcommand {\bn} {\mbox{\boldmath $n$}}

61: \newcommand {\bo} {\mbox{\boldmath $o$}}

62: \newcommand {\bp} {\mbox{\boldmath $p$}}

63: \newcommand {\bq} {\mbox{\boldmath $q$}}

64: \newcommand {\br} {\mbox{\boldmath $r$}}

65: \newcommand {\bs} {\mbox{\boldmath $s$}}

66: \newcommand {\bt} {\mbox{\boldmath $t$}}

67: \newcommand {\bu} {\mbox{\boldmath $u$}}

68: \newcommand {\bv} {\mbox{\boldmath $v$}}

69: \newcommand {\bw} {\mbox{\boldmath $w$}}

70: \newcommand {\bx} {\mbox{\boldmath $x$}}

71: \newcommand {\by} {\mbox{\boldmath $y$}}

72: \newcommand {\bz} {\mbox{\boldmath $z$}}

73: \newcommand {\bA} {\mbox{\boldmath $A$}}

74: \newcommand {\bB} {\mbox{\boldmath $B$}}

75: \newcommand {\bC} {\mbox{\boldmath $C$}}

76: \newcommand {\bD} {\mbox{\boldmath $D$}}

77: \newcommand {\bE} {\mbox{\boldmath $E$}}

78: \newcommand {\bF} {\mbox{\boldmath $F$}}

79: \newcommand {\bG} {\mbox{\boldmath $G$}}

80: \newcommand {\bH} {\mbox{\boldmath $H$}}

81: \newcommand {\bI} {\mbox{\boldmath $I$}}

82: \newcommand {\bJ} {\mbox{\boldmath $J$}}

83: \newcommand {\bK} {\mbox{\boldmath $K$}}

84: \newcommand {\bL} {\mbox{\boldmath $L$}}

85: \newcommand {\bM} {\mbox{\boldmath $M$}}

86: \newcommand {\bN} {\mbox{\boldmath $N$}}

87: \newcommand {\bO} {\mbox{\boldmath $O$}}

88: \newcommand {\bP} {\mbox{\boldmath $P$}}

89: \newcommand {\bQ} {\mbox{\boldmath $Q$}}

90: \newcommand {\bR} {\mbox{\boldmath $R$}}

91: \newcommand {\bS} {\mbox{\boldmath $S$}}

92: \newcommand {\bT} {\mbox{\boldmath $T$}}

93: \newcommand {\bU} {\mbox{\boldmath $U$}}

94: \newcommand {\bV} {\mbox{\boldmath $V$}}

95: \newcommand {\bW} {\mbox{\boldmath $W$}}

96: \newcommand {\bX} {\mbox{\boldmath $X$}}

97: \newcommand {\bY} {\mbox{\boldmath $Y$}}

98: \newcommand {\bZ} {\mbox{\boldmath $Z$}}

99: \newcommand{\calA}{{\cal A}}

100: \newcommand{\calB}{{\cal B}}

101: \newcommand{\calC}{{\cal C}}

102: \newcommand{\calD}{{\cal D}}

103: \newcommand{\calE}{{\cal E}}

104: \newcommand{\calF}{{\cal F}}

105: \newcommand{\calG}{{\cal G}}

106: \newcommand{\calH}{{\cal H}}

107: \newcommand{\calI}{{\cal I}}

108: \newcommand{\calJ}{{\cal J}}

109: \newcommand{\calK}{{\cal K}}

110: \newcommand{\calL}{{\cal L}}

111: \newcommand{\calM}{{\cal M}}

112: \newcommand{\calN}{{\cal N}}

113: \newcommand{\calO}{{\cal O}}

114: \newcommand{\calP}{{\cal P}}

115: \newcommand{\calQ}{{\cal Q}}

116: \newcommand{\calR}{{\cal R}}

117: \newcommand{\calS}{{\cal S}}

118: \newcommand{\calT}{{\cal T}}

119: \newcommand{\calU}{{\cal U}}

120: \newcommand{\calV}{{\cal V}}

121: \newcommand{\calW}{{\cal W}}

122: \newcommand{\calX}{{\cal X}}

123: \newcommand{\calY}{{\cal Y}}

124: \newcommand{\calZ}{{\cal Z}}

125: \begin{document}

126: \thispagestyle{empty}

127: \title{An Identity of Chernoff Bounds with an Interpretation

128: in Statistical Physics and Applications in Information Theory

129: %\thanks{This research was supported by my wife and kids.}

130: }

131: \author{Neri Merhav

132: %\thanks{

133: %Currently on sabbatical leave at HP Laboratories,

134: %1501 Page Mill Road, MS 3U-4, Palo Alto CA 94304, USA.}

135: }

136: %\date{}

137: \maketitle

138:

139: \begin{center}

140: Department of Electrical Engineering \\

141: Technion - Israel Institute of Technology \\

142: Haifa 32000, Israel \\

143: \end{center}

144: \vspace{1.5\baselineskip}

145: \setlength{\baselineskip}{1.5\baselineskip}

146:

147: \begin{abstract}

148:

149: An identity between two versions of the

150: Chernoff bound on the probability of a certain

151: large deviations event, is established. This identity has an interpretation

152: in statistical physics, namely, an isothermal equilibrium of a composite system that

153: consists of multiple subsystems of particles.

154: Several information--theoretic application examples, where

155: the analysis of this large deviations probability naturally arises, are

156: then described from the viewpoint of this statistical mechanical interpretation.

157: This results in several relationships between

158: information theory and statistical physics, which,

159: we hope, the reader will find insightful.

160:

161: \vspace{0.5cm}

162:

163: \noindent

164: {\bf Index Terms:} Large deviations theory,

165: Chernoff bound, statistical physics, thermal equilibrium,

166: equipartition, thermodynamics, phase transitions.

167: \end{abstract}

168:

169:

170: \section{Introduction}

171:

172: Relationships between information theory and statistical physics have been

173: extensively recognized over the last few decades, and they are drawn from

174: many different aspects. We mention here only a few of them.

175:

176: One such aspect is characterized by identifying structures

177: of optimization problems pertaining to certain information--theoretic settings

178: as being analogous to parallel structures that arise in statistical physics,

179: and then borrowing statistical--mechanical

180: insights, as well as powerful analysis techniques (like the replica

181: method) from statistical physics to the dual information--theoretic setting

182: of interest. A very partial list of works along this line includes

183: \cite{AB01},

184: \cite{GV02},

185: \cite{HK05},

186: \cite{KH05},

187: \cite{KNM02},

188: \cite{KabS99},

189: \cite{KSNS01},

190: \cite{KanS99},

191: \cite{MM06} (and references therein),

192: \cite{MR06},

193: \cite{Murayama02},

194: \cite{PS99},

195: \cite{RC00},

196: \cite{Sourlas89},

197: \cite{Sourlas94},

198: \cite{Tanaka01},

199: \cite{Tanaka02},

200: and \cite{WSW05}.

201:

202: Another aspect pertains to the

203: philosophy and the application of the maximum entropy principle,

204: which emerged in statistical mechanics

205: in the nineteenth century and has been advocated during the

206: previous century in

207: a wide variety of more general contexts, by Jaynes

208: \cite{Jaynes57a},\cite{Jaynes57b},\cite{Jaynes82}, and by

209: Shore and Johnson \cite{SJ80}, as a general guiding principle

210: to problems in information theory

211: (see, e.g., \cite[Chap.\ 11]{CT91} and references therein)

212: and other areas, such as signal processing,

213: in particular, speech coding (see, e.g., \cite{GGRS81}),

214: spectrum estimation (see, e.g., \cite{Burg75}), and others.

215:

216: Yet another aspect is related to ideas and theories that

217: underlie the notion of `trading' between

218: information bits and energy, or heat. In particular,

219: Landauer's erasure principle

220: \cite{Landauer61} is argued to provide a powerful link between

221: information theory and physics and to

222: suggest a physical theory of information

223: (comprehensive overviews are included in,

224: e.g., \cite{Maroney04} and \cite{PV01}).

225: According to Landauer's principle, the erasure of

226: every bit of information increases the thermodynamic

227: entropy of the world by $k\ln 2$, where $k$ is Boltzmann's

228: constant, and so, information is actually physical.

229:

230: Finally, to shift gears more to the direction of this paper,

231: we should mention the aspect of the interface between statistical physics and

232: large deviations theory,

233: a line of research advocated most

234: prominently by Ellis \cite{Ellis85},\cite{Ellis06},

235: and developed also by Oono \cite{Oono89},

236: McAllester \cite{McAllester}, and others. The main

237: theme here revolves around the

238: identification of Chernoff bounds and more general large deviations

239: rate functions with free energies (along with

240: their related partition functions),

241: thermodynamical entropies, and the underlying maximum--entropy/equilibrium principle associated with them.

242: In particular, Ellis' book \cite{Ellis85}

243: is devoted largely to the application of large deviations theory

244: to the statistical physics pertaining to

245: models of ferromagnetic spin arrays, like

246: Ising spin glasses and others,

247: in order to explore

248: phase transitions phenomena of spontaneous

249: magnetization (see also \cite{MM06}).

250:

251: This paper, which is mostly expository in character,

252: lies in the intersection

253: of information theory, large deviations theory, and

254: statistical physics. In particular, we establish a simple identity between two

255: quantities as they can both be interpreted as the rate

256: function of a certain large deviations event

257: that involves multiple

258: distributions of sets of independent random variables (as opposed

259: to the usual, single set of i.i.d.\ random variables).

260: The analysis of this large deviations event is of a general form

261: that is frequently encountered in numerous applications in

262: information theory (cf.\ Section 4). Its informal description is as follows:

263: Let $v_1,\ldots,v_n$ be an arbitrary

264: (deterministic) sequence whose components take

265: on values in a finite set $\calV$, and let $U_1,\ldots,U_n$ be a sequence of

266: random variables where each component is generated independently

267: according to a distribution $q(u_i|v_i)$, $i=1,\ldots,n$.

268: For a given function $f$ and a constant $E$,

269: we are interested in the large deviations

270: analysis (Chernoff bound) of the probability of the event

271: \begin{equation}

272: \label{event}

273: \sum_{i=1}^n f(U_i,v_i)\le nE,

274: \end{equation}

275: assuming that the relative frequencies of the various symbols in

276: $(v_1,\ldots,v_n)$ stabilize as $n$ grows without bound, and assuming

277: that $E$ is sufficiently small to make this a rare event for large $n$.

278:

279: There are (at least) two ways to derive a Chernoff bound on the probability

280: of this event. The first is to treat the entire sequence of RV's,

281: $\{f(U_i,v_i)\}$ as a whole, and the second is to partition it

282: according to the various symbols $\{v_i\}$, i.e., to consider the separate

283: large deviations events  of the

284: partial sums, $\sum_{i:v_i=v}f(U_i,v)$, $v\in\calV$, for all possible

285: allocations of the total `budget' $nE$ among the various $\{v\}$.

286: These two approaches lead to two

287: (seemingly) different expressions of Chernoff bounds,

288: but since they are both exponentially tight, they must agree.

289:

290: As will be described and discussed in Section 3,

291: the identity between these two Chernoff bounds has a natural

292: interpretation in statistical physics: it is viewed as

293: a situation of thermal equilibrium (maximum

294: entropy) in a system that consists of several

295: subsystems (which can be of different kinds), each of them with many particles.

296:

297: As will be shown in Section 4,

298: the above--described problem of large deviations analysis of the

299: event (\ref{event}) is encountered in many applications in information

300: theory, such as rate--distortion coding, channel capacity, hypothesis

301: testing (signal detection, in particular), and others. The

302: above mentioned statistical mechanical

303: interpretation then applies to all

304: of them. Accordingly, Section 4 is devoted to expository descriptions of

305: each of these applications, along with

306: the underlying physics that is inspired by

307: the proposed thermal equilibrium interpretation. The reader is assumed to have

308: a very elementary background in statistical physics.

309:

310: The remaining part of this paper is organized as follows. In Section 2,

311: we establish some notation conventions. In Section 3, we assert and prove

312: our main result, which is the identity between the above described

313: Chernoff bounds. Finally, in Section 4, we explore the application

314: examples.

315:

316: \section{Notation}

317:

318: Throughout this paper, scalar random

319: variables (RV's) will be denoted by the capital

320: letters, like $U$,$V$,$X$, and $Y$, their sample values will be denoted by

321: the respective lower case letters, and their alphabets will be denoted

322: by the respective calligraphic letters.

323: A similar convention will apply to

324: random vectors and their sample values,

325: which will be denoted with the same symbols superscripted by the dimension.

326: Thus, for example, $X^n$ will denote a random $n$-vector $(X_1,\ldots,X_n)$,

327: and $x^n=(x_1,\ldots,x_n)$ is a specific vector value in $\calX^n$,

328: the $n$-th Cartesian power of $\calX$. The

329: notations $x_i^j$ and $X_i^j$, where $i$

330: and $j$ are integers and $i\le j$, will designate segments $(x_i,\ldots,x_j)$

331: and $(X_i,\ldots,X_j)$, respectively,

332: where for $i=1$, the subscript will be omitted (as above).

333: Sequences without specifying indices are denoted by $ \{\cdot\} $.

334: Sources and channels will be denoted generically by the letter $P$ or $Q$.

335: Specific letter probabilities corresponding to a source $P$ will be

336: denoted by the corresponding lower case letter, e.g., $p(v)$ is the

337: probability of a letter $v\in\calV$. A similar convention will be applied

338: to a channel $Q$ and the corresponding transition probabilities, e.g., $q(u|v)$,

339: $u\in\calU$, $v\in\calV$.

340: The cardinality of a finite set $\calA$ will be denoted by $|\calA|$.

341: Information theoretic quantities like entropies, and mutual

342: informations will be denoted following the usual conventions

343: of the information theory literature.

344:

345: Notation pertaining to statistical physics

346: will also follow, wherever possible,

347: the customary conventions. I.e., $k$ will denote

348: Boltzmann's constant ($k=1.38065\times 10^{-23}$ Joules

349: per Kelvin degree), $T$ -- absolute temperature (in Kelvin

350: degrees), $\beta=1/(kT)$ -- the inverse temperature

351: (in units of $\mbox{Joule}^{-1}$ or $\mbox{erg}^{-1}$),

352: $E$ -- energy, the letter $Z$ will be used to

353: denote partition functions, etc.

354:

355: \section{Main Result}

356:

357: Let $\calU$ and $\calV$ be finite\footnote{The assumption that $\calU$ is

358: finite, is made mostly for the sake of convenience and simplicity. Most

359: of our results extend straightforwardly to the case of a continuous

360: alphabet $\calU$. The extension to a continuous alphabet $\calV$ is somewhat

361: more subtle, however.}

362: sets and let $f:\calU\times\calV\to\reals$

363: be a given function. Let

364: $P=\{p(v),~v\in\calV\}$

365: be a probability mass function on $\calV$ and

366: let $Q=\{q(u|v),~u\in\calU,~v\in\calV\}$ be a

367: matrix of conditional probabilities from $\calV$ to $\calU$.

368: %We are given a deterministic vector

369: %$v^n=(v_1,\dots,v_n)$ with components in $\calV$,

370: %where each letter $v\in\calV$ appears $n_v$ times, $\sum_{v\in\calV}n_v=n$. We

371: %will also denote $p(v)=n_v/n$.

372:

373: Next, let us define for each $v\in\calV$,

374: the partition function:

375: \begin{equation}

376: \label{zvb}

377: Z_v(\beta)=\sum_{u\in\calU}q(u|v)e^{-\beta f(u,v)},~~~~\beta\ge 0,

378: \end{equation}

379: and for a given $E_v$ in the range

380: \begin{equation}

381: \label{range}

382: \min_{u\in\calU}f(u,v) \le E_v \le

383: \sum_{u\in\calU}q(u|v)f(u,v),

384: \end{equation}

385: let

386: \begin{equation}

387: S_v(E_v)=\min_{\beta\ge 0}[\beta E_v+\ln Z_v(\beta)].

388: \end{equation}

389: Further, for a given constant $E$ in the range

390: $$\sum_{v\in\calV}p(v)\min_{u\in\calU}f(u,v) \le E \le

391: \sum_{u\in\calU}\sum_{v\in\calV}p(v)q(u|v)f(u,v),$$

392: let

393: \begin{equation}

394: \bar{S}(E)=\min_{\beta\ge 0}\left[\beta E+

395: \sum_{v\in\calV}p(v)\ln Z_v(\beta)\right].

396: \end{equation}

397: Let $\calH(E)$ denote the set of all $|\calV|$--dimensional vectors

398: $\bar{E}=\{E_v,~v\in\calV\}$, where each component $E_v$ satisfies

399: (\ref{range}),

400: and where $\sum_vp(v)E_v\le E$.

401: Our main result, in this section, is the following:

402:

403: \begin{theorem}

404: \begin{equation}

405: \label{identity}

406: \max_{\bar{E}\in\calH(E)}\sum_{v\in\calV}p(v)S_v(E_v)=\bar{S}(E).

407: \end{equation}

408: \end{theorem}

409:

410: The expression on the right--hand side is,

411: of course, more convenient to work with since

412: it involves minimization w.r.t.\ one parameter

413: only, as opposed to the left--hand side,

414: where there is a minimization over $\beta$

415: for every $v$, as well as a maximization

416: over the $|\calV|$--dimensional vector $\bar{E}$.

417:

418: While the proof of Theorem 1 below is fairly short,

419: in the Appendix (subsection A.1), we outline an alternative

420: proof which, although somewhat longer,

421: provides some additional insight, we believe.

422: As described briefly in the Introduction,

423: it is based on two different approaches to the analysis

424: of the rate function, $I(E)$, pertaining to

425: the probability of the event:

426: \begin{equation}

427: \label{ld}

428: \sum_{i=1}^n f(U_i,v_i)\le nE,

429: \end{equation}

430: where $\{U_i\}$ are RV's taking values in $\calU$ and drawn according to

431: $q(u^n|v^n)=\prod_{i=1}^nq(u_i|v_i)$, and

432: $v^n=(v_1,\dots,v_n)$ is a given deterministic vector whose

433: components are in $\calV$,

434: with each $v\in\calV$ appearing

435: $n_v$ times ($\sum_{v\in\calV}n_v=n$), and

436: the related relative frequency, $n_v/n$ is exactly $p(v)$.

437:

438: It should be noted that

439: the proof in the Appendix pertains to a

440: slightly different definition of the set

441: $\calH(E)$, where the individual upper bound to

442: each $E_v$ is enlarged to $\max_uf(u,v)$.

443: Thus, $\calH(E)$ is extended to a larger set,

444: which will be denoted by $\calH_0(E)$ in the Appendix. But the

445: maximum over $\calH_0(E)$ is

446: always attained within the original set $\calH(E)$

447: (as is actually shown in the proof below).

448:

449: \vspace{0.5cm}

450:

451: \noindent

452: {\it Proof.}

453: Here we prove the identity of Theorem 1

454: directly, without using large deviations analysis and Chernoff bounds.

455: We first prove that for every $\bar{E}\in\calH(E)$,

456: we have $\sum_{v\in\calV}p(v)S_v(E_v)\le \bar{S}(E)$

457: and then, of course,

458: $$\max_{\bar{E}\in\calH(E)}\sum_{v\in\calV}p(v)S_v(E_v)\le \bar{S}(E)$$

459: as well. This follows from the following chain of inequalities:

460: \begin{eqnarray}

461: \sum_{v\in\calV}p(v)S_v(E_v)&=&\sum_{v\in\calV}p(v)\cdot\min_{\beta\ge 0}[\beta E_v+\ln Z_v(\beta)]\nonumber\\

462: &=&\sum_{v\in\calV}\min_{\beta\ge 0}[\beta p(v)E_v+p(v)\ln Z_v(\beta)]\nonumber\\

463: &\le&\min_{\beta\ge 0}\left[\beta\sum_{v\in\calV}p(v)E_v+

464: \sum_{v\in\calV}p(v)\ln Z_v(\beta)\right]\nonumber\\

465: &\le&\min_{\beta\ge 0}\left[\beta E+\sum_{v\in\calV}p(v)\ln Z_v(\beta)\right]\nonumber\\

466: &=&\bar{S}(E),

467: \end{eqnarray}

468: where in the second inequality we used the postulate that

469: $\sum_vp(v)E_v\le E$.

470:

471: In the other direction, let $\beta^*$ be the achiever of $\bar{S}(E)$,

472: i.e., $\beta^*$ is the solution to the equation:

473: $$E=-\left[\frac{\partial}{\partial\beta}\sum_vp(v)

474: \ln Z_v(\beta)\right]_{\beta=\beta^*}.$$

475: For each $v\in\calV$,

476: let $E_v^*\in[\min_uf(u,v),\sum_uq(u|v)f(u,v)]$ be chosen such that

477: $\beta^*$ would be the achiever of $S_v(E_v^*)$, i.e., $E_v^*=-[\partial\ln Z_v(\beta)/\partial\beta]_{\beta=\beta^*}$.

478: Obviously, the vector $\{E_v^*,~v\in\calV\}$ lies in $\calH(E)$, and

479: \begin{eqnarray}

480: \sum_vp(v)E_v^*&=&-\sum_vp(v)\left[\frac{\partial\ln

481: Z_v(\beta)}{\partial\beta}\right]_{\beta=\beta^*}\nonumber\\

482: &=&-\left[\frac{\partial}{\partial\beta}\sum_vp(v)

483: \ln Z_v(\beta)\right]_{\beta=\beta^*}\nonumber\\

484: &=&E.

485: \end{eqnarray}

486: Thus,

487: \begin{eqnarray}

488: \max_{\bar{E}\in\calH(E)}\sum_{v\in\calV}p(v)S_v(E_v)

489: &\ge&\sum_{v\in\calV}p(v)S_v(E_v^*)\nonumber\\

490: &=&\sum_{v\in\calV}p(v)[\beta^* E_v^*+\ln Z_v(\beta^*)]\nonumber\\

491: &=&\beta^*\sum_{v\in\calV}p(v)E_v^*+\sum_vp(v)\ln Z_v(\beta^*)\nonumber\\

492: &=&\beta^*E+\sum_vp(v)\ln Z_v(\beta^*)\nonumber\\

493: &=&\bar{S}(E).

494: \end{eqnarray}

495: This completes the proof of Theorem 1.

496: $\Box$

497:

498: The function $Z_v(\beta)$ is similar to the well--known partition function

499: pertaining to the Boltzmann distribution w.r.t.\ the Hamiltonian (energy function)

500: $\calE_v(u)=f(u,v)$,

501: except that each exponential term

502: is weighted by $q(u|v)$, as opposed to the usual form,

503: which is just $\sum_{u\in\calU}e^{-\beta \calE_v(u)}$.

504: Before describing the statistical mechanical interpretation of eq.\ (\ref{identity}),

505: we should note that $Z_v(\beta)$ defined in (\ref{zvb}) can easily be related to

506: the ordinary partition function, without weighting, as follows:

507: Suppose that $\{q(u|v)\}$ are rational\footnote{Even

508: if not rational, they can always be approximated as such to an arbitrarily good precision.}

509: and hence can be represented as ratios

510: of two positive integers, $q(u|v)=M(u|v)/M$,

511: where $M \ge |\calU|$ is common to all $u\in\calU$ (and $v\in\calV$). Now,

512: imagine that every value of $u$ actually represents

513: a `quantization' of a more refined microstate (call it a

514: ``nanostate'') $w\in\calW$, $|\calW|=M$, so that $u=g_v(w)$,

515: where $g_v$ is a many--to--one function, for which the inverse image of every $u$ consists of

516: $M(u|v)$ many values of $w$. Suppose further that the Hamiltonian depends

517: on $w$ only via $g_v(w)$, i.e., $\calE_v'(w)=\calE_v(g_v(w))$. Then, the (ordinary) partition

518: function related to $w$ is given by

519: \begin{eqnarray}

520: \label{wpf}

521: \zeta_v(\beta)&=&\sum_{w\in\calW}e^{-\beta\calE_v'(w)}\nonumber\\

522: &=&\sum_{w\in\calW}e^{-\beta\calE_v(g_v(w))}\nonumber\\

523: &=&\sum_{u\in\calU}M(u|v)e^{-\beta\calE_v(u)}\nonumber\\

524: &=&M\sum_{u\in\calU}q(u|v)e^{-\beta\calE_v(u)}=MZ_v(\beta).

525: \end{eqnarray}

526: Thus, the weighted partition function is, within a constant factor $M$,

527: the same as the ordinary partition function of $w$. This factor

528: cancels out when probabilities are calculated since it appears both in the

529: numerator and the denominator. Moreover,

530: it affects neither the minimizing $\beta$

531: that achieves $S_v(E_v)$ or

532: $\bar{S}(E)$, nor the derivatives of the log--partition

533: function.

534:

535: We now move on to our interpretation of

536: eq.\ (\ref{identity}) from the viewpoint

537: of elementary statistical physics: Consider a physical system which consists of $|\calV|$ subsystems of

538: particles. The total number of particles in the system is $n$ and the total amount

539: of energy is $nE$ Joules. For each $v\in\calV$, the subsystem indexed by $v$

540: (subsystem $v$, for short) contains

541: $n_v=np(v)$ particles, each of which can lie in any microstate

542: within a finite set of microstates $\calU$ (or an underlying

543: nanostate in a set $\calW$),

544: and it is characterized by an additive Hamiltonian

545: $\calE_v(u_1,\ldots,u_{n_v})=\sum_{i=1}^{n_v}f(u_i,v)$. The total amount of

546: energy possessed by subsystem $v$ is given by $n_vE_v$ Joules. As long

547: as the subsystems are in thermal isolation from each other, each one of them

548: may have its own temperature $T_v=1/(k\beta_v)$, where $\beta_v$ is the achiever

549: of the normalized (per--particle) entropy

550: associated with an average per--particle energy $E_v$, i.e.,

551: $$S_v(E_v)=\min_{\beta\ge 0}[\beta E_v+\ln Z_v(\beta)].$$

552: The above--mentioned rate function $I(E)$ of $\mbox{Pr}\{\sum_{i=1}^nf(U_i,v_i)\le nE\}$

553: is then given by the negative maximum total per--particle entropy,

554: $\sum_vp(v)S_v(E_v)$, where the maximum is over all energy allocations $\{E_v\}$

555: such that the total energy is conserved, i.e., $\sum_vp(v)E_v=E$.

556: This maximum is attained by the expression of the

557: r.h.s.\ of eq.\ (\ref{identity}), where there

558: is {\it only one} temperature parameter, and hence

559: it corresponds to {\it thermal equilibrium}.

560: In other words, the whole system then lies in the same

561: temperature $T^*=1/(k\beta^*)$, where $\beta^*$

562: is the minimizer of

563: $\bar{S}(E)$. Thus, the energy allocation among

564: the various subsystems in

565: equilibrium is such that their temperatures are the same

566: (cf.\ the above proof of Theorem 1). Theorem 1 is then

567: interpreted as expressing the second law

568: of thermodynamics.

569:

570: At this point, a few comments are in order:

571: \begin{enumerate}

572: \item It should be pointed out that in the above physical interpretation, we have implicitly assumed that the

573: particles within each subsystem are distinguishable, and so the partition function corresponding to a set of $n_v$

574: particles is given by the partition function of a single particle raised to the power of $n_v$, without dividing

575: by $n_v!$. This differs then from the indistinguishable case only by a constant factor

576: (as long as $n_v$ is indeed constant)

577: and hence the difference between the distinguishable and the indistinguishable

578: cases is not essential for the most part of our discussion.

579: \item As mentioned in the above paragraph, our conclusion is that $I(E)=-\bar{S}(E)$. At first glance, this may

580: seem peculiar as it appears that $I(E)$ may be negative. However, one should keep in mind that $\bar{S}(E)$

581: is induced by a (convex) combination of weighted partition functions,

582: rather than ordinary partition functions, like $\zeta_v(\beta)$. Referring to eq.\ (\ref{wpf}), the ordinary

583: notion of entropy $\bar{\Sigma}(E)$

584: as the normalized log--number of (nano)states with normalized energy $E$, is

585: given by

586: \begin{eqnarray}

587: \bar{\Sigma}(E)&=&\min_{\beta\ge0}\left[\beta E+

588: \sum_vp(v)\ln \zeta_v(\beta)\right]\nonumber\\

589: &=&\min_{\beta\ge 0}\left[\beta E+\sum_vp(v)\ln Z_v(\beta)\right]+\ln M\nonumber\\

590: &=&\bar{S}(E)+\ln M.

591: \end{eqnarray}

592: Thus,

593: $$I(E)=\ln M - \bar{\Sigma}(E),$$

594: which is always non--negative.

595: \item The identity (\ref{identity})

596: can be thought

597: of as a generalized concavity property of the entropy:

598: Had all the entropy

599: functions $S_v(\cdot)$ been the same, this would have been the usual

600: concavity property. What makes this equality less trivial and more interesting

601: is that it continues to hold even when $S_v(\cdot)$, for

602: the various $v\in\calV$, are different from each

603: other.

604: \item On the more technical level, since this paper draws analogies with physics,

605: we should say a few words about physical units. The products $\beta E$, $\beta E_v$, $\beta f(u,v)$, etc.,

606: should all be pure numbers, of course. Since $\beta=1/(kT)$,

607: where $k$ is Boltzmann's constant and $T$ is absolute temperature,

608: and since $kT$ has units of energy (Joules or ergs, etc.),

609: it is understood that $E$, $E_v$, $f(u,v)$ and the like, should all have units of energy as well. In the applications

610: described below, whenever this is not the case, i.e., the latter quantities are pure numbers rather than physical energies,

611: we will sometimes reparametrize $\beta$ by $\beta\epsilon_0$, where $\epsilon_0$ is an arbitrary constant possessing

612: units of energy (e.g., $\epsilon_0=1$ Joule or $\epsilon_0=1$ erg),

613: and we absorb $\epsilon_0$ in the Hamiltonian, i.e.,

614: redefine $\calE_v(u)=\epsilon_0f(u,v)$. Thus, in this case, $S_v(E)$, where $E$ is now the energy in units

615: of $\epsilon_0$, is redefined as

616: $$S_v(E)=\min_{\beta\ge 0}

617: \left[\beta\cdot\epsilon_0 E+

618: \ln\left(\sum_uq(u|v)e^{-\beta\calE_v(u)}\right)\right].$$

619: This kind of modification is

620: not essential, but it may help to avoid confusion

621: about units when the picture

622: is viewed from the aspects of physics.

623: \end{enumerate}

624:

625: \section{Applications}

626:

627: Equipped with the main result of the previous section and its statistical mechanical

628: interpretation, we next introduce a few applications that fall within the

629: framework considered. In all these applications,

630: there is an underlying large deviations event of the type of eq.\ (\ref{ld}), whose rate function

631: is of interest. The above described viewpoint of statistical physics is then relevant in all

632: these applications.

633:

634: \subsection{The Rate--Distortion Function}

635:

636: Let $P=\{p(x),~x\in\calX\}$ designate the vector

637: of letter probabilities associated with a

638: given discrete memoryless source (DMS), and for a given reproduction

639: alphabet $\hat{\calX}$, let $d:\calX\times\hat{\calX}\to\reals^+$

640: denote a single--letter distortion measure. Let $R(D)$ denote the rate--distortion

641: function of the DMS $P$.

642:

643: One useful way to think of the rate--distortion function

644: is inspired by the classical random coding argument:

645: Let $(\hX_1,\ldots,\hX_n)$ be drawn i.i.d.\ from the

646: optimum random coding distribution $q^*(\hx_1,\ldots,\hx_n)=\prod_{i=1}^n

647: q^*(\hx_i)$ and

648: consider the event $\sum_{i=1}^n d(x_i,\hX_i)\le nD$,

649: where $x^n$ is a given source vector, typical to $P$, i.e.,

650: the composition of $x^n$ consists

651: of $n_x=np(x)$ occurrences of each $x\in\calX$. This is exactly an event of the type (\ref{ld}) with

652: $U_i=\hX_i$, $v_i=x_i$, $i=1,\ldots,n$, $q(u|v)=q(\hx|x)=q^*(\hx)$ independently of $x$,

653: $f(u,v)=f(\hx,x)=d(x,\hx)$, and $E=D$. I.e., the Hamiltonian $\calE_x(\hx)$ is given by

654: $\epsilon_0d(x,\hx)$ and the total energy is $nD$ in units of $\epsilon_0$.

655:

656: Suppose that this

657: probability is of the exponential order of $e^{-nI(D)}$. Then,

658: it takes about $M=e^{n[I(D)+\epsilon]}$ ($\epsilon > 0$, however small)

659: independent trials to `succeed' at least once

660: (with high probability) in having some realization of $\hX^n$

661: within distance $nD$ from $x^n$.

662: This is the well--known

663: classical random coding achievability argument that leads to $I(D)=R(D)$.

664: Thus, the large--deviations rate function of interest agrees exactly

665: with the rate--distortion function (cf.\ \cite[Sect.\ 3.4]{Berger71}), which is:

666: \begin{equation}

667: R(D)=-\min_{\beta\ge 0}\left[\beta\cdot\epsilon_0D+\sum_{x\in\calX}p(x)

668: \ln\left(\sum_{\hx\in\hat{\calX}}q^*(\hx)e^{-\beta\cdot\epsilon_0 d(x,\hx)}\right)\right].

669: \end{equation}

670: Interestingly,

671: in \cite[p.\ 90, Corollary 4.2.3]{Gray90}, the rate--distortion function

672: is shown, using completely different considerations,

673: to have a parametric representation which can be written exactly in this form.

674:

675: The fact that the rate--distortion function has an

676: interpretation of an isothermal equilibrium situation in

677: statistical thermodynamics is not quite new

678: (cf.\ e.g.\ \cite[Sect.\ 6.4]{Berger71}, \cite{Rose94}).

679: Here, however, we obtain it in a more explicit

680: manner and as a special case of a more

681: general principle.

682:

683: A simple example is that of the binary symmetric source with the Hamming distortion

684: measure. It is easy to see that, in this example,

685: the relationship between distortion and temperature is:

686: \begin{equation}

687: T=\frac{\epsilon_0}{k\ln[(1-D)/D]}~~\mbox{or, equivalently,}~~D=\frac{1}{1+e^{\epsilon_0/(kT)}}

688: \end{equation}

689: and, of course, $R(D)=1-h_2(D)$, where $h_2(D)$ is the binary entropy function.

690:

691: A slightly more involved example pertains to the regime

692: of high resolution (small distortion) and it turns out to

693: be related to (a generalized version of) the

694: law of equipartition of energy in statistical physics:

695: Consider the $L_\theta$ distortion measure, $d(x,\hx)=|x-\hx|^\theta$ (most

696: commonly encountered are the cases $\theta=1$ and $\theta=2$). Let

697: us assume that $D > 0$ is very small and consider the (continuous)

698: uniform random coding distribution $q(\hx)=\frac{1}{2A}$ in the interval

699: $[-A,A]$ and zero elsewhere. This random coding distribution is suboptimal, but

700: it corresponds to, and hence is well motivated by,

701: many results in high--resolution quantization using

702: uniform quantizers (see, e.g., \cite{GN98} and references therein).

703: For every $x\in\calX$, the partition function

704: is given by

705: $$Z_x(\beta)=\frac{1}{2A}\int_{-A}^A

706: \exp\{-\beta\epsilon_0|\hx-x|^\theta\} \mbox{d}\hx.$$

707: When $D$ is very small, $\beta$ is very large, and

708: then the finite--interval integral pertaining to $Z_x(\beta)$ can be

709: approximated\footnote{See the Appendix (subsection A.2)

710: for a more rigorous derivation.} by an infinite one,

711: provided that the support of $\{p(x)\}$

712: is included\footnote{An alternative, softer

713: condition is that the probability that $|x|\ge A$ is negligibly small.}

714: in the interval $[-A,A]$:

715: \begin{equation}

716: \label{approxz}

717: Z_x(\beta)\approx\frac{1}{2A}\int_{-\infty}^\infty

718: \exp\{-\beta\epsilon_0|\hx-x|^\theta\} \mbox{d}\hx,

719: \end{equation}

720: which then becomes independent of $x$. The average distortion

721: (internal energy) associated with this partition function can

722: be evaluated using the same technique as the one that leads to the

723: law of equipartition in statistical physics:

724: \begin{eqnarray}

725: \label{equipartition}

726: \epsilon_0 D&\approx&-\frac{\partial}{\partial \beta}\ln

727: \left[\int_{-\infty}^\infty

728: \exp\{-\beta\epsilon_0|\hx-x|^\theta\} \mbox{d}\hx\right]\nonumber\\

729: &=&-\frac{\partial}{\partial \beta}\ln\left[

730: \beta^{-1/\theta}\cdot\int_{-\infty}^\infty

731: \exp\{-\epsilon_0|\beta^{1/\theta}(\hx-x)|^\theta\}

732: \mbox{d}(\beta^{1/\theta}(\hx-x))\right]\nonumber\\

733: &=&-\frac{\partial}{\partial \beta}\ln\left[

734: \beta^{-1/\theta}\cdot\int_{-\infty}^\infty

735: \exp\{-\epsilon_0|z|^\theta\}

736: \mbox{d}z\right]\nonumber\\

737: &=&-\frac{\mbox{d}}{\mbox{d}\beta}\ln

738: \left(\beta^{-1/\theta}\right)-

739: \frac{\partial}{\partial \beta}\ln\left[

740: \int_{-\infty}^\infty

741: \exp\{-\epsilon_0|z|^\theta\}

742: \mbox{d}z\right]\nonumber\\

743: &=&\frac{1}{\beta \theta}-0

744: =\frac{kT}{\theta}

745: \end{eqnarray}

746: [Note that for $\theta=2$, where the Hamiltonian is

747: quadratic in the integration variable

748: $\hx$, this is exactly the law of equipartition.]

749: Thus, for low temperatures, the distortion

750: is given by $D=kT/(\epsilon_0\theta)$, i.e.,

751: distortion is linear in temperature in that regime,

752: and the constant of proportionality is related to the

753: heat capacity, $C=k/\theta$.

754: Since the temperature is proportional to the negative local slope

755: of the distortion--rate function (as the reciprocal, $\beta$, is proportional

756: to the negative local slope of the rate--distortion function), this means that the distortion

757: is proportional to its derivative w.r.t.\ $R$, which means an exponential relationship of the

758: form $D=D_0e^{-\theta R}$ ($D_0$ -- constant). For $\theta=2$ (mean square error),

759: this is recognized as the well--known characterization

760: of distortion as function of rate in the high resolution regime.

761: Specifically, in this case, the factor of $2$ at the denominator

762: of $kT/2$, the universal expression of the

763: internal energy per degree of freedom according to

764: the equipartition theorem, has the same origin as the factor of $2$ that appears

765: in the exponent

766: of $D(R)=D_0e^{-2R}$ (decay of 6dB per bit).

767: Thus the law of equipartition in statistical physics is

768: related to the behavior of rate distortion codes in the high resolution regime.

769:

770: To compute the rate associated with this temperature more explicitly,

771: note that the minimizing $\beta^*$

772: is given by $1/(\theta\epsilon_0D)$, and so

773: \begin{eqnarray}

774: R&=&-\beta^*\epsilon_0D-

775: \ln\left[\frac{1}{2A}\int_{-\infty}^\infty

776: \exp\{-\beta^*\epsilon_0|\hx-x|^\theta\} \mbox{d}\hx\right]\nonumber\\

777: &=&-\frac{1}{\theta}-\ln\left[\frac{1}{2A}

778: \cdot\frac{2\Gamma(1/\theta)}{\theta[1/(\theta D)]^{1/\theta}}\right]\nonumber\\

779: &=&\ln\left[\frac{A\theta}{\Gamma(1/\theta)

780: (\theta eD)^{1/\theta}}\right]\nonumber\\

781: &=&\ln\left[\frac{A\theta}{\Gamma(1/\theta)}\right]-

782: \frac{1}{\theta}\ln(\theta eD).

783: \end{eqnarray}

784:

785: \subsection{Channel Capacity}

786:

787: In complete duality to the random coding argument that puts the

788: rate--distortion function in the framework discussed in Section 3,

789: a parallel argument can be made with regard to channel capacity.

790:

791: Given a discrete memoryless channel (DMC) with a finite input alphabet $\calX$,

792: and a finite output alphabet $\calY$, we can obtain capacity using the following

793: argument. Let $\{q^*(x),~x\in\calX\}$ be the optimum random coding distribution according

794: to which, each codeword $X^n$ is drawn independently. Let $y^n$ be a given channel

795: output sequence which is typical to the output distribution $p(y)=\sum_{x\in\calX}q^*(x)W(y|x)$,

796: where $\{W(y|x),~x\in\calX,~y\in\calY\}$ are the channel transition probabilities. That is,

797: each symbol $y$ appears $n_y=np(y)$ times in $y^n$. Consider now the large deviations event

798: \begin{equation}

799: \label{td}

800: \sum_{i=1}^n\log\frac{1}{W(y_i|X_i)}\le nH(Y|X),

801: \end{equation}

802: where $H(Y|X)=-\sum_{x\in\calX}\sum_{y\in\calY}q^*(x)W(y|x)\log W(y|x)$.

803: By the union bound, as long as the number of randomly chosen codewords is exponentially less

804: than $e^{nI}$, where $I$ is the rate function of the large--deviations event (\ref{td}), then

805: the average error probability still vanishes as $n\to\infty$.\footnote{Here we apply the union

806: bound to a threshold decoder that seeks a unique codeword that satisfies (\ref{td}),

807: which although suboptimum, is still good enough to achieve capacity.}

808: Since this is exactly the achievability argument of the channel coding theorem, then $I=C$, where $C$ is the channel

809: capacity.

810:

811: Again, this complies with our model setting with the assignments, $U_i=X_i$, $v_i=y_i$, $i=1,\ldots,n$,

812: $q(u|v)=q(x|y)=q^*(x)$ independently of $y$,

813: $f(u,v)=f(x,y)=-\log W(y|x)$ and $E=H(Y|X)$ units of $\epsilon_0$.

814: In other words, channel capacity can be represented as

815: \begin{equation}

816: C=-\min_{\beta\ge 0}\left[\beta \cdot \epsilon_0 H(Y|X)+

817: \sum_{y\in\calY}p(y)\ln\left(\sum_{x\in\calX}q^*(x)e^{-\beta\cdot\epsilon_0[-\log W(y|x)]}\right)\right].

818: \end{equation}

819: It is easy to see that, in this case, the equilibrium

820: temperature always corresponds

821: to $\beta\epsilon_0=1$, namely, $T=\epsilon_0/k$.

822:

823: By the same token, one can derive an expression

824: of the random coding capacity pertaining to mismatched

825: decoding, where the decoder uses an additive metric $m(x,y)$

826: other than the optimum metric,

827: $-\log W(y|x)$ (see, e.g., \cite{Balakirsky95},

828: \cite{CN95},

829: \cite{Lapidoth94},

830: \cite{LS96-2},

831: \cite{MKLS94},

832: and references therein).

833: The only modifications to the above

834: expression would be to replace the Hamiltonian

835: by $\calE_y(x)=\epsilon_0m(x,y)$

836: and to replace $H(Y|X)$ by the expectation

837: of $m(X,Y)$ w.r.t.\ $q^*(x)W(y|x)$.

838: The new optimum random coding distribution

839: might change as well. Here, it

840: is no longer necessarily true that the equilibrium temperature

841: is $T=\epsilon_0/k$.

842:

843: \subsection{Signal Detection and Hypothesis Testing}

844:

845: Consider the following binary hypothesis testing problem:

846: Given a deterministic signal, which is represented by a sequence $x^n=(x_1,\ldots,x_n)$

847: with elements taking on values in a (finite) set $\calX$ and relative frequencies $\{p(x),~x\in\calX\}$,

848: and given an observation sequence

849: $Y^n=(Y_1,\ldots,Y_n)$, we are required to decide between two hypotheses:

850: \begin{itemize}

851: \item[$H_0:$] The observation vector $Y^n$ is ``pure noise,''

852: distributed according to some product measure $Q=\{q(y),~y\in\calY\}$, i.e., $q(y^n)=\prod_{i=1}^nq(y_i)$,

853: which is unrelated to $x^n$.

854: \item[$H_1:$] The observation vector $Y^n$ is a ``noisy version'' of $x^n$,

855: distributed according to $q(y^n|x^n)=\prod_{i=1}^nq(y_i|x_i)$.

856: \end{itemize}

857: The optimum detector (under both the Bayesian and the Neyman--Pearson criterion) compares

858: the log--likelihood ratio $\sum_{i=1}^n\ln [q(y_i)/q(y_i|x_i)]$ to a threshold $nE_0$, and decides

859: in favor of $H_0$ if this threshold is exceeded, otherwise, it decides in favor of $H_1$.

860:

861: The false--alarm probability then is the probability of the event

862: $$\sum_{i=1}^n\ln \left[\frac{q(Y_i)}{q(Y_i|x_i)}\right]\le nE_0$$

863: under $Q$. This, again, fits our scenario with the substitutions

864: $U_i=Y_i$, $v_i=x_i$, $i=1,\ldots,n$,

865: $q(u|v)=q(y)$, independently of $x=v$, $f(u,v)=f(y,x)=\ln[q(y)/q(y|x)]$, and $E=E_0$.

866: Similarly, the analysis of the missed--detection probability corresponds to

867: the assignments: $U_i=Y_i$, and $v_i=x_i$, $i=1,\ldots,n$, as before, but now

868: $q(u|v)=q(y|x)$, $f(u,v)=f(y,x)=\ln[q(y|x)/q(y)]$ and $E=-E_0$.

869: Note that when $\{q(y)\}$

870: is the uniform distribution over $\calY$, the missed--detection probability

871: can also be interpreted as the probability of excess code--length of

872: an arithmetic lossless source code w.r.t.\ $\{q(y|x)\}$.

873:

874: Another situation of hypothesis testing that is related to our study in a similar manner is

875: one where the signal $x^n$ is always underlying the observations, but the decision to be made

876: is associated with two hypotheses regarding

877: the noise level, or the temperature. In this case, there is a certain

878: Hamiltonian $\calE_x(y)$ for each $x\in\calX$, and we assume a Boltzmann--Gibbs distribution

879: parametrized by the temperature

880: $$q(y|x,\beta)=\frac{e^{-\beta\calE_x(y)}}{\zeta_x(\beta)}$$

881: where

882: $$\zeta_x(\beta)=\sum_ye^{-\beta\calE_x(y)}.$$

883: Note that here $\zeta_x(\beta)$ is an ordinary partition function, without

884: weighting (cf.\ (\ref{wpf})). We shall also denote

885: $$\bar{\Sigma}(E)=\min_{\beta\ge 0}\left[\beta E

886: +\sum_{x\in\calX}p(x)\ln\zeta_x(\beta)\right].$$

887: As $\bar{\Sigma}(E)$ is induced by a convex combination of non-weighted

888: partition functions, it has the significance of the normalized logarithm

889: of the number of microstates with energy about $nE$. Thus, $k\cdot\bar{\Sigma}(E)$,

890: where $k$ is Boltzmann's constant, is the thermodynamic entropy.

891:

892: Given two values $\beta_1$ and $\beta_2$ (say, $\beta_1 > \beta_2$),

893: the hypotheses now are the following:

894: \begin{itemize}

895: \item[$H_1:$] $Y^n$ is

896: distributed according to $q_1(y^n|x^n)=\prod_{i=1}^nq(y_i|x_i,\beta_1)$.

897: \item[$H_2:$] $Y^n$ is

898: distributed according to $q_2(y^n|x^n)=\prod_{i=1}^nq(y_i|x_i,\beta_2)$.

899: \end{itemize}

900: The likelihood ratio test compares

901: $\sum_{i=1}^n\calE_{x_i}(Y_i)$ to a threshold, $nE_0$, and

902: decides in favor of $H_2$ if the threshold

903: is exceeded, otherwise, it favors $H_1$.

904: Here, $E_0$ should lie in the interval $(E_1,E_2)$,

905: where

906: $$E_i\dfn-\sum_{x\in\calX}p(x)\cdot\left[\frac{\partial\ln \zeta_x(\beta)}{\partial

907: \beta}\right]_{\beta=\beta_i},~~~i=1,2.$$

908: For convenience, let us assume now that

909: $E_i$, $i=0,1,2,$ and $\calE_x(y)$ already have units of energy, so

910: there is no need to have the constant $\epsilon_0$. In this

911: situation, the exponent of the error probability under $H_2$ is given by

912: $-\bar{S}(E_0)$, where

913: \begin{eqnarray}

914: \bar{S}(E_0)&=&\min_{\beta\ge 0}\left[\beta E_0

915: +\sum_{x\in\calX}p(x)\ln\left(\sum_{y\in\calY}q(y|x,\beta_2)

916: e^{-\beta\calE_x(y)}\right)\right]\nonumber\\

917: &=&\min_{\beta\ge 0}\left[\beta E_0+

918: \sum_{x\in\calX}p(x)\ln\left(\frac{\zeta_x(\beta+\beta_2)}

919: {\zeta_x(\beta_2)}\right)

920: \right]\nonumber\\

921: &=&\min_{\beta\ge 0}\left[\beta E_0+

922: \sum_{x\in\calX}p(x)\ln \zeta_x(\beta+\beta_2)-

923: \sum_{x\in\calX}p(x)\ln \zeta_x(\beta_2)

924: \right]\nonumber\\

925: &=&\min_{\beta\ge 0}\left[(\beta+\beta_2)E_0+

926: \sum_{x\in\calX}p(x)\ln \zeta_x(\beta+\beta_2)\right]-\beta_2E_0-

927: \sum_{x\in\calX}p(x)\ln \zeta_x(\beta_2)\nonumber\\

928: &=&\min_{\beta\ge \beta_2}\left[\beta E_0+

929: \sum_{x\in\calX}p(x)\ln \zeta_x(\beta)\right]+\beta_2(E_2-E_0)

930: -\left[\beta_2E_2+\sum_{x\in\calX}p(x)\ln \zeta_x(\beta_2)\right]\nonumber\\

931: &=&\min_{\beta\ge \beta_2}\left[\beta E_0+

932: \sum_{x\in\calX}p(x)\ln \zeta_x(\beta)\right]+\beta_2(E_2-E_0)\nonumber\\

933: & &-\min_{\beta\ge 0}\left[\beta E_2+\sum_{x\in\calX}p(x)\ln \zeta_x(\beta)\right]\nonumber\\

934: &=&\bar{\Sigma}(E_0)-\bar{\Sigma}(E_2)+\beta_2(E_2-E_0),

935: \end{eqnarray}

936: where we have used the fact that the achiever $\beta(E)$

937: of $\bar{\Sigma}(E)$ is a monotonically non-increasing function of $E$,

938: thus, $E_0 < E_2$ implies $\beta(E_0) \ge \beta(E_2)=\beta_2$,

939: and so, the global minimum over $\beta\ge 0$ is attained

940: for $\beta\ge\beta_2$ anyway.

941:

942: It then follows that the error exponent $I_2$ under $H_2$ is given by

943: \begin{eqnarray}

944: I_2&=&\bar{\Sigma}(E_2)-\bar{\Sigma}(E_0)-\beta_2(E_2-E_0)\nonumber\\

945: &=&\frac{1}{k}\left[k\bar{\Sigma}(E_2)-k\bar{\Sigma}(E_0)-

946: \frac{E_2-E_0}{T_2}\right]\nonumber\\

947: &=&\frac{1}{k}\int_{E_0}^{E_2}\left[\frac{1}{T(E)}-\frac{1}{T_2}\right]\mbox{d}E

948: \nonumber\\

949: &=&\frac{1}{k}\int_{T_0}^{T_2}\left(\frac{1}{T}-\frac{1}{T_2}\right)

950: \bar{C}(T)\mbox{d}T,

951: \end{eqnarray}

952: where $T(E)=1/(k\beta(E))$ is the temperature corresponding to

953: energy $E$, $T_i=T(E_i)$, $i=0,1,2$, and $\bar{C}(T)=\mbox{d}E/\mbox{d}T$

954: is the average heat capacity per particle of the system, which is

955: the weighted average of heat capacities of all subsystems, i.e.,

956: $$\bar{C}(T)=\sum_{x\in\calX}p(x)C_x(T),$$

957: where

958: $$C_x(T)=\frac{\mbox{d}E_x}{\mbox{d}T}=

959: \frac{1}{kT^2}\left[\frac{\mbox{d}^2\ln \zeta_x(\beta)}{\mbox{d}\beta^2}\right]_{\beta=

960: 1/(kT)}.$$

961: Thus,

962: $$I_2=\sum_{x\in\calX}p(x)\cdot

963: \frac{1}{k}\int_{T_0}^{T_2}\left(\frac{1}{T}-\frac{1}{T_2}\right)

964: C_x(T)\mbox{d}T,$$

965: which is interpreted as the weighted average of the relative contributions

966: of all subsystems, which all lie in the same temperature $T_0$.

967:

968: In a similar manner, the rate function $I_1$ of the probability

969: of error under $H_1$ is given by:

970: \begin{eqnarray}

971: I_1&=&\bar{\Sigma}(E_1)-\bar{\Sigma}(E_0)-\beta_1(E_1-E_0)\nonumber\\

972: &=&\frac{1}{k}\left[

973: k\bar{\Sigma}(E_1)-k\bar{\Sigma}(E_0)-\frac{E_1-E_0}{T_1}\right]\nonumber\\

974: &=&\frac{1}{k}\int_{E_1}^{E_0}\left[\frac{1}{T_1}-

975: \frac{1}{T(E)}\right]\mbox{d}E\nonumber\\

976: &=&\frac{1}{k}\int_{T_1}^{T_0}\left(\frac{1}{T_1}-

977: \frac{1}{T}\right)\bar{C}(T)\mbox{d}T.

978: \end{eqnarray}

979:

980: The expression in the square brackets

981: of the second line pertaining to $I_2$ has a simple graphical

982: interpretation (see Fig.~\ref{gen}): It is the vertical distance

983: (corresponding to the vertical line $E=E_0$) between the curve $\bar{\Sigma}(E)$

984: and the line tangent to that curve at $E=E_2$ (whose slope is $\beta_2=

985: \beta(E_2)$). The two other expressions of $I_2$, in the

986: last chain of equalities, describe the error exponent $I_2$ in terms

987: of slow heating from temperature $T_0$ to temperature $T_2$.

988: Similar comments apply to $I_1$ (cf.\ Fig.~\ref{gen}).

989: Thus, the error exponents

990: are linear functionals of the average heat capacity, $\bar{C}(T)$,

991: in the range of temperatures $[T_1,T_2]$.

992: The higher is the heat capacity, the better is the discrimination

993: between the hypotheses. This is related to the fact that Fisher information

994: of the parameter $\beta$ is given by

995: $$J(\beta)=\sum_{x\in\calX}p(x)\frac{\mbox{d}^2\ln \zeta_x(\beta)}{\mbox{d}\beta^2}=

996: kT^2\bar{C}(T),$$

997: namely, again, a linear function of $\bar{C}(T)$.

998: However, while the Fisher information

999: depends only on one local value of $\bar{C}(T)$

1000: (as it measures the sensitivity of the

1001: likelihood function to the parameter in a local manner), the error exponents

1002: depend on $\{\bar{C}(T): T_1\le T\le T_2\}$

1003: in a cumulative manner, via the above integrals.

1004: The tradeoff between $I_1$ and $I_2$ is also obvious: by enlarging the

1005: threshold $E_0$, or, correspondingly, $T_0$,

1006: the range of integration pertaining to

1007: $I_1$ increases at the expense of the one of $I_2$ and vice versa. In the

1008: extreme case, where $I_2=0$, we get

1009: $$I_1=D(P_2\|P_1)=

1010: \frac{1}{k}\int_{T_1}^{T_2}\left(\frac{1}{T_1}-

1011: \frac{1}{T}\right)\bar{C}(T)\mbox{d}T.$$

1012:

1013: \begin{figure}[ht]

1014: \hspace*{1cm}\input{graph2.pstex_t}

1015: \caption{Entropy as function of energy and a graphical representation

1016: of error exponents.}

1017: \label{gen}

1018: \end{figure}

1019:

1020: \subsection{Error Exponents of Time--Varying Scalar Quantizers}

1021:

1022: In this application example, we are back to the problem area

1023: of lossy data compression, but this time, it is about scalar (symbol--by--symbol)

1024: compression. This setup is motivated by earlier results about the optimality

1025: of time--shared scalar quantizers within the class of

1026: causal source codes for memoryless sources, both under

1027: the average rate/distortion criteria \cite{NG82} and large--deviations performance

1028: criteria \cite{MK03}. In particular, it was shown that

1029: under both criteria, optimum time--sharing

1030: between at most two (entropy coded) scalar quantizers

1031: is as good as any causal source code for memoryless sources.

1032: Here, we will focus on the large deviations performance criteria, namely,

1033: source coding exponents.

1034:

1035: Consider a time--varying scalar quantizer $\hX_i=f_i(X_i)$, acting on a DMS

1036: $X_1,X_2,\ldots$, $X_i\in\calX$, drawn from $q$,

1037: where $\{f_i\}$ is an arbitrary (deterministic) sequence

1038: of quantizers from a given finite set $\calF=\{F_1,\ldots,F_S\}$,

1039: where $F_s:\calX\to\hat{\calX}_s$, $\hat{\calX}_s$ being the reproduction alphabet

1040: corresponding to $F_s$, $s=1,\ldots,S$. In other words, for every $i=1,2,\ldots,n$,

1041: $f_i=F_{s_i}$, for a certain arbitrary sequence of `states',

1042: $s_1,s_2,\ldots$ (known to the decoder) with components in $\calS=\{1,2,\ldots,S\}$.

1043:

1044: The distortion incurred by such a time--varying scalar quantizer, over $n$ units of time, is

1045: $\sum_{i=1}^nd(X_i,f_i(X_i))=

1046: \sum_{i=1}^nd(X_i,F_{s_i}(X_i))$. The total code length is $\sum_{i=1}^n L_{s_i}(F_{s_i}(X_i))$,

1047: where the per--symbol length functions

1048: $L_{s}(\cdot)$ may correspond to either fixed--rate coding, where $L_s(\hx)=R_s\dfn

1049: \lceil\log|\hat{\calX}_s|\rceil$ for all $\hx$,

1050: or any other length function satisfying the Kraft

1051: inequality, $\sum_{\hx\in\hat{\calX}_s}2^{-L_s(\hx)}\le 1$.

1052: For the sake of simplicity of the exposition, let us assume fixed--rate coding.

1053: We will denote by $n_s$, $s\in\calS$, the number of times that $s_i=s$

1054: occurs in $s^n$, and $p(s)=n_s/n$ is the corresponding relative frequency.

1055:

1056: In \cite{MK03}, among other results, the rate function of the excess distortion event

1057: $$\sum_{i=1}^nd(X_i,F_{s_i}(X_i)) > nD,~~~~

1058: D> \sum_{(x,s)\in\calX\times\calS}q(x)p(s)d(x,F_s(x))$$

1059: was optimized across the class of all time--varying scalar quantizers (each one

1060: corresponding to a different sequence $s_1,\ldots,s_n$) subject to a code--length

1061: constraint $\sum_{i=1}^nR_{s_i}\le nR$, or equivalently, $\sum_{s\in\calS}n_sR_s\le nR$,

1062: for a given pair $(D,R)$.

1063:

1064: In the notation of our generic model, here we have $U_i=X_i$, $v_i=s_i$, $i=1,\ldots,n$,

1065: $q(u|v)=q(x|s)=q(x)$ independently of $s$,

1066: $f(u,v)=f(x,s)=-d(x,F_s(x))$, and $E=-D$.\footnote{One

1067: may prefer to redefine

1068: $f(x,s)=D_{\max}-d(x,F_s(x))$ and $E=D_{\max}-D$,

1069: where $D_{\max}\dfn\max_{x,s}d(x,F_s(x))$, in order to

1070: work with non--negative quantities.} The excess distortion exponent is of the same form

1071: as before (see also \cite{MK03}).

1072: Here, however, unlike the previous application examples, we have a degree

1073: of freedom to select the relative frequency of usage, $p(s)$, of each member of $\calF$,

1074: i.e., the time--sharing protocol, but we also have the constraint $\sum_sp(s)R_s\le R$.

1075:

1076: From the statistical physics point of view, these additional ingredients mean that

1077: we have a freedom to select the number of particles in each subsystem

1078: (though the total number, $n$, is still fixed), and the additional

1079: constraint, $\sum_sp(s)R_s\le R$, which is actually equivalent to the equality constraint

1080: $\sum_sp(s)R_s= R$ (in the interesting region of $(R,D)$ pairs) can be viewed as an additional

1081: conservation law with respect to some other

1082: constant of motion, in addition to the energy (e.g., the momentum), where in

1083: subsystem $s$, the (average) value of the corresponding physical quantity

1084: per particle is $R_s$.

1085:

1086: While in \cite{MK03}, we have considered the problem of maximizing the rate function

1087: (the source coding exponent) of the excess distortion event

1088: $\sum_{i=1}^nd(X_i,F_{s_i}(X_i)) > nD$, a related objective (although somewhat

1089: less well motivated, but still interesting) is to minimize the rate function

1090: (or maximize the probability) of the small distortion event

1091: $$\sum_{i=1}^nd(X_i,F_{s_i}(X_i)) < nD,~~~

1092: D < \sum_{(x,s)\in\calX\times\calS}q(x)p(s)d(x,F_s(x)).$$

1093: In this case, the optimum performance is given by

1094: $$F(R,D)=\max_{P\in \calP(R)}\min_{\beta\ge 0}\left[\beta D+\sum_{s=1}^Sp(s)\ln

1095: \left(\sum_{x\in\calX}q(x)e^{-\beta d(x,F_s(x))}\right)\right],$$

1096: where $\calP(R)$ is the class of all probability distributions $P=\{p(s),~s\in\calS\}$ with

1097: $\sum_sp(s)R_s\le R$. From the viewpoint of statistical physics, this corresponds

1098: to a situation where the various subsystems are allowed to interact, not only thermally,

1099: but also chemically, i.e., an exchange of particles is enabled in addition to the exchange of

1100: energy, and the maximization over $\calP(R)$ (maximum entropy) is achieved when the

1101: chemical potentials of the various subsystems reach a balance. As the maximization over

1102: $P\in\calP(R)$ subject to the constraint $\sum_sp(s)R_s\le R$, for a given $\beta$,

1103: is a linear programming

1104: problem with one constraint (in addition to $\sum_sp(s)=1$), then as was shown in

1105: \cite{MK03}, for each distortion level (or energy) $D$, the optimum $P\in\calP(R)$ may be

1106: non--zero for at most two members of $\calS$ only, which means that at most two subsystems

1107: are populated by particles in thermal and chemical equilibrium under the two conservation

1108: laws (of $D$ and of $R$). However, the choice of these two

1109: members of $\calS$ depends, in general,

1110: on $D$, which in turn depends on the temperature. Thus, when

1111: the system is heated gradually, certain {\it phase transitions}

1112: may occur, whenever

1113: there is a change in the choice of the two populated subsystems.

1114:

1115: Finally, referring to comment no.\ 1 of Section 3, we should point out that here,

1116: in contrast to our discussion thus far, the difference between the ensemble of

1117: distinguishable particles and indistinguishable particles becomes critical since the

1118: factors $\{n_s!\}$ are no longer constant. Had we assumed indistinguishability, the

1119: normalized log--partition function

1120: would no longer be affine in $P$, thus the maximization over $P$

1121: would no longer be a linear programming problem, and the conclusion might have been

1122: different. In the source coding problem, the indistinguishable case corresponds to

1123: a situation where the sequence of states $s^n$ is chosen uniformly at random

1124: (with the decoder being informed of the result

1125: of the random selection, of course). In this case,

1126: the Chernoff bound corresponding to each composition $\{n_s,~s\in\calS\}$ of $s^n$

1127: should be weighed by the probability of this composition, which is

1128: $S^{-n}n!/\prod_sn_s!$. Now, each factor of $1/n_s!$ can be

1129: absorbed in the corresponding

1130: partition function $Z_s(\beta)$ of subsystem $s$, with the interpretation

1131: that in each subsystem the particles are now indistinguishable. The maximum over $P$ would

1132: now correspond to the dominant contribution in this weighted average of Chernoff bounds.

1133: One can, of course, extend the discussion to any i.i.d.\ distribution on $s^n$, thus

1134: introducing additional bias and preferring some compositions over others.

1135:

1136: \section*{Appendix}

1137: \renewcommand{\theequation}{A.\arabic{equation}}

1138:     \setcounter{equation}{0}

1139:

1140: \subsection*{A.1. Sketch of an Alternative Proof of Theorem 1 via Chernoff Bounds}

1141:

1142: In this subsection,

1143: we outline another proof of Theorem 1

1144: using a large deviations analysis approach. In particular,

1145: consider the large deviations event $\sum_{i=1}^nf(U_i,v_i)\le nE$,

1146: as described in Section 2.

1147: Assuming that the relative frequencies $\{p(v)\}$ all stabilize

1148: as $n\to\infty$, let us compute the rate function $I(E)$

1149: of the probability of this event in two different methods, where one would yield

1150: the left--hand side of (\ref{identity}) and the other would give the right--hand

1151: side of (\ref{identity}).

1152:

1153: In the first method,

1154: we partition the sequence $v^n$ according to its different letters.

1155: Specifically, let

1156: $$E_v\dfn\frac{1}{n_v}\sum_{i:v_i=v}f(U_i,v),$$

1157: where $n_v$ is the number of occurrences of the symbol $v\in\calV$ along $v^n$.

1158: Let $\calG$ denote the set of all possible vector values that

1159: can be taken on by the vector $\bar{E}=\{E_v,~v\in\calV\}$.

1160: Now, obviously, $\sum_{i=1}^n f(U_i,v_i)\le nE$

1161: if and only if there exists a

1162: vector $\tilde{E}=\{\tilde{E}_v,~v\in\calV\}\in\calG$

1163: such that $E_v\le \tilde{E}_v$ for all $v\in\calV$ and

1164: $\sum_{v\in\calV}p(v)\tilde{E}_v\le E$.

1165: The ``if'' part follows from

1166: $$\sum_{i=1}^n f(U_i,v_i)=n\sum_{v\in\calV}p(v)E_v\le

1167: n\sum_{v\in\calV}p(v)\tilde{E}_v\le nE.$$

1168: The ``only if'' part follows by setting $\tilde{E}_v=E_v$ for all $v\in\calV$.

1169: Therefore, denoting

1170: $\calH_G(E)=\calH_0(E)\bigcap\calG$ (where $\calH_0(E)$ is defined as in Section 2), we have:

1171: \begin{eqnarray}

1172: \label{ub1}

1173: \mbox{Pr}\left\{\sum_{i=1}^n f(U_i,v_i)\le nE\right\}&=&

1174: \mbox{Pr}\bigcup_{\tilde{E}\in\calH_G(E)}\left\{\sum_{i:v_i=v}

1175: f(U_i,v)\le n_v\tilde{E}_v,~~v\in\calV\right\}\nonumber\\

1176: &\le&\sum_{\tilde{E}\in\calH_G(E)}\mbox{Pr}\left\{\sum_{i:v_i=v}

1177: f(U_i,v)\le n_v\tilde{E}_v,~~v\in\calV\right\}\nonumber\\

1178: &=&\sum_{\tilde{E}\in\calH_G(E)}\prod_{v\in\calV}

1179: \mbox{Pr}\left\{\sum_{i:v_i=v} f(U_i,v)\le n_v\tilde{E}_v\right\}\nonumber\\

1180: &\le&|\calH_G(E)|\cdot\max_{\tilde{E}\in\calH_G(E)}\prod_{v\in\calV}

1181: \mbox{Pr}\left\{\sum_{i:v_i=v} f(U_i,v)\le n_v\tilde{E}_v\right\}\nonumber\\

1182: &\le&|\calG|\cdot\max_{\tilde{E}\in\calH_G(E)}\prod_{v\in\calV}

1183: \mbox{Pr}\left\{\sum_{i:v_i=v} f(U_i,v)\le n_v\tilde{E}_v\right\},

1184: \end{eqnarray}

1185: and on the other hand,

1186: \begin{eqnarray}

1187: \label{lb1}

1188: \mbox{Pr}\left\{\sum_{i=1}^n f(U_i,v_i)\le nE\right\}&=&

1189: \mbox{Pr}\bigcup_{\tilde{E}\in\calH_G(E)}\left\{\sum_{i:v_i=v}

1190: f(U_i,v)\le n_v\tilde{E}_v,~~v\in\calV\right\}\nonumber\\

1191: &\ge&\max_{\tilde{E}\in\calH_G(E)}\mbox{Pr}\left\{\sum_{i:v_i=v}

1192: f(U_i,v)\le n_v\tilde{E}_v,~~v\in\calV\right\}\nonumber\\

1193: &=&\max_{\tilde{E}\in\calH_G(E)}\prod_{v\in\calV}

1194: \mbox{Pr}\left\{\sum_{i:v_i=v} f(U_i,v)\le n_v\tilde{E}_v\right\}.

1195: \end{eqnarray}

1196: At this point, the only gap between the upper bound (\ref{ub1}) and

1197: the lower bound (\ref{lb1}) is the factor $|\calG|$. The number of different

1198: values that $\tilde{E}_v$ can take does not exceed the number of different

1199: type classes of sequences of length $n_v$ over the alphabet $\calU$,

1200: which is upper bounded by $(n_v+1)^{|\calU|-1}$.

1201: Thus,

1202: \begin{eqnarray}

1203: |\calG|&\le&\prod_{v\in\calV}[n_v+1]^{|\calU|-1}\nonumber\\

1204: &=&\exp\left\{(|\calU|-1)\sum_v\log(n_v+1)\right\}\nonumber\\

1205: &=&\exp\left\{|\calV|\cdot(|\calU|-1)\sum_v\frac{1}{|\calV|}

1206: \log(n_v+1)\right\}\nonumber\\

1207: &\le&\exp\left\{|\calV|\cdot(|\calU|-1)\log\left(\sum_v\frac{1}{|\calV|}

1208: [n_v+1]\right)\right\}\nonumber\\

1209: &=&\exp\left\{|\calV|\cdot(|\calU|-1)\log\left(

1210: \frac{n}{|\calV|}+1\right)\right\}\nonumber\\

1211: &=&\left(\frac{n}{|\calV|}+1\right)^{|\calV|\cdot(|\calU|-1)},

1212: \end{eqnarray}

1213: and therefore $|\calG|$ is only polynomial in $n$, and hence does not affect the

1214: exponential behavior. Now, each one of the terms

1215: $\mbox{Pr}

1216: \{\sum_{i:v_i=v} f(U_i,v)\le n_v\tilde{E}_v\}$ is bounded

1217: exponentially tightly by an individual Chernoff bound,

1218: $$\exp\left\{n_v\min_{\beta\ge 0}\left[\beta

1219: \tilde{E}_v+\ln\left(\sum_uq(u|v)e^{-\beta f(u,v)}

1220: \right)\right]\right\},$$

1221: and so, the dominant term of their product is of the exponential order of

1222: $$\max_{\tilde{E}\in\calH_G(E)}\sum_vp(v)\cdot

1223: \min_{\beta\ge 0}\left[\beta \tilde{E}_v+\ln\left(\sum_uq(u|v)e^{-\beta f(u,v)}

1224: \right)\right]=\max_{\tilde{E}\in\calH_G(E)}\sum_vp(v)S_v(\tilde{E}_v).$$

1225: Finally, as $n_v\to\infty$, the set $\calH_G(E)$ becomes dense in the continuous set $\calH_0(E)$,

1226: and by simple continuity arguments, the maximum over $\calH_G(E)$ tends to the maximum over $\calH_0(E)$.

1227:

1228: The other method to evaluate the rate function $I(E)$

1229: is as follows. Let $\ell$ be a fixed positive integer that divides $n$,

1230: and denote $\ell_v=\ell p(v)$, $v\in\calV$ (assume that $\ell$ is chosen large enough that

1231: $\ell p(v)$ is well approximated by the closest integer with a very small relative error).

1232: Now, re--order the pairs $\{(U_i,v_i)\}$

1233: (periodically), according to the following rule:

1234: Assuming, without loss of generality, that

1235: $\calV=\{1,2,\ldots,|\calV|\}$, the first

1236: $\ell_1=\ell p(1)$ symbol pairs of each $\ell$--block of $(u^n,v^n)$

1237: are such that $v=1$, the next $\ell_2=\ell p(2)$ symbol pairs

1238: of each $\ell$--block are such that

1239: $v=2$, and so on. In other words,

1240: each $\ell$--block, $v_{(i-1)\ell+1}^{i\ell}=(v_{(i-1)\ell+1},

1241: v_{(i-1)\ell+2},\ldots,v_{i\ell})$, $i=1,2,\ldots,n/\ell$,

1242: consists of the same relative frequencies $\{p(v)\}$

1243: as the entire sequence, $v^n$. Now, for

1244: the re--ordered sequence of pairs, let us define

1245: $X_i=\sum_{t=(i-1)\ell+1}^{i\ell}f(U_t,v_t)$,

1246: $i=1,2,\ldots,n/\ell$.

1247: Obviously, $X_1,X_2,\ldots,X_{n/\ell}$ are i.i.d.\ and therefore

1248: the probability of the large deviations event $\{\sum_{i=1}^{n/\ell}X_i \le \frac{n}{\ell}\cdot

1249: \ell E\}$ can be assessed exponentially tightly by the Chernoff bound

1250: as follows:

1251: \begin{eqnarray}

1252: &&\exp\left\{\frac{n}{\ell}\cdot\min_{\beta\ge 0}\left[\beta\cdot\ell E+\ln\left(

1253: \sum_{u^\ell\in\calU^\ell}q(u^\ell|v^\ell)\exp\left\{-\beta

1254: \sum_{i=1}^{\ell}f(u_i,v_i)\right\}\right)\right]

1255: \right\}\nonumber\\

1256: &=&\exp\left\{\frac{n}{\ell}\cdot\min_{\beta\ge 0}

1257: \left[\beta\cdot\ell E+\ln\left(\prod_{v\in\calV}

1258: \sum_{u^{\ell_v}}q(u^{\ell_v}|v^{\ell_v})

1259: \exp\left\{-\beta\sum_{i=1}^{\ell_v}f(u_i,v)\right\}\right)\right]

1260: \right\}\nonumber\\

1261: &=&\exp\left\{\frac{n}{\ell}\cdot\min_{\beta\ge 0}

1262: \left[\beta\cdot\ell E+\ln\left(\prod_{v\in\calV}

1263: \left[\sum_{u\in\calU}q(u|v)e^{-\beta f(u,v)}\right]^{\ell_v}\right)\right]

1264: \right\}\nonumber\\

1265: &=&\exp\left\{\frac{n}{\ell}\cdot\min_{\beta\ge 0}

1266: \left[\beta\cdot\ell E+\ell\cdot\sum_{v\in\calV}

1267: p(v)\ln\left(\sum_{u\in\calU}q(u|v)e^{-\beta f(u,v)}\right)\right]

1268: \right\}\nonumber\\

1269: &=&\exp\left\{n\cdot\min_{\beta\ge 0}\left[\beta E+\sum_{v\in\calV}

1270: p(v)\ln\left(\sum_{u\in\calU}q(u|v)e^{-\beta f(u,v)}\right)\right]

1271: \right\}\nonumber\\

1272: &=&e^{n\bar{S}(E)}.

1273: \end{eqnarray}

1274: Since both approaches yield exponentially tight

1275: evaluations of $I(E)$, they must be equal.

1276:

1277: \subsection*{A.2. A More Rigorous Derivation of Eq.\ (\ref{equipartition})}

1278:

1279: The exact derivation of eq.\ (\ref{equipartition}) for the finite

1280: interval integration, is as follows:

1281: \begin{eqnarray}

1282: \epsilon_0 D&=&-\frac{\partial}{\partial \beta}\ln

1283: \left[\int_{-A}^A

1284: \exp\{-\beta\epsilon_0|\hx-x|^\theta\} \mbox{d}\hx\right]\nonumber\\

1285: &=&-\frac{\partial}{\partial \beta}\ln\left[

1286: \beta^{-1/\theta}\cdot\int_{-\beta^{1/\theta}(A+x)}^{\beta^{1/\theta}(A-x)}

1287: \exp\{-\epsilon_0|\beta^{1/\theta}(\hx-x)|^\theta\}

1288: \mbox{d}(\beta^{1/\theta}(\hx-x))\right]\nonumber\\

1289: &=&-\frac{\partial}{\partial \beta}\ln\left[

1290: \beta^{-1/\theta}\cdot\int_{-\beta^{1/\theta}(A+x)}^{\beta^{1/\theta}(A-x)}

1291: \exp\{-\epsilon_0|z|^\theta\}

1292: \mbox{d}z\right]\nonumber\\

1293: &=&-\frac{\partial}{\partial \beta}\ln

1294: \left(\beta^{-1/\theta}\right)-\frac{\partial}{\partial \beta}\ln

1295: \left[\int_{-\beta^{1/\theta}(A+x)}^{\beta^{1/\theta}(A-x)}

1296: \exp\{-\epsilon_0|z|^\theta\}\mbox{d}z\right]\nonumber\\

1297: &=&\frac{1}{\beta\theta}\left\{1-\frac{\beta^{1/\theta}

1298: [(A-x)\exp\{-\beta\epsilon_0|A-x|^\theta\}

1299: +(A+x)\exp\{-\beta\epsilon_0|A+x|^\theta\}]}

1300: {\int_{-\beta^{1/\theta}(A+x)}^{\beta^{1/\theta}(A-x)}

1301: \exp\{-\epsilon_0|z|^\theta\}\mbox{d}z}\right\}.

1302: \end{eqnarray}

1303: When $\beta$ is very large, the denominator

1304: of the second term of the expression

1305: in the curly brackets of the right--most side, goes to

1306: $\int_{-\infty}^{\infty}

1307: \exp\{-\epsilon_0|z|^\theta\}\mbox{d}z$, which is a constant.

1308: Now if, in addition, $|x|<A$, then the numerator

1309: tends to zero as $\beta$ grows without bound.

1310: Thus, the dominant term, for low temperatures, is $1/(\beta\theta)=kT/\theta$.

1311:

1312: An exact closed--form expression, for every finite $\beta$, can be derived

1313: for the case $\theta=1$, since in this case, the integral at the denominator has a simple

1314: expression. For example, setting $\theta=1$, and $x=0$ in the above expression, yields:

1315: \begin{eqnarray}

1316: D&=&\frac{1}{\beta\epsilon_0}-\frac{A}{e^{\beta\epsilon_0A}-1}\nonumber\\

1317: &=&\frac{kT}{\epsilon_0}-\frac{A}{e^{\epsilon_0A/(kT)}-1}.

1318: \end{eqnarray}

1319: Note that this expression is valid only in the range where it is monotonically

1320: increasing in $T$. (Beyond this point, the minimizing $\beta$ is no longer the

1321: point of zero derivative).

1322:

1323: \begin{thebibliography}{AA}

1324:

1325: \bibitem{AB01}

1326: R.~Albert and A.-L.~Barab\'asi, ``Statistical mechanics of complex networks,'' %SPI-042

1327: arXiv:cond-mat/0106096, June 6, 2001.

1328:

1329: \bibitem{Balakirsky95}

1330: V.~B.~Balakirsky, ``A converse coding theorem

1331: for mismatched decoding at the output of binary-input memoryless channels,''

1332: {\it IEEE Trans.\ Inform.\ Theory}, vol.\ 41, no.\ 6, pp.\ 1889--1902,

1333: November 1995.

1334:

1335: \bibitem{Berger71}

1336: T.~Berger, {\sl Rate distortion theory: a mathematical basis for data compression},

1337: Prentice--Hall, Inc., Englewood Cliffs, NJ, 1971.

1338:

1339: \bibitem{Burg75}

1340: J.~P.~Burg, {\it Maximum entropy spectral analysis}, Ph.D.\ thesis, Department of

1341: Geophysics, Stanford University, Stanford, CA, 1975.

1342:

1343: \bibitem{CT91}

1344: T.~M.~Cover and J.~A.~Thomas, {\it Elements of Information Theory}, (first edition),

1345: John Wiley \& Sons, Inc., New York, 1991.

1346:

1347: \bibitem{CK81}

1348: I. Csisz\' ar and J.

1349: K\" orner, {\sl Information theory: coding theorems for discrete

1350: memoryless systems}, New York: Academic, 1981.

1351:

1352: \bibitem{CN95}

1353: I.~Csisz\'ar and P.~Narayan, ``Channel capacity for a given decoding

1354: metric,'' {\it IEEE Trans.\ Inform.\ Theory}, vol.\ 41, no.\ 1,

1355: pp.\ 35--43, January 1995.

1356:

1357: \bibitem{Ellis85}

1358: R.~S.~Ellis, {\it Entropy,

1359: large deviations, and statistical mechanics}, %SPI-054

1360: Springer--Verlag, NY, 1985.

1361:

1362: \bibitem{Ellis06}

1363: R.~S.~Ellis, ``The theory of large deviations and applications to statistical mechanics,''

1364: lectures for international

1365: seminar on extreme events in complex dynamics, October 2006.

1366: Available on--line at:

1367: [http://www.math.umass.edu/$\sim$rsellis/pdf-files/Dresden-lectures.pdf].

1368:

1369: \bibitem{Gray90}

1370: R.~M.~Gray, {\sl Source coding theory}, Kluwer Academic Publishers, 1990.

1371:

1372: \bibitem{GGRS81}

1373: R. M. Gray, A. H. Gray, G. Rebolledo, and J. E. Shore, ``Rate distortion %MDI-003

1374: speech coding with a minimum discrimination information distortion measure'',

1375: {\em IEEE Trans.~Inform.~Theory\/},

1376: vol.~IT--27, no.~6, pp.~708--721, November 1981.

1377:

1378: \bibitem{GN98}

1379: R.~M.~Gray and D.~L.~Neuhoff, ``Quantization,''

1380: {\em IEEE Trans.~Inform.~Theory\/}, vol.\ 44, no.\ 6, pp.\ 2325--2383, October 1998.

1381:

1382: \bibitem{GV02}

1383: D.~Guo and S.~Verd\'u, ``Multiuser detection and statistical physics,'' %SPI-006

1384: in {\it Communications, Information and Network Security},

1385: V.~Bhargava, H.~V.~Poor, V.~Tarokh, and S.~Yoon, Eds., Chap.\ 13, pp.\ 229--277,

1386: Kluwer Academic Publishers, Norwell, Mass, USA, 2002.

1387:

1388: \bibitem{HK05}

1389: T.~Hosaka and Y.~Kabashima, ``Statistical mechanical approach to error exponents %SPI-024

1390: of lossy data compression,'' {\it J.~Physical Society of Japan}, vol.\ 74, no.\ 1,

1391: pp.\ 488--497, January 2005.

1392:

1393: \bibitem{Jaynes57a}

1394: E.~T.~Jaynes, ``Information theory and statistical mechanics,'' %MDI-013

1395: {\it Phys.\ Rev.\ A}, vol.\ 106, pp.\ 620--630, May 1957.

1396:

1397: \bibitem{Jaynes57b}

1398: E.~T.~Jaynes, ``Information theory and statistical mechanics - II,'' %MDI-014

1399: {\it Phys.\ Rev.\ A}, vol.\ 108, pp.\ 171--190, October 1957.

1400:

1401: \bibitem{Jaynes82}

1402: E.~T.~Jaynes, ``On the rationale of maximum-entropy methods,'' %MDI-002

1403: {\em Proc. of the IEEE\/}, vol.~70, no.~9, pp.~939--952, September 1982.

1404:

1405: \bibitem{KH05}

1406: Y.~Kabashima and T.~Hosaka, ``Statistical mechanics for source coding with a fidelity %SPI-037

1407: criterion,'' {\it Progress of Theoretical Physics}, Supplement no.\ 157,

1408: pp.\ 197--204, 2005.

1409:

1410: \bibitem{KNM02}

1411: Y.~Kabashima, K.~Nakamura, and J.~van Mourik,

1412: ``Statistical mechanics of typical set decoding,'' %SPI-025

1413: {\it Physical Review E}, vol.\ 66, 2002.

1414:

1415: \bibitem{KabS99}

1416: Y.~Kabashima and D.~Saad, ``Statistical mechanics of error correcting codes,'' %SPI-005

1417: {\it Europhysics Letters}, vol.\ 45, no.\ 1, pp.\ 97--103, 1999.

1418:

1419: \bibitem{KSNS01}

1420: Y.~Kabashima, N.~Sazuka, K.~Nakamura, and D.~Saad, ``Tighter decoding %SPI-002

1421: reliability bound for Gallager's error--correcting code,'' {\it Physical Review E},

1422: vol.\ 64, pp.\ 046113-1--046113-4, 2001.

1423:

1424: \bibitem{KanS99}

1425: I.~Kanter and D.~Saad, ``Error--correcting codes that nearly saturate %SPI-001

1426: Shannon's bound,'' {\it Physical Review Letters}, vol.\ 83, no.\ 13,

1427: pp.\ 2660--2663, September 1999.

1428:

1429: \bibitem{Landauer61}

1430: R.~Landauer, ``Irreversibility and heat generation in the computing process,'' {\it IBM

1431: J.\ Res.\ Dev.}, vol.\ 5, pp.\ 183--191, 1961.

1432:

1433: \bibitem{Lapidoth94}

1434: A. Lapidoth, ``Mismatched decoding and the multiple access channel,'' %MAC-003

1435: Stanford Univ. Tech. Report, February 1994.

1436:

1437: \bibitem{LS96-2}

1438: A. Lapidoth and S. Shamai (Shitz), ``A lower bound on the bit-error %CCTT-022

1439: rate resulting from mismatched Viterbi decoding,''

1440: Technical Report, CC Pub No.~163,

1441: Department of Electrical Engineering, Technion -- I.I.T., August 1996.

1442:

1443: \bibitem{Maroney04}

1444: O.~J.~E.~Maroney, ``The (absence of a) relationship between thermodynamic and logical %SPI-044

1445: reversibility,'' arXiv:physics/0406137, June 27, 2004.

1446:

1447: \bibitem{McAllester}

1448: D.~McAllester, ``A statistical mechanics approach to large deviations theorems,'' %SPI-048

1449: preprint, 2006. Available on-line at: [http://citeseer.ist.psu.edu/443261.html].

1450:

1451: \bibitem{MKLS94}

1452: N. Merhav, G. Kaplan, A. Lapidoth, and S. Shamai (Shitz), ``On %CC-007

1453: information rates for mismatched decoders,''

1454: {\em IEEE Trans.~Inform.~Theory\/},

1455: vol.~IT--40, no.~6, pp.~1953--1967, November 1994.

1456:

1457: \bibitem{MK03}

1458: N.~Merhav and I.~Kontoyiannis, ``Source

1459: coding exponents for zero--delay coding with finite memory,''

1460: {\it IEEE Trans.\ Inform.\ Theory},

1461: vol.\ 49, no.\ 3, pp.\ 609--625,

1462: March 2003.

1463:

1464: \bibitem{MM06}

1465: M.~M\'ezard and A.~Montanari, {\it Constraint satisfaction networks in physics

1466: and computation}, draft, February 27, 2006.

1467: Available on--line at: [http://www.lptms.u-psud.fr/membres/mezard/].

1468:

1469: \bibitem{MR06}

1470: T.~Mora and O.~Rivoire, ``Statistical mechanics of error exponents for error--correcting %SPI-034

1471: codes,'' arXiv:cond-mat/0606696, June 2006.

1472:

1473: \bibitem{Murayama02}

1474: T.~Murayama, ``Statistical mechanics of the data compression theorem,'' %SPI-029

1475: {\it J.~Phys.\ A: Math.\ Gen.}, vol.\ 35, pp.\ L95--L100, 2002.

1476:

1477: \bibitem{NG82}

1478: D.~L.~Neuhoff and R.~K.~Gilbert, ``Causal source codes,'' %RDT-001

1479: {\em IEEE Trans.~Inform.~Theory\/},

1480: vol.~IT--28, no.~5, pp.~701--713, September 1982.

1481:

1482: \bibitem{Oono89}

1483: Y.~Oono, ``Large deviation and statistical physics,'' {\it Progress of Theoretical Physics %SPI-05

1484: Supplement}, no.\ 99, pp.\ 165--205, 1989.

1485:

1486: \bibitem{PV01}

1487: M.~B.~Plenio and V.~Vitelli,

1488: ``The physics of forgetting: Landauer's erasure principle and information %QIT-017

1489: theory,'' {\it Contemporary Physics}, vol.\ 42, no.\ 1, pp.\ 25--60, 2001.

1490:

1491: \bibitem{PS99}

1492: A.~Procacci and B.~Scoppola, ``Statistical mechanics approach to coding theory,'' %SPI-023

1493: {\it J.~of Statistical Physics}, vol.\ 96, nos.\ 3/4, pp.\ 907--912, 1999.

1494:

1495: \bibitem{RC00}

1496: I.~Rojdestvenski and M.~C.~Cottman, ``Mapping of statistical physics to information theory %SPI-030

1497: with application to biological systems,'' {\it J.~Theor.\ Biol.}, pp.\ 43--54, 2000.

1498:

1499: \bibitem{Rose94}

1500: K.~Rose, ``A mapping approach to rate-distortion computation and %RDT-010

1501: analysis,'' {\em IEEE Trans.~Inform.~Theory\/}, vol.\ 40, no.\ 6, pp.\ 1939--1952, November 1994.

1502:

1503: \bibitem{Shinzato}

1504: T.~Shinzato, ``Statistical physics and thermodynamics on large deviation,'' preprint. %SPI-052

1505: Available online at [http://www.sp.dis.titech.ac.jp/shinzato/LD.pdf].

1506:

1507: \bibitem{SJ80}

1508: J.~E.~Shore and R.~W.~Johnson, ``Axiomatic derivation of the principle of %MDI-008

1509: maximum entropy and the principle of minimum cross-entropy,''

1510: {\em IEEE Trans.~Inform.~Theory\/},

1511: vol.~IT--26, no.~1, pp.~26--37, January 1980.

1512:

1513: \bibitem{Sourlas89}

1514: N.~Sourlas, ``Spin--glass models as error--correcting codes,'' {\it Nature}, %SPI-003

1515: pp.\ 693--695, vol.\ 339, June 1989.

1516:

1517: \bibitem{Sourlas94}

1518: N.~Sourlas, ``Spin glasses, error--correcting codes and finite--temperature %SPI-004

1519: decoding,'' {\it Europhysics Letters}, vol.\ 25, pp.\ 159--164, 1994.

1520:

1521: \bibitem{Tanaka01}

1522: T.~Tanaka, ``Statistical mechanics of CDMA

1523: multiuser demodulation,'' {\it Europhysics Letters}, %SPI-016

1524: vol.\ 54, no.\ 4, pp.\ 540--546, 2001.

1525:

1526: \bibitem{Tanaka02}

1527: T.~Tanaka, ``A statistical--mechanics approach to large--system analysis of CDMA %SPI-014

1528: multiuser detectors,'' {\it IEEE Trans.\ Inform.\

1529: Theory}, vol.\ 48, no.\ 11, pp.\ 2888--2910, November 2002.

1530:

1531: \bibitem{WSW05}

1532: M.~J.~Wainwright, T.~S.~Jaakkola, and A.~S.~Willsky, ``A new class of upper bounds on %SPI-013

1533: the log partition function,''

1534: {\em IEEE Trans.~Inform.~Theory\/},

1535: vol.~51, no.~7, pp.~2313--2335, July 2005.

1536:

1537: \end{thebibliography}

1538: \end{document}

1539: