0411:cs0411014/rate.tex

1: %\documentclass[twocolumn,twoside]{IEEEtran}

2: \documentclass[draft,onecolumn]{IEEEtran}

3: \usepackage{amsmath,amstext,amssymb,epsf}

4: \usepackage[dvips]{graphicx}

5: %\newcommand{\epsffile}[1]{\includegraphics{#1}}

6: \date{}

7: %\setlength{\topmargin}{-0.6in}

8: %\setlength{\textwidth}{6.2in}

9: %\setlength{\oddsidemargin}{0.2in}

10: %\setlength{\evensidemargin}{0.2in}

11: %\setlength{\textheight}{9.4in}

12:

13: %\newenvironment{changekolya}{}{}

14: \newenvironment{changekolya}{[}{]}

15: %\newenvironment{changekolya}{$\Big[$}{$\Big]$}

16:

17: \newcommand{\All}{\mathcal U}

18: \newcommand{\Ham}{\mathcal H}

19: \newcommand{\Euc}{\mathcal E}

20: \newcommand{\dd}{l}

21: \newcommand{\tx}{\tilde x}

22: \newcommand{\dds}{m}

23: \newcommand{\ddd}{d}

24: \newcommand{\ddds}{d'}

25: \newcommand{\tir}{r}

26: \renewcommand{\le}{\leq}

27: \renewcommand{\ge}{\geq}

28: \renewcommand{\emptyset}{\varnothing}

29:

30: \newcommand{\dm}{D}

31: \newcommand{\comp}{s}

32: \newcommand{\A}{\mathcal A}

33: \newcommand{\BB}{\mathcal B}

34: \newcommand{\B}{\Delta}

35: \newcommand{\C}{\mathcal C}

36: \newcommand{\Q}{\mathcal Q}

37: \newcommand{\U}{\mathcal U}

38: \newcommand{\X}{\booln}

39: \newcommand{\Y}{\mathcal{Y}}

40: \newcommand{\dmax}{n}

41: \newcommand{\ymax}{y_0}

42: \newcommand{\dmin}{0}

43: \newcommand{\booln}{\{0,1\}^n}

44: \newcommand{\bool}{\{0,1\}^*}

45: %\newcommand{\dmin}{d_\text{min}}

46:

47: \newcommand{\K}{C}

48: \newcommand{\KE}{\textit{CE}}

49: \newcommand{\wh}[1]{\lfloor #1 \rfloor}

50: \newcommand{\wwh}[1]{\lceil #1 \rceil}

51: %\newcommand{\poly}{\text{poly}}

52: \newcommand{\eps}{\varepsilon}

53: \newcommand{\Time}{\textit{Time}}

54: \newcommand{\cd}{\textit{CD}}

55: \newcommand{\pair}[1]{\langle #1\rangle}

56: \newcommand{\prob}{\text{Prob}}

57: \newcommand{\ave}{\mathop{\bf E}}

58: %\newcommand{\N}{\mathbb{N}}

59: %\newcommand{\R}{\mathbb{R}}

60: %\newcommand{\Q}{\mathbb{Q}}

61: \newcommand{\len}[2]{l_{#1}(#2)}

62: \newcommand{\close}[2]{#1=\mathcal E(#2)}

63: \newcommand{\dclose}[2]{#1 \cong #2}

64: \newcommand{\last}{\textit{LAST}}

65: \newcommand{\least}{\textit{least}}

66: \newcommand{\bb}[1]{\textit{BB}(#1)}

67: \newcommand{\bbinv}[1]{{\textit{BB}}^{-1}(#1)}

68: \newcommand{\Loss}{{\rm Loss}}

69:

70: \newtheorem{theorem}{\sc Theorem}

71: \newtheorem{proposition}{\sc Proposition}

72: \newtheorem{lemma}{\sc Lemma}

73: \newtheorem{coro}{\sc Corollary}

74:

75: %\theoremstyle{remark}

76: %\newtheorem{example}{Example}

77: \newtheorem{nota}{\sc Notation}

78: \newtheorem{defin}{\sc Definition}

79: \newtheorem{rem}{\sc Remark}

80: \newtheorem{cla}{\sc Claim}

81: \newtheorem{ex}{\sc Example}

82: \newenvironment{comment}{\begin{small}\begin{quotation}\hspace{-0.23in}\rm}{\end{quotation}\end{small}}

83: %\newenvironment{proof}{\par \sc Proof.\rm}{\hspace*{\fill}$\Box$\vspace{1ex}}

84: \newenvironment{remark}{\begin{rem}}{\hspace*{\fill}$\Diamond$\end{rem}}

85: \newenvironment{example}{\begin{ex}}{\hspace*{\fill}$\diamondsuit$\end{ex}}

86: \newenvironment{claim}{\begin{cla}}{\end{cla}}

87: \newenvironment{corollary}{\begin{coro}}{\end{coro}}

88: \newenvironment{requirement}{\begin{req}}{\end{req}}

89: \newenvironment{definition}{\begin{defin}}{\end{defin}}

90: %\newenvironment{remark}{\begin{rem}}{\end{rem}}

91: \newenvironment{notation}{\begin{nota}}{\end{nota}}

92:

93:

94:

95: \title{Rate Distortion and Denoising of Individual Data

96: Using Kolmogorov Complexity}

97: \author{

98: Nikolai K. Vereshchagin\thanks{

99: NKV, Dept. Math. Logic \& Theor. Algor.,

100: Moscow State Univ., Russia. Email: nikolay.vereshchagin@gmail.com}

101: and

102: Paul M.B. Vit\'anyi\thanks{

103: PMBV, CWI, Science Park 123, 1098XG Amsterdam, the Netherlands.

104: Email: Paul.Vitanyi@cwi.nl}

105: }

106:

107: \begin{document}

108: \maketitle

109: \begin{abstract}

110: %Kolmogorov complexity can be used

111: %to obtain a rate-distortion theory and denoising of individual data

112: %(source words)

113: %taken to be finite binary strings. We prove for almost all distortion measures

114: %(i) different source words have different rate-distortion curves, and for

115: %every curve from a wide family there is a source word that yields

116: %this curve approximately;

117: %(ii) a Kolmogorov complexity

118: %characterization of the rate-distortion curve

119: %for individual source words in

120: %terms of algorithmic mutual information;

121: %(iii) if a destination word witnesses the rate-distortion curve of

122: %a given source word

123: %at a given rate, then this destination word captures

124: %as many properties of the source word as is possible

125: %at this rate;

126: %(iv) application of the last result to the denoising of

127: %corrupted individual source words; and

128: %(v) the relation between the expected rate-distortion

129: %curves of the individual source words and Shannon's rate-distortion curve.

130: We examine the structure of families of distortion balls from

131: the perspective of Kolmogorov complexity. Special attention is paid to

132: the canonical rate-distortion function of a source word

133: which returns the minimal Kolmogorov complexity of all distortion balls

134: containing that word subject to a bound on their cardinality. This canonical

135: rate-distortion function is related to the more standard

136: algorithmic rate-distortion function for the given distortion measure.

137: Examples are given of list distortion,

138: Hamming distortion, and Euclidean distortion.

139: The algorithmic rate-distortion function can behave

140: differently from Shannon's rate-distortion function.

141: To this end, we show that the canonical

142: rate-distortion function can and does assume a wide class of shapes

143: (unlike Shannon's); we relate low algorithmic mutual information

144: to low Kolmogorov complexity (and consequently suggest that certain aspects of the

145: mutual information formulation of Shannon's rate-distortion function

146: behave differently than would an analogous formulation using algorithmic

147: mutual information); we explore the notion that low Kolmogorov complexity

148: distortion balls containing a given word

149: capture the interesting properties of that word

150: (which is hard to formalize in Shannon's theory) and this

151: suggests an approach to denoising; and, finally, we show that

152: the different behavior of the rate-distortion curves

153: of individual source words to some extent

154: disappears after averaging over the source words.

155: \end{abstract}

156:

157: \section{Introduction}

158: \label{sect.rdsf}

159: Rate distortion theory analyzes the transmission and

160: storage of information at insufficient bit rates.

161: The aim is to minimize the resulting information loss

162: expressed in a given distortion measure.

163: The original data is called the `source word'

164: and the encoding used for transmission or storage

165: is called the `destination word.' The number of bits available

166: for a destination word is called the `rate.'

167: The choice of distortion

168: measure

169: is usually a selection of which aspects of the source word are relevant

170: in the setting at hand, and

171: which aspects are irrelevant (such as noise).

172: For example, in application to

173: lossy compression of a sound file this results

174: in a compressed file where, among others, the very high and

175: very low inaudible frequencies have been suppressed.

176: The distortion measure is chosen such that it penalizes

177: the deletion of the inaudible

178: frequencies but lightly because they are not

179: relevant for the auditory

180: experience. We study rate distortion of

181: individual source words using Kolmogorov complexity and show

182: how it is related to

183: denoising.

184: The classical probabilistic theory is

185: reviewed in Appendix~\ref{sect.ratedistortion}.

186: Computability notions are reviewed in Appendix~\ref{sect.computability}

187: and Kolmogorov complexity in Appendix~\ref{sect.kolmcompl}.

188: Randomness deficiency according to Definition~\ref{def.rd}

189: and its relation to the fitness of a destination word for

190: a source word is explained further in Appendix~\ref{sect.rd}.

191: Appendix~\ref{sect.exhamming} gives the proof, required

192: for a Hamming distortion example, that

193: every large Hamming ball can be covered by a

194: small number of smaller

195: Hamming balls (each of equal cardinality).

196: More specifically, the number of covering balls

197: is close to the ratio between the cardinality

198: of the large Hamming ball and the small Hamming ball.

199: The proofs of the theorems are deferred to Appendix~\ref{sect.proofs}.

200:

201: \subsection{Related Work}

202: In \cite{Ko74} A.N. Kolmogorov formulated the

203: `structure function' which can be viewed as a proposal

204: for non-probabilistic model

205: selection. This function and the associated Kolmogorov

206: sufficient statistics are partially treated in

207: \cite{Sh83,Vy87,GTV01}

208: and analyzed in detail in \cite{VV02}.

209: We will show that the structure function

210: approach can be generalized to give an approach to

211: rate distortion and denoising of

212: individual data.

213:

214: Classical rate-distortion theory

215: was initiated by Shannon in~\cite{Sh48}.

216: In~\cite{Sh59} Shannon gave a nonconstructive

217: asymptotic characterization of the expected rate-distortion curve of a

218: random variable

219: (Theorem~\ref{theo.shannon} in Appendix~\ref{sect.ratedistortion}).

220: References \cite{Be71,BG98} treat

221: more general distortion measures and random variables in the Shannon

222: framework.

223:

224: References~\cite{YS93,MK94,SE03} relate

225: the classical and algorithmic approaches according to traditional

226: information-theoretic concerns. We follow their definitions of

227: the rate-distortion function.

228: The results show that if the source word is obtained from random

229: i.i.d. sources, then with high probability and in expectation

230: its individual rate-distortion curve is close to

231: Shannon's single rate-distortion curve.

232: In contrast, our Theorem~\ref{theo.allshapesrd} shows that

233: for distortion measures satisfying properties 1 through 4

234: below

235: there are many different shapes of individual

236: rate-distortion functions related to the different

237: individual source words,

238: and many of them

239: are very different from Shannon's rate-distortion curve.

240:

241:

242: Also Ziv~\cite{Zi80} considers

243: a rate-distortion function for individual data.

244: The rate-distortion function is assigned to

245: every infinite sequence $\omega$ of letters of a finite alphabet $\Gamma$.

246: %(and not to a finite object, as in the present paper).

247: The source words $x$

248: are prefixes of $\omega$

249: and the encoding function is

250: computed by a finite state transducer.

251: Kolmogorov complexity is not involved.

252:

253: In \cite{Sa94,Na95,CYV97,Do02}

254: alternative approaches to denoising via compression

255: and in \cite{RV06,rum} applications of the current work

256: are given.

257:

258:  In \cite{VV02} Theorems~\ref{theo.allshapesrd}, \ref{th45} were obtained

259:  for a particular distortion measure relevant to model selection (the example

260: ${\cal L}$ in this paper).

261: The techniques used in that paper

262: do not generalize to prove the current theorems which concern

263: arbitrary distortion measures

264: satisfying certain properties

265: given below.

266:

267:

268: \subsection{Results}

269: A source word is taken to be a finite binary string.

270: Destination words are finite objects (not necessarily finite binary strings).

271: For every destination word encoding a particular source word with

272: a certain distortion, there is a finite set of source words that are

273: encoded by this destination word with at most that distortion.

274: %Therefore, we can loosely

275: %identify a destination word with the set of source words thus defined

276: %(if there are more than one such set we take the first one in

277: %lexicographical order).

278: We call these finite sets of source words `distortion balls.'

279: Our approach is based on the Kolmogorov complexity

280: of distortion balls. For every source word we

281: define its `canonical' rate-distortion function,

282: %independent of a distortion measure,

283: from which

284: the algorithmic rate-distortion function of that source word

285: %for a specific distortion measure

286: can be obtained by a simple

287: transformation

288: (Lemma~\ref{lem.rg}).

289:

290: Below we assume that a distortion measure

291: satisfies certain properties which are specified in the theorems

292: concerned.

293: In Theorem~\ref{theo.allshapesrd} it is shown that

294: there are distinct canonical rate-distortion curves (and hence distinct

295: rate-distortion curves) associated with

296: distinct source words (although some curves may coincide). Moreover,

297: every candidate curve from a given family of curves is

298: realized approximately as the

299: canonical rate-distortion curve (and hence for a related family

300: of curves every  curve is realized approximately as the

301: rate-distortion curve) of some

302: source word.

303: In Theorem~\ref{th-shannon-analog} we prove a Kolmogorov

304: complexity analogue for

305: Shannon's theorem, Theorem~\ref{theo.shannon}

306: in Appendix~\ref{sect.ratedistortion}, on the characterization

307: of the expected rate-distortion

308: curve of a random variable.

309: The new theorem states approximately the following:

310: For every source word and every destination word there exists

311: another destination word that has Kolmogorov complexity

312: equal to the algorithmic information in the first destination word about the

313: source word, up to a logarithmic additive term,

314: and both destination words incur the same distortion

315: with the source word. (The theorem is given in the distortion-ball formulation

316: of destination words.)

317: In Theorem~\ref{th45} we show that, at every rate,

318: the destination word incurring the least distortion

319: is in fact the `best-fitting' among all destination words at that rate.

320: `Best-fitting' is taken in the sense of sharing the most

321: properties with the source word.

322: (This notion of a `best-fitting' destination word for a

323: source word can be expressed in Kolmogorov complexity, but

324: not in the classic probabilistic framework. Hence there is no

325: classical analogue for this theorem.)

326: It turns out that this yields a method of denoising by compression.

327: Finally, in Theorem~\ref{thm.dresf}, we show that the expectation

328: of the algorithmic rate-distortion functions is

329: pointwise related to Shannon's rate-distortion function, where the closeness

330: depends on the Kolmogorov complexities involved and

331: ergodicity and stationarity of the source.

332:

333:

334:

335:

336: \section{Preliminaries}

337:

338: \subsection{Data and Binary Strings}

339: We write {\em string} to mean a finite binary string.

340:   Other finite objects can be encoded into strings in natural

341: ways.  The set of strings is denoted by $\{0,1\}^*$. The {\em length}

342: of a string $x$ is the number of bits in it denoted as $|x|$. The {\em empty}

343: %string $x$ has length $|x| = 0$.

344: string $\epsilon$ has length $|\epsilon| = 0$.

345: Identify the natural numbers

346: ${\cal N}$ (including 0) and $\{0,1\}^*$ according to the

347: correspondence

348:  \begin{equation}\label{order}

349:  (0, \epsilon ), (1,0), (2,1), (3,00), (4,01), \ldots .

350:  \end{equation}

351: Then, $|010|=3$.

352: The emphasis is on binary sequences only for convenience;

353: observations in every finite alphabet can be so encoded in a way

354: that is `theory neutral'. For example, if a finite alphabet $\Sigma$ has

355: cardinality $2^k$, then every element $i \in \Sigma$ can be encoded

356: by $\sigma(i)$ which

357: is a block of bits of length $k$. With this encoding every $x \in \Sigma^*$

358: satisfies that the Kolmogorov complexity

359: $\K(x)=\K(\sigma(x))$ (see Appendix~\ref{sect.kolmcompl} for basic definitions

360: and results on Kolmogorov complexity)

361: up to an additive constant that is

362: independent of $x$.

363:

364: \subsection{Rate-Distortion Vocabulary}

365: %Let ${\cal X}$ be the {\em source alphabet}

366: %consisting of a set of {\em source} objects

367: %called {\em words} or {\em messages}.

368: Let ${\cal X}$ be a set, called

369: the {\em source alphabet} whose elements are called

370: {\em source words} or {\em messages}.

371: We also use a set $\Y$ called the {\em destination alphabet},

372: whose elements are called {\em destination words}.

373: (The destination alphabet is also called the reproduction alphabet.)

374: In general there are no restrictions on the set

375: ${\cal X}$; it can be countable or uncountable.

376: However, for technical reasons, we assume ${\cal X}= \{0,1\}^*$.

377: On the other hand, it is important that the set $\Y$ consists

378: of {\em finite objects}: we need that the notion of Kolmogorov complexity

379: $\K(y)$ be defined for all $y\in\Y$.

380: (Again, for basic definitions and results on Kolmogorov complexity

381: see Appendix~\ref{sect.kolmcompl}.)

382: In this paper it is not essential

383: whether we use plain Kolmogorov complexity or the  prefix

384: variant; we use plain Kolmogorov complexity.

385:

386:

387: Suppose we want to communicate a source word

388: $x \in {\cal X}$ using a {\em destination word}

389: $y \in {\Y}$

390: that can be encoded in at most $r$ bits in the sense that

391: the Kolmogorov complexity $\K(y) \leq r$.

392: %(For example, if  $|{\Y}| \ll 2^r$.)

393: %If $x \in {\cal X}$, that is $x$ is a finite object,

394: %and the Kolmogorov complexity

395: %$\K(x) > r$,

396: %%or if $x$ is not a finite object in which case we define $\K(x)= \infty$,

397: %then $\K(y) \leq r < \K(x)$ for every destination word

398: %$y \in {\Y}$.

399: %Therefore, $x$ cannot be reproduced from any such $y$.

400: Assume furthermore that we are given

401: a {\em distortion}

402: function

403: $d: {\cal X} \times {\Y} \rightarrow {\cal R} \cup \{\infty\}$,

404: that measures the fidelity of the destination word

405: against the source word.

406: Here ${\cal R}$ denotes the nonnegative real numbers.

407:

408: \begin{definition}\label{def.rddr}

409: \rm

410: Let $x\in {\cal X} = \{0,1\}^*$ and ${\cal Q}$ denote the rational numbers.

411: The {\em rate-distortion function} $r_x: {\cal Q} \rightarrow {\cal N}$ is

412: the minimum number of bits in

413: a destination word $y$

414: to obtain a distortion of at most $\delta$ defined by

415: \[

416: r_x(\delta) = \min_{y \in {\Y}} \{\K(y) :  d(x,y)\le \delta\}.

417: \]

418: %The domain of $r_x$ is the set $\Q$ of

419: %rational numbers.

420: The `inverse' of the above function is

421: the {\em distortion-rate function} $d_x: {\cal N} \rightarrow {\cal R}$

422:  and is

423: defined by

424: \[

425: d_x (r) = \min_{y \in {\Y}}  \{d(x,y) :    \K(y) \leq r \}.

426: \]

427: %The domain of $d_x$ is $\cal N$.

428: \end{definition}

429: These functions are analogs for individual source words $x$ of

430: Shannon's rate-distortion

431: function defined in \eqref{eq.rndelta} and its related

432: distortion-rate function, expressing

433: the least expected rate or distortion at which outcomes

434: from a random source $X$ can be transmitted,

435: see Appendix~\ref{sect.ratedistortion}.

436:

437: \subsection{Canonical Rate-Distortion Function}

438:

439: Let ${\cal X}=\{0,1\}^*$ be the source

440: alphabet,

441: ${\Y}$ a destination

442: alphabet,

443: and $d$ a distortion measure.

444:

445: \begin{definition}\label{def.distball}

446: \rm

447: A {\em distortion ball} $B(y,\delta)$ centered on $y \in {\Y}$

448: with radius $\delta\in\cal Q$ is defined by

449: \[

450: B(y,\delta)= \{x \in {\cal X}: d(x,y) \leq \delta \},

451: \]

452: and its cardinality is denoted by $b(y,\delta) = |B(y,\delta)|$.

453: (We will consider only pairs $(\Y,d)$

454: such that all distortion balls are finite.)

455: If the cardinality $b(y,\delta)$ depends only on

456: $\delta$ but not on the center $y$, then we denote it by  $b(\delta)$.

457: The family ${\A}^{d,\Y}$ is

458: defined as the set of all nonempty distortion balls.

459: The restriction

460: to strings of length $n$ is denoted by

461: ${\A}^{d,\Y}_n$.

462: \end{definition}

463: %Every

464: %distortion ball corresponds uniquely with a (destination word, distortion)

465: %pair, and if a ball corresponds to more than one such pair

466: %then we take the first one in a given order among the pairs

467: %having the least distortion.

468:

469: To define the canonical rate-distortion function we need

470: the notion of the Kolmogorov complexity

471: of a finite set.

472:

473: \begin{definition}\label{def.kcset}

474: \rm

475: Fix a computable

476: total order on the set of all strings

477: (say the order defined in \eqref{order}).

478: The {\em Kolmogorov complexity $\K(A)$ of a finite set} $A \subseteq \{0,1\}^*$

479: %$A=\{x_1, \ldots x_m\} \subseteq \{0,1\}^*$

480: is defined as the length of the shortest

481: %program $p$

482: string $p$

483: such that the universal reference Turing machine $U$

484: %prints $U(p)=x_1, \ldots , x_m$ in a fixed order,

485: %say lexicographic, and halts.

486: %We require that the constituent elements are

487: %distinguishable so that we can tell

488: %them apart.

489: given $p$ as input prints the list of all elements of $A$

490: in the fixed order

491: and halts.

492: We require that the constituent elements are

493: distinguishable so that we can tell

494: them apart.

495: Similarly we define the {\em conditional} versions

496: $\K(A\mid z)$ and $\K(z\mid A)$

497: where $A$ is a finite set of strings

498: and $z$ is a string or a finite set of strings.

499: \end{definition}

500:

501: \begin{remark}

502: \rm

503: In Definition~\ref{def.kcset}

504: it is important that $U(p)$ halts after printing the last

505: element in the list---in this way we know that the list is complete.

506: If we allowed $U(p)$ to not halt, then we would obtain the

507: complexity of the so-called \emph{implicit description} of $A$, which can be

508: much smaller than $\K(A)$.

509: \end{remark}

510: \begin{remark}

511: \rm

512: We can allow  $U(p)$ to output the list of elements

513: in any order in Definition~\ref{def.kcset}. This flexibility

514: decreases $\K(A)$

515: by at most a constant not depending on $A$ but only depending

516: on the order in \eqref{order}.

517: The same applies to $\K(A\mid z)$.

518: On the other hand, if $A$ occurs in a conditional,

519: such as in $\K(z\mid A)$, then

520: it {\em is} important that elements of $A$ are given in the fixed

521: order. This is the case since the order in which the

522: elements of $A$ are listed

523: can provide extra information.

524: \end{remark}

525:

526: \begin{definition}\label{def.Kfamily}

527: \rm

528: Fix a computable bijection $\phi$ from the family of all finite

529: subsets of $\{0,1\}^*$ to  $\{0,1\}^*$.

530: Let $\A$ be a finite family of finite subsets of ${\cal X}=\{0,1\}^*$.

531: Define the {\em Kolmogorov complexity} $\K(\A)$

532: by $\K(\A)=\K(\{\phi(A): A\in\A\})$.

533: \end{definition}

534: \begin{remark}

535: \rm

536: An equivalent definition

537: of $\K(A \mid z)$ and $\K(z \mid A)$ as in Definition~\ref{def.kcset}

538: is as follows. Let $\phi$ be as in Definition~\ref{def.Kfamily}.

539: Then we can define $\K(A \mid z)$ by $\K(\phi(A) \mid z)$ and  $\K(z \mid A)$

540: by $\K(z \mid \phi(A))$.

541: \end{remark}

542:

543: \begin{definition}\label{def.gx}

544: \rm

545: For every

546: string $x$

547: the {\em canonical rate-distortion function}

548: %with respect to a distortion family of distortion balls

549: %${\A}_n^d$ as in Definition \ref{eq.Ad}

550: $g_x:\mathcal N\to\mathcal N$

551: is defined by

552: \[

553: %g_{x}(l) = \min_{B \in {\A}^d_n} \{ \K(B) : x\in B,\log |B| \leq l\}.

554: g_{x}(l) = \min_{B \in {\A}^{d,\Y}}

555: \{ \K(B) : x\in B,\log |B| \leq l\}.

556: \]

557: \end{definition}

558:

559: In a similar way we can define

560: the \emph{canonical distortion-rate function}:

561: \[h_x(j)= \min_{B \in {\A}^{d,\Y}}\{\log|B|:

562: x\in B,\ \K(B)\le j\}.

563: \]

564:

565: \begin{definition}

566: \rm

567: A {\em distortion family} ${\A}$ is a set of finite nonempty

568: subsets of the set of source words

569: ${\cal X}=\{0,1\}^*$. The restriction to source words of length $n$

570: is denoted by ${\A}_n$.

571: %By ${\A}_n$ we denote the restriction of ${\A}$

572: %to strings of length~$n$.

573: \end{definition}

574:

575: Every destination alphabet $\Y$ and

576: distortion measure $d$

577: gives rise to a set of distortion balls

578: ${\A}^{d,\Y}$, which is a distortion family. Thus

579: the class of distortion families

580: obviously includes every family of distortion

581: balls (or distortion spheres,

582: which is sometimes more convenient)

583: arising from every combination of

584: destination set

585: and distortion measure.

586: It is easy to see that we also can

587: substitute the more general distortion families ${\A}$

588: for ${\A}^{d,\Y}$ in the definitions

589: of the canonical rate-distortion and distortion-rate

590: function.

591: %For example, the set $A$ in Definition~\ref{def.kcset}

592: %can be a distortion ball

593: %as in Definition~\ref{def.distball}, and such a distortion ball

594: %is an element of the distortion family ${\A}^d$ consisting of

595: %all distortion balls with respect to a given distortion measure $d$

596: %as in Definition~\ref{def.distfam}.

597: %\end{example}

598: %Given a string $x$, we can look for a

599: %finite set $A \in {\A}^d$ that contains $x$ and is both simple

600: %and small, as follows.

601: %For every $x\in \{0,1\}^*$ we identify

602: %the set of pairs of integers $(k,l)$ such that

603: %there is $A\in {\A}^d$ with $x\in A$, $\K(A)\le k$, and $ \log |A|\le l$.

604: %The set $P_x$ of all such pairs will be called the {\em profile of $x$}.

605: %Strings of the same complexity can have quite different profiles.

606: %All such pairs $(k,l)$ satisfy the inequality

607: %$k+l\geq \K(x\mid  \min\{|k|,|l|\})$ up to an additive constant term

608: %(since

609: %we can reconstruct $x$ by providing a constant bit reconstruction

610: %program with a $k$-bit description of $A$,

611: %an $l$-bit ordinal number of $x$ in $A$,

612: %and the minimum length of these two in the conditional term two tell the two

613: %apart).

614: %%\end{example}

615:

616: %In \cite{VV02} the authors analyzed Kolmogorov's structure function

617: %which is actually

618: %%the particular distortion measure of the

619: %%later Example~\ref{exam.list}.

620: %the canonical distortion-rate function for the distortion

621: %family $\A$ consisting of \emph{all} finite subsets of

622: %$\mathcal X=\{0,1\}^*$,

623: %This distortion family is equal to $\A^{d,\Y}$

624: %where ${\Y}$ consists of all nonempty finite

625: %subsets of $\{0,1\}^*$, and the distortion of $x \in \{0,1\}^*$

626: %with respect to $S \subseteq \{0,1\}^*$ is $d(x,S)=\lceil \log |S| \rceil$

627: %\begin{changekolya}if $x\in S$ and $\infty$ otherwise\end{changekolya}.

628: %This is the maximum number of bits required to identify an element

629: %of $S$.

630: %The rate-distortion function

631: %$r_x(\delta)$ is

632: %$\min_{S \subseteq \{0,1\}^n}  \{\K(S): d(x,S) \leq \delta \}$.

633: %$\min_{S \subseteq \{0,1\}^*}  \{\K(S): \lceil \log |S| \rceil \leq \delta \}$

634: %and essentially coincides with the canonical rate-distortion function

635: %($g_x$ is the restriction of $r_x$ to $\cal N$).

636: %

637: %

638: %

639: %In general, destination words are not sets. But for the source alphabet

640: %${\cal X} = \{0,1\}^*$, a destination alphabet ${\Y}$,

641: %and a distortion measure $d$, we consider the family of distortion

642: %balls ${\A}^d_n$ consisting of a

643: %particular subset of all nonempty subsets of $\{0,1\}^n$.

644: %Let $x \in {\cal X}$ be of length $n$ and $y \in {\Y}$ such that

645: %the radius (that is, distortion)

646: %$d(x,y)= \delta$. Then, we associate the distortion ball $B(y,\delta)$

647: %with the destination word $y$ for the source word $x$.

648: %Because not all nonempty subsets of $\{0,1\}^n$ are distortion balls

649: %of ${\A}^d_n$, the rate-distortion function restricted

650: %to the latter family of distortion balls behaves differently from before.

651: %The new `rate-distortion function' now becomes

652: %$\min_{S \in {\A}^d_n} \{\K(S): \log |S|

653: %\leq \delta\}$.  This leads to

654: %Definition~\ref{def.gx} of the function $g_x$ below. By

655: %Lemma~\ref{lem.rg} below it turns out that by analyzing the

656: %single function $g_x$ we obtain the rate-distortion functions for

657: %every distortion measure by a simple transformation requiring

658: %the cardinality of the distortion balls.

659: %%These rate-distortion functions

660: %%are therefore computable provided the cardinality of the distortion balls

661: %%is computable.

662: %%The transformation between a distortion ball and the

663: %%corresponding destination word and distortion

664: %%is computable provided the distortion ball

665: %%is computable from the destination word and distortion.

666: %

667: %

668: In general,

669: the canonical rate-distortion function of $x$ can be quite different

670: from the rate-distortion function of $x$. However, by

671: Lemma~\ref{lem.rg} below it turns out that

672: for every distortion measure satisfying certain conditions

673: and for every $x$

674: the rate-distortion function

675: $r_x$ is obtained from $g_x$ by a simple transformation requiring

676: the cardinality of the distortion balls.

677: %$r_x(\delta)=g_x(\lceil \log b(\delta)\rceil)+O(\log|x|+\K(\delta))$ for all

678: %rational $\delta$. Here $b(\delta)$ stands for $\max_{y\in\Y}b(y,\delta)$.

679:

680:

681: \begin{remark}

682: \rm Fix a string $x\in\mathcal X=\{0,1\}^*$

683: and consider different distortion families $\A$.

684: Let $g_x^\A$ denote the canonical rate-distortion

685: function of $x$ with respect to a family $\A$.

686: Obviously, if $\A\subset\BB$

687: then $g_x^\A$ is pointwise not less than

688: $g_x^\BB$ (and it may happen that $g_x^\A(i)\gg g_x^\BB(i)$ for some $i$).

689: But as long as $\A$ satisfies certain natural properties, then

690: the set of all possible $g_x$, when

691: $x$ ranges over $\mathcal X$, does not depend on the particular $\A$

692: involved, see

693: Theorem~\ref{theo.allshapesrd}.

694: \end{remark}

695: %Consider a source word $x$ of length $n$.

696: %For every natural $l\leq n$, the function value

697: %$g_x(l)$ is the minimum $k$

698: %such that the pair $(k,l)$ belongs

699: %to the profile of $x$.

700:

701:

702: \subsection{Use of the Big O Term}

703: In the sequel we use `additive constant $c$' or

704: equivalently `additive $O(1)$ term' to mean a constant

705: accounting for the length of a fixed binary program,

706: independent from every variable or parameter in the expression

707: in which it occurs.

708: Similarly we use

709: `$O(f(m,n,\dots))$' to mean a function $g(m,n,\dots)$

710: such that $g(m,n,\dots) \leq c f(m,n,\dots)+c$ where $c$

711: is a fixed constant

712: independent from every variable $m,n,\dots$ in the expression.

713:

714:

715: \section{Distortion Measures}

716:

717: %\begin{definition}

718: %\rm

719: %The {\em Kolmogorov complexity of a finite family ${\A}$

720: %of finite nonempty subsets}

721: %$A_1, \ldots, A_m$

722: %of $\{0,1\}^*$ is defined by $\K({\A})=

723: %\K(A_1, \ldots, A_m)$, where the

724: %sequence $A_1, \ldots , A_m$

725: %is in a fixed order,

726: %say lexicographic, the constituent sets are delimited so we can tell

727: %them apart, and the elements of the constituent sets are in fixed order

728: %(say lexicographic) and delimited.

729: %\end{definition}

730:

731: Since every family of distortion

732: balls is a distortion family,

733: considering arbitrary distortion measures and destination alphabets

734: results in distortion families. We consider

735: the following mild conditions on

736: distortion families~${\A}$:

737: \begin{description}

738: \item{\bf Property 1.}

739: For every natural number $n$,

740: the family ${\A}$ contains

741: the set $\{0,1\}^n$ of all strings of length $n$ as an element.

742: \item{\bf Property 2.}

743: All $x,y\in A\in {\A}$ satisfy

744: $|x|=|y|$.

745: \item{\bf Property 3.}

746: Recall that ${\A}_n = \{A \in {\A}: A \subseteq  \{0,1\}^n \}$.

747: Then, $\K({\A}_n)=O(\log n)$.

748: \item{\bf Property 4.}

749: For every natural $n$, let

750: $\alpha_n$ denote the minimal number

751: that satisfies the following.

752: For every positive integer $c$ every

753: set $A\in {\A}_n$ can be covered by at most

754: %$\alpha_n |A|/c$ sets $B\in {\A}_n$ with $|B| \leq c$.

755: $\alpha_n |A|/c$ sets $B\in {\A}$ with $|B| \leq c$.

756: Call $\alpha_n$

757: the {\em covering coefficient} related to ${\A}_n$.

758: Property 4 is satisfied if $\alpha_n$ is bounded by

759: a polynomial in $n$.

760: The smaller the covering coefficient is, the more accurate will

761: be the description

762: that we obtain of the shapes of the structure functions below.

763: \end{description}

764: The following three example

765: families ${\A}$ satisfy all four properties.

766: \begin{example} \label{exam.list}

767: \rm

768: ${\cal L}$ {\em the list distortion family}.

769: Let ${\cal L}_n$

770: be the family of all nonempty subsets

771: of $\{0,1\}^n$.

772: This is the family of distortion balls

773: for list distortion, which we define as follows.

774: Let

775: ${\cal X} =\{0,1\}^*$ and

776: ${\Y}=\bigcup_n\mathcal L_n$.

777: A  source word $x \in \{0,1\}^n$ is

778: encoded by a destination word

779: which is a subset or {\em list}

780: $S \subseteq \{0,1\}^n$ with $x \in S$.

781: Given $S$, we can retrieve $x$ by its index of $\log |S|$ bits in $S$,

782: ignoring rounding up, whence the name `list code.'

783: The distortion measure is $d(x,S)= \log |S|$ if $x \in S$,

784: and $\infty$ otherwise. Thus, distortion balls come only in the form

785: $B(S,\log |S|)$ with cardinality $b(S,\log |S|)=|S|$.

786: Trivially, the covering coefficient

787: as defined in property~4,

788: for the list distortion family ${\cal L}$,

789: satisfies $\alpha_n \leq 2$.

790: Reference~\cite{VV02} describes

791: all possible canonical distortion-rate curves, called

792: Kolmogorov's  structure function there and first defined in \cite{Ko74}.

793: %More precisely,

794: %the function $h_x(i)$ equals $d_x(i)$, the distortion-rate function

795: %for the

796: %distortion family ${\cal L}_n$.

797: The distortion-rate function for list distortion

798: coincides with the canonical distortion-rate function.

799: The rate-distortion

800: function of $x$ for list distortion is

801: \[

802: r_x(\delta) =

803: \min_{S \subseteq \{0,1\}^n} \{\K(S): x \in S , \; \log |S| \leq \delta \}

804: \]

805: and essentially coincides with the canonical rate-distortion function

806: ($g_x$ is the restriction of $r_x$ to $\cal N$).

807: %The canonical rate-distortion function $g_x$

808: %can be converted to the particular rate-distortion

809: %function $r_x$ for a family ${\cal L}_n$ according to

810: %\eqref{eq.sfrd}.

811: \end{example}

812:

813: \begin{example}

814: \rm

815: ${\cal H}$ {\em the Hamming distortion family}.

816: Let ${\cal X} = {\Y} =\{0,1\}^*$.

817: A source word  $x \in \{0,1\}^n$ is

818: encoded by a destination word $y \in \{0,1\}^n$.

819: For every positive integer $n$, the {\em Hamming distance}

820: between two strings $x= x_1 \ldots x_n$ and

821: $y =y_1 \ldots y_n$ is defined by

822: \begin{equation}\label{eq.hamdist}

823: d(x,y)= \frac{1}{n} |\{i : x_i\neq y_i\}|.

824: \end{equation}

825: If $x$ and $y$ have different lengths, then $d(x,y)=\infty$.

826: A {\em Hamming ball} in $\{0,1\}^n$ with center

827: $y\in \{0,1\}^n$ and radius $\delta$ ($0 \leq \delta \leq 1$)  is the set

828: $B(y,\delta)=\{x\in\{0,1\}^n: d(x,y)\le \delta \}$.

829: Every $x$ is in either $B( 00\ldots 0,\frac{1}{2})$ or

830: $B(11\ldots 1,\frac{1}{2})$, so we need to consider only

831: Hamming distance $0 \leq \delta \leq \frac{1}{2}$.

832: Let ${\cal H}_n$ be the family of all Hamming balls

833: in $\{0,1\}^n$.

834: We will use the following

835: approximation of $b(\delta)$---the cardinality of Hamming balls

836: in ${\cal H}_n$ of radius

837: $\delta$.

838: Suppose that $0 \le \delta \le \frac{1}{2}$ and $\delta n$ is an integer,

839: and let

840: $H(\delta)=\delta\log 1/\delta+(1-\delta)\log1/(1-\delta)$

841: be Shannon's binary entropy function. Then,

842: \begin{equation}

843: \label{binom-entropy}

844: 2^{n H(\delta)-\log n/2-O(1)} \leq

845: b(\delta) \leq 2^{nH(\delta)}.

846: \end{equation}

847: In Appendix~\ref{sect.exhamming}

848: it is shown that the covering coefficient

849: as defined in property~$4$,

850: for the Hamming distortion family ${\cal H}_n$,

851: satisfies $\alpha_n = n^{O(1)}$. The function

852: \[

853: r_x(\delta) = \min_{y \in \{0,1\}^n} \{\K(y):

854:  d(x,y) \leq \delta  \}

855: \]

856: is the rate-distortion

857: function of $x$ for Hamming distortion. An approximation to

858: one such function is depicted in Figure~\ref{ham.eps}.

859: \end{example}

860:

861: \begin{example}

862: \rm

863: ${\cal E}$ {\em the Euclidean distortion family}.

864: Let ${\cal E}_n$ be

865: the family of all intervals in $\{0,1\}^n$,

866: where an interval  is a

867: subset of $\{0,1\}^n$ of the form $\{x: a\leq x\leq b\}$

868: and $\leq$ denotes the lexicographic ordering on $\{0,1\}^n$.

869: Let ${\Y} =\{0,1\}^*$.

870: A  source word $x \in \{0,1\}^n$ is

871: encoded by a destination word $y \in \{0,1\}^n$.

872: Interpret strings in $\{0,1\}^n$ as

873: binary notations for rational numbers in the segment $[0,1]$.

874: Consider the Euclidean distance $|x-y|$

875: between rational numbers $x$ and $y$.

876: The balls in this metric are intervals;

877: the cardinality of a ball of radius $\delta$

878: is about $\delta 2^n$.

879: Trivially, the covering coefficient

880: as defined in property~$4$,

881: for the Euclidean distortion family ${\cal E}_n$,

882: satisfies  $\alpha_n \leq 2$.

883: The function

884: \[

885: r_x(\delta) = \min_{y \in \{0,1\}^n} \{ \K(y):  |x-y| \leq \delta \}

886: \]

887: is the rate-distortion

888: function of $x$ for Euclidean distortion.

889: \end{example}

890: All the properties 1 through 4

891: are straightforward for all three families,

892: except property~$4$ in the case

893: of the family of Hamming balls.

894:

895: \section{Shapes}\label{sec1}

896:

897: The rate-distortion functions of the

898: individual strings of length $n$ can assume roughly

899: every shape. That is, every shape

900: derivable from a function in the large family

901: $G_n$ of Definition~\ref{def.gx} below through transformation

902: \eqref{eq.sfrd}.

903:

904: We start the formal part of this section.

905: Let ${\A}$ be a distortion family satisfying

906: properties~1 through~4.

907:

908: Property $1$ implies that $\{0,1\}^n \in {\A}$ and property $4$

909: applied to $\{0,1\}^n$ and $c=1$,

910: for every $n$, implies trivially that

911: the family ${\A}$ contains the singleton set

912: $\{x\}$ for every $x\in\{0,1\}^*$. Hence,

913: $$

914: g_x(0)= \K(\{x\})= \K(x)+O(1).

915: $$

916: Property~$1$

917: implies that for every $n$ and string $x$ of length $n$,

918: \[

919: g_x(n)\leq \K(\{0,1\}^n)=\K(n)+O(1)\leq \log n+O(1).

920: \]

921: Together this means that for every $n$ and every

922: string $x$ of length $n$,

923: the function $g_x(l)$ decreases from about $\K(x)$

924: to about $0$ as $l$ increases from 0 to $n$.

925:

926: \begin{lemma}\label{lem.shapesg}

927: Let ${\A}$ be a distortion family satisfying

928: properties~$1$ through $4$.

929: For every $n$ and every string $x$ of length $n$ we have

930: $g_x(n)= O(\log n)$, and

931: $0\le g_x(l)-g_x(m)\leq m-l+O(\log n)$

932: for all $l<m\leq n$.

933: \end{lemma}

934: \begin{proof}

935: The first equation and the left-hand inequality of

936: the second equation are

937: straightforward.

938: To prove

939: the right-hand inequality

940: %translate it

941: %into the following property of the profile of $x$:

942: %If  a pair $(k,m) \in P_x$

943: %and $l<m$, then

944: %also the pair $(k+m-l+O(\log n),l) \in P_x$.

945: %

946: %Let

947: let $A$ witness $g_x(m)=k$, which implies that

948: $\K(A)=k$ and $ \log |A|\leq m$. By Property 4 there is

949: a covering of $A$ by at most $\alpha_n |A|/2^{l}$ sets in ${\A}_n$

950: of cardinality at most $2^{l}$ each.

951: Given a list of $A$ and a list of $\A_n$, we can find

952: such a covering.

953: Let $B$ be one of

954: the covering sets containing $x$.

955: Then, $B$ can be specified by $A,n,l,\A_n$

956: and the index $i$ of $B$

957: among the covering sets.

958: We need also $O(\log k+\log\log i+\log\log l +\log \log n)$

959: extra bits to separate the descriptions of $A$ and $\A_n$, and

960: the binary representations of $i,n,l$, from one another.

961: Without loss of generality we can assume that $k$

962: is less than $n$.

963: Thus all the extra information

964: and separator bits are included in $O(\log n)$ bits.

965: Altogether,

966: $\K(B)\leq \K(A) +m-l +O(\log n)\leq k +m-l +O(\log n)$, which shows

967: that $g_x(l)\le k+m-l+O(\log n)=g_x(m)+m-l+O(\log n)$.

968: \end{proof}

969:

970:

971: \begin{example}\rm

972: Lemma~\ref{lem.shapesg} shows

973: that

974: $$

975: \K(x)-i-O(\log n)\leq g_x(i)\leq n-i+O(\log n),

976: $$

977: for every $0 \leq i \leq n$.

978: The right-hand inequality

979: is obtained by setting $m=n$, $l=i$ in

980: the lemma, yielding

981: $$

982: g_x(i)=g_x(i)-g_x(n)+O(\log n)\leq n-i+O(\log n).

983: $$

984: The left-hand inequality

985: is obtained by setting $l=0$, $m=i$

986: in the lemma, yielding

987: $$

988: \K(x)-g_x(i)=g_x(0)-g_x(i) +O(1)\le i-0+O(\log n).

989: $$

990: The last displayed equation can also be shown by a simple direct argument:

991: $x$ can be described by the minimal description

992: of the set $A \in {\A}$

993: witnessing $g_x(i)$ and by the ordinal number of $x$ in $A$.

994: \end{example}

995:

996: The rate-distortion

997: function $r_x$ differs

998: from $g_x$ by just a change of scale depending on the distortion family

999: involved, provided certain computational requirements are fulfilled.

1000: See Appendix~\ref{sect.computability} for computability notions.

1001:

1002: \begin{lemma}\label{lem.rg}

1003: Let  ${\cal X} = \{0,1\}^*$, ${\Y}$, and $d$, be the

1004: source alphabet, destination alphabet,

1005: and distortion measure, respectively.

1006: Assume that the set

1007: $\{\pair{x,y,\delta}\in\mathcal X\times\Y\times\Q: d(x,y)\le \delta\}$

1008: is decidable; that $\Y$ is recursively enumerable; and

1009: %Assume that

1010: %there is an algorithm that given $y\in\Y$ and a rational

1011: %$\delta$ outputs a list of $B(y,\delta)$, and conversely,

1012: %given any list of $B(y,\delta)$ and

1013: %$\delta$, outputs a $y'\in\Y$ with

1014: %$B(y',\delta)=B(y,\delta)$.

1015: that for every $n$ the cardinality

1016: of every ball in ${\A}^{d,\Y}_n$ of radius $\delta$ is at most

1017: $b_n(\delta)$ and at least $b_n(\delta)/\beta(n)$, where

1018: $\beta(n)$ is polynomial in $n$ and $b_n(\delta)$ is a function

1019: of $n,\delta$; and that the distortion family $\A^{d,\Y}$

1020: satisfies properties 1 through 4.

1021: Then, for every $x\in\{0,1\}^n $ and every rational $\delta$

1022: we have

1023: \begin{equation}\label{eq.sfrd}

1024: r_x (\delta ) = g_x(\lceil \log b_n(\delta) \rceil)+O(\K(\delta)+\log n).

1025: \end{equation}

1026: \end{lemma}

1027: \begin{proof}

1028: Fix $n$ and a string $x$ of length $n$.

1029: Consider the auxiliary function

1030: \begin{equation}\label{eq.tilde}

1031: \tilde r_x(\delta) = \min_{y\in \Y} \{\K(B(y,\delta)):

1032:  d(x,y) \leq \delta  \}.

1033: \end{equation}

1034: We claim that

1035: $\tilde r_x(\delta)= r_x(\delta)+O(\K(\delta)+\log n)$.

1036: Indeed, let $y$ witness $r_x(\delta)=k$.

1037: Given $y,\delta,n$ we can compute

1038: a list of elements of the ball $B(y,\delta)$: for all strings

1039: $x'$ of length $n$ determine whether $d(x',y)\le\delta$.

1040: %Moreover, we do not need to know $n$ in advance:

1041: %we can find $n$ as the length of any string $x'$ with $d(x',y)\le\delta$;

1042: %by property 2

1043: %its length equals $n$.

1044: Thus $\K(B(y,\delta))<k+O(\K(\delta)+\log n)$, hence

1045: $\tilde r_x(\delta)<k+O(\K(\delta)+\log n)$.

1046: Conversely, let $B(y, \delta)$ witness $\tilde r_x(\delta)=k$.

1047: Given a list of the elements of $B(y,\delta)$ and $\delta$

1048: we can recursively enumerate ${\Y}$ to find the first element

1049: $y'$ with $B(y',\delta)=B(y,\delta)$ (for every enumerated $y'$ compute

1050: the list $B(y',\delta)$ and compare it to the given list $B(y,\delta)$).

1051: Then,

1052: $\K(y')\le k+O(\K(\delta))$ and $d(x,y')\le\delta$.

1053: Hence $r_x(\delta)<k+O(\K(\delta))$.

1054:

1055: Thus, it suffices to show that

1056: \[

1057: \tilde r_x (\delta ) = g_x(\lceil \log b_n(\delta) \rceil)+O(\log n).

1058: \]

1059:

1060: ($g_x(\lceil \log b_n(\delta) \rceil)\leq\tilde r_x (\delta)$)

1061: Assume $\tilde r_x(\delta)=k$ is witnessed by a distortion ball $B(y, \delta)$.

1062: By our assumption, the  cardinality of $B(y,\delta)$ is at most

1063: $b_n(\delta)$, and hence $g_x(\lceil \log b_n(\delta) \rceil ) \leq k$.

1064:

1065: ($\tilde r_x (\delta) \leq g_x(\lceil \log b_n(\delta) \rceil)+O(\log n)$)

1066: By Lemma~\ref{lem.shapesg},

1067: $g_x(l)$ and $g_x(l-m)$ differ by at most $m+O(\log n)$.

1068: Therefore it suffices to show that

1069: $\tilde r_x (\delta) \leq g_x(\lceil \log b_n(\delta) \rceil-m)$

1070: for some $m=O(\log n)$. We claim that this happens for

1071: $m=\lceil\log\beta(n)\rceil+1$. Indeed, let

1072: $g_x(\lceil \log b_n(\delta) \rceil-m)=k$ be witnessed

1073: by a distortion ball $B$. Then,

1074: $|B|\le 2^{\lceil\log b_n(\delta)\rceil}/(2\beta(n))<

1075: b_n(\delta)/\beta(n)$.

1076: This implies that the radius of $B$ is less than $\delta$

1077: and hence $B$ witnesses $\tilde r_x (\delta)\le k$.

1078: \end{proof}

1079:

1080: \begin{remark}\label{rem.logn}

1081: \rm

1082: When measuring distortion we usually do

1083: not need rational numbers with numerator or denominator more

1084: than $n=|x|$. Then, the term $O(\K(\delta))$ in \eqref{eq.sfrd}

1085: is absorbed by the term $O(\log n)$.

1086: Thus, describing the family of $g_x$'s  we obtain an approximate

1087: description of all possible rate-distortion functions $r_x$ for

1088: given destination alphabet and distortion measure, satisfying the computability

1089: conditions, by using the transformation \eqref{eq.sfrd}.

1090: An example of an approximate

1091: rate-distortion curve $r_x$ for some string $x$

1092: of length $n$ for Hamming distortion is given in Figure~\ref{ham.eps}.

1093: \end{remark}

1094: \begin{remark}

1095: \rm

1096: The computability properties of the functions

1097: $r_x$, $d_x$, and $g_x$, as well as the relation between

1098: the destination word for a source word and the related distortion ball, are

1099: explained in Appendix~\ref{sect.computability}.

1100: \end{remark}

1101:

1102: We present an approximate

1103: description of the family of possible $g_x$'s below. It turns

1104: out that the description does not depend on the particular distortion family

1105: $\A$ as long as properties 1 through 4 are satisfied.

1106:

1107:

1108: \begin{definition}

1109: \rm

1110: Let $G_n$ stand for the class of all

1111: functions $g:\{0,1,\dots,n\}\rightarrow {\cal N}$ such

1112: that $g(n)=0$  and

1113: $g(l-1)\in\{g(l),g(l)+1\}$ for all

1114: $1\leq l \leq n$.

1115: \end{definition}

1116:

1117: In other words, a function $g$ is in $G_n$ iff

1118: it is nonincreasing and the function $g(i)+i$

1119: is nondecreasing and $g(n)=0$.

1120: The following result is a generalization to

1121: arbitrary distortion measures of Theorem IV.4

1122: in \cite{VV02}

1123: dealing with $h_x$ (equaling $d_x$ in the particular case

1124: of the distortion family

1125: ${\cal L}$). There, the precision in Item (ii) for source words of length $n$

1126: is $O(\log n)$, rather than the $O(\sqrt{n \log n})$ we obtain

1127: for general distortion families.

1128:

1129: \begin{theorem}\label{theo.allshapesrd}

1130: Let ${\A}$ be a distortion family satisfying

1131: properties~$1$ through~$4$.

1132:

1133: {\rm (i)}  For every $n$ and every string $x$ of length $n$, the function

1134: $g_x(l)$ is equal to $g(l)+O(\log n)$ for some function $g \in G_n$.

1135:

1136: {\rm (ii)}

1137: Conversely, for every $n$ and every function $g$ in $G_n$,

1138: there is a string

1139: $x$ of length $n$ such that for every $l=0,\dots,n$,

1140: $g_x(l)=g(l)+O(\sqrt{n\log n})$.

1141: \end{theorem}

1142:

1143: \begin{remark}

1144: \rm

1145: For fixed $k \leq n$ the number of different integer functions $g \in G_n$

1146: with

1147: $g(0) = k$

1148: %$g(n)=0$, and $g(l)=\{g(l-1), g(l-1)-1\}$,

1149: is ${n \choose k}$.

1150: For $k=\frac{1}{2}n$,

1151: this number is of order $2^n/\sqrt{ n}$,

1152: and therefore far greater than the number

1153: of strings $x$ of length

1154: $n$ and Kolmogorov complexity

1155: $\K(x) = k = \frac{1}{2}n$ which is at most $2^{n/2}$.

1156: This explains the fact that in Theorem~\ref{theo.allshapesrd}, Item (ii),

1157: we cannot precisely match a string $x$ of length $n$ to

1158: every function $g \in G_n$, and therefore have to use approximate

1159: shapes.

1160: \end{remark}

1161:

1162: \begin{example}

1163: \rm

1164: By Theorem~\ref{theo.allshapesrd}, Item (ii), for every $g \in G_n$

1165: there is a string $x$ of length $n$ that has $g$ for its canonical

1166: rate-distortion function $g_x$ up to an additive $O(\sqrt{n \log n})$ term.

1167: By \eqref{binom-entropy}, \eqref{eq.sfrd}, and Remark~\ref{rem.logn},

1168: $$

1169: r_x(\delta)=

1170: g_x(nH(\delta))+O(\log n),

1171: $$

1172: for $0 \leq \delta \leq \frac{1}{2}$.

1173: \begin{figure}[ht]

1174: \begin{center}

1175: \epsfxsize=3.5in

1176: \leftline{\hskip8pc\epsfbox{ie4.eps}}

1177: \end{center}

1178: \caption{An approximate rate-distortion function for Hamming distortion}

1179: \label{ham.eps}

1180: \end{figure}

1181: Figure~\ref{ham.eps} gives the graph of a particular function

1182: $r(\delta) = g(nH(\delta))$ with $g$ defined as follows:

1183:  $g(l) = n(1+H(\frac{1}{6})-H(\frac{1}{3}))-l$

1184: for $0 \leq l \leq nH(\frac{1}{6})$,

1185: $g(l)=n(1-H(\frac{1}{3}))$ for

1186: $nH(\frac{1}{6}) < l \leq nH(\frac{1}{3})$,

1187: and $g(l)=n-l$ for $nH(\frac{1}{3}) < l \leq n$.

1188: In this way, $g \in G_n$.

1189: Thus, there is a string $x$ of length $n$ with its rate-distortion

1190: graph $r_x (\delta)$

1191: in a strip of size $O(\sqrt{n\log n})$ around the

1192: graph of $r(\delta)$. Note that $r_x$ is almost constant on

1193: the segment $[ \frac{1}{6}; \frac{1}{3}]$.

1194: Allowing the

1195: distortion to increase on this interval, all the way from

1196: $\frac{1}{6}$ to $ \frac{1}{3}$, so allowing $n/6$ incorrect extra

1197: bits, we still cannot significantly decrease the rate.

1198: This means that the distortion-rate function $d_x(r)$

1199: of $x$ drops from $\frac{1}{3}$ to $\frac{1}{6}$

1200: near the point $r=n(1-H(\frac{1}{3}))$,

1201: exhibiting a very unsmooth behavior.

1202: \end{example}

1203:

1204: \section{Characterization}

1205:

1206: Theorem~\ref{th-shannon-analog} below states that a destination word that

1207: codes a given source word and

1208:  minimizes the algorithmic mutual information with

1209: the given source word gives no

1210: advantage in rate

1211:  over a minimal Kolmogorov complexity destination word that codes the source word.

1212: This

1213: theorem

1214: can be compared with Shannon's theorem, Theorem~\ref{theo.shannon} in

1215: Appendix~\ref{sect.ratedistortion}, about

1216: the expected rate-distortion curve of a random variable.

1217: %This result on the rate-distortion function

1218: %of an individual source word

1219: %that can be compared with Shannon's product space. In the

1220: %product space, due to asymptotic equidistribution, the characterization

1221: %by minimum information and minimum entropy coincide asymptotically.

1222: %contrasts with Shannon's

1223: %rate-distortion function for

1224: %a random variable:

1225: %in Shannon's case

1226: %the minimum information of some random variable with the

1227: %source random variable can be less than the minimum entropy of

1228: %a function of the source random variable.

1229:

1230:

1231: %In formal terms: let $X$ be a random variable

1232: %with outcomes in ${\cal X}$ and

1233: %$X_1, X_2, \ldots, X_n$ consist of $n$ i.i.d.

1234: %copies of $X$ denoted by $X^n$.

1235: %The second part of Shannon's theorem, Theorem~\ref{theo.shannon},

1236: %states that there exists a random variable $Z$ taking values in

1237: %the destination alphabet ${\Y}$, such that

1238: %we can code the outcomes in ${\cal X}^n$ (the  source words) in about

1239: %$nI(X;Z)$ bits (lengths of the destination words)

1240: %with the average distortion between the source-word outcomes of  $X^n$ and

1241: %their destination words, divided by $n$, being

1242: %close to ${\bf E} d(X,Z)$

1243: %as $n$ grows large. Whether we minimize $H(U)$ or $I(X;U)$

1244: %we obtain approximately $nI(X;Z)$ where $Z$ minimizes the expression.

1245: %The algorithmic version below about individual

1246: %finite binary strings differs from Shannon's theorem as explained in Example~\ref{exam.shannonstheorem}.

1247: %This statement cannot be strengthened further by adding the

1248: %requirement that for every such

1249: %random variable $Z$ there is a random variable $U$

1250: %with ${\bf E}d(X,U)\leq{\bf E} d(X,Z)$ and $H(U)\leq I(X;Z)$.

1251: %In the algorithmic setting an analogue of this

1252:  %strong statement is true,

1253: %as the following theorem shows.

1254:

1255: \begin{theorem}    \label{th-shannon-analog}

1256: Let ${\A}$ be a distortion family

1257: satisfying properties~$2$

1258: and~$3$,

1259: %${\A}_n = {\A} \bigcap \{0,1\}^n$,

1260: and

1261:  ${\A}(x) = \{A  \in {\A}: x \in A\}$.

1262: For every $n$ and string $x$ of length $n$ and every $B \in {\A}(x)$

1263: there is an $A \in {\A}(x)$ with

1264: $\lceil \log |A|\rceil =\lceil \log |B| \rceil$ and

1265: $\K(A)\leq I(x:B)+O(\log \K(B)+\log n)$,

1266: where $I(x:B)=\K(B)-\K(B\mid x)$ stands for the

1267: algorithmic information in $x$ about $B$.

1268: %Here

1269: %$\eps=O(\log n +\K(\A_n)+\log\K(B))$ and $n=|x|$.

1270: \end{theorem}

1271:

1272: For further information about $I(x:B)$ see Definition~\ref{def.mi} in

1273: Appendix~\ref{sect.kolmcompl}.

1274: The proof of Shannon's theorem, Theorem~\ref{theo.shannon},

1275: and the proof of the current theorem are very different.

1276: The latter proof uses techniques

1277: that may be

1278: of independent interest.

1279: In particular, we use an online

1280: set cover algorithm where the sets come sequentially and we always have

1281: to have the elements covered that occur in a certain number of sets,

1282: Lemma~\ref{th5} in Appendix~\ref{sect.proofs}.

1283: %It uses techniques

1284: %that may be

1285: %of independent interest; see Exercise~\ref{th5} on page~\pageref{th5}.

1286: % The

1287: %Kolmogorov complexity of a string $x$

1288:  %is the length of the shortest program that

1289: %produces $x$. It could happen that there are several shortest

1290: %programs for the same string $x$; however, one can prove that

1291: %there could not be too many of them: if there are $2^m$ programs

1292: %of length $k$ that produce string $x$, then there exists a shorter

1293: %program that produces $x$ (of length approximately $k-m$).

1294: %Exercise~\ref{th5} on page~\pageref{th5}

1295:  %generalizes this statement for the case of approximate

1296: %descriptions. Informally, it states the following:

1297: %if there are $2^m$ sets in

1298: %a family ${\A}$ as above, each of

1299: %complexity at most $k$ and containing a given string $x$, then

1300: %one of these sets has complexity about $k-m$.

1301:

1302: %\begin{example}\label{exam.shannonstheorem}

1303: %\rm

1304: %Note that for an appropriate distortion family ${\A}$

1305: %we have that $\lceil \log |A|\rceil = \lceil \log |B| \rceil$ equals

1306: %$\lceil \log b(\delta)\rceil$

1307: %in \eqref{eq.sfrd},

1308: %where it is the log-cardinality of a distortion ball

1309: %in the distortion family ${\A}_n$. In this way we can determine

1310: %the value of $g_x (\lceil \log b(\delta) \rceil)$

1311: %and subsequently retrieve

1312: %both the distortion $\delta$ concerned

1313:  %and the value of the rate-distortion function

1314: %$r_x(\delta)$.

1315: %The theorem states that

1316: %a destination word

1317:  %minimizing the algorithmic mutual information with

1318: %the given source word gives no

1319: %advantage in rate (a pointwise less rate-distortion curve)

1320:  %over a minimal complexity destination word.

1321: %The contrast

1322: %with Shannon's rate-distortion function is already explained

1323: %at the start of this section.

1324: %\end{example}

1325:

1326:

1327:

1328: \begin{example}

1329: \rm

1330: Theorem~\ref{th-shannon-analog} states that

1331: for an appropriate distortion family ${\A}$ of nonempty finite subsets

1332: of $\{0,1\}^*$

1333: and for every string $x \in \{0,1\}^*$, if there exists an $A\in {\A}$

1334: of cardinality

1335: $2^l$ or less

1336: containing $x$ that has small algorithmic information about $x$,

1337: then there exists another

1338: set $B\in {\A}$ containing $x$ that has also at most $2^l$ elements

1339: and has small

1340: Kolmogorov complexity itself.

1341: For example, in the case of Hamming distortion, if for a given string $x$

1342: there exists a string $y$ at Hamming distance

1343: $\delta$ from $x$

1344: that has small information about $x$, then there exists another

1345: string $z$ that is also within distance $\delta$ of  $x$ and has small

1346: Kolmogorov complexity itself (not only small algorithmic

1347: information about $x$).

1348: \end{example}

1349:

1350: \section{Fitness of Destination Word}\label{sect.fitness}

1351:

1352: %For every distortion measure (subject to some mild restrictions)

1353: %and source word,

1354: In Theorem~\ref{th45} we show that if a destination word

1355: of a certain maximal Kolmogorov complexity

1356: has minimal distortion with respect to the source word, then it

1357: also is the (almost) best-fitting destination word in the sense

1358: (explained below)

1359: that

1360: among all destination words of that Kolmogorov complexity

1361: it has the most properties in common with the

1362: source word.

1363: `Fitness' of individual strings to an individual

1364: destination word is hard, if not impossible, to describe

1365: in the probabilistic framework. However, for the combinatoric

1366: and computational notion of Kolmogorov complexity it is natural to describe

1367: this notion using `randomness deficiency' as in Definition~\ref{def.rd} below.

1368:

1369: Reference \cite{VV02} uses `fitness'

1370: with respect to the particular distortion family

1371: ${\cal L}$. We briefly overview the generalization to arbitrary

1372: distortion families satisfying properties 2 and 3 (details,

1373: formal statements and proofs about ${\cal L}$ can be found in the

1374: cited reference).

1375: %Every set $A \in {\A}$ containing a string $x$

1376: %is considered to be a

1377: %model for $x$.

1378: The goodness of fit of a destination word $y$ for

1379: a source word $x$ with respect to an arbitrary distortion family ${\A}$

1380: is defined by the randomness deficiency of $x$ in the

1381: the distortion ball $B(y, \delta)$ with $\delta=d(x,y)$.

1382: The lower the randomness deficiency, the better is the fit.

1383: \begin{definition}\label{def.rd}

1384: \rm

1385: The {\em randomness deficiency} of $x$ in a set $A$ with $x \in A$

1386: is defined as $\delta (x \mid A) = \log |A| - \K(x\mid A)$.

1387: If $\delta (x \mid A)$ is small then $x$ is a {\em typical} element of $A$.

1388: Here `small' is taken as $O(1)$ or $O(\log n)$ where $n=|x|$,

1389: depending on the context of the future statements.

1390: %Here we have not stated what the constant in $O(1)$ is.

1391: %One must agree in advance on a constant $c$ and then call an element

1392: %typical when the deficiency is smaller that $c$, and all later references

1393: %of typicality depend on this $c$.

1394: \end{definition}

1395:

1396: The randomness deficiency can be a little smaller

1397: than 0, but not more than

1398: a constant.

1399: \begin{definition}

1400: \rm

1401: Let $\beta$ be an integer parameter and $P \subseteq A$.

1402: We say $P$ is a {\em property} in

1403: $A$ if $P$ is a `majority' subset of

1404: $A$, that is,  $|P| \geq (1-2^{-\beta})|A|$. We say that

1405: $x \in A$ \emph{satisfies} property $P$ if

1406: $x \in P$.

1407: \end{definition}

1408:

1409: If the randomness deficiency $\delta(x \mid A)$ is not much greater than 0,

1410: %then $x$ satisfies every property

1411: %that holds for a majority of elements in $A$.

1412: then there are no simple special properties that

1413: single $x$ out from the majority of strings to be drawn from $A$.

1414: This is not just terminology:

1415: If $\delta (x  |  A)$ is small enough,

1416: then $x$ satisfies {\em all} properties of low Kolmogorov complexity

1417: in $A$ (Lemma~\ref{lemma.property} in Appendix~\ref{sect.rd}).

1418: If $A$ is a set containing $x$ such that $\delta(x \mid A)$ is

1419: small

1420: then we say that $A$ is

1421: a set of good fit for $x$.

1422: %This leads to the notion of a model for $x$.

1423: %\begin{definition}\label{def.model}

1424: %\rm

1425: %Let $x$ be a string and ${\A}$ be a distortion family.

1426: %A set $A \in {\A}$ with $x \in A$ is a {\em model} for $x$.

1427: %The set $A$ is a {\em best} model for $x$ if the randomness

1428: %deficiency $\delta (A|x)$ is minimal.

1429: %See also Appendix~\ref{sect.rd},

1430: %\cite{VV02} or the text \cite{LiVi97}.

1431: %\end{definition}

1432: In \cite{VV02} the notion of

1433: models for $x$ is considered: Every finite set of strings

1434: containing $x$ is a {\em model} for $x$.

1435: Let $x$ be a string of length $n$ and choose an integer $i$

1436: between 0 and $n$. Consider models for $x$ of

1437: Kolmogorov

1438: complexity at most $i$.

1439: Theorem~IV.8 and Remark IV.10 in \cite{VV02}

1440: show

1441: for the distortion family ${\cal L}$

1442: %That theorem and the accompanying examples show

1443: that $x$ has minimal

1444: randomness deficiency in every set that witnesses $h_x(i)$

1445: (for ${\cal L}$ we have $h_x(i)=d_x(i)$),

1446: ignoring additive $O(\log n)$ terms. That is, up to the stated precision

1447: every such witness set is the best-fitting model that is

1448: possible at model Kolmogorov complexity at most $i$.

1449:  It is

1450: remarkable, and in fact unexpected to the authors,

1451: that the analogous result

1452: holds for arbitrary distortion families provided

1453: they satisfy properties 2 and 3.

1454:

1455:

1456: \begin{theorem}\label{th45}

1457: Let ${\A}$ be a distortion family

1458: satisfying properties~$2$ and~$3$

1459: and $x$ a string of length $n$.

1460: Let $B$ be a set in $\A$ with

1461: $x \in B$.

1462: Let $A_x$ be a set

1463: of minimal Kolmogorov complexity

1464: among the sets $A\in{\A}$ with $x\in A$ and

1465: $\lceil \log |A| \rceil= \lceil \log |B| \rceil$.

1466: Then,

1467: \[

1468: \K(A_x)+\log |A_x|-\K(x)\leq

1469: \delta(x \mid B)

1470: +O(\log \K(B)+ \log n).

1471: \]

1472: \end{theorem}

1473: \begin{lemma}\label{lemma.deltaab}

1474: For every set $A$ with  $x \in A$,

1475: \begin{equation}\label{eq.deltaab}

1476: \K(A)+\log |A|-\K(x) \ge\delta (x \mid A),

1477: \end{equation}

1478: up to a  $O(\log n)$ additive term.

1479: \end{lemma}

1480: \begin{proof}

1481: The inequality \eqref{eq.deltaab}

1482: means that

1483: $$\K(A)+\log |A|-\K(x) \ge \log |A|-\K(x\mid A)+O(\log n),$$

1484: that is,

1485: $\K(x)\le \K(A)+\K(x\mid A)+O(\log n)$.

1486: The latter inequality follows

1487: from the general inequality

1488: $\K(x)\le \K(x,y) \leq \K(y)+\K(x\mid y)+O(\log\K(x\mid y))$,

1489: where $\K(x\mid y)\le\K(x)+O(1)\le n+O(1)$.

1490: \end{proof}

1491:

1492: A set $A$ with $x \in A$ is an algorithmic {\em sufficient statistic}

1493: for $x$ if

1494: $\K(A)+\log |A|$ is close to $\K(x)$.

1495: Lemma~\ref{lemma.deltaab} shows that every sufficient statistic for $x$ is

1496: a model of a good fit for $x$.

1497:

1498: \begin{example}\label{th44}

1499: \rm

1500: Consider the elements of every $A\in {\A}$ uniformly distributed.

1501: Assume that we are given a string $x$ that was

1502: obtained by a random sampling

1503: from an unknown set $B\in {\A}$

1504: satisfying $\K(B)\le n=|x|$.

1505: Given $x$

1506: we want to recover $B$, or some $A\in {\A}$ that

1507: is ``a good hypothesis to be the source of $x$'' in the sense

1508: that the randomness deficiency $\delta (x \mid A)$ is small.

1509: Consider the set $A_x$ from  Theorem~\ref{th45} as such

1510: a hypothesis. We claim that

1511: with high probability $\delta(x \mid A_x)$ is of order $O(\log n)$.

1512: More specifically, for every $\beta$ the probability of the event

1513: $\delta(x \mid A_x)>\beta$

1514: is less than

1515: $2^{-\beta+O(\log n)}$,

1516: which is negligible for $\beta=O(\log n)$.

1517: Indeed,

1518: if $x$ is chosen uniformly  at random in $B$, then

1519: with high probability

1520: (Appendix~\ref{sect.rd})

1521: the randomness deficiency $\delta (x \mid B)$ is small.

1522: That is, with probability more than $1-2^{-\beta}$

1523: we have $\delta(x \mid B)\le\beta$.

1524: By Theorem~\ref{th45} and \eqref{eq.deltaab}

1525: we also have $\delta(x \mid A_x)\le\delta(x \mid B)+O(\log n)$.

1526: %It is easy to show \cite{VV02,LiVi97} that

1527: %If $A$ is a sufficient

1528: %statistic for $x$, then by~\eqref{eq.deltaab}

1529: %$x$ is a typical element of $A$ in the sense that it has small

1530: %randomness deficiency $\delta(x|A)$.

1531: %By Theorem~\ref{th45},

1532: Therefore the probability of the event

1533: $\delta(x \mid A_x)>\beta$

1534: is less than

1535: $2^{-\beta+O(\log n)}$.

1536: %By the properties of randomness deficiency,

1537: %the probability that the right-hand side of the inequality

1538: %in Theorem~\ref{th45}

1539: %exceeds $\beta$ is at most $\epsilon$.

1540: %Thus, with high probability

1541: %the set $A_x$ is a sufficient statistic

1542: %for $x$.

1543: \end{example}

1544:

1545:

1546: \begin{example}

1547: \rm

1548: Theorem~\ref{th45} says that for fixed

1549: log-cardinality $l$ the model that has minimal Kolmogorov complexity has

1550: also minimal randomness

1551: deficiency among models of that log-cardinality.

1552: Since $g_x$ satisfies  Lemma~\ref{lem.shapesg}, we have also that for every

1553: $k$ the model of Kolmogorov complexity at most

1554: $k$ that minimizes the log-cardinality also minimizes randomness

1555: deficiency among models of that Kolmogorov complexity.

1556: These models can be computed in the limit, in the second case

1557: by running all programs up to $k$ bits and always keeping the one

1558: that outputs the smallest set in ${\A}$ containing $x$, and in the first case

1559: by running all programs up to $n=|x|$ bits and always keeping the

1560: shortest one that outputs a set in ${\A}$ containing $x$

1561: having log-cardinality at most $l$.

1562: \end{example}

1563:

1564:

1565: \section{Denoising}

1566:

1567: %Assume the setting of Theorem~\ref{th45} and Example~\ref{th44},

1568: %Since

1569: %$\delta(x|A_x)\leq

1570: %\K(A_x)+\log |A_x|-\K(x)\leq  \delta(x|B)+O(\log \K(B)+\log n)$,

1571: In Theorem~\ref{th45} using \eqref{eq.deltaab} we obtain

1572: \begin{equation}\label{eq.dAB}

1573: \delta(x \mid A_x)\le \delta(x \mid B)+O(\log \K(B)+\log n).

1574: \end{equation}

1575: %and $A_x$ is a best model (Definition~\ref{def.model}) for $x$

1576: %at either complexity $k$, or of log-cardinality $l$, and hence both.

1577: This gives a method

1578: to identify good-fitting models for $x$ using compression, as follows.

1579: Let $k= \K(A_x)$ and $l= \lceil \log |B| \rceil$.

1580: If $A_x$ is a

1581: set of minimal Kolmogorov complexity

1582: among sets  $A \in {\A}$ with $x\in A$ and $ \lceil \log |A| \rceil=l$,

1583: then by \eqref{eq.dAB}

1584: the hypothesis ``$x$ is chosen at random

1585: in $A_x$'' is (almost) at least as plausible as

1586: the hypothesis ``$x$ is chosen at random

1587: in $B$'' for every simply described

1588: $B\in {\A}$

1589: (say, $\log \K(B)=O(\log n)$)

1590: with  $ \lceil \log |B| \rceil=l$.

1591:

1592: Let us look at an example

1593: of denoising by compression

1594: (in the ideal sense of Kolmogorov complexity) for Hamming distortion.

1595: Fix a target string $y$ of length $n$ and a

1596: distortion $0 \leq \delta \leq \frac{1}{2}$.

1597: (This string $y$ functions as the destination word.)

1598: Let a string $x$ be a noisy version of

1599: $y$ obtained by changing at most $n\delta$ randomly chosen bits in $y$

1600: (string $x$ functions as the source word).

1601: That is,

1602: the string $x$ is chosen uniformly at random in the Hamming ball

1603: $B=B(y,\delta)$.

1604: Let $\hat{x}$ be

1605: a string witnessing

1606: $r_x(\delta)$, that is, $\hat{x}$ is a string

1607: of minimal Kolmogorov complexity  with $d(x,\hat{x}) \leq \delta$

1608: and $r_x(\delta)=\K(\hat{x})$.

1609: %in the Hamming ball $B(x,\delta)$

1610: We claim that at distortion $\delta$ the string

1611:  $\hat{x}$ is a good candidate for

1612: a denoised version of $x$, that is, the target string $y$.

1613: This means that

1614: in the two-part description

1615: $(\hat{x},\hat{x} \oplus x)$

1616: of $x$, the second part (the bitwise XOR of $x$ and $\hat{x}$)

1617: is noise:

1618: $\hat{x} \oplus x$ is a random string

1619: in the Hamming ball $B(00\dots0,\delta)$ in the sense

1620: that $\delta(\hat{x} \oplus x \mid B(00\dots0,\delta))$ is negligible.

1621: Moreover, even the conditional Kolmogorov complexity

1622: $\K(\hat{x} \oplus x \mid \hat x)$ is close to $\log b(\delta)$.

1623:

1624: Indeed,

1625: let $l=\lceil\log|B|\rceil$.

1626: By Definition~\ref{def.gx} of $g_x$,

1627: %and the fact that

1628: %$\log |B| = l + O(\log n)$ by \eqref{binom-entropy},

1629: Theorem~\ref{th45} implies that

1630: $$

1631: g_x(l)+l-\K(x)\le \delta(x \mid B),

1632: $$

1633: ignoring additive terms of $O(\log n)$

1634: and observing that the additive

1635: term $\log \K(B)$ is absorbed by $O(\log n)$.

1636: %Since the Hamming distortion family

1637: %satisfies all properties~$1$

1638: %through~$4$,

1639: %the canonical structure functions $g_x$

1640: %satisfy Theorem~\ref{theo.allshapesrd}.

1641: For every $x$,

1642: the rate-distortion function $r_x$ of $x$ differs from

1643: $g_x$ just by changing the scale of the argument as in \eqref{eq.sfrd}.

1644: More specifically,

1645: %for every $0 \leq \delta \leq \frac{1}{2}$,

1646: we have

1647: $r_x(\delta) = g_x(l)$ and hence

1648: \[

1649: r_x(\delta)+l-\K(x)\leq \delta(x \mid B).

1650: \]

1651: Since we assume that $x$ is chosen uniformly

1652: at random in $B$, the randomness deficiency

1653: $\delta(x \mid B)$ is small, say $O(\log n)$ with high probability.

1654: Since

1655: $r_x(\delta)=\K(\hat{x})=\K(B(\hat{x},\delta))+O(\K(\delta))$,

1656: $\K(\delta)=O(\log n)$, and $l=\lceil\log b(\delta)\rceil$,

1657: it follows that with high probability, with the equalities holding up to an

1658: additive $O(\log n)$ term,

1659: $$

1660: 0 =  \K(\hat{x})+l- \K(x)= \K(B(\hat{x},\delta))+

1661: \log b(\delta)-\K(x).

1662: $$

1663: Since by construction $x \in B(\hat{x},\delta)$,

1664: the displayed equation shows that

1665: the ball $B(\hat{x},\delta)$ is a sufficient statistic for $x$.

1666: This implies that $x$ is a typical element of $B(\hat{x},\delta)$,

1667: that is, $\K(x\oplus\hat x \mid \hat{x})=\K(x \mid \hat{x})=

1668: \K(x \mid B(\hat{x},\delta),p)$

1669: is close to $\log b(\delta)$.

1670: Here $p$ is an appropriate

1671: program of $O(\K(\delta))=O(\log n)$ bits.

1672: %This means that $x$ has distortion $\delta$ ($\delta n$ bits flipped)

1673: %with respect to $\hat{x}$.

1674:

1675: This provides a method of denoising via compression,

1676: at least in theory.

1677: In order to use the method practically, admittedly with a leap of faith,

1678: we ignore the ubiquitous $O(\log n)$ additive terms,

1679: and use real compressors to

1680: approximate the Kolmogorov complexity, similar to what was done in

1681: \cite{Li01,Li04}.

1682: The Kolmogorov complexity is not computable and can be approximated

1683: by a computable process from above but not from below, while a real

1684: compressor is computable. Therefore, the approximation of the Kolmogorov

1685: complexity by a real compressor involves for some arguments errors that can

1686: be high and are in principle unknowable. Despite all these caveats it turns

1687: out that the practical analogue of the theoretical method works surprisingly

1688: well in all experiments we tried \cite{RV06}.

1689:

1690: \begin{figure}

1691: \begin{center}

1692: \epsfxsize=3.5in

1693: \leftline{\hskip8pc\epsfbox{cross2_euclidean_edited.eps}}

1694: \end{center}

1695: \caption{Denoising of the noisy cross}

1696: \label{fig:cross}

1697: \end{figure}

1698:

1699: As an example, we approximated the distortion-rate

1700: function of a noiseless cross of very low

1701: Kolmogorov complexity, to which artificial noise was added to obtain

1702: a noisy cross~\cite{RV06}.

1703: Figure~\ref{fig:cross} shows two graphs. The first graph, hitting

1704: the horizontal axis at about 3100 bits, denotes the Hamming distortion

1705: on the vertical axis of the best

1706: model for

1707: the noisy cross with respect to the original noisy cross

1708:  at the rate given on the horizontal axis.

1709: The line hits zero distortion at model cost

1710: bit rate about 3100,

1711: when the original noisy cross is retrieved. The best model of the noisy cross

1712: at this rate, actually the original noisy cross,

1713: is attached to this point. The second graph, hitting the horizontal axis at

1714: about 250 bits, denotes on the vertical axis the Hamming distortion

1715: of the best

1716: model for the noisy cross with respect to the noiseless cross

1717: at the rate given on the horizontal axis.

1718: The line hits almost zero distortion (Hamming distance 3)

1719: at model cost bit rate about 250.  The best model of the noisy cross

1720: at this rate is attached to this point. (The three wrong bits

1721: are at the bottom left corner and upper right armpit.)

1722:  This coincides with a sharp slowing

1723: of the rate of decrease of the first graph. Subsequently, the second graph

1724: rises again because the best model for the noisy cross starts to model

1725: more noise. Thus, the second graph shows us the denoising of the noisy

1726: cross, underfitting left of the point of contact with the horizontal axis,

1727: and overfitting right of that point. This point of best denoising can

1728: also be deduced

1729: from the first graph, where it is the point where

1730: the distortion-rate curve sharply

1731: levels off.

1732: Since this point

1733: has distortion of only $3$ to the

1734: noiseless cross, the distortion-rate

1735: function separates structure and noise very well in this

1736: example.

1737:

1738: In the experiments in \cite{RV06} a specially written

1739: block sorting compression algorithm with a

1740: move-to-front scheme as described in \cite{BW94} was used.

1741: The algorithm is very

1742: similar to a number of common general purpose compressors, such as bzip2

1743: and zzip, but it is simpler and faster for small inputs; the source

1744: code (in C) is available from the authors of \cite{RV06}.

1745:

1746:

1747: \section{Algorithmic versus Probabilistic Rate-Distortion}\label{sect.algprobrd}

1748:

1749: %For every distortion family ${\A}$ satisfying property 2,

1750: %and ${\A}_n = {\A} \bigcap \{0,1\}^n$,

1751: Theorem~\ref{thm.dresf} shows that

1752: Shannon's rate-distortion function

1753: $r^n(\delta)$ of \eqref{eq.rndelta}

1754: for a random variable is pointwise related

1755: to the expected value

1756: of the rate-distortion functions $r_x(\delta)$ of the individual

1757: strings $x \in {\A}_n$

1758: (outcomes of the random variable with the expectation taken

1759: over the probabilities of the random variable).

1760: This result generalizes \cite{YS93,MK94,SE03}

1761: to arbitrary computable sources.

1762:

1763: Formally, probabilistic rate-distortion theory is treated in

1764: Appendix~\ref{sect.ratedistortion}.

1765: Let ${\mathbf X}$ and ${\mathbf Y}$ be finite alphabets where we

1766: take ${\mathbf X}=\{0,1\}$ for convenience.

1767: We generalize the setting from i.i.d.

1768: random variables to more general random variables.

1769: Let $X_1, X_2, \ldots , X_n$ be a sequence

1770: of, possibly dependent, random variables with values in ${\mathbf X}^n$

1771: such that

1772: $p(x_1x_2\ldots x_n) = P(X_1=x_1, X_2=x_2, \ldots , X_n=x_n)$

1773: is rational.  With $X=X_1, X_2, \ldots , X_n$ and

1774: $x=x_1x_2 \ldots x_n$,

1775: let $\K(X)$ denote

1776: the Kolmogorov complexity of the set of pairs

1777: $(x,p(x))$ ordered lexicographically.

1778: Let

1779: $E: {\mathbf X}^n \rightarrow {\mathbf Y}^n$ be a code.

1780: Define the Shannon rate-distortion function by

1781: \begin{equation}\label{eq.rndelta}

1782: r^n(\delta) = \min_E

1783: \{ \log |E({\mathbf X}^n)| :

1784: {\bf E} d(x,E(x)) \leq \delta \},

1785: \end{equation}

1786: the expectation ${\bf E}$

1787: taken over the probability mass function $p$.

1788: %Roughly speaking, we prove that

1789: %$r^n(\delta)$ is close to the $p$-expected value

1790: %of $r_x(\delta)$ for $x \in {\A}_n$ and distortion $\delta$.

1791:

1792: \begin{theorem}\label{thm.dresf}

1793: Let

1794: %the distortion family ${\A}$ satisfy property 2, and

1795: %${\A}_n= {\A} \bigcap \{0,1\}^n$. For every $n$ and string $x$

1796: %of length $n$,

1797: %let

1798: $E_0$ be a many-to-one coding function

1799: %achieving the minimum in

1800: %the righthand side of \eqref{eq.rndelta}

1801: defined by $E_0(x)=y$ with

1802: $d(x,y) \leq \delta$ and

1803: $r_x(\delta) = \K(y)$.

1804: Let $|x|=n$. Then,

1805: \[

1806:  {\bf E} r_x(\delta) - \Delta_1

1807: \leq  r^n (\delta)

1808: \leq \min \left\{{\bf E} r_x(\delta)+ \Delta_2 ,

1809:  \max_{x \in {\cal X}^n} r_x(\delta) \right\},

1810: \]

1811: with $\Delta_1 = O(\K(\delta,r^n,X,n))$,

1812: $\Delta_2 = H(L)-H(S)$ with $S(y)= \sum \{p(x): E_0(x)=y\}$,

1813: $L(y)$ is the uniform distribution over the $y$'s over $\mathbf{Y}^n$, and

1814: the expectation ${\bf E}$ is taken over $p$.

1815: \end{theorem}

1816: Note that we have taken ${\cal X}= {\X}= \mathbf{X}^n$

1817: and ${\Y}=\mathbf{Y}^n$.

1818: The $\Delta_1$ quantity satisfies $\lim_{n \rightarrow \infty} \Delta_1 /n =0$.

1819: The quantity $\Delta_2$ is small only in the case where we

1820: have asymptotic equidistribution. This is the original setting of Shannon.

1821: Though independence is not needed, for example ergodic stationarity guarantees

1822: asymptotic equidistribution.

1823:

1824:

1825:

1826:

1827:

1828: \appendix

1829:

1830: \subsection{Shannon Rate Distortion}\label{sect.ratedistortion}

1831: Classical rate-distortion theory

1832: was initiated by Shannon in \cite{Sh48,Sh59}, and %in his celebrated 1948 paper.

1833: we briefly recall his approach.

1834: Let

1835: ${\mathbf X}$ and ${\mathbf Y}$ be finite alphabets.

1836: A single-letter distortion measure is

1837: a function $d$ that maps elements of

1838: $\mathbf X \times \mathbf Y$ to the reals. Define the distortion between

1839: words $x$ and $y$ of the same length $n$ over alphabets

1840: ${\mathbf X}$ and ${\mathbf Y}$, respectively, by

1841: \[

1842: d^n(x,y)= \frac{1}{n}\sum_{i=1}^n d(x_i,y_i).

1843: \]

1844: Let $X$ be a random variable with values in

1845: ${\mathbf X}$. Consider the random variable $X^n$ with values in ${\mathbf X}^n$,

1846: that is, the sequence $X_1,\dots,X_n$ of $n$ independent

1847: copies of $X$.

1848: We want to encode words of length $n$ over ${\mathbf X}$ by words over ${\mathbf Y}$

1849: so that the number

1850: of all code words is small and

1851: the expected distortion between outcomes of $X^n$ and their

1852: codes is small.

1853: The tradeoff between the expected

1854: distortion

1855: and the number of code words used is expressed

1856: by the {\em rate-distortion} function

1857: denoted by $r^n(\delta )$ as in \eqref{eq.rndelta}. It

1858: maps every $\delta \in {\cal R}$

1859: to the minimal natural number

1860: $r$ (we call $r$ the \emph{rate})

1861: having the following property:

1862: There is an encoding function

1863: $E:{\mathbf X}^n \rightarrow {\mathbf Y}^n$ with a range of cardinality at most $2^r$

1864: such that

1865: the expected distortion between the outcomes of $X^n$

1866: and their corresponding codes is at most~$\delta$.

1867:

1868: In \cite{Sh59} Shannon gave the following nonconstructive

1869: asymptotic characterization of $r^n(\delta)$.

1870: Let $Z$ be a random variable with values in ${\mathbf Y}$.

1871: Let $H(Z)$, $H(Z \mid X)$ stand for the  Shannon entropy and conditional Shannon entropy,

1872: respectively. Let $I(X;Z)=H(Z)-H(Z \mid X)$ denote the mutual information

1873: in $X$ and $Z$, and  ${\bf E} d(X,Z)$ stand

1874: for the expected value of  $d(x,z)$ with respect to

1875: the joint probability $P(X=x, Z=z)$ of the random variables $X$ and $Z$.

1876: For a real $\delta$, let $R(\delta)$ denote

1877: the minimal $I(X;Z)$ subject to ${\bf E} d(X,Z)\leq \delta$.

1878: That such a minimum is attained for all $\delta$ can be shown

1879: by compactness arguments.

1880:

1881: \begin{theorem}\label{theo.shannon}

1882: For every $n$ and $\delta$ we have

1883: $r^n(\delta)\geq nR(\delta)$. Conversely,

1884: for every $\delta$ and every positive $\epsilon$,

1885: we have

1886: $r^n(\delta+\epsilon)\leq n(R(\delta)+\epsilon)$

1887: for all large enough $n$.

1888: \end{theorem}

1889:

1890: \subsection{Computability}\label{sect.computability}

1891:

1892: In 1936 A.M. Turing \cite{Tu36} defined the hypothetical `Turing machine'

1893: whose computations are

1894: intended to give an operational and formal definition

1895: of the intuitive notion of computability in the discrete domain.

1896: These Turing machines compute integer functions,

1897: the {\em computable} functions. By using pairs of integers for the

1898: arguments and values we can extend computable functions

1899: to functions with rational arguments and/or values.

1900: The notion of computability can be further

1901: extended, see for example \cite{LiVi97}:

1902: A

1903: function $f$ with rational arguments and real values is

1904: {\em upper semicomputable}

1905: if there is a computable

1906: function  $\phi (x,k)$ with

1907: $x$ a rational number and $k$ a nonnegative integer

1908: such that $\phi(x,k+1) \leq \phi(x,k)$ for every $k$ and

1909:   $\lim_{k \rightarrow \infty} \phi (x,k)=f(x)$.

1910: This means

1911:   that $f$ can be computably approximated from above.

1912: A function $f$ is

1913: {\em lower semicomputable}

1914:   if $-f$ is upper semicomputable.

1915:   A function is called

1916: {\em semicomputable}

1917:   if it is either upper semicomputable or lower semicomputable or both.

1918: If a function $f$ is both upper semicomputable and

1919: lower semicomputable,

1920: then $f$ is

1921: computable.

1922: A countable set $S$ is {\em computably (or recursively) enumerable}

1923: if there is a Turing machine $T$ that outputs all and only the elements of $S$

1924: in some order and does not halt. A countable set $S$ is

1925: {\em decidable (or recursive)}

1926: if there is a Turing machine $T$ that decides for every candidate $a$

1927: whether $a \in S$ and halts.

1928:

1929: \begin{example}\rm

1930: An example of a computable function is $f(n)$ defined as

1931: the $n$th prime number;

1932: an example of a function that is upper semicomputable

1933: but not computable is the Kolmogorov complexity function $\K$ in

1934: Appendix~\ref{sect.kolmcompl}. An example of a recursive set is the set

1935: of prime numbers; an example of a recursively enumerable

1936: set that is not recursive is $\{x \in {\cal N}: \K(x) < |x| \}$.

1937: \end{example}

1938:

1939: Let ${\cal X}=\{0,1\}^*$, and ${\Y}$ and the distortion measure $d$

1940: be given.

1941: Assume that ${\Y}$ is recursively (= computably) enumerable

1942: and the set

1943: $\{\pair{x,y,\delta}\in\mathcal X\times\Y\times\Q: d(x,y)\le \delta\}$

1944: is decidable.

1945: Then $r_x$ is upper semicomputable. Namely, to determine $r_x(\delta)$

1946: %with $|x|=n$

1947: proceed as follows.

1948: %We know $r_x(\delta) \leq n+O(\log n)$.%

1949: Recall that $U$ is the reference universal Turing machine.

1950: Run $U(p)$ for all $p$

1951: %$|p| \leq n +O(\log n)$%

1952: dovetailed fashion (in stage $k$ of the overall computation

1953: execute the $i$th computation step of the $(k-i)$th program).

1954: Interleave this computation

1955: with a process that recursively enumerates  ${\Y}$.

1956: Put all enumerated elements of ${\Y}$ in a set ${\cal W}$.

1957: %Initially, the best candidate program

1958: %$q$ has length $|q|= n+O(\log n)$.

1959: Whenever $U(p)$ halts we put the output in a set ${\cal U}$.

1960: After every step in the overall computation we determine the

1961: minimum length of a program $p$ such that $U(p) \in {\cal W} \bigcap {\cal U}$

1962: and $d(x,U(p))\le\delta$.

1963: We call $p$ a \emph{candidate} program.

1964: The minimal length of all candidate programs can only decrease

1965: in time and eventually becomes equal to $r_x(\delta)$. Thus,

1966: this process

1967: upper semicomputes $r_x (\delta)$.

1968:

1969: The function $g_x$ is also upper semicomputable. The proof is similar

1970: to that used to prove the upper semicomputability of $r_x$.

1971: It follows from \cite{VV02} that in general $d_x$,

1972: and hence its `inverse' $r_x$ and by Lemma~\ref{lem.rg}

1973: the function $g_x$, are not computable.

1974:

1975: Assume that the set $\Y$ is recursively enumerable and

1976: the set

1977: $\{\pair{x,y,\delta}\in\mathcal X\times\Y\times\Q: d(x,y)\le \delta\}$

1978: is decidable. Assume that the resulting distortion family $\mathcal

1979: A^{d,\Y}$

1980: satisfies Property 2.

1981: There is a relation between

1982: destination words and distortion balls. This relation is as follows.

1983:

1984: (i) Communicating a destination word $y$ for a source word $x$

1985:  knowing a rational upper bound

1986: $\delta$ for  the distortion $d(x,y)$

1987: involved is the same as communicating a

1988: distortion ball of radius $\delta$ containing $x$.

1989:

1990: (ii) Given (a list of the elements of) a distortion ball $B$

1991: we can upper semicompute

1992: the least distortion $\delta$ such that $B=B(y,\delta)$ for some $y\in\Y$.

1993:

1994: Ad (i). This implies that the function $\tilde r_x(\delta)$ defined

1995: in \eqref{eq.tilde} differs from $r_x(\delta)$

1996: by $O(\K(\delta)+\log |x|)$.

1997: See the proof of Lemma~\ref{lem.rg}.

1998:

1999: Ad (ii). Let

2000: $B$ be a given ball. Recursively enumerating ${\Y}$ and

2001: the possible $\beta\in \Q$,

2002: we find

2003: for every newly enumerated element $y \in {\Y}$

2004: whether $B(y, \beta)=B$ (see the proof of  Lemma~\ref{lem.rg}

2005: for an algorithm to find a list of elements of $B(y, \beta)$

2006: given $y,\beta$). Put these $\beta$'s

2007: in a set ${\cal W}$.

2008: Consider the least element of ${\cal W}$ at every computation step.

2009: This process upper semicomputes the

2010: least distortion  $\delta$ corresponding to

2011: the distortion ball $B$.

2012:

2013: \subsection{Kolmogorov Complexity}\label{sect.kolmcompl}

2014:

2015: For precise definitions, notation, and results see the text \cite{LiVi97}.

2016: Informally, the Kolmogorov complexity, or algorithmic entropy, $\K(x)$ of a

2017: string $x$ is the length (number of bits) of a shortest binary

2018: program (string) to compute

2019: $x$ on a fixed reference universal computer

2020: (such as a particular universal Turing machine).

2021: Intuitively, $\K(x)$ represents the minimal amount of information

2022: required to generate $x$ by any effective process.

2023: The conditional Kolmogorov complexity $\K(x \mid  y)$ of $x$ relative to

2024: $y$ is defined similarly as the length of a shortest binary program

2025: to compute $x$, if $y$ is furnished as an auxiliary input to the

2026: computation.

2027: %For technical reasons we use a variant of complexity,

2028: %so-called prefix complexity, which is associated with Turing machines

2029: %for which the set of programs resulting in a halting computation

2030: %is prefix free.

2031: %We realize prefix complexity by considering a special type of Turing

2032: %machine with a one-way input tape, a separate work tape,

2033: %and a one-way output tape. Such Turing

2034: %machines are called {\em prefix} Turing machines. If a machine $T$ halts

2035: %with output $x$

2036: %after having scanned all of $p$ on the input tape,

2037: %but not further, then $T(p)=x$ and

2038: %we call $p$ a {\em program} for $T$.

2039: %It is easy to see that

2040: %$\{p : T(p)=x, x \in \{0,1\}^*\}$ is a {\em prefix code}.

2041:

2042: Let $T_1 ,T_2 , \ldots$ be a standard enumeration

2043: of all (and only) Turing machines with a binary input tape,

2044: for example the lexicographic length-increasing ordered syntactic

2045: Turing machine descriptions, \cite{LiVi97},

2046: and let $\phi_1 , \phi_2 , \ldots$

2047: be the enumeration of corresponding functions

2048: that are computed by the respective Turing machines

2049: ($T_i$ computes $\phi_i$).

2050: These functions are  the

2051: {\em computable (or recursive)}

2052: functions. % (of effectively prefix-free encoded

2053: %arguments).

2054: %The {\em Kolmogorov complexity}

2055: %of $x$ is the length of the shortest binary program

2056: %from which $x$ is computed.

2057: For the development of the theory we

2058: actually require

2059: the Turing machines to use {\em auxiliary} (also

2060: called {\em conditional})

2061: information, by equipping the machines with a special

2062: read-only auxiliary tape containing this information at the outset.

2063: Let $\langle \cdot , \cdot \rangle$ be a computable one to one

2064: {\em pairing function}

2065: on the natural numbers (equivalently, strings)

2066: mapping $\{0,1\}^* \times \{0,1\}^* \rightarrow \{0,1\}^*$ with

2067: $|\langle u,v \rangle| \leq |u|+|v| +O(\log (|u|))$. (We need the extra

2068: $O(\log (|u|))$ bits to separate $u$ from $v$.

2069: For Kolmogorov complexity, it is essential that there

2070: exists a pairing function such that

2071: the length of $\langle u,v \rangle$ is equal to the sum of

2072: the lengths of $u,v$ plus a small value depending only on $|u|$.)

2073: We denote the function computed by a Turing machine $T_i$ with $p$ as input

2074: and $y$ as conditional information by

2075: $\phi_i(p,y)$.

2076:

2077: One of the main achievements of the theory of computation

2078: is that the enumeration $T_1,T_2, \ldots$ contains

2079: a machine, say $T_u$, that is computationally universal in that it can

2080: simulate the computation of every machine in the enumeration when

2081: provided with its index. It does so by computing a

2082: function $\phi_u$ such that

2083:    $\phi_u(\langle i, p\rangle,y)  = \phi_i (p,y)$

2084:     for all $i,p,y$.

2085:     We fix one such machine and designate it as the {\em reference universal

2086:     Turing machine} or {\em reference Turing machine} for short.

2087:

2088: \begin{definition}\label{def.KolmK}

2089:     The {\em conditional Kolmogorov complexity} of $x$ given $y$ (as

2090: auxiliary information) {\em with respect to Turing machine} $T_i$ is

2091:                   \begin{equation}\label{eq.KC}

2092:     \K_i(x \mid y) = \min_p \{|p|: \phi_i(p,y)=x \}.

2093:                   \end{equation}

2094: The {\em conditional Kolmogorov complexity} $\K(x \mid y)$ is defined

2095: as the conditional Kolmogorov complexity

2096: $\K_u (x \mid y)$ with respect to the reference  Turing machine $T_u$

2097: usually denoted by $U$.

2098: The {\em unconditional} version is set to  $\K(x)=\K(x  \mid \epsilon)$.

2099: \end{definition}

2100:

2101: Kolmogorov complexity $\K(x\mid y)$ has

2102: the following crucial property:

2103: $\K(x\mid y)\le \K_i(x \mid y)+c_i$ for

2104: all $i,x,y$, where $c_i$ depends only on

2105: $i$ (asymptotically, the reference Turing machine is not worse

2106: than any other machine).

2107: Intuitively, $\K(x\mid y)$ represents the minimal amount of information

2108: required to generate $x$ by any effective process from input $y$.

2109: %We denote the {\em shortest program} for $x\mid \epsilon$ by $x^*$; then

2110: %$\K(x)= |x^*|$.

2111: %(Actually, $x^*$ is the first shortest program for $x$ in

2112: %an appropriate standard enumeration of all programs for $x$

2113: %such as the halting order.)

2114: The functions $\K( \cdot)$ and $\K( \cdot \mid  \cdot)$,

2115: though defined in terms of a

2116: particular machine model, are machine-independent up to an additive

2117: constant

2118:  and acquire an asymptotically universal and absolute character

2119: through Church's thesis, see for example \cite{LiVi97},

2120: and from the ability of universal machines to

2121: simulate one another and execute any effective process.

2122:   The Kolmogorov complexity of an individual finite object was introduced by

2123: Kolmogorov \cite{Ko65} as an absolute

2124: and objective quantification of the amount of information in it.

2125: The information theory of Shannon \cite{Sh48}, on the other hand,

2126: deals with {\em average} information {\em to communicate}

2127: objects produced by a {\em random source}.

2128:  Since the former theory is much more precise, it is surprising that

2129: analogs of theorems in information theory hold for

2130: Kolmogorov complexity, albeit in somewhat weaker form.

2131: For example, let $X$ and $Y$ be random variables

2132: with a joint distribution. Then,

2133: $H(X,Y)\le H(X)+H(Y)$,

2134: where $H(X)$ is the entropy of the marginal

2135: distribution of $X$.

2136: Similarly, let $\K(x,y)$ denote $\K(\langle x,y \rangle)$

2137: where $\langle \cdot,\cdot \rangle$

2138: is a standard pairing

2139: function as defined previously and $x,y$ are strings.

2140: Then we have

2141: $\K(x,y)\le \K(x)+\K(y)+O(\log \K(x))$. Indeed, there is a

2142: Turing machine $T_i$ that provided with  $\langle p,q\rangle$

2143: as an input computes $\langle U(p),U(q)\rangle$

2144: (where $U$ is the reference Turing machine). By construction of $T_i$, we have

2145: $\K_i(x,y)\le \K(x)+\K(y)+O(\log \K(x))$, hence

2146: $\K(x,y)\le \K(x)+\K(y)+O(\log \K(x))$.

2147:

2148: Another interesting similarity is the following:

2149: $I(X;Y)=H(Y)-H(Y \mid X)$

2150:  is the (probabilistic)

2151: {\em information in random variable $X$ about random variable $Y$}.

2152: Here $H(Y \mid X)$ is the conditional entropy of $Y$

2153: given $X$.

2154: Since $I(X;Y)=I(Y;X)$ we call this symmetric quantity the {\em

2155: mutual (probabilistic) information}.

2156: \begin{definition}

2157: \label{def.mi}

2158: \rm

2159: The {\em (algorithmic)  information in $x$ about $y$}

2160: is $I(x:y)=\K(y)-\K(y\mid x)$,

2161: where $x,y$

2162: are finite objects like finite strings or finite sets of finite strings.

2163: \end{definition}

2164:

2165: It is  remarkable that also the algorithmic information

2166: in one finite object about another one is symmetric: $I(x:y)=I(y:x)$ up to

2167: an additive term logarithmic in $\K(x)+\K(y)$. This follows

2168: immediately from the {\em symmetry of information} property

2169: due to A.N. Kolmogorov and L.A. Levin:

2170: %Let $x^*$  denote the shortest program

2171: %for a finite string $x$,

2172: %or, if there are more than one of these, then $x^*$ is the first

2173: %one halting in a fixed standard enumeration of all halting programs.

2174: %Then, by definition, $\K(x)=|x^*|$.

2175: \begin{align}\label{eq.soi}

2176: \K(x,y) & = \K(x)+\K(y \mid x) + O(\log (\K(x)+\K(y))) \\

2177: & = \K(y)+\K(x \mid y)+O(\log (\K(x)+\K(y))) .

2178: \nonumber

2179: \end{align}

2180: %If $X,Y$ are random variables with a computable joint probability mass

2181: %function $p$, then the expectation of of the algorithmic mutual

2182: %information is close to the probabilistic mutual information.

2183:

2184:

2185:

2186: \subsection{Randomness Deficiency and Fitness}\label{sect.rd}

2187: Randomness deficiency of an element $x$ of

2188: a finite set $A$ according to Definition~\ref{def.rd} is

2189: related to the fitness of $x \in A$ (identified with the fitness

2190: of set $A$ as a model for $x$) in the sense of $x$ having most properties

2191: represented by the set $A$. Properties are identified with large

2192: subsets of $A$ whose Kolmogorov complexity is small (the `simple'

2193: subsets).

2194: \begin{lemma}\label{lemma.property}

2195: Let $\beta , \gamma$ be constants.

2196: Assume that $P$ is a subset of $A$ with

2197: $|P| \geq (1-2^{- \beta })|A|$  and

2198: $\K(P\mid A)\leq \gamma$.

2199: Then the randomness deficiency $\delta(x \mid A)$ of every

2200: $x\in A \setminus P$ satisfies

2201: $\delta(x \mid A)> \beta-\gamma-O(\log \log |A|)$

2202: \end{lemma}

2203: \begin{proof}

2204: Since $\delta (x \mid A) = \log |A|-\K(x\mid A)$

2205: and $\K(x\mid A) \leq \K(x\mid A,P)+\K(P\mid A) + O(\log \K(x\mid A,P))$,

2206: while $\K(x\mid A,P) \leq - \beta + \log |A|+O(1)\le \log |A|+O(1)$,

2207: we obtain

2208: $\delta(x \mid A)> \beta-\gamma-O(\log \log |A|)$.

2209: %which is large if $\beta$ is large and $\gamma$ and $\log \log |A|$ are small.

2210: \end{proof}

2211:

2212: The randomness deficiency measures our disbelief

2213: that $x$ can be obtained

2214: by random sampling in $A$ (where all elements of $A$ are

2215: equiprobable).

2216: For every $A$, the randomness deficiency of almost all

2217: elements of $A$ is small:

2218: The number of $x\in A$ with $\delta(x \mid A)>\beta$ is fewer than

2219: $|A|2^{-\beta}$. This can be seen as follows.

2220: The inequality $\delta(x \mid A)>\beta$ implies

2221: $\K(x \mid A)<\log |A|-\beta$.

2222: Since $1+2+2^2+\dots+2^{i-1}=2^i-1$,

2223: there are less than $2^{\log  |A|-\beta}$

2224: programs of fewer than

2225: $\log |A|-\beta$ bits. Therefore,

2226: the number of $x$'s satisfying

2227: the inequality

2228: $\K(x\mid A)<\log |A|-\beta$ cannot be larger.

2229: Thus, with high probability

2230: the randomness

2231: deficiency of an element

2232: randomly chosen in  $A$ is small.

2233: On the other hand, if $\delta(x \mid A)$ is small,

2234: then there is no way to refute the hypothesis

2235: that $x$ was obtained

2236: by random sampling from $A$: Every such

2237: refutation is based on a simply described property

2238: possessed by a majority of elements

2239: of $A$ but not by $x$. Here it is important that we consider

2240: only simply described properties, since otherwise

2241: we can refute the hypothesis by exhibiting the property

2242: $P=A \setminus \{x\}$.

2243:

2244:

2245: \subsection{Covering Coefficient for Hamming Distortion}\label{sect.exhamming}

2246:

2247: The authors find it difficult to believe that the covering result

2248: in the lemma below is new. But neither a literature search nor the

2249: consulting of experts has turned up an appropriate reference.

2250: \begin{lemma}\label{l2}

2251: Consider the distortion family ${\cal H}_n$.

2252: For all $0 \leq d\leq \delta\leq  \frac{1}{2}$ every Hamming ball of radius

2253: $\delta$ in ${\cal H}_n$

2254: can be covered by at most

2255: $\alpha_n b(\delta)/b(d)$

2256: Hamming balls of radius $d$ in ${\cal H}_n$,

2257: where $\alpha_n $ is a  polynomial in $n$.

2258: \end{lemma}

2259:

2260:

2261: \begin{proof}

2262: %If the lemma holds for even $n \geq 2$, then we

2263: %can delete the first bit of every $n$-length string involved and

2264: %have the lemma hold for strings of odd length $n-1$.

2265: %

2266: %Assume that $n$ is even.

2267: Fix a ball with center $y$ and radius $\delta = j/n \leq \frac{1}{2}$ where

2268: $j$ is a natural number.

2269: All the strings in the ball that are

2270: at Hamming distance at most $d$ from $y$

2271: can be covered by one ball

2272: of radius $d$ with center $y$.

2273: Thus it suffices,

2274: for every $\Delta$ of the form $i/n$ with

2275: $i= 2,3, \ldots ,j$

2276: (such

2277: that $d<\Delta\leq \delta$), to cover

2278: the set of all the strings at distance precisely $\Delta$ from $y$

2279: by  $n^{c+1} b(\delta)/b(d)$ balls of radius $d$

2280: for some fixed constant $c$.

2281: Then the ball $B(y, \delta)$ is covered by at most

2282: $j n^{c+1} b(\delta)/b(d) \leq n^{c+2} b(\delta)/b(d)$ balls of

2283: radius $d$.

2284:

2285: Fix

2286: $\Delta$ and let the Hamming sphere $S$ denote the set of all

2287: strings at distance precisely

2288: $\Delta$ from $y$.

2289: Let $f$ be the solution to the equation

2290: $d+f(1-2d)=\Delta$ rounded to the closest rational of the form $i/n$.

2291: Since $d<\Delta\leq  \delta\leq\frac{1}{2}$

2292: this equation has a unique solution and

2293: it lies in the closed real interval

2294: $[0,1]$.

2295: Consider a ball $B$ of radius $d$ with a random center $z$

2296: at distance

2297: $f$ from $y$. Assume that

2298: all centers at distance $f$ from $y$ are chosen with equal probabilities

2299: $1/s(f)$ where $s(f)$ is the number of points in a Hamming

2300: sphere of radius $f$.

2301: \begin{claim}\label{claim.prball}

2302: Let $x$ be a particular string in $S$. Then

2303: \[

2304: \Pr( x \in B) \geq \frac{b(d)}{n^c b(\delta)}

2305: \]

2306: for some fixed positive constant $c$.

2307: \end{claim}

2308:

2309: \begin{proof}

2310: Fix a string $z$ at distance  $f$ from $y$. We first claim

2311: that the ball $B$ of radius $d$ with center

2312: $z$ covers at least $b(d)/n^c$ strings in $S$.

2313: Without loss of generality,

2314:  assume that the string $y$ consists of only  zeros

2315: and string $z$ consists of $fn$ ones and $(1-f)n$ zeros.

2316: Flip a set of $fd n$ ones

2317: and a set  of

2318: $(1-f)d n$ zeros in $z$ to obtain a string $u$.

2319: The total number of flipped bits is equal to

2320: $d n$ and therefore $u$ is at distance $d$ from

2321: $z$. The number of ones in $u$ is

2322: $fn-fd n+(1-f)d n=\Delta n$ and

2323: therefore $u \in S$.

2324: Different choices of the positions of the same numbers of flipped bits

2325: result in different strings in

2326: $S$. The number of ways to choose the flipped bits is equal to

2327: $$

2328: \binom{fn}{fd n}\binom{(1-f)n}{(1-f)d n}.

2329: $$

2330: By Stirling's formula, this is at least

2331: $$

2332: 2^{fnh(d)+(1-f)nh(d)-O(\log n)}=

2333: 2^{nh(d)-O(\log n)}\ge

2334: \frac{b(d)}{n^c},

2335: $$

2336: where the last inequality follows from \eqref{binom-entropy}.

2337: Therefore a ball $B$ as above covers at least $b(d)/n^c$ strings

2338: of $S$.

2339: The probability

2340: that a ball $B$, chosen uniformly at random as above,

2341: covers a particular string $x\in S$ is the same for every such $x$

2342: since they are in symmetric position.

2343: The number of elements in a Hamming sphere

2344: is smaller than the cardinality of a Hamming ball of the same radius,

2345: $|S| \leq b(\delta)$.

2346: Hence with probability

2347: $$

2348: \frac{b(d)}{n^c |S|}\ge

2349: \frac{ b(d)}{n^c b(\delta)}

2350: $$

2351: a random ball $B$ covers a particular string $x$ in $S$.

2352: \end{proof}

2353:

2354: By Claim~\ref{claim.prball},

2355: the probability that a random ball $B$ does not cover a particular

2356: string $x \in S$ is at most $1-b(d)/(n^c b(\delta))$.

2357: The probability that no ball out of $N$ randomly drawn such

2358: balls $B$ covers

2359: a particular $x \in S$ (all balls are equiprobable) is at most

2360: \[

2361: \left(1-\frac{ b(d)}{n^c b(\delta)}\right)^N

2362: < e^{-N b(d)/(n^c  b(\delta))} .

2363: \]

2364: For $N = n^{c+1}  b(\delta)/ b(d)$,

2365: the exponent of the

2366: right-hand side of the last inequality is  $-n$,

2367: and the probability that $x$ is not covered is at most $e^{-n}$.

2368: This probability remains exponentially small even after

2369: multiplying by $|S| \leq 2^n$, the number of different $x$'s in $S$.

2370: Hence, with probability at least $1- (2/e)^n$

2371: we have that $N$ random balls

2372: of the given type cover all the strings in $S$.

2373: Therefore, there exists a deterministic selection of $N$

2374: such balls that covers all the strings in $S$.

2375: The lemma is proved.

2376: (A more accurate calculation shows that

2377: the lemma holds with $\alpha_n=O(n^4)$.)

2378: \end{proof}

2379:

2380: \begin{corollary}\label{cor.l2}

2381: \rm

2382: Since all strings of length $n$ are either in the Hamming ball

2383: $B(00\ldots 0, \frac{1}{2})$ or in the Hamming ball

2384: $B(11\ldots 1, \frac{1}{2})$ in ${\cal H}_n$,

2385: the lemma implies that the set $\{0,1\}^n$

2386: can be covered by at most

2387: \[

2388: N =  \frac{2\alpha_n  2^{n}}{b(d)}

2389: \]

2390: balls of radius $d$ for every $0 \leq d \leq \frac{1}{2}$.

2391: (A similar, but direct, calculation lets us

2392: replace the factor $2\alpha_n$ by $n$.)

2393: \end{corollary}

2394: %\begin{IEEEproof}

2395: %{\em of Corollary~\ref{cor.l2}.}

2396: %We will first prove

2397: %this corollary,

2398: %and then use the same method to prove the full lemma.

2399: %

2400: %Fix a string $x$. The probability that

2401: %$x$ is \emph{not} covered  by a randomly selected ball of radius $d$

2402: %is equal to  $1-b(d)2^{-n}$ (all balls are

2403: %equiprobable). Thus the probability that no ball out of

2404: %$N$ randomly selected balls of radius $d$ covers $x$ is

2405: %\[

2406: %(1-b(d)2^{-n})^N< e^{-N b(d)2^{-n}}.

2407: %\]

2408: %

2409: %

2410: %Choose $N=n2^{n}/b(d)$. Then the exponent in the right hand side of the last

2411: %displayed inequality is at most

2412: %$-n$, and the probability that  $x$ is not covered is less than

2413: %$e^{-n}$. This probability remains exponentially small even after

2414: %multiplying by $2^n$, the number of different $x$'s.

2415: %Hence, with probability close to 1, $N$ random balls

2416: %cover all the strings of length $n$.

2417: %\end{IEEEproof}

2418:

2419: \subsection{Proofs of the Theorems}

2420: \label{sect.proofs}

2421:

2422: \begin{proof}

2423: {\em of Theorem}~\ref{theo.allshapesrd}.

2424: (i)  Lemma~\ref{lem.shapesg} (assuming properties 1 through 4)

2425: implies that

2426: the canonical structure function $g_x$ of every string $x$ of length

2427: $n$ is close to some function in the family $G_n$. This can be seen

2428: as follows. Fix $x$ and

2429: construct $g$ inductively for $n, n-1, \ldots , 0$. Define

2430: $g(n)=0$

2431: and

2432: $$

2433: g(l-1)=\left\{\begin{array}{ll}

2434: g(l)+1 & \text{if } g(l)<g_x(l-1),\\

2435: g(l) & \text{otherwise.}

2436: \end{array}\right.

2437: $$

2438: By construction this function belongs

2439: to the family $G_n$.

2440: Let us show that

2441: $

2442: g_x(l)=g(l)+O(\log n)$.

2443: First, we prove that

2444: \begin{equation}\label{eq.left}

2445: g(l) \leq g_x(l)

2446: \end{equation}

2447: by induction on $l=n,n-1, \ldots , 0$.

2448: For $l=n$ the inequality is straightforward, since

2449: by definition $g(n)=0$.

2450: Let $0< l\leq n$.

2451: Assume that $g(i)\le g_x(i)$ for $i=n,n-1, \ldots , l$.

2452: If $g(l) < g_x(l-1)$ then $g(l-1)= g(l)+1$ and therefore

2453: $g(l-1) \leq g_x(l-1)$. If $g(l) \geq g_x(l-1)$ then

2454: $g(l-1) = g(l) \geq g_x(l-1)\ge g_x(l)\ge g(l)$ and hence

2455: $g(l-1) = g_x(l-1)$.

2456:

2457: Second, we prove that

2458: \[

2459: g_x(l)\le g(l)+O(\log n)

2460: \]

2461: for every $l=0,1,\ldots, n$.

2462: Fix an $l$ and consider the least

2463: $m$ with $l \leq m \leq n$ such that $g_x(m)=g(m)$.

2464: If there is no such $m$ we take $m=n$ and observe

2465: that $g_x(n)=O(\log n)= g(n)+ O(\log n)$.

2466: This way, $g_x(m)=g(m)+O(\log n)$ and for every $l<l'\le m$

2467: we have $g(l'-1)<g_x(l'-1)$ due to inequality \eqref{eq.left}

2468: and definition of $m$.

2469: Then

2470: $g_x(l'-1)>g(l'-1)\ge g(l')$, since we know that $g$ is nonincreasing.

2471: Then, by the definition of $g$ we have $g(l'-1)=g(l')+1$.  Thus

2472: we have

2473: $g(l)=g(m)+m-l$.

2474: Hence,

2475: $g_x(l)\le g_x(m)+m-l+O(\log n) = g(m)+m-l+O(\log n)=g(l)+O(\log n)$,

2476: where the inequality follows from Lemma~\ref{lem.shapesg},

2477: the first equality from the assumption that $g_x(m)=g(m)+O(\log n)$,

2478: and the second equality from the previous sentence.

2479:

2480:

2481: (ii)

2482: In Theorem IV.4

2483: in \cite{VV02} we proved a similar statement

2484: for the special distortion family ${\cal L}$

2485: with an error term of $O(\log n)$.

2486: However, for the special case ${\cal L}$

2487: we can let $x$ be equal to the first $x$

2488: satisfying the inequality

2489: $g_x(l)\ge g(l)-O(\log n)$ for every $l$.

2490: In the general case this does not work any more.

2491: Here we construct $x$ together with sets

2492: ensuring the inequalities

2493: $g_x(l)\le g(l)+O(\sqrt{n\log n})$ for every $l=0,\dots,n$.

2494:

2495: The construction is as follows.

2496: Divide the segment $\{0,1,\dots,n\}$ into

2497: $N=\sqrt{n/\log n}$ subsegments of length $\sqrt {n\log n}$ each.

2498: Let

2499: $l_0=n>l_1>\dots>l_N=0$ denote the end points of the

2500: resulting subsegments.

2501:

2502:

2503: To find the desired $x$, we

2504: run the nonhalting algorithm below that takes

2505: $n$ and ${\A}_n$ as input

2506: %covering coefficient $\alpha_n$,

2507: together with the values

2508: of the function $g$ in the points $l_0,\dots,l_N$.

2509: Let $\delta (n)$ be a computable integer valued

2510: function of $n$ of the order $\sqrt {n\log n}$

2511: that will be specified later.

2512: \begin{definition}

2513: \rm

2514: Let $i=0,1,\dots,N$.

2515: A set $F\in\A_n$ is called {\em $i$-forbidden}

2516: if $|F|\le 2^{l_i}$ and

2517: $\K(F) < g(l_i)-\delta (n)$.

2518: A set is called {\em forbidden} if

2519: it is $i$-forbidden for some $i=0,1,\dots,N$.

2520: \end{definition}

2521: We wish to find an $x$ that is outside all forbidden sets

2522: (since this guarantees that $g_x(l_i)\ge g(l_i)-\delta (n)$ for every $i$).

2523: Since $\K(\cdot)$ is upper semicomputable, moreover

2524: property 3 holds, and we are also given $n$ and $g(l_0),\dots,g(l_N)$,

2525: we are able to find all forbidden sets using the following

2526: subroutine.

2527:

2528: \textbf{Subroutine $(n,{\A}_n, g(l_0),g(l_1), \ldots , g(l_N))$:}

2529: \begin{quote}

2530: for every

2531: $F\in \A_n$

2532: upper  semicompute

2533: $\K(F)$; every time we find

2534: $\K(F) < g(l_i)-\delta (n)$

2535: and $|F|\le 2^{l_i}$ for some $i$ and $F$, then print $F$.

2536: {\bf End of Subroutine}

2537: \end{quote}

2538:

2539: This subroutine prints all the forbidden sets in some order. Let

2540: $F_1,\dots,F_T$ be that order. Unfortunately

2541: we do not know when the subroutine will

2542: print the last forbidden set. In other words, we do not

2543: know the number $T$ of forbidden sets. To overcome this problem,

2544: the algorithm will run the subroutine and every time a new

2545: forbidden set $F_t$ is printed, the algorithm will

2546: construct {\em candidate sets}

2547: $B_0(t),\dots,B_N(t)\in\A_n$ satisfying $|B_i(t)|\le 2^{l_i}$ and

2548: $\K(B_i(t)) \le g(l_i)+\delta (n)$

2549: and the following condition

2550: \begin{equation}\label{eq.capcup}

2551: \bigcap_{j=0}^{N}B_j(t) \setminus \bigcup_{j=1}^{t}

2552: F_j\ne \emptyset ,

2553: \end{equation}

2554: for every $t=0,\dots,T$.

2555: For $t=T$ the set $\bigcup_{j=1}^{t}

2556: F_j$ is the union of all forbidden sets, which guarantees the bounds

2557: $g(l_i)-\delta (n)\le g_x(l_i)\le g(l_i)+\delta (n)$

2558: for all $x$ in the set in the left hand side of \eqref{eq.capcup}.

2559: Then we will

2560: prove that these bounds imply that

2561: $g(l)-\delta (n)\le g_x(l)\le g(l)+\delta (n)$

2562: for \emph{every} $l=0,\dots,n$.

2563: Each time a new forbidden set

2564: appears (that is, for every $t=1,\dots,T$)

2565: we will need to update candidate sets so that \eqref{eq.capcup} remains

2566: true. To do that we will maintain a stronger

2567: condition than just non-emptiness of the left hand side of \eqref{eq.capcup}.

2568: Namely, we will maintain the following invariant:

2569: for every $i=0,1, \ldots,  N$,

2570: \begin{equation}\label{eq.invariant}

2571: \left| \bigcap_{j=0}^{i} B_j(t) \setminus \bigcup_{j=1}^{t}

2572: F_j \right| \geq

2573: 2^{l_i-i-1}\alpha_n^{-i}.

2574: \end{equation}

2575: Note that for $i=N$ inequality \eqref{eq.invariant} implies

2576: \eqref{eq.capcup}.

2577:

2578:

2579: {\bf Algorithm

2580: $(n,{\A}_n, g(l_0),g(l_1), \ldots , g(l_N))$:}

2581: \begin{description}

2582: \item

2583: %{\bf Step 1.}

2584: %

2585: %Find set $B_i$ in ${\A}_n$

2586: %of cardinalities at most $2^{l_i}$ such

2587: %that

2588: %$$

2589: %\left|B_0\bigcap\dots\bigcap B_i \right|\ge 2^{l_i-i-1}\alpha_n^{-i}.

2590: %$$

2591: %We will amply fulfil the requirement by

2592: %producing sets with a much larger intersection---without

2593: %the factor of $2^{-i-1}$. {\bf \}}

2594: %{\bf \{}The sets $B_i$ with $i=1,2, \ldots , N$ are constructed inductively.

2595: %Assume that $B_0,\dots,B_i$ are already defined, and the cardinality of their

2596: %joint intersection is at least $2^{l_i}\alpha_n^{-i}$.{\bf \}}

2597: {\bf Initialize.}

2598: Recall that $l_0=n$.

2599: Define the set $B_0(t)=\booln$ for every $t$.

2600: This set is in ${\A}_n$ by property 1.

2601:

2602: {\bf for } $i := 1, \ldots , N$ {\bf do}

2603:

2604: Assume inductively that

2605: $|B_0(0) \bigcap B_1(0) \bigcap \cdots \bigcap B_{i-1} (0)|

2606: \geq 2^{l_{i-1}} \alpha_n^{-i+1}$, where $\alpha_n$

2607: denotes a polynomial upper bound of the covering

2608: coefficient of distortion family ${\A}_n$ existing by property 4.

2609: (The value $\alpha_n$ can be computed from $n$.)

2610: Note that this inequality is satisfied

2611: for $i=1$.

2612: Construct $B_{i}(0)$ by

2613: covering $B_{i-1}(0)$ by at most

2614: $\alpha_n 2^{l_{i-1}-l_{i}}$ sets of cardinality at most

2615: $2^{l_{i}}$

2616: (this cover exists in ${\A}_n$ by property 4).

2617: Trivially, this cover also covers

2618: $B_0(0)\bigcap\dots\bigcap B_{i-1}(0)$.

2619: The intersection of at least one of the covering

2620: sets with $B_0(0)\bigcap\dots\bigcap B_{i-1}(0)$ has cardinality at least

2621: $$

2622: \frac{2^{l_{i-1}}\alpha_n^{-i+1}}{\alpha_n 2^{l_{i-1}-l_{i}}}=

2623: 2^{l_{i}}\alpha_n^{-i}.

2624: $$

2625: Let $B_{i}(0)$ be the first such covering set in a given standard order.

2626: {\bf od}

2627:

2628: Notice that after the Initialization the invariant~\eqref{eq.invariant}

2629: is true for $t=0$, as $\bigcup_{j=1}^tF_j=\emptyset$.

2630: For every $t=1,2,\dots$ perform the following steps 1 and 2

2631: maintaining the

2632: invariant~\eqref{eq.invariant}:

2633:

2634: \item {\bf Step 1.}

2635: Run the subroutine and wait until $t$th forbidden set $F_t$ is printed

2636: (if $t>T$ the algorithm waits forever and never

2637: proceeds to Step 2).

2638:

2639: \item{\bf Step 2.}

2640:

2641: {\bf Case 1.} For every $i = 0,1, \ldots , N$

2642: we have

2643: \begin{equation}

2644: \label{eq.inv}

2645: \left|\bigcap_{j=0}^i B_j(t-1) \setminus \bigcup_{j=1}^t

2646: F_j \right| \geq 2^{l_i-i-1}\alpha_n^{-i}.

2647: \end{equation}

2648: Note that this inequality has one more

2649: forbidden set compared to the invariant~\eqref{eq.invariant}

2650: for $t-1$ (the argument in $B_j(t-1)$), and thus may be false.

2651: If \eqref{eq.inv} nevertheless holds for every $i$, then

2652: we let $B_i(t)=B_i(t-1)$ for every

2653: $i=1, \ldots , N$ (this setting maintains invariant~\eqref{eq.invariant}).

2654:

2655: {\bf Case 2.} Assume that

2656: \eqref{eq.inv} is false

2657: for some index $i$.

2658: In this case

2659: find the least such index (we will use later that \eqref{eq.inv}

2660: is true for all $i'<i$).

2661:

2662: We claim that $i>0$. That is,

2663: the inequality \eqref{eq.inv} is true for $i=0$.

2664: In other words,

2665: the cardinality of $F_1\bigcup \cdots \bigcup F_t$ is not

2666: larger than half

2667: of the cardinality of $B_0(t-1)=\booln$.

2668: Indeed, for every fixed $i$ the total cardinality of all the sets

2669: of simultaneously cardinality at most $2^{l_i}$

2670: and Kolmogorov complexity less than $g(l_i)-\delta (n)$ does not exceed

2671: $2^{g(l_i)-\delta (n)}2^{l_i}$.

2672: Therefore, the total number of elements in

2673: $\bigcup_{j=1}^t F_j$

2674: is at most

2675: $$

2676: \sum_{i=0}^N2^{g(l_i)-\delta (n) +l_i}\le

2677: (N+1)2^{g(\dmax)-\delta (n) +n}=

2678: (N+1)2^{n- \delta (n) }\ll 2^{n-1}= \frac{1}{2}\left|\booln \right|,

2679: $$

2680: where the first inequality follows since the function $g(l)+l$

2681: is monotonic nondecreasing, the first equality since

2682: $g(\dmax)=0$ by definition,

2683: and the last inequality since we will set $\delta(n)$

2684: at order of magnitude $\sqrt{n \log n}$.

2685:

2686: %Without loss of generality, assume

2687: %$i$ is the least such index.

2688:

2689: First let $B_k(t)=B_k(t-1)$ for all $k<i$ (this

2690: maintains invariant~\eqref{eq.invariant} for all $k<i$).

2691: To define $B_i(t)$ find a covering

2692: of $B_{i-1}(t)$ by at most

2693: $\alpha_n 2^{l_{i-1}-l_i}$

2694: sets in ${\A}_n$ of cardinality at most $2^{l_i}$.

2695: Since~\eqref{eq.inv}

2696: is true for index $i-1$, we have

2697: \begin{equation}\label{eq.inter}

2698: \left| \bigcap_{j=0}^{i-1} B_j(t) \setminus

2699: \bigcup_{j=1}^t

2700: F_j \right|

2701:  \geq

2702: 2^{l_{i-1}-i}\alpha_n^{-i+1}.

2703: \end{equation}

2704: Thus

2705: the greatest cardinality of an intersection of the set in \eqref{eq.inter}

2706: with a covering set is at least

2707: $$

2708: \frac{2^{l_{i-1}-i}\alpha_n^{-i+1}}{\alpha_n 2^{l_{i-1}-l_i}}

2709: = 2^{l_i-i}\alpha_n^{-i}.

2710: $$

2711: Let $B_i(t)$ be

2712: the first such covering set in standard order.

2713: Note that $2^{l_i-i}\alpha_n^{-i}$ is at least

2714: twice the

2715: threshold required by invariant~\eqref{eq.invariant}.

2716: Use the same procedure to obtain successively $B_{i+1}(t),\dots,B_N(t)$.

2717: %Finally, define $B_j (t)= B_j(t-1)$ for every $0 \leq j \leq i-1$.

2718: \end{description}

2719:

2720: {\bf End of Algorithm}

2721:

2722: Although the algorithm does not halt,

2723: at some unknown time  the last forbidden set $F_T$ is enumerated.

2724: After this time the candidate sets are not changed anymore.

2725: The invariant \eqref{eq.invariant} with $i=N$ shows that the cardinality

2726: of the set in the left hand side of \eqref{eq.capcup} is

2727: positive

2728: %at least

2729: %$2^{l_N-N-1} /\alpha_n^{-N} > 0$  since $l_N=0$, $N = \sqrt{n/ \log n}$

2730: %and $\alpha_n$ polynomial in $N$. Hence, \eqref{eq.capcup} holds.

2731: hence the set is not empty.

2732:

2733:

2734: Next we show that $\K(B_i(t))\le g(l_i)+\delta(n)$

2735: for every $i$ and every $t=1,\ldots,T$. We will see

2736: that to this end it suffices to upperbound

2737: the number of changes of each candidate set.

2738:

2739: \begin{definition}

2740: \rm

2741: Let $m_i$ be the {\em number of changes of $B_i$}

2742: defined by

2743: $m_i = |\{t: B_i(t) \neq B_i (t-1), \; 1 \leq t\le T \}|$ for

2744: $0 \leq i \leq N$.

2745: \end{definition}

2746: \begin{claim}\label{claim.mi}

2747: \rm

2748: $m_i \leq 2^{g(l_i)+i}$ for $0 \leq i \leq N$.

2749: \end{claim}

2750: \begin{proof}

2751: The Claim is proved by induction on $i$. For

2752: $i=0$ the claim is true,

2753: since $l_0 = n$ and $g(n)=0$ while $m_0=0$ by

2754: initialization in the Algorithm ($B_0$ never changes).

2755:

2756: ($i > 0$): assume that the Claim

2757: is satisfied for every $j$ with $0 \leq j < i$.

2758: We will prove that $m_i\le 2^{g(l_i)+i}$ by counting

2759: separately the number of changes of $B_i$ of different types.

2760:

2761: {\bf Change of type 1.} The set $B_i$ is changed when

2762: \eqref{eq.inv}

2763: is false for an index strictly

2764: less than $i$.

2765: The number of these changes is at most

2766: \[

2767: m_{i-1} \leq 2^{g(l_{i-1})+i-1} \leq 2^{g(l_{i})+i-1},

2768: \]

2769: where the first inequality follows from the inductive assumption,

2770: and the second inequality by the property of $g$ that it

2771: is nonincreasing.

2772: Namely, since $l_{i-1} > l_i$  we have

2773: $g(l_{i-1}) \leq g(l_i)$.

2774: % $g(j)+j$ is

2775: %is nondecreasing.

2776: %Namely, since $l_{i-1} = l_i + \sqrt{n \log n}$  we have

2777: %therefore that $g(l_i)+l_i \leq g(l_{i-1})+l_{i-1}$ and hence

2778: %$g(l_i) \leq g(l_{i-1})+l_{i-1}-l_i = g(l_{i-1})+\sqrt{n \log n}$.

2779:

2780: {\bf Change of type 2.}  The inequality \eqref{eq.inv}

2781: is false for $i$ and is true for all smaller indexes.

2782: %To upper bound the number of changes of this type divide

2783: %them again in two categories, recalling the notion

2784: %of the forbidden sets:  the sets in $\A_n$

2785: %of simultaneously cardinality at most

2786: %$2^{l_j}$ and complexity less than

2787: %$g(l_j)-\delta(n)$ for $0 \leq j \leq N$.

2788:

2789: {\bf Change of type 2a.}

2790: After the last change of

2791: $B_i$ at least one $j$-forbidden set for some $j<i$

2792: has been enumerated.

2793: The number of changes of this type is at most the number of

2794: $j$-forbidden sets for $j=0,\dots,i-1$. For every such $j$

2795: these forbidden sets have by definition Kolmogorov complexity less than

2796: $g(l_{j}) - \delta (n)$.

2797: %These $j$'s concerned satisfy $0 \leq j < i$.

2798: Since $l_j \ge l_i$ and $g$

2799: is monotonic nonincreasing we have

2800: $g(l_{j}) \leq g(l_{i})$.

2801: Because there are at most $N$ of these $j$'s,

2802: the number of such forbidden sets is at most

2803: $$N2^{g(l_i)-\delta(n)}\ll 2^{g(l_i)},$$

2804: since we will later choose

2805: $\delta(n)$ of order $\sqrt{n \log n}$.

2806:

2807: {\bf Change of type 2b.}

2808: Finally, for every change of this type, between the last

2809: change of

2810: $B_i$ and the current one

2811: no candidate sets with indexes less than

2812: $i$ have been changed and no $j$-forbidden  sets

2813: with $j<i$ have been enumerated.

2814: Since after the last change of $B_i$ the cardinality of the set in the

2815: left-hand side of \eqref{eq.invariant} was at least

2816: $2^{l_i-i} \alpha_n^{-i}$, which is twice the threshold

2817: in the right-hand side

2818: by the restoration of the invariant in the Algorithm Step 2, Case 2,

2819: the following must hold.

2820: The cardinality of

2821: $\bigcup_{j=1}^t F_j$ increased  by  at least

2822: $2^{l_i-i-1}\alpha_n^{-i}$ since the last change of $B_i$,

2823: and this must be due to enumerating

2824: $j$-forbidden sets for $j=i,\dots,N$.

2825: For every such $j$

2826: every $j$-forbidden

2827: set has cardinality at most $2^{l_j}$

2828: and Kolmogorov complexity less than

2829: $g(l_{j}) - \delta (n)$.

2830: Hence the total number of elements in all

2831: $j$-forbidden sets is less than $2^{l_j}2^{g(l_{j}) - \delta (n)}$.

2832: Since $j\geq i$ and hence $l_j \leq l_i$ while $g(l)+l$

2833: is monotonic nondecreasing we have

2834: $g(l_{j})+l_j \leq g(l_{i})+l_i$.

2835: Because there are at most $N+1$ of these $j$'s,

2836: the total number of elements in all those sets does not exceed

2837: $M=(N+1)2^{g(l_i)-\delta (n)+l_i}$.

2838: %After the last change of

2839: %$B_i$ no forbidden set of cardinality greater

2840: %than $2^{l_i}$ has been enumerated.

2841: The number

2842: of changes of this type is not more than the total number $M$

2843: of elements involved divided by the increments of size

2844: $2^{l_i-i-1}\alpha_n^{-i}$. Hence it is not more than

2845: $$(N+1)2^{g(l_i)-\delta (n)}2^{i+1}\alpha_n^{i}.$$

2846: Let

2847: \begin{align}\label{eq.deltan}

2848: &\delta (n) \geq \log ((N+1)2^{i+10}\alpha_n^{i})

2849: \; \; {\rm and }

2850: \\&\delta (n) =

2851: O (N\log(2\alpha_n))=O(\sqrt{n/\log n} \; \log(2\alpha_n))=

2852: O (\sqrt{n\log n}),

2853: \nonumber

2854: \end{align}

2855: where the last equality uses that $\alpha_n$ is polynomial

2856: in $n$ by property 4.

2857: Then,

2858: the number of changes of type 2b is much less than  $2^{g(l_i)}$.

2859:  The value of $\delta(n)$ can be computed from $n$.

2860:

2861: Summing the numbers of changes of types 1, 2a, and 2b we obtain

2862: $m_i \leq 2^{g(l_i)+i}$, completing the induction.

2863: \end{proof}

2864: \begin{claim}\label{claim.gx}

2865: \rm

2866: Every $x$ in the nonempty set  \eqref{eq.capcup} satisfies

2867: $|g_x(l_i) -  g(l_i)| \leq \delta (n)$

2868: with $\delta (n) = O(\sqrt{n \log n})$

2869: for $i=0,1, \ldots , N$.

2870: \end{claim}

2871: \begin{proof}

2872: By construction $x$  is not an element of any forbidden set

2873: in $\bigcup_{t=1}^T F_t$, and therefore

2874: \[

2875: g_x(l_i) \geq g(l_i) - \delta (n)

2876: \]

2877: for every $i=0,1, \ldots , N$.

2878: By construction $|B_i(T)| \leq 2^{l_i}$, and

2879: to finish the proof it remains to show that

2880: $\K(B_i (T))

2881: \leq g(l_i)+\delta (n)$ so that

2882: $g_x(l_i) \leq g(l_i)+\delta(n)$,

2883: for $i=0,1, \ldots,  N$.

2884: Fix $i$.

2885: The set $B_i(T)$ can be

2886: described by a constant length

2887: program, that is $O(1)$ bits,

2888: that runs the Algorithm and uses the following

2889: information:

2890: \begin{itemize}

2891: \item

2892: A description of

2893: $i$ in $\log N\le\log n$ bits.

2894: \item

2895: A description of

2896: the distortion family $\A_n$ in $O(\log n)$ bits by property 3.

2897: \item

2898: The values of $g$ at the points $l_0,\dots,l_N$

2899: in $N\log n=\sqrt{n\log n}$ bits.

2900: \item

2901: The description of $n$ in $O(\log n)$ bits.

2902: \item

2903: The total number $m_i$

2904: of changes (Case 2 in the Algorithm)

2905: to intermediate versions of $B_i$ in $\log m_i$ bits.

2906: \end{itemize}

2907: We count the number of bits in the description of

2908: $B_i(T)$. The description is effective and by Claim~\ref{claim.mi} with

2909: $i \leq N = \sqrt{n/\log n}$ it

2910: takes at most $g(l_i) + O(\sqrt{n \log n})$ bits. So this is an

2911: upper bound on the Kolmogorov complexity $\K(B_i(T))$.

2912: Therefore, for some $\delta(n)$ satisfying \eqref{eq.deltan} we have

2913: %by Definition~\ref{def.gx} of $g_x$ we obtain

2914: \[

2915: g_x(l_i) \leq g(l_i)+ \delta(n),

2916: \]

2917: for every $i = 0,1, \ldots, N$.

2918: The claim follows from the first and the last displayed

2919: inequalities in the proof.

2920: \end{proof}

2921:

2922:

2923: Let us show that the statement

2924: of Claim~\ref{claim.gx}

2925: holds not only for the subsequence of values $l_0,l_1, \ldots , l_N$

2926: but for every $l=0,1, \ldots , n$.

2927: %  when we replace $\delta (n)$

2928: %by $O(\sqrt{n \log n})$, and so prove the theorem.

2929:

2930: Let $l_i \leq l \leq l_{i-1}$.

2931: Both functions $g(l),g_x(l)$ are nonincreasing so that

2932: \begin{align*}

2933: &g(l)\in[g(l_{i-1}),g(l_{i})],\\

2934: &g_x(l)\in[g_x(l_{i-1}),g_x(l_{i})]

2935: \subseteq[g(l_{i-1})-O(\sqrt{n\log n}),g(l_{i})+O(\sqrt{n\log n})].

2936: \end{align*}

2937: By the

2938: spacing of the sequence of $l_i$'s

2939: the length of the segment

2940: $[g(l_{i-1}),g(l_{i})]$ is at most

2941: $$

2942: g(l_{i})-g(l_{i-1})\le l_{i-1}-l_{i}

2943:  = \sqrt{n\log n}.

2944: $$

2945: If there is an $x$ such that Claim~\ref{claim.gx}

2946: holds for every $l_i$ with $i=0, \ldots , N$, then

2947: it follows from the above that

2948: $|g(l)-g_x(l)|\le\sqrt{n\log n}+O(\sqrt{n\log n})$ for every $l=0,1, \ldots, n$.

2949: \end{proof}

2950: \vspace{.2in}

2951:

2952: \begin{proof}

2953: {\em of Theorem}~\ref{th-shannon-analog}.

2954: We start with Lemma~\ref{th5} stating a combinatorial fact

2955: that is interesting

2956: in its own right, as explained further in Remark~\ref{rem.previously}.

2957:

2958:

2959: \begin{lemma}\label{th5}

2960: Let $n,m,k$ be natural numbers and

2961: $x$ a string of length $n$. Let ${\BB}$ be a family

2962: of subsets of $\{0,1\}^n$ and

2963: ${\BB}(x) = \{B \in {\BB}: x \in B \}$.  If

2964: ${\BB}(x)$ has at least $2^m$ elements (that is, sets) of

2965: Kolmogorov complexity less than $k$, then

2966: there is an element in ${\BB}(x)$ of Kolmogorov complexity

2967: at most $k-m+O(\K(\BB)+\log n +\log k+\log m)$.

2968: \end{lemma}

2969:

2970:

2971: \begin{proof}

2972: Consider a game between Alice and Bob. They alternate moves

2973: starting with Alice's move.

2974: A move of Alice consists in producing a

2975: subset of $\booln$. A move of

2976: Bob consists in marking some sets previously produced by

2977: Alice (the number of marked sets can be 0).

2978: %There are two versions of the game: the on-line version and the off-line one.

2979: %In the on-line game, C wins if, following every one of his moves,

2980: %every $x\in\X$ that is covered at least $2^k$ times

2981: %by P's sets belongs to a marked set.

2982: %In the off-line game C wins if this condition holds after his last move.

2983: Bob wins if after every one of his moves

2984: every $x\in\X$ that is covered by at least $2^m$

2985: of Alice's sets

2986: belongs to a marked set.

2987: %It is important that this condition is checked

2988: %following every one of C's moves: C cannot

2989: %postpone marking until all P's sets appear.

2990: The length of a play is decided by Alice. She

2991: may stop the game after any of Bob's moves. However the

2992: total number of her moves (and hence Bob's moves)

2993: must be less than $2^k$.

2994: (It is easy to see that without loss of generality

2995: we may assume that Alice makes exactly $2^k-1$ moves.)

2996: Bob can easily win if he marks every set produced by Alice.

2997: However, we want to minimize the total number of marked sets.

2998:

2999: \begin{claim}\label{l53}

3000: %In the off-line game, Consumer has a winning strategy

3001: %that marks at most $2^{r-k}\log|\X|$ sets.

3002: Bob has a winning strategy

3003: that marks at most $O(2^{k-m}k^{2}n)$ sets.

3004: %with $\alpha, \beta$ constants.

3005: \end{claim}

3006:

3007: \begin{proof}

3008: %\begin{remark}

3009: %Remark.

3010: %In the proof of Lemma~\ref{l53} we have not

3011: %presented any explicit strategy for C.

3012: %Here is  a winning strategy

3013: %that marks $2^{r-k} r^2 \ln |\X|$ sets.

3014: We present an explicit

3015: %deterministic and constructive

3016: strategy for Bob, which consists in

3017: %($\tau = 2^k$ moves.)

3018: %Bob's strategy

3019: %with $\tau=2^k$ moves

3020: executing at every move $t=1,2, \ldots ,2^k -1$

3021: the following algorithm for the sequence

3022: $A_1, A_2, \ldots , A_t$ which has been produced by Alice until then.

3023:

3024: %{\bf for} $j=1,2,\dots,k$ {\bf do}

3025: \begin{description}

3026: \item

3027: {\bf Step 1.}

3028: Let $2^j$ be the largest power

3029: of $2$ dividing $t$.

3030: Consider the last $2^j$ sets in the sequence

3031: $A_1, A_2, \ldots , A_t$ and call them

3032: $D_1,\dots,D_{2^j}$.

3033: \item

3034: {\bf Step 2.}

3035: Let $T$ be the set of $x$'s that occur in at least

3036: $2^{m}/k$ of the

3037: sets $D_1,\dots,D_{2^j}$.

3038: Let $D_p$ be a set such that $|D_p\bigcap T|$ is maximal.

3039: Mark $D_p$ (if there is more than one then choose the one with $p$ least)

3040: and remove all elements of  $D_p\bigcap T$ from $T$.

3041: Call the resulting set $T_1$.

3042: Next, mark a set $D_q$ such that $|D_q\bigcap T_1|$ is maximal

3043: (if there is more than one then choose the one with $q$ least).

3044: After removing all elements of $D_q\bigcap T_1$ from $T_1$

3045: we obtain a set $T_2$. Repeat the argument until

3046: we obtain $T_{e_j} = \emptyset$.

3047: \end{description}

3048:

3049: Firstly, for the $j$ above we have

3050: $e_j \leq \lceil 2^{j-m}kn\ln2\rceil$.

3051: % sets among $D_1,\dots,D_{2^j}$

3052: %such that the union of the chosen sets covers $T$.

3053: This is proved as follows. We have

3054: $$

3055: \sum_{i=1}^{2^j}|D_i\bigcap T|\ge|T|2^{m}/k,

3056: $$

3057: since every $x\in T$ is counted at least $2^{m}/k$ times in the

3058: sum in the left hand side.

3059: Thus there is a set in the list $D_1, \ldots , D_{2^j}$

3060: such that the cardinality of its intersection

3061: with $T$

3062: is at least $2^{-j}$ times the right hand side.

3063: %$|D_s\bigcap T|\ge |T|2^{m-j}/k$.

3064:  By the choice of $D_p$ it is such a set

3065: and  we have $|D_p\bigcap T|\ge |T|2^{m-j}/k$.

3066:

3067: The set $T$ has lost at least a $(2^{m-j}/k)$th fraction of its

3068: elements, that is, $|T_1|\le |T|(1-2^{m-j}/k)$.

3069: Since $T_1 \subseteq T$, obviously every element of $T_1$

3070: (still) occurs in at least

3071: $2^{m}/k$ of the sets $D_1,\dots,D_{2^j}$.

3072: Thus we can repeat the argument and

3073: mark a set $D_q$ with $|D_q\bigcap T_1|\ge |T_1|2^{m-j}/k$.

3074: After removing all elements of $D_q\bigcap T_1$ from $T_1$

3075: we obtain a set $T_2$ that is at most a $(1-2^{m-j}/k)$th fraction

3076: of $T_1$, that is, $|T_2|\le |T_1|(1-2^{m-j}/k)$.

3077:

3078: Recall that we repeat the procedure $e_j$ times where $e_j$

3079: is the number of repetitions until  $T_{e_j} = \emptyset$.

3080: It follows that $e_j \leq \lceil 2^{j-m}kn\ln2\rceil$

3081: %The number of non-covered strings in the resulting set is

3082: %at most

3083: since

3084: $$

3085: |T|(1-2^{m-j}/k)^{2^{j-m}kn\ln2}<|T|e^{-n\ln2}=|T|2^{-n}\le1.

3086: $$

3087: %That is, all $x\in T$ are covered by marked sets $D$.

3088:

3089: Secondly, for every fixed $j=0,1, \ldots, k-1$

3090: there are at most $2^{k-j}$ different $t$'s ($t=1,2, \ldots , 2^k-1$)

3091: divisible by $2^j$

3092: and the number $d_j = 2^{k-j}e_j$

3093: of marked sets we need

3094: to use for this $j$ satisfies

3095: $d_j \leq 2^{k-j} 2^{j-m} kn\ln2 = 2^{k-m} kn \ln2$.

3096: For all $j=0,\dots,k-1$ together we use a total number of marked sets of

3097: at most

3098: \[

3099:  \sum_{j=0}^{k-1} d_j \leq 2^{k-m} k^2 n\ln 2.

3100: \]

3101: In this way,

3102: after every move $t=1, 2,\ldots , 2^k-1 $ of Bob,

3103: every $x$ occurring in at least

3104: $2^m$ of Alice's sets belongs to a marked set of Bob.

3105: This can be seen as follows.

3106: Assume to the contrary, that there is an $x$

3107: that occurs in at least $2^m$ of Alice's sets following move $t$ of Bob,

3108: and $x$ belongs to no set marked by Bob in step $t$ or earlier.

3109: Let $t= 2^{j_1} + 2^{j_2} + \cdots $ with $j_1>j_2>\cdots $

3110: be the binary expansion of $t$. By Bob's strategy,

3111: the element $x$ occurs less than

3112: $2^{m}/k$ times in the first segment of $2^{j_1}$ sets of Alice,

3113: less than $2^{m}/k$ times in the next segment of $2^{j_2}$ of Alice's

3114: sets, and so on.

3115: Thus its total number of occurrences among the $t$ first sets of Alice is

3116: strictly less than $k 2^m/k=2^m$.

3117: The contradiction proves the claim.%

3118: %($\tau < 2^k$ moves.)

3119: %Above, we gave Bob's algorithm

3120: %for $\tau = 2^k$ moves but it is straightforward to restrict

3121: %it to $\tau < 2^k$ moves.

3122: %Namely, Bob expands the number of moves $\tau$ as

3123: %$2^{j_1}+2^{j_2}+ \cdots + 2^{j_l}$

3124: %with $j_1 > j_2 > \cdots > j_l$ and $j_l < k$. Then he sets $j=j_l$

3125: %and considers the {\em last} $2^j$ sets $A_{\tau-2^j+1}, \ldots, A_{\tau}$,

3126: %denoting them by $D_1, \ldots , D_{2^j}$.

3127: %(End of remark.)

3128: %\end{remark}

3129: \end{proof}

3130: Let us finish the proof of Lemma~\ref{th5}.

3131: %The strategy of Claim~\ref{l53} can be found by the brute force search

3132: %given $n$, $k$ and $m$, as follows.

3133: Given the list of $\BB$,

3134: recursively enumerate the sets in ${\BB}$ of Kolmogorov complexity

3135: less than $k$,

3136: say $B_1, B_2, \ldots ,B_T$ with $T < 2^k$,

3137: and consider this list as a particular sequence of

3138: moves by Alice.

3139: Use Bob's

3140: strategy of Claim~\ref{l53} against Alice's

3141: sequence as above.

3142: Note that recursive enumeration of the sets in  ${\BB}$

3143: of Kolmogorov complexity less than $k$ means that eventually all such

3144: sets will be produced, although we do not know

3145: when the last one is produced. This only means that the time between moves

3146: is unknown, but the alternating moves between Alice and Bob are deterministic

3147: and sequential.

3148: According to Claim~\ref{l53}, Bob's strategy

3149: marks at most

3150: $O(2^{k-m}k^{2}n)$ sets.

3151: These marked sets cover

3152: every string occurring at least $2^m$

3153: times in the sets $B_1, B_2, \ldots ,B_T$.

3154: We do not know when the last set $B_T$ appears in this list,

3155: but Bob's winning strategy of Claim~\ref{l53} ensures

3156: that immediately after recursively enumerating $B_{i}$

3157: $(i \leq T)$ in the list

3158: every string that occurs in

3159: $2^m$ sets in the initial segment $B_1, B_2, \ldots, B_{i}$

3160: is covered by a marked set.

3161: The Kolmogorov complexity $\K(B_i)$ of every marked set $B_i$

3162: in the list $B_1, B_2, \ldots , B_T$ is upper bounded by

3163: the logarithm

3164: of the number of

3165: marked sets, that is

3166: $k-m+O(\log k+\log n)$,

3167: plus the description of ${\BB}$,

3168: $k$, $m$, and $n$ including

3169: separators in

3170: $O(\K({\BB})+\log k+\log m+\log n)$ bits.

3171: \end{proof}

3172: We continue the proof of the theorem.

3173: Let the distortion family ${\A}$ satisfy

3174: properties 2 and 3.

3175: Consider

3176: the subfamily $\BB$ of $\A_n$ consisting of all sets $A$ with

3177: $\wwh{\log |A|}=\wwh{\log |B|}$.

3178: Let ${\BB}(x)$ be the family $\{B \in {\BB}: x \in B \}$ and

3179: $N$ the number of sets in

3180: ${\BB}(x)$ of Kolmogorov complexity at most

3181: $\K(B)$.

3182:

3183: Given $x,\wwh{\log |B|},\A_n$ and $\K(B)$

3184: we can generate all $A\in\BB(x)$ of Kolmogorov complexity

3185: at most $\K(B)$.

3186: Then we can describe $B$ by its index among the generated

3187: sets. This shows that the description length

3188: $\K(B \mid x)\le \log N$

3189: (ignoring an additive term of order $O(\log\K(B)+\log n)$ which suffices since

3190: $\K(\wwh{\log |B|})$ and $\K(\A_n)$ are both $O(\log n)$).

3191:

3192: Since $\K({\A}_n) = O(\log n)$ by property 3,

3193: ${\BB} \subseteq {\A}_n$ while every set $A \in {\BB}$ satisfies

3194: $\lceil \log |A| \rceil = \lceil \log |B| \rceil \leq n$, we have

3195: $\K({\BB}) = O(\log n)$. Let

3196: $k=\K(B)+1$ and $m=\wh{\log N}$,

3197: and ignore additive terms of order $O(\log k+\log m + \log n)$.

3198: Applying  Lemma~\ref{th5}

3199: shows that there is a set  $A\in \BB(x)$

3200: with $\K(A)\le k-m\le \K(B)-\K(B \mid x)=I(x:B)$ and therefore

3201: proves Theorem~\ref{th-shannon-analog}.

3202: \end{proof}

3203:

3204: \begin{remark}\label{rem.previously}

3205: \rm

3206: Previously an analog of Lemma~\ref{th5} was known in the case

3207: when $\BB$ is the class of \emph{all} subsets of $\booln$

3208: of {\em fixed} cardinality  $2^l$.

3209: For $l=0$ this is Exercise 4.3.8 (second edition) and 4.3.9

3210: (third edition) of \cite{LiVi97}:

3211: If a string $x$ has at least

3212: $2^m$ descriptions of length at most $k$

3213: ($p$ is called a description of

3214: $x$ if $U(p)=x$ where $U$ is

3215: the reference Turing machine), then

3216: $\K(x)\le k-m+O(\log k+\log m)$. Reference~\cite{VV02}

3217: generalizes this to all $l> 0$:

3218: If a string $x$ belongs to at least $2^m$

3219: sets $B$ of cardinality $2^l$ and Kolmogorov complexity  $\K(B)\le k$,

3220: then $x$ belongs to a set $A$ of cardinality $2^l$ and

3221: Kolmogorov complexity

3222: $\K(A)\le k-m+O(\log m+\log k+\log l)$.

3223: \end{remark}

3224: \begin{remark}\label{rem.muchnik}

3225: \rm

3226: %{\em Off-line case:} We show that there is

3227: %a selection of $2^{r-k}\log|\X|$ sets produced by P,

3228: %that cover all $x\in\X$ that are covered by at least $2^m$ sets produced by P.

3229: %Choose at random $2^{r-k}\log|\X|$ of P's sets (all the sets

3230: %are equiprobable).

3231: %Let $x\in\X$ be covered by at least  $2^m$ sets produced by

3232: %P. Then, the probability

3233: %that $x$ is not covered by the chosen sets is at most

3234: %$$

3235: %(1-2^{k-r})^{2^{r-k}\log|\X|}\le e^{-\log|\X|}\ll 1/|\X|.

3236: %$$

3237: %Multiplying this upper bound by $|\X|$ we get less than 1.

3238: %Therefore, there is a selection of $2^{r-k}\log|\X|$ sets produced by P

3239: %that covers all $x\in\X$ with multiplicity $2^m$ or more.

3240: %

3241: %{\em On-line case:}

3242: {\em Probabilistic proof of Claim~\ref{l53}.}

3243: Consider a new game  that has the same rules and one additional

3244: rule: Bob loses if he marks more than $2^{k-m+1}(n+1)\ln2$ sets.

3245: We will prove that in this game Bob has a winning strategy.

3246:

3247: Assume the contrary: Bob has no winning strategy.

3248: %K\H{o}nig's

3249: %infinity lemma \cite{Ko36} implies

3250: %that every tree that contains infinitely many vertices,

3251: %each having finite degree, has at least one infinite simple path.

3252: Since the number of moves in the game is finite (less than

3253: $2^k$), this implies that

3254: Alice has a winning strategy.

3255:

3256: Fix a winning strategy $S$ of Alice. To obtain a contradiction

3257: we design a randomized strategy for Bob that beats Alice's

3258: strategy $S$ with

3259: positive probability. Bob's strategy is very simple:

3260: mark every set produced by Alice with probability $p=2^{-m}(n+1)\ln2$.

3261: \begin{claim}\label{claim.iii}

3262: \rm

3263: (i)

3264: With probability more than $\frac{1}{2}$,

3265: following every move of Bob every

3266: element occurring in at least $2^m$ of Alice's sets is covered

3267: by a marked set of Bob.

3268:

3269: (ii) With probability more than $\frac{1}{2}$, Bob marks

3270: at most  $2^{k-m+1}(n+1)\ln2$ sets.

3271: \end{claim}

3272:

3273: \begin{proof}

3274: (i) Fix $x$ and estimate

3275: the probability that there is a move of Bob following which $x$

3276: belongs to $2^m$ of Alice's sets

3277: but belongs to no marked set of Bob.

3278: %We need to show that this happens with probability

3279: %less than $2^{-n-1}$.

3280:

3281: Let $R_i$ be the event

3282: ``following a  move of Bob, string $x$

3283: occurs in at least $i$ sets of Alice

3284: but none of them is marked''.

3285: Let us

3286: prove by induction that

3287: \[

3288: \Pr [R_i]\le(1-p)^{i}.

3289: \]

3290: For $i=0$ the statement is trivial.

3291: To prove the induction step we need to show that

3292: $\Pr [R_{i+1} \mid R_i]\le 1-p$.

3293:

3294: Let

3295: $z=z_1,z_2,\dots,z_t$ be a sequence of decisions by Bob:

3296: $z_j=1$ if Bob marks the $j$th set produced by Alice and

3297: $z_j=0$ otherwise. Call $z$ \emph{bad} if

3298: following Bob's $t$th move it happens

3299: for the first time that $x$ belongs to $i$  sets produced by Alice

3300: by move $t$ but none of them is  marked.

3301: Then $R_i$ is the disjoint union of the events

3302: ``Bob has made the decisions $z$'' (denoted by $Q_z$) over all bad $z$.

3303: Thus it is enough to prove that

3304: \[

3305: \Pr [R_{i+1} \mid Q_z]\le 1-p.

3306: \]

3307: Given that

3308: Bob has made the decisions $z$, the event $R_{i+1}$

3309: means that after those decisions the strategy $S$ will at some

3310: time in the future produce the

3311: $(i+1)$st set with member

3312: $x$ but Bob will not mark it.

3313: Bob's decision not to mark that set does not depend

3314: on any previous decision and is made with probability $1-p$.

3315: Hence

3316: $$

3317: \Pr [R_{i+1} \mid Q_z]=\Pr [\text{Alice produces

3318: the $(i+1)$st set with member }x \;  \mid  \;Q_z]

3319: \cdot(1-p)

3320: \le1-p.

3321: $$

3322: The induction step is proved.

3323: Therefore,

3324: $\Pr [R_{2^m}]\le (1-p)^{2^m}<e^{-p2^m}=2^{-n-1}$,

3325: where the last equality follows by choice of $p$.

3326:

3327: (ii) The expected number of marked sets is $p2^k$. Thus

3328: the probability that it exceeds $p2^{k+1}$ is less than $\frac{1}{2}$.

3329: \end{proof}

3330:

3331: It follows from Claim~\ref{claim.iii} that there exists a strategy

3332: by Bob that marks at most $2^{k-m+1}(n+1)\ln2$ sets out of Alice's

3333: produced $2^k$ sets, and following every move of Bob every

3334: element occurring in at least $2^m$ of Alice's sets is covered

3335: by a marked set of Bob. Note that we have proved that

3336: this strategy of Bob exists

3337: but we have not constructed it.

3338: Given $n$, $k$ and $m$, the number of games is finite, and

3339: a winning strategy for Bob can be found by brute force search.

3340: %Note that the proof of Claim~\ref{l53} is constructive;

3341: %the probabilistic proof above shows that a winning strategy for Bob's exists;

3342: %it does not show that it is

3343: %computable given Alice's sequence.

3344: %Multiplying this bound by the number of different $x$ we obtain $1/2$.

3345: %

3346: %Then the probability we want to estimate

3347: %is equal to

3348: %\begin{align*}

3349: %&\Pr [x \text{ is covered $i+1$ times by P's sets

3350: %and belongs to none of them}]\\

3351: %=&\sum_{\text{bad }z}

3352: %\Pr (z)\Pr [x \text{ is covered $i+1$ times by P's sets

3353: %and does not belong the $i+1$st of them}|z]\\

3354: %=&\sum_{\text{bad }z}

3355: %\Pr (z)\Pr [x \text{ is covered $i+1$ times}|z](1-p)

3356: %\le\sum_{\text{bad }z}

3357: %\Pr (z)(1-p)

3358: %\\=&

3359: %\Pr [x \text{ is covered $i$ times by P's sets and belongs to none of them}]

3360: %(1-p)

3361: %\le (1-p)^{i+1}.

3362: %\end{align*}

3363: %

3364: %

3365: %

3366: \end{remark}

3367:

3368: \vspace{.2in}

3369: \begin{proof}{\em of Theorem~\ref{th45}}.

3370: Let $B \subseteq\{0,1\}^n$ be a set containing string $x$. Define the

3371: \emph{sufficiency deficiency of $x$ in $B$}

3372: by

3373: $$

3374: \log|B|+\K(B)-\K(x).

3375: $$

3376: This is the number of extra bits incurred by the two-part code for $x$

3377: using $B$ compared to the optimal one-part code of $x$ using $\K(x)$ bits.

3378: We relate this quantity with

3379: the randomness deficiency $\delta(x \mid B)=\log |B|-\K(x \mid B)$

3380:  of $x$ in the set $B$.

3381: The randomness deficiency is always less than the sufficiency

3382: deficiency, and the

3383: difference between them is equal to $\K(B \mid x)$:

3384: \begin{equation}\label{eq76}

3385: \log|B|+\K(B)-\K(x)-\delta(x \mid B)=\K(B \mid x),

3386: \end{equation}

3387: where the equality follows from the symmetry of

3388: information \eqref{eq.soi},

3389: ignoring here and later in the proof additive terms of order

3390: $O(\log\K(B)+\log n)$.

3391:

3392: By Theorem~\ref{th-shannon-analog}, which assumes

3393: that properties 2 and 3 hold for the distortion family

3394: ${\A}$, there is  $A\in\A(x)$

3395: with $\wwh{\log|A|}=\wwh{\log|B|}$ and

3396: $\K(A)\le \K(B)-\K(B \mid x)$.

3397: Since $A_x$ is a set of minimal Kolmogorov complexity among

3398: such $A$ we have

3399: $\K(A_x)\le \K(B)-\K(B \mid x)$.

3400: Therefore

3401: \begin{align*}

3402: \K(A_x)+\log|A_x|-\K(x)&\le\K(B)-\K(B \mid x)+\log|A_x|-\K(x)\\

3403: &=

3404: \K(B)-\K(B \mid x)+\log|B|-\K(x)=\delta(x \mid B),

3405: \end{align*}

3406: where the last equality is true by~\eqref{eq76}.

3407: \end{proof}

3408:

3409: \vspace{.2in}

3410: \begin{proof}

3411: {\em of  Theorem}~\ref{thm.dresf}.

3412: %We assume that property 2 holds for the distortion family ${\A}$.

3413:

3414: {\em Left inequality.}

3415: Given $\delta$, $n$, $p$, and the (discrete) graph of $r^n$, we can compute an

3416: optimal $E$ as in \eqref{eq.rndelta}  such that $r^n (\delta)

3417: = \log |E(\mathbf{X}^n)|$. Retrieve $E(x)$ %for every $x$

3418: by its index of $r^n (\delta)$ bits in the set $E(\mathbf{X}^n)$.

3419: Then,

3420: \[

3421: \K(E(x)) \leq r^n(\delta) + O(\K(\delta,r^n,X,n)).

3422: \]

3423: By definition, $r_x(\delta) \leq \K(E(x))$.

3424: Taking the expectation of $r_x(\delta)$ over

3425: $p$, we are done.

3426:

3427: {\em Right inequality.}

3428: Define a code $E_0$ such that

3429: $\K(E_0(x)) = r_x(\delta)$

3430: for every $x \in \mathbf{X}^n$.

3431: Let $E_0(\mathbf{X}^n)$ be the range of $E_0$.

3432: Although $E_0(\mathbf{X}^n)$ cannot be computed, it is finite, and trivially

3433: \[

3434: \log |E_0(\mathbf{X}^n)| \leq \max_{x \in \mathbf{X}^n} \K(E_0(x)).

3435: \]

3436: By definition $r^n(\delta) \leq \log |E_0(\mathbf{X}^n)|$, which yields

3437:  $r^n (\delta)

3438: \leq \max_{x \in \mathbf{X}^n} r_x(\delta)$.

3439:

3440: The noiseless coding theorem, \cite{Sh48,LiVi97}, shows that

3441: \[

3442: \sum_{x \in \mathbf{X}^n} p(x)r_x(\delta)

3443: =  \sum_{y \in E_0(\mathbf{X}^n)} S(y) \K(y)

3444:  \geq H(S),

3445: \]

3446: with $S$ the distribution defined in the statement of the theorem.

3447: By definition, $r^n(\delta) \leq \log |\mathbf{Y}^n|$, which yields

3448: $r^n(\delta) \leq H(L)$, with $L$ as in the statement of the theorem.

3449: Together, we obtain

3450: $r^n (\delta)

3451: \leq {\bf E} r_x(\delta)+ \Delta_2$.

3452: \end{proof}

3453:

3454:

3455:

3456:

3457: \section*{Acknowledgements}

3458: We thank Alexander K. Shen for helpful suggestions.

3459: Andrei A. Muchnik gave the probabilistic proof

3460: of Claim~\ref{l53} in Remark~\ref{rem.muchnik} after having seen

3461: the deterministic proof.

3462: Such a probabilistic proof

3463: was independently proposed by Michal Kouck\'y.

3464: We thank the referees for their constructive comments;

3465: one referee pointed out that yet another example would be

3466: the case of Euclidean balls with the usual Euclidean distance, where

3467: the important Property 4 is proved in, for example,~\cite{VG05}.

3468: The work of N.K. Vereshchagin was done in part

3469: while visiting CWI and was supported in part by the grant

3470: 09-01-00709 from Russian Federation

3471: Basic Research Fund and by a visitors grant of NWO.

3472: The work of P.M.B. Vit\'anyi was

3473: supported in part by

3474: the BSIK Project BRICKS

3475: of the Dutch government and NWO, and by the

3476: EU NoE PASCAL (Pattern Analysis, Statistical Modeling,

3477: and Computational Learning).

3478:

3479: \begin{thebibliography}{9}

3480:

3481: \bibitem{Be71}

3482: T. Berger, {\em Rate Distortion Theory: A Mathematical Basis for

3483: Data Compression}, Prentice-Hall, Englewood Cliffs, NJ, 1971.

3484:

3485: \bibitem{BG98}

3486: T. Berger, J.D. Gibson, Lossy source coding, {\em IEEE Trans. Inform. Th.},

3487: 44:6(1998), 2693--2723.

3488:

3489: %\bibitem{BKVV03}

3490: %H.~Buhrman, H.~Klauck, N.K. Vereshchagin, and P.M.B. Vit\'anyi.

3491: %\newblock Individual communication complexity.

3492: %\newblock In {\em Proc. 21th Symp. Theoret. Aspects of Comput. Sci.},

3493: %Lecture Notes in Computer Science, Vol. 2996, Springer-Verlag, Berlin, 2004,

3494: %19--30.

3495:

3496: \bibitem{BW94}

3497:  M. Burrows and D. J. Wheeler, A block-sorting lossless data

3498: compression algorithm, Digital Equipment Corporation, Systems Research

3499: Center, Tech. Rep. 124, May 1994.

3500:

3501: \bibitem{CYV97}

3502: S.C. Chang, B. Yu, M. Vetterli, Image denoising via lossy compression and

3503: wavelet thresholding, {\em Proc. Int. Conf. Image Process. (ICIP'97)},

3504: 1997, 604--607 in Volume 1.

3505:

3506: %\bibitem{CT91}

3507: %T.M. Cover and J.A. Thomas, {\em Elements of Information Theory},

3508: %Wiley, New York, 1991.

3509:

3510: %\bibitem{CV05}

3511: %R. Cilibrasi, P.M.B. Vitanyi, Clustering by compression,

3512: %{\em IEEE Trans. Information Theory}, 51:4(2005)

3513:

3514: \bibitem{Do02}

3515: D. Donoho, The Kolmogorov sampler, {\em Annals of Statistics},

3516: submitted.

3517:

3518: %\bibitem{El57}

3519: %P. Elias, List decoding for noisy channels. {\em Wescon Convention Record,}

3520: %Part 2, Institute for Radio Engineers (now IEEE), 1957, 94--104.

3521:

3522: %\bibitem{El91}

3523: %P. Elias, Error-correcting codes for List decoding,

3524: %{\em IEEE Trans. Inform. Th.}, 37:1(1991), 5--12.

3525:

3526: %\bibitem{flv}

3527: %L. Fortnow,

3528: %T. Lee, N. Vereshchagin,

3529: %Kolmogorov Complexity with Error,

3530: %{\em Proc. Symposium Theoretical Aspects of Comput. Science 2006,}

3531: %Lecture Notes in Computer Science, vol. 3884 (2006) 137--148

3532:

3533: %\bibitem{GHLL97}

3534: %G.~Cohen, I.~Honkala, S.~Litsyn, and A.~Lobstein.

3535: %\newblock {\em Covering Codes}.

3536: %\newblock North-Holland, Amsterdam, 1997.

3537:

3538: \bibitem{GTV01} P. G\'acs, J. Tromp, P.M.B. Vit\'anyi.

3539: Algorithmic statistics, {\em IEEE Trans. Inform. Th.}, 47:6(2001), 2443--2463.

3540:

3541: %\bibitem{GV03}

3542: %P.D. Gr\"unwald and P.M.B. Vit\'anyi, Shannon information and Kolmogorov

3543: %complexity, {\em IEEE Trans. Information Theory}, Submitted.

3544: %http://arxiv.org/abs/cs/0410002

3545:

3546: %\bibitem{IP05}

3547: %iPOD + iTUNES web-page at http://www.apple.com/ipod/

3548:

3549: %\bibitem{Ke04}

3550: %E. Keogh, S. Lonardi, and C.A. Rtanamahatana, Toward parameter-free

3551: %data mining, In: {\em Proc. 10th ACM SIGKDD Intn'l Conf. Knowledge

3552: %Discovery and Data Mining}, Seattle, Washington, USA, August 22---25, 2004,

3553: %206--215.

3554:

3555: %\bibitem{ISW00}

3556: %R.~Impagliazzo, R.~Shaltiel, and A.~Wigderson.

3557: %\newblock Extractors and pseudo-random generators with optimal seed length.

3558: %\newblock In {\em Proceedings of the 32nd ACM Symposium on the Theory of

3559:   %Computing}, pages 1--10. ACM, 2000.

3560:

3561: \bibitem{Ko65}

3562: A.N. Kolmogorov,

3563: {Three approaches to the quantitative definition of information},

3564: {\em Problems Inform. Transmission} 1:1 (1965) 1--7.

3565:

3566: \bibitem{Ko74}

3567:  A.N. Kolmogorov.

3568:  Complexity of Algorithms and Objective Definition of Randomness.

3569:  A talk at Moscow Math. Soc. meeting 4/16/1974.

3570:  An abstract available in {\em Uspekhi Mat. Nauk} 29:4(1974),155;

3571: English translation in \cite{VV02}.

3572:

3573: %\bibitem{LC78}

3574: %S.K. Leung-Yan-Cheong and T.M. Cover,

3575: %Some equivalences between Shannon entropy and Kolmogorov complexity,

3576: %{\em IEEE Trans. Inform. Theory},

3577: %24:3(1978), 331-338.

3578:

3579: %\bibitem{Ko36}

3580: %D. K\H{o}nig, {\em Theorie der Endlichen und Unendlichen Graphen:

3581: %Kombinatorische Topologie der Streckenkomplexe}, Akad. Verlag.,

3582: %Leipzig, 1936.

3583:

3584: \bibitem{LiVi97}

3585: M. Li and P.M.B. Vit\'anyi,

3586: {\em An {I}ntroduction to {K}olmogorov {C}omplexity and {I}ts

3587:   {A}pplications},

3588: Springer-Verlag, New York, 1997 (second edition), 2008 (third edition).

3589:

3590: \bibitem{Li01}

3591: M. Li, J.H. Badger, X. Chen, S. Kwong, P. Kearney, and H. Zhang,

3592: An information-based sequence distance and its application

3593: to whole mitochondrial genome phylogeny,

3594: {\em Bioinformatics}, 17:2(2001), 149--154.

3595:

3596:

3597: \bibitem{Li04}

3598: M. Li, X. Chen, X. Li, B. Ma, P.M.B. Vit\'anyi,

3599: The similarity metric, {\em IEEE Trans. Inform. Th.}, 50:12(2004), 3250--3264.

3600:

3601: \bibitem{Na95}

3602: B.K. Natarajan, Filtering random noise from deterministic signals via

3603: data compression, {\em IEEE Trans. on Signal Processing}, 43:11(1995), 2595--2605.

3604:

3605: \bibitem{MK94}

3606: J. Muramatsu, F. Kanaya, Distortion-complexity and rate-distortion function,

3607: {\em IEICE Trans. Fundamentals}, E77-A:8(1994), 1224--1229.

3608:

3609: \bibitem{rum}

3610: Andrey Rumyantsev,

3611: Transmission of information

3612: through a noisy channel in Kolmogorov complexity setting.

3613: Vestnik MGU, Seriya Matematika i Mechanika (Russian), to appear in 2006.

3614:

3615: \bibitem{RV06}

3616: S. de Rooij, P.M.B. Vit\'anyi,

3617: Approximating rate-distortion graphs of individual data: Experiments

3618: in lossy compression and denoising, {\em IEEE Trans. Comput.},

3619: Submitted. Also: Arxiv preprint cs.IT/0609121, 2006.

3620:

3621:

3622: \bibitem{Sa94}

3623: N. Saito, Simultaneous noise suppression and signal compression

3624: using a library of orthonormal bases and the minimum description

3625: length criterion, Pp. 299--324 in {\em Wavelets in Geophysics},

3626: E. Foufoula-Georgiou, P. Kumar, Eds., Academic Press, 1994.

3627:

3628: %\bibitem{salnikov}

3629: %S. Salnikov.

3630: %\newblock Kolmogorov complexity

3631: %of initial segments of binary sequences.

3632: %Manuscript, 2004.

3633:

3634: \bibitem{Sh48}

3635: C.E. Shannon.

3636: \newblock The mathematical theory of communication.

3637: \newblock {\em Bell System Tech. J.}, 27:379--423, 623--656, 1948.

3638:

3639: \bibitem{Sh59}

3640: C.E. Shannon.

3641: \newblock Coding theorems for a discrete source with a fidelity criterion.

3642: \newblock In {\em IRE National Convention Record, Part 4}, pages 142--163,

3643:   1959.

3644:

3645: \bibitem{Sh83}

3646: A.Kh. Shen, The concept of $(\alpha , \beta )$-stochasticity

3647: in the Kolmogorov sense, and its properties, {\em Soviet Math. Dokl.},

3648: 28:1(1983), 295--299.

3649:

3650:

3651: \bibitem{SE03}

3652: D.M. Sow, A. Eleftheriadis,

3653: Complexity distortion theory,

3654: {\em IEEE Trans. Inform. Th.}, 49:3(2003), 604--608.

3655:

3656: \bibitem{Tu36}

3657: A.M. Turing, On computable numbers, with an application to the

3658: Entscheidungsproblem, {\em Proc. London Mathematical Society}, 42:2(1936),

3659: 230--265, ``Correction'', 43i(1937), 544--546.

3660:

3661:

3662: \bibitem{VV02}

3663: N.K. Vereshchagin and P.M.B. Vit\'anyi, Kolmogorov's Structure

3664: functions and model selection, {\em IEEE Trans. Inform. Theory},

3665: 50:12(2004), 3265--3290.

3666:

3667: \bibitem{VG05}

3668: J.L. Verger-Gaugry,

3669: Covering a ball with smaller equal balls in $R^n$,

3670: {\em Discrete and Computational Geometry}, 33(2005), 143--155.

3671:

3672: %\bibitem{Wo58}

3673: %J.M. Wozencraft, List decoding. {\em Quarterly Progress Report},

3674: %Research Laboratory for Electronics, MIT, Vol. 58(1958), 90--95.

3675:

3676:

3677: %\bibitem{Ya89}

3678: %E.-H. Yang, The proof of Levin's conjecture,

3679: %{\em Chinese Science Bull.}, 34:21(1989), 1761--1765.

3680:

3681: \bibitem{Vy87}

3682: V.V. V'yugin,

3683: On the defect of randomness of a finite object with respect to

3684: measures with given complexity bounds, {\em SIAM Theory Probab. Appl.},

3685: 32:3(1987), 508--512.

3686:

3687: \bibitem{YS93}

3688: E.-H. Yang, S.-Y. Shen,

3689: Distortion program-size complexity with respect to a fidelity

3690: criterion and rate-distortion function,

3691: {\em IEEE Trans. Inform. Th.}, 39:1(1993), 288--292.

3692:

3693:

3694:

3695:

3696: \bibitem{Zi80}

3697: J. Ziv, Distortion-rate theory for individual sequences,

3698: {\em IEEE Trans. Inform. Th.}, 26:2(1980), 137--143.

3699:

3700: %\bibitem{ZL70}

3701: %A.K. Zvonkin and L.A. Levin,

3702: %The complexity of finite objects and the development of the concepts

3703:   %of information and randomness by means of the theory of algorithms,

3704: %{\em Russian Math. Surveys} 25:6 (1970) 83-124.

3705:

3706:

3707: \end{thebibliography}

3708: \end{document}

3709: