cs0411014/rate.tex
1: %\documentclass[twocolumn,twoside]{IEEEtran}
2: \documentclass[draft,onecolumn]{IEEEtran}
3: \usepackage{amsmath,amstext,amssymb,epsf}
4: \usepackage[dvips]{graphicx}
5: %\newcommand{\epsffile}[1]{\includegraphics{#1}}
6: \date{}
7: %\setlength{\topmargin}{-0.6in}
8: %\setlength{\textwidth}{6.2in}
9: %\setlength{\oddsidemargin}{0.2in}
10: %\setlength{\evensidemargin}{0.2in}
11: %\setlength{\textheight}{9.4in}
12: 
13: %\newenvironment{changekolya}{}{}
14: \newenvironment{changekolya}{[}{]}
15: %\newenvironment{changekolya}{$\Big[$}{$\Big]$}
16: 
17: \newcommand{\All}{\mathcal U}
18: \newcommand{\Ham}{\mathcal H}
19: \newcommand{\Euc}{\mathcal E}
20: \newcommand{\dd}{l}
21: \newcommand{\tx}{\tilde x}
22: \newcommand{\dds}{m}
23: \newcommand{\ddd}{d}
24: \newcommand{\ddds}{d'}
25: \newcommand{\tir}{r}
26: \renewcommand{\le}{\leq}
27: \renewcommand{\ge}{\geq}
28: \renewcommand{\emptyset}{\varnothing}
29: 
30: \newcommand{\dm}{D}
31: \newcommand{\comp}{s}
32: \newcommand{\A}{\mathcal A}
33: \newcommand{\BB}{\mathcal B}
34: \newcommand{\B}{\Delta}
35: \newcommand{\C}{\mathcal C}
36: \newcommand{\Q}{\mathcal Q}
37: \newcommand{\U}{\mathcal U}
38: \newcommand{\X}{\booln}
39: \newcommand{\Y}{\mathcal{Y}}
40: \newcommand{\dmax}{n}
41: \newcommand{\ymax}{y_0}
42: \newcommand{\dmin}{0}
43: \newcommand{\booln}{\{0,1\}^n}
44: \newcommand{\bool}{\{0,1\}^*}
45: %\newcommand{\dmin}{d_\text{min}}
46: 
47: \newcommand{\K}{C}
48: \newcommand{\KE}{\textit{CE}}
49: \newcommand{\wh}[1]{\lfloor #1 \rfloor}
50: \newcommand{\wwh}[1]{\lceil #1 \rceil}
51: %\newcommand{\poly}{\text{poly}}
52: \newcommand{\eps}{\varepsilon}
53: \newcommand{\Time}{\textit{Time}}
54: \newcommand{\cd}{\textit{CD}}
55: \newcommand{\pair}[1]{\langle #1\rangle}
56: \newcommand{\prob}{\text{Prob}}
57: \newcommand{\ave}{\mathop{\bf E}}
58: %\newcommand{\N}{\mathbb{N}}
59: %\newcommand{\R}{\mathbb{R}}
60: %\newcommand{\Q}{\mathbb{Q}}
61: \newcommand{\len}[2]{l_{#1}(#2)}
62: \newcommand{\close}[2]{#1=\mathcal E(#2)}
63: \newcommand{\dclose}[2]{#1 \cong #2}
64: \newcommand{\last}{\textit{LAST}}
65: \newcommand{\least}{\textit{least}}
66: \newcommand{\bb}[1]{\textit{BB}(#1)}
67: \newcommand{\bbinv}[1]{{\textit{BB}}^{-1}(#1)}
68: \newcommand{\Loss}{{\rm Loss}}
69: 
70: \newtheorem{theorem}{\sc Theorem}
71: \newtheorem{proposition}{\sc Proposition}
72: \newtheorem{lemma}{\sc Lemma}
73: \newtheorem{coro}{\sc Corollary}
74: 
75: %\theoremstyle{remark}
76: %\newtheorem{example}{Example}
77: \newtheorem{nota}{\sc Notation}
78: \newtheorem{defin}{\sc Definition}
79: \newtheorem{rem}{\sc Remark}
80: \newtheorem{cla}{\sc Claim}
81: \newtheorem{ex}{\sc Example}
82: \newenvironment{comment}{\begin{small}\begin{quotation}\hspace{-0.23in}\rm}{\end{quotation}\end{small}}
83: %\newenvironment{proof}{\par \sc Proof.\rm}{\hspace*{\fill}$\Box$\vspace{1ex}}
84: \newenvironment{remark}{\begin{rem}}{\hspace*{\fill}$\Diamond$\end{rem}}
85: \newenvironment{example}{\begin{ex}}{\hspace*{\fill}$\diamondsuit$\end{ex}}
86: \newenvironment{claim}{\begin{cla}}{\end{cla}}
87: \newenvironment{corollary}{\begin{coro}}{\end{coro}}
88: \newenvironment{requirement}{\begin{req}}{\end{req}}
89: \newenvironment{definition}{\begin{defin}}{\end{defin}}
90: %\newenvironment{remark}{\begin{rem}}{\end{rem}}
91: \newenvironment{notation}{\begin{nota}}{\end{nota}}
92: 
93: 
94: 
95: \title{Rate Distortion and Denoising of Individual Data 
96: Using Kolmogorov complexity}
97: \author{
98: Nikolai K. Vereshchagin\thanks{
99: NKV, Dept. Math. Logic \& Theor. Algor.,
100: Moscow State Univ., Russia. Email: nikolay.vereshchagin@gmail.com}
101: and
102: Paul M.B. Vit\'anyi\thanks{
103: PMBV, CWI, Science Park 123, 1098XG Amsterdam, the Netherlands.
104: Email: Paul.Vitanyi@cwi.nl}
105: }
106: 
107: \begin{document}
108: \maketitle
109: \begin{abstract}
110: %Kolmogorov complexity can be used 
111: %to obtain a rate-distortion theory and denoising of individual data
112: %(source words) 
113: %taken to be finite binary strings. We prove for almost all distortion measures
114: %(i) different source words have different rate-distortion curves, and for
115: %every curve from a wide family there is a source word that yields 
116: %this curve approximately; 
117: %(ii) a Kolmogorov complexity
118: %characterization of the rate-distortion curve 
119: %for individual source words in
120: %terms of algorithmic mutual information;
121: %(iii) if a destination word witnesses the rate-distortion curve of 
122: %a given source word
123: %at a given rate, then this destination word captures 
124: %as many properties of the source word as is possible
125: %at this rate;
126: %(iv) application of the last result to the denoising of 
127: %corrupted individual source words; and 
128: %(v) the relation between the expected rate-distortion
129: %curves of the individual source words and Shannon's rate-distortion curve.
130: We examine the structure of families of distortion balls from
131: the perspective of Kolmogorov complexity. Special attention is paid to
132: the canonical rate-distortion function of a source word
133: which returns the minimal Kolmogorov complexity of all distortion balls
134: containing that word subject to a bound on their cardinality. This canonical
135: rate-distortion function is related to the more standard
136: algorithmic rate-distortion function for the given distortion measure. 
137: Examples are given of list distortion, 
138: Hamming distortion, and Euclidean distortion.
139: The algorithmic rate-distortion function can behave
140: differently from Shannon's rate-distortion function.
141: To this end, we show that the canonical
142: rate-distortion function can and does assume a wide class of shapes
143: (unlike Shannon's); we relate low algorithmic mutual information
144: to low Kolmogorov complexity (and consequently suggest that certain aspects of the
145: mutual information formulation of Shannon's rate-distortion function
146: behave differently than would an analogous formulation using algorithmic
147: mutual information); we explore the notion that low Kolmogorov complexity
148: distortion balls containing a given word 
149: capture the interesting properties of that word
150: (which is hard to formalize in Shannon's theory) and this
151: suggests an approach to denoising; and, finally, we show that
152: the different behavior of the rate-distortion curves
153: of individual source words to some extent 
154: disappears after averaging over the source words.
155: \end{abstract}
156: 
157: \section{Introduction}
158: \label{sect.rdsf}
159: Rate distortion theory analyzes the transmission and 
160: storage of information at insufficient bit rates.
161: The aim is to minimize the resulting information loss
162: expressed in a given distortion measure.
163: The original data is called the `source word'
164: and the encoding used for transmission or storage
165: is called the `destination word.' The number of bits available
166: for a destination word is called the `rate.'
167: The choice of distortion
168: measure 
169: is usually a selection of which aspects of the source word are relevant
170: in the setting at hand, and
171: which aspects are irrelevant (such as noise). 
172: For example, in application to 
173: lossy compression of a sound file this results
174: in a compressed file where, among others, the very high and
175: very low inaudible frequencies have been suppressed. 
176: The distortion measure is chosen such that it penalizes 
177: the deletion of the inaudible
178: frequencies only lightly because they are not 
179: relevant for the auditory
180: experience. We study rate distortion of 
181: individual source words using Kolmogorov complexity and show
182: how it is related to
183: denoising.
184: The classical probabilistic theory is 
185: reviewed in Appendix~\ref{sect.ratedistortion}.
186: Computability notions are reviewed in Appendix~\ref{sect.computability}
187: and Kolmogorov complexity in Appendix~\ref{sect.kolmcompl}.
188: Randomness deficiency according to Definition~\ref{def.rd}
189: and its relation to the fitness of a destination word for
190: a source word are explained further in Appendix~\ref{sect.rd}. 
191: Appendix~\ref{sect.exhamming} gives the proof, required
192: for a Hamming distortion example, that
193: every large Hamming ball can be covered by a
194: small number of smaller
195: Hamming balls (each of equal cardinality).
196: More specifically, the number of covering balls 
197: is close to the ratio between the cardinality
198: of the large Hamming ball and the small Hamming ball.
199: The proofs of the theorems are deferred to Appendix~\ref{sect.proofs}.
200: 
201: \subsection{Related Work}
202: In \cite{Ko74} A.N. Kolmogorov formulated the
203: `structure function' which can be viewed as a proposal
204: for non-probabilistic model
205: selection. This function and the associated Kolmogorov
206: sufficient statistics are partially treated in
207: \cite{Sh83,Vy87,GTV01} 
208: and analyzed in detail in \cite{VV02}.
209: We will show that the structure function
210: approach can be generalized to give an approach to 
211: rate distortion and denoising of
212: individual data. 
213: 
214: Classical rate-distortion theory 
215: was initiated by Shannon in~\cite{Sh48}.
216: In~\cite{Sh59} Shannon gave a nonconstructive
217: asymptotic characterization of the expected rate-distortion curve of a
218: random variable
219: (Theorem~\ref{theo.shannon} in Appendix~\ref{sect.ratedistortion}).
220: References \cite{Be71,BG98} treat
221: more general distortion measures and random variables in the Shannon
222: framework.
223: 
224: References~\cite{YS93,MK94,SE03} relate
225: the classical and algorithmic approaches according to traditional
226: information-theoretic concerns. We follow their definitions of
227: the rate-distortion function.
228: The results show that if the source word is obtained from random
229: i.i.d. sources, then with high probability and in expectation
230: its individual rate-distortion curve is close to
231: Shannon's single rate-distortion curve.
232: In contrast, our Theorem~\ref{theo.allshapesrd} shows that 
233: for distortion measures satisfying properties 1 through 4
234: below
235: there are many different shapes of individual 
236: rate-distortion functions related to the different
237: individual source words,
238: and many of them
239: are very different from Shannon's rate-distortion curve.
240: 
241: 
242: Also Ziv~\cite{Zi80} considers
243: a rate-distortion function for individual data.
244: The rate-distortion function is assigned to
245: every infinite sequence $\omega$ of letters of a finite alphabet $\Gamma$.
246: %(and not to a finite object, as in the present paper).
247: The source words $x$
248: are prefixes of $\omega$
249: and the encoding function is
250: computed by a finite state transducer.
251: Kolmogorov complexity is not involved.
252: 
253: In \cite{Sa94,Na95,CYV97,Do02} 
254: alternative approaches to denoising via compression 
255: and in \cite{RV06,rum} applications of the current work
256: are given.
257: 
258:  In \cite{VV02} Theorems~\ref{theo.allshapesrd}, \ref{th45} were obtained
259:  for a particular distortion measure relevant to model selection (the example 
260: ${\cal L}$ in this paper).
261: The techniques used in that paper
262: do not generalize to prove the current theorems which concern 
263: arbitrary distortion measures
264: satisfying certain properties
265: given below.
266: 
267: 
268: \subsection{Results}
269: A source word is taken to be a finite binary string.
270: Destination words are finite objects (not necessarily finite binary strings).
271: For every destination word encoding a particular source word with
272: a certain distortion, there is a finite set of source words that are
273: encoded by this destination word with at most that distortion.
274: %Therefore, we can loosely 
275: %identify a destination word with the set of source words thus defined
276: %(if there are more than one such set we take the first one in 
277: %lexicographical order).   
278: We call these finite sets of source words `distortion balls.'
279: Our approach is based on the Kolmogorov complexity           
280: of distortion balls. For every source word we
281: define its `canonical' rate-distortion function,
282: %independent of a distortion measure,
283: from which 
284: the algorithmic rate-distortion function of that source word 
285: %for a specific distortion measure 
286: can be obtained by a simple
287: transformation,
288: Lemma~\ref{lem.rg}. 
289: 
290: Below we assume that a distortion measure 
291: satisfies certain properties which are specified in the theorems
292: concerned.
293: In Theorem~\ref{theo.allshapesrd} it is shown that
294: there are distinct canonical rate-distortion curves (and hence distinct
295: rate-distortion curves) associated with 
296: distinct source words (although some curves may coincide). Moreover,
297: every candidate curve from a given family of curves is 
298: realized approximately as the 
299: canonical rate-distortion curve (and hence for a related family
300: of curves every  curve is realized approximately as the 
301: rate-distortion curve) of some
302: source word.
303: In Theorem~\ref{th-shannon-analog} we prove a Kolmogorov
304: complexity analogue for 
305: Shannon's theorem, Theorem~\ref{theo.shannon} 
306: in Appendix~\ref{sect.ratedistortion}, on the characterization
307: of the expected rate-distortion
308: curve of a random variable.
309: The new theorem states approximately the following:
310: For every source word and every destination word there exists
311: another destination word that has Kolmogorov complexity
312: equal to algorithmic information in the first destination word about the
313: source word, up to a logarithmic additive term,
314: and both destination words incur the same distortion
315: with the source word. (The theorem is given in the distortion-ball formulation
316: of destination words.)
317: In Theorem~\ref{th45} we show that, at every rate, 
318: the destination word incurring the least distortion
319: is in fact the `best-fitting' among all destination words at that rate. 
320: `Best-fitting' is taken in the sense of sharing the most
321: properties with the source word.
322: (This notion of a `best-fitting' destination word for a
323: source word can be expressed in Kolmogorov complexity, but 
324: not in the classic probabilistic framework. Hence there is no
325: classical analogue for this theorem.)
326: It turns out that this yields a method of denoising by compression.
327: Finally, in Theorem~\ref{thm.dresf}, we show that the expectation
328: of the algorithmic rate-distortion functions is
329: pointwise related to Shannon's rate-distortion function, where the closeness
330: depends on the Kolmogorov complexities involved and 
331: ergodicity and stationarity of the source.
332: 
333: 
334: 
335: 
336: \section{Preliminaries}
337: 
338: \subsection{Data and Binary Strings}
339: We write {\em string} to mean a finite binary string.
340:   Other finite objects can be encoded into strings in natural
341: ways.  The set of strings is denoted by $\{0,1\}^*$. The {\em length}
342: of a string $x$ is the number of bits in it denoted as $|x|$. The {\em empty}
343: %string $x$ has length $|x| = 0$.
344: string $\epsilon$ has length $|\epsilon| = 0$.
345: Identify the natural numbers 
346: ${\cal N}$ (including 0) and $\{0,1\}^*$ according to the
347: correspondence 
348:  \begin{equation}\label{order}
349:  (0, \epsilon ), (1,0), (2,1), (3,00), (4,01), \ldots . 
350:  \end{equation}
351: Then, $|010|=3$.
352: The emphasis is on binary sequences only for convenience;
353: observations in every finite alphabet can be so encoded in a way
354: that is `theory neutral'. For example, if a finite alphabet $\Sigma$ has
355: cardinality $2^k$, then every element $i \in \Sigma$ can be encoded
356: by $\sigma(i)$ which
357: is a block of bits of length $k$. With this encoding every $x \in \Sigma^*$
358: satisfies that the Kolmogorov complexity 
359: $\K(x)=\K(\sigma(x))$ (see Appendix~\ref{sect.kolmcompl} for basic definitions
360: and results on Kolmogorov complexity) 
361: up to an additive constant that is
362: independent of $x$.
363: 
364: \subsection{Rate-Distortion Vocabulary}
365: %Let ${\cal X}$ be the {\em source alphabet}
366: %consisting of a set of {\em source} objects 
367: %called {\em words} or {\em messages}. 
368: Let ${\cal X}$ be a set, called  
369: the {\em source alphabet}, whose elements are called 
370: {\em source words} or {\em messages}. 
371: We also use a set $\Y$ called the {\em destination alphabet},
372: whose elements are called {\em destination words}. 
373: (The destination alphabet is also called the reproduction alphabet.)
374: In general there are no restrictions on the set 
375: ${\cal X}$; it can be countable or uncountable.
376: However, for technical reasons, we assume ${\cal X}= \{0,1\}^*$.
377: On the other hand, it is important that the set $\Y$ consists
378: of {\em finite objects}: we need that the notion of Kolmogorov complexity
379: $\K(y)$ be defined for all $y\in\Y$. 
380: (Again, for basic definitions and results on Kolmogorov complexity 
381: see Appendix~\ref{sect.kolmcompl}.) 
382: In this paper it is not essential
383: whether we use plain Kolmogorov complexity or the  prefix 
384: variant; we use plain Kolmogorov complexity.
385: 
386: 
387: Suppose we want to communicate a source word 
388: $x \in {\cal X}$ using a {\em destination word}
389: $y \in {\Y}$ 
390: that can be encoded in at most $r$ bits in the sense that
391: the Kolmogorov complexity $\K(y) \leq r$. 
392: %(For example, if  $|{\Y}| \ll 2^r$.)
393: %If $x \in {\cal X}$, that is $x$ is a finite object,
394: %and the Kolmogorov complexity 
395: %$\K(x) > r$, 
396: %%or if $x$ is not a finite object in which case we define $\K(x)= \infty$,
397: %then $\K(y) \leq r < \K(x)$ for every destination word
398: %$y \in {\Y}$.
399: %Therefore, $x$ cannot be reproduced from any such $y$.
400: Assume furthermore that we are given
401: a {\em distortion}
402: function
403: $d: {\cal X} \times {\Y} \rightarrow {\cal R} \cup \{\infty\}$,
404: that measures the fidelity of the destination word
405: against the source word.
406: Here ${\cal R}$ denotes the nonnegative real numbers.
407: 
408: \begin{definition}\label{def.rddr}
409: \rm
410: Let $x\in {\cal X} = \{0,1\}^*$ and let ${\cal Q}$ denote the rational numbers.
411: The {\em rate-distortion function} $r_x: {\cal Q} \rightarrow {\cal N}$ is
412: the minimum number of bits in
413: a destination word $y$ 
414: to obtain a distortion of at most $\delta$ defined by
415: \[
416: r_x(\delta) = \min_{y \in {\Y}} \{\K(y) :  d(x,y)\le \delta\}.
417: \]
418: %The domain of $r_x$ is the set $\Q$ of 
419: %rational numbers.
420: The `inverse' of the above function
421: is the {\em distortion-rate function} $d_x: {\cal N} \rightarrow {\cal R}$
422:  and is 
423: defined by
424: \[
425: d_x (r) = \min_{y \in {\Y}}  \{d(x,y) :    \K(y) \leq r \}.
426: \]
427: %The domain of $d_x$ is $\cal N$.
428: \end{definition}
429: These functions are analogs for individual source words $x$ of
430: Shannon's rate-distortion
431: function defined in \eqref{eq.rndelta} and its related 
432: distortion-rate function, expressing
433: the least expected rate or distortion at which outcomes 
434: from a random source $X$ can be transmitted,
435: see Appendix~\ref{sect.ratedistortion}.
436: 
437: \subsection{Canonical Rate-Distortion Function}
438: 
439: Let ${\cal X}=\{0,1\}^*$ be the source 
440: alphabet, 
441: ${\Y}$ a destination 
442: alphabet,
443: and $d$ a distortion measure.
444: 
445: \begin{definition}\label{def.distball}
446: \rm
447: A {\em distortion ball} $B(y,\delta)$ centered on $y \in {\Y}$
448: with radius $\delta\in\cal Q$ is defined by
449: \[
450: B(y,\delta)= \{x \in {\cal X}: d(x,y) \leq \delta \},
451: \]
452: and its cardinality is denoted by $b(y,\delta) = |B(y,\delta)|$.
453: (We will consider only pairs $(\Y,d)$
454: such that all distortion balls are finite.)
455: If the cardinality $b(y,\delta)$ depends only on
456: $\delta$ but not on the center $y$, then we denote it by  $b(\delta)$.
457: The family ${\A}^{d,\Y}$ is
458: defined as the set of all nonempty distortion balls.
459: The restriction 
460: to strings of length $n$ is denoted by 
461: ${\A}^{d,\Y}_n$.
462: \end{definition}
463: %Every
464: %distortion ball corresponds uniquely with a (destination word, distortion) 
465: %pair, and if a ball corresponds to more than one such pair
466: %then we take the first one in a given order among the pairs 
467: %having the least distortion.
468: 
469: To define the canonical rate-distortion function we need
470: the notion of the Kolmogorov complexity
471: of a finite set.
472: 
473: \begin{definition}\label{def.kcset}
474: \rm
475: Fix a computable 
476: total order on the set of all strings 
477: (say the order defined in \eqref{order}). 
478: The {\em Kolmogorov complexity $\K(A)$ of a finite set} 
479: %$A=\{x_1, \ldots x_m\} \subseteq \{0,1\}^*$
480: is defined as the length of the shortest 
481: %program $p$
482: string $p$
483: such that the universal reference Turing machine $U$ 
484: %prints $U(p)=x_1, \ldots , x_m$ in a fixed order,
485: %say lexicographic, and halts.  
486: %We require that the constituent elements are 
487: %distinguishable so that we can tell
488: %them apart. 
489: given $p$ as input prints the list of all elements of $A$    
490: in the fixed order 
491: and halts. 
492: We require that the constituent elements are 
493: distinguishable so that we can tell
494: them apart. 
495: Similarly we define the {\em conditional} versions
496: $\K(A\mid z)$ and $\K(z\mid A)$ 
497: where $A$ is a finite set of strings
498: and $z$ is a string or a finite set of strings. 
499: \end{definition}
500: 
501: \begin{remark}
502: \rm
503: In Definition~\ref{def.kcset}
504: it is important that $U(p)$ halts after printing the last 
505: element in the list---in this way we know that the list is complete.
506: If we allowed $U(p)$ to not halt, then we would obtain the 
507: complexity of the so-called \emph{implicit description} of $A$, which can be
508: much smaller than $\K(A)$.
509: \end{remark}
510: \begin{remark}
511: \rm
512: We can allow  $U(p)$ to output the list of elements 
513: in any order in Definition~\ref{def.kcset}. This flexibility 
514: decreases $\K(A)$ 
515: by at most a constant not depending on $A$ but only depending
516: on the order in \eqref{order}.
517: The same applies to $\K(A\mid z)$.
518: On the other hand, if $A$ occurs in a conditional,
519: such as in $\K(z\mid A)$, then 
520: it {\em is} important that elements of $A$ are given in the fixed 
521: order. This is the case since the order in which the 
522: elements of $A$ are listed  
523: can provide extra information.
524: \end{remark}
525: 
526: \begin{definition}\label{def.Kfamily}
527: \rm
528: Fix a computable bijection $\phi$ from the family of all finite
529: subsets of $\{0,1\}^*$ to  $\{0,1\}^*$.
530: Let $\A$ be a finite family of finite subsets of ${\cal X}=\{0,1\}^*$.
531: Define the {\em Kolmogorov complexity} $\K(\A)$ 
532: by $\K(\A)=\K(\{\phi(A): A\in\A\})$.
533: \end{definition}
534: \begin{remark}
535: \rm
536: An equivalent definition 
537: of $\K(A \mid z)$ and $\K(z \mid A)$ as in Definition~\ref{def.kcset}
538: is as follows. Let $\phi$ be as in Definition~\ref{def.Kfamily}.
539: Then we can define $\K(A \mid z)$ by $\K(\phi(A) \mid z)$ and  $\K(z \mid A)$ 
540: by $\K(z \mid \phi(A))$. 
541: \end{remark}
542: 
543: \begin{definition}\label{def.gx}
544: \rm
545: For every 
546: string $x$ 
547: the {\em canonical rate-distortion function} 
548: %with respect to a distortion family of distortion balls 
549: %${\A}_n^d$ as in Definition \ref{eq.Ad}
550: $g_x:\mathcal N\to\mathcal N$
551: is defined by
552: \[
553: %g_{x}(l) = \min_{B \in {\A}^d_n} \{ \K(B) : x\in B,\log |B| \leq l\}.
554: g_{x}(l) = \min_{B \in {\A}^{d,\Y}} 
555: \{ \K(B) : x\in B,\log |B| \leq l\}.
556: \]
557: \end{definition}
558: 
559: In a similar way we can define
560: the \emph{canonical distortion-rate function}:
561: $$h_x(j)= \min_{B \in {\A}^{d,\Y}}\{\log|B|:
562: x\in B,\ \K(B)\le j\}.
563: $$  
564: 
565: \begin{definition}
566: \rm
567: A {\em distortion family} ${\A}$ is a set of finite nonempty 
568: subsets of the set of source words
569: ${\cal X}=\{0,1\}^*$. The restriction to source words of length $n$
570: is denoted by ${\A}_n$. 
571: %By ${\A}_n$ we denote the restriction of ${\A}$
572: %to strings of length~$n$.
573: \end{definition}
574: 
575: Every destination alphabet $\Y$ and 
576: distortion measure $d$ 
577: gives rise to a set of distortion balls 
578: ${\A}^{d,\Y}$, which is a distortion family. Thus
579: the class of distortion families 
580: obviously includes every family of distortion
581: balls (or distortion spheres,
582: which is sometimes more convenient)
583: arising from every combination of 
584: destination set
585: and distortion measure.
586: It is easy to see that we also can 
587: substitute the more general distortion families ${\A}$
588: for ${\A}^{d,\Y}$ in the definitions
589: of the canonical rate-distortion and distortion-rate 
590: function.
591: %For example, the set $A$ in Definition~\ref{def.kcset} 
592: %can be a distortion ball
593: %as in Definition~\ref{def.distball}, and such a distortion ball
594: %is an element of the distortion family ${\A}^d$ consisting of
595: %all distortion balls with respect to a given distortion measure $d$
596: %as in Definition~\ref{def.distfam}.
597: %\end{example}
598: %Given a string $x$, we can look for a
599: %finite set $A \in {\A}^d$ that contains $x$ and is both simple
600: %and small, as follows. 
601: %For every $x\in \{0,1\}^*$ we identify
602: %the set of pairs of integers $(k,l)$ such that
603: %there is $A\in {\A}^d$ with $x\in A$, $\K(A)\le k$, and $ \log |A|\le l$.
604: %The set $P_x$ of all such pairs will be called the {\em profile of $x$}.
605: %Strings of the same complexity can have quite different profiles.
606: %All such pairs $(k,l)$ satisfy the inequality 
607: %$k+l\geq \K(x\mid  \min\{|k|,|l|\})$ up to an additive constant term
608: %(since
609: %we can reconstruct $x$ by providing a constant bit reconstruction
610: %program with a $k$-bit description of $A$,
611: %an $l$-bit ordinal number of $x$ in $A$, 
612: %and the minimum length of these two in the conditional term two tell the two
613: %apart).
614: %%\end{example}
615: 
616: %In \cite{VV02} the authors analyzed Kolmogorov's structure function
617: %which is actually 
618: %%the particular distortion measure of the 
619: %%later Example~\ref{exam.list}.
620: %the canonical distortion-rate function for the distortion 
621: %family $\A$ consisting of \emph{all} finite subsets of 
622: %$\mathcal X=\{0,1\}^*$, 
623: %This distortion family is equal to $\A^{d,\Y}$ 
624: %where ${\Y}$ consists of all nonempty finite
625: %subsets of $\{0,1\}^*$, and the distortion of $x \in \{0,1\}^*$
626: %with respect to $S \subseteq \{0,1\}^*$ is $d(x,S)=\lceil \log |S| \rceil$
627: %\begin{changekolya}if $x\in S$ and $\infty$ otherwise\end{changekolya}.
628: %This is the maximum number of bits required to identify an element
629: %of $S$.
630: %The rate-distortion function 
631: %$r_x(\delta)$ is 
632: %$\min_{S \subseteq \{0,1\}^n}  \{\K(S): d(x,S) \leq \delta \}$.
633: %$\min_{S \subseteq \{0,1\}^*}  \{\K(S): \lceil \log |S| \rceil \leq \delta \}$
634: %and essentially coincides with the canonical rate-distortion function
635: %($g_x$ is the restriction of $r_x$ to $\cal N$).
636: %
637: %
638: %
639: %In general, destination words are not sets. But for the source alphabet
640: %${\cal X} = \{0,1\}^*$, a destination alphabet ${\Y}$,
641: %and a distortion measure $d$, we consider the family of distortion
642: %balls ${\A}^d_n$ consisting of a 
643: %particular subset of all nonempty subsets of $\{0,1\}^n$.
644: %Let $x \in {\cal X}$ be of length $n$ and $y \in {\Y}$ such that
645: %the radius (that is, distortion) 
646: %$d(x,y)= \delta$. Then, we associate the distortion ball $B(y,\delta)$
647: %with the destination word $y$ for the source word $x$.
648: %Because not all nonempty subsets of $\{0,1\}^n$ are distortion balls
649: %of ${\A}^d_n$, the rate-distortion function restricted
650: %to the latter family of distortion balls behaves differently from before. 
651: %The new `rate-distortion function' now becomes
652: %$\min_{S \in {\A}^d_n} \{\K(S): \log |S| 
653: %\leq \delta\}$.  This leads to 
654: %Definition~\ref{def.gx} of the function $g_x$ below. By 
655: %Lemma~\ref{lem.rg} below it turns out that by analyzing the 
656: %single function $g_x$ we obtain the rate-distortion functions for
657: %every distortion measure by a simple transformation requiring
658: %the cardinality of the distortion balls. 
659: %%These rate-distortion functions 
660: %%are therefore computable provided the cardinality of the distortion balls
661: %%is computable. 
662: %%The transformation between a distortion ball and the
663: %%corresponding destination word and distortion 
664: %%is computable provided the distortion ball
665: %%is computable from the destination word and distortion.
666: %
667: %
668: In general,
669: the canonical rate-distortion function of $x$ can be quite different
670: from the rate-distortion function of $x$. However, by 
671: Lemma~\ref{lem.rg} below it turns out that  
672: for every distortion measure satisfying certain conditions
673: and for every $x$  
674: the rate-distortion function 
675: $r_x$ is obtained from $g_x$ by a simple transformation requiring
676: the cardinality of the distortion balls.
677: %$r_x(\delta)=g_x(\lceil \log b(\delta)\rceil)+O(\log|x|+\K(\delta))$ for all
678: %rational $\delta$. Here $b(\delta)$ stands for $\max_{y\in\Y}b(y,\delta)$.
679: 
680: 
681: \begin{remark}
682: Fix a string $x\in\mathcal X=\{0,1\}^*$ 
683: and consider different distortion families $\A$.
684: Let $g_x^\A$ denote the canonical rate-distortion
685: function of $x$ with respect to a family $\A$.
686: Obviously, if $\A\subset\BB$ 
687: then $g_x^\A$ is pointwise not less than 
688: $g_x^\BB$ (and it may happen that $g_x^\A(i)\gg g_x^\BB(i)$ for some $i$). 
689: But as long as $\A$ satisfies certain natural properties,
690: the set of all possible $g_x$, when
691: $x$ ranges over $\mathcal X$, does not depend on the particular $\A$ 
692: involved, see
693: Theorem~\ref{theo.allshapesrd}.
694: \end{remark}
695: %Consider a source word $x$ of length $n$.
696: %For every natural $l\leq n$, the function value 
697: %$g_x(l)$ is the minimum $k$
698: %such that the pair $(k,l)$ belongs
699: %to the profile of $x$. 
700: 
701: 
702: \subsection{Use of the Big O Term}
703: In the sequel we use `additive constant $c$' or
704: equivalently `additive $O(1)$ term' to mean a constant,
705: accounting for the length of a fixed binary program,
706: independent from every variable or parameter in the expression
707: in which it occurs. 
708: Similarly we use 
709: `$O(f(m,n,\dots))$' to mean a function $g(m,n,\dots)$
710: such that $g(m,n,\dots) \leq c f(m,n,\dots)+c$ where $c$ 
711: is a fixed constant 
712: independent from every variable $m,n,\dots$ in the expression.
713: 
714: 
715: \section{Distortion Measures}
716: 
717: %\begin{definition}
718: %\rm
719: %The {\em Kolmogorov complexity of a finite family ${\A}$ 
720: %of finite nonempty subsets}
721: %$A_1, \ldots, A_m$  
722: %of $\{0,1\}^*$ is defined by $\K({\A})=
723: %\K(A_1, \ldots, A_m)$, where the 
724: %sequence $A_1, \ldots , A_m$ 
725: %is in a fixed order,
726: %say lexicographic, the constituent sets are delimited so we can tell
727: %them apart, and the elements of the constituent sets are in fixed order
728: %(say lexicographic) and delimited.
729: %\end{definition}
730: 
731: Since every family of distortion 
732: balls is a distortion family,
733: considering arbitrary distortion measures and destination alphabets 
734: results in distortion families. We consider
735: the following mild conditions on 
736: distortion families~${\A}$:
737: \begin{description}
738: \item{\bf Property 1.}
739: For every natural number $n$, 
740: the family ${\A}$ contains
741: the set $\{0,1\}^n$ of all strings of length $n$ as an element.
742: \item{\bf Property 2.}
743: All $x,y\in A\in {\A}$ satisfy 
744: $|x|=|y|$.
745: \item{\bf Property 3.}
746: Recall that ${\A}_n = \{A \in {\A}: A \subseteq  \{0,1\}^n \}$.
747: Then, $\K({\A}_n)=O(\log n)$.
748: \item{\bf Property 4.}
749: For every natural $n$, let
750: $\alpha_n$ denote the minimal number
751: that satisfies the following.
752: For every positive integer $c$ every 
753: set $A\in {\A}_n$ can be covered by at most
754: %$\alpha_n |A|/c$ sets $B\in {\A}_n$ with $|B| \leq c$.
755: $\alpha_n |A|/c$ sets $B\in {\A}$ with $|B| \leq c$.
756: Call $\alpha_n$
757: the {\em covering coefficient} related to ${\A}_n$.
758: Property 4 is satisfied if $\alpha_n$ is bounded by
759: a polynomial in $n$.
760: The smaller the covering coefficient is, the more accurate will
761: be the description
762: that we obtain of the shapes of the structure functions below.
763: \end{description}
764: The following three example
765: families ${\A}$ satisfy all four properties.
766: \begin{example} \label{exam.list}
767: \rm
768: ${\cal L}$ {\em the list distortion family}. 
769: Let ${\cal L}_n$ 
770: be the family of all nonempty subsets
771: of $\{0,1\}^n$. 
772: This is the family of distortion balls 
773: for list distortion, which we define as follows.
774: Let 
775: ${\cal X} =\{0,1\}^*$ and 
776: ${\Y}=\bigcup_n\mathcal L_n$.
777: A  source word $x \in \{0,1\}^n$ is 
778: encoded by a destination word
779: which is a subset or {\em list} 
780: $S \subseteq \{0,1\}^n$ with $x \in S$.
781: Given $S$, we can retrieve $x$ by its index of $\log |S|$ bits in $S$,
782: ignoring rounding up, whence the name `list code.'
783: The distortion measure is $d(x,S)= \log |S|$ if $x \in S$,
784: and $\infty$ otherwise. Thus, distortion balls come only in the form 
785: $B(S,\log |S|)$ with cardinality $b(S,\log |S|)=|S|$.
786: Trivially, the covering coefficient 
787: as defined in property~4,
788: for the list distortion family ${\cal L}$,
789: satisfies $\alpha_n \leq 2$.
790: Reference~\cite{VV02} describes
791: all possible canonical distortion-rate curves, called  
792: Kolmogorov's  structure function there and first defined in \cite{Ko74}.
793: %More precisely, 
794: %the function $h_x(i)$ equals $d_x(i)$, the distortion-rate function
795: %for the
796: %distortion family ${\cal L}_n$. 
797: The distortion-rate function for list distortion 
798: coincides with the canonical distortion-rate function.
799: The rate-distortion
800: function of $x$ for list distortion is
801: \[
802: r_x(\delta) = 
803: \min_{S \subseteq \{0,1\}^n} \{\K(S): x \in S , \; \log |S| \leq \delta \}
804: \]
805: and essentially coincides with the canonical rate-distortion function
806: ($g_x$ is the restriction of $r_x$ to $\cal N$). 
807: %The canonical rate-distortion function $g_x$ 
808: %can be converted to the particular rate-distortion
809: %function $r_x$ for a family ${\cal L}_n$ according to
810: %\eqref{eq.sfrd}.
811: \end{example}
812: 
813: \begin{example}
814: \rm
815: ${\cal H}$ {\em the Hamming distortion family}. 
816: Let ${\cal X} = {\Y} =\{0,1\}^*$.
817: A source word  $x \in \{0,1\}^n$ is
818: encoded by a destination word $y \in \{0,1\}^n$. 
819: For every positive integer $n$, the {\em Hamming distance}
820: between two strings $x= x_1 \ldots x_n$ and
821: $y =y_1 \ldots y_n$ is defined by 
822: \begin{equation}\label{eq.hamdist}
823: d(x,y)= \frac{1}{n} |\{i : x_i\neq y_i\}|.
824: \end{equation}
825: If $x$ and $y$ have different lengths, then $d(x,y)=\infty$.
826: A {\em Hamming ball} in $\{0,1\}^n$ with center
827: $y\in \{0,1\}^n$ and radius $\delta$ ($0 \leq \delta \leq 1$)  is the set 
828: $B(y,\delta)=\{x\in\{0,1\}^n: d(x,y)\le \delta \}$.
829: Every $x$ is in either $B( 00\ldots 0,\frac{1}{2})$ or
830: $B(11\ldots 1,\frac{1}{2})$, so we need to consider only 
831: Hamming distance $0 \leq \delta \leq \frac{1}{2}$.
832: Let ${\cal H}_n$ be the family of all Hamming balls
833: in $\{0,1\}^n$. 
834: We will use the following
835: approximation of $b(\delta)$---the cardinality of Hamming balls 
836: in ${\cal H}_n$ of radius 
837: $\delta$.
838: Suppose that $0 \le \delta \le \frac{1}{2}$ and $\delta n$ is an integer,
839: and let
840: $H(\delta)=\delta\log 1/\delta+(1-\delta)\log1/(1-\delta)$
841: be Shannon's binary entropy function. Then,
842: \begin{equation}
843: \label{binom-entropy}
844: 2^{n H(\delta)-\log n/2-O(1)} \leq
845: b(\delta) \leq 2^{nH(\delta)}.
846: \end{equation}
847: In Appendix~\ref{sect.exhamming} 
848: it is shown that the covering coefficient 
849: as defined in property~$4$,
850: for the Hamming distortion family ${\cal H}_n$,
851: satisfies $\alpha_n = n^{O(1)}$. The function
852: \[
853: r_x(\delta) = \min_{y \in \{0,1\}^n} \{\K(y): 
854:  d(x,y) \leq \delta  \}
855: \]
856: is the rate-distortion
857: function of $x$ for Hamming distortion. An approximation to
858: one such function is depicted in Figure~\ref{ham.eps}.
859: \end{example}
860: 
861: \begin{example}
862: \rm
863: ${\cal E}$ {\em the Euclidean distortion family}. 
864: Let ${\cal E}_n$ be 
865: the family of all intervals in $\{0,1\}^n$,
866: where an interval  is a  
867: subset of $\{0,1\}^n$ of the form $\{x: a\leq x\leq b\}$
868: and $\leq$ denotes the lexicographic ordering on $\{0,1\}^n$.
869: Let ${\Y} =\{0,1\}^*$.
870: A  source word $x \in \{0,1\}^n$ is
871: encoded by a destination word $y \in \{0,1\}^n$. 
872: Interpret strings in $\{0,1\}^n$ as
873: binary notations for rational numbers in the segment $[0,1]$.
874: Consider the Euclidean distance $|x-y|$
875: between rational numbers $x$ and $y$.
876: The balls in this metric are intervals;
877: the cardinality of a ball of radius $\delta$
878: is about $\delta 2^n$.
879: Trivially, the covering coefficient 
880: as defined in property~$4$,
881: for the Euclidean distortion family ${\cal E}_n$,
882: satisfies  $\alpha_n \leq 2$.
883: The function
884: \[
885: r_x(\delta) = \min_{y \in \{0,1\}^n} \{ \K(y):  |x-y| \leq \delta \}
886: \]
887: is the rate-distortion
888: function of $x$ for Euclidean distortion.
889: \end{example} 
890: All the properties 1 through 4 
891: are straightforward for all three families, 
892: except property~$4$ in the case
893: of the family of Hamming balls.
894: 
895: \section{Shapes}\label{sec1}
896: 
897: The rate-distortion functions of the 
898: individual strings of length $n$ can assume roughly
899: every shape. That is, every shape 
900: derivable from a function in the large family
901: $G_n$ of Definition~\ref{def.gx} below through transformation
902: \eqref{eq.sfrd}.
903: 
904: We start the formal part of this section. 
905: Let ${\A}$ be a distortion family satisfying
906: properties~1 through~4.
907: 
908: Property $1$ implies that $\{0,1\}^n \in {\A}$ and property $4$ 
909: applied to $\{0,1\}^n$ and $c=1$,
910: for every $n$, implies trivially that
911: the family ${\A}$ contains the singleton set
912: $\{x\}$ for every $x\in\{0,1\}^*$. Hence,
913: $$
914: g_x(0)= \K(\{x\})= \K(x)+O(1).
915: $$ 
916: Property~$1$
917: implies that for every $n$ and string $x$ of length $n$,
918: \[
919: g_x(n)\leq \K(\{0,1\}^n)=\K(n)+O(1)\leq \log n+O(1).
920: \]
921: Together this means that for every $n$ and every
922: string $x$ of length $n$,  
923: the function $g_x(l)$ decreases from about $\K(x)$
924: to about $0$ as $l$ increases from 0 to $n$.
925: 
926: \begin{lemma}\label{lem.shapesg}
927: Let ${\A}$ be a distortion family satisfying
928: properties~$1$ through $4$.
929: For every $n$ and every string $x$ of length $n$ we have
930: $g_x(n)= O(\log n)$, and
931: $0\le g_x(l)-g_x(m)\leq m-l+O(\log n)$
932: for all $l<m\leq n$. 
933: \end{lemma}
934: \begin{proof}
935: The first equation and the left-hand inequality of
936: the second equation are
937: straightforward.
938: To prove
939: the right-hand inequality
940: %translate it
941: %into the following property of the profile of $x$:
942: %If  a pair $(k,m) \in P_x$
943: %and $l<m$, then
944: %also the pair $(k+m-l+O(\log n),l) \in P_x$.
945: %
946: %Let 
947: let $A$ witness $g_x(m)=k$, which implies that 
948: $\K(A)=k$ and $ \log |A|\leq m$. By Property 4 there is 
949: a covering of $A$ by at most $\alpha_n |A|/2^{l}$ sets in ${\A}_n$
950: of cardinality at most $2^{l}$ each. 
951: Given a list of $A$ and a list of $\A_n$, we can find 
952: such a covering.
953: Let $B$ be one of 
954: the covering sets containing $x$.
955: Then, $x$ can be specified by $A,n,l,\A_n$ 
956: and the index $i$ of $B$
957: among the covering sets.
958: We need also $O(\log k+\log\log i+\log\log l +\log \log n)$
959: extra bits to separate the descriptions of $A$ and $\A_n$, and 
960: the binary representations of $i,n,l$, from one another.
961: Without loss of generality we can assume that $k$
962: is less than $n$.
963: Thus all the extra information
964: and separator bits are included in $O(\log n)$ bits.
965: Altogether,
966: $\K(B)\leq \K(A) +m-l +O(\log n)\leq k +m-l +O(\log n)$, which shows
967: that $g_x(l)\le k+m-l+O(\log n)=g_x(m)+m-l+O(\log n)$.
968: \end{proof}
969: 
970: 
971: \begin{example}\rm
972: Lemma~\ref{lem.shapesg} shows
973: that  
974: $$
975: \K(x)-i-O(\log n)\leq g_x(i)\leq n-i+O(\log n),
976: $$
977: for every $0 \leq i \leq n$.
978: The right-hand inequality 
979: is obtained by setting $m=n$, $l=i$ in
980: the lemma, yielding
981: $$
982: g_x(i)=g_x(i)-g_x(n)+O(\log n)\leq n-i+O(\log n).
983: $$
984: The left-hand inequality 
985: is obtained by setting $l=0$, $m=i$ 
986: in the lemma, yielding
987: $$
988: \K(x)-g_x(i)=g_x(0)-g_x(i) +O(1)\le i-0+O(\log n).
989: $$
990: The last displayed equation can also be shown by a simple direct argument:
991: $x$ can be described by the minimal description
992: of the set $A \in {\A}$ 
993: witnessing $g_x(i)$ and by the ordinal number of $x$ in $A$.
994: \end{example}
995: 
996: The rate-distortion
997: function $r_x$ differs
998: from $g_x$ by just a change of scale depending on the distortion family
999: involved, provided certain computational requirements are fulfilled.
1000: See Appendix~\ref{sect.computability} for computability notions. 
1001: 
1002: \begin{lemma}\label{lem.rg}
1003: Let  ${\cal X} = \{0,1\}^*$, ${\Y}$, and $d$, be the 
1004: source alphabet, destination alphabet,
1005: and distortion measure, respectively. 
1006: Assume that the set 
1007: $\{\pair{x,y,\delta}\in\mathcal X\times\Y\times\Q: d(x,y)\le \delta\}$
1008: is decidable; that $\Y$ is recursively enumerable; and  
1009: %Assume that 
1010: %there is an algorithm that given $y\in\Y$ and a rational
1011: %$\delta$ outputs a list of $B(y,\delta)$, and conversely, 
1012: %given any list of $B(y,\delta)$ and  
1013: %$\delta$, outputs a $y'\in\Y$ with
1014: %$B(y',\delta)=B(y,\delta)$. 
1015: that for every $n$ the cardinality 
1016: of every ball in ${\A}^{d,\Y}_n$ of radius $\delta$ is at most 
1017: $b_n(\delta)$ and at least $b_n(\delta)/\beta(n)$, where 
1018: $\beta(n)$ is polynomial in $n$ and $b_n(\delta)$ is a function
1019: of $n,\delta$; and that the distortion family $\A^{d,\Y}$
1020: satisfies properties 1 through 4.  
1021: Then, for every $x\in\{0,1\}^n $ and every rational $\delta$
1022: we have 
1023: \begin{equation}\label{eq.sfrd}
1024: r_x (\delta ) = g_x(\lceil \log b_n(\delta) \rceil)+O(\K(\delta)+\log n).
1025: \end{equation}
1026: \end{lemma}
1027: \begin{proof}
1028: Fix $n$ and a string $x$ of length $n$.
1029: Consider the auxiliary function
1030: \begin{equation}\label{eq.tilde}
1031: \tilde r_x(\delta) = \min_{y\in \Y} \{\K(B(y,\delta)): 
1032:  d(x,y) \leq \delta  \}.
1033: \end{equation}
1034: We claim that 
1035: $\tilde r_x(\delta)= r_x(\delta)+O(\K(\delta)+\log n)$.
1036: Indeed, let $y$ witness $r_x(\delta)=k$. 
1037: Given $y,\delta,n$ we can compute
1038: a list of elements of the ball $B(y,\delta)$: for all strings 
1039: $x'$ of length $n$ determine whether $d(x',y)\le\delta$. 
1040: %Moreover, we do not need to know $n$ in advance:  
1041: %we can find $n$ as the length of any string $x'$ with $d(x',y)\le\delta$;
1042: %by property 2 
1043: %its length equals $n$.
1044: Thus $\K(B(y,\delta))<k+O(\K(\delta)+\log n)$, hence 
1045: $\tilde r_x(\delta)<k+O(\K(\delta)+\log n)$.
1046: Conversely, let $B(y, \delta)$ witness $\tilde r_x(\delta)=k$. 
1047: Given a list of the elements of $B(y,\delta)$ and $\delta$
1048: we can recursively enumerate ${\Y}$ to find the first element
1049: $y'$ with $B(y',\delta)=B(y,\delta)$ (for every enumerated $y'$ compute 
1050: the list $B(y',\delta)$ and compare it to the given list $B(y,\delta)$). 
1051: Then,
1052: $\K(y')\le k+O(\K(\delta))$ and $d(x,y')\le\delta$.
1053: Hence $r_x(\delta)<k+O(\K(\delta))$.
1054: 
1055: Thus, it suffices to show that 
1056: \[
1057: \tilde r_x (\delta ) = g_x(\lceil \log b_n(\delta) \rceil)+O(\log n).
1058: \]
1059: 
1060: ($g_x(\lceil \log b_n(\delta) \rceil)\leq\tilde r_x (\delta)$)
1061: Assume $\tilde r_x(\delta)=k$ is witnessed by a distortion ball $B(y, \delta)$.
1062: By our assumption, the  cardinality of $B(y,\delta)$ is at most 
1063: $b_n(\delta)$, and hence $g_x(\lceil \log b_n(\delta) \rceil ) \leq k$. 
1064: 
1065: ($\tilde r_x (\delta) \leq g_x(\lceil \log b_n(\delta) \rceil)+O(\log n)$)
1066: By Lemma~\ref{lem.shapesg},
1067: $g_x(l)$ and $g_x(l-m)$ differ by at most $m+O(\log n)$. 
1068: Therefore it suffices to show that 
1069: $\tilde r_x (\delta) \leq g_x(\lceil \log b_n(\delta) \rceil-m)$
1070: for some $m=O(\log n)$. We claim that this happens for
1071: $m=\lceil\log\beta(n)\rceil+1$. Indeed, let  
1072: $g_x(\lceil \log b_n(\delta) \rceil-m)=k$ be witnessed
1073: by a distortion ball $B$. Then, 
1074: $|B|\le 2^{\lceil\log b_n(\delta)\rceil}/(2\beta(n))<
1075: b_n(\delta)/\beta(n)$.
1076: This implies that the radius of $B$ is less than $\delta$
1077: and hence $B$ witnesses $\tilde r_x (\delta)\le k$. 
1078: \end{proof}
1079: 
1080: \begin{remark}\label{rem.logn} 
1081: \rm
1082: When measuring distortion we usually do 
1083: not need rational numbers with numerator or denominator more
1084: than $n=|x|$. Then, the term $O(C(\delta))$ in \eqref{eq.sfrd}
1085: is absorbed by the term $O(\log n)$. 
1086: Thus, describing the family of $g_x$'s  we obtain an approximate
1087: description of all possible rate-distortion functions $r_x$ for 
1088: given destination alphabet and distortion measure, satisfying the computability 
1089: conditions, by using the transformation \eqref{eq.sfrd}.
1090: An example of an approximate
1091: rate-distortion curve $r_x$ for some string $x$
1092: of length $n$ for Hamming distortion is given in Figure~\ref{ham.eps}.
1093: \end{remark}
1094: \begin{remark} 
1095: \rm
1096: The computability properties of the functions
1097: $r_x$, $d_x$, and $g_x$, as well as the relation between
1098: the destination word for a source word and the related distortion ball, are
1099: explained in Appendix~\ref{sect.computability}.
1100: \end{remark}
1101: 
1102: We present an approximate 
1103: description of the family of possible $g_x$'s below. It turns
1104: out that the description does not depend on the particular distortion family
1105: $\A$ as long as properties 1 through 4 are satisfied.
1106: 
1107: 
1108: \begin{definition}
1109: \rm
1110: Let $G_n$ stand for the class of all
1111: functions $g:\{0,1,\dots,n\}\rightarrow {\cal N}$ such
1112: that $g(n)=0$  and
1113: $g(l-1)\in\{g(l),g(l)+1\}$ for all
1114: $1\leq l \leq n$.
1115: \end{definition}
1116: 
1117: In other words, a function $g$ is in $G_n$ iff
1118: it is nonincreasing and the function $g(i)+i$
1119: is nondecreasing and $g(n)=0$.
1120: The following result is a generalization to
1121: arbitrary distortion measures of Theorem IV.4
1122: in \cite{VV02}
1123: dealing with $h_x$ (equaling $d_x$ in the particular case
1124: of the distortion family 
1125: ${\cal L}$). There, the precision in Item (ii) for source words of length $n$
1126: is $O(\log n)$, rather than the $O(\sqrt{n \log n})$ we obtain
1127: for general distortion families.
1128: 
1129: \begin{theorem}\label{theo.allshapesrd}
1130: Let ${\A}$ be a distortion family satisfying
1131: properties~$1$ through~$4$.
1132: 
1133: {\rm (i)}  For every $n$ and every string $x$ of length $n$, the function
1134: $g_x(l)$ is equal to $g(l)+O(\log n)$ for some function $g \in G_n$.
1135: 
1136: {\rm (ii)}
1137: Conversely, for every $n$ and every function $g$ in $G_n$,
1138: there is a string
1139: $x$ of length $n$ such that for every $l=0,\dots,n$,
1140: $g_x(l)=g(l)+O(\sqrt{n\log n})$.
1141: \end{theorem}
1142: 
1143: \begin{remark}
1144: \rm
1145: For fixed $k \leq n$ the number of different integer functions $g \in G_n$ 
1146: with
1147: $g(0) = k$ 
1148: %$g(n)=0$, and $g(l)=\{g(l-1), g(l-1)-1\}$,
1149: is ${n \choose k}$. 
1150: For $k=\frac{1}{2}n$,
1151: this number is of order $2^n/\sqrt{ n}$,
1152: and therefore far greater than the number
1153: of strings $x$ of length 
1154: $n$ and Kolmogorov complexity 
1155: $\K(x) = k = \frac{1}{2}n$ which is at most $2^{n/2}$.                   
1156: This explains the fact that in Theorem~\ref{theo.allshapesrd}, Item (ii),
1157: we cannot precisely match a string $x$ of length $n$ to
1158: every function $g \in G_n$, and therefore have to use approximate
1159: shapes.
1160: \end{remark}
1161: 
1162: \begin{example}
1163: \rm
1164: By Theorem~\ref{theo.allshapesrd}, Item (ii), for every $g \in G_n$ 
1165: there is a string $x$ of length $n$ that has $g$ for its canonical
1166: rate-distortion function $g_x$ up to an additive $O(\sqrt{n \log n})$ term. 
1167: By \eqref{binom-entropy}, \eqref{eq.sfrd}, and Remark~\ref{rem.logn},
1168: $$
1169: r_x(\delta)=
1170: g_x(nH(\delta))+O(\log n),
1171: $$
1172: for $0 \leq \delta \leq \frac{1}{2}$.
1173: \begin{figure}[ht]
1174: \begin{center}
1175: \epsfxsize=3.5in
1176: \leftline{\hskip8pc\epsfbox{ie4.eps}}
1177: \end{center}
1178: \caption{An approximate rate-distortion function for Hamming distortion}
1179: \label{ham.eps}
1180: \end{figure}
1181: Figure~\ref{ham.eps} gives the graph of a particular function 
1182: $r(\delta) = g(nH(\delta))$ with $g$ defined as follows:
1183:  $g(l) = n(1+H(\frac{1}{6})-H(\frac{1}{3}))-l$
1184: for $0 \leq l \leq nH(\frac{1}{6})$, 
1185: $g(l)=n(1-H(\frac{1}{3}))$ for 
1186: $nH(\frac{1}{6}) < l \leq nH(\frac{1}{3})$,
1187: and $g(l)=n-l$ for $nH(\frac{1}{3}) < l \leq n$. 
1188: In this way, $g \in G_n$.
1189: Thus, there is a string $x$ of length $n$ with its rate-distortion
1190: graph $r_x (\delta)$ 
1191: in a strip of size $O(\sqrt{n\log n})$ around the
1192: graph of $r(\delta)$. Note that $r_x$ is almost constant on
1193: the segment $[ \frac{1}{6}; \frac{1}{3}]$.
1194: Allowing the
1195: distortion to increase on this interval, all the way from
1196: $\frac{1}{6}$ to $ \frac{1}{3}$, so allowing $n/6$ incorrect extra
1197: bits, we still cannot significantly decrease the rate.
1198: This means that the distortion-rate function $d_x(r)$
1199: of $x$ drops from $\frac{1}{3}$ to $\frac{1}{6}$ 
1200: near the point $r=n(1-H(\frac{1}{3}))$,
1201: exhibiting a very unsmooth behavior.
1202: \end{example}
1203: 
1204: \section{Characterization}
1205: 
1206: Theorem~\ref{th-shannon-analog} below states that a destination word that
1207: codes a given source word and
1208:  minimizes the algorithmic mutual information with 
1209: the given source word gives no
1210: advantage in rate 
1211:  over a minimal Kolmogorov complexity destination word that codes the source word.
1212: This 
1213: theorem
1214: can be compared with Shannon's theorem, Theorem~\ref{theo.shannon} in
1215: Appendix~\ref{sect.ratedistortion}, about
1216: the expected rate-distortion curve of a random variable.
1217: %This result on the rate-distortion function
1218: %of an individual source word 
1219: %that can be compared with Shannon's product space. In the
1220: %product space, due to asymptotic equidistribution, the characterization
1221: %by minimum information and minimum entropy coincide asymptotically.
1222: %contrasts with Shannon's 
1223: %rate-distortion function for
1224: %a random variable:
1225: %in Shannon's case
1226: %the minimum information of some random variable with the 
1227: %source random variable can be less than the minimum entropy of 
1228: %a function of the source random variable.
1229: 
1230: 
1231: %In formal terms: let $X$ be a random variable 
1232: %with outcomes in ${\cal X}$ and
1233: %$X_1, X_2, \ldots, X_n$ consist of $n$ i.i.d. 
1234: %copies of $X$ denoted by $X^n$.
1235: %The second part of Shannon's theorem, Theorem~\ref{theo.shannon},
1236: %states that there exists a random variable $Z$ taking values in 
1237: %the destination alphabet ${\Y}$, such that
1238: %we can code the outcomes in ${\cal X}^n$ (the  source words) in about
1239: %$nI(X;Z)$ bits (lengths of the destination words) 
1240: %with the average distortion between the source-word outcomes of  $X^n$ and 
1241: %their destination words, divided by $n$, being 
1242: %close to ${\bf E} d(X,Z)$
1243: %as $n$ grows large. Whether we minimize $H(U)$ or $I(X;U)$ 
1244: %we obtain approximately $nI(X;Z)$ where $Z$ minimizes the expression. 
1245: %The algorithmic version below about individual
1246: %finite binary strings differs from Shannon's theorem as explained in Example~\ref{exam.shannonstheorem}.
1247: %This statement cannot be strengthened further by adding the
1248: %requirement that for every such
1249: %random variable $Z$ there is a random variable $U$
1250: %with ${\bf E}d(X,U)\leq{\bf E} d(X,Z)$ and $H(U)\leq I(X;Z)$.
1251: %In the algorithmic setting an analogue of this
1252:  %strong statement is true,
1253: %as the following theorem shows. 
1254: 
1255: \begin{theorem}    \label{th-shannon-analog}
1256: Let ${\A}$ be a distortion family 
1257: satisfying properties~$2$
1258: and~$3$, 
1259: %${\A}_n = {\A} \bigcap \{0,1\}^n$, 
1260: and
1261:  ${\A}(x) = \{A  \in {\A}: x \in A\}$.
1262: For every $n$ and string $x$ of length $n$ and every $B \in {\A}(x)$
1263: there is an $A \in {\A}(x)$ with 
1264: $\lceil \log |A|\rceil =\lceil \log |B| \rceil$ and
1265: $\K(A)\leq I(x:B)+O(\log \K(B)+\log n)$,
1266: where $I(x:B)=\K(B)-\K(B\mid x)$ stands for the
1267: algorithmic information in $x$ about $B$.
1268: %Here
1269: %$\eps=O(\log n +\K(\A_n)+\log\K(B))$ and $n=|x|$.
1270: \end{theorem}
1271: 
1272: For further information about $I(x:B)$ see Definition~\ref{def.mi} in
1273: Appendix~\ref{sect.kolmcompl}.
1274: The proof of Shannon's theorem, Theorem~\ref{theo.shannon},
1275: and the proof of the current theorem are very different.
1276: The latter proof uses techniques 
1277: that may be
1278: of independent interest.
1279: In particular, we use an online 
1280: set cover algorithm where the sets come sequentially and we always have
1281: to have the elements covered that occur in a certain number of sets; see
1282: Lemma~\ref{th5} in Appendix~\ref{sect.proofs}.
1283: %It uses techniques
1284: %that may be
1285: %of independent interest; see Exercise~\ref{th5} on page~\pageref{th5}.
1286: % The
1287: %Kolmogorov complexity of a string $x$
1288:  %is the length of the shortest program that
1289: %produces $x$. It could happen that there are several shortest
1290: %programs for the same string $x$; however, one can prove that
1291: %there could not be too many of them: if there are $2^m$ programs
1292: %of length $k$ that produce string $x$, then there exists a shorter
1293: %program that produces $x$ (of length approximately $k-m$).
1294: %Exercise~\ref{th5} on page~\pageref{th5}
1295:  %generalizes this statement for the case of approximate
1296: %descriptions. Informally, it states the following:
1297: %if there are $2^m$ sets in
1298: %a family ${\A}$ as above, each of
1299: %complexity at most $k$ and containing a given string $x$, then
1300: %one of these sets has complexity about $k-m$.
1301: 
1302: %\begin{example}\label{exam.shannonstheorem}
1303: %\rm
1304: %Note that for an appropriate distortion family ${\A}$
1305: %we have that $\lceil \log |A|\rceil = \lceil \log |B| \rceil$ equals
1306: %$\lceil \log b(\delta)\rceil$
1307: %in \eqref{eq.sfrd},
1308: %where it is the log-cardinality of a distortion ball 
1309: %in the distortion family ${\A}_n$. In this way we can determine 
1310: %the value of $g_x (\lceil \log b(\delta) \rceil)$
1311: %and subsequently retrieve
1312: %both the distortion $\delta$ concerned
1313:  %and the value of the rate-distortion function
1314: %$r_x(\delta)$.
1315: %The theorem states that 
1316: %a destination word
1317:  %minimizing the algorithmic mutual information with
1318: %the given source word gives no
1319: %advantage in rate (a pointwise less rate-distortion curve)
1320:  %over a minimal complexity destination word.
1321: %The contrast 
1322: %with Shannon's rate-distortion function is already explained 
1323: %at the start of this section.
1324: %\end{example}
1325: 
1326: 
1327: 
1328: \begin{example}
1329: \rm
1330: Theorem~\ref{th-shannon-analog} states that
1331: for an appropriate distortion family ${\A}$ of nonempty finite subsets
1332: of $\{0,1\}^*$ 
1333: and for every string $x \in \{0,1\}^*$, if there exists an $A\in {\A}$
1334: of cardinality
1335: $2^l$ or less
1336: containing $x$ that has small algorithmic information about $x$,
1337: then there exists another
1338: set $B\in {\A}$ containing $x$ that has also at most $2^l$ elements
1339: and has small
1340: Kolmogorov complexity itself.
1341: For example, in the case of Hamming distortion, if for a given string $x$
1342: there exists a string $y$ at Hamming distance
1343: $\delta$ from $x$
1344: that has small information about $x$, then there exists another
1345: string $z$ that is also within distance $\delta$ of  $x$ and has small
1346: Kolmogorov complexity itself (not only small algorithmic 
1347: information about $x$).
1348: \end{example}
1349: 
1350: \section{Fitness of Destination Word}\label{sect.fitness}
1351: 
1352: %For every distortion measure (subject to some mild restrictions)
1353: %and source word,
1354: In Theorem~\ref{th45} we show that if a destination word 
1355: of a certain maximal Kolmogorov complexity
1356: has minimal distortion with respect to the source word, then it
1357: also is the (almost) best-fitting destination word in the sense 
1358: (explained below)
1359: that
1360: among all destination words of that Kolmogorov complexity
1361: it has the most properties in common with the
1362: source word.
1363: `Fitness' of individual strings to an individual
1364: destination word is hard, if not impossible, to describe
1365: in the probabilistic framework. However, for the combinatoric
1366: and computational notion of Kolmogorov complexity it is natural to describe
1367: this notion using `randomness deficiency' as in Definition~\ref{def.rd} below.
1368: 
1369: Reference \cite{VV02} uses `fitness' 
1370: with respect to the particular distortion family
1371: ${\cal L}$. We briefly overview the generalization to arbitrary
1372: distortion families satisfying properties 2 and 3 (details,
1373: formal statements and proofs about ${\cal L}$ can be found in the 
1374: cited reference).
1375: %Every set $A \in {\A}$ containing a string $x$ 
1376: %is considered to be a
1377: %model for $x$. 
1378: The goodness of fit of a destination word $y$ for
1379: a source word $x$ with respect to an arbitrary distortion family ${\A}$
1380: is defined by the randomness deficiency of $x$ in the
1381: distortion ball $B(y, \delta)$ with $\delta=d(x,y)$. 
1382: The lower the randomness deficiency, the better is the fit.
1383: \begin{definition}\label{def.rd}
1384: \rm
1385: The {\em randomness deficiency} of $x$ in a set $A$ with $x \in A$
1386: is defined as $\delta (x \mid A) = \log |A| - \K(x\mid A)$.
1387: If $\delta (x \mid A)$ is small then $x$ is a {\em typical} element of $A$.
1388: Here `small' is taken as $O(1)$ or $O(\log n)$ where $n=|x|$, 
1389: depending on the context of the future statements.
1390: %Here we have not stated what the constant in $O(1)$ is.
1391: %One must agree in advance on a constant $c$ and then call an element
1392: %typical when the deficiency is smaller that $c$, and all later references
1393: %of typicality depend on this $c$.
1394: \end{definition}
1395: 
1396: The randomness deficiency can be a little smaller
1397: than 0, but by not more than
1398: a constant.
1399: \begin{definition}
1400: \rm
1401: Let $\beta$ be an integer parameter and $P \subseteq A$.
1402: We say $P$ is a {\em property} in 
1403: $A$ if $P$ is a `majority' subset of
1404: $A$, that is,  $|P| \geq (1-2^{-\beta})|A|$. We say that 
1405: $x \in A$ \emph{satisfies} property $P$ if 
1406: $x \in P$.
1407: \end{definition}
1408: 
1409: If the randomness deficiency $\delta(x \mid A)$ is not much greater than 0,
1410: %then $x$ satisfies every property
1411: %that holds for a majority of elements in $A$.
1412: then there are no simple special properties that
1413: single $x$ out from the majority of strings to be drawn from $A$.
1414: This is not just terminology: 
1415: If $\delta (x  |  A)$ is small enough,
1416: then $x$ satisfies {\em all} properties of low Kolmogorov complexity
1417: in $A$ (Lemma~\ref{lemma.property} in Appendix~\ref{sect.rd}).
1418: If $A$ is a set containing $x$ such that $\delta(x \mid A)$ is 
1419: small 
1420: then we say that $A$ is
1421: a set of good fit for $x$.
1422: %This leads to the notion of a model for $x$.
1423: %\begin{definition}\label{def.model}
1424: %\rm
1425: %Let $x$ be a string and ${\A}$ be a distortion family.
1426: %A set $A \in {\A}$ with $x \in A$ is a {\em model} for $x$.
1427: %The set $A$ is a {\em best} model for $x$ if the randomness
1428: %deficiency $\delta (A|x)$ is minimal.
1429: %See also Appendix~\ref{sect.rd},
1430: %\cite{VV02} or the text \cite{LiVi97}.
1431: %\end{definition}
1432: In \cite{VV02} the notion of 
1433: models for $x$ is considered: Every finite set of strings
1434: containing $x$ is a {\em model} for $x$. 
1435: Let $x$ be a string of length $n$ and choose an integer $i$ 
1436: between 0 and $n$. Consider models for $x$ of 
1437: Kolmogorov
1438: complexity at most $i$.
1439: Theorem~IV.8 and Remark IV.10 in \cite{VV02}
1440: show
1441: for the distortion family ${\cal L}$
1442: %That theorem and the accompanying examples show 
1443: that $x$ has minimal
1444: randomness deficiency in every set that witnesses $h_x(i)$ 
1445: (for ${\cal L}$ we have $h_x(i)=d_x(i)$),
1446: ignoring additive $O(\log n)$ terms. That is, up to the stated precision
1447: every such witness set is the best-fitting model that is
1448: possible at model Kolmogorov complexity at most $i$. 
1449:  It is
1450: remarkable, and in fact unexpected to the authors,
1451: that the analogous result
1452: holds for arbitrary distortion families provided 
1453: they satisfy properties 2 and 3.
1454: 
1455: 
1456: \begin{theorem}\label{th45}
1457: Let ${\A}$ be a distortion family 
1458: satisfying properties~$2$ and~$3$ 
1459: and $x$ a string of length $n$. 
1460: Let $B$ be a set in $\A$ with 
1461: $x \in B$.
1462: Let $A_x$ be a set
1463: of minimal Kolmogorov complexity
1464: among the sets $A\in{\A}$ with $x\in A$ and 
1465: $\lceil \log |A| \rceil= \lceil \log |B| \rceil$.
1466: Then,
1467: \[
1468: \K(A_x)+\log |A_x|-\K(x)\leq
1469: \delta(x \mid B)
1470: +O(\log \K(B)+ \log n).
1471: \]
1472: \end{theorem}
1473: \begin{lemma}\label{lemma.deltaab}
1474: For every set $A$ with  $x \in A$,
1475: \begin{equation}\label{eq.deltaab}
1476: \K(A)+\log |A|-\K(x) \ge\delta (x \mid A),
1477: \end{equation}
1478: up to an $O(\log n)$ additive term.
1479: \end{lemma}
1480: \begin{proof}
1481: The inequality \eqref{eq.deltaab}
1482: means that
1483: $$\K(A)+\log |A|-\K(x) \ge \log |A|-\K(x\mid A)+O(\log n),$$
1484: that is,
1485: $\K(x)\le \K(A)+\K(x\mid A)+O(\log n)$.
1486: The latter inequality follows 
1487: from the general inequality 
1488: $\K(x)\le \K(x,y) \leq \K(y)+\K(x\mid y)+O(\log\K(x\mid y))$,  
1489: where $\K(x\mid y)\le\K(x)+O(1)\le n+O(1)$.
1490: \end{proof}
1491: 
1492: A set $A$ with $x \in A$ is an algorithmic {\em sufficient statistic} 
1493: for $x$ if
1494: $\K(A)+\log |A|$ is close to $\K(x)$.
1495: Lemma~\ref{lemma.deltaab} shows that every sufficient statistic for $x$ is
1496: a model of a good fit for $x$.
1497: 
1498: \begin{example}\label{th44}
1499: \rm
1500: Consider the elements of every $A\in {\A}$ uniformly distributed.
1501: Assume that we are given a string $x$ that was 
1502: obtained by a random sampling
1503: from an unknown set $B\in {\A}$ 
1504: satisfying $\K(B)\le n=|x|$.
1505: Given $x$
1506: we want to recover $B$, or some $A\in {\A}$ that
1507: is ``a good hypothesis to be the source of $x$'' in the sense
1508: that the randomness deficiency $\delta (x \mid A)$ is small. 
1509: Consider the set $A_x$ from  Theorem~\ref{th45} as such
1510: a hypothesis. We claim that 
1511: with high probability $\delta(x \mid A_x)$ is of order $O(\log n)$.
1512: More specifically, for every $\beta$ the probability of the event
1513: $\delta(x \mid A_x)>\beta$
1514: is less than 
1515: $2^{-\beta+O(\log n)}$, 
1516: which is negligible for $\beta=O(\log n)$. 
1517: Indeed, 
1518: if $x$ is chosen uniformly  at random in $B$, then
1519: with high probability 
1520: (Appendix~\ref{sect.rd})
1521: the randomness deficiency $\delta (x \mid B)$ is small.
1522: That is, with probability more than $1-2^{-\beta}$ 
1523: we have $\delta(x \mid B)\le\beta$.
1524: By Theorem~\ref{th45} and \eqref{eq.deltaab}
1525: we also have $\delta(x \mid A_x)\le\delta(x \mid B)+O(\log n)$.
1526: %It is easy to show \cite{VV02,LiVi97} that 
1527: %If $A$ is a sufficient
1528: %statistic for $x$, then by~\eqref{eq.deltaab}
1529: %$x$ is a typical element of $A$ in the sense that it has small
1530: %randomness deficiency $\delta(x|A)$.
1531: %By Theorem~\ref{th45}, 
1532: Therefore the probability of the event
1533: $\delta(x \mid A_x)>\beta$
1534: is less than 
1535: $2^{-\beta+O(\log n)}$.
1536: %By the properties of randomness deficiency,
1537: %the probability that the right-hand side of the inequality
1538: %in Theorem~\ref{th45}
1539: %exceeds $\beta$ is at most $\epsilon$. 
1540: %Thus, with high probability
1541: %the set $A_x$ is a sufficient statistic
1542: %for $x$.
1543: \end{example}
1544: 
1545: 
1546: \begin{example}
1547: \rm
1548: Theorem~\ref{th45} says that for fixed
1549: log-cardinality $l$ the model that has minimal Kolmogorov complexity has
1550: also minimal randomness
1551: deficiency among models of that log-cardinality.
1552: Since $g_x$ satisfies  Lemma~\ref{lem.shapesg}, we have also that for every
1553: $k$ the model of Kolmogorov complexity at most
1554: $k$ that minimizes the log-cardinality also minimizes randomness
1555: deficiency among models of that Kolmogorov complexity. 
1556: These models can be computed in the limit: in the second case (fixed $k$)
1557: by running all programs up to $k$ bits and always keeping the one
1558: that outputs the smallest set in ${\A}$ containing $x$, and in the first case (fixed $l$)
1559: by running all programs up to $n=|x|$ bits and always keeping the
1560: shortest one that outputs a set in ${\A}$ containing $x$
1561: having log-cardinality at most $l$.
1562: \end{example}
1563: 
1564: 
1565: \section{Denoising}
1566: 
1567: %Assume the setting of Theorem~\ref{th45} and Example~\ref{th44}, 
1568: %Since 
1569: %$\delta(x|A_x)\leq
1570: %\K(A_x)+\log |A_x|-\K(x)\leq  \delta(x|B)+O(\log \K(B)+\log n)$,
1571: In Theorem~\ref{th45} using \eqref{eq.deltaab} we obtain
1572: \begin{equation}\label{eq.dAB}
1573: \delta(x \mid A_x)\le \delta(x \mid B)+O(\log \K(B)+\log n).
1574: \end{equation}
1575: %and $A_x$ is a best model (Definition~\ref{def.model}) for $x$
1576: %at either complexity $k$, or of log-cardinality $l$, and hence both.
1577: This gives a method
1578: to identify good-fitting models for $x$ using compression, as follows. 
1579: Let $k= \K(A_x)$ and $l= \lceil \log |B| \rceil$.
1580: If $A_x$ is a
1581: set of minimal Kolmogorov complexity
1582: among sets  $A \in {\A}$ with $x\in A$ and $ \lceil \log |A| \rceil=l$,
1583: then by \eqref{eq.dAB}
1584: the hypothesis ``$x$ is chosen at random
1585: in $A_x$'' is (almost) at least as plausible as
1586: the hypothesis ``$x$ is chosen at random
1587: in $B$'' for every simply described
1588: $B\in {\A}$ 
1589: (say, $\log \K(B)=O(\log n)$) 
1590: with  $ \lceil \log |B| \rceil=l$.
1591: 
1592: Let us look at an example
1593: of denoising by compression
1594: (in the ideal sense of Kolmogorov complexity) for Hamming distortion.
1595: Fix a target string $y$ of length $n$ and a 
1596: distortion $0 \leq \delta \leq \frac{1}{2}$.
1597: (This string $y$ functions as the destination word.)
1598: Let a string $x$ be a noisy version of
1599: $y$ obtained by changing at most $n\delta$ randomly chosen bits in $y$
1600: (string $x$ functions as the source word).
1601: That is,
1602: the string $x$ is chosen uniformly at random in the Hamming ball
1603: $B=B(y,\delta)$.
1604: Let $\hat{x}$ be 
1605: a string witnessing 
1606: $r_x(\delta)$, that is, $\hat{x}$ is a string
1607: of minimal Kolmogorov complexity  with $d(x,\hat{x}) \leq \delta$
1608: and $r_x(\delta)=\K(\hat{x})$.
1609: %in the Hamming ball $B(x,\delta)$ 
1610: We claim that at distortion $\delta$ the string
1611:  $\hat{x}$ is a good candidate for
1612: a denoised version of $x$, that is, the target string $y$.
1613: This means that
1614: in the two-part description
1615: $(\hat{x},\hat{x} \oplus x)$
1616: of $x$, the second part (the bitwise XOR of $x$ and $\hat{x}$)
1617: is noise: 
1618: $\hat{x} \oplus x$ is a random string 
1619: in the Hamming ball $B(00\dots0,\delta)$ in the sense 
1620: that $\delta(\hat{x} \oplus x \mid B(00\dots0,\delta))$ is negligible.
1621: Moreover, even the conditional Kolmogorov complexity
1622: $\K(\hat{x} \oplus x \mid \hat x)$ is close to $\log b(\delta)$.
1623: 
1624: Indeed, 
1625: let $l=\lceil\log|B|\rceil$.
1626: By Definition~\ref{def.gx} of $g_x$, 
1627: %and the fact that 
1628: %$\log |B| = l + O(\log n)$ by \eqref{binom-entropy},
1629: Theorem~\ref{th45} implies that
1630: $$
1631: g_x(l)+l-\K(x)\le \delta(x \mid B),
1632: $$
1633: ignoring additive terms of $O(\log n)$
1634: and observing that the additive 
1635: term $\log \K(B)$ is absorbed by $O(\log n)$.
1636: %Since the Hamming distortion family
1637: %satisfies all properties~$1$
1638: %through~$4$,
1639: %the canonical structure functions $g_x$
1640: %satisfy Theorem~\ref{theo.allshapesrd}. 
1641: For every $x$,
1642: the rate-distortion function $r_x$ of $x$ differs from
1643: $g_x$ just by changing the scale of the argument as in \eqref{eq.sfrd}.
1644: More specifically,
1645: %for every $0 \leq \delta \leq \frac{1}{2}$, 
1646: we have
1647: $r_x(\delta) = g_x(l)$ and hence
1648: \[
1649: r_x(\delta)+l-\K(x)\leq \delta(x \mid B).
1650: \]
1651: Since we assume that $x$ is chosen uniformly
1652: at random in $B$, the randomness deficiency 
1653: $\delta(x \mid B)$ is small, say $O(\log n)$ with high probability.
1654: Since 
1655: $r_x(\delta)=\K(\hat{x})=\K(B(\hat{x},\delta))+O(\K(\delta))$,
1656: $\K(\delta)=O(\log n)$, and $l=\lceil\log b(\delta)\rceil$,
1657: it follows that with high probability, with the equalities holding up to an
1658: additive $O(\log n)$ term,
1659: $$
1660: 0 =  \K(\hat{x})+l- \K(x)= \K(B(\hat{x},\delta))+
1661: \log b(\delta)-\K(x).
1662: $$
1663: Since by construction $x \in B(\hat{x},\delta)$, 
1664: the displayed equation shows that 
1665: the ball $B(\hat{x},\delta)$ is a sufficient statistic for $x$.
1666: This implies that $x$ is a typical element of $B(\hat{x},\delta)$,
1667: that is, $\K(x\oplus\hat x \mid \hat{x})=\K(x \mid \hat{x})=
1668: \K(x \mid B(\hat{x},\delta),p)$ 
1669: is close to $\log b(\delta)$.
1670: Here $p$ is an appropriate
1671: program of $O(\K(\delta))=O(\log n)$ bits.
1672: %This means that $x$ has distortion $\delta$ ($\delta n$ bits flipped)
1673: %with respect to $\hat{x}$.
1674: 
1675: This provides a method of denoising via compression, 
1676: at least in theory.
1677: In order to use the method practically, admittedly with a leap of faith,
1678: we ignore the ubiquitous $O(\log n)$ additive terms,
1679: and use real compressors to
1680: approximate the Kolmogorov complexity, similar to what was done in  
1681: \cite{Li01,Li04}.
1682: The Kolmogorov complexity is not computable and can be approximated
1683: by a computable process from above but not from below, while a real 
1684: compressor is computable. Therefore, the approximation of the Kolmogorov 
1685: complexity by a real compressor involves for some arguments errors that can 
1686: be high and are in principle unknowable. Despite all these caveats it turns
1687: out that the practical analogue of the theoretical method works surprisingly
1688: well in all experiments we tried \cite{RV06}. 
1689: 
1690: \begin{figure}
1691: \begin{center}
1692: \epsfxsize=3.5in
1693: \leftline{\hskip8pc\epsfbox{cross2_euclidean_edited.eps}}
1694: \end{center}
1695: \caption{Denoising of the noisy cross}
1696: \label{fig:cross}
1697: \end{figure}
1698: 
1699: As an example, we approximated the distortion-rate
1700: function of a noiseless cross of very low
1701: Kolmogorov complexity, to which artificial noise was added to obtain
1702: a noisy cross,  \cite{RV06}. 
1703: Figure~\ref{fig:cross} shows two graphs. The first graph, hitting
1704: the horizontal axis at about 3100 bits, denotes the Hamming distortion
1705: on the vertical axis of the best
1706: model for
1707: the noisy cross with respect to the original noisy cross
1708:  at the rate given on the horizontal axis.
1709: The line hits zero distortion at model cost
1710: bit rate about 3100,
1711: when the original noisy cross is retrieved. The best model of the noisy cross
1712: at this rate, actually the original noisy cross, 
1713: is attached to this point. The second graph, hitting the horizontal axis at
1714: about 250 bits, denotes on the vertical axis the Hamming distortion 
1715: of the best
1716: model for the noisy cross with respect to the noiseless cross
1717: at the rate given on the horizontal axis.
1718: The line hits almost zero distortion (Hamming distance 3)
1719: at model cost bit rate about 250.  The best model of the noisy cross
1720: at this rate is attached to this point. (The three wrong bits
1721: are at the bottom left corner and upper right armpit.)
1722:  This coincides with a sharp slowing
1723: of the rate of decrease of the first graph. Subsequently, the second graph
1724: rises again because the best model for the noisy cross starts to model
1725: more noise. Thus, the second graph shows us the denoising of the noisy
1726: cross, underfitting left of the point of contact with the horizontal axis,
1727: and overfitting right of that point. This point of best denoising can 
1728: also be deduced
1729: from the first graph, where it is the point where 
1730: the distortion-rate curve sharply
1731: levels off. 
1732: Since this point
1733: has distortion of only $3$ to the
1734: noiseless cross, the distortion-rate
1735: function separates structure and noise very well in this
1736: example. 
1737: 
1738: In the experiments in \cite{RV06} a specially written
1739: block sorting compression algorithm with a
1740: move-to-front scheme as described in \cite{BW94} was used. 
1741: The algorithm is very
1742: similar to a number of common general purpose compressors, such as bzip2
1743: and zzip, but it is simpler and faster for small inputs; the source
1744: code (in C) is available from the authors of \cite{RV06}.
1745: 
1746: 
1747: \section{Algorithmic versus Probabilistic Rate-Distortion}\label{sect.algprobrd}
1748: 
1749: %For every distortion family ${\A}$ satisfying property 2,
1750: %and ${\A}_n = {\A} \bigcap \{0,1\}^n$,
1751: Theorem~\ref{thm.dresf} shows that
1752: Shannon's rate-distortion function
1753: $r^n(\delta)$ of \eqref{eq.rndelta}
1754: for a random variable is pointwise related
1755: to the expected value
1756: of the rate-distortion functions $r_x(\delta)$ of the individual
1757: string $x \in {\A}_n$ 
1758: (outcomes of the random variable with the expectation taken
1759: over the probabilities of the random variable). 
1760: This result generalizes \cite{YS93,MK94,SE03}
1761: to arbitrary computable sources.
1762: 
1763: Formally, probabilistic rate-distortion theory is treated in
1764: Appendix~\ref{sect.ratedistortion}. 
1765: Let ${\mathbf X}$ and ${\mathbf Y}$ be finite alphabets where we
1766: take ${\mathbf X}=\{0,1\}$ for convenience.
1767: We generalize the setting from i.i.d.
1768: random variables to more general random variables.
1769: Let $X_1, X_2, \ldots , X_n$ be a sequence
1770: of, possibly dependent, random variables with values in ${\mathbf X}^n$
1771: such that 
1772: $p(x_1x_2\ldots x_n) = P(X_1=x_1, X_2=x_2, \ldots , X_n=x_n)$ 
1773: is rational.  With $X=X_1, X_2, \ldots , X_n$ and
1774: $x=x_1x_2 \ldots x_n$,
1775: let $\K(X)$ denote
1776: the Kolmogorov complexity of the set of pairs 
1777: $(x,p(x))$ ordered lexicographically.
1778: Let 
1779: $E: {\mathbf X}^n \rightarrow {\mathbf Y}^n$ be a code.
1780: Define the Shannon rate-distortion function by 
1781: \begin{equation}\label{eq.rndelta}
1782: r^n(\delta) = \min_E
1783: \{ \log |E({\mathbf X}^n)| :
1784: {\bf E} d(x,E(x)) \leq \delta \},
1785: \end{equation}
1786: the expectation ${\bf E}$
1787: taken over the probability mass function $p$.
1788: %Roughly speaking, we prove that
1789: %$r^n(\delta)$ is close to the $p$-expected value
1790: %of $r_x(\delta)$ for $x \in {\A}_n$ and distortion $\delta$.
1791: 
1792: \begin{theorem}\label{thm.dresf}
1793: Let 
1794: %the distortion family ${\A}$ satisfy property 2, and
1795: %${\A}_n= {\A} \bigcap \{0,1\}^n$. For every $n$ and string $x$
1796: %of length $n$,
1797: %let 
1798: $E_0$ be a many-to-one coding function 
1799: %achieving the minimum in
1800: %the righthand side of \eqref{eq.rndelta} 
1801: defined by $E_0(x)=y$ with
1802: $d(x,y) \leq \delta$ and
1803: $r_x(\delta) = \K(y)$. 
1804: Let $|x|=n$. Then,
1805: \[
1806:  {\bf E} r_x(\delta) - \Delta_1
1807: \leq  r^n (\delta)
1808: \leq \min \left\{{\bf E} r_x(\delta)+ \Delta_2 ,  
1809:  \max_{x \in {\cal X}^n} r_x(\delta) \right\},
1810: \]
1811: with $\Delta_1 = O(\K(\delta,r^n,X,n))$,
1812: $\Delta_2 = H(L)-H(S)$ with $S(y)= \sum \{p(x): E_0(x)=y\}$,
1813: $L(y)$ is the uniform distribution over the $y$'s over $\mathbf{Y}^n$, and
1814: the expectation ${\bf E}$ is taken over $p$.
1815: \end{theorem}
1816: Note that we have taken ${\cal X}= {\X}= \mathbf{X}^n$
1817: and ${\Y}=\mathbf{Y}^n$.
1818: The $\Delta_1$ quantity satisfies $\lim_{n \rightarrow \infty} \Delta_1 /n =0$.
1819: The quantity $\Delta_2$ is small only in the case where we
1820: have asymptotic equidistribution. This is the original setting of Shannon.
1821: Independence is not needed for this: for example, ergodic stationarity guarantees
1822: asymptotic equidistribution.
1823: 
1824: 
1825: 
1826: 
1827: 
1828: \appendix
1829: 
1830: \subsection{Shannon Rate Distortion}\label{sect.ratedistortion}
1831: Classical rate-distortion theory
1832: was initiated by Shannon in \cite{Sh48,Sh59}, and %in his celebrated 1948 paper.
1833: we briefly recall his approach.
1834: Let 
1835: ${\mathbf X}$ and ${\mathbf Y}$ be finite alphabets.
1836: A single-letter distortion measure is
1837: a function $d$ that maps elements of
1838: $\mathbf X \times \mathbf Y$ to the reals. Define the distortion between
1839: words $x$ and $y$ of the same length $n$ over alphabets
1840: ${\mathbf X}$ and ${\mathbf Y}$, respectively, by
1841: \[
1842: d^n(x,y)= \frac{1}{n}\sum_{i=1}^n d(x_i,y_i).
1843: \]
1844: Let $X$ be a random variable with values in
1845: ${\mathbf X}$. Consider the random variable $X^n$ with values in ${\mathbf X}^n$,
1846: that is, the sequence $X_1,\dots,X_n$ of $n$ independent
1847: copies of $X$.
1848: We want to encode words of length $n$ over ${\mathbf X}$ by words over ${\mathbf Y}$
1849: so that the number
1850: of all code words is small and
1851: the expected distortion between outcomes of $X^n$ and their
1852: codes is small.
1853: The tradeoff between the expected
1854: distortion
1855: and the number of code words used is expressed
1856: by the {\em rate-distortion} function
1857: denoted by $r^n(\delta )$ as in \eqref{eq.rndelta}. It
1858: maps every $\delta \in {\cal R}$
1859: to the minimal natural number
1860: $r$ (we call $r$ the \emph{rate})
1861: having the following property:
1862: There is an encoding function
1863: $E:{\mathbf X}^n \rightarrow {\mathbf Y}^n$ with a range of cardinality at most $2^r$
1864: such that
1865: the expected distortion between the outcomes of $X^n$
1866: and their corresponding codes is at most~$\delta$.
1867: 
1868: In \cite{Sh59} Shannon gave the following nonconstructive
1869: asymptotic characterization of $r^n(\delta)$.
1870: Let $Z$ be a random variable with values in ${\mathbf Y}$.
1871: Let $H(Z)$, $H(Z \mid X)$ stand for the  Shannon entropy and conditional Shannon entropy,
1872: respectively. Let $I(X;Z)=H(Z)-H(Z \mid X)$ denote the mutual information
1873: in $X$ and $Z$, and  ${\bf E} d(X,Z)$ stand
1874: for the expected value of  $d(x,z)$ with respect to
1875: the joint probability $P(X=x, Z=z)$ of the random variables $X$ and $Z$. 
1876: For a real $\delta$, let $R(\delta)$ denote
1877: the minimal $I(X;Z)$ subject to ${\bf E} d(X,Z)\leq \delta$.
1878: That such a minimum is attained for all $\delta$ can be shown
1879: by compactness arguments.
1880: 
1881: \begin{theorem}\label{theo.shannon}
1882: For every $n$ and $\delta$ we have
1883: $r^n(\delta)\geq nR(\delta)$. Conversely,
1884: for every $\delta$ and every positive $\epsilon$,
1885: we have
1886: $r^n(\delta+\epsilon)\leq n(R(\delta)+\epsilon)$
1887: for all large enough $n$.
1888: \end{theorem}
1889: 
1890: \subsection{Computability}\label{sect.computability}
1891: 
1892: In 1936 A.M. Turing \cite{Tu36} defined the hypothetical `Turing machine' 
1893: whose computations are 
1894: intended to give an operational and formal definition 
1895: of the intuitive notion of computability in the discrete domain.
1896: These Turing machines compute integer functions, 
1897: the {\em computable} functions. By using pairs of integers for the 
1898: arguments and values we can extend computable functions
1899: to functions with rational arguments and/or values.
1900: The notion of computability can be further
1901: extended, see for example \cite{LiVi97}:
1902: A 
1903: function $f$ with rational arguments and real values is
1904: {\em upper semicomputable}
1905: if there is a computable
1906: function  $\phi (x,k)$ with 
1907: $x$ a rational number and $k$ a nonnegative integer
1908: such that $\phi(x,k+1) \leq \phi(x,k)$ for every $k$ and
1909:   $\lim_{k \rightarrow \infty} \phi (x,k)=f(x)$.
1910: This means
1911:   that $f$ can be computably approximated from above.
1912: A function $f$ is
1913: {\em lower semicomputable}
1914:   if $-f$ is upper semicomputable.
1915:   A function is called
1916: {\em semicomputable}
1917:   if it is either upper semicomputable or lower semicomputable or both.
1918: If a function $f$ is both upper semicomputable and
1919: lower semicomputable,
1920: then $f$ is 
1921: computable.
1922: A countable set $S$ is {\em computably (or recursively) enumerable}
1923: if there is a Turing machine $T$ that outputs all and only the elements of $S$
1924: in some order and does not halt. A countable set $S$ is 
1925: {\em decidable (or recursive)}
1926: if there is a Turing machine $T$ that decides for every candidate $a$
1927: whether $a \in S$ and halts. 
1928: 
1929: \begin{example}\rm
1930: An example of a computable function is $f(n)$ defined as
1931: the $n$th prime number;
1932: an example of a function that is upper semicomputable
1933: but not computable is the Kolmogorov complexity function $\K$ in
1934: Appendix~\ref{sect.kolmcompl}. An example of a recursive set is the set
1935: of prime numbers; an example of a recursively enumerable
1936: set that is not recursive is $\{x \in {\cal N}: \K(x) < |x| \}$.
1937: \end{example}
1938: 
1939: Let ${\cal X}=\{0,1\}^*$, and ${\Y}$ and the distortion measure $d$ 
1940: be given.
1941: Assume that ${\Y}$ is recursively (= computably) enumerable
1942: and the set 
1943: $\{\pair{x,y,\delta}\in\mathcal X\times\Y\times\Q: d(x,y)\le \delta\}$
1944: is decidable.  
1945: Then $r_x$ is upper semicomputable. Namely, to determine $r_x(\delta)$
1946: %with $|x|=n$ 
1947: proceed as follows.
1948: %We know $r_x(\delta) \leq n+O(\log n)$.%
1949: Recall that $U$ is the reference universal Turing machine.
1950: Run $U(p)$ for all $p$ 
1951: %$|p| \leq n +O(\log n)$%
1952: in dovetailed fashion (in stage $k$ of the overall computation
1953: execute the $i$th computation step of the $(k-i)$th program).
1954: Interleave this computation
1955: with a process that recursively enumerates  ${\Y}$. 
1956: Put all enumerated elements of ${\Y}$ in a set ${\cal W}$. 
1957: %Initially, the best candidate program
1958: %$q$ has length $|q|= n+O(\log n)$.
1959: Whenever $U(p)$ halts we put the output in a set ${\cal U}$.
1960: After every step in the overall computation we determine the 
1961: minimum length of a program $p$ such that $U(p) \in {\cal W} \bigcap {\cal U}$ 
1962: and $d(x,U(p))\le\delta$. 
1963: We call $p$ a \emph{candidate} program.
1964: The minimal length of all candidate programs can only decrease
1965: in time and eventually becomes equal to $r_x(\delta)$. Thus,
1966: this process 
1967: upper semicomputes $r_x (\delta)$. 
1968: 
1969: The function $g_x$ is also upper semicomputable. The proof is similar
1970: to that used to prove the upper semicomputability of $r_x$.
1971: It follows from \cite{VV02} that in general $d_x$,
1972: and hence its `inverse' $r_x$ and by Lemma~\ref{lem.rg}
1973: the function $g_x$, are not computable. 
1974: 
1975: Assume that the set $\Y$ is recursively enumerable and
1976: the set
1977: $\{\pair{x,y,\delta}\in\mathcal X\times\Y\times\Q: d(x,y)\le \delta\}$
1978: is decidable. Assume that the resulting distortion family $\mathcal
1979: A^{d,\Y}$
1980: satisfies Property 2.
1981: There is a relation between
1982: destination words and distortion balls. This relation is as follows.
1983: 
1984: (i) Communicating a destination word $y$ for a source word $x$
1985:  knowing a rational upper bound
1986: $\delta$ for  the distortion $d(x,y)$
1987: involved is the same as communicating a
1988: distortion ball of radius $\delta$ containing $x$.
1989: 
1990: (ii) Given (a list of the elements of) a distortion ball $B$
1991: we can upper semicompute
1992: the least distortion $\delta$ such that $B=B(y,\delta)$ for some $y\in\Y$.
1993: 
1994: Ad (i). This implies that the function $\tilde r_x(\delta)$ defined
1995: in \eqref{eq.tilde} differs from $r_x(\delta)$
1996: by $O(\K(\delta)+\log |x|)$.
1997: See the proof of Lemma~\ref{lem.rg}.
1998: 
1999: Ad (ii). Let
2000: $B$ be a given ball. Recursively enumerating ${\Y}$ and 
2001: the possible $\beta\in \Q$,
2002: we find
2003: for every newly enumerated element $y \in {\Y}$
2004: whether $B(y, \beta)=B$ (see the proof of  Lemma~\ref{lem.rg}
2005: for an algorithm to find a list of elements of $B(y, \beta)$
2006: given $y,\beta$). Put these $\beta$'s
2007: in a set ${\cal W}$.
2008: Consider the least element of ${\cal W}$ at every computation step.
2009: This process upper semicomputes the
2010: least distortion  $\delta$ corresponding to
2011: the distortion ball $B$.
2012: 
2013: \subsection{Kolmogorov Complexity}\label{sect.kolmcompl}
2014: 
2015: For precise definitions, notation, and results see the text \cite{LiVi97}.
2016: Informally, the Kolmogorov complexity, or algorithmic entropy, $\K(x)$ of a
2017: string $x$ is the length (number of bits) of a shortest binary
2018: program (string) to compute
2019: $x$ on a fixed reference universal computer
2020: (such as a particular universal Turing machine).
2021: Intuitively, $\K(x)$ represents the minimal amount of information
2022: required to generate $x$ by any effective process.
2023: The conditional Kolmogorov complexity $\K(x \mid  y)$ of $x$ relative to
2024: $y$ is defined similarly as the length of a shortest binary program
2025: to compute $x$, if $y$ is furnished as an auxiliary input to the
2026: computation.
2027: %For technical reasons we use a variant of complexity,
2028: %so-called prefix complexity, which is associated with Turing machines
2029: %for which the set of programs resulting in a halting computation
2030: %is prefix free.
2031: %We realize prefix complexity by considering a special type of Turing
2032: %machine with a one-way input tape, a separate work tape,
2033: %and a one-way output tape. Such Turing
2034: %machines are called {\em prefix} Turing machines. If a machine $T$ halts
2035: %with output $x$
2036: %after having scanned all of $p$ on the input tape,
2037: %but not further, then $T(p)=x$ and
2038: %we call $p$ a {\em program} for $T$.
2039: %It is easy to see that
2040: %$\{p : T(p)=x, x \in \{0,1\}^*\}$ is a {\em prefix code}.
2041: 
2042: Let $T_1 ,T_2 , \ldots$ be a standard enumeration
2043: of all (and only) Turing machines with a binary input tape,
2044: for example the lexicographic length-increasing ordered syntactic
2045: Turing machine descriptions, \cite{LiVi97},
2046: and let $\phi_1 , \phi_2 , \ldots$
2047: be the enumeration of corresponding functions
2048: that are computed by the respective Turing machines
2049: ($T_i$ computes $\phi_i$).
2050: These functions are  the
2051: {\em computable (or recursive)}
2052: functions. % (of effectively prefix-free encoded
2053: %arguments). 
2054: %The {\em Kolmogorov complexity}
2055: %of $x$ is the length of the shortest binary program
2056: %from which $x$ is computed.
2057: For the development of the theory we
2058: actually require
2059: the Turing machines to use {\em auxiliary} (also
2060: called {\em conditional})
2061: information, by equipping the machines with a special
2062: read-only auxiliary tape containing this information at the outset.
2063: Let $\langle \cdot , \cdot \rangle$ be a computable one to one 
2064: {\em pairing function}
2065: on the natural numbers (equivalently, strings)
2066: mapping $\{0,1\}^* \times \{0,1\}^* \rightarrow \{0,1\}^*$ with
2067: $|\langle u,v \rangle| \leq |u|+|v| +O(\log (|u|))$. (We need the extra
2068: $O(\log (|u|))$ bits to separate $u$ from $v$. 
2069: For Kolmogorov complexity, it is essential that there 
2070: exists a pairing function such that  
2071: the length of $\langle u,v \rangle$ is equal to the sum of 
2072: the lengths of $u,v$ plus a small value depending only on $|u|$.)
2073: We denote the function computed by a Turing machine $T_i$ with $p$ as input
2074: and $y$ as conditional information by
2075: $\phi_i(p,y)$.
2076: 
2077: One of the main achievements of the theory of computation
2078: is that the enumeration $T_1,T_2, \ldots$ contains
2079: a machine, say $T_u$, that is computationally universal in that it can
2080: simulate the computation of every machine in the enumeration when
2081: provided with its index. It does so by computing a 
2082: function $\phi_u$ such that 
2083:    $\phi_u(\langle i, p\rangle,y)  = \phi_i (p,y)$
2084:     for all $i,p,y$.
2085:     We fix one such machine and designate it as the {\em reference universal
2086:     Turing machine} or {\em reference Turing machine} for short.
2087: 
2088: \begin{definition}\label{def.KolmK}
2089:     The {\em conditional Kolmogorov complexity} of $x$ given $y$ (as
2090: auxiliary information) {\em with respect to Turing machine} $T_i$ is
2091:                   \begin{equation}\label{eq.KC}
2092:     \K_i(x \mid y) = \min_p \{|p|: \phi_i(p,y)=x \}.
2093:                   \end{equation}
2094: The {\em conditional Kolmogorov complexity} $\K(x \mid y)$ is defined
2095: as the conditional Kolmogorov complexity 
2096: $\K_u (x \mid y)$ with respect to the reference  Turing machine $T_u$ 
2097: usually denoted by $U$.
2098: The {\em unconditional} version is set to  $\K(x)=\K(x  \mid \epsilon)$.
2099: \end{definition}  
2100: 
2101: Kolmogorov complexity $\K(x\mid y)$ has 
2102: the following crucial property: 
2103: $\K(x\mid y)\le \K_i(x \mid y)+c_i$ for 
2104: all $i,x,y$, where $c_i$ depends only on
2105: $i$ (asymptotically, the reference Turing machine is not worse
2106: than any other machine).
2107: Intuitively, $\K(x\mid y)$ represents the minimal amount of information
2108: required to generate $x$ by any effective process from input $y$.
2109: %We denote the {\em shortest program} for $x\mid \epsilon$ by $x^*$; then
2110: %$\K(x)= |x^*|$.
2111: %(Actually, $x^*$ is the first shortest program for $x$ in
2112: %an appropriate standard enumeration of all programs for $x$
2113: %such as the halting order.)
2114: The functions $\K( \cdot)$ and $\K( \cdot \mid  \cdot)$,
2115: though defined in terms of a
2116: particular machine model, are machine-independent up to an additive
2117: constant
2118:  and acquire an asymptotically universal and absolute character
2119: through Church's thesis, see for example \cite{LiVi97}, 
2120: and from the ability of universal machines to
2121: simulate one another and execute any effective process.
2122:   The Kolmogorov complexity of an individual finite object was introduced by
2123: Kolmogorov \cite{Ko65} as an absolute
2124: and objective quantification of the amount of information in it.
2125: The information theory of Shannon \cite{Sh48}, on the other hand,
2126: deals with {\em average} information {\em to communicate}
2127: objects produced by a {\em random source}.
2128:  Since the former theory is much more precise, it is surprising that
2129: analogs of theorems in information theory hold for
2130: Kolmogorov complexity, be it in somewhat weaker form.
2131: For example, let $X$ and $Y$ be random variables
2132: with a joint distribution. Then,
2133: $H(X,Y)\le H(X)+H(Y)$,
2134: where $H(X)$ is the entropy of the marginal
2135: distribution of $X$. 
2136: Similarly, let $\K(x,y)$ denote $\K(\langle x,y \rangle)$ 
2137: where $\langle \cdot,\cdot \rangle$
2138: is a standard pairing 
2139: function as defined previously and $x,y$ are strings.
2140: Then we have 
2141: $\K(x,y)\le \K(x)+\K(y)+O(\log \K(x))$. Indeed, there is a
2142: Turing machine $T_i$ that provided with  $\langle p,q\rangle$ 
2143: as an input computes $\langle U(p),U(q)\rangle$  
2144: (where $U$ is the reference Turing machine). By construction of $T_i$, we have
2145: $\K_i(x,y)\le \K(x)+\K(y)+O(\log \K(x))$, hence
2146: $\K(x,y)\le \K(x)+\K(y)+O(\log \K(x))$.
2147: 
2148: Another interesting similarity is the following:
2149: $I(X;Y)=H(Y)-H(Y \mid X)$
2150:  is the (probabilistic)
2151: {\em information in random variable $X$ about random variable $Y$}.
2152: Here $H(Y \mid X)$ is the conditional entropy of $Y$
2153: given $X$.
2154: Since $I(X;Y)=I(Y;X)$ we call this symmetric quantity the {\em
2155: mutual (probabilistic) information}. 
2156: \begin{definition}
2157: \label{def.mi}
2158: \rm
2159: The {\em (algorithmic)  information in $x$ about $y$} 
2160: is $I(x:y)=\K(y)-\K(y\mid x)$,
2161: where $x,y$
2162: are finite objects like finite strings or finite sets of finite strings.
2163: \end{definition}
2164: 
2165: It is  remarkable that also the algorithmic information
2166: in one finite object about another one is symmetric: $I(x:y)=I(y:x)$ up to
2167: an additive term logarithmic in $\K(x)+\K(y)$. This follows
2168: immediately from the {\em symmetry of information} property
2169: due to A.N. Kolmogorov and L.A. Levin: 
2170: %Let $x^*$  denote the shortest program
2171: %for a finite string $x$,
2172: %or, if there are more than one of these, then $x^*$ is the first
2173: %one halting in a fixed standard enumeration of all halting programs.
2174: %Then, by definition, $\K(x)=|x^*|$.
2175: \begin{align}\label{eq.soi}
2176: \K(x,y) & = \K(x)+\K(y \mid x) + O(\log (\K(x)+\K(y))) \\
2177: & = \K(y)+\K(x \mid y)+O(\log (\K(x)+\K(y))) .
2178: \nonumber
2179: \end{align}
2180: %If $X,Y$ are random variables with a computable joint probability mass
2181: %function $p$, then the expectation of of the algorithmic mutual
2182: %information is close to the probabilistic mutual information.
2183: 
2184: 
2185: 
2186: \subsection{Randomness Deficiency and Fitness}\label{sect.rd}
2187: Randomness deficiency of an element $x$ of
2188: a finite set $A$ according to Definition~\ref{def.rd} is
2189: related to the fitness of $x \in A$ (identified with the fitness
2190: of set $A$ as a model for $x$) in the sense of $x$ having most properties 
2191: represented by the set $A$. Properties are identified with large
2192: subsets of $A$ whose Kolmogorov complexity is small (the `simple'
2193: subsets).
2194: \begin{lemma}\label{lemma.property}
2195: Let $\beta , \gamma$ be constants.
2196: Assume that $P$ is a subset of $A$ with
2197: $|P| \geq (1-2^{- \beta })|A|$  and
2198: $\K(P\mid A)\leq \gamma$.
2199: Then the randomness deficiency $\delta(x \mid A)$ of every
2200: $x\in A \setminus P$ satisfies 
2201: $\delta(x \mid A)> \beta-\gamma-O(\log \log |A|)$.
2202: \end{lemma}
2203: \begin{proof} 
2204: Since $\delta (x \mid A) = \log |A|-\K(x\mid A)$
2205: and $\K(x\mid A) \leq \K(x\mid A,P)+\K(P\mid A) + O(\log \K(x\mid A,P))$,
2206: while $\K(x\mid A,P) \leq - \beta + \log |A|+O(1)\le \log |A|+O(1)$,
2207: we obtain
2208: $\delta(x \mid A)> \beta-\gamma-O(\log \log |A|)$.
2209: %which is large if $\beta$ is large and $\gamma$ and $\log \log |A|$ are small.
2210: \end{proof}
2211: 
2212: The randomness deficiency measures our disbelief
2213: that $x$ can be obtained
2214: by random sampling in $A$ (where all elements of $A$ are
2215: equiprobable). 
2216: For every $A$, the randomness deficiency of almost all
2217: elements of $A$ is small:
2218: The number of $x\in A$ with $\delta(x \mid A)>\beta$ is fewer than
2219: $|A|2^{-\beta}$. This can be seen as follows. 
2220: The inequality $\delta(x \mid A)>\beta$ implies
2221: $\K(x \mid A)<\log |A|-\beta$.
2222: Since $1+2+2^2+\dots+2^{i-1}=2^i-1$, 
2223: there are fewer than $2^{\log  |A|-\beta}$
2224: programs of fewer than
2225: $\log |A|-\beta$ bits. Therefore, 
2226: the number of $x$'s satisfying
2227: the inequality 
2228: $\K(x\mid A)<\log |A|-\beta$ cannot be larger.
2229: Thus, with high probability  
2230: the randomness
2231: deficiency of an element
2232: randomly chosen in  $A$ is small.
2233: On the other hand, if $\delta(x \mid A)$ is small,
2234: then there is no way to refute the hypothesis
2235: that $x$ was obtained
2236: by random sampling from $A$: Every such
2237: refutation is based on a simply described property
2238: possessed by a majority of elements 
2239: of $A$ but not by $x$. Here it is important that we consider
2240: only simply described properties, since otherwise
2241: we can refute the hypothesis by exhibiting the property
2242: $P=A \setminus \{x\}$.
2243: 
2244: 
2245: \subsection{Covering Coefficient for Hamming Distortion}\label{sect.exhamming}
2246: 
2247: The authors find it difficult to believe that the covering result
2248: in the lemma below is new. But neither a literature search nor the
2249: consulting of experts has turned up an appropriate reference.
2250: \begin{lemma}\label{l2}
2251: Consider the distortion family ${\cal H}_n$.
2252: For all $0 \leq d\leq \delta\leq  \frac{1}{2}$ every Hamming ball of radius
2253: $\delta$ in ${\cal H}_n$
2254: can be covered by at most
2255: $\alpha_n b(\delta)/b(d)$
2256: Hamming balls of radius $d$ in ${\cal H}_n$,
2257: where $\alpha_n $ is a  polynomial in $n$.
2258: \end{lemma}
2259: 
2260: 
2261: \begin{proof}
2262: %If the lemma holds for even $n \geq 2$, then we
2263: %can delete the first bit of every $n$-length string involved and
2264: %have the lemma hold for strings of odd length $n-1$.
2265: %
2266: %Assume that $n$ is even.
2267: Fix a ball with center $y$ and radius $\delta = j/n \leq \frac{1}{2}$ where
2268: $j$ is a natural number.
2269: All the strings in the ball that are
2270: at Hamming distance at most $d$ from $y$
2271: can be covered by one ball
2272: of radius $d$ with center $y$.
2273: Thus it suffices,
2274: for every $\Delta$ of the form $i/n$ with 
2275: $i= 2,3, \ldots ,j$ 
2276: (such
2277: that $d<\Delta\leq \delta$), to cover
2278: the set of all the strings at distance precisely $\Delta$ from $y$
2279: by  $n^{c+1} b(\delta)/b(d)$ balls of radius $d$ 
2280: for some fixed constant $c$.
2281: Then the ball $B(y, \delta)$ is covered by at most 
2282: $j n^{c+1} b(\delta)/b(d) \leq n^{c+2} b(\delta)/b(d)$ balls of 
2283: radius $d$.
2284: 
2285: Fix
2286: $\Delta$ and let the Hamming sphere $S$ denote the set of all 
2287: strings at distance precisely 
2288: $\Delta$ from $y$.
2289: Let $f$ be the solution to the equation
2290: $d+f(1-2d)=\Delta$ rounded to the closest rational of the form $i/n$.
2291: Since $d<\Delta\leq  \delta\leq\frac{1}{2}$
2292: this equation has a unique solution and
2293: it lies in the closed real interval
2294: $[0,1]$.
2295: Consider a ball $B$ of radius $d$ with a random center $z$
2296: at distance
2297: $f$ from $y$. Assume that
2298: all centers at distance $f$ from $y$ are chosen with equal probabilities
2299: $1/s(f)$ where $s(f)$ is the number of points in a Hamming
2300: sphere of radius $f$.
2301: \begin{claim}\label{claim.prball}
2302: Let $x$ be a particular string in $S$. Then
2303: \[
2304: \Pr( x \in B) \geq \frac{b(d)}{n^c b(\delta)}
2305: \]
2306: for some fixed positive constant $c$.
2307: \end{claim}
2308: 
2309: \begin{proof}
2310: Fix a string $z$ at distance  $f$ from $y$. We first claim
2311: that the ball $B$ of radius $d$ with center
2312: $z$ covers at least $b(d)/n^c$ strings in $S$.
2313: Without loss of generality, 
2314:  assume that the string $y$ consists of only  zeros
2315: and string $z$ consists of $fn$ ones and $(1-f)n$ zeros.
2316: Flip a set of $fd n$ ones
2317: and a set  of
2318: $(1-f)d n$ zeros in $z$ to obtain a string $u$.
2319: The total number of flipped bits is equal to
2320: $d n$ and therefore $u$ is at distance $d$ from
2321: $z$. The number of ones in $u$ is
2322: $fn-fd n+(1-f)d n=\Delta n$ and
2323: therefore $u \in S$.
2324: Different choices of the positions of the same numbers of flipped bits
2325: result in different strings in
2326: $S$. The number of ways to choose the flipped bits is equal to
2327: $$
2328: \binom{fn}{fd n}\binom{(1-f)n}{(1-f)d n}.
2329: $$
2330: By Stirling's formula, this is at least
2331: $$
2332: 2^{fnh(d)+(1-f)nh(d)-O(\log n)}=
2333: 2^{nh(d)-O(\log n)}\ge
2334: \frac{b(d)}{n^c},
2335: $$
2336: where the last inequality follows from \eqref{binom-entropy}.
2337: Therefore a ball $B$ as above covers at least $b(d)/n^c$ strings
2338: of $S$.
2339: The probability
2340: that a ball $B$, chosen uniformly at random as above,
2341: covers a particular string $x\in S$ is the same for every such $x$
2342: since they are in symmetric position.
2343: The number of elements in a Hamming sphere 
2344: is smaller than the cardinality of a Hamming ball of the same radius,
2345: $|S| \leq b(\delta)$.
2346: Hence with probability at least
2347: $$
2348: \frac{b(d)}{n^c |S|}\ge
2349: \frac{ b(d)}{n^c b(\delta)} 
2350: $$
2351: a random ball $B$ covers a particular string $x$ in $S$.
2352: \end{proof}
2353: 
2354: By Claim~\ref{claim.prball}, 
2355: the probability that a random ball $B$ does not cover a particular
2356: string $x \in S$ is at most $1-b(d)/(n^c b(\delta))$.
2357: The probability that no ball out of $N$ randomly drawn such
2358: balls $B$ covers 
2359: a particular $x \in S$ (all balls are equiprobable) is at most 
2360: \[
2361: \left(1-\frac{ b(d)}{n^c b(\delta)}\right)^N
2362: < e^{-N b(d)/(n^c  b(\delta))} .
2363: \]
2364: For $N = n^{c+1}  b(\delta)/ b(d)$,
2365: the exponent of the 
2366: right-hand side of the last inequality is  $-n$,
2367: and the probability that $x$ is not covered is at most $e^{-n}$. 
2368: This probability remains exponentially small even after
2369: multiplying by $|S| \leq 2^n$, the number of different $x$'s in $S$.
2370: Hence, with probability at least $1- (2/e)^n$ 
2371: we have that $N$ random balls
2372: of the given type cover all the strings in $S$. 
2373: Therefore, there exists a deterministic selection of $N$
2374: such balls that covers all the strings in $S$.
2375: The lemma is proved.
2376: (A more accurate calculation shows that
2377: the lemma holds with $\alpha_n=O(n^4)$.)
2378: \end{proof}
2379: 
2380: \begin{corollary}\label{cor.l2}
2381: \rm
2382: Since all strings of length $n$ are either in the Hamming ball
2383: $B(00\ldots 0, \frac{1}{2})$ or in the Hamming ball
2384: $B(11\ldots 1, \frac{1}{2})$ in ${\cal H}_n$,
2385: the lemma implies that the set $\{0,1\}^n$
2386: can be covered by at most
2387: \[
2388: N =  \frac{2\alpha_n  2^{n}}{b(d)}
2389: \]
2390: balls of radius $d$ for every $0 \leq d \leq \frac{1}{2}$.
2391: (A similar, but direct, calculation lets us
2392: replace the factor $2\alpha_n$ by $n$.)
2393: \end{corollary}
2394: %\begin{IEEEproof}
2395: %{\em of Corollary~\ref{cor.l2}.}
2396: %We will first prove
2397: %this corollary,
2398: %and then use the same method to prove the full lemma.
2399: %
2400: %Fix a string $x$. The probability that
2401: %$x$ is \emph{not} covered  by a randomly selected ball of radius $d$
2402: %is equal to  $1-b(d)2^{-n}$ (all balls are
2403: %equiprobable). Thus the probability that no ball out of
2404: %$N$ randomly selected balls of radius $d$ covers $x$ is
2405: %\[
2406: %(1-b(d)2^{-n})^N< e^{-N b(d)2^{-n}}.
2407: %\]
2408: %
2409: %
2410: %Choose $N=n2^{n}/b(d)$. Then the exponent in the right hand side of the last
2411: %displayed inequality is at most
2412: %$-n$, and the probability that  $x$ is not covered is less than
2413: %$e^{-n}$. This probability remains exponentially small even after
2414: %multiplying by $2^n$, the number of different $x$'s.
2415: %Hence, with probability close to 1, $N$ random balls
2416: %cover all the strings of length $n$.
2417: %\end{IEEEproof}
2418: 
2419: \subsection{Proofs of the Theorems}
2420: \label{sect.proofs}
2421: 
2422: \begin{proof}
2423: {\em of Theorem}~\ref{theo.allshapesrd}.
2424: (i)  Lemma~\ref{lem.shapesg} (assuming properties 1 through 4) 
2425: implies that
2426: the canonical structure function $g_x$ of every string $x$ of length
2427: $n$ is close to some function in the family $G_n$. This can be seen
2428: as follows. Fix $x$ and
2429: construct $g$ inductively for $n, n-1, \ldots , 0$. Define
2430: $g(n)=0$
2431: and
2432: $$
2433: g(l-1)=\left\{\begin{array}{ll}
2434: g(l)+1 & \text{if } g(l)<g_x(l-1),\\
2435: g(l) & \text{otherwise.}
2436: \end{array}\right.
2437: $$
2438: By construction this function belongs
2439: to the family $G_n$.
2440: Let us show that
2441: $
2442: g_x(l)=g(l)+O(\log n)$.
2443: First, we prove that
2444: \begin{equation}\label{eq.left}
2445: g(l) \leq g_x(l)
2446: \end{equation}
2447: by induction on $l=n,n-1, \ldots , 0$.
2448: For $l=n$ the inequality is straightforward, since
2449: by definition $g(n)=0$.
2450: Let $0< l\leq n$. 
2451: Assume that $g(i)\le g_x(i)$ for $i=n,n-1, \ldots , l$.
2452: If $g(l) < g_x(l-1)$ then $g(l-1)= g(l)+1$ and therefore 
2453: $g(l-1) \leq g_x(l-1)$. If $g(l) \geq g_x(l-1)$ then
2454: $g(l-1) = g(l) \geq g_x(l-1)\ge g_x(l)\ge g(l)$ and hence 
2455: $g(l-1) = g_x(l-1)$.
2456: 
2457: Second, we prove that
2458: \[
2459: g_x(l)\le g(l)+O(\log n)
2460: \]
2461: for every $l=0,1,\ldots, n$.
2462: Fix an $l$ and consider the least
2463: $m$ with $l \leq m \leq n$ such that $g_x(m)=g(m)$.
2464: If there is no such $m$ we take $m=n$ and observe
2465: that $g_x(n)=O(\log n)= g(n)+ O(\log n)$.
2466: This way, $g_x(m)=g(m)+O(\log n)$ and for every $l<l'\le m$
2467: we have $g(l'-1)<g_x(l'-1)$ due to inequality \eqref{eq.left}
2468: and definition of $m$.
2469: Then 
2470: $g_x(l'-1)>g(l'-1)\ge g(l')$, since we know that $g$ is nonincreasing.
2471: Then, by the definition of $g$ we have $g(l'-1)=g(l')+1$.  Thus
2472: we have
2473: $g(l)=g(m)+m-l$.
2474: Hence,
2475: $g_x(l)\le g_x(m)+m-l+O(\log n) = g(m)+m-l+O(\log n)=g(l)+O(\log n)$,
2476: where the inequality follows from Lemma~\ref{lem.shapesg},
2477: the first equality from the assumption that $g_x(m)=g(m)+O(\log n)$,
2478: and the second equality from the previous sentence.
2479: 
2480: 
2481: (ii)
2482: In Theorem IV.4 
2483: in \cite{VV02} we proved a similar statement
2484: for the special distortion family ${\cal L}$
2485: with an error term of $O(\log n)$.
2486: However, for the special case ${\cal L}$
2487: we can let $x$ be equal to the first $x$
2488: satisfying the inequality
2489: $g_x(l)\ge g(l)-O(\log n)$ for every $l$.
2490: In the general case this does not work any more.
2491: Here we construct $x$ together with sets
2492: ensuring the inequalities
2493: $g_x(l)\le g(l)+O(\sqrt{n\log n})$ for every $l=0,\dots,n$.
2494: 
2495: The construction is as follows.
2496: Divide the segment $\{0,1,\dots,n\}$ into
2497: $N=\sqrt{n/\log n}$ subsegments of length $\sqrt {n\log n}$ each.
2498: Let
2499: $l_0=n>l_1>\dots>l_N=0$ denote the end points of the
2500: resulting subsegments.
2501: 
2502: 
2503: To find the desired $x$, we
2504: run the nonhalting algorithm below that takes
2505: $n$ and ${\A}_n$ as input 
2506: %covering coefficient $\alpha_n$,
2507: together with the values
2508: of the function $g$ in the points $l_0,\dots,l_N$.
2509: Let $\delta (n)$ be a computable integer valued 
2510: function of $n$ of the order $\sqrt {n\log n}$
2511: that will be specified later. 
2512: \begin{definition}
2513: \rm
2514: Let $i=0,1,\dots,N$.
2515: A set $F\in\A_n$ is called {\em $i$-forbidden}
2516: if $|F|\le 2^{l_i}$ and 
2517: $\K(F) < g(l_i)-\delta (n)$.
2518: A set is called {\em forbidden} if 
2519: it is $i$-forbidden for some $i=0,1,\dots,N$.
2520: \end{definition}
2521: We wish to find an $x$ that is outside all forbidden sets
2522: (since this guarantees that $g_x(l_i)\ge g(l_i)-\delta (n)$ for every $i$).
2523: Since $\K(\cdot)$ is upper semicomputable, moreover 
2524: property 3 holds, and we are also given $n$ and $g(l_0),\dots,g(l_N)$,
2525: we are able to find all forbidden sets using the following
2526: subroutine.
2527: 
2528: \textbf{Subroutine $(n,{\A}_n, g(l_0),g(l_1), \ldots , g(l_N))$:}
2529: \begin{quote} 
2530: for every 
2531: $F\in \A_n$ 
2532: upper  semicompute
2533: $\K(F)$; every time we find
2534: $\K(F) < g(l_i)-\delta (n)$ 
2535: and $|F|\le 2^{l_i}$ for some $i$ and $F$, then print $F$.
2536: {\bf End of Subroutine}
2537: \end{quote}
2538: 
2539: This subroutine prints all the forbidden sets in some order. Let 
2540: $F_1,\dots,F_T$ be that order. Unfortunately 
2541: we do not know when the subroutine will 
2542: print the last forbidden set. In other words, we do not 
2543: know the number $T$ of forbidden sets. To overcome this problem,
2544: the algorithm will run the subroutine and every time a new  
2545: forbidden set $F_t$ is printed, the algorithm will  
2546: construct {\em candidate sets}
2547: $B_0(t),\dots,B_N(t)\in\A_n$ satisfying $|B_i(t)|\le 2^{l_i}$ and  
2548: $\K(B_i(t)) \le g(l_i)+\delta (n)$  
2549: and the following condition
2550: \begin{equation}\label{eq.capcup}
2551: \bigcap_{j=0}^{N}B_j(t) \setminus \bigcup_{j=1}^{t}
2552: F_j\ne \emptyset ,
2553: \end{equation}
2554: for every $t=0,\dots,T$.
2555: For $t=T$ the set $\bigcup_{j=1}^{t}
2556: F_j$ is the union of all forbidden sets, which guarantees the bounds
2557: $g(l_i)-\delta (n)\le g_x(l_i)\le g(l_i)+\delta (n)$
2558: for all $x$ in the set in the left hand side of \eqref{eq.capcup}. 
2559: Then we will 
2560: prove that these bounds imply that 
2561: $g(l)-\delta (n)\le g_x(l)\le g(l)+\delta (n)$
2562: for \emph{every} $l=0,\dots,n$.
2563: Each time a new forbidden set 
2564: appears (that is, for every $t=1,\dots,T$) 
2565: we will need to update candidate sets so that \eqref{eq.capcup} remains 
2566: true. To do that we will maintain a stronger 
2567: condition than just non-emptiness of the left hand side of \eqref{eq.capcup}.
2568: Namely, we will maintain the following invariant:
2569: for every $i=0,1, \ldots,  N$, 
2570: \begin{equation}\label{eq.invariant}
2571: \left| \bigcap_{j=0}^{i} B_j(t) \setminus \bigcup_{j=1}^{t}
2572: F_j \right| \geq
2573: 2^{l_i-i-1}\alpha_n^{-i}.
2574: \end{equation}
2575: Note that for $i=N$ inequality \eqref{eq.invariant} implies
2576: \eqref{eq.capcup}. 
2577: 
2578: 
2579: {\bf Algorithm 
2580: $(n,{\A}_n, g(l_0),g(l_1), \ldots , g(l_N))$:}
2581: \begin{description}
2582: \item 
2583: %{\bf Step 1.}
2584: % 
2585: %Find set $B_i$ in ${\A}_n$
2586: %of cardinalities at most $2^{l_i}$ such
2587: %that
2588: %$$
2589: %\left|B_0\bigcap\dots\bigcap B_i \right|\ge 2^{l_i-i-1}\alpha_n^{-i}.
2590: %$$
2591: %We will amply fulfil the requirement by
2592: %producing sets with a much larger intersection---without
2593: %the factor of $2^{-i-1}$. {\bf \}}
2594: %{\bf \{}The sets $B_i$ with $i=1,2, \ldots , N$ are constructed inductively.
2595: %Assume that $B_0,\dots,B_i$ are already defined, and the cardinality of their
2596: %joint intersection is at least $2^{l_i}\alpha_n^{-i}$.{\bf \}}
2597: {\bf Initialize.}
2598: Recall that $l_0=n$.
2599: Define the set $B_0(t)=\booln$ for every $t$.
2600: This set is in ${\A}_n$ by property 1.
2601: 
2602: {\bf for } $i := 1, \ldots , N$ {\bf do}
2603: 
2604: Assume inductively that 
2605: $|B_0(0) \bigcap B_1(0) \bigcap \cdots \bigcap B_{i-1} (0)| 
2606: \geq 2^{l_{i-1}} \alpha_n^{-i+1}$, where $\alpha_n$ 
2607: denotes a polynomial upper bound of the covering
2608: coefficient of distortion family ${\A}_n$ existing by property 4. 
2609: (The value $\alpha_n$ can be computed from $n$.)
2610: Note that this inequality is satisfied
2611: for $i=1$.
2612: Construct $B_{i}(0)$ by
2613: covering $B_{i-1}(0)$ by at most
2614: $\alpha_n 2^{l_{i-1}-l_{i}}$ sets of cardinality at most
2615: $2^{l_{i}}$
2616: (this cover exists in ${\A}_n$ by property 4).
2617: Trivially, this cover also covers
2618: $B_0(0)\bigcap\dots\bigcap B_{i-1}(0)$.
2619: The intersection of at least one of the covering
2620: sets with $B_0(0)\bigcap\dots\bigcap B_{i-1}(0)$ has cardinality at least
2621: $$
2622: \frac{2^{l_{i-1}}\alpha_n^{-i+1}}{\alpha_n 2^{l_{i-1}-l_{i}}}=
2623: 2^{l_{i}}\alpha_n^{-i}.
2624: $$
2625: Let $B_{i}(0)$ be the first such covering set in a given standard order.
2626: {\bf od}
2627: 
2628: Notice that after the Initialization the invariant~\eqref{eq.invariant}
2629: is true for $t=0$, as $\bigcup_{j=1}^tF_j=\emptyset$.
2630: For every $t=1,2,\dots$ perform the following steps 1 and 2
2631: maintaining the 
2632: invariant~\eqref{eq.invariant}: 
2633: 
2634: \item {\bf Step 1.}
2635: Run the subroutine and wait until $t$th forbidden set $F_t$ is printed 
2636: (if $t>T$ the algorithm waits forever and never
2637: proceeds to Step 2). 
2638: 
2639: \item{\bf Step 2.}
2640: 
2641: {\bf Case 1.} For every $i = 0,1, \ldots , N$ 
2642: we have 
2643: \begin{equation}
2644: \label{eq.inv}
2645: \left|\bigcap_{j=0}^i B_j(t-1) \setminus \bigcup_{j=1}^t
2646: F_j \right| \geq 2^{l_i-i-1}\alpha_n^{-i}.
2647: \end{equation} 
2648: Note that this inequality has one more 
2649: forbidden set compared to the invariant~\eqref{eq.invariant} 
2650: for $t-1$ (the argument in $B_j(t-1)$), and thus may be false. 
2651: If it nevertheless holds for every $i$ (the present case), then 
2652: we let $B_i(t)=B_i(t-1)$ for every 
2653: $i=1, \ldots , N$ (this setting maintains invariant~\eqref{eq.invariant}). 
2654: 
2655: {\bf Case 2.} Assume that 
2656: \eqref{eq.inv} is false 
2657: for some index $i$.
2658: In this case 
2659: find the least such index (we will use later that \eqref{eq.inv} 
2660: is true for all $i'<i$). 
2661: 
2662: We claim that $i>0$. That is,  
2663: the inequality \eqref{eq.inv} is true for $i=0$.
2664: In other words, the 
2665: cardinality of $F_1\bigcup \cdots \bigcup F_t$ is not
2666: larger than half
2667: of the cardinality of $B_0(t-1)=\booln$.
2668: Indeed, for every fixed $i$ the total cardinality of all the sets
2669: that simultaneously have cardinality at most $2^{l_i}$ 
2670: and Kolmogorov complexity less than $g(l_i)-\delta (n)$ does not exceed
2671: $2^{g(l_i)-\delta (n)}2^{l_i}$. 
2672: Therefore, the total number of elements in 
2673: $\bigcup_{j=1}^t F_j$
2674: is at most
2675: $$
2676: \sum_{i=0}^N2^{g(l_i)-\delta (n) +l_i}\le
2677: (N+1)2^{g(\dmax)-\delta (n) +n}=
2678: (N+1)2^{n- \delta (n) }\ll 2^{n-1}= \frac{1}{2}\left|\booln \right|,
2679: $$ 
2680: where the first inequality follows since the function $g(l)+l$
2681: is monotonic nondecreasing, the first equality since 
2682: $g(\dmax)=0$ by definition,
2683: and the last inequality since we will set $\delta(n)$
2684: at order of magnitude $\sqrt{n \log n}$.
2685: 
2686: %Without loss of generality, assume
2687: %$i$ is the least such index.
2688: 
2689: First let $B_k(t)=B_k(t-1)$ for all $k<i$ (this
2690: maintains invariant~\eqref{eq.invariant} for all $k<i$).
2691: To define $B_i(t)$ find a covering
2692: of $B_{i-1}(t)$ by at most
2693: $\alpha_n 2^{l_{i-1}-l_i}$
2694: sets in ${\A}_n$ of cardinality at most $2^{l_i}$.
2695: Since~\eqref{eq.inv} 
2696: is true for index $i-1$, we have
2697: \begin{equation}\label{eq.inter}
2698: \left| \bigcap_{j=0}^{i-1} B_j(t) \setminus 
2699: \bigcup_{j=1}^t
2700: F_j \right|
2701:  \geq
2702: 2^{l_{i-1}-i}\alpha_n^{-i+1}.
2703: \end{equation}
2704: Thus 
2705: the greatest cardinality of an intersection of the set in \eqref{eq.inter}
2706: with a covering set is at least
2707: $$
2708: \frac{2^{l_{i-1}-i}\alpha_n^{-i+1}}{\alpha_n 2^{l_{i-1}-l_i}}
2709: = 2^{l_i-i}\alpha_n^{-i}.
2710: $$
2711: Let $B_i(t)$ be
2712: the first such covering set in standard order.
2713: Note that $2^{l_i-i}\alpha_n^{-i}$ is at least
2714: twice the
2715: threshold required by invariant~\eqref{eq.invariant}. 
2716: Use the same procedure to obtain successively $B_{i+1}(t),\dots,B_N(t)$.
2717: %Finally, define $B_j (t)= B_j(t-1)$ for every $0 \leq j \leq i-1$.
2718: \end{description}
2719: 
2720: {\bf End of Algorithm}
2721: 
2722: Although the algorithm does not halt,
2723: at some unknown time  the last forbidden set $F_T$ is enumerated.
2724: After this time the candidate sets are not changed anymore.
2725: The invariant \eqref{eq.invariant} with $i=N$ shows that the cardinality 
2726: of the set in the left hand side of \eqref{eq.capcup} is 
2727: positive,
2728: %at least
2729: %$2^{l_N-N-1} /\alpha_n^{-N} > 0$  since $l_N=0$, $N = \sqrt{n/ \log n}$
2730: %and $\alpha_n$ polynomial in $N$. Hence, \eqref{eq.capcup} holds.
2731: hence the set is not empty.
2732: 
2733: 
2734: Next we show that $\K(B_i(t))\le g(l_i)+\delta(n)$
2735: for every $i$ and every $t=1,\ldots,T$. We will see  
2736: that to this end it suffices to upperbound
2737: the number of changes of each candidate set. 
2738: 
2739: \begin{definition}
2740: \rm
2741: Let $m_i$ be the {\em number of changes of $B_i$}
2742: defined by 
2743: $m_i = |\{t: B_i(t) \neq B_i (t-1), \; 1 \leq t\le T \}|$ for
2744: $0 \leq i \leq N$.
2745: \end{definition}
2746: \begin{claim}\label{claim.mi}
2747: \rm
2748: $m_i \leq 2^{g(l_i)+i}$ for $0 \leq i \leq N$. 
2749: \end{claim}
2750: \begin{proof}
2751: The Claim is proved by induction on $i$. For 
2752: $i=0$ the claim is true,
2753: since $l_0 = n$ and $g(n)=0$ while $m_0=0$ by
2754: initialization in the Algorithm ($B_0$ never changes). 
2755: 
2756: ($i > 0$): assume that the Claim 
2757: is satisfied for every $j$ with $0 \leq j < i$.
2758: We will prove that $m_i\le 2^{g(l_i)+i}$ by counting
2759: separately the number of changes of $B_i$ of different types.
2760: 
2761: {\bf Change of type 1.} The set $B_i$ is changed when 
2762: \eqref{eq.inv} 
2763: is false for an index strictly
2764: less than $i$. 
2765: The number of these changes is at most 
2766: \[
2767: m_{i-1} \leq 2^{g(l_{i-1})+i-1} \leq 2^{g(l_{i})+i-1},
2768: \]
2769: where the first inequality follows from the inductive assumption,
2770: and the second inequality by the property of $g$ that it
2771: is nonincreasing.
2772: Namely, since $l_{i-1} > l_i$  we have
2773: $g(l_{i-1}) \leq g(l_i)$.
2774: % $g(j)+j$ is 
2775: %is nondecreasing. 
2776: %Namely, since $l_{i-1} = l_i + \sqrt{n \log n}$  we have
2777: %therefore that $g(l_i)+l_i \leq g(l_{i-1})+l_{i-1}$ and hence
2778: %$g(l_i) \leq g(l_{i-1})+l_{i-1}-l_i = g(l_{i-1})+\sqrt{n \log n}$.
2779: 
2780: {\bf Change of type 2.}  The inequality \eqref{eq.inv} 
2781: is false for $i$ and is true for all smaller indexes.
2782: %To upper bound the number of changes of this type divide
2783: %them again in two categories, recalling the notion
2784: %of the forbidden sets:  the sets in $\A_n$
2785: %of simultaneously cardinality at most
2786: %$2^{l_j}$ and complexity less than
2787: %$g(l_j)-\delta(n)$ for $0 \leq j \leq N$.
2788: 
2789: {\bf Change of type 2a.}
2790: After the last change of
2791: $B_i$ at least one $j$-forbidden set for some $j<i$  
2792: has been enumerated.
2793: The number of changes of this type is at most the number of
2794: $j$-forbidden sets for $j=0,\dots,i-1$. For every such $j$  
2795: these forbidden sets have by definition Kolmogorov complexity less than
2796: $g(l_{j}) - \delta (n)$. 
2797: %These $j$'s concerned satisfy $0 \leq j < i$.
2798: Since $l_j \ge l_i$ and $g$
2799: is monotonic nonincreasing we have
2800: $g(l_{j}) \leq g(l_{i})$. 
2801: Because there are at most $N$ of these $j$'s,
2802: the number of such forbidden sets is at most
2803: $$N2^{g(l_i)-\delta(n)}\ll 2^{g(l_i)},$$
2804: since we will later choose
2805: $\delta(n)$ of order $\sqrt{n \log n}$.
2806: 
2807: {\bf Change of type 2b.}
2808: Finally, for every change of this type, between the last
2809: change of
2810: $B_i$ and the current one
2811: no candidate sets with indexes less than
2812: $i$ have been changed and no $j$-forbidden  sets
2813: with $j<i$ have been enumerated.
2814: Since after the last change of $B_i$ the cardinality of the set in the 
2815: left-hand side of \eqref{eq.invariant} was at least 
2816: $2^{l_i-i} \alpha_n^{-i}$, which is twice the threshold 
2817: in the right-hand side
2818: by the restoration of the invariant in the Algorithm Step 2, Case 2,
2819: the following must hold.
2820: The cardinality of 
2821: $\bigcup_{j=1}^t F_j$ increased  by  at least
2822: $2^{l_i-i-1}\alpha_n^{-i}$ since the last change of $B_i$,
2823: and this must be due to enumerating
2824: $j$-forbidden sets for $j=i,\dots,N$.
2825: For every such $j$ 
2826: every $j$-forbidden
2827: set has cardinality at most $2^{l_j}$
2828: and Kolmogorov complexity less than
2829: $g(l_{j}) - \delta (n)$. 
2830: Hence the total number of elements in all
2831: $j$-forbidden sets is less than $2^{l_j}2^{g(l_{j}) - \delta (n)}$.
2832: Since $j\geq i$ and hence $l_j \leq l_i$ while $g(l)+l$
2833: is monotonic nondecreasing we have
2834: $g(l_{j})+l_j \leq g(l_{i})+l_i$.
2835: Because there are at most $N+1$ of these $j$'s,
2836: the total number of elements in all those sets does not exceed
2837: $M=(N+1)2^{g(l_i)-\delta (n)+l_i}$.
2838: %After the last change of
2839: %$B_i$ no forbidden set of cardinality greater
2840: %than $2^{l_i}$ has been enumerated.
2841: The number
2842: of changes of this type is not more than the total number $M$
2843: of elements involved divided by the increments of size
2844: $2^{l_i-i-1}\alpha_n^{-i}$. Hence it is not more than
2845: $$(N+1)2^{g(l_i)-\delta (n)}2^{i+1}\alpha_n^{i}.$$
2846: Let
2847: \begin{align}\label{eq.deltan}
2848: &\delta (n) \geq \log ((N+1)2^{i+10}\alpha_n^{i})
2849: \; \; {\rm and }
2850: \\&\delta (n) = 
2851: O (N\log(2\alpha_n))=O(\sqrt{n/\log n} \; \log(2\alpha_n))=
2852: O (\sqrt{n\log n}),
2853: \nonumber
2854: \end{align}
2855: where the last equality uses that $\alpha_n$ is polynomial
2856: in $n$ by property 4. 
2857: Then,
2858: the number of changes of type 2b is much less than  $2^{g(l_i)}$.
2859:  The value of $\delta(n)$ can be computed from $n$.
2860: 
2861: Summing the numbers of changes of types 1, 2a, and 2b we obtain
2862: $m_i \leq 2^{g(l_i)+i}$, completing the induction.
2863: \end{proof}
2864: \begin{claim}\label{claim.gx}
2865: \rm
2866: Every $x$ in the nonempty set  \eqref{eq.capcup} satisfies
2867: $|g_x(l_i) -  g(l_i)| \leq \delta (n)$
2868: with $\delta (n) = O(\sqrt{n \log n})$
2869: for $i=0,1, \ldots , N$.
2870: \end{claim}
2871: \begin{proof}
2872: By construction $x$  is not an element of any forbidden set
2873: in $\bigcup_{t=1}^T F_t$, and therefore
2874: \[
2875: g_x(l_i) \geq g(l_i) - \delta (n)
2876: \]
2877: for every $i=0,1, \ldots , N$.
2878: By construction $|B_i(T)| \leq 2^{l_i}$, and
2879: to finish the proof it remains to show that 
2880: $\K(B_i (T))
2881: \leq g(l_i)+\delta (n)$ so that 
2882: $g_x(l_i) \leq g(l_i)+\delta(n)$, 
2883: for $i=0,1, \ldots,  N$.
2884: Fix $i$. 
2885: The set $B_i(T)$ can be 
2886: described by a constant length 
2887: program, that is $O(1)$ bits,
2888: that runs the Algorithm and uses the following
2889: information:
2890: \begin{itemize}
2891: \item
2892: A description of 
2893: $i$ in $\log N\le\log n$ bits.
2894: \item
2895: A description of 
2896: the distortion family $\A_n$ in $O(\log n)$ bits by property 3.
2897: \item
2898: The values of $g$ in the points $l_0,\dots,l_N$
2899: in $N\log n=\sqrt{n\log n}$ bits.
2900: \item
2901: The description of $n$ in $O(\log n)$ bits.
2902: \item
2903: The total number $m_i$
2904: of changes (Case 2 in the Algorithm)
2905: to intermediate versions of $B_i$ in $\log m_i$ bits.
2906: \end{itemize}
2907: We count the number of bits in the description of 
2908: $B_i(T)$. The description is effective and by Claim~\ref{claim.mi} with
2909: $i \leq N = \sqrt{n/\log n}$ it
2910: takes at most $g(l_i) + O(\sqrt{n \log n})$ bits. So this is an
2911: upper bound on the Kolmogorov complexity $\K(B_i(T))$. 
2912: Therefore, for some $\delta(n)$ satisfying \eqref{eq.deltan} we have
2913: %by Definition~\ref{def.gx} of $g_x$ we obtain
2914: \[
2915: g_x(l_i) \leq g(l_i)+ \delta(n),
2916: \]
2917: for every $i = 0,1, \ldots, N$. 
2918: The claim follows from the first and the last displayed 
2919: equation in the proof.
2920: \end{proof}
2921: 
2922: 
2923: Let us show that the statement 
2924: of Claim~\ref{claim.gx} 
2925: holds not only for the subsequence of values $l_0,l_1, \ldots , l_N$
2926: but for every $l=0,1, \ldots , n$.
2927: %  when we replace $\delta (n)$
2928: %by $O(\sqrt{n \log n})$, and so prove the theorem.
2929: 
2930: Let $l_i \leq l \leq l_{i-1}$.
2931: Both functions $g(l),g_x(l)$ are nonincreasing so that
2932: \begin{align*}
2933: &g(l)\in[g(l_{i-1}),g(l_{i})],\\
2934: &g_x(l)\in[g_x(l_{i-1}),g_x(l_{i})]
2935: \subseteq[g(l_{i-1})-O(\sqrt{n\log n}),g(l_{i})+O(\sqrt{n\log n})].
2936: \end{align*}
2937: By the 
2938: spacing of the sequence of $l_i$'s
2939: the length of the segment
2940: $[g(l_{i-1}),g(l_{i})]$ is at most
2941: $$
2942: g(l_{i})-g(l_{i-1})\le l_{i-1}-l_{i}
2943:  = \sqrt{n\log n}.
2944: $$
2945: If there is an $x$ such that Claim~\ref{claim.gx} 
2946: holds for every $l_i$ with $i=0, \ldots , N$, then 
2947: it follows from the above that
2948: $|g(l)-g_x(l)|\le\sqrt{n\log n}+O(\sqrt{n\log n})$ for every $l=0,1, \ldots, n$.
2949: \end{proof}
2950: \vspace{.2in}
2951: 
2952: \begin{proof}
2953: {\em of Theorem}~\ref{th-shannon-analog}.
2954: We start with Lemma~\ref{th5} stating a combinatorial fact 
2955: that is interesting
2956: in its own right, as explained further in Remark~\ref{rem.previously}.
2957: 
2958: 
2959: \begin{lemma}\label{th5}
2960: Let $n,m,k$ be natural numbers and
2961: $x$ a string of length $n$. Let ${\BB}$ be a family
2962: of subsets of $\{0,1\}^n$ and 
2963: ${\BB}(x) = \{B \in {\BB}: x \in B \}$.  If 
2964: ${\BB}(x)$ has at least $2^m$ elements (that is, sets) of
2965: Kolmogorov complexity less than $k$, then
2966: there is an element in ${\BB}(x)$ of Kolmogorov complexity
2967: at most $k-m+O(\K(\BB)+\log n +\log k+\log m)$.
2968: \end{lemma}
2969: 
2970: 
2971: \begin{proof}
2972: Consider a game between Alice and Bob. They alternate moves
2973: starting with Alice's move.
2974: A move of Alice consists in producing a
2975: subset of $\booln$. A move of
2976: Bob consists in marking some sets previously produced by
2977: Alice (the number of marked sets can be 0).
2978: %There are two versions of the game: the on-line version and the off-line one.
2979: %In the on-line game, C wins if, following every one of his moves,
2980: %every $x\in\X$ that is covered at least $2^k$ times
2981: %by P's sets belongs to a marked set.
2982: %In the off-line game C wins if this condition holds after his last move.
2983: Bob wins if after every one of his moves 
2984: every $x\in\X$ that is covered by at least $2^m$
2985: of Alice's sets
2986: belongs to a marked set.
2987: %It is important that this condition is checked
2988: %following every one of C's moves: C cannot
2989: %postpone marking until all P's sets appear.
2990: The length of a play is decided by Alice. She 
2991: may stop the game after any of Bob's moves. However the 
2992: total number of her moves (and hence Bob's moves) 
2993: must be less than $2^k$. 
2994: (It is easy to see that without loss of generality
2995: we may assume that Alice makes exactly $2^k-1$ moves.)
2996: Bob can easily win if he marks every set produced by Alice.
2997: However, we want to minimize the total number of marked sets.
2998: 
2999: \begin{claim}\label{l53}
3000: %In the off-line game, Consumer has a winning strategy
3001: %that marks at most $2^{r-k}\log|\X|$ sets.
3002: Bob has a winning strategy
3003: that marks at most $O(2^{k-m}k^{2}n)$ sets.
3004: %with $\alpha, \beta$ constants.
3005: \end{claim}
3006: 
3007: \begin{proof}
3008: %\begin{remark}
3009: %Remark.
3010: %In the proof of Lemma~\ref{l53} we have not
3011: %presented any explicit strategy for C.
3012: %Here is  a winning strategy
3013: %that marks $2^{r-k} r^2 \ln |\X|$ sets.
3014: We present an explicit 
3015: %deterministic and constructive 
3016: strategy for Bob, which consists in
3017: %($\tau = 2^k$ moves.)
3018: %Bob's strategy 
3019: %with $\tau=2^k$ moves
3020: executing at every move $t=1,2, \ldots ,2^k -1$
3021: the following algorithm for the sequence 
3022: $A_1, A_2, \ldots , A_t$ which has been produced by Alice until then.
3023: 
3024: %{\bf for} $j=1,2,\dots,k$ {\bf do}
3025: \begin{description}
3026: \item
3027: {\bf Step 1.} 
3028: Let $2^j$ be the largest power 
3029: of $2$ dividing $t$. 
3030: Consider the last $2^j$ sets in the sequence 
3031: $A_1, A_2, \ldots , A_t$ and call them
3032: $D_1,\dots,D_{2^j}$.
3033: \item
3034: {\bf Step 2.}
3035: Let $T$ be the set of $x$'s that occur in at least 
3036: $2^{m}/k$ of the 
3037: sets $D_1,\dots,D_{2^j}$. 
3038: Let $D_p$ be a set such that $|D_p\bigcap T|$ is maximal.
3039: Mark $D_p$ (if there is more than one then choose the one with $p$ least)
3040: and remove all elements of  $D_p\bigcap T$ from $T$.
3041: Call the resulting set $T_1$. 
3042: Let $D_q$ be a set such that $|D_q\bigcap T_1|$ is maximal
3043: (if there is more than one then choose the one with $q$ least) and mark $D_q$.
3044: After removing all elements of $D_q\bigcap T_1$ from $T_1$
3045: we obtain a set $T_2$. Repeat the procedure until 
3046: we obtain $T_{e_j} = \emptyset$.
3047: \end{description}
3048: 
3049: Firstly, for the $j$ above we have
3050: $e_j \leq \lceil 2^{j-m}kn\ln2\rceil$.
3051: % sets among $D_1,\dots,D_{2^j}$
3052: %such that the union of the chosen sets covers $T$.
3053: This is proved as follows. We have 
3054: $$
3055: \sum_{i=1}^{2^j}|D_i\bigcap T|\ge|T|2^{m}/k,
3056: $$
3057: since every $x\in T$ is counted at least $2^{m}/k$ times in the
3058: sum in the left hand side.
3059: Thus there is a set in the list $D_1, \ldots , D_{2^j}$ 
3060: such that the cardinality of its intersection
3061: with $T$ 
3062: is at least $2^{-j}$ times the right hand side.
3063: %$|D_s\bigcap T|\ge |T|2^{m-j}/k$. 
3064:  By the choice of $D_p$ it is such a set
3065: and  we have $|D_p\bigcap T|\ge |T|2^{m-j}/k$.
3066: 
3067: The set $T$ has lost at least a $(2^{m-j}/k)$th fraction of its
3068: elements, that is, $|T_1|\le |T|(1-2^{m-j}/k)$. 
3069: Since $T_1 \subseteq T$, obviously every element of $T_1$ 
3070: (still) occurs in at least 
3071: $2^{m}/k$ of the sets $D_1,\dots,D_{2^j}$.
3072: Thus we can repeat the argument and 
3073: mark a set $D_q$ with $|D_q\bigcap T_1|\ge |T_1|2^{m-j}/k$. 
3074: After removing all elements of $D_q\bigcap T_1$ from $T_1$
3075: we obtain a set $T_2$ that is at most a $(1-2^{m-j}/k)$th fraction 
3076: of $T_1$, that is, $|T_2|\le |T_1|(1-2^{m-j}/k)$. 
3077: 
3078: Recall that we repeat the procedure $e_j$ times where $e_j$
3079: is the number of repetitions until  $T_{e_j} = \emptyset$.
3080: It follows that $e_j \leq \lceil 2^{j-m}kn\ln2\rceil$
3081: %The number of non-covered strings in the resulting set is
3082: %at most
3083: since
3084: $$
3085: |T|(1-2^{m-j}/k)^{2^{j-m}kn\ln2}<|T|e^{-n\ln2}=|T|2^{-n}\le1.
3086: $$
3087: %That is, all $x\in T$ are covered by marked sets $D$.
3088: 
3089: Secondly, for every fixed $j=0,1, \ldots, k-1$ 
3090: there are at most $2^{k-j}$ different $t$'s ($t=1,2, \ldots , 2^k-1$)
3091: divisible by $2^j$ 
3092: and the number $d_j = 2^{k-j}e_j$
3093: of marked sets we need
3094: to use for this $j$ satisfies 
3095: $d_j \leq 2^{k-j} 2^{j-m} kn\ln2 = 2^{k-m} kn \ln2$.
3096: For all $j=0,\dots,k-1$ together we use a total number of marked sets of
3097: at most
3098: \[
3099:  \sum_{j=0}^{k-1} d_j \leq 2^{k-m} k^2 n\ln 2.
3100: \]
3101: In this way, 
3102: after every move $t=1, 2,\ldots , 2^k-1 $ of Bob,
3103: every $x$ occurring in at least
3104: $2^m$ of Alice's sets belongs to a marked set of Bob.
3105: This can be seen as follows.
3106: Assume to the contrary, that there is an $x$
3107: that occurs in at least $2^m$ of Alice's sets following move $t$ of Bob,
3108: and $x$ belongs to no set marked by Bob in step $t$ or earlier.
3109: Let $t= 2^{j_1} + 2^{j_2} + \cdots $ with $j_1>j_2>\cdots $
3110: be the binary expansion of $t$. By Bob's strategy, 
3111: the element $x$ occurs less than
3112: $2^{m}/k$ times in the first segment of $2^{j_1}$ sets of Alice, 
3113: less than $2^{m}/k$ times in the next segment of $2^{j_2}$ of Alice's 
3114: sets, and so on.
3115: Thus its total number of occurrences among the $t$ first sets of Alice is
3116: strictly less than $k 2^m/k=2^m$.
3117: The contradiction proves the claim.%
3118: %($\tau < 2^k$ moves.)
3119: %Above, we gave Bob's algorithm
3120: %for $\tau = 2^k$ moves but it is straightforward to restrict 
3121: %it to $\tau < 2^k$ moves.
3122: %Namely, Bob expands the number of moves $\tau$ as 
3123: %$2^{j_1}+2^{j_2}+ \cdots + 2^{j_l}$
3124: %with $j_1 > j_2 > \cdots > j_l$ and $j_l < k$. Then he sets $j=j_l$
3125: %and considers the {\em last} $2^j$ sets $A_{\tau-2^j+1}, \ldots, A_{\tau}$,
3126: %denoting them by $D_1, \ldots , D_{2^j}$.
3127: %(End of remark.)
3128: %\end{remark}
3129: \end{proof}
3130: Let us finish the proof of Lemma~\ref{th5}.
3131: %The strategy of Claim~\ref{l53} can be found by the brute force search
3132: %given $n$, $k$ and $m$, as follows.
3133: Given the list of $\BB$,
3134: recursively enumerate the sets in ${\BB}$ of Kolmogorov complexity 
3135: less than $k$,
3136: say $B_1, B_2, \ldots ,B_T$ with $T < 2^k$,
3137: and consider this list as a particular sequence of
3138: moves by Alice. 
3139: Use Bob's 
3140: strategy of Claim~\ref{l53} against Alice's 
3141: sequence as above. 
3142: Note that recursive enumeration of the sets in  ${\BB}$ 
3143: of Kolmogorov complexity less than $k$ means that eventually all such
3144: sets will be produced, although we do not know
3145: when the last one is produced. This only means that the time between moves
3146: is unknown, but the alternating moves between Alice and Bob are deterministic
3147: and sequential.
3148: According to Claim~\ref{l53}, Bob's strategy
3149: marks at most
3150: $O(2^{k-m}k^{2}n)$ sets.
3151: These marked sets cover
3152: every string occurring at least $2^m$ 
3153: times in the sets $B_1, B_2, \ldots ,B_T$.
3154: We do not know when the last set $B_T$ appears in this list,
3155: but Bob's winning strategy of Claim~\ref{l53} ensures
3156: that immediately after recursively enumerating $B_{i}$ 
3157: $(i \leq T)$ in the list
3158: every string that occurs in at least
3159: $2^m$ sets in the initial segment $B_1, B_2, \ldots , B_{i}$
3160: is covered by a marked set.
3161: The Kolmogorov complexity $\K(B_i)$ of every marked set $B_i$
3162: in the list $B_1, B_2, \ldots , B_T$ is upper bounded by
3163: the logarithm 
3164: of the number of
3165: marked sets, that is
3166: $k-m+O(\log k+\log n)$,
3167: plus the description of ${\BB}$,
3168: $k$, $m$, and $n$ including
3169: separators in
3170: $O(\K({\BB})+\log k+\log m+\log n)$ bits.
3171: \end{proof}
3172: We continue the proof of the theorem.
3173: Let the distortion family ${\A}$ satisfy
3174: properties 2 and 3.
3175: Consider
3176: the subfamily $\BB$ of $\A_n$ consisting of all sets $A$ with
3177: $\wwh{\log |A|}=\wwh{\log |B|}$.
3178: Let ${\BB}(x)$ be the family $\{B \in {\BB}: x \in B \}$ and
3179: $N$ the number of sets in
3180: ${\BB}(x)$ of Kolmogorov complexity at most
3181: $\K(B)$.
3182: 
3183: Given $x,\wwh{\log |B|},\A_n$ and $\K(B)$ 
3184: we can generate all $A\in\BB(x)$ of Kolmogorov complexity
3185: at most $\K(B)$.
3186: Then we can describe $B$ by its index among the generated
3187: sets. This shows that the description length 
3188: $\K(B \mid x)\le \log N$
3189: (ignoring an additive term of order $O(\log\K(B)+\log n)$ which suffices since
3190: $\K(\wwh{\log |B|})$ and $\K(\A_n)$ are both $O(\log n)$).
3191: 
3192: Since $\K({\A}_n) = O(\log n)$ by property 3,
3193: ${\BB} \subseteq {\A}_n$ while every set $A \in {\BB}$ satisfies
3194: $\lceil \log |A| \rceil = \lceil \log |B| \rceil \leq n$, we have
3195: $\K({\BB}) = O(\log n)$. Let
3196: $k=\K(B)+1$ and $m=\wh{\log N}$,
3197: and ignore additive terms of order $O(\log k+\log m + \log n)$.
3198: Applying  Lemma~\ref{th5} 
3199: shows that there is a set  $A\in \BB(x)$
3200: with $\K(A)\le k-m\le \K(B)-\K(B \mid x)=I(x:B)$ and therefore 
3201: proves Theorem~\ref{th-shannon-analog}.
3202: \end{proof}
3203: 
3204: \begin{remark}\label{rem.previously}
3205: \rm
3206: Previously an analog of Lemma~\ref{th5} was known in the case
3207: when $\BB$ is the class of \emph{all} subsets of $\booln$
3208: of {\em fixed} cardinality  $2^l$.
3209: For $l=0$ this is Exercise 4.3.8 (second edition) and 4.3.9
3210: (third edition) of \cite{LiVi97}:
3211: If a string $x$ has at least
3212: $2^m$ descriptions of length at most $k$
3213: ($p$ is called a description of
3214: $x$ if $U(p)=x$ where $U$ is
3215: the reference Turing machine), then
3216: $\K(x)\le k-m+O(\log k+\log m)$. Reference~\cite{VV02}
3217: generalizes this to all $l> 0$:
3218: If a string $x$ belongs to at least $2^m$
3219: sets $B$ of cardinality $2^l$ and Kolmogorov complexity  $\K(B)\le k$,
3220: then $x$ belongs to a set $A$ of cardinality $2^l$ and
3221: Kolmogorov complexity 
3222: $\K(A)\le k-m+O(\log m+\log k+\log l)$.
3223: \end{remark}
3224: \begin{remark}\label{rem.muchnik}
3225: \rm
3226: %{\em Off-line case:} We show that there is
3227: %a selection of $2^{r-k}\log|\X|$ sets produced by P,
3228: %that cover all $x\in\X$ that are covered by at least $2^m$ sets produced by P.
3229: %Choose at random $2^{r-k}\log|\X|$ of P's sets (all the sets
3230: %are equiprobable).
3231: %Let $x\in\X$ be covered by at least  $2^m$ sets produced by
3232: %P. Then, the probability
3233: %that $x$ is not covered by the chosen sets is at most
3234: %$$
3235: %(1-2^{k-r})^{2^{r-k}\log|\X|}\le e^{-\log|\X|}\ll 1/|\X|.
3236: %$$
3237: %Multiplying this upper bound by $|\X|$ we get less than 1.
3238: %Therefore, there is a selection of $2^{r-k}\log|\X|$ sets produced by P
3239: %that covers all $x\in\X$ with multiplicity $2^m$ or more.
3240: %
3241: %{\em On-line case:}
3242: {\em Probabilistic proof of Claim~\ref{l53}.}
3243: Consider a new game  that has the same rules and one additional
3244: rule: Bob loses if he marks more than $2^{k-m+1}(n+1)\ln2$ sets.
3245: We will prove that in this game Bob has a winning strategy.
3246: 
3247: Assume the contrary: Bob has no winning strategy. 
3248: %K\H{o}nig's
3249: %infinity lemma \cite{Ko36} implies
3250: %that every tree that contains infinitely many vertices, 
3251: %each having finite degree, has at least one infinite simple path.
3252: Since the number of moves in the game is finite (less than 
3253: $2^k$), this implies that
3254: Alice has a winning strategy.
3255: 
3256: Fix a winning strategy $S$ of Alice. To obtain a contradiction
3257: we design a randomized strategy for Bob that beats Alice's
3258: strategy $S$ with
3259: positive probability. Bob's strategy is very simple:
3260: mark every set produced by Alice with probability $p=2^{-m}(n+1)\ln2$.
3261: \begin{claim}\label{claim.iii}
3262: \rm
3263: (i)
3264: With probability more than $\frac{1}{2}$,
3265: following every move of Bob every
3266: element occurring in at least $2^m$ of Alice's sets is covered 
3267: by a marked set of Bob.
3268: 
3269: (ii) With probability more than $\frac{1}{2}$, Bob marks
3270: at most  $2^{k-m+1}(n+1)\ln2$ sets.
3271: \end{claim}
3272: 
3273: \begin{proof}
3274: (i) Fix $x$ and estimate
3275: the probability that there is a move of Bob following which $x$
3276: belongs to $2^m$ of Alice's sets 
3277: but belongs to no marked set of Bob.
3278: %We need to show that this happens with probability
3279: %less than $2^{-n-1}$.
3280: 
3281: Let $R_i$ be the event
3282: ``following a  move of Bob, string $x$
3283: occurs in at least $i$ sets of Alice
3284: but none of them is marked''.
3285: Let us
3286: prove by induction that
3287: \[
3288: \Pr [R_i]\le(1-p)^{i}.
3289: \]
3290: For $i=0$ the statement is trivial.
3291: To prove the induction step we need to show that
3292: $\Pr [R_{i+1} \mid R_i]\le 1-p$.
3293: 
3294: Let
3295: $z=z_1,z_2,\dots,z_t$ be a sequence of decisions by Bob:
3296: $z_j=1$ if Bob marks the $j$th set produced by Alice and
3297: $z_j=0$ otherwise. Call $z$ \emph{bad} if
3298: following Bob's $t$th move it happens
3299: for the first time that $x$ belongs to $i$  sets produced by Alice
3300: by move $t$ but none of them is  marked.
3301: Then $R_i$ is the disjoint union of the events
3302: ``Bob has made the decisions $z$'' (denoted by $Q_z$) over all bad $z$.
3303: Thus it is enough to prove that
3304: \[
3305: \Pr [R_{i+1} \mid Q_z]\le 1-p.
3306: \]
3307: Given that
3308: Bob has made the decisions $z$, the event $R_{i+1}$
3309: means that after those decisions the strategy $S$ will at some
3310: time in the future produce the
3311: $(i+1)$st set with member
3312: $x$ but Bob will not mark it.
3313: Bob's decision not to mark that set does not depend
3314: on any previous decision and is made with probability $1-p$.
3315: Hence
3316: $$
3317: \Pr [R_{i+1} \mid Q_z]=\Pr [\text{Alice produces 
3318: the $(i+1)$st set with member }x \;  \mid  \;Q_z]
3319: \cdot(1-p)
3320: \le1-p.
3321: $$
3322: The induction step is proved.
3323: Therefore,
3324: $\Pr [R_{2^m}]\le (1-p)^{2^m}<e^{-p2^m}=2^{-n-1}$,
3325: where the last equality follows by choice of $p$.
3326: 
3327: (ii) The expected number of marked sets is less than $p2^k$, since Alice makes fewer than $2^k$ moves. Thus, by Markov's
3328: inequality, the probability that it exceeds $p2^{k+1}$ is less than $\frac{1}{2}$.
3329: \end{proof}
3330: 
3331: It follows from Claim~\ref{claim.iii} that there exists a strategy
3332: by Bob that marks at most $2^{k-m+1}(n+1)\ln2$ sets out of Alice's
3333: produced $2^k$ sets, and following every move of Bob every
3334: element occurring in at least $2^m$ of Alice's sets is covered
3335: by a marked set of Bob. Note that we have proved that
3336: this strategy of Bob exists
3337: but we have not constructed it.
3338: Given $n$, $k$ and $m$, the number of games is finite, and  
3339: a winning strategy for Bob can be found by brute force search.
3340: %Note that the proof of Claim~\ref{l53} is constructive; 
3341: %the probabilistic proof above shows that a winning strategy for Bob's exists; 
3342: %it does not show that it is
3343: %computable given Alice's sequence. 
3344: %Multiplying this bound by the number of different $x$ we obtain $1/2$.
3345: %
3346: %Then the probability we want to estimate
3347: %is equal to
3348: %\begin{align*}
3349: %&\Pr [x \text{ is covered $i+1$ times by P's sets
3350: %and belongs to none of them}]\\
3351: %=&\sum_{\text{bad }z}
3352: %\Pr (z)\Pr [x \text{ is covered $i+1$ times by P's sets
3353: %and does not belong the $i+1$st of them}|z]\\
3354: %=&\sum_{\text{bad }z}
3355: %\Pr (z)\Pr [x \text{ is covered $i+1$ times}|z](1-p)
3356: %\le\sum_{\text{bad }z}
3357: %\Pr (z)(1-p)
3358: %\\=&
3359: %\Pr [x \text{ is covered $i$ times by P's sets and belongs to none of them}]
3360: %(1-p)
3361: %\le (1-p)^{i+1}.
3362: %\end{align*}
3363: %
3364: %
3365: %
3366: \end{remark}
3367: 
3368: \vspace{.2in}
3369: \begin{proof}{\em of Theorem~\ref{th45}}.
3370: Let $B \subseteq\{0,1\}^n$ be a set containing string $x$. Define the
3371: \emph{sufficiency deficiency of $x$ in $B$}
3372: by
3373: $$
3374: \log|B|+\K(B)-\K(x).
3375: $$
3376: This is the number of extra bits incurred by the two-part code for $x$
3377: using $B$ compared to the optimal one-part code of $x$ using $\K(x)$ bits.
3378: We relate this quantity with 
3379: the randomness deficiency $\delta(x \mid B)=\log |B|-\K(x \mid B)$
3380:  of $x$ in the set $B$.
3381: The randomness deficiency is always less than the sufficiency
3382: deficiency, and the
3383: difference between them is equal to $\K(B \mid x)$:
3384: \begin{equation}\label{eq76}
3385: \log|B|+\K(B)-\K(x)-\delta(x \mid B)=\K(B \mid x),
3386: \end{equation}
3387: where the equality follows from the symmetry of 
3388: information \eqref{eq.soi},
3389: ignoring here and later in the proof additive terms of order
3390: $O(\log\K(B)+\log n)$.
3391: 
3392: By Theorem~\ref{th-shannon-analog}, which assumes
3393: that properties 2 and 3 hold for the distortion family
3394: ${\A}$, there is  $A\in\A(x)$
3395: with $\wwh{\log|A|}=\wwh{\log|B|}$ and
3396: $\K(A)\le \K(B)-\K(B \mid x)$.
3397: Since $A_x$ is a set of minimal Kolmogorov complexity among
3398: such $A$ we have
3399: $\K(A_x)\le \K(B)-\K(B \mid x)$.
3400: Therefore
3401: \begin{align*}
3402: \K(A_x)+\log|A_x|-\K(x)&\le\K(B)-\K(B \mid x)+\log|A_x|-\K(x)\\
3403: &=
3404: \K(B)-\K(B \mid x)+\log|B|-\K(x)=\delta(x \mid B),
3405: \end{align*}
3406: where the last equality is true by~\eqref{eq76}.
3407: \end{proof}
3408: 
3409: \vspace{.2in}
3410: \begin{proof}
3411: {\em of  Theorem}~\ref{thm.dresf}.
3412: %We assume that property 2 holds for the distortion family ${\A}$.
3413: 
3414: {\em Left inequality.} 
3415: Given $\delta$, $n$, $p$, and the (discrete) graph of $r^n$, we can compute an
3416: optimal $E$ as in \eqref{eq.rndelta}  such that $r^n (\delta)
3417: = \log |E(\mathbf{X}^n)|$. Retrieve $E(x)$ %for every $x$
3418: by its index of $r^n (\delta)$ bits in the set $E(\mathbf{X}^n)$.
3419: Then,
3420: \[
3421: \K(E(x)) \leq r^n(\delta) + O(\K(\delta,r^n,X,n)).
3422: \]
3423: By definition, $r_x(\delta) \leq \K(E(x))$.
3424: Taking the expectation of $r_x(\delta)$ over
3425: $p$, we are done.
3426: 
3427: {\em Right inequality.}
3428: Define a code $E_0$ such that
3429: $\K(E_0(x)) = r_x(\delta)$
3430: for every $x \in \mathbf{X}^n$.
3431: Let $E_0(\mathbf{X}^n)$ be the range of $E_0$.
3432: Although $E_0(\mathbf{X}^n)$ cannot be computed, it is finite, and trivially
3433: \[
3434: \log |E_0(\mathbf{X}^n)| \leq \max_{x \in \mathbf{X}^n} \K(E_0(x)).
3435: \]
3436: By definition $r^n(\delta) \leq \log |E_0(\mathbf{X}^n)|$, which yields
3437:  $r^n (\delta)
3438: \leq \max_{x \in \mathbf{X}^n} r_x(\delta)$.
3439: 
3440: The noiseless coding theorem, \cite{Sh48,LiVi97}, shows that
3441: \[
3442: \sum_{x \in \mathbf{X}^n} p(x)r_x(\delta) 
3443: =  \sum_{y \in E_0(\mathbf{X}^n)} S(y) \K(y)
3444:  \geq H(S),
3445: \]
3446: with $S$ the distribution defined in the statement of the theorem.
3447: By definition, $r^n(\delta) \leq \log |\mathbf{Y}^n|$, which yields
3448: $r^n(\delta) \leq H(L)$, with $L$ as in the statement of the theorem.
3449: Together, we obtain
3450: $r^n (\delta)
3451: \leq {\bf E} r_x(\delta)+ \Delta_2$.
3452: \end{proof}
3453: 
3454: 
3455: 
3456: 
3457: \section*{Acknowledgements}
3458: We thank Alexander K. Shen for helpful suggestions.
3459: Andrei A. Muchnik gave the probabilistic proof
3460: of Claim~\ref{l53} in Remark~\ref{rem.muchnik} after having seen
3461: the deterministic proof. 
3462: Such a probabilistic proof 
3463: was independently proposed by Michal Kouck\'y.
3464: We thank the referees for their constructive comments;
3465: one referee pointed out that yet another example would be
3466: the case of Euclidean balls with the usual Euclidean distance, where
3467: the important Property 4 is proved in, for example,~\cite{VG05}.
3468: The work of N.K. Vereshchagin was done in part
3469: while visiting CWI and was supported in part by the grant
3470: 09-01-00709 from Russian Federation
3471: Basic Research Fund and by a visitors grant of NWO.
3472: The work of P.M.B. Vit\'anyi was
3473: supported in part by
3474: the BSIK Project BRICKS
3475: of the Dutch government and NWO, and by the
3476: EU NoE PASCAL (Pattern Analysis, Statistical Modeling, 
3477: and Computational Learning).
3478: 
3479: \begin{thebibliography}{99}
3480: 
3481: \bibitem{Be71}
3482: T. Berger, {\em Rate Distortion Theory: A Mathematical Basis for
3483: Data Compression}, Prentice-Hall, Englewood Cliffs, NJ, 1971.
3484: 
3485: \bibitem{BG98}
3486: T. Berger, J.D. Gibson, Lossy source coding, {\em IEEE Trans. Inform. Th.},
3487: 44:6(1998), 2693--2723.
3488: 
3489: %\bibitem{BKVV03}
3490: %H.~Buhrman, H.~Klauck, N.K. Vereshchagin, and P.M.B. Vit\'anyi.
3491: %\newblock Individual communication complexity.
3492: %\newblock In {\em Proc. 21th Symp. Theoret. Aspects of Comput. Sci.},
3493: %Lecture Notes in Computer Science, Vol. 2996, Springer-Verlag, Berlin, 2004,
3494: %19--30.
3495: 
3496: \bibitem{BW94}
3497:  M. Burrows and D. J. Wheeler, A block-sorting lossless data
3498: compression algorithm, Digital Equipment Corporation, Systems Research
3499: Center, Tech. Rep. 124, May 1994.
3500: 
3501: \bibitem{CYV97}
3502: S.C. Chang, B. Yu, M. Vetterli, Image denoising via lossy compression and
3503: wavelet thresholding, {\em Proc. Int. Conf. Image Process. (ICIP'97)},
3504: 1997, 604--607 in Volume 1.
3505: 
3506: %\bibitem{CT91}
3507: %T.M. Cover and J.A. Thomas, {\em Elements of Information Theory},
3508: %Wiley, New York, 1991.
3509: 
3510: %\bibitem{CV05}
3511: %R. Cilibrasi, P.M.B. Vitanyi, Clustering by compression, 
3512: %{\em IEEE Trans. Information Theory}, 51:4(2005)
3513: 
3514: \bibitem{Do02}
3515: D. Donoho, The Kolmogorov sampler, {\em Annals of Statistics},
3516: submitted.
3517: 
3518: %\bibitem{El57}
3519: %P. Elias, List decoding for noisy channels. {\em Wescon Convention Record,}
3520: %Part 2, Institute for Radio Engineers (now IEEE), 1957, 94--104.
3521: 
3522: %\bibitem{El91}
3523: %P. Elias, Error-correcting codes for List decoding,
3524: %{\em IEEE Trans. Inform. Th.}, 37:1(1991), 5--12. 
3525: 
3526: %\bibitem{flv}
3527: %L. Fortnow,
3528: %T. Lee, N. Vereshchagin,
3529: %Kolmogorov Complexity with Error,
3530: %{\em Proc. Symposium Theoretical Aspects of Comput. Science 2006,}
3531: %Lecture Notes in Computer Science, vol. 3884 (2006) 137--148
3532: 
3533: %\bibitem{GHLL97}
3534: %G.~Cohen, I.~Honkala, S.~Litsyn, and A.~Lobstein.
3535: %\newblock {\em Covering Codes}.
3536: %\newblock North-Holland, Amsterdam, 1997.
3537: 
3538: \bibitem{GTV01} P. G\'acs, J. Tromp, P.M.B. Vit\'anyi. 
3539: Algorithmic statistics, {\em IEEE Trans. Inform. Th.}, 47:6(2001), 2443--2463.
3540: 
3541: %\bibitem{GV03}
3542: %P.D. Gr\"unwald and P.M.B. Vit\'anyi, Shannon information and Kolmogorov
3543: %complexity, {\em IEEE Trans. Information Theory}, Submitted.
3544: %http://arxiv.org/abs/cs/0410002
3545:                                                                                
3546: %\bibitem{IP05}
3547: %iPOD + iTUNES web-page at http://www.apple.com/ipod/
3548: 
3549: %\bibitem{Ke04}
3550: %E. Keogh, S. Lonardi, and C.A. Rtanamahatana, Toward parameter-free
3551: %data mining, In: {\em Proc. 10th ACM SIGKDD Intn'l Conf. Knowledge
3552: %Discovery and Data Mining}, Seattle, Washington, USA, August 22---25, 2004,
3553: %206--215.
3554: 
3555: %\bibitem{ISW00}
3556: %R.~Impagliazzo, R.~Shaltiel, and A.~Wigderson.
3557: %\newblock Extractors and pseudo-random generators with optimal seed length.
3558: %\newblock In {\em Proceedings of the 32nd ACM Symposium on the Theory of
3559:   %Computing}, pages 1--10. ACM, 2000.
3560: 
3561: \bibitem{Ko65}
3562: A.N. Kolmogorov,
3563: {Three approaches to the quantitative definition of information},
3564: {\em Problems Inform. Transmission} 1:1 (1965) 1--7.
3565:                                                                                 
3566: \bibitem{Ko74}
3567:  A.N. Kolmogorov.
3568:  Complexity of Algorithms and Objective Definition of Randomness.
3569:  A talk at Moscow Math. Soc. meeting 4/16/1974.
3570:  An abstract available in {\em Uspekhi Mat. Nauk} 29:4(1974), 155;
3571: English translation in \cite{VV02}.
3572: 
3573: %\bibitem{LC78}
3574: %S.K. Leung-Yan-Cheong and T.M. Cover,
3575: %Some equivalences between Shannon entropy and Kolmogorov complexity,
3576: %{\em IEEE Trans. Inform. Theory},
3577: %24:3(1978), 331-338.
3578: 
3579: %\bibitem{Ko36}
3580: %D. K\H{o}nig, {\em Theorie der Endlichen und Unendlichen Graphen: 
3581: %Kombinatorische Topologie der Streckenkomplexe}, Akad. Verlag.,
3582: %Leipzig, 1936.
3583: 
3584: \bibitem{LiVi97}
3585: M. Li and P.M.B. Vit\'anyi,
3586: {\em An {I}ntroduction to {K}olmogorov {C}omplexity and {I}ts
3587:   {A}pplications},
3588: Springer-Verlag, New York, 1997 (second edition), 2008 (third edition).
3589: 
3590: \bibitem{Li01}
3591: M. Li, J.H. Badger, X. Chen, S. Kwong, P. Kearney, and H. Zhang,
3592: An information-based sequence distance and its application
3593: to whole mitochondrial genome phylogeny,
3594: {\em Bioinformatics}, 17:2(2001), 149--154.
3595: 
3596: 
3597: \bibitem{Li04}
3598: M. Li, X. Chen, X. Li, B. Ma, P.M.B. Vit\'anyi, 
3599: The similarity metric, {\em IEEE Trans. Inform. Th.}, 50:12(2004), 3250--3264.
3600: 
3601: \bibitem{Na95}
3602: B.K. Natarajan, Filtering random noise from deterministic signals via
3603: data compression, {\em IEEE Trans. on Signal Processing}, 43:11(1995), 2595--2605.
3604: 
3605: \bibitem{MK94}
3606: J. Muramatsu, F. Kanaya, Distortion-complexity and rate-distortion function,
3607: {\em IEICE Trans. Fundamentals}, E77-A:8(1994), 1224--1229. 
3608: 
3609: \bibitem{rum}
3610: Andrey Rumyantsev,
3611: Transmission of information
3612: through a noisy channel in Kolmogorov complexity setting.
3613: Vestnik MGU, Seriya Matematika i Mechanika (Russian), to appear in 2006.
3614: 
3615: \bibitem{RV06}
3616: S. de Rooij, P.M.B. Vit\'anyi,
3617: Approximating rate-distortion graphs of individual data: Experiments
3618: in lossy compression and denoising, {\em IEEE Trans. Comput.},
3619: Submitted. Also: Arxiv preprint cs.IT/0609121, 2006. 
3620: 
3621: 
3622: \bibitem{Sa94}
3623: N. Saito, Simultaneous noise suppression and signal compression
3624: using a library of orthonormal bases and the minimum description
3625: length criterion, Pp. 299--324 in {\em Wavelets in Geophysics}, 
3626: E. Foufoula-Georgiou, P. Kumar, Eds., Academic Press, 1994.
3627: 
3628: %\bibitem{salnikov}
3629: %S. Salnikov.
3630: %\newblock Kolmogorov complexity
3631: %of initial segments of binary sequences.
3632: %Manuscript, 2004.
3633: 
3634: \bibitem{Sh48}
3635: C.E. Shannon.
3636: \newblock A mathematical theory of communication.
3637: \newblock {\em Bell System Tech. J.}, 27:379--423, 623--656, 1948.
3638:                                                                                 
3639: \bibitem{Sh59}
3640: C.E. Shannon.
3641: \newblock Coding theorems for a discrete source with a fidelity criterion.
3642: \newblock In {\em IRE National Convention Record, Part 4}, pages 142--163,
3643:   1959.
3644: 
3645: \bibitem{Sh83}
3646: A.Kh. Shen, The concept of $(\alpha , \beta )$-stochasticity
3647: in the Kolmogorov sense, and its properties, {\em Soviet Math. Dokl.},
3648: 28:1(1983), 295--299.
3649: 
3650: 
3651: \bibitem{SE03}
3652: D.M. Sow, A. Eleftheriadis,
3653: Complexity distortion theory,
3654: {\em IEEE Trans. Inform. Th.}, 49:3(2003), 604--608.
3655: 
3656: \bibitem{Tu36}
3657: A.M. Turing, On computable numbers, with an application to the 
3658: Entscheidungsproblem, {\em Proc. London Mathematical Society}, 42:2(1936),
3659: 230--265, ``Correction'', 43(1937), 544--546.
3660: 
3661: 
3662: \bibitem{VV02}
3663: N.K. Vereshchagin and P.M.B. Vit\'anyi, Kolmogorov's Structure 
3664: functions and model selection, {\em IEEE Trans. Inform. Theory}, 
3665: 50:12(2004), 3265--3290.
3666: 
3667: \bibitem{VG05}
3668: J.L. Verger-Gaugry,
3669: Covering a ball with smaller equal balls in $R^n$,
3670: {\em Discrete and Computational Geometry}, 33(2005), 143--155.
3671: 
3672: %\bibitem{Wo58}
3673: %J.M. Wozencraft, List decoding. {\em Quarterly Progress Report},
3674: %Research Laboratory for Electronics, MIT, Vol. 58(1958), 90--95.
3675: 
3676: 
3677: %\bibitem{Ya89}
3678: %E.-H. Yang, The proof of Levin's conjecture,
3679: %{\em Chinese Science Bull.}, 34:21(1989), 1761--1765.
3680: 
3681: \bibitem{Vy87}
3682: V.V. V'yugin,
3683: On the defect of randomness of a finite object with respect to
3684: measures with given complexity bounds, {\em SIAM Theory Probab. Appl.},
3685: 32:3(1987), 508--512.
3686: 
3687: \bibitem{YS93}
3688: E.-H. Yang, S.-Y. Shen,
3689: Distortion program-size complexity with respect to a fidelity
3690: criterion and rate-distortion function,
3691: {\em IEEE Trans. Inform. Th.}, 39:1(1993), 288--292.
3692: 
3693: 
3694: 
3695: 
3696: \bibitem{Zi80}
3697: J. Ziv, Distortion-rate theory for individual sequences,
3698: {\em IEEE Trans. Inform. Th.}, 26:2(1980), 137--143.
3699: 
3700: %\bibitem{ZL70}
3701: %A.K. Zvonkin and L.A. Levin,
3702: %The complexity of finite objects and the development of the concepts
3703:   %of information and randomness by means of the theory of algorithms,
3704: %{\em Russian Math. Surveys} 25:6 (1970) 83-124.
3705: 
3706: 
3707: \end{thebibliography}
3708: \end{document}
3709: