cs0503026/cs0503026
1: 
2: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3: %%        On Generalized Computable Universal Priors         %%
4: %%                 and their Convergence                     %%
5: %%             Marcus Hutter: Start: 01.08.02                %%
6: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
7: 
8: \newif\ifjournal\journalfalse   % journal style versus no-style: declares the switch and sets it to false, so the non-journal branches of this preamble are the active ones
9: 
10: %-------------------------------%
11: %   Document-Style              %
12: %-------------------------------%
13: \ifjournal
14: \documentclass{elsart}          % journal branch: elsart class, no layout overrides
15: \usepackage{latexsym}           % LaTeX symbol package; presumably loaded for \Box and \unrhd, which macros further down use
16: \sloppy                         % tolerate looser interword spacing when breaking lines
17: 
18: \else
19: 
20: \documentclass[12pt,twoside]{article}   % technical-report branch: plain article class, 12pt, two-sided
21: \usepackage{latexsym}                   % same symbol package as in the journal branch
22: 
23: \pagestyle{myheadings}                  % running heads come from the \markboth that follows
24: \markboth{\sc Marcus Hutter, Technical Report, IDSIA-05-05
25: }{\sc Computable Universal Priors}
26: \setcounter{tocdepth}{4} \setcounter{secnumdepth}{2}   % contents depth 4; number headings down to level 2 only
27: \topmargin=0mm  \oddsidemargin=5mm \evensidemargin=5mm % hand-set page margins (same on odd and even pages)
28: \textwidth=15cm \textheight=22cm                       % hand-set size of the text block
29: \sloppy                                                % tolerate looser interword spacing when breaking lines
30: \fi
31: 
32: %-------------------------------%
33: %       My Math-Spacings        %
34: %-------------------------------%
35: \ifjournal
36: \def\beq{\begin{equation}}    \def\eeq{\end{equation}}         % journal branch: plain shorthands, numbered equation
37: \def\beqn{\begin{displaymath}}\def\eeqn{\end{displaymath}}     % unnumbered display
38: \def\bqa{\begin{eqnarray}}    \def\eqa{\end{eqnarray}}         % numbered equation array
39: \def\bqan{\begin{eqnarray*}}  \def\eqan{\end{eqnarray*}}       % unnumbered equation array
40: \else
41: \def\,{\mskip 3mu} \def\>{\mskip 4mu plus 2mu minus 4mu} \def\;{\mskip 5mu plus 5mu} \def\!{\mskip-3mu} % explicit math spaces redefined as fixed \mskip amounts, so they no longer follow the registers changed below
42: \def\dispmuskip{\thinmuskip= 3mu plus 0mu minus 2mu \medmuskip=  4mu plus 2mu minus 2mu \thickmuskip=5mu plus 5mu minus 2mu} % wider thin/medium/thick math spacing, switched on by the display shorthands
43: \def\textmuskip{\thinmuskip= 0mu                    \medmuskip=  1mu plus 1mu minus 1mu \thickmuskip=2mu plus 3mu minus 1mu} % tighter thin/medium/thick math spacing for running text
44: \textmuskip  % make the tight spacing the default from here on
45: \def\eqsp{\vspace{0ex}}  % vertical space inserted just before \begin and \end of every display shorthand (currently zero)
46: \def\beq{\dispmuskip\eqsp\begin{equation}}    \def\eeq{\eqsp\end{equation}\textmuskip}         % numbered equation: display spacing on entry, text spacing restored on exit
47: \def\beqn{\dispmuskip\eqsp\begin{displaymath}}\def\eeqn{\eqsp\end{displaymath}\textmuskip}     % unnumbered display, same spacing switch
48: \def\bqa{\dispmuskip\eqsp\begin{eqnarray}}    \def\eqa{\eqsp\end{eqnarray}\textmuskip}         % numbered equation array, same spacing switch
49: \def\bqan{\dispmuskip\eqsp\begin{eqnarray*}}  \def\eqan{\eqsp\end{eqnarray*}\textmuskip}       % unnumbered equation array, same spacing switch
50: \fi
51: 
52: %-------------------------------%
53: %   Macro-Definitions           %
54: %-------------------------------%
55: \ifjournal
56: \def\cal{\mathcal}  % journal branch: map the old-style \cal switch onto \mathcal
57: \else
58: \newenvironment{keyword}{\centerline{\bf\small
59: Keywords}\vspace{-1ex}\begin{quote}\small}{\par\end{quote}\vskip 1ex}
60: \fi  % the keyword environment above is defined only in the non-journal branch (presumably elsart supplies its own -- confirm)
61: \newtheorem{theorem}{Theorem}                 % owns the counter shared by the next four environments
62: \newtheorem{corollary}[theorem]{Corollary}    % numbered with the theorem counter
63: \newtheorem{lemma}[theorem]{Lemma}            % numbered with the theorem counter
64: \newtheorem{definition}[theorem]{Definition}  % numbered with the theorem counter
65: \newtheorem{tablex}[theorem]{Table}           % theorem-like "Table", numbered with the theorem counter
66: \newtheorem{figurex}[equation]{Figure}        % theorem-like "Figure", numbered with the equation counter (not the theorem counter)
67: 
68: \def\ftheorem#1#2#3{\begin{theorem}[#2]\label{#1} #3 \end{theorem} }          % #1 = label key, #2 = title passed as the optional argument, #3 = body
69: \def\fcorollary#1#2#3{\begin{corollary}[#2]\label{#1} #3 \end{corollary} }    % same argument order, corollary environment
70: \def\flemma#1#2#3{\begin{lemma}[#2]\label{#1} #3 \end{lemma} }                % same argument order, lemma environment
71: \def\fdefinition#1#2#3{\begin{definition}[#2]\label{#1} #3 \end{definition} } % same argument order, definition environment
72: \def\ftablex#1#2#3{\begin{tablex}[#2]\label{#1} #3 \end{tablex} }             % same argument order, tablex environment
73: \def\ffigurex#1#2#3#4{{#4}\begin{figurex}[#2]\label{#1}#3\end{figurex}}       % extra #4 is typeset first, in its own group, before the figurex environment (title #2, label #1, body #3)
74: 
75: \ifjournal
76: \def\paradot#1{{\itshape{#1.}}}      % journal branch: run-in heading in italics, with a period appended
77: \def\paranodot#1{{\itshape{#1}}}     % same, without the appended period
78: \else
79: \def\myparskip{\vspace{1.5ex plus 0.5ex minus 0.5ex}\noindent}  % stretchable vertical gap followed by an unindented start
80: \def\paradot#1{\myparskip{\bfseries\boldmath{#1.}}}             % non-journal branch: gap, then bold heading (bold math too) with a period appended
81: \def\paranodot#1{\myparskip{\bfseries\boldmath{#1}}}            % same, without the appended period
82: \fi
83: \def\toinfty#1{\stackrel{#1\to\infty}{\longrightarrow}}  % long arrow with "#1 -> infinity" stacked on top
84: \def\nq{\hspace{-1em}}                  % negative horizontal space of 1em
85: \def\qed{\hspace*{\fill}$\Box\quad$}    % box symbol pushed to the right by \fill, followed by a quad
86: \def\odn{{\textstyle{1\over n}}}        % text-style fraction 1/n
87: \def\odt{{\textstyle{1\over 2}}}        % text-style fraction 1/2
88: \def\odf{{\textstyle{1\over 4}}}        % text-style fraction 1/4
89: \def\eps{\varepsilon}                   % for small positive number
90: \def\epstr{\epsilon}                    % for empty string
91: \def\qmbox#1{{\quad\mbox{#1}\quad}}     % text #1 in an \mbox with a quad on each side
92: \def\argmax{\mathop{\rm arg\,max}}          % maxarg
93: \def\argmin{\mathop{\rm arg\,min}}          % minarg
94: \def\geqm{\unrhd}                       % relation symbol used in the text for domination up to a multiplicative constant
95: \def\ngeqm{{\not\unrhd}}                % negated form of \geqm
96: \def\v#1{{\bf #1}}                      % sets #1 in bold (this replaces any earlier meaning of \v)
97: \def\l{{\ell}}                          % length of string or program
98: \def\M{{\cal M}}                        % Set of prob. distributions
99: \def\X{{\cal X}}                        % input/perception set/alphabet
100: \def\Y{{\cal Y}}                        % output/action set/alphabet
101: \def\R{{\cal R}}                        % reward set subset of reals
102: \def\F{{\cal F}}                        % Generic performance measure
103: \def\I{{\cal I}}                        % some set
104: \def\S{{\cal S}}                        % some set
105: \def\Q{{\cal Q}}                        % calligraphic Q
106: \def\E{{\bf E}}                         % Expectation value
107: \def\P{{\bf P}}                         % presumably probability (the old comment "Expectation value" was duplicated from \E) -- confirm
108: \def\B{\{0,1\}}                        % Binary set (or \SetB)
109: \def\Km{K\!m}                           % K and m pulled together with \!; used in the text for the monotone complexity
110: \def\MM{M}                              % Solomonoff's prior
111: \def\th{\theta}                         % shorthand for theta
112: \def\e{{\rm e}}                        % natural e
113: \def\SetN{I\!\!N} \def\SetQ{I\!\!\!Q} \def\SetR{I\!\!R} \def\SetZ{Z\!\!\!Z}  % number-set symbols faked by overlapping two letters with negative spaces
114: \def\lb{\log}                           % NOTE(review): the name suggests a binary logarithm, but it expands to plain \log -- confirm intent
115: \def\sumprime{\mathop{{\sum\nolimits'}}}   % primed summation sign wrapped as an operator
116: \def\text#1{\mbox{\scriptsize{#1}}}    % if not using amstex
117: 
118: \begin{document}
119: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
120: %                      T i t l e - P a g e                      %
121: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
122: 
123: \ifjournal
124: 
125: \begin{frontmatter}
126: \title{On Generalized Computable Universal Priors and their Convergence}
127: \author{Marcus Hutter}
128: \address{IDSIA, Galleria 2, CH-6928 Manno-Lugano, Switzerland \\
129: marcus@idsia.ch \hspace{9ex} http://www.idsia.ch/$^{_{_\sim}}\!$marcus}
130: 
131: \thanks{A preliminary version appeared
132:   in the proceedings of the ALT 2003 conference \cite{Hutter:03unipriors}.
133:   This work was supported by SNF grant 2000-61847.00 to J\"urgen Schmidhuber.}
134: 
135: \else
136: 
137: \title{\vskip -25mm\normalsize\sc Technical Report \hfill IDSIA-05-05
138: \vskip 2mm\bf\LARGE\hrule height5pt \vskip 3mm
139: \sc On Generalized Computable Universal Priors and their Convergence%
140: \thanks{A preliminary version appeared
141:   in the proceedings of the ALT 2003 conference \cite{Hutter:03unipriors}.\newline
142:   \hspace*{4ex}This work was supported by SNF grant 2000-61847.00 to J\"urgen Schmidhuber.}
143: \vskip 2mm \hrule height2pt \vskip 5mm}
144: \author{{\bf Marcus Hutter}\\[3mm]
145: \normalsize IDSIA, Galleria 2, CH-6928\ Manno-Lugano, Switzerland\\
146: \normalsize marcus@idsia.ch \hspace{8.5ex} http://www.idsia.ch/$^{_{_\sim}}\!$marcus}
147: \date{11 March 2005}
148: \maketitle
149: 
150: \fi
151: 
152: \begin{abstract}
153: \noindent Solomonoff unified Occam's razor and Epicurus' principle
154: of multiple explanations to one elegant, formal, universal theory
155: of inductive inference, which initiated the field of algorithmic
156: information theory. His central result is that the posterior of
157: the universal semimeasure $\MM$ converges rapidly to the true
158: sequence generating posterior $\mu$, if the latter is computable.
159: Hence, $M$ is eligible as a universal predictor in case of unknown
160: $\mu$. The first part of the paper investigates the existence and
161: convergence of computable universal (semi)measures for a hierarchy
162: of computability classes: recursive, estimable, enumerable, and
163: approximable. For instance, $\MM$ is known to be enumerable, but
164: not estimable, and to dominate all enumerable semimeasures. We
165: present proofs for discrete and continuous semimeasures. The
166: second part investigates more closely the types of convergence,
167: possibly implied by universality: in difference and in ratio, with
168: probability 1, in mean sum, and for Martin-L{\"o}f random
169: sequences. We introduce a generalized concept of randomness for
170: individual sequences and use it to exhibit difficulties regarding
171: these issues. In particular, we show that convergence fails
172: (holds) on generalized-random sequences in gappy (dense) Bernoulli
173: classes.
174: \end{abstract}
175: 
176: \begin{keyword}
177: Sequence prediction;
178: Algorithmic Information Theory;
179: Solomonoff's prior;
180: universal probability;
181: mixture distributions;
182: posterior convergence;
183: computability concepts;
184: Martin-L{\"o}f randomness.
185: \end{keyword}
186: 
187: \ifjournal\end{frontmatter}\else\pagebreak\fi
188: 
189: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
190: \section{Introduction}\label{secIntro}
191: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
192: 
193: %Induction and Occam's razor
194: All induction problems can be phrased as sequence prediction
195: tasks. This is, for instance, obvious for time-series prediction,
196: but also includes classification tasks. Having observed data $x_t$
197: at times $t<n$, the task is to predict the $n$-th symbol $x_n$ from
198: sequence $x=x_1...x_{n-1}$.
199: %
200: The key concept to attack general induction problems is {\em
201: Occam's razor} (simplicity) principle, which says that ``{\em
202: Entities should not be multiplied beyond necessity}.'' and to a
203: lesser extent Epicurus' principle of multiple explanations. The
204: former/latter may be interpreted as to keep the simplest/all
205: theories consistent with the observations $x_1...x_{t-1}$ and to
206: use these theories to predict $x_t$.
207: %
208: %Kolmogorov complexity
209: Kolmogorov (and others) defined the complexity of a string as the
210: length of its shortest description on a universal Turing machine.
211: The Kolmogorov complexity $K$ is an excellent universal complexity
212: measure, suitable for quantifying Occam's razor. There is (only)
213: one disadvantage: $K$ is not computable.
214: 
215: % computability concepts
216: More precisely, a function $f$ is said to be {\em recursive} (or
217: {\em finitely computable}) if there exists a Turing machine that,
218: given $x$, computes $f(x)$ and then halts. Some functions are not
219: recursive but still {\em approximable} (or {\em limit-computable})
220: in the sense that there is a nonhalting Turing machine with an
221: infinite ($x$-dependent) output sequence $y_1,y_2,y_3,...$ and
222: $\lim_{t\to\infty}y_t=f(x)$. If additionally the output sequence
223: is monotone increasing/decreasing, then $f$ is said to be {\em
224: lower/upper semicomputable} (or {\em enumerable/co-enumerable}).
225: Finally we call $f$ {\em estimable} if some Turing machine, given
226: $x$ and a precision $\eps$, finitely computes an
227: $\eps$-approximation of $f(x)$.
228: %
229: The major algorithmic property of $K$ is that it is co-enumerable,
230: but not recursive.
231: 
232: %Solomonoff's universal prior
233: More suitable for predictions is Solomonoff's
234: \cite{Solomonoff:64,Solomonoff:78} {\em universal prior} $\MM(x)$
235: defined as the probability that the output of a universal monotone Turing
236: machine $U$ starts with string $x$ when provided with fair
237: coin flips on the input tape. $\MM(x)$ is enumerable and roughly
238: $2^{-K(x)}$, hence implementing Occam's and also Epicurus'
239: principles.
240: 
241: %Universal sequence prediction (dominance and convergence)
242: Assume now that strings $x$ are sampled from a probability
243: distribution $\mu$, i.e.\ the probability of a string starting
244: with $x$ shall be $\mu(x)$.
245: %
246: The probability of observing $x_t$ at time $t$, given past
247: observations $x_1...x_{t-1}$ is
248: $\mu(x_t|x_1...x_{t-1})=\mu(x_1...x_t)/\mu(x_1...x_{t-1})$.
249: %
250: Solomonoff's \cite{Solomonoff:78} central result is that the
251: universal posterior
252: $\MM(x_t|x_1...x_{t-1})=\MM(x_1...x_t)/\MM(x_1...x_{t-1})$
253: converges rapidly to the true (objective) posterior probability
254: $\mu(x_t|x_1...x_{t-1})$, if $\mu$ is an estimable measure, hence
255: $\MM$ can be used for predictions in case of unknown $\mu$.
256: %
257: One representation of $\MM$ is as a $2^{-K(\mu)}$-weighted sum of
258: {\em all} enumerable ``defective'' probability measures, called
259: semimeasures.
260: %
261: The (from this representation obvious) dominance $\MM(x) \geq
262: 2^{-K(\mu)}\mu(x)$ for all enumerable $\mu$ is the central
263: ingredient in the convergence proof.
264: 
265: %General mixture distributions
266: Dominance and convergence immediately generalize to arbitrary
267: weighted sums of (semi)measures of some arbitrary countable set
268: $\M$.
269: %
270: So what is so special about the class of all enumerable
271: semimeasures $\M_{enum}^{semi}$? The larger we choose $\M$ the
272: less restrictive is the essential assumption that $\M$ should
273: contain the true distribution $\mu$.
274: %
275: Why not restrict to the still rather general class of estimable or
276: recursive (semi)measures? For {\em every} countable
277: class $\M$ and $\xi_\M(x):=\sum_{\nu\in\M} w_\nu \nu(x)$ with
278: $w_\nu>0$, the important dominance $\xi_\M(x)\geq w_\nu
279: \nu(x)\,\forall\nu\in\M$ is satisfied. The question is what
280: properties $\xi_\M$ possesses. The distinguishing property of
281: $\M_{enum}^{semi}$ is that $\MM=\xi_{\M_{enum}^{semi}}$ is itself
282: an element of $\M_{enum}^{semi}$.
283: %
284: On the other hand, for prediction, $\xi_\M\in\M$ is not by itself
285: an important property. What matters is  whether $\xi_\M$ is
286: computable (in one of the senses we defined above) to avoid
287: getting into the (un)realm of non-constructive math.
288: 
289: %1st goal of this work
290: Our first contribution is to classify the existence of generalized
291: computable (semi)measures.
292: %
293: From \cite{Zvonkin:70} we know that there is an enumerable
294: semimeasure (namely $\MM$) that dominates all enumerable
295: semimeasures in $\M_{enum}^{semi}$. We show that there is {\em no}
296: estimable semimeasure that dominates all recursive measures (also
297: mentioned in \cite{Zvonkin:70}), and there is {\em no}
298: approximable semimeasure that dominates all approximable measures.
299: From this it follows that for a universal (semi)measure that at
300: least satisfies the weakest form of computability, namely being
301: approximable, the largest dominated class among the classes
302: considered in this work is the class of enumerable semimeasures.
303: This is the distinguishing property of $\M_{enum}^{semi}$ and
304: $\MM$.
305: %
306: This investigation was motivated by recent
307: generalizations of Kolmogorov complexity and Solomonoff's prior by
308: Schmidhuber \cite{Schmidhuber:00toe,Schmidhuber:02gtm}.
309: 
310: %2nd goal of this work
311: The second contribution is to investigate more closely the types of
312: convergence, possibly implied by universality: in difference and
313: in ratio, with probability 1, in mean sum, and for Martin-L{\"o}f
314: random sequences.
315: %
316: We introduce a generalized concept of randomness for individual
317: sequences and use it to exhibit difficulties regarding these
318: issues. More concretely, we consider countable classes $\M$ of
319: Bernoulli environments and show that $\xi_\M$ converges to $\mu$
320: on all generalized random sequences if and only if the class is
321: dense.
322: 
323: %------------------------------%
324: \paradot{Contents}
325: %------------------------------%
326: In Section~\ref{secCC} we review various computability concepts
327: and discuss their relation.
328: %
329: In Section~\ref{secUniM} we define the prefix Kolmogorov
330: complexity $K$, the concept of (semi)measures, Solomonoff's
331: universal prior $\MM$, and explain its universality.
332: %
333: Section~\ref{secUSP} summarizes Solomonoff's major convergence
334: result, discusses general mixture distributions and the important
335: universality property -- multiplicative dominance.
336: %
337: In Section~\ref{secUSM} we define seven classes of (semi)measures
338: based on four computability concepts. Each class may or may not
339: contain a (semi)measure that dominates all elements of another
340: class. We reduce the analysis of these 49 cases to four basic
341: cases. Domination (essentially by $\MM$) is known to be true for
342: two cases. The other two cases do not allow for domination.
343: %
344: In Section~\ref{secConv} we investigate more closely the type of
345: convergence implied by universality. We summarize the result on
346: posterior convergence in difference $(\xi-\mu\to 0)$ and improve
347: the previous result \cite{Li:97} on the convergence in ratio
348: $\xi/\mu\to 1$ by showing rapid convergence without use
349: of martingales.
350: %
351: In Section~\ref{secMLconv} we investigate whether convergence for
352: all Martin-L{\"o}f random sequences could hold. We define a
353: generalized concept of randomness for individual sequences and use
354: it to show that proofs based on universality cannot decide this
355: question.
356: %
357: Section~\ref{secConc} concludes the paper.
358: 
359: %------------------------------%
360: \paradot{Notation}
361: %------------------------------%
362: %Strings
363: We denote strings of length $n$ over finite alphabet $\X$ by
364: $x=x_1x_2...x_n$ with $x_t\in\X$ and further abbreviate
365: $x_{1:n}:=x_1x_2...x_{n-1}x_n$ and $x_{<n}:=x_1... x_{n-1}$,
366: $\epstr$ for the empty string, $\l(x)$ for the length of string $x$,
367: and $\omega=x_{1:\infty}$ for infinite sequences.
368: We write $xy$ for the concatenation of string $x$ with $y$.
369: %
370: % Asymptotic notation
371: We abbreviate $\lim_{n\to\infty}[f(n)-g(n)]=0$ by
372: $f(n)\toinfty{n}g(n)$ and say $f$ converges to $g$, without
373: implying that $\lim_{n\to\infty}g(n)$ itself exists. We write
374: $f(x)\geqm  g(x)$ for $g(x)=O(f(x))$, i.e.\ if $\exists c>0:
375: f(x)\geq c g(x)\forall x$.
376: 
377: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
378: \section{Computability Concepts}\label{secCC}
379: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
380: % computability concepts
381: We define several computability concepts weaker than can be captured
382: by halting Turing machines.
383: 
384: %------------------------------%
385: \fdefinition{defCompFunc}{Computable functions}{
386: %------------------------------%
387: We consider functions $f:\SetN\to\SetR$:
388: \begin{itemize}\ifjournal\parskip=0ex\parsep=0ex\itemsep=0.5ex\fi
389: \item[]
390: $\nq f$ is {\em recursive} or {\em finitely computable} {\it iff}
391: there are Turing machines $T_{1/2}$ with output interpreted as natural
392: numbers and $f(x)={T_1(x)\over T_2(x)}$,
393: \item[]
394: $\nq f$ is {\em approximable} or {\em limit-computable} {\it iff}
395: $\exists$ recursive $\phi(\cdot,\cdot)$ with
396: $\lim_{t\to\infty}\phi(x,t)=f(x)$.
397: \item[]
398: $\nq f$ is {\em enumerable} or {\em lower semicomputable} {\it
399: iff} additionally $\phi(x,t)\leq\phi(x,t+1)$.
400: \item[]
401: $\nq f$ is {\em co-enumerable} or {\em upper semicomputable} {\it
402: iff} $[-f]$ is lower semicomputable.
403: \item[]
404: $\nq f$ is {\em semicomputable} {\it iff} $f$ is lower- {\it or}
405: upper semicomputable.
406: \item[]
407: $\nq f$ is {\em estimable} {\it iff} $f$ is lower- {\it and} upper
408: semicomputable.
409: \end{itemize}
410: }%------------------------------%
411: 
412: \noindent If $f$ is estimable we can finitely compute an
413: $\eps$-approximation of $f$ by upper and lower semicomputing $f$
414: and terminating when differing by less than $\eps$. This means
415: that there is a Turing machine which, given $x$ and $\eps$,
416: finitely computes $\hat y\in\SetQ$ such that $|\hat y-f(x)|<\eps$.
417: Moreover it gives an interval estimate $f(x)\in[\hat y-\eps,\hat
418: y+\eps]$. An estimable integer-valued function is recursive (take
419: any $\eps<\odt$).
420: %
421: Note that if $f$ is only approximable or semicomputable we can
422: still come arbitrarily close to $f(x)$ but we cannot devise a
423: terminating algorithm that produces an $\eps$-approximation. In
424: the case of lower/upper semicomputability we can at least
425: finitely compute lower/upper bounds to $f(x)$. In case of
426: approximability, the weakest computability form, even this
427: capability is lost.
428: 
429: \begin{center}\small
430: \fbox{\parbox{11ex}{recursive=\\ finitely\\ computable}}
431: $\Rightarrow$
432: \fbox{\parbox{9ex}{estimable}}
433: %
434: \parbox{26ex}{\raisebox{-3ex}{$\Rightarrow$} \fbox{
435: \parbox{17ex}{enumerable=\\lower semi-\\ computable}}
436: \raisebox{-3ex}{$\Rightarrow$} \\[2ex]
437: \raisebox{3ex}{$\Rightarrow$} \fbox{
438: \parbox{17ex}{co-enumerable=\\ upper semi-\\
439: computable}} \raisebox{3ex}{$\Rightarrow$}}
440: \fbox{\parbox{11ex}{semi-\\ computable}}
441: $\Rightarrow$
442: \fbox{\parbox{18ex}{approximable=\\ limit-computable}}
443: \end{center}
444: 
445: \noindent What we call {\em estimable/recursive/finitely
446: computable} is often just called {\em computable}, but it makes
447: sense to separate the concepts in this work, since finite
448: computability is conceptually easier and some previous results
449: have only been proved for this case. Sometimes we use
450: the word {\em computable} generically for some of the
451: computability forms of Definition~\ref{defCompFunc}.
452: 
453: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
454: \section{The Universal Prior $\MM$}\label{secUniM}
455: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
456: 
457: % Universal prior
458: The prefix Kolmogorov complexity $K(x)$ is defined as the length
459: of the shortest binary (prefix) program $p\in\B^*$ for which a
460: universal prefix Turing machine $U$ (with binary program tape and
461: $\X$ary output tape) outputs string $x\in\X^*$, and similarly
462: $K(x|y)$ in case of side information $y$
463: \cite{Kolmogorov:65,Levin:74,Gacs:74,Chaitin:75}:
464: \beqn
465:   K(x)=\min\{\l(p):U(p)=x\},\qquad
466:   K(x|y)=\min\{\l(p):U(p,y)=x\}
467: \eeqn
468: Solomonoff \cite[Eq.(7)]{Solomonoff:64} defined (earlier) the
469: closely related quantity, the universal posterior
470: $\MM(y|x)=M(xy)/M(x)$.
471: %
472: The universal prior $M(x)$ can be defined as the probability that
473: the output of a universal monotone Turing machine $U$ starts with
474: $x$ when provided with fair coin flips on the input tape.
475: Formally, $\MM$ can be defined as
476: \beq\label{Mdef}
477:   \MM(x)\;:=\;\sum_{p\;:\;U(p)=x*}\nq 2^{-\l(p)}
478: \eeq
479: where the sum is over minimal programs $p$ for which $U$ outputs a
480: string starting with $x$. The so-called minimal programs are
481: defined similarly to the prefix programs, but $U$ need not
482: halt, which is indicated by the $*$. Minimal programs are those
483: which are to the left of the input head at the moment when $U$ wrote the
484: last bit of $x$ \cite{Li:97,Hutter:04uaibook}.
485: %
486: Before we can discuss the stochastic properties of $\MM$ we
487: need the concept of (semi)measures for strings.
488: 
489: %------------------------------%
490: \fdefinition{defSemi}{Continuous (Semi)measures}{
491: %------------------------------%
492: $\mu(x)$ denotes the probability that a sequence starts
493: with string $x$. We call $\mu\geq 0$ a (continuous) semimeasure if
494: $\mu(\epstr)\leq 1$ and $\mu(x)\geq\sum_{a\in\X}\mu(xa)$, and a
495: (probability) measure if equalities hold.
496: }%------------------------------%
497: 
498: % motivation of nomenclature
499: \noindent The reason for calling $\mu$ with the above property a
500: probability measure is that it satisfies Kolmogorov's axioms of
501: probability in the following sense: The sample space is
502: $\X^\infty$ with elements
503: $\omega=\omega_1\omega_2\omega_3...\in\X^\infty$ being infinite
504: sequences over alphabet $\X$. The set of events (the
505: $\sigma$-algebra) is defined as the
506: set generated from the cylinder sets
507: $\Gamma_{x_{1:n}}:=\{\omega:\omega_{1:n}=x_{1:n}\}$ by countable
508: union and complement. A probability
509: measure $\mu$ is uniquely defined by giving its values
510: $\mu(\Gamma_{x_{1:n}})$ on the cylinder sets, which we abbreviate
511: by $\mu(x_{1:n})$. We will also call $\mu$ a measure, or even more
512: loosely a probability distribution.
513: 
514: \noindent We have $\sum_{a\in\X}\MM(xa)<\MM(x)$ because there are
515: programs $p$ that output $x$, not followed by any $a\in\X$.
516: They just stop after printing $x$ or continue forever without any
517: further output. Together with $\MM(\epstr)=1$ this shows that $\MM$
518: is a semimeasure, but {\it not} a probability measure. We can now
519: state the fundamental property of $\MM$ \cite{Zvonkin:70,Solomonoff:78}:
520: 
521: %------------------------------%
522: \ftheorem{thUniM}{Universality of $\MM$}{
523: %------------------------------%
524: The universal prior $\MM$ is an enumerable semimeasure that
525: multiplicatively dominates all enumerable semimeasures in the
526: sense that $\MM(x) \;\geqm\; 2^{-K(\rho)}\cdot \rho(x)$
527: for all enumerable semimeasures $\rho$. $\MM$ is enumerable, but not
528: estimable (nor recursive).
529: }%------------------------------%
530: 
531: % Explanation
532: \noindent The Kolmogorov complexity of a function like $\rho$ is
533: defined as the length of the shortest self-delimiting code of a
534: Turing machine computing this function in the sense of Definition
535: \ref{defCompFunc}. Up to a multiplicative constant, $\MM$ assigns higher
536: probability to all $x$ than any other computable probability
537: distribution.
538: 
539: % Normalization of $\MM$
540: It is possible to normalize $\MM$ to a true probability measure
541: $\MM_{norm}$ \cite{Solomonoff:78,Li:97} with dominance still being
542: true, but at the expense of giving up enumerability ($\MM_{norm}$
543: is still approximable). $\MM$ is more convenient when studying
544: algorithmic questions, but a true probability measure like
545: $\MM_{norm}$ is more convenient when studying stochastic questions.
546: 
547: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
548: \section{Universal Sequence Prediction}\label{secUSP}
549: %\subsection{Solomonoff's Universal Sequence Prediction Scheme}
550: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
551: 
552: % Occam & Epicurus in $\MM =2^-K$
553: In which sense does $\MM$ incorporate Occam's razor and Epicurus'
554: principle of multiple explanations? Since the shortest programs
555: $p$ dominate the sum in $M$, $\MM(x)$ is roughly equal to
556: $2^{-K(x)}$ ($\MM(x)=2^{-K(x)+O(K(\l(x)))}$), i.e.\
557: $\MM$ assigns high probability to simple
558: strings. More useful is to think of $x$ as being the observed
559: history. We see from (\ref{Mdef}) that every program $p$
560: consistent with history $x$ is allowed to contribute to $\MM$
561: (Epicurus). On the other hand, shorter programs give significantly
562: larger contribution (Occam). How does all this affect prediction?
563: If $\MM(x)$ describes our (subjective) prior belief in $x$, then
564: $\MM(y|x):=\MM(xy)/\MM(x)$ must be our posterior belief in $y$.
565: %
566: From the symmetry of algorithmic information $K(xy)\approx
567: K(y|x)+K(x)$, and $\MM(x)\approx 2^{-K(x)}$ and $\MM(xy)\approx
568: 2^{-K(xy)}$ we get $\MM(y|x)\approx 2^{-K(y|x)}$. This tells us
569: that $\MM$ predicts $y$ with high probability iff $y$ has an easy
570: explanation, given $x$ (Occam \& Epicurus).
571: 
572: % Caution
573: The above qualitative discussion should not create the impression
574: that $\MM(x)$ and $2^{-K(x)}$ always lead to predictors of
575: comparable quality. Indeed, in the online/incremental setting,
576: $K(y)=O(1)$ invalidates the consideration above. The proof of
577: (\ref{eukdist}) below, for instance, depends on $\MM$ being a
578: semimeasure and the chain rule being exactly true, neither of which is
579: satisfied by $2^{-K(x)}$. See \cite{Hutter:03unimdl} for a
580: detailed analysis.
581: 
582: % Solomonoff's universal sequence prediction
583: Sequence prediction algorithms try to predict the continuation
584: $x_t\in\X$ of a given sequence $x_1...x_{t-1}$.
585: %
586: The following bound shows that $M$ predicts computable sequences well:
587: \beq\label{eqDetMbnd}
588:   \sum_{t=1}^\infty(1\!-\!\MM(x_t|x_{<t}))^2 \;\leq\;
589:   -\odt \sum_{t=1}^\infty\ln \MM(x_t|x_{<t}) \;=\;
590:   -\odt\ln\MM(x_{1:\infty}) \;\leq\;
591:   \odt\ln 2\cdot \Km(x_{1:\infty}),
592: \eeq
593: where the monotone complexity
594: $\Km(x_{1:\infty})=\min\{\l(p):U(p)=x_{1:\infty}\}$ is defined as
595: the length of the shortest (nonhalting) program computing
596: $x_{1:\infty}$ \cite{Zvonkin:70,Levin:73random}. In the first
597: inequality we have used $(1-a)^2\leq-\odt\ln a$ for $0\leq a\leq
598: 1$. In the equality we exchanged the sum with the logarithm and
599: eliminated the resulting product by the chain rule. In the last inequality
600: we used $\MM(x)\geq 2^{-\Km(x)}$, which follows from
601: (\ref{Mdef}) by dropping all terms in $\sum_p$ except for the
602: shortest $p$ computing $x$. If $x_{1:\infty}$ is a computable
603: sequence, then $\Km(x_{1:\infty})$ is finite, which implies
604: $\MM(x_t|x_{<t})\to 1$
605: ($\sum_{t=1}^\infty(1-a_t)^2<\infty\Rightarrow a_t\to 1$). This
606: means that if the environment is a computable sequence
607: (whichsoever, e.g.\ the digits of $\pi$ or $e$ in $\X$ary
608: representation), after having seen the first few digits, $\MM$
609: correctly predicts the next digit with high probability, i.e.\ it
610: recognizes the structure of the sequence.
611: 
612: Assume now that the true sequence is
613: drawn from a computable
614: probability distribution $\mu$, i.e.\ the true (objective)
615: probability of $x_{1:t}$ is $\mu(x_{1:t})$. The probability of
616: $x_t$ given $x_{<t}$ hence is
617: $\mu(x_t|x_{<t})=\mu(x_{1:t})/\mu(x_{<t})$.
618: %
619: Solomonoff's \cite{Solomonoff:78} central result is that $\MM$
620: converges to $\mu$. More precisely, for binary alphabet, he showed that
621: \beq\label{eukdist}
622:   \sum_{t=1}^\infty
623:   \nq\nq\;\sum_{\qquad x_{<t}\in\B^{t-1}}\nq\nq\;
624:   \mu(x_{<t}) \Big(\MM(0|x_{<t})-\mu(0|x_{<t})\Big)^2
625:   \;\leq\;
626:   {\odt}\ln 2\!\cdot\!K(\mu)+O(1) \;<\; \infty.
627: \eeq
628: The infinite sum can only be finite if the difference
629: $\MM(0|x_{<t})-\mu(0|x_{<t})$ tends to zero for $t\to\infty$ with
630: $\mu$-probability $1$ (see Definition~\ref{defConv}$(i)$ and
631: \cite{Hutter:01alpha} or Section~\ref{secConv} for general
632: alphabet). This holds for {\it any} computable probability
633: distribution $\mu$. The reason for the astonishing property of a
634: single (universal) function to converge to {\it any} computable
635: probability distribution lies in the fact that the sets of
636: $\mu$-random sequences differ for different $\mu$. Past data
637: $x_{<t}$ are exploited to get a (with $t\to\infty$) improving
638: estimate $\MM(x_t|x_{<t})$ of $\mu(x_t|x_{<t})$.
639: 
640: % Bayes mixtures
641: The universality property (Theorem~\ref{thUniM}) is the central
642: ingredient in the proof of (\ref{eukdist}). The proof
643: involves the construction of a semimeasure $\xi$
644: whose dominance is obvious. The hard part is to show its
645: enumerability and equivalence to $\MM$.
646: Let $\M$ be the (countable) set of all enumerable semimeasures
647: and define
648: \beq\label{xidef}
649:   \xi(x):=\sum_{\nu\in\M}2^{-K(\nu)}\nu(x).
650: \eeq
651: Then dominance
652: \beq\label{xidom}
653:  \xi(x)\geq 2^{-K(\nu)}\nu(x)\quad\forall\,\nu\in\M
654: \eeq
655: is obvious. Is $\xi$ lower semicomputable? To answer this
656: question one has to be more precise. Levin \cite{Zvonkin:70} has
657: shown that the set of {\em all} lower semicomputable semimeasures
658: is enumerable (with repetitions). For this (ordered multi) set
659: $\M=\M_{enum}^{semi}:=\{\nu_1,\nu_2,\nu_3,...\}$ and
660: $K(\nu_i):=K(i)$ one can easily see that $\xi$ is lower
661: semicomputable. Finally proving $\MM(x)\geqm\xi(x)$ also
662: establishes universality of $\MM$ (see \cite{Solomonoff:78,Li:97}
663: for details).
664: 
665: The advantage of $\xi$ over $\MM$ is that it immediately
666: generalizes to arbitrary weighted sums of (semi)measures
667: for arbitrary countable $\M$.
668: 
669: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
670: \section{Universal (Semi)Measures}\label{secUSM}
671: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
672: 
673: What is so special about the set of all enumerable
674: semimeasures $\M_{enum}^{semi}$? The larger we choose $\M$ the less restrictive
675: is the assumption that $\M$ should contain the true distribution
676: $\mu$, which will be essential throughout the paper.
677: %
678: Why not restrict to the still rather general class of estimable
679: or recursive (semi)measures? It is clear that for every
680: countable (multi)set $\M$, the universal or mixture distribution
681: \beq\label{defxi}
682:   \xi(x):=\xi_\M(x):=\sum_{\nu\in\M} w_\nu \nu(x)
683:   \qmbox{with} \sum_{\nu\in\M}w_\nu\leq 1 \qmbox{and} w_\nu>0
684: \eeq
685: dominates all $\nu\in\M$. This dominance is
686: necessary for the desired convergence $\xi\to\mu$ similarly to
687: (\ref{eukdist}). The question is what properties $\xi$ possesses.
688: The distinguishing property of $\M_{enum}^{semi}$ is that $\xi$ is
689: itself an element of $\M_{enum}^{semi}$. When concerned with
690: predictions, the important property is not $\xi_\M\in\M$ by itself,
691: but whether $\xi$ is computable in one of the senses of
692: Definition~\ref{defCompFunc}. We define
693: \bqan
694:  \M_1\geqm\M_2 & :\Leftrightarrow &
695:  \mbox{there is an element of $\M_1$ that dominates all elements of
696:  $\M_2$} \\
697:  & :\Leftrightarrow &
698: \exists\rho\!\in\!\M_1\;\forall\nu\!\in\!\M_2\;\exists w_\nu\!>\!0
699: \;\forall x:\rho(x)\!\geq\!w_\nu\nu(x).
700: \eqan
701: $\geqm $ is transitive (but not necessarily reflexive) in the
702: sense that $\M_1 \geqm \M_2 \geqm \M_3$ implies $\M_1 \geqm \M_3$
703: and $\M_0 \supseteq \M_1 \geqm \M_2 \supseteq \M_3$ implies $\M_0
704: \geqm \M_3$.
705: %
706: For the computability concepts introduced in Section~\ref{secCC}
707: we have the following proper set inclusions
708: \beqn
709: \begin{array}{ccccccc}
710:   \M_{rec}^{msr}  & \subset & \M_{est}^{msr}  & \equiv  & \M_{enum}^{msr}  & \subset & \M_{appr}^{msr} \\
711:         \cap       &         &      \cap       &         &       \cap       &         &     \cap        \\
712:   \M_{rec}^{semi} & \subset & \M_{est}^{semi} & \subset & \M_{enum}^{semi} & \subset & \M_{appr}^{semi}
713: \end{array}
714: \eeqn
715: %
716: where $\M^{msr}_c$ stands for the set of all probability measures
717: of appropriate computability type $c\in\{$rec=recursive, est=estimable, enum=enumerable,
718: appr=approximable$\}$, and similarly for semimeasures
719: $\M^{semi}_c$. From an enumeration of a measure $\rho$ one can
720: construct a co-enumeration by exploiting
721: $\rho(x_{1:n})=1-\sum_{y_{1:n}\neq x_{1:n}}\rho(y_{1:n})$. This
722: shows that every enumerable measure is also co-enumerable, hence
723: estimable, which proves the identity $\equiv$ above.
724: 
725: With this notation, Theorem~\ref{thUniM} implies
726: $\M_{enum}^{semi}\geqm\M_{enum}^{semi}$. Transitivity allows one to
727: conclude, for instance, that
728: $\M_{appr}^{semi}\geqm\M_{rec}^{msr}$, i.e.\ that there is an
729: approximable semimeasure that dominates all recursive measures.
730: 
731: The standard ``diagonalization'' way of proving
732: $\M_1\ngeqm\M_2$ is to take an arbitrary
733: $\mu\in\M_1$ and ``increase'' it to $\rho$ such that
734: $\mu\ngeqm\rho$ and show that $\rho\in\M_2$.
735: There are $7\times 7$ combinations of (semi)measures $\M_1$ with
736: $\M_2$ for which $\M_1\geqm\M_2$ could be true or false. There are
737: four basic cases, explicated in the following theorem, from which
738: all 49 combinations displayed in Table~\ref{tabUniSMsr}
739: follow by transitivity.
740: 
741: %------------------------------%
742: \ftheorem{thNoUniApp}{Universal (semi)measures}{
743: %------------------------------%
744: A semimeasure $\rho$ is said to be universal for $\M$ if it
745: multiplicatively dominates all elements of $\M$ in the sense
746: $\forall\nu\!\in\!\M\,\exists w_\nu>0:\rho(x)\geq w_\nu\nu(x)\,\forall x$. The
747: following holds true:
748: \begin{list}{}{\parsep=1ex}
749: \item[$o)$]
750: $\exists\rho:\{\rho\}\geqm\M$: For every countable set
751: of (semi)measures $\M$, there is a (semi)measure that dominates
752: all elements of $\M$.
753: \item[$i)$]
754: $\M_{enum}^{semi}\geqm\M_{enum}^{semi}$:
755: The class of enumerable semimeasures {\em contains}
756: a universal element.
757: \item[$ii)$]
758: $\M_{appr}^{msr}\geqm\M_{enum}^{semi}$:
759: There {\em is} an approximable measure that dominates all enumerable
760: semimeasures.
761: \item[$iii)$]
762: $\M_{est}^{semi}\ngeqm\M_{rec}^{msr}$: There is
763: {\em no} estimable semimeasure that dominates all recursive
764: measures.
765: \item[$iv)$]
766: $\M_{appr}^{semi}\ngeqm\M_{appr}^{msr}$: There is
767: {\em no} approximable semimeasure that dominates all approximable
768: measures.
769: \end{list}
770: }%------------------------------%
771: 
772: \begin{table}[thb]
773: \ftablex{tabUniSMsr}{Existence of universal (semi)measures}{%
774: The entry in row $r$ and column $c$ indicates whether there is an
775: $r$-able (semi)measure $\rho$ dominating the set $\M$ that contains all
776: $c$-able (semi)measures, where $r,c\in\{$recurs, estimat, enumer,
777: approxim$\}$. Enumerable measures are estimable. This is the
778: reason why the enum.\ row and column in case of measures are
779: missing. The superscript indicates from which part of Theorem
780: \ref{thNoUniApp} the answer follows. For the bold face entries
781: directly, for the others using transitivity of $\geqm $.
782: \begin{center}
783: \begin{tabular}{|c|c||c|c|c|c||c|c|c|}\hline
784:       $\nwarrow$ &  $\M$ & \multicolumn{4}{c||}{semimeasure} & \multicolumn{3}{c|}{measure}\\ \hline
785: $\rho$&$\searrow$& rec.      & est.       & enum.         & appr.     & rec.          & est.       & appr.        \\ \hline\hline
786:       s  & rec. & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & no$^{iii}$    & no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}
787:       e  & est.  & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & {\bf no}$^{\bf iii}$& no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}
788:       m  & enum. & yes$^{i}$  & yes$^{i}$  & {\bf yes}$^{\bf i}$ & no$^{iv}$ & yes$^{i}$     & yes$^{i}$  & no$^{iv}$    \\ \cline{2-9}
789:       i  &appr.  & yes$^{i}$  & yes$^{i}$  & yes$^{i}$     & no$^{iv}$ & yes$^{i}$     & yes$^{i}$  & {\bf no}$^{\bf iv}$\\ \hline\hline
790:       m  & rec.  & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & no$^{iii}$    & no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}
791:       s  & est.  & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & no$^{iii}$    & no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}
792:       r  &appr.  & yes$^{ii}$ & yes$^{ii}$ & {\bf yes}$^{\bf ii}$& no$^{iv}$ & yes$^{ii}$    & yes$^{ii}$ & no$^{iv}$    \\ \hline
793: \end{tabular}
794: \end{center}}
795: \end{table}
796: 
797: \noindent If we ask for a universal (semi)measure that at least
798: satisfies the weakest form of computability, namely being
799: approximable, we see that the largest dominated set among the 7
800: sets defined above is the set of enumerable semimeasures. This is
801: the reason why $\M_{enum}^{semi}$ plays a special role. On the
802: other hand, $\M_{enum}^{semi}$ is not the largest set dominated by
803: an approximable semimeasure, and indeed no such largest set
804: exists. One may, hence, ask for ``natural'' larger sets $\M$. One
805: such set, namely the set of cumulatively enumerable semimeasures
806: $\M_{\text{CEM}}$, has recently been discovered by Schmidhuber
807: \cite{Schmidhuber:00toe,Schmidhuber:02gtm}, for which even
808: $\xi_{\text{CEM}}\in\M_{\text{CEM}}$ holds.
809: 
810: \noindent Theorem~\ref{thNoUniApp} also holds for {\em discrete
811: (semi)measures} $P$ defined as follows:
812: 
813: %------------------------------%
814: \fdefinition{defDSemi}{Discrete (semi)measures}{
815: %------------------------------%
816: $P(x)$ denotes the probability of $x\in\SetN$. We call
817: $P:\SetN\to[0,1]$ a discrete (semi)measure if $\sum_{x\in\SetN}
818: P(x)\stackrel{(<)}=1$.
819: }%------------------------------%
820: %
821: Theorem~\ref{thNoUniApp}
822: $(i)$ is Levin's major result \cite[Thm.4.3.1 \& Thm.4.5.1]{Li:97}, %
823: and $(ii)$ is due to Solomonoff \cite{Solomonoff:78}. %
824: The proof of $\M_{rec}^{semi}\ngeqm\M_{rec}^{semi}$ in
825: \cite[p249]{Li:97} contains minor errors and is not extensible to
826: $(iii)$, and the proof in \cite[p276]{Li:97} only applies to
827: infinite alphabet and not to the binary/finite case considered
828: here. $\M_{est}^{semi}\ngeqm\M_{est}^{semi}$
829: is mentioned in \cite{Zvonkin:70} without proof.
830: %
831: A direct proof of $(iv)$ can be found in \cite{Hutter:04uaibook}.
832: %
833: Here, we reduce $(iv)$ to $(iii)$ by exploiting the following
834: elementary fact (well-known for integer-valued functions, see
835: e.g.\ \cite[p634]{Simpson:77}):
836: 
837: %------------------------------%
838: \flemma{lemOracle}{Approximable = $H$-estimable}{
839: %------------------------------%
840: A function is approximable iff it is estimable with the help of
841: the halting oracle.
842: }%------------------------------%
843: 
844: %------------------------------%
845: \paradot{Proof}
846: %------------------------------%
847: By $H$-computable we mean computable with the help of the
848: halting oracle, or equivalently, computable under extra input of
849: the halting sequence $h=h_{1:\infty}\in\B^\infty$, where $h_n=1$
850: $:\Leftrightarrow$ $U(n)$ halts.
851: 
852: Assume $f$ is approximable, i.e.\ $\forall\eps\exists y,m:
853: R(m,y,\eps)$, where relation $R(m,y,\eps):=[\forall n\geq
854: m:|f_n(x)-y|<\eps]$ and recursive $f_n\to f$. Fix $\eps>0$.
855: Search (dovetail) for $m\in\SetN$ and $y$ ($\in\odt\eps\SetZ$ is
856: sufficient) such that $R(m,y,\eps)=$true. $R$ is
857: co-enumerable, hence $H$-decidable, hence $y$ can be $H$-computed,
858: hence $f$ is $H$-estimable, since $f(x)=y\pm O(\eps)$.
859: 
860: Now assume that $f$ is $H$-estimable, i.e.\ $\exists T\in$TM
861: $\forall\eps,x:|T(x,\eps,h)-f(x)|<\eps$. Since $h$ is
862: enumerable, $T$ and hence $f$ are approximable. More formally,
863: let $h_n^t=1$ $:\Leftrightarrow$ $U(n)$ halts within $t$ steps.
864: Then $g(x,\eps) := T(x,\eps,h) = T(x,\eps,\lim_{t\to\infty}h^t) =
865: \lim_{t\to\infty}T(x,\eps,h^t)$ is approximable, where the
866: exchange of limits holds, since $T$ only reads $n_{x\eps}<\infty$
867: bits of $h$ and $h_{1:n_{x\eps}}=h^t_{1:n_{x\eps}}$ for
868: sufficiently large $t$. \qed
869: 
870: 
871: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
872: \section{Proof of Theorem~\ref{thNoUniApp}}\label{secProof}
873: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
874: 
875: We first prove the theorem for discrete (semi)measures $P$ (Definition
876: \ref{defDSemi}), since it contains the essential ideas in a
877: cleaner form. We then present the proof for continuous
878: (semi)measures $\mu$ (Definition~\ref{defSemi}). We present proofs
879: for binary alphabet $\X=\B$ only. The proofs naturally generalize from
880: binary to arbitrary finite alphabet. $\arg\min_x f(x)$ is the $x$
881: that minimizes $f(x)$. Ties are broken in an arbitrary but
882: computable way (e.g.\ by taking the smallest $x$).
883: 
884: %------------------------------%
885: \paradot{Proof (discrete case)}\\%
886: %------------------------------%
887: \paranodot{(o)} $Q(x):=\sum_{P\in\M}w_P P(x)$
888: with $w_P>0$ obviously dominates all $P\in\M$ (with constant
889: $w_P$). With $\sum_P w_P=1$ and all $P$ being discrete
890: (semi)measures also $Q$ is a discrete (semi)measure.
891: 
892: \paranodot{(i)} See \cite[Thm.4.3.1]{Li:97}.
893: 
894: \paranodot{(ii)} Let $P$ be the universal element in
895: $\M_{enum}^{semi}$ and $\alpha:=\sum_x P(x)$. We normalize $P$ by
896: $Q(x):={1\over\alpha}P(x)$. Since $\alpha\leq 1$ we have $Q(x)\geq
897: P(x)$. Hence $Q\geq P\geqm\M_{enum}^{semi}$. As a
898: ratio between two enumerable functions, $Q$ is still approximable,
899: hence $\M_{appr}^{msr}\geqm\M_{enum}^{semi}$.
900: 
901: \paranodot{(iii)}
902: Let $P\in\M_{rec}^{semi}$. We partition $\SetN$ into chunks
903: $I_n:=\{2^{n-1},...,2^n-1\}$ ($n\geq 1$) of increasing size. With
904: $x_n:=\arg\min_{x\in I_n}P(x)$ we define $Q(x_n):={1\over
905: n(n+1)}\forall n$ and $Q(x):=0$ for all other $x$. Exploiting that
906: a minimum is smaller than an average and that $P$ is a
907: semimeasure, we get
908: \beqn
909: P(x_n)=\min_{x\in I_n}P(x)\leq{1\over|I_n|}\sum_{x\in
910: I_n}P(x)\leq{1\over|I_n|}={1\over 2^{n-1}}= {n(n+1)\over
911: 2^{n-1}} Q(x_n).
912: \eeqn
913: Since ${n(n+1)\over 2^{n-1}}\to 0$ for $n\to\infty$, $P$ cannot
914: dominate $Q$ ($P\ngeqm Q$). With $P$ also $Q$
915: is recursive. Since $P$ was an arbitrary recursive semimeasure
916: and $Q$ is a recursive measure ($\sum Q(x)=\sum[{1\over
917: n(n+1)}]=\sum[{1\over n}-{1\over n+1}]=1$) this implies
918: $\M_{rec}^{semi}\ngeqm\M_{rec}^{msr}$.
919: 
920: Assume now that there is an estimable semimeasure
921: $S\geqm\M_{rec}^{msr}$. We construct a recursive semimeasure
922: $P\geqm S$ as follows. Choose an initial $\eps>0$ and finitely
923: compute an $\eps$-approximation $\hat S$ of $S(x)$. If $\hat
924: S>2\eps$ define $P(x):=\odt\hat S$, else halve $\eps$ and repeat
925: the process. Since $S(x)>0$ (otherwise it could not dominate,
926: e.g.\ $T(x):={1\over x(x+1)}\in\M_{rec}^{msr}$) the loop
927: terminates after finite time. So $P$ is recursive. Inserting $\hat
928: S=2P(x)$ and $\eps<\odt\hat S=P(x)$ into $|S(x)-\hat S|<\eps$ we
929: get $|S(x)-2P(x)|<P(x)$, which implies $S(x)\geq P(x)$ and
930: $S(x)\leq 3P(x)$. The former implies $\sum_x P(x)\leq \sum_x
931: S(x)\leq 1$, i.e.\ $P$ is a semimeasure. The latter implies
932: $P\geq{1\over 3}S\geqm\M_{rec}^{msr}$. Hence $P$ is a recursive
933: semimeasure dominating all recursive measures, which contradicts
934: what we have proven in the first half of $(iii)$. Hence the
935: assumption on $S$ was wrong which establishes
936: $\M_{est}^{semi}\ngeqm\M_{rec}^{msr}$.
937: 
938: \paranodot{(iv)} From $(iii)$ we know that
939: $\M_{est}^{semi}\ngeqm\M_{est}^{msr}$. The proof and hence result
940: remains valid under the halting oracle, i.e.\
941: $\M_{H\text{-}est}^{semi}\ngeqm\M_{H\text{-}est}^{msr}$. By Lemma
942: \ref{lemOracle}, the $H$-estimable functions/(semi)measures coincide
943: with the approximable functions/(semi)measures, hence
944: $\M_{appr}^{semi}\ngeqm\M_{appr}^{msr}$. \qed
945: 
946: %------------------------------%
947: \paradot{Proof (continuous case)}\\%
948: %------------------------------%
949: The major difference to the discrete case is that one also has to
950: take care that $\rho(x)\stackrel{(>)}=\rho(x0)+\rho(x1)$, $x\in\B^*$, is
951: respected. On the other hand, the chunking $I_n:=\B^n$ is more
952: natural here.
953: 
954: \paranodot{(o)} $\rho(x):=\sum_{\nu\in\M}w_\nu \nu(x)$ with $w_\nu>0$
955: obviously dominates all $\nu\in\M$ (with domination constant
956: $w_\nu$). With $\sum_\nu w_\nu=1$ and all $\nu$ being
957: (semi)measures also $\rho$ is a (semi)measure.
958: 
959: \paranodot{(i)} See \cite[Thm.4.5.1]{Li:97}.
960: 
961: \paranodot{(ii)} Let $\xi$ be a universal element in $\M_{enum}^{semi}$.
962: We define \cite{Solomonoff:78}
963: \beqn
964:   \xi_{norm}(x_{1:n}) \;:=\;
965:   \prod_{t=1}^n{\xi(x_{1:t}) \over \xi(x_{<t}0)+\xi(x_{<t}1)}.
966: \eeqn
967: By induction one can show that $\xi_{norm}$ is a measure and
968: that $\xi_{norm}(x)\geq\xi(x)\forall x$, hence
969: $\xi_{norm}\geq\xi\geqm\M_{enum}^{semi}$. As a ratio
970: of enumerable functions, $\xi_{norm}$ is still approximable, hence
971: $\M_{appr}^{msr}\geqm\M_{enum}^{semi}$.
972: 
973: \paranodot{(iii)} Analogous to the discrete case we could start by
974: recursively defining $x_k^*:=\arg\min_{x_k}\mu(x_{<k}^*x_k)$ for
975: $\mu\in\M_{rec}^{semi}$. See \cite{Hutter:03unipriors} for a proof
976: along this line.
977: %
978: Simpler is to directly consider $\mu\in\M_{est}^{semi}$ and to
979: compute $x^*_{1:\infty}$ recursively by computing some
980: $\eps$-approximation $e(x_k|x^*_{<k})$ of $\mu(x_k|x^*_{<k})$ and
981: define $x^*_k=\arg\min_{x_k}e(x_k|x^*_{<k})$, which implies
982: $\mu(x^*_k|x^*_{<k})\leq\odt+\eps$. Finally we define measure
983: $\rho$ by $\rho(x_{1:k}^*)=1\forall k$ and $\rho(x)=0$ for all $x$
984: that are not prefixes of $x_{1:\infty}^*$.
985: %
986: Hence
987: $\mu(x_{1:n}^*)\leq(\odt+\eps)^n=(\odt+\eps)^n\rho(x_{1:n}^*)$,
988: which demonstrates that $\mu$ does not dominate $\rho$ for
989: $\eps<\odt$. Since $\mu\in\M_{est}^{semi}$ was arbitrary and
990: $\rho$ is a recursive measure, this implies
991: $\M_{est}^{semi}\ngeqm\M_{rec}^{msr}$.
992: 
993: \paranodot{(iv)} Identical to discrete case. \qed
994: 
995: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
996: \section{Posterior Convergence}\label{secConv}
997: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
998: 
999: We investigated in detail the computational properties of
1000: various mixture distributions $\xi$. A mixture $\xi_\M$
1001: multiplicatively dominates all distributions in $\M$. We
1002: mentioned that dominance implies posterior convergence. In this
1003: section we present in more detail what dominance implies and what
1004: not.
1005: 
1006: Convergence of $\xi(x_t|x_{<t})$ to $\mu(x_t|x_{<t})$ with
1007: $\mu$-probability 1 tells us that $\xi(x_t|x_{<t})$ is close to
1008: $\mu(x_t|x_{<t})$ for sufficiently large $t$ on `most'
1009: sequences $x_{1:\infty}$. It says nothing about the speed of
1010: convergence, nor whether convergence is true for any {\em particular}
1011: sequence (of measure 0). Convergence {\em in mean sum} defined
1012: below is intended to capture the rate of convergence,
1013: Martin-L\"{o}f randomness is used to capture convergence
1014: properties for individual sequences.
1015: 
1016: Martin-L\"{o}f randomness is a very important concept of
1017: randomness of individual sequences, which is closely related to
1018: Kolmogorov complexity and Solomonoff's universal prior. Levin gave
1019: a characterization equivalent to Martin-L\"{o}f's original
1020: definition \cite{Levin:73random}:
1021: 
1022: %------------------------------%
1023: \ftheorem{defML}{Martin-L\"{o}f random sequences}{
1024: %------------------------------%
1025: A sequence $x_{1:\infty}$ is $\mu$-Martin-L\"{o}f random
1026: ($\mu$.M.L.) iff there is a constant $c$ such that
1027: $\MM(x_{1:n})\leq c\cdot \mu(x_{1:n})$ for all $n$.
1028: }%------------------------------%
1029: 
1030: \noindent An  equivalent formulation for estimable $\mu$ is:
1031: \beq\label{KmMLr}
1032:   x_{1:\infty} \mbox{ is $\mu$.M.L.-random}
1033:   \quad\Leftrightarrow\quad
1034:   \Km(x_{1:n})= -\log\mu(x_{1:n})+O(1) \;\forall n
1035: \eeq
1036: Theorem~\ref{defML} follows from
1037: (\ref{KmMLr}) by exponentiation, ``using $2^{-\Km}\approx\MM$''
1038: and noting that $\MM\geqm\mu$ follows from universality of $\MM$.
1039: Consider the special case of $\mu$ being a fair coin, i.e.\
1040: $\mu(x_{1:n})=2^{-n}$, then $x_{1:\infty}$ is M.L.\ random {\em
1041: iff} $\Km(x_{1:n})=n+O(1)$, i.e.\ if $x_{1:n}$ is incompressible.
1042: For general $\mu$, $-\lb\mu(x_{1:n})$ is the length of the
1043: Shannon-Fano code of $x_{1:n}$, hence $x_{1:\infty}$ is
1044: $\mu$.M.L.-random {\em iff} the Shannon-Fano code is optimal.
1045: 
1046: One can show that a $\mu$.M.L.-random sequence $x_{1:\infty}$
1047: passes {\em all} thinkable effective randomness tests, e.g.\ the
1048: law of large numbers, the law of the iterated logarithm, etc.
1049: In particular, the set of all $\mu$.M.L.-random sequences has
1050: $\mu$-measure 1.
1051: %
1052: The following generalization is natural when considering general
1053: Bayes mixtures $\xi$ as in this work:
1054: 
1055: %------------------------------%
1056: \fdefinition{defmuMr}{$\mu/\xi$-random sequences}{
1057: %------------------------------%
1058: A sequence $x_{1:\infty}$ is called $\mu/\xi$-random
1059: ($\mu.\xi$.r.) iff there is a constant $c$ such that
1060: $\xi(x_{1:n})\leq c\cdot \mu(x_{1:n})$ for all $n$.
1061: }%------------------------------%
1062: 
1063: Typically, $\xi$ is a mixture over some $\M$ as defined in
1064: (\ref{defxi}), in which case the reverse inequality
1065: $\xi(x)\geqm\mu(x)$ is also true (for all $x$). For finite $\M$ or
1066: if $\xi\in\M$, the definition of $\mu/\xi$-randomness depends only
1067: on $\M$, and not on the specific weights $w_\nu$ used in $\xi$. For
1068: $\M=\M_{enum}^{semi}$, $\mu/\xi$-randomness is just
1069: $\mu$.M.L.-randomness. The larger $\M$, the more patterns are
1070: recognized as nonrandom.
1071: Roughly speaking, those regularities characterized by some
1072: $\nu\in\M$ are recognized by $\mu/\xi$-randomness, i.e.\ for
1073: $\M\subset\M_{enum}^{semi}$ some $\mu/\xi$-random strings may not
1074: be M.L.\ random.
1075: %
1076: Other randomness concepts, e.g.\ those by Schnorr, Ko, van
1077: Lambalgen, Lutz, Kurtz, von Mises, Wald, and Church (see
1078: \cite{Wang:96,Lambalgen:87,Schnorr:71}), could possibly also be
1079: characterized in terms of $\mu/\xi$-randomness for particular
1080: choices of $\M$.
1081: 
1082: %------------------------------%
1083: %\paradot{Convergence of Random Sequences}%\label{secConvRSeq}
1084: %------------------------------%
1085: A classical (nonrandom)
1086: real-valued sequence $a_t$ is defined to converge to $a_*$, short
1087: $a_t\to a_*$ if $\forall\eps\exists t_0\forall t\geq
1088: t_0:|a_t-a_*|<\eps$. We are interested in convergence properties
1089: of random sequences $z_t(\omega)$ for $t\to\infty$ (e.g.\
1090: $z_t(\omega)=\xi(\omega_t|\omega_{<t})-\mu(\omega_t|\omega_{<t})$).
1091: %
1092: We denote $\mu$-expectations by $\E$. The expected value of a
1093: function $f:\X^t\to\SetR$, dependent on $x_{1:t}$, independent of
1094: $x_{t+1:\infty}$, and possibly undefined on a set of $\mu$-measure
1095: 0, is $\E[f] =
1096: \sumprime_{\!x_{1:t}\in\X^t}\mu(x_{1:t})f(x_{1:t})$. The prime
1097: denotes that the sum is restricted to $x_{1:t}$ with
1098: $\mu(x_{1:t})\neq 0$. Similarly we use $\P[..]$ to denote the
1099: $\mu$-probability of event $[..]$.
1100: %
1101: We define four convergence concepts for random sequences.
1102: 
1103: %------------------------------%
1104: \fdefinition{defConv}{Convergence of random sequences}{
1105: %------------------------------%
1106: Let $z_1(\omega),z_2(\omega),...$ be a sequence of real-valued
1107: random variables. $z_t$ is said to
1108: converge for $t\to\infty$ to (random variable) $z_*$
1109: \begin{list}{}{\itemsep=1ex\leftmargin=8ex}
1110: \item[$i)$] with probability 1 (w.p.1) $:\Leftrightarrow$
1111:   $\P[\{\omega:z_t\to z_*\}]=1$,
1112: \item[$ii)$] in mean sum (i.m.s.) $:\Leftrightarrow$
1113: $\sum_{t=1}^\infty\E[(z_t-z_*)^2]<\infty$,
1114: \item[$iii)$] for every $\mu$-Martin-L{\"o}f random sequence ($\mu$.M.L.) $:\Leftrightarrow$ \\
1115: $\forall\omega:$ If $[\exists c\forall n:
1116: \MM(\omega_{1:n})\leq c\mu(\omega_{1:n})]$
1117:   then $z_t(\omega)\to z_*(\omega)$ for $t\to\infty$,
1118: \item[$iv)$] for every $\mu/\xi$-random sequence ($\mu.\xi$.r.) $:\Leftrightarrow$ \\
1119: $\forall\omega:$ If $[\exists c\forall n:
1120: \xi(\omega_{1:n})\leq c\mu(\omega_{1:n})]$
1121:   then $z_t(\omega)\to z_*(\omega)$ for $t\to\infty$.
1122: \end{list}
1123: }%------------------------------%
1124: 
1125: \noindent In statistics, $(i)$ is the ``default'' characterization of
1126: convergence of random sequences.
1127: %
1128: Convergence i.m.s.\ $(ii)$ is very strong: it provides a rate of
1129: convergence in the sense that the expected number of times $t$ in
1130: which $z_t$ deviates more than $\eps$ from $z_*$ is finite and
1131: bounded by $c/\eps^2$ and the probability that the number of
1132: $\eps$-deviations exceeds $c\over\eps^2\delta$ is smaller than
1133: $\delta$, where $c:=\sum_{t=1}^\infty\E[(z_t-z_*)^2]$.
1134: Nothing can be said for {\em which} $t$ these deviations occur.
1135: If, additionally, $|z_t-z_*|$ were monotone decreasing, then
1136: $|z_t-z_*|=o(t^{-1/2})$ could be concluded.
1137: %
1138: $(iii)$ uses Martin-L\"{o}f's notion of randomness of {\em individual}
1139: sequences to define convergence M.L. Since this work
1140: deals with general Bayes mixtures $\xi$, we generalized in $(iv)$
1141: the definition of convergence M.L.\ based on $\MM$ to
1142: convergence $\mu.\xi$.r.\ based on $\xi$ in a natural way.
1143: %
1144: One can show that convergence i.m.s.\ implies convergence w.p.1.
1145: Also convergence M.L.\ implies convergence w.p.1.
1146: %
1147: Universality of $\xi$ implies the following posterior convergence results:
1148: 
1149: %------------------------------%
1150: %\paradot{Convergence of $\xi$ to $\mu$}\label{subsecConv}
1151: %------------------------------%
1152: 
1153: %------------------------------%
1154: \ftheorem{thConv}{Convergence of $\xi$ to $\mu$}{
1155: %------------------------------%
1156: Let there be sequences $x_1x_2...$ over a finite alphabet $\X$
1157: drawn with probability $\mu(x_{1:n})\in\M$ for the first $n$
1158: symbols, where $\mu$ is a measure and $\M$ a countable set of
1159: (semi)measures. The universal/mixture posterior probability
1160: $\xi(x_t|x_{<t})$
1161: of the next symbol $x_t$ given $x_{<t}$
1162: is related to the true posterior probability $\mu(x_t|x_{<t})$
1163: in the following way:\vspace{-1ex}
1164: \beqn
1165:    \sum_{t=1}^n\E{\textstyle\left[\left(\sqrt{{\xi(x_t|x_{<t})
1166:           \over\mu(x_t|x_{<t})}}-1\right)^2\right]} \;\leq\;
1167:    \sum_{t=1}^n\E\bigg[\sum_{x'_t}
1168:         \left(\sqrt{\xi(x'_t|x_{<t})}-\sqrt{\mu(x'_t|x_{<t})}\right)^2\bigg]
1169:         \;\leq\; \ln{w_\mu^{-1}} \;<\; \infty
1170: \eeqn
1171: where $w_\mu$ is the weight (\ref{defxi}) of $\mu$ in $\xi$.
1172: }%------------------------------%
1173: 
1174: \noindent Theorem~\ref{thConv} implies
1175: \beqn
1176:  \mbox{$\sqrt{\xi(x'_t|x_{<t})} \to \sqrt{\mu(x'_t|x_{<t})}$
1177:  for any $x'_t$ and
1178:  $\sqrt{{\xi(x_t|x_{<t})\over\mu(x_t|x_{<t})}} \to 1$, both
1179:  i.m.s.\ for $t\to\infty$}.
1180: \eeqn
1181: %
1182: \noindent The latter strengthens the result
1183: $\xi(x_t|x_{<t})/\mu(x_t|x_{<t})\to 1$ w.p.1 derived by G\'acs
1184: \cite[Thm.5.2.2]{Li:97} in that it also provides the ``speed'' of
1185: convergence.
1186: 
1187: Note also the subtle difference between the two convergence
1188: results. For {\em any} sequence $x'_{1:\infty}$ (possibly constant
1189: and not necessarily $\mu$-random),
1190: $\mu(x'_t|x_{<t})-\xi(x'_t|x_{<t})$ converges to zero w.p.1
1191: (referring to $x_{1:\infty}$), but no statement is possible for
1192: $\xi(x'_t|x_{<t})/\mu(x'_t|x_{<t})$, since
1193: $\lim\,\inf\mu(x'_t|x_{<t})$ could be zero. On the other hand, if
1194: we stay {\em on}-sequence ($x'_{1:\infty} =
1195: x_{1:\infty}$), we have $\xi(x_t|x_{<t})/\mu(x_t|x_{<t})
1196: \to 1$ w.p.1 (whether $\inf\mu(x_t|x_{<t})$ tends to zero or not does
1197: not matter).
1198: %
1199: Indeed, it is easy to give an example where
1200: $\xi(x'_t|x_{<t})/\mu(x'_t|x_{<t})$ diverges. If we choose
1201: \beqn
1202:   \M=\{\mu_1,\mu_2\},\quad
1203:   \mu\!\equiv\!\mu_1,\quad
1204:   \mu_1(1|x_{<t})=\odt t^{-3} \qmbox{and}
1205:   \mu_2(1|x_{<t})=\odt t^{-2}
1206: \eeqn
1207: the contribution of $\mu_2$ to $\xi$ causes $\xi$ to fall
1208: off like $\mu_2 \sim t^{-2}$, much slower than $\mu \sim
1209: t^{-3}$ causing the quotient to diverge:
1210: \bqan
1211: \mu_1(0_{1:n}) &\!=\!& \prod_{t=1}^n(1-\odt
1212: t^{-3})\stackrel{n\to\infty}\longrightarrow c_1=0.450...>0
1213: \;\Rightarrow\; 0_{1:\infty}\;\mbox{is a
1214: $\mu$-random sequence},
1215: \\
1216: \mu_2(0_{1:n}) &\!=\!& \prod_{t=1}^n(1\!-\!\odt
1217: t^{-2})\stackrel{n\to\infty}\longrightarrow c_2=0.358...>0
1218: \;\Rightarrow\; \xi(0_{1:n})
1219: \to w_1c_1+w_2c_2=:c_\xi>0
1220: \\
1221: \xi(0_{<t}1) &\!=\!&
1222: w_1\mu_1(1|0_{<t})\mu_1(0_{<t})+w_2\mu_2(1|0_{<t})\mu_2(0_{<t})\to
1223: \odt w_2c_2 t^{-2}
1224: \eqan
1225: \beqn
1226: \Rightarrow \quad\xi(1|0_{<t})= {\xi(0_{<t}1)\over \xi(0_{<t})}
1227: \rightarrow {w_2c_2\over 2c_\xi}t^{-2}
1228: \quad\Rightarrow\quad
1229: {\xi(1|0_{<t})\over\mu(1|0_{<t})}\to {w_2c_2\over c_\xi}t\to\infty\quad \mbox{diverges}.
1230: \eeqn
1231: 
1232: %------------------------------%
1233: \paradot{Proof}
1234: %------------------------------%
1235: For a probability distribution $y_i\geq 0$ with $\sum_i y_i=1$ and a
1236: semi-distribution $z_i\geq 0$ with $\sum_i z_i\leq 1$ and
1237: $i\in\{1,...,N\}$, the Hellinger distance $h(\vec
1238: y,\vec z):=\sum_i(\sqrt{y_i}-\sqrt{z_i})^2$ is upper bounded by the relative
1239: entropy $d(\vec
1240: y,\vec z)=\sum_i y_i\ln{y_i\over z_i}$ (and $0\ln{0\over z}:=0$).
1241: %
1242: This can be seen as follows: For arbitrary $0\leq y\leq 1$ and
1243: $0\leq z\leq 1$ we define
1244: \bqan
1245:   f(y,z) &:=& y\ln{y\over z}-(\sqrt{y}-\sqrt{z})^2+z-y =
1246:   2y g(\sqrt{z/y})
1247: \\
1248:   \qmbox{with}
1249:   g(t) &:=& -\ln t+t-1\geq 0.
1250: \eqan
1251: This shows $f\geq 0$,
1252: and hence $\sum_i f(y_i,z_i)\geq 0$, which implies
1253: \beqn
1254:   \sum_i y_i\ln{y_i\over z_i}-\sum_i(\sqrt{y_i}-\sqrt{z_i})^2 \geq
1255:   \sum_i y_i- \sum_i z_i \geq 1-1 = 0.
1256: \eeqn
1257: The (conditional) $\mu$-expectations of a function $f:\X^t\to\SetR$ are defined as
1258: \beqn
1259:  \E[f]=\sumprime_{x_{1:t}\in\X^t}\!\!\mu(x_{1:t})f(x_{1:t})
1260:  \qmbox{and}
1261:   \E_t[f]:=\E[f|x_{<t}]=\sumprime_{x_t\in\X}\mu(x_t|x_{<t})f(x_{1:t}),
1262: \eeqn
1263: where $\sumprime$ sums over all $x_t$ or $x_{1:t}$ for which
1264: $\mu(x_{1:t})\neq 0$.
1265: If we insert
1266: $\X=\{1,...,N\}$,
1267: $N=|\X|$,
1268: $i=x_t$,
1269: $y_i=\mu_t:=\mu(x_t|x_{<t})$, and
1270: $z_i=\xi_t:=\xi(x_t|x_{<t})$
1271: into $h$ and $d$ we get (w.p.1)
1272: \beqn\label{distdD}
1273:   h_t(x_{<t}) \;:=\; \textstyle \sum_{x_t}
1274:   (\sqrt{\mu_t}\!-\!\sqrt{\xi_t})^2 \qquad \leq \qquad
1275:   d_t(x_{<t}) \;:=\; \textstyle
1276:   \sum_{x_t}\mu_t\ln{\mu_t \over \xi_t} =
1277:   \E_t[\ln{\mu_t\over\xi_t}].
1278: \eeqn
1279: %
1280: Taking the expectation $\E$ and the sum $\sum_{t=1}^n$ we get
1281: \beq\label{entropyapp}
1282:   \sum_{t=1}^n
1283:   \E[d_t(x_{<t})] =
1284:   \sum_{t=1}^n\E[\E_t[
1285:   \ln{\mu_t\over\xi_t}]] =
1286:   \E[
1287:   \ln \prod_{t=1}^n{\mu_t\over\xi_t}] =
1288:   \E[
1289:   \ln{\mu(x_{1:n}) \over \xi(x_{1:n})}] \leq
1290:   \ln{w_\mu^{-1}}
1291: \eeq
1292: where we have used $\E[\E_t[..]]=\E[..]$ and exchanged the $t$-sum
1293: with the expectation $\E$, which transforms to a product inside
1294: the logarithm. In the last equality we have used the chain rule for
1295: $\mu$ and $\xi$. Using universality $\xi(x_{1:n})\geq
1296: w_\mu\mu(x_{1:n})$ yields the final inequality. Finally
1297: \beqn
1298:   \E_t\bigg[\Big(\sqrt{\xi_t\over \mu_t}-1\Big)^2\bigg] =
1299:   \sum_{x_t}\!'\mu_t
1300:   \Big(\sqrt{\xi_t\over \mu_t}-1\Big)^2  =
1301:   \sum_{x_t}\!'(\sqrt{\xi_t}-\sqrt{\mu_t})^2 \leq
1302:   h_t(x_{<t})\leq
1303:   d_t(x_{<t}).
1304: \eeqn
1305: Taking the expectation $\E$ and the sum $\sum_{t=1}^n$ and
1306: chaining the result with (\ref{entropyapp}) yields Theorem
1307: \ref{thConv}. \qed
1308: 
1309: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1310: \section{Convergence in Martin-L{\"o}f Sense}\label{secMLconv}
1311: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1312: 
1313: An interesting open question is whether $\xi$ converges to $\mu$
1314: (in difference or ratio) individually for all Martin-L\"{o}f
1315: random sequences. Clearly, convergence $\mu$.M.L. may at most fail
1316: for a set of sequences with $\mu$-measure zero. A convergence
1317: M.L.\ result would be particularly interesting and natural for
1318: Solomonoff's universal prior $M$, since M.L.\ randomness can be
1319: defined in terms of $\MM$ (see Theorem~\ref{defML}). Attempts to
1320: convert the bounds in Theorem~\ref{thConv} to effective
1321: $\mu$.M.L.-randomness tests fail, since $M(x_t|x_{<t})$ is not
1322: enumerable. The proof of $M/\mu\stackrel{M.L.}\longrightarrow 1$
1323: given in \cite[Thm.5.2.2]{Li:97} and \cite[Thm.10]{Vitanyi:00} is
1324: incomplete.$\!$\footnote{The formulation of their theorem is quite
1325: misleading in general: ``{\it Let $\mu$ be a positive recursive
1326: measure. If the length of $y$ is fixed and the length of $x$ grows
1327: to infinity, then $M(y|x)/\mu(y|x)\to 1$ with $\mu$-probability
1328: one. The infinite sequences $\omega$ with prefixes $x$ satisfying
1329: the displayed asymptotics are precisely [`$\Rightarrow$' {\em and}
1330: `$\Leftarrow$'] the $\mu$-random sequences.}'' First, for
1331: off-sequence $y$ convergence w.p.1 does not hold ($xy$ must be
1332: demanded to be a prefix of $\omega$). Second, the proof of
1333: `$\Leftarrow$' has gaps (see main text). Last, `$\Rightarrow$' is
1334: given without proof and is wrong \cite{Hutter:04mlconvx}. Also the assertion
1335: in \cite[Thm.5.2.1]{Li:97} that $S_t:=\E\sum_{x'_t}
1336: (\mu(x'_t|x_{<t})-M(x'_t|x_{<t}))^2$ converges to zero faster than
1337: $1/t$ cannot be made, since $S_t$ does not decrease
1338: monotonically \cite[Prob.2.7]{Hutter:04uaibook}. For example, for
1339: $a_t:=1/\sqrt{t}$ if $t$ is a cube and 0 otherwise, we have
1340: $\sum_{t=1}^\infty a_t<\infty$, but $a_t\neq o(1/t)$.} The
1341: implication ``$\MM(x_{1:n})\leq c\cdot\mu(x_{1:n})\forall
1342: n\Rightarrow \lim_{n\to\infty}\MM(x_{1:n})/\mu(x_{1:n})$ exists''
1343: has been used, but not proven, and is indeed generally
1344: wrong \cite{Hutter:04mlconvx}.
1345: %
1346: Theorem~\ref{defML} only implies
1347: $\sup_n\MM(x_{1:n})/\mu(x_{1:n})<\infty$ for M.L.\ random
1348: sequences $x_{1:\infty}$, and \cite[pp.\ 324--325]{Doob:53}
1349: implies only that $\lim_{n\to\infty}\MM(x_{1:n})/\mu(x_{1:n})$ exists
1350: w.p.1, and not $\mu$.M.L.
1351: %
Vovk \cite{Vovk:87} shows that, for two estimable
semimeasures $\mu$ and $\rho$ and a sequence $x_{1:\infty}$ that is $\mu$
{\em and} $\rho$ M.L.\ random,
1355: \beqn
1356: \sum_{t=1}^\infty\sum_{x'_t}\left(\sqrt{\mu(x'_t|x_{<t})}-\sqrt{\rho(x'_t|x_{<t})}\right)^2<\infty
1357: \qmbox{and}
1358: \sum_{t=1}^\infty\left({\rho(x_t|x_{<t})\over\mu(x_t|x_{<t})}-1\right)^2<\infty.
1359: \eeqn
1360: If $\MM$ were estimable, then this would imply posterior
1361: $\MM\to\mu$ and $\MM/\mu\to 1$ for every $\mu$.M.L.-random
1362: sequence $x_{1:\infty}$, since {\em every} sequence is $\MM$.M.L.\
1363: random. Since $\MM$ is {\em not} estimable, Vovk's theorem cannot
1364: be applied and it is not obvious how to generalize it. So the
1365: question of individual convergence remains open. More generally,
1366: one may ask whether $\xi_\M\to\mu$ for every $\mu/\xi$-random
1367: sequence. It turns out that this is true for some $\M$, but false for others.
1368: 
1369: %------------------------------%
1370: \ftheorem{thMLConv}{$\mu/\xi$-convergence of $\xi$ to $\mu$}{
1371: %------------------------------%
1372: Let $\X=\B$ be binary and
1373: $\M_\Theta:=\{\mu_\th:\mu_\th(1|x_{<t})=\th\,\forall t,\;
1374: \th\in\Theta\}$ be the set of Bernoulli($\th$) distributions
1375: with parameters $\th\in\Theta$. Let $\Theta_D$ be a countable
1376: dense subset of $[0,1]$, e.g.\ $[0,1]\cap\SetQ$, and let $\Theta_G$
1377: be a countable subset of $[0,1]$ with a gap in the sense that
1378: there exist $0<\th_0<\th_1<1$ such that
1379: $[\th_0,\th_1]\cap\Theta_G=\{\th_0,\th_1\}$, e.g.\
1380: $\Theta_G=\{\odf,\odt\}$ or $\Theta_G=([0,{1\over
1381: 4}]\cup[{1\over 2},1])\cap\SetQ$. Then
1382: \begin{list}{}{\ifjournal\itemsep=1ex\fi}
\item[$i)$] If $x_{1:\infty}$ is $\mu/\xi_{\M_{\Theta_D}}$ random with
$\mu\in\M_{\Theta_D}$, then $\xi_{\M_{\Theta_D}}(x_t|x_{<t})\to\mu(x_t|x_{<t})$.
\item[$ii)$] There are $\mu\in\M_{\Theta_G}$ and $\mu/\xi_{\M_{\Theta_G}}\!\!$
random $x_{1:\infty}$ for which
$\xi_{\M_{\Theta_G}}\!\!(x_t|x_{<t})\not\to\mu(x_t|x_{<t})$.
1388: \end{list}
1389: }%------------------------------%
1390: 
\noindent Our original/main motivation for studying
1392: $\mu/\xi$-randomness is the implication of Theorem~\ref{thMLConv}
1393: that $\MM\stackrel{\mbox{\tiny M.L.}}\longrightarrow\mu$ cannot be
1394: decided from $M$ being a mixture distribution or from the
1395: universality property (Theorem~\ref{thUniM}) alone. Further
1396: structural properties of $\M_{enum}^{semi}$ have to be employed.
1397: For Bernoulli sequences, convergence $\mu.\xi_{\M_\Theta}$.r.\ is
1398: related to denseness of $\M_\Theta$. Maybe a denseness
1399: characterization of $\M_{enum}^{semi}$ can solve the question of
1400: convergence M.L.\ of $M$. The property $\MM\in\M_{enum}^{semi}$ is
1401: also not sufficient to resolve this question, since there are
1402: $\M\ni\xi$ for which $\xi\stackrel{\mu.\xi.r}\longrightarrow\mu$
1403: and $\M\ni\xi$ for which
1404: $\xi\not\stackrel{\mu.\xi.r}\longrightarrow\mu$. Theorem
1405: \ref{thMLConv} can be generalized to i.i.d.\ sequences over
a general finite alphabet $\X$.
1407: 
1408: The idea to prove $(ii)$ is to construct a sequence $x_{1:\infty}$
1409: that is $\mu_{\th_0}/\xi$-random {\em and} $\mu_{\th_1}/\xi$-random
1410: for $\th_0\neq\th_1$. This is possible if and only if $\Theta$
1411: contains a gap and $\th_0$ and $\th_1$ are the boundaries of the
1412: gap. Obviously $\xi$ cannot converge to $\th_0$ {\em and} $\th_1$,
1413: thus proving non-convergence. For no $\th\in[0,1]$ will this
1414: $x_{1:\infty}$ be $\mu_\th$ M.L.-random. Finally, the proof of
1415: Theorem~\ref{thMLConv}
1416: makes essential use of the mixture representation of $\xi$, as
1417: opposed to the proof of Theorem~\ref{thConv} which only needs
1418: dominance $\xi\geqm\M$.
1419: 
1420: An example for $(ii)$ is $\M=\{\mu_0,\mu_1\}$,
1421: $\mu_0(1|x_{<t})=\mu_1(0|x_{<t})={1\over 4}$,
1422: $x_{1:\infty}=(01)^\infty=01010101...$ $\Rightarrow$ $\mu_0(x_{1:2n})=
1423: \mu_1(x_{1:2n})=\xi(x_{1:2n})=({1\over 4})^n({3\over 4})^n$
1424: $\Rightarrow$ $x_{1:\infty}$ is
1425: $\mu_0/\xi$-random {\em and}
1426: $\mu_1/\xi$-random, but
1427: $\mu_0(x_{2n}|x_{<2n})={1\over 4}$,
1428: $\mu_0(x_{2n+1}|x_{1:2n})={3\over 4}$,
1429: $\mu_1(x_{2n}|x_{<2n})={3\over 4}$,
1430: $\mu_1(x_{2n+1}|x_{1:2n})={1\over 4}$ and
1431: $\xi(x_{2n}|x_{<2n})={3\over 8}$,
1432: $\xi(x_{2n+1}|x_{1:2n})={1\over 2}$ for $w_0=w_1=\odt$
1433: $\Rightarrow$ $\xi(x_n|x_{<n})\not\to\mu_{0/1}(x_n|x_{<n})$.
1434: 
1435: %------------------------------%
1436: \paradot{Proof}
1437: %------------------------------%
1438: Let $\X=\B$ and $\M=\{\mu_\th:\th\in\Theta\}$ with countable
1439: $\Theta\subset[0,1]$ and
1440: $\mu_\th(1|x_{1:n})=\th=1-\mu_\th(0|x_{1:n})$, which implies
1441: \beqn
1442:   \mu_\th(x_{1:n}) = \th^{n_1}(1-\th)^{n-n_1},\qquad
1443:   n_1:=x_1\!+...+\!x_n, \qquad
1444:   \hat\th\equiv\hat\th_n:={n_1\over n}
1445: \eeqn
1446: $\hat\th$ depends on $n$; all other used/defined $\th$ will be
1447: independent of $n$. We assume $\th_{\!\cdot\cdot}\in\Theta$, where
$..$ stands for some (possibly empty) index, and
1449: $\ddot\th\in[0,1]$ (possibly $\not\in\Theta$), where $\ddot{}$
1450: stands for some superscript, i.e.\ $\mu_{\th_{\!\cdot\cdot}}$ and
1451: $w_{\th_{\!\cdot\cdot}}$ make sense, whereas $\mu_{\ddot\th}$ and
1452: $w_{\ddot\th}$ do not. $\xi$ is defined in the standard way as
1453: \beq\label{MLxiuni}
1454:   \xi(x_{1:n})=\sum_{\th\in\Theta}w_\th\mu_\th(x_{1:n})
1455:   \quad\Rightarrow\quad
1456:   \xi(x_{1:n})\geq w_\th \mu_\th(x_{1:n}),
1457: \eeq
1458: where $\sum_\th w_\th=1$ and $w_\th>0\,\forall\th$.
1459: In the following let $\mu=\mu_{\th_0}\in\M$ be the true environment.
1460: \beq\label{MLmuMr}
1461:   \omega=x_{1:\infty} \mbox{ is } \mu/\xi\mbox{-random}
1462:   \quad\Leftrightarrow\quad
1463:   \exists c_\omega : {\xi(x_{1:n})\leq c_\omega\!\cdot\!\mu_{\th_0}(x_{1:n})}
1464:   \;\forall n
1465: \eeq
For a binary alphabet it is sufficient to establish whether
1467: $\xi(1|x_{1:n}) \toinfty{n} \th_0\equiv\mu(1|x_{1:n})$ for
1468: $\mu/\xi$-random $x_{1:\infty}$ in order to decide
1469: $\xi(x_n|x_{<n})\to\mu(x_n|x_{<n})$.
1470: We need the following posterior
1471: representation of $\xi$:
1472: \beq\label{MLpw}
1473:   \xi(1|x_{1:n})=\sum_{\th\in\Theta}w_n^\th \mu_\th(1|x_{1:n}),\quad
1474:   w_n^\th:=w_\th{\mu_\th(x_{1:n})\over\xi(x_{1:n})}
1475:   \leq {w_\th\over w_{\th_0}}{\mu_\th(x_{1:n})\over\mu_{\th_0}(x_{1:n})},\quad
1476:   \sum_{\th\in\Theta}w_n^\th=1
1477: \eeq
1478: The ratio $\mu_\th/\mu_{\th_0}$ can be represented as follows:
1479: \beq\label{MLmuRatio}
1480:   {\mu_\th(x_{1:n})\over\mu_{\th_0}(x_{1:n})}
1481:   = {\th^{n_1}(1\!-\!\th)^{n-n_1}\over \th_0^{n_1}(1\!-\!\th_0)^{n-n_1}}
1482:   = \left[\bigg({\th\over\th_0}\bigg)^{\hat\th_n}
1483:           \bigg({1\!-\!\th\over 1\!-\!\th_0}\bigg)^{1-\hat\th_n}\right]^n
1484:   = \mbox{\Large\e}^{\,\displaystyle n[D(\hat\th_n||\th_0)\!-\!D(\hat\th_n||\th)]}
1485: \eeq
1486: \beqn
1487:   \qmbox{where}\textstyle
1488:   D(\hat\th||\th) = \hat\th\ln{\hat\th\over\th} +
1489:                     (1\!-\!\hat\th)\ln{1-\hat\th\over 1-\th}
1490: \eeqn
1491: is the relative entropy between $\hat\th$ and $\th$, which is
1492: continuous in $\hat\th$ and $\th$, and is $0$ if and only if
1493: $\hat\th=\th$. We also need the following implication for sets
1494: $\Omega\subseteq\Theta$:
1495: \bqa \nonumber
1496:   & & \mbox{If}\quad
1497:   w_n^\th\leq w_\th g_\th(n)\toinfty{n} 0 \qmbox{and}
1498:   g_\th(n)\leq c\;\forall\th\!\in\!\Omega,
1499: \\ \label{MLsumconv}
1500:   & & \mbox{then}\quad
1501:   \sum_{\th\in\Omega}w_n^\th \mu_\th(1|x_{1:n}) \;\leq\;
1502:   \sum_{\th\in\Omega}w_n^\th \toinfty{n} 0,
1503: \eqa
1504: which easily follows from boundedness $\sum_\th w_n^\th\leq 1$ and
1505: $\mu_\th\leq 1$ \cite[Lem.5.28$ii$]{Hutter:04uaibook}. We now
1506: prove Theorem~\ref{thMLConv}. We leave the special considerations
1507: necessary when $0,1\in\Theta$ to the reader and assume,
1508: henceforth, $0,1\not\in\Theta$.
1509: 
1510: %------------------------------%
1511: {\bf (i)} Let $\Theta$ be a countable dense subset of $(0,1)$ and
1512: $x_{1:\infty}$ be $\mu/\xi$-random. Using (\ref{MLxiuni}) and
1513: (\ref{MLmuMr}) in (\ref{MLmuRatio}) for $\th\in\Theta$ to be
1514: determined later we can bound
1515: \beq\label{MLenbnd2}
1516:   \e^{n[D(\hat\th_n||\th_0)-D(\hat\th_n||\th)]}
1517:   = {\mu_\th(x_{1:n})\over\mu_{\th_0}(x_{1:n})}
1518:   \leq {c_\omega\over w_\th}
  =: c<\infty
1520: \eeq
1521: Let us assume that $\hat\th\equiv\hat\th_n\not\to\th_0$. This
1522: implies that there exists a cluster point $\tilde\th\neq\th_0$ of
1523: sequence $\hat\th_n$, i.e.\ $\hat\th_n$ is infinitely often in an
1524: $\eps$-neighborhood of $\tilde\th$, e.g.\ $D(\hat\th_n||\tilde\th)\leq\eps$
1525: for infinitely many $n$. $\tilde\th\in[0,1]$ may be outside $\Theta$.
1526: Since $\tilde\th\neq\th_0$ this implies that $\hat\th_n$ must be ``far''
1527: away from $\th_0$ infinitely often. For instance, for $\eps={1\over
1528: 4}(\tilde\th-\th_0)^2$, using $D(\hat\th||\tilde\th)+D(\hat\th||\th_0)
1529: \geq (\tilde\th-\th_0)^2$, we get $D(\hat\th||\th_0)\geq 3\eps$. We
now choose $\th\in\Theta$ so near to $\tilde\th$ that
1531: $|D(\hat\th||\th)-D(\hat\th||\tilde\th)|\leq\eps$ (here we use
1532: denseness of $\Theta$). Chaining all inequalities we get
1533: $D(\hat\th||\th_0)-D(\hat\th||\th)\geq 3\eps-\eps-\eps=\eps>0$.
1534: This, together with (\ref{MLenbnd2}) implies $\e^{n\eps}\leq c$ for
1535: infinitely many $n$ which is impossible. Hence, the assumption
1536: $\hat\th_n\not\to\th_0$ was wrong.
1537: 
Now, $\hat\th_n\to\th_0$ implies that for arbitrary
$\th\neq\th_0$, $\th\in\Theta$ there exists $\delta_\th>0$ such that
for sufficiently large $n$ we have $D(\hat\th_n||\th)\geq 2\delta_\th$
(since $D(\th_0||\th)\neq 0$) and $D(\hat\th_n||\th_0)\leq\delta_\th$.
1542: This implies
1543: \beqn\label{MLwto0}
1544:   w_n^\th \;\leq\; {w_\th\over w_{\th_0}}
1545:   \e^{n[D(\hat\th_n||\th_0)\!-\!D(\hat\th_n||\th)]}
1546:   \;\leq\; {w_\th\over w_{\th_0}} \e^{-n\delta_\th}
1547:   \;\toinfty{n}\; 0,
1548: \eeqn
1549: where we have used (\ref{MLpw}) and (\ref{MLmuRatio}) in the first
1550: inequality and the second inequality holds for sufficiently large
1551: $n$. Hence $\sum_{\th\neq\th_0} w_n^\th\to 0$ by (\ref{MLsumconv})
1552: and $w_n^{\th_0}\to 1$ by normalization (\ref{MLpw}), which finally gives
1553: \beqn
1554:   \xi(1|x_{1:n})=w_n^{\th_0} \mu_{\th_0}(1|x_{1:n}) +
1555:   \sum_{\th\neq\th_0}w_n^\th \mu_\th(1|x_{1:n}) \;\toinfty{n}
1556:   \mu_{\th_0}(1|x_{1:n}).
1557: \eeqn
1558: 
1559: %------------------------------%
1560: {\bf (ii)} We first consider the case $\Theta=\{\th_0,\th_1\}$:
1561: Let us choose $\bar\th$ ($=\ln({1-\th_0\over
1562: 1-\th_1})/\ln({\th_1\over\th_0}{1-\th_0\over 1-\th_1})
1563: \not\in\Theta$) in the (KL) middle of $\th_0$ and $\th_1$ such
1564: that
1565: \beq\label{MLMid}
1566:   D(\bar\th||\th_0)=D(\bar\th||\th_1), \qquad
1567:   0 < \th_0 < \bar\th < \th_1 < 1,
1568: \eeq
1569: \beqn
1570:   \mbox{and choose $x_{1:\infty}$ such that $\hat\th_n:={n_1\over n}$
1571:   satisfies $|\hat\th_n-\bar\th|\leq{1\over n}
1572:   \quad(\Rightarrow\;\hat\th_n\toinfty{n}\bar\th)$}
1573: \eeqn
1574: We will show that $x_{1:\infty}$
1575: is $\mu_{\th_0}/\xi$-random {\em and} $\mu_{\th_1}/\xi$-random.
1576: Obviously no $\xi$ can converge to $\th_0$
1577: {\em and} $\th_1$, thus proving $\M$-non-convergence.
1578: ($x_{1:\infty}$ is obviously not $\mu_{\th_{0/1}}$ M.L.-random,
1579: since the relative frequency $\hat\th_n\not\to\th_{0/1}$.
1580: $x_{1:\infty}$ is not even $\mu_{\bar\th}$ M.L.-random, since
1581: $\hat\th_n$ converges too fast ($\sim\odn$). $x_{1:\infty}$ is
1582: indeed very regular, whereas ${n_1\over n}$ of a truly
1583: $\mu_{\bar\th}$ M.L.-random sequence has fluctuations of the order
1584: $1/\sqrt n$. The fast convergence is necessary for
double $\mu/\xi$-randomness.
1586: %
1587: The reason that $x_{1:\infty}$ is $\mu/\xi$-random, but not M.L.-random is
1588: that $\mu/\xi$-randomness is a weaker concept than M.L.-randomness for
1589: $\M\subset\M_{enum}^{semi}$. Only regularities characterized by
1590: $\nu\in\M$ are recognized by $\mu/\xi$-randomness.)
1591: 
1592: In the following we assume that $n$ is sufficiently large
1593: such that $\th_0\leq\hat\th_n\leq\th_1$.  We need
1594: \beq\label{MLDD}
1595:   |D(\hat\th||\th)-D(\bar\th||\th)| \leq c|\hat\th-\bar\th|
1596:   \quad\forall\,\th,\hat\th,\bar\th\in[\th_0,\th_1]
1597:   \qmbox{with} \textstyle c:=\ln\!{\th_1(1-\th_0)\over\th_0(1-\th_1)} < \infty
1598: \eeq
1599: which follows for $\hat\th\geq\bar\th$ (similarly
1600: $\hat\th\leq\bar\th$) from
1601: \beqn
1602:   D(\hat\th||\th)-D(\bar\th||\th) = \int_{\bar\th}^{\hat\th}
1603:   [{\textstyle\ln{\th'\over\th}-\ln{1-\th'\over 1-\th}}]d\th'
1604:   \leq \int_{\bar\th}^{\hat\th}
1605:   [{\textstyle\ln{\th_1\over\th_0}-\ln{1-\th_1\over 1-\th_0}}]d\th'
1606:   = c\!\cdot\!(\hat\th-\bar\th)
1607: \eeqn
1608: where we have increased $\th'$ to $\th_1$ and decreased $\th$ to
1609: $\th_0$ in the inequality. Using (\ref{MLDD}) in (\ref{MLmuRatio})
1610: twice we get
1611: \beq\label{MLmu01}
1612:   {\mu_{\th_1}(x_{1:n})\over\mu_{\th_0}(x_{1:n})}
1613:   =
1614:   \e^{n[D(\hat\th_n||\th_0)-D(\hat\th_n||\th_1)]}
1615:   \leq
1616:   \e^{n[D(\bar\th||\th_0)+c|\hat\th_n-\bar\th|-
1617:        D(\bar\th||\th_1)+c|\hat\th_n-\bar\th|]}
1618:   \leq
1619:   \e^{2c}
1620: \eeq
1621: where we have used (\ref{MLMid}) in the last inequality. Now,
1622: (\ref{MLmu01}) and (\ref{MLpw}) lead to
1623: \beq\label{MLwgeq0}
1624:   w_n^{\th_0}
1625:   = w_{\th_0}{\mu_{\th_0}(x_{1:n})\over\xi(x_{1:n})}
1626:   = [1+{w_{\th_1}\over w_{\th_0}}{\mu_{\th_1}(x_{1:n})\over\mu_{\th_0}(x_{1:n})}]^{-1}
1627:   \geq [1+{w_{\th_1}\over w_{\th_0}}\e^{2c}]^{-1}=:c_0>0,
1628: \eeq
1629: which shows that $x_{1:\infty}$ is $\mu_{\th_0}/\xi$-random by
1630: (\ref{MLmuMr}). Exchanging $\th_0\leftrightarrow\th_1$ in
1631: (\ref{MLmu01}) and (\ref{MLwgeq0}) we similarly get
1632: $w_n^{\th_1}\geq c_1>0$, which implies (using
1633: $w_n^{\th_0}+w_n^{\th_1}=1$)
1634: \beq\label{MLnonconv2}
1635:   \xi(1|x_{1:n})=
1636:   \sum_{\th\in\{\th_0,\th_1\}}w_n^\th \mu_\th(1|x_{1:n})
1637:   = w_n^{\th_0}\!\cdot\!\th_0 + w_n^{\th_1}\!\cdot\!\th_1
1638:   \neq \th_0 = \mu_{\th_0}(1|x_{1:n}).
1639: \eeq
1640: This shows $\xi(1|x_{1:n}) \;\;\not\!\!\!\toinfty{n}
1641: \mu(1|x_{1:n})$.
1642: One can show that $\xi(1|x_{1:n})$ does not only not converge to
1643: $\th_0$ (and $\th_1$), but that it does not converge at all. The
1644: fast convergence demand $|\hat\th_n-\bar\th|\leq\odn$ on
1645: $x_{1:\infty}$ can be weakened to
1646: $\hat\th_n\leq\bar\th+O(\odn)\,\forall n$ and
$\hat\th_n\geq\bar\th-O(\odn)$ for infinitely many $n$; then
1648: $x_{1:\infty}$ is still $\mu_{\th_0}/\xi$-random, and
1649: $w_n^{\th_1}\geq c_1'>0$ for infinitely many $n$, which is
1650: sufficient to prove $\xi\not\to\mu$.
1651: 
1652: We now consider general $\Theta$ with gap in the sense that there exist
1653: $0<\th_0<\th_1<1$ with
1654: $[\th_0,\th_1]\cap\Theta=\{\th_0,\th_1\}$: We show
1655: that all $\th\neq\th_0,\th_1$ give asymptotically no contribution
1656: to $\xi(1|x_{1:n})$, i.e.\ (\ref{MLnonconv2}) still applies. Let
1657: $\th\in\Theta\setminus\{\th_0,\th_1\}$; all other definitions as
1658: before. Then
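$\delta_\th:=D(\bar\th||\th)-D(\bar\th||\th_{0/1})>0$, since
$\th\not\in[\th_0,\th_1]$ is farther away from $\bar\th$ than the gap
boundary on the same side, $D(\bar\th||\cdot)$ strictly increases with the
distance from $\bar\th$ on each side, and
$D(\bar\th||\th_0)=D(\bar\th||\th_1)$ by (\ref{MLMid}). Similarly to (\ref{MLmu01}) with
$\th$ instead of $\th_1$ we get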
1663: \beqn
1664:   {\mu_\th(x_{1:n})\over\mu_{\th_0}(x_{1:n})}
1665:   = \e^{n[D(\hat\th_n||\th_0)-D(\hat\th_n||\th)]}
1666:   \leq \e^{2c}\!\cdot\!
1667:     \e^{n[D(\bar\th||\th_0)-D(\bar\th||\th)]}
1668:   = \e^{2c}\e^{-n\delta_\th}
1669:   \toinfty{n} 0
1670: \eeqn
1671: Hence $w_n^\th\leq{w_\th\over w_{\th_0}}\e^{2c}\e^{-n\delta_\th}\to
1672: 0$ from (\ref{MLpw}) and
1673: $\eps_n:=\sum_{\th\in\Theta\setminus\{\th_0,\th_1\}}
1674: w_n^\th\mu_\th(1|x_{1:n})\toinfty{n} 0$ from (\ref{MLsumconv}).
1675: Hence $
1676:   \xi(1|x_{1:n})
1677:   = w_n^{\th_0}\cdot\th_0 + w_n^{\th_1}\cdot\th_1 + \eps_n
1678:   \neq \th_0 = \mu_{\th_0}(1|x_{1:n})
1679: $
1680: for sufficiently large $n$, since $\eps_n\to 0$, $w_n^{\th_1}\geq c'_1>0$
1681: and $\th_0\neq\th_1$.
1682: \qed
1683: 
1684: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1685: \section{Conclusions}\label{secConc}
1686: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1687: 
1688: For a hierarchy of four computability definitions, we completed
1689: the classification of the existence of computable (semi)measures
1690: dominating all computable (semi)measures. Dominance is an important
1691: property of a prior, since it implies rapid convergence of the
1692: corresponding posterior with probability one.
1693: %
1694: A strengthening would be convergence for all Martin-L{\"o}f (M.L.)
1695: random sequences. This seems natural, since M.L.\ randomness can
1696: be defined in terms of Solomonoff's prior $M$, so there is a close
1697: connection.
1698: %
1699: Contrary to what was believed before, the question of posterior
1700: convergence $M/\mu\to 1$ for all M.L.\ random sequences is still
1701: open. Some exciting progress has been made recently in
1702: \cite{Hutter:04mlconvx}, partially answering this question.
1703: %
1704: We introduced a new flexible notion of
$\mu/\xi$-randomness which contains Martin-L{\"o}f randomness as a
1706: special case. Though this notion may have a wider range of
1707: application, the main purpose for its introduction was to show
1708: that standard proof attempts of
1709: $M/\mu\stackrel{M.L.}\longrightarrow 1$ based on dominance only
1710: must fail. This follows from the derived result that the validity
1711: of $\xi/\mu\to 1$ for $\mu/\xi$-random sequences depends on the
1712: Bayes mixture $\xi$.
1713: 
1714: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1715: %         Bibliography        %
1716: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1717: 
1718: \begin{small}
1719: \begin{thebibliography}{Hut03b}
1720: 
1721: \bibitem[Cha75]{Chaitin:75}
1722: G.~J. Chaitin.
1723: \newblock A theory of program size formally identical to information theory.
1724: \newblock {\em Journal of the ACM}, 22(3):329--340, 1975.
1725: 
1726: \bibitem[Doo53]{Doob:53}
1727: J.~L. Doob.
1728: \newblock {\em Stochastic Processes}.
1729: \newblock Wiley, New York, 1953.
1730: 
1731: \bibitem[G{\'a}c74]{Gacs:74}
1732: P.~G{\'a}cs.
1733: \newblock On the symmetry of algorithmic information.
1734: \newblock {\em Soviet Mathematics Doklady}, 15:1477--1480, 1974.
1735: 
1736: \bibitem[HM04]{Hutter:04mlconvx}
1737: M.~Hutter and An.~A. Muchnik.
1738: \newblock Universal convergence of semimeasures on individual random sequences.
1739: \newblock In {\em Proc. 15th International Conf. on Algorithmic Learning Theory
1740:   ({ALT-2004})}, volume 3244 of {\em LNAI}, pages 234--248, Padova, 2004.
1741:   Springer, Berlin.
1742: 
1743: \bibitem[Hut01]{Hutter:01alpha}
1744: M.~Hutter.
1745: \newblock Convergence and error bounds for universal prediction of nonbinary
1746:   sequences.
1747: \newblock In {\em Proc. 12th European Conf. on Machine Learning (ECML-2001)},
1748:   volume 2167 of {\em LNAI}, pages 239--250, Freiburg, 2001. Springer, Berlin.
1749: 
1750: \bibitem[Hut03a]{Hutter:03unipriors}
1751: M.~Hutter.
1752: \newblock On the existence and convergence of computable universal priors.
1753: \newblock In {\em Proc. 14th International Conf. on Algorithmic Learning Theory
1754:   ({ALT-2003})}, volume 2842 of {\em LNAI}, pages 298--312, Sapporo, 2003.
1755:   Springer, Berlin.
1756: 
1757: \bibitem[Hut03b]{Hutter:03unimdl}
1758: M.~Hutter.
1759: \newblock Sequence prediction based on monotone complexity.
1760: \newblock In {\em Proc. 16th Annual Conf. on Learning Theory ({COLT-2003})},
1761:   volume 2777 of {\em LNAI}, pages 506--521, Washington, DC, 2003. Springer,
1762:   Berlin.
1763: 
1764: \bibitem[Hut04]{Hutter:04uaibook}
1765: M.~Hutter.
1766: \newblock {\em Universal Artificial Intelligence: Sequential Decisions based on
1767:   Algorithmic Probability}.
1768: \newblock Springer, Berlin, 2004.
1769: \newblock 300 pages, http://www.idsia.ch/$_{^{\sim}}$marcus/ai/uaibook.htm.
1770: 
1771: \bibitem[Kol65]{Kolmogorov:65}
1772: A.~N. Kolmogorov.
1773: \newblock Three approaches to the quantitative definition of information.
\newblock {\em Problems of Information Transmission}, 1(1):1--7, 1965.
1775: 
1776: \bibitem[Lam87]{Lambalgen:87}
1777: {M. van} Lambalgen.
1778: \newblock {\em Random Sequences}.
1779: \newblock PhD thesis, University of Amsterdam, 1987.
1780: 
1781: \bibitem[Lev73]{Levin:73random}
1782: L.~A. Levin.
1783: \newblock On the notion of a random sequence.
1784: \newblock {\em Soviet Mathematics Doklady}, 14(5):1413--1416, 1973.
1785: 
1786: \bibitem[Lev74]{Levin:74}
1787: L.~A. Levin.
1788: \newblock Laws of information conservation (non-growth) and aspects of the
1789:   foundation of probability theory.
1790: \newblock {\em Problems of Information Transmission}, 10(3):206--210, 1974.
1791: 
1792: \bibitem[LV97]{Li:97}
1793: M.~Li and P.~M.~B. Vit\'anyi.
1794: \newblock {\em An Introduction to {K}olmogorov Complexity and its
1795:   Applications}.
1796: \newblock Springer, Berlin, 2nd edition, 1997.
1797: 
1798: \bibitem[Sch71]{Schnorr:71}
1799: C.~P. Schnorr.
1800: \newblock {\em Zuf{\"a}lligkeit und Wahrscheinlichkeit}.
1801: \newblock Springer, Berlin, 1971.
1802: 
1803: \bibitem[Sch00]{Schmidhuber:00toe}
1804: J.~Schmidhuber.
1805: \newblock Algorithmic theories of everything.
1806: \newblock Report IDSIA-20-00, quant-ph/0011122, {IDSIA}, Manno (Lugano),
1807:   Switzerland, 2000.
1808: 
1809: \bibitem[Sch02]{Schmidhuber:02gtm}
1810: J.~Schmidhuber.
1811: \newblock Hierarchies of generalized {Kolmogorov} complexities and
1812:   nonenumerable universal measures computable in the limit.
1813: \newblock {\em International Journal of Foundations of Computer Science},
1814:   13(4):587--612, 2002.
1815: 
1816: \bibitem[Sim77]{Simpson:77}
1817: S.~G. Simpson.
1818: \newblock Degrees of unsolvability: A survey of results.
1819: \newblock In J.~Barwise, editor, {\em Handbook of Mathematical Logic}, pages
1820:   631--652. North-Holland, Amsterdam, 1977.
1821: 
1822: \bibitem[Sol64]{Solomonoff:64}
1823: R.~J. Solomonoff.
1824: \newblock A formal theory of inductive inference: Parts 1 and 2.
1825: \newblock {\em Information and Control}, 7:1--22 and 224--254, 1964.
1826: 
1827: \bibitem[Sol78]{Solomonoff:78}
1828: R.~J. Solomonoff.
1829: \newblock Complexity-based induction systems: Comparisons and convergence
1830:   theorems.
\newblock {\em IEEE Transactions on Information Theory}, IT-24:422--432, 1978.
1832: 
1833: \bibitem[VL00]{Vitanyi:00}
1834: P.~M.~B. Vit\'anyi and M.~Li.
1835: \newblock Minimum description length induction, {B}ayesianism, and {K}olmogorov
1836:   complexity.
1837: \newblock {\em IEEE Transactions on Information Theory}, 46(2):446--464, 2000.
1838: 
1839: \bibitem[Vov87]{Vovk:87}
1840: V.~G. Vovk.
1841: \newblock On a randomness criterion.
1842: \newblock {\em Soviet Mathematics Doklady}, 35(3):656--660, 1987.
1843: 
1844: \bibitem[Wan96]{Wang:96}
1845: Y.~Wang.
1846: \newblock {\em Randomness and Complexity}.
1847: \newblock PhD thesis, Universit{\"a}t Heidelberg, 1996.
1848: 
1849: \bibitem[ZL70]{Zvonkin:70}
1850: A.~K. Zvonkin and L.~A. Levin.
1851: \newblock The complexity of finite objects and the development of the concepts
1852:   of information and randomness by means of the theory of algorithms.
1853: \newblock {\em Russian Mathematical Surveys}, 25(6):83--124, 1970.
1854: 
1855: \end{thebibliography}
1856: \end{small}
1857: \end{document}
1858: 
1859: %---------------------End-of-UniPriorx.tex--------------------%
1860: