cs0305052/cs0305052
1: 
2: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3: %% On the Existence and Convergence Computable Universal Priors %%
4: %%     Marcus Hutter: Start: 01.08.02  LastEdit: 29.05.03    %%
5: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
6: 
7: %-------------------------------%
8: %   Document-Style              %
9: %-------------------------------%
10: \documentclass[12pt,twoside]{article}
11: \pagestyle{myheadings}
12: \markboth{\sc Marcus Hutter, Technical Report IDSIA-05-03
13: }{\sc Computable Universal Priors}
14: \setcounter{tocdepth}{4} \setcounter{secnumdepth}{2}
15: \topmargin=0mm  \oddsidemargin=5mm \evensidemargin=5mm
16: \textwidth=15cm \textheight=22cm
17: \sloppy
18: 
19: %-------------------------------%
20: %       My Math-Spacings        %
21: %-------------------------------%
22: \def\,{\mskip 3mu} \def\>{\mskip 4mu plus 2mu minus 4mu} \def\;{\mskip 5mu plus 5mu} \def\!{\mskip-3mu}
23: \def\dispmuskip{\thinmuskip= 3mu plus 0mu minus 2mu \medmuskip=  4mu plus 2mu minus 2mu \thickmuskip=5mu plus 5mu minus 2mu}
24: \def\textmuskip{\thinmuskip= 0mu                    \medmuskip=  1mu plus 1mu minus 1mu \thickmuskip=2mu plus 3mu minus 1mu}
25: \textmuskip
26: \def\beq{\dispmuskip\begin{equation}}    \def\eeq{\end{equation}\textmuskip}
27: \def\beqn{\dispmuskip\begin{displaymath}}\def\eeqn{\end{displaymath}\textmuskip}
28: \def\bqa{\dispmuskip\begin{eqnarray}}    \def\eqa{\end{eqnarray}\textmuskip}
29: \def\bqan{\dispmuskip\begin{eqnarray*}}  \def\eqan{\end{eqnarray*}\textmuskip}
30: 
31: %-------------------------------%
32: %   Macro-Definitions           %
33: %-------------------------------%
34: \newtheorem{theorem}{Theorem}
35: \newtheorem{corollary}[theorem]{Corollary}
36: \newtheorem{lemma}[theorem]{Lemma}
37: \newtheorem{definition}[theorem]{Definition}
38: 
39: \newenvironment{keywords}{\centerline{\bf\small
40: Keywords}\vspace{-1ex}\begin{quote}\small}{\par\end{quote}\vskip
41: 1ex}
42: \newtheorem{tablex}[theorem]{Table}
43: \newtheorem{figurex}[equation]{Figure}
44: 
45: \def\ftheorem#1#2#3{\begin{theorem}[#2]\label{#1} #3 \end{theorem} }
46: \def\fcorollary#1#2#3{\begin{corollary}[#2]\label{#1} #3 \end{corollary} }
47: \def\flemma#1#2#3{\begin{lemma}[#2]\label{#1} #3 \end{lemma} }
48: \def\fdefinition#1#2#3{\begin{definition}[#2]\label{#1} #3 \end{definition} }
49: \def\ftablex#1#2#3{\begin{tablex}[#2]\label{#1} #3 \end{tablex} }
50: \def\ffigurex#1#2#3#4{{#4}\begin{figurex}[#2]\label{#1}#3\end{figurex}}
51: 
52: \def\idx#1{\index{#1}#1} %\idx{name} for also in text
53: \def\indxs#1#2{\index{#1!#2}\index{#2!#1}} % index entry under both #1!#2 and #2!#1
54: \def\paragraph#1{\vspace{1ex}\noindent{\bf{#1.}}}
55: \def\paranodot#1{\vspace{1ex}\noindent{\bf{#1}}}
56: \def\myparskip{\vspace{1.5ex plus 1ex minus 1ex}\noindent}
57: \def\ff{\Longrightarrow}
58: \def\gdw{\Longleftrightarrow}
59: \def\toinfty#1{\stackrel{#1\to\infty}{\longrightarrow}}
60: \def\nq{\hspace{-1em}}
61: \def\qed{\hspace*{\fill}$\Box\quad$}
62: \def\odt{{\textstyle{1\over 2}}}
63: \def\odf{{\textstyle{1\over 4}}}
64: \def\eps{\varepsilon}                   % for small positive number
65: \def\epstr{\epsilon}                    % for empty string
66: \def\blank{{\,_\sqcup\,}}                 % blank position
67: \def\pfx{`}                              %prefix code
68: \def\qmbox#1{{\quad\mbox{#1}\quad}}
69: \def\argmax{\mathop{\rm arg\,max}}          % maxarg
70: \def\argmin{\mathop{\rm arg\,min}}          % minarg
71: 
72: \def\eqm{\stackrel\times=}             % for some reason
73: \def\leqm{\stackrel\times\leq}
74: \def\geqm{\stackrel\times\geq}
75: 
76: \def\odn{{\textstyle{1\over n}}}
77: \def\v#1{{\bf #1}}
78: \def\l{{l}}                             % length of string or program
79: \def\M{{\cal M}}                        % Set of prob. distributions
80: \def\X{{\cal X}}                        % input/perception set/alphabet
81: \def\Y{{\cal Y}}                        % output/action set/alphabet
82: \def\R{{\cal R}}                        % reward set subset of reals
83: \def\F{{\cal F}}                        % Generic performance measure
84: \def\I{{\cal I}}                        % some set
85: \def\S{{\cal S}}                        % some set
86: \def\Q{{\cal Q}}
87: \def\E{{\bf E}}                         % Expectation value
88: \def\P{{\bf P}}                         % Probability
89: \def\B{\{0,1\}}                        % Binary set (or \set{B})
90: \def\MM{M}                              % Solomonoff's prior
91: \def\th{\theta}
92: 
93: \def\Set#1{{\if#1Q{I\!\!\!#1}\else\if#1Z{Z\!\!\!Z}\else{I\!\!#1}\fi\fi}}
94: \def\lb{\log}
95: \def\sumprime{\mathop{{\sum\nolimits'}}}
96: 
97: \begin{document}
98: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
99: %                      T i t l e - P a g e                      %
100: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
101: 
102: \title{\vskip -15mm\normalsize\sc Technical Report \hfill IDSIA-05-03
103: \vskip 2mm\bf\LARGE\hrule height5pt \vskip 3mm
104: \sc On the Existence and Convergence \\ of Computable Universal Priors\thanks{%
105: This work was supported by SNF grant 2000-61847.00 to J\"{u}rgen
106: Schmidhuber.}
107: \vskip 6mm \hrule height2pt \vskip 5mm}
108: \author{{\bf Marcus Hutter}\\[3mm]
109: \normalsize IDSIA, Galleria 2, CH-6928\ Manno-Lugano, Switzerland\\
110: \normalsize marcus@idsia.ch \hspace{8.5ex} http://www.idsia.ch/$^{_{_\sim}}\!$marcus}
111: \date{29 May 2003}
112: \maketitle
113: 
114: \begin{abstract}
115: \noindent Solomonoff unified Occam's razor and Epicurus' principle
116: of multiple explanations to one elegant, formal, universal theory
117: of inductive inference, which initiated the field of algorithmic
118: information theory. His central result is that the posterior of
119: his universal semimeasure $\MM$ converges rapidly to the true
120: sequence generating posterior $\mu$, if the latter is computable.
121: Hence, $M$ is eligible as a universal predictor in case of unknown
122: $\mu$.
123: %
124: We investigate the existence and convergence of computable
125: universal (semi)measures for a hierarchy of computability classes:
126: finitely computable, estimable, enumerable, and approximable.
127: For instance, $\MM$ is known to be enumerable, but not finitely
128: computable, and to dominate all enumerable semimeasures.
129: %
130: We define seven classes of (semi)measures based on these four
131: computability concepts. Each class may or may not contain a
132: (semi)measure which dominates all elements of another class. The
133: analysis of these 49 cases can be reduced to four basic cases, two
134: of them being new. The results hold for discrete and continuous
135: semimeasures.
136: %
137: We also investigate more closely the types of convergence, possibly
138: implied by universality: in difference and in ratio, with probability
139: 1, in mean sum, and for Martin-L{\"o}f random sequences.
140: %
141: We introduce a generalized concept of randomness for individual
142: sequences and use it to exhibit difficulties regarding these
143: issues.
144: \end{abstract}
145: 
146: \begin{keywords}
147: Sequence prediction;
148: Algorithmic Information Theory;
149: Solomonoff's prior;
150: universal probability;
151: mixture distributions;
152: posterior convergence;
153: computability concepts;
154: Martin-L{\"o}f randomness.
155: \end{keywords}
156: 
157: \pagebreak
158: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
159: \section{Introduction}\label{secIntro}
160: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
161: 
162: All induction problems can be phrased as sequence prediction
163: tasks. This is, for instance, obvious for time series prediction,
164: but also includes classification tasks. Having observed data $x_s$
165: at times $s<t$, the task is to predict the $t$-th symbol $x_t$
166: from sequence $x=x_1...x_{t-1}$.
167: %
168: The key concept to attack general induction problems is
169: Occam's razor and to a lesser extent Epicurus' principle of
170: multiple explanations. The former/latter may be interpreted as to
171: keep the simplest/all theories consistent with the observations
172: $x_1...x_{t-1}$ and to use these theories to predict $x_t$.
173: %
174: Solomonoff \cite{Solomonoff:64,Solomonoff:78} formalized and
175: combined both principles in his universal prior $\MM(x)$ which
176: assigns high/low probability to simple/complex environments, hence
177: implementing Occam and Epicurus.
178: %
179: Solomonoff's \cite{Solomonoff:78} central result is that if
180: the probability $\mu(x_t|x_1...x_{t-1})$ of observing $x_t$ at
181: time $t$, given past observations $x_1...x_{t-1}$ is
182: a computable function, then the
183: universal posterior
184: $\MM(x_t|x_1...x_{t-1})$
185: converges rapidly for $t\to\infty$ to the true posterior
186: $\mu(x_t|x_1...x_{t-1})$, hence
187: $\MM$ represents a universal predictor in case of unknown $\mu$.
188: 
189: One representation of $\MM$ is as a weighted sum of
190: {\em all} enumerable ``defective'' probability measures, called
191: semimeasures (see Definition \ref{defSemi}).
192: %
193: The (from this representation obvious) dominance $\MM(x)\geq
194: const.\times\mu(x)$ for all computable $\mu$ is the central
195: ingredient in the convergence proof.
196: %
197: %General mixture distributions
198: What is so special about the class of all enumerable
199: semimeasures $\M_{enum}^{semi}$? The larger we choose $\M$ the
200: less restrictive is the essential assumption that $\M$ should
201: contain the true distribution $\mu$.
202: %
203: Why not restrict to the still rather general class of estimable or
204: finitely computable (semi)measures? For {\em every} countable
205: class $\M$ and $\xi_\M(x):=\sum_{\nu\in\M} w_\nu \nu(x)$ with
206: $w_\nu>0$, the important dominance $\xi_\M(x)\geq w_\nu
207: \nu(x)\,\forall\nu\in\M$ is satisfied. The question is what
208: properties $\xi_\M$ possesses. The distinguishing property of
209: $\MM=\xi_{\M_{enum}^{semi}}$ is that it is itself
210: an element of $\M_{enum}^{semi}$.
211: %
212: On the other hand, for prediction $\xi_\M\in\M$ is not by itself
213: an important property. What matters is  whether $\xi_\M$ is
214: computable (in one of the senses defined) to avoid
215: getting into the (un)realm of non-constructive math.
216: 
217: %Goal of this work
218: The intention of this work is to investigate the existence,
219: computability and convergence of universal (semi)measures for
220: various computability classes: finitely computable $\subset$
221: estimable $\subset$ enumerable $\subset$ approximable (see
222: Definition \ref{defCompFunc}). For instance, $\MM(x)$ is
223: enumerable, but not finitely computable. The research in this work
224: was motivated by recent generalizations of Kolmogorov complexity
225: and Solomonoff's prior by Schmidhuber \cite{Schmidhuber:02gtm} to
226: approximable (and others not here discussed) cases.
227: 
228: %------------------------------%
229: \paragraph{Contents}
230: %------------------------------%
231: In Section \ref{secCC} we review various computability concepts
232: and discuss their relation.
233: %
234: In Section \ref{secUniM} we define the prefix Kolmogorov
235: complexity $K$, the concept of (semi)measures, Solomonoff's
236: universal prior $\MM$, and explain its universality.
237: %
238: Section \ref{secUSP} summarizes Solomonoff's major convergence
239: result, discusses general mixture distributions and the important
240: universality property -- multiplicative dominance.
241: %
242: In Section \ref{secUSM} we define seven classes of (semi)measures
243: based on four computability concepts. Each class may or may not
244: contain a (semi)measure which dominates all elements of another
245: class. We reduce the analysis of these 49 cases to four basic
246: cases. Domination (essentially by $\MM$) is known to be true for
247: two cases. The two new cases do not allow for domination.
248: %
249: In Section \ref{secConv} we investigate more closely the type of
250: convergence implied by universality. We summarize the result on
251: posterior convergence in difference $(\xi-\mu\to 0)$ and improve
252: the previous result \cite{Li:97} on the convergence in ratio
253: $\xi/\mu\to 1$ by showing rapid convergence without use
254: of Martingales.
255: %
256: In Section \ref{secMLconv} we investigate whether convergence for
257: all Martin-L{\"o}f random sequences could hold. We define a
258: generalized concept of randomness for individual sequences and use
259: it to show that proofs based on universality cannot decide this
260: question.
261: %
262: Section \ref{secConc} concludes the paper. Proofs will be
263: presented elsewhere.
264: 
265: %------------------------------%
266: \paragraph{Notation}
267: %------------------------------%
268: %Strings
269: We denote strings of length $n$ over finite alphabet $\X$ by
270: $x=x_1x_2...x_n$ with $x_t\in\X$ and further abbreviate
271: $x_{1:n}:=x_1x_2...x_{n-1}x_n$ and $x_{<n}:=x_1... x_{n-1}$,
272: $\epstr$ for the empty string, $\l(x)$ for the length of string $x$,
273: and $\omega=x_{1:\infty}$ for infinite sequences.
274: %
275: % Asymptotic notation
276: We abbreviate $\lim_{n\to\infty}[f(n)-g(n)]=0$ by
277: $f(n)\toinfty{n}g(n)$ and say $f$ converges to $g$, without
278: implying that $\lim_{n\to\infty}g(n)$ itself exists. We write
279: $f(x)\geqm  g(x)$ for $g(x)=O(f(x))$.
280: 
281: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
282: \section{Computability Concepts}\label{secCC}
283: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
284: We define several computability concepts weaker than can be captured
285: by halting Turing machines.
286: 
287: %------------------------------%
288: \fdefinition{defCompFunc}{Computable functions}{
289: %------------------------------%
290: We consider functions $f:\Set{N}\to\Set{R}$:
291: \begin{itemize}
292: \item[]
293: $\nq f$ is {\em finitely computable} or {\em recursive} {\it iff}
294: there are Turing machines $T_{1/2}$ with output interpreted as natural
295: numbers and $f(x)={T_1(x)\over T_2(x)}$,
296: \item[]
297: $\nq f$ is {\em approximable} {\it iff}
298: there is a finitely computable function $\phi(\cdot,\cdot)$ with
299: $\lim_{t\to\infty}\phi(x,t)=f(x)$.
300: \item[]
301: $\nq f$ is {\em lower semi-computable} or {\em enumerable} {\it
302: iff} additionally $\phi(x,t)\leq\phi(x,t+1)$.
303: \item[]
304: $\nq f$ is {\em upper semi-computable} or {\em co-enumerable} {\it
305: iff} $[-f]$ is lower semi-computable.
306: %additionally $\phi(x,t)\geq\phi(x,t+1)$.
307: \item[]
308: $\nq f$ is {\em semi-computable} {\it iff} $f$ is lower- {\it or}
309: upper semi-computable.
310: \item[]
311: $\nq f$ is {\em estimable} {\it iff} $f$ is lower- {\it and} upper
312: semi-computable.
313: \end{itemize}
314: }%------------------------------%
315: 
316: \noindent If $f$ is estimable we can finitely compute an
317: $\eps$-approximation of $f$ by upper and lower semi-computing $f$
318: and terminating when differing by less than $\eps$. This means
319: that there is a Turing machine which, given $x$ and $\eps$,
320: finitely computes $\hat y$ such that $|\hat y-f(x)|<\eps$.
321: Moreover it gives an interval estimate $f(x)\in[\hat y-\eps,\hat
322: y+\eps]$. An estimable integer-valued function is finitely
323: computable (take any $\eps<1$).
324: %
325: Note that if $f$ is only approximable or semi-computable we can
326: still come arbitrarily close to $f(x)$ but we cannot devise a
327: terminating algorithm which produces an $\eps$-approximation. In
328: the case of lower/upper semi-computability we can at least
329: finitely compute lower/upper bounds to $f(x)$. In case of
330: approximability, the weakest computability form, even this
331: capability is lost.
332: %
333: In analogy to lower/upper semi-computability one may think of
334: notions like lower/upper estimability but they are easily shown to
335: coincide with estimability. The following implications are valid:
336: 
337: \begin{center}\small
338: \fbox{\parbox{11ex}{recursive=\\ finitely\\ computable}}
339: $\Rightarrow$
340: \fbox{\parbox{9ex}{estimable}}
341: %
342: \parbox{26ex}{\raisebox{-3ex}{$\Rightarrow$} \fbox{
343: \parbox{17ex}{enumerable=\\lower semi-\\ computable}}
344: \raisebox{-3ex}{$\Rightarrow$} \\[2ex]
345: \raisebox{3ex}{$\Rightarrow$} \fbox{
346: \parbox{17ex}{co-enumerable=\\ upper semi-\\
347: computable}} \raisebox{3ex}{$\Rightarrow$}}
348: \fbox{\parbox{11ex}{semi-\\ computable}}
349: $\Rightarrow$
350: \fbox{approximable}
351: \end{center}
352: 
353: \noindent In the following we use the term computable synonymously
354: with finitely computable, but sometimes also generically for some of
355: the computability forms of Definition \ref{defCompFunc}.
356: %
357: What we call {\em estimable} is often just called {\em
358: computable}, but it makes sense to separate the concepts of
359: finite computability and estimability in this work, since the
360: former is conceptually easier and some previous results have only
361: been proved for this case.
362: 
363: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
364: \section{The Universal Prior $\MM$}\label{secUniM}
365: %\subsection{Solomonoff's Universal Prior}
366: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
367: \index{Turing machine!universal}
368: \index{Turing machine!prefix}
369: \index{tape!unidirectional}
370: \index{tape!bidirectional}%
371: \index{semimeasure!universal}
372: %
373: % Universal prior
374: The prefix Kolmogorov complexity $K(x)$ is defined as the length
375: of the shortest binary program $p\in\B^*$ for which a universal prefix
376: Turing machine $U$ (with binary program tape and $\X$ary output
377: tape) outputs string $x\in\X^*$, and similarly $K(x|y)$ in case of side
378: information $y$ \cite{Li:97}:
379: \beqn
380:   K(x)=\min\{\l(p):U(p)=x\},\qquad
381:   K(x|y)=\min\{\l(p):U(p,y)=x\}
382: \eeqn
383: Solomonoff
384: \cite{Solomonoff:64,Solomonoff:78}
385: (with a flaw fixed by Levin \cite{Zvonkin:70})
386: defined (earlier) the closely related
387: quantity, the universal prior $\MM(x)$.
388: %
389: It is defined as the
390: probability that the output of a universal Turing machine starts
391: with $x$ when provided with \idx{fair coin flips} on the input
392: tape. Formally, $\MM$ can be defined as
393: \beq\label{Mdef}
394:   \MM(x)\;:=\;\sum_{p\;:\;U(p)=x*}\nq 2^{-\l(p)}
395: \eeq
396: where the sum is over all so-called minimal programs $p$ for which
397: $U$ outputs a string starting with $x$ (indicated by the $*$).
398: %
399: Before we can discuss the stochastic properties of $\MM$ we
400: need the concept of (semi)measures for strings.
401: 
402: \index{semimeasure!enumerable}
403: %------------------------------%
404: \fdefinition{defSemi}{Continuous (Semi)measures}{
405: %------------------------------%
406: $\mu(x)$ denotes the probability that a sequence starts
407: with string $x$. We call $\mu\geq 0$ a (continuous) semimeasure if
408: $\mu(\epstr)\leq 1$ and $\mu(x)\geq\mu(x0)+\mu(x1)$, and a
409: (probability) measure if equality holds.
410: }%------------------------------%
411: 
412: \noindent We have $\MM(x0)+\MM(x1)<\MM(x)$ because there are
413: programs $p$, which output $x$, neither followed by $0$ nor $1$.
414: They just stop after printing $x$ or continue forever without any
415: further output. Together with $\MM(\epstr)=1$ this shows that $\MM$
416: is a semimeasure, but {\it not} a probability measure. We can now
417: state the fundamental property of $\MM$ \cite{Solomonoff:78}:
418: 
419: %------------------------------%
420: \ftheorem{thUniM}{Universality of $\MM$}{
421: %------------------------------%
422: The universal prior $\MM$ is an enumerable semimeasure which
423: multiplicatively dominates all enumerable semimeasures in the
424: sense that $\MM(x) \;\geqm\; 2^{-K(\rho)}\cdot \rho(x)$
425: for all enumerable semimeasures $\rho$. $\MM$ is enumerable, but not
426: estimable or finitely computable.
427: }%------------------------------%
428: \indxs{multiplicative}{majorization}
429: \indxs{probability distribution}{computable}
430: 
431: % Explanation
432: \noindent The Kolmogorov complexity of a function like $\rho$ is
433: defined as the length of the shortest self-delimiting code of a
434: Turing machine computing this function in the sense of Definition
435: \ref{defCompFunc}. Up to a multiplicative constant, $\MM$ assigns higher
436: probability to all $x$ than any other computable probability
437: distribution.
438: 
439: % Normalization of $\MM$
440: It is possible to normalize $\MM$ to a true probability measure
441: $\MM_{norm}$ \cite{Solomonoff:78,Li:97} with dominance still being
442: true, but at the expense of giving up enumerability ($\MM_{norm}$
443: is still approximable). $\MM$ is more convenient when studying
444: algorithmic questions, but a true probability measure like
445: $\MM_{norm}$ is more convenient when studying stochastic questions.
446: 
447: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
448: \section{Universal Sequence Prediction}\label{secUSP}
449: %\subsection{Solomonoff's Universal Sequence Prediction Scheme}
450: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
451: 
452: % Occam & Epicurus in $\MM =2^-K$
453: In which sense does $\MM$ incorporate Occam's razor and Epicurus'
454: principle of multiple explanations? Since the shortest programs
455: $p$ dominate the sum in $M$, $\MM(x)$ is roughly equal to
456: $2^{-K(x)}$ ($\MM(x)=2^{-K(x)+O(K(\l(x)))}$), i.e.\
457: $\MM$ assigns high probability to simple
458: strings. More useful is to think of $x$ as being the observed
459: history. We see from (\ref{Mdef}) that every program $p$
460: consistent with history $x$ is allowed to contribute to $\MM$
461: (Epicurus). On the other hand shorter programs give significantly
462: larger contribution (Occam). How does all this affect prediction?
463: If $\MM(x)$ describes our (subjective) prior belief in $x$, then
464: $\MM(y|x):=\MM(xy)/\MM(x)$ must be our posterior belief in $y$.
465: %
466: From the symmetry of algorithmic information $K(xy)\approx
467: K(y|x)+K(x)$, and $\MM(x)\approx 2^{-K(x)}$ and $\MM(xy)\approx
468: 2^{-K(xy)}$ we get $\MM(y|x)\approx 2^{-K(y|x)}$. This tells us
469: that $\MM$ predicts $y$ with high probability iff $y$ has an easy
470: explanation, given $x$ (Occam \& Epicurus).
471: 
472: % Caution
473: The above qualitative discussion should not create the impression
474: that $\MM(x)$ and $2^{-K(x)}$ always lead to predictors of
475: comparable quality. Indeed in the online/incremental setting,
476: $K(y)=O(1)$ invalidates the consideration above. The proof of
477: (\ref{eukdist}) below, for instance, depends on $\MM$ being a
478: semimeasure and the chain rule being exactly true, neither of them is
479: satisfied by $2^{-K(x)}$. See \cite{Hutter:03unimdl} for a more
480: detailed analysis.
481: 
482: % Solomonoff's universal sequence prediction
483: \index{sequence prediction!Solomonoff} Sequence
484: prediction algorithms try to predict the continuation $x_t\in\X$
485: of a given sequence $x_1...x_{t-1}$.
486: %
487: We assume that the true sequence is
488: drawn from a computable
489: probability distribution $\mu$, i.e.\ the true (objective)
490: probability of $x_{1:t}$ is $\mu(x_{1:t})$. The probability of
491: $x_t$ given $x_{<t}$ hence is
492: $\mu(x_t|x_{<t})=\mu(x_{1:t})/\mu(x_{<t})$.
493: %
494: Solomonoff's \cite{Solomonoff:78} central result is that $\MM$
495: converges to $\mu$. More precisely, for binary alphabet, he showed that
496: \beq\label{eukdist}
497:   \sum_{t=1}^\infty
498:   \nq\nq\;\sum_{\qquad x_{<t}\in\B^{t-1}}\nq\nq\;
499:   \mu(x_{<t}) \Big(\MM(0|x_{<t})-\mu(0|x_{<t})\Big)^2
500:   \;\leq\;
501:   {\odt}\ln 2\!\cdot\!K(\mu)+O(1) \;<\; \infty.
502: \eeq
503: The infinite sum can only be finite if the difference
504: $\MM(0|x_{<t})-\mu(0|x_{<t})$ tends to zero for $t\to\infty$ with
505: $\mu$ probability $1$ (see Definition \ref{defConv}$(i)$ and
506: \cite{Hutter:01alpha} or Section \ref{secConv} for general
507: alphabet). This holds for {\it any} computable probability
508: distribution $\mu$. The reason for the astonishing property of a
509: single (universal) function to converge to {\it any} computable
510: probability distribution lies in the fact that the sets of
511: $\mu$-random sequences differ for different $\mu$. Past data
512: $x_{<t}$ are exploited to get a (with $t\to\infty$) improving
513: estimate $\MM(x_t|x_{<t})$ of $\mu(x_t|x_{<t})$.
514: 
515: 
516: % Bayes-mixtures
517: The universality property (Theorem \ref{thUniM}) is the central
518: ingredient in the proof of (\ref{eukdist}). The proof
519: involves the construction of a semimeasure $\xi$
520: whose dominance is obvious. The hard part is to show its
521: enumerability and equivalence to $\MM$.
522: Let $\M$ be the (countable) set of all enumerable semimeasures
523: and define
524: \beq\label{xidef}
525:   \xi(x):=\sum_{\nu\in\M}2^{-K(\nu)}\nu(x).
526: \eeq
527: Then dominance
528: \beq\label{xidom}
529:  \xi(x)\geq 2^{-K(\nu)}\nu(x)\quad\forall\,\nu\in\M
530: \eeq
531: is obvious. Is $\xi$ lower semi-computable? To answer this
532: question one has to be more precise. Levin \cite{Zvonkin:70} has
533: shown that the set of {\em all} lower semi-computable semimeasures
534: is enumerable (with repetitions). For this (ordered multi) set
535: $\M=\M_{enum}^{semi}:=\{\nu_1,\nu_2,\nu_3,...\}$ and
536: $K(\nu_i):=K(i)$ one can easily see that $\xi$ is lower
537: semi-computable. Finally proving $\MM(x)\eqm\xi(x)$ also
538: establishes universality of $\MM$ (see \cite{Solomonoff:78,Li:97}
539: for details).
540: 
541: The advantage of $\xi$ over $\MM$ is that it immediately
542: generalizes to arbitrary weighted sums of (semi)measures
543: for arbitrary countable $\M$.
544: 
545: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
546: \section{Universal (Semi)Measures}\label{secUSM}
547: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
548: 
549: What is so special about the set of all enumerable
550: semimeasures $\M_{enum}^{semi}$? The larger we choose $\M$ the less restrictive
551: is the assumption that $\M$ should contain the true distribution
552: $\mu$, which will be essential throughout the paper.
553: %
554: Why not restrict to the still rather general class of estimable
555: or finitely computable (semi)measures? It is clear that for every
556: countable set $\M$,
557: \beq\label{defxi}
558:   \xi(x):=\xi_\M(x):=\sum_{\nu\in\M} w_\nu \nu(x)
559:   \qmbox{with} \sum_{\nu\in\M}w_\nu\leq 1 \qmbox{and} w_\nu>0
560: \eeq
561: dominates all $\nu\in\M$. This dominance is
562: necessary for the desired convergence $\xi\to\mu$ similarly to
563: (\ref{eukdist}). The question is what properties $\xi$ possesses.
564: The distinguishing property of $\M_{enum}^{semi}$ is that $\xi$ is
565: itself an element of $\M_{enum}^{semi}$. When concerned with
566: predictions, $\xi_\M\in\M$ is not by itself an important property,
567: but whether $\xi$ is computable in one of the senses of Definition
568: \ref{defCompFunc}. We define
569: \bqan
570:  \M_1\geqm\M_2 & :\Leftrightarrow &
571:  \mbox{there is an element of $\M_1$ which dominates all elements of
572:  $\M_2$} \\
573:  & :\Leftrightarrow &
574: \exists\rho\!\in\!\M_1\;\forall\nu\!\in\!\M_2\;\exists w_\nu\!>\!0
575: \;\forall x:\rho(x)\!\geq\!w_\nu\nu(x).
576: \eqan
577: $\geqm $ is transitive (but not necessarily reflexive) in the
578: sense that $\M_1 \geqm \M_2 \geqm \M_3$ implies $\M_1 \geqm \M_3$
579: and $\M_0 \supseteq \M_1 \geqm \M_2 \supseteq \M_3$ implies $\M_0
580: \geqm \M_3$.
581: %
582: For the computability concepts introduced in Section \ref{secCC}
583: we have the following proper set inclusions
584: \beqn
585: \begin{array}{ccccccc}
586:   \M_{comp}^{msr}  & \subset & \M_{est}^{msr}  & \equiv  & \M_{enum}^{msr}  & \subset & \M_{appr}^{msr} \\
587:         \cap       &         &      \cap       &         &       \cap       &         &     \cap        \\
588:   \M_{comp}^{semi} & \subset & \M_{est}^{semi} & \subset & \M_{enum}^{semi} & \subset & \M_{appr}^{semi}
589: \end{array}
590: \eeqn
591: %
592: where $\M^{msr}_c$ stands for the set of all probability measures
593: of appropriate computability type $c\in\{$comp=finitely
594: computable, est=estimable, enum=enumerable,
595: appr=approximable$\}$, and similarly for semimeasures
596: $\M^{semi}_c$. From an enumeration of a measure $\rho$ one can
597: construct a co-enumeration by exploiting
598: $\rho(x_{1:n})=1-\sum_{y_{1:n}\neq x_{1:n}}\rho(y_{1:n})$. This
599: shows that every enumerable measure is also co-enumerable, hence
600: estimable, which proves the identity $\equiv$ above.
601: 
602: With this notation, Theorem \ref{thUniM} implies
603: $\M_{enum}^{semi}\geqm\M_{enum}^{semi}$. Transitivity allows us to
604: conclude, for instance, that
605: $\M_{appr}^{semi}\geqm\M_{comp}^{msr}$, i.e.\ that there is an
606: approximable semimeasure which dominates all computable measures.
607: 
608: The standard ``diagonalization'' way of proving
609: $\M_1\stackrel\times{\not\geq}\M_2$ is to take an arbitrary
610: $\mu\in\M_1$ and ``increase'' it to $\rho$ such that
611: $\mu\stackrel\times{\not\geq}\rho$ and show that $\rho\in\M_2$.
612: There are $7\times 7$ combinations of (semi)measures $\M_1$ with
613: $\M_2$ for which $\M_1\geqm\M_2$ could be true or false. There are
614: four basic cases, explicated in the following theorem, from which
615: all 49 combinations displayed in Table \ref{tabUniSMsr}
616: follow by transitivity.
617: 
618: %------------------------------%
619: \ftheorem{thNoUniApp}{Universal (semi)measures}{
620: %------------------------------%
621: A semimeasure $\rho$ is said to be universal for $\M$ if it
622: multiplicatively dominates all elements of $\M$ in the sense
623: $\forall\nu\exists w_\nu>0:\rho(x)\geq w_\nu\nu(x)\forall x$. The
624: following holds true:
625: \begin{itemize}
626: \item[$o)$]
627: $\exists\rho:\{\rho\}\geqm\M$: For every countable set
628: of (semi)measures $\M$, there is a (semi)measure which dominates
629: all elements of $\M$.
630: \item[$i)$]
631: $\M_{enum}^{semi}\geqm\M_{enum}^{semi}$:
632: The class of enumerable semimeasures {\em contains}
633: a universal element.
634: \item[$ii)$]
635: $\M_{appr}^{msr}\geqm\M_{enum}^{semi}$:
636: There {\em is} an approximable measure which dominates all enumerable
637: semimeasures.
638: \item[$iii)$]
639: $\M_{est}^{semi}\stackrel\times{\not\geq}\M_{comp}^{msr}$: There is
640: {\em no} estimable semimeasure which dominates all computable
641: measures.
642: \item[$iv)$]
643: $\M_{appr}^{semi}\stackrel\times{\not\geq}\M_{appr}^{msr}$: There is
644: {\em no} approximable semimeasure which dominates all approximable
645: measures.
646: \end{itemize}
647: }%------------------------------%
648: 
649: \begin{table}[thb]
650: \ftablex{tabUniSMsr}{Existence of universal (semi)measures}{%
651: The entry in row $r$ and column $c$ indicates whether there is an
652: $r$-able (semi)measure $\rho$ universal for the set $\M$ which contains all
653: $c$-able (semi)measures, where $r,c\in\{$comput, estimat, enumer,
654: approxim$\}$. Enumerable measures are estimable. This is the
655: reason why the enum. row and column in case of measures is
656: missing. The superscript indicates from which part of Theorem
657: \ref{thNoUniApp} the answer follows. For the bold face entries
658: directly, for the others using transitivity of $\geqm $.
659: \begin{center}
660: \begin{tabular}{|c|c||c|c|c|c||c|c|c|}\hline
661:       $\nwarrow$ &  $\M$ & \multicolumn{4}{c||}{semimeasure} & \multicolumn{3}{c|}{measure}\\ \hline
662: $\rho$&$\searrow$& comp.      & est.       & enum.         & appr.     & comp.         & est.       & appr.        \\ \hline\hline
663:       s  & comp. & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & no$^{iii}$    & no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}
664:       e  & est.  & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & {\bf no}$^{\bf iii}$& no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}
665:       m  & enum. & yes$^{i}$  & yes$^{i}$  & {\bf yes}$^{\bf i}$ & no$^{iv}$ & yes$^{i}$     & yes$^{i}$  & no$^{iv}$    \\ \cline{2-9}
666:       i  &appr.  & yes$^{i}$  & yes$^{i}$  & yes$^{i}$     & no$^{iv}$ & yes$^{i}$     & yes$^{i}$  & {\bf no}$^{\bf iv}$\\ \hline\hline
667:       m  & comp. & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & no$^{iii}$    & no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}
668:       s  & est.  & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & no$^{iii}$    & no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}
669:       r  &appr.  & yes$^{ii}$ & yes$^{ii}$ & {\bf yes}$^{\bf ii}$& no$^{iv}$ & yes$^{ii}$    & yes$^{ii}$ & no$^{iv}$    \\ \hline
670: \end{tabular}
671: \end{center}}
672: \end{table}
673: 
674: \noindent If we ask for a universal (semi)measure which at least satisfies
675: the weakest form of computability, namely being approximable, we
676: see that the largest dominated set among the 7 sets defined above
677: is the set of enumerable semimeasures. This is the reason why
678: $\M_{enum}^{semi}$ plays a special role.
679: On the other hand, $\M_{enum}^{semi}$ is not the largest set
680: dominated by an approximable semimeasure, and indeed no such
681: largest set exists. One may, hence, ask for ``natural'' larger
682: sets $\M$. One such set, namely the set of cumulatively enumerable
683: semimeasures $\M_{CEM}$, has recently been discovered by
684: Schmidhuber \cite{Schmidhuber:02gtm}, for which even
685: $\xi_{CEM}\in\M_{CEM}$ holds.
686: 
687: \noindent Theorem \ref{thNoUniApp} also holds for {\em discrete
688: (semi)measures} $P$ defined as follows:
689: 
690: \index{semimeasure!enumerable}
691: %------------------------------%
692: \fdefinition{defDSemi}{Discrete (Semi)measures}{
693: %------------------------------%
694: $P(x)$ denotes the probability of $x\in\Set N$. We call
695: $P:\Set{N}\to[0,1]$ a discrete (semi)measure if $\sum_{x\in\Set{N}}
696: P(x)\stackrel{(<)}=1$.
697: }%------------------------------%
698: 
699: \noindent Theorem \ref{thNoUniApp}
700: %$(o)$ is elementary,
701: $(i)$ is Levin's major result \cite[Th.4.3.1 \& Th.4.5.1]{Li:97}, and %
702: $(ii)$ is due to Solomonoff \cite{Solomonoff:78}. %
703: The proof of
704: $\M_{comp}^{semi}\stackrel\times{\not\geq}\M_{comp}^{semi}$ in
705: \cite[p249]{Li:97} contains minor errors and is not extensible to
706: $(iii)$, and the proof in \cite[p276]{Li:97} only applies to
707: an infinite alphabet and not to the binary/finite case considered
708: here. A complete proof of $(o)$--$(iv)$ for discrete and continuous
709: (semi)measures is given elsewhere.
710: 
711: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
712: \section{Posterior Convergence}\label{secConv}
713: %\subsection{Definition of Random Sequences}
714: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
715: 
716: We have investigated in detail the computational properties of
717: various mixture distributions $\xi$. A mixture $\xi_\M$
718: multiplicatively dominates all distributions in $\M$. We have
719: mentioned that dominance implies posterior convergence. In this
720: section we present in more detail what dominance implies and what
721: not.
722: 
723: Convergence of $\xi(x_t|x_{<t})$ to $\mu(x_t|x_{<t})$ with
724: $\mu$-probability 1 tells us that $\xi(x_t|x_{<t})$ is close to
725: $\mu(x_t|x_{<t})$ for sufficiently large $t$ and ``most''
726: sequences $x_{1:\infty}$. It says nothing about the speed of
727: convergence, nor whether convergence is true for any {\em particular}
728: sequence (of measure 0). Convergence {\em in mean sum} defined
729: below is intended to capture the rate of convergence, whereas
730: Martin-L\"{o}f randomness is used to capture convergence
731: properties for individual sequences.
732: 
733: Martin-L\"{o}f randomness is a very important concept of
734: randomness of individual sequences, which is closely related to
735: Kolmogorov complexity and Solomonoff's universal prior. Levin gave
736: a characterization equivalent to Martin-L\"{o}f's original
737: definition \cite{Levin:73random}:
738: 
739: %------------------------------%
740: \ftheorem{defML}{Martin-L\"{o}f random sequences}{
741: %------------------------------%
742: A sequence $x_{1:\infty}$ is $\mu$-Martin-L\"{o}f random
743: ($\mu$.M.L.) iff there is a constant $c$ such that
744: $\MM(x_{1:n})\leq c\cdot \mu(x_{1:n})$ for all $n$.
745: }%------------------------------%
746: 
747: \noindent
748: One can show that a $\mu$.M.L.\ random sequence $x_{1:\infty}$
749: passes {\em all} thinkable effective randomness tests, e.g.\ the
750: law of large numbers, the law of the iterated logarithm, etc.
751: In particular, the set of all $\mu$.M.L. random sequences has
752: $\mu$-measure 1.
753: %
754: The following generalization is natural when considering general
755: Bayes-mixtures $\xi$ as in this work:
756: 
757: %------------------------------%
758: \fdefinition{defmuMr}{$\mu/\xi$-random sequences}{
759: %------------------------------%
760: A sequence $x_{1:\infty}$ is called $\mu/\xi$-random
761: ($\mu.\xi$.r.) iff there is a constant $c$ such that
762: $\xi(x_{1:n})\leq c\cdot \mu(x_{1:n})$ for all $n$.
763: }%------------------------------%
764: 
765: Typically, $\xi$ is a mixture over some $\M$ as defined in
766: (\ref{xidef}), in which case the reverse inequality
767: $\xi(x)\geqm\mu(x)$ is also true (for all $x$). For finite $\M$ or
768: if $\xi\in\M$, the definition of $\mu/\xi$-randomness depends only
769: on $\M$, and not on the specific weights used in $\xi$. For
770: $\M=\M_{enum}^{semi}$, $\mu/\xi$-randomness is just $\mu$.M.L.\
771: randomness. The larger $\M$, the more patterns are recognized as
772: non-random.
773: %($\M_{enum}^{semi}\supset\M_\Theta$).
774: Roughly speaking, those regularities characterized by some
775: $\nu\in\M$ are recognized by $\mu/\xi$-randomness, i.e.\ for
776: $\M\subset\M_{enum}^{semi}$ some $\mu/\xi$-random strings may not
777: be M.L.\ random.
778: %
779: Other randomness concepts, e.g.\ those by Schnorr, Ko, van
780: Lambalgen, Lutz, Kurtz, von Mises, Wald, and Church (see
781: \cite{Wang:96,Lambalgen:87,Schnorr:71}), could possibly also be
782: characterized in terms of $\mu/\xi$-randomness for particular
783: choices of $\M$.
784: 
785: %------------------------------%
786: %\paragraph{Convergence of Random Sequences}%\label{secConvRSeq}
787: %------------------------------%
788: \indxs{random sequence}{convergence} A classical (non-random)
789: real-valued sequence $a_t$ is defined to converge to $a_*$, short
790: $a_t\to a_*$ if $\forall\eps>0\,\exists t_0\forall t\geq
791: t_0:|a_t-a_*|<\eps$. We are interested in convergence properties
792: of random sequences $z_t(\omega)$ for $t\to\infty$ (e.g.\
793: $z_t(\omega)=\xi(\omega_t|\omega_{<t})-\mu(\omega_t|\omega_{<t})$).
794: %
795: We denote $\mu$-expectations by $\E$. The expected value of a
796: function $f:\X^t\to\Set R$, dependent on $x_{1:t}$, independent of
797: $x_{t+1:\infty}$, and possibly undefined on a set of $\mu$-measure
798: 0, is $\E[f] =
799: \sumprime_{\!x_{1:t}\in\X^t}\mu(x_{1:t})f(x_{1:t})$. The prime
800: denotes that the sum is restricted to $x_{1:t}$ with
801: $\mu(x_{1:t})\neq 0$. Similarly we use $\P[..]$ to denote the
802: $\mu$-probability of event $[..]$.
803: %
804: We define four convergence concepts for random sequences.
805: 
806: \index{convergence!with probability 1}%
807: \index{convergence!in the mean}
808: \index{convergence!in mean sum}
809: \index{convergence!in probability}
810: \index{convergence!Martin-L\"of}
811: \index{convergence!$\M$}
812: %------------------------------%
813: \fdefinition{defConv}{Convergence of random sequences}{
814: %------------------------------%
815: Let $z_1(\omega),z_2(\omega),...$ be a sequence of real-valued
816: random variables. $z_t$ is said to
817: converge for $t\to\infty$ to random variable $z_*(\omega)$
818: \begin{itemize}\itemindent8ex
819: \item[$i)$] with probability 1 (w.p.1) $:\Leftrightarrow$
820:   $\P[\{\omega:z_t\to z_*\}]=1$,
821: %  \\ $\Leftrightarrow$
822: %  $\forall\eps:\P[\sup_{s\geq t}|z_t-z_s|\geq\eps]\to 0$ for $t\to\infty$,
823: \item[$ii)$] in mean sum (i.m.s.) $:\Leftrightarrow$
824: $\sum_{t=1}^\infty\E[(z_t-z_*)^2]<\infty$,
825: \item[$iii)$] for every $\mu$-Martin-L{\"o}f random sequence ($\mu$.M.L.) $:\Leftrightarrow$ \\
826: \hspace*{8ex}$\forall\omega:$ $[\exists c\forall n:
827: \MM(\omega_{1:n})\leq c\mu(\omega_{1:n})]$
828:   implies $z_t(\omega)\to z_*(\omega)$ for $t\to\infty$,
829: \item[$iv)$] for every $\mu/\xi$-random sequence ($\mu.\xi$.r.) $:\Leftrightarrow$ \\
830: \hspace*{8ex}$\forall\omega:$ $[\exists c\forall n:
831: \xi(\omega_{1:n})\leq c\mu(\omega_{1:n})]$
832:   implies $z_t(\omega)\to z_*(\omega)$ for $t\to\infty$.
833: \end{itemize}
834: }%------------------------------%
835: 
836: \noindent In statistics, $(i)$ is the ``default'' characterization of
837: convergence of random sequences.
838: %
839: Convergence i.m.s.\ $(ii)$ is very strong: it
840: provides a rate of convergence in the sense that the expected
841: number of times $t$ in which $z_t$ deviates more than $\eps$ from
842: $z_*$ is finite and bounded by
843: $\sum_{t=1}^\infty\E[(z_t-z_*)^2]/\eps^2$.
844: Nothing can be said for {\em which} $t$ these deviations occur.
845: If, additionally, $|z_t-z_*|$ were monotone decreasing, then
846: $|z_t-z_*|=o(t^{-1/2})$ could be concluded.
847: %
848: $(iii)$ uses Martin-L\"{o}f's notion of randomness of {\em individual}
849: sequences to define convergence M.L. Since this work
850: deals with general Bayes-mixtures $\xi$, we generalized in $(iv)$
851: the definition of convergence M.L.\ based on $\MM$ to
852: convergence $\mu.\xi$.r.\ based on $\xi$ in a natural way.
853: %
854: One can show that convergence i.m.s.\ implies convergence w.p.1.
855: Also convergence M.L.\ implies convergence w.p.1.
856: \index{random sequence!convergence relations}
857: \index{convergence!relations}
858: %
859: Universality of $\xi$ implies the following posterior convergence results:
860: 
861: %------------------------------%
862: %\paragraph{Convergence of $\xi$ to $\mu$}\label{subsecConv}
863: %------------------------------%
864: \index{convergence!$\xi$ to $\mu$}
865: 
866: %------------------------------%
867: \ftheorem{thConv}{Convergence of $\xi$ to $\mu$}{
868: %------------------------------%
869: Let there be sequences $x_1x_2...$ over a finite alphabet $\X$
870: drawn with probability $\mu(x_{1:n})$ for the first $n$
871: symbols, where $\mu\in\M$ is a measure. The universal posterior
872: probability $\xi(x_t|x_{<t})$
873: of the next symbol $x_t$ given $x_{<t}$ %defined in (\ref{xidefsp})
874: is related to the true posterior probability $\mu(x_t|x_{<t})$
875: in the following way:\vspace{-1ex}
876: \beqn
877:    \sum_{t=1}^n\E{\textstyle\left[\left(\sqrt{{\xi(x_t|x_{<t})
878:           \over\mu(x_t|x_{<t})}}-1\right)^2\right]} \;\leq\;
879:    \sum_{t=1}^n\E\bigg[\sum_{x'_t}
880:         \left(\sqrt{\xi(x'_t|x_{<t})}-\sqrt{\mu(x'_t|x_{<t})}\right)^2\bigg]
881:         \;\leq\; \ln{w_\mu^{-1}} \;<\; \infty
882: \eeqn
883: where $w_\mu$ is the weight (\ref{defxi}) of $\mu$ in $\xi$.
884: }%------------------------------%
885: 
886: \noindent Theorem \ref{thConv} implies
887: \beqn
888:  \mbox{$\sqrt{\xi(x'_t|x_{<t})} \to \sqrt{\mu(x'_t|x_{<t})}$
889:  for any $x'_t$ and
890:  $\sqrt{{\xi(x_t|x_{<t})\over\mu(x_t|x_{<t})}} \to 1$, both
891:  i.m.s.\ for $t\to\infty$}.
892: \eeqn
893: %
894: %Gacs Martingale proof
895: \indxs{semi-martingale}{convergence}\index{martingales}%
896: \noindent The latter strengthens the result
897: $\xi(x_t|x_{<t})/\mu(x_t|x_{<t})\to 1$ w.p.1 derived by G\'acs in
898: \cite[Th.5.2.2]{Li:97} in that it also provides the ``speed'' of
899: convergence.
900: 
901: Note also the subtle difference between the two convergence
902: results. For {\em any} sequence $x'_{1:\infty}$ (possibly constant
903: and not necessarily $\mu$-random),
904: $\mu(x'_t|x_{<t})-\xi(x'_t|x_{<t})$ converges to zero w.p.1
905: (referring to $x_{1:\infty}$), but no statement is possible for
906: $\xi(x'_t|x_{<t})/\mu(x'_t|x_{<t})$, since
907: $\lim\,\inf\mu(x'_t|x_{<t})$ could be zero. On the other hand, if
908: we stay {\em on} the $\mu$-random sequence ($x'_{1:\infty} =
909: x_{1:\infty}$), we have $\xi(x_t|x_{<t})/\mu(x_t|x_{<t})
910: \to 1$ (whether $\inf\mu(x_t|x_{<t})$ tends to zero or not does
911: not matter).
912: %
913: Indeed, it is easy to see that $\xi(1|0_{<t})/\mu(1|0_{<t})\propto
914: t\to\infty$ diverges for $\M=\{\mu,\nu\}$, $\mu(1|x_{<t}):=\odt
915: t^{-3}$ and $\nu(1|x_{<t}):=\odt t^{-2}$, although $0_{1:\infty}$ is
916: $\mu$-random. % \cite{Hutter:01op}. No longer there.
917: %
918: 
919: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
920: \section{Convergence in Martin-L{\"o}f Sense}\label{secMLconv}
921: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
922: 
923: An interesting open question is whether $\xi$ converges to $\mu$
924: (in difference or ratio) individually for all Martin-L\"{o}f
925: random sequences. Clearly, convergence $\mu$.M.L. may at most fail
926: for a set of sequences with $\mu$-measure zero. A convergence
927: M.L.\ result would be particularly interesting and natural for
928: Solomonoff's universal prior $M$, since M.L.\ randomness can be
929: defined in terms of $\MM$ (see Theorem \ref{defML}). Attempts to
930: convert the bounds in Theorem \ref{thConv} to effective
931: $\mu$.M.L.\ randomness tests fail, since $M(x_t|x_{<t})$ is not
932: enumerable. The proof of $M/\mu\stackrel{\mbox{\tiny M.L.}}\longrightarrow 1$
933: given in \cite[Th.5.2.2]{Li:97} and \cite[Th.10]{Vitanyi:00} is
934: incomplete.$\!$\footnote{The formulation of their Theorem is quite
935: misleading in general: ``{\it Let $\mu$ be a positive recursive
936: measure. If the length of $y$ is fixed and the length of $x$ grows
937: to infinity, then $M(y|x)/\mu(y|x)\to 1$ with $\mu$-probability
938: one. The infinite sequences $\omega$ with prefixes $x$ satisfying
939: the displayed asymptotics are precisely [`$\Rightarrow$' {\em and}
940: `$\Leftarrow$'] the $\mu$-random sequences.}'' First, for
941: off-sequence $y$ convergence w.p.1 does not hold ($xy$ must be
942: demanded to be a prefix of $\omega$). Second, the proof of
943: `$\Leftarrow$' is loopy (see main text). Last, `$\Rightarrow$' is
944: given without proof and is probably wrong. Also the assertion in
945: \cite[Th.5.2.1]{Li:97} that $S_t:=\E\sum_{x'_t}
946: (\mu(x'_t|x_{<t})-M(x'_t|x_{<t}))^2$ converges to zero faster than
947: $1/t$ cannot be made, since $S_t$ may not decrease monotonically.}
948: The implication ``$\MM(x_{1:n})\leq c\cdot\mu(x_{1:n})\forall
949: n\Rightarrow \lim_{n\to\infty}\MM(x_{1:n})/\mu(x_{1:n})$ exists''
950: has been used, but not proven, and may indeed be wrong.
951: 
952: Vovk \cite{Vovk:87} shows that, for two finitely computable
953: semimeasures $\mu$ and $\rho$ and any sequence $x_{1:\infty}$ that
954: is $\mu$ {\em and} $\rho$ M.L.\ random,
955: \beqn
956: \sum_{t=1}^\infty\sum_{x'_t}\left(\sqrt{\mu(x'_t|x_{<t})}-\sqrt{\rho(x'_t|x_{<t})}\right)^2<\infty
957: \qmbox{and}
958: \sum_{t=1}^\infty\left({\rho(x_t|x_{<t})\over\mu(x_t|x_{<t})}-1\right)^2<\infty.
959: \eeqn
960: If $\MM$ were recursive, then this would imply posterior
961: $\MM\to\mu$ and $\MM/\mu\to 1$ for every $\mu$.M.L.\ random
962: sequence $x_{1:\infty}$, since {\em every} sequence is $\MM$.M.L.\
963: random. Since $\MM$ is {\em not} recursive, Vovk's theorem cannot
964: be applied and it is not obvious how to generalize it. So the
965: question of individual convergence remains open. More generally,
966: one may ask whether $\xi_\M\to\mu$ for every $\mu/\xi$-random
967: sequence. It turns out that this is true for some $\M$, but false for others.
968: 
969: %------------------------------%
970: \ftheorem{thMLConv}{$\mu/\xi$-convergence of $\xi$ to $\mu$}{
971: %------------------------------%
972: Let $\X=\B$ be binary and
973: $\M_\Theta:=\{\mu_\th:\mu_\th(1|x_{<t})=\th\,\forall t,\;
974: \th\in\Theta\}$ be the set of Bernoulli($\th$) distributions
975: with parameters $\th\in\Theta$. Let $\Theta_D$ be a countable
976: dense subset of $[0,1]$, e.g.\ $[0,1]\cap\Set Q$ and let $\Theta_G$
977: be a countable subset of $[0,1]$ with a gap in the sense that
978: there exist $0<\th_0<\th_1<1$ such that
979: $[\th_0,\th_1]\cap\Theta_G=\{\th_0,\th_1\}$, e.g.\
980: $\Theta_G=\{\odf,\odt\}$ or $\Theta_G=([0,{1\over
981: 4}]\cup[{1\over 2},1])\cap\Set Q$. Then
982: \begin{itemize}
983: \item[$i)$] If $x_{1:\infty}$ is $\mu/\xi_{\M_{\Theta_D}}$ random with
984: $\mu\in\M_{\Theta_D}$, then $\xi_{\M_{\Theta_D}}(x_t|x_{<t})\to\mu(x_t|x_{<t})$.
985: \item[$ii)$] There are $\mu\in\M_{\Theta_G}$ and $\mu/\xi_{\M_{\Theta_G}}\!\!$
986: random $x_{1:\infty}$ for which
987: $\xi_{\M_{\Theta_G}}\!\!(x_t|x_{<t})\not\to\mu(x_t|x_{<t})$.
988: \end{itemize}\vspace{-1ex}
989: }%------------------------------%
990: 
991: \noindent Our original/main motivation for studying
992: $\mu/\xi$-randomness is the implication of Theorem \ref{thMLConv}
993: that $\MM\stackrel{\mbox{\tiny M.L.}}\longrightarrow\mu$ cannot be
994: decided from $M$ being a mixture distribution or from the
995: universality property (Theorem \ref{thUniM}) alone. Further
996: structural properties of $\M_{enum}^{semi}$ have to be employed.
997: For Bernoulli sequences, convergence $\mu.\xi_{\M_\Theta}$.r.\ is
998: related to denseness of $\M_\Theta$. Maybe a denseness
999: characterization of $\M_{enum}^{semi}$ can solve the question of
1000: convergence M.L.\ of $M$. The property $\MM\in\M_{enum}^{semi}$ is
1001: also not sufficient to resolve this question, since there are
1002: $\M\ni\xi$ for which $\xi\stackrel{\mu.\xi.r}\longrightarrow\mu$
1003: and $\M\ni\xi$ for which
1004: $\xi\not\stackrel{\mu.\xi.r}\longrightarrow\mu$. Theorem
1005: \ref{thMLConv} can be generalized to i.i.d.\ sequences over
1006: general finite alphabet $\X$.
1007: 
1008: The idea of the proof of $(ii)$ is to construct a sequence $x_{1:\infty}$
1009: which is $\mu_{\th_0}/\xi$-random {\em and} $\mu_{\th_1}/\xi$-random
1010: for $\th_0\neq\th_1$. This is possible if and only if $\Theta$
1011: contains a gap and $\th_0$ and $\th_1$ are the boundaries of the
1012: gap. Obviously $\xi(1|x_{<t})$ cannot converge to $\th_0$ {\em and} $\th_1$,
1013: thus proving non-convergence for $\mu/\xi$-random sequences. For no $\th\in[0,1]$ will this
1014: $x_{1:\infty}$ be $\mu_\th$ M.L.-random. Finally, the proof of
1015: Theorem \ref{thMLConv}
1016: makes essential use of the mixture representation of $\xi$, as
1017: opposed to the proof of Theorem \ref{thConv} which only needs
1018: dominance $\xi\geqm\M$.
1019: 
1020: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1021: \section{Conclusions}\label{secConc}
1022: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1023: 
1024: For a hierarchy of four computability definitions, we completed
1025: the classification of the existence of computable (semi)measures
1026: dominating all computable (semi)measures. Dominance is an important
1027: property of a prior, since it implies rapid convergence of the
1028: corresponding posterior with probability one.
1029: %
1030: A strengthening would be convergence for all Martin-L{\"o}f (M.L.)
1031: random sequences. This seems natural, since M.L.\ randomness can
1032: be defined in terms of Solomonoff's prior $M$, so there is a close
1033: connection.
1034: %
1035: Contrary to what was believed before, the question of posterior
1036: convergence $M/\mu\to 1$ for all M.L.\ random sequences is still
1037: open. We introduced a new flexible notion of $\mu/\xi$-randomness
1038: which contains Martin-L{\"o}f randomness as a special case. Though
1039: this notion may have a wider range of application, the main
1040: purpose for its introduction was to show that standard proof
1041: attempts at proving $M/\mu\stackrel{\mbox{\tiny M.L.}}\longrightarrow 1$ based on
1042: dominance alone must fail. This follows from the
1043: derived result that the validity of $\xi/\mu\to 1$ for
1044: $\mu/\xi$-random sequences depends on the Bayes mixture $\xi$.
1045: 
1046: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1047: %         Bibliography        %
1048: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1049: {\small
1050: \begin{thebibliography}{Wan96}
1051: 
1052: \bibitem[Hut01]{Hutter:01alpha}
1053: M.~Hutter.
1054: \newblock Convergence and error bounds of universal prediction for general
1055:   alphabet.
1056: \newblock {\em Proceedings of the 12th European Conference on Machine Learning
1057:   (ECML-2001)}, pages 239--250, 2001.
1058: 
1059: \bibitem[Hut03]{Hutter:03unimdl}
1060: M.~Hutter.
1061: \newblock Sequence prediction based on monotone complexity.
1062: \newblock Technical Report IDSIA-09-03, 2003.
1063: 
1064: \bibitem[Lam87]{Lambalgen:87}
1065: {M. van} Lambalgen.
1066: \newblock {\em Random Sequences}.
1067: \newblock PhD thesis, University of Amsterdam, 1987.
1068: 
1069: \bibitem[Lev73]{Levin:73random}
1070: L.~A. Levin.
1071: \newblock On the notion of a random sequence.
1072: \newblock {\em Soviet Math. Dokl.}, 14(5):1413--1416, 1973.
1073: 
1074: \bibitem[LV97]{Li:97}
1075: M.~Li and P.~M.~B. Vit\'anyi.
1076: \newblock {\em An introduction to {Kolmogorov} complexity and its
1077:   applications}.
1078: \newblock Springer, 2nd edition, 1997.
1079: 
1080: \bibitem[Sch71]{Schnorr:71}
1081: C.~P. Schnorr.
1082: \newblock {\em Zuf{\"a}lligkeit und Wahrscheinlichkeit}.
1083: \newblock Springer, Berlin, 1971.
1084: 
1085: \bibitem[Sch02]{Schmidhuber:02gtm}
1086: J.~Schmidhuber.
1087: \newblock Hierarchies of generalized {Kolmogorov} complexities and
1088:   nonenumerable universal measures computable in the limit.
1089: \newblock {\em International Journal of Foundations of Computer Science},
1090:   13(4):587--612, 2002.
1091: 
1092: \bibitem[Sol64]{Solomonoff:64}
1093: R.~J. Solomonoff.
1094: \newblock A formal theory of inductive inference: Part 1 and 2.
1095: \newblock {\em Inform. Control}, 7:1--22, 224--254, 1964.
1096: 
1097: \bibitem[Sol78]{Solomonoff:78}
1098: R.~J. Solomonoff.
1099: \newblock Complexity-based induction systems: comparisons and convergence
1100:   theorems.
1101: \newblock {\em IEEE Trans. Inform. Theory}, IT-24:422--432, 1978.
1102: 
1103: \bibitem[VL00]{Vitanyi:00}
1104: P.~M. Vit{\'a}nyi and M.~Li.
1105: \newblock Minimum description length induction, {B}ayesianism, and {K}olmogorov
1106:   complexity.
1107: \newblock {\em IEEE Trans. on Information Theory}, 46(2):446--464, 2000.
1108: 
1109: \bibitem[Vov87]{Vovk:87}
1110: V.~G. Vovk.
1111: \newblock On a randomness criterion.
1112: \newblock {\em DOKLADY: Russian Academy of Sciences Doklady. Mathematics
1113:   (formerly Soviet Mathematics--Doklady)}, 35(3):656--660, 1987.
1114: 
1115: \bibitem[Wan96]{Wang:96}
1116: Y.~Wang.
1117: \newblock {\em Randomness and Complexity}.
1118: \newblock PhD thesis, 1996.
1119: 
1120: \bibitem[ZL70]{Zvonkin:70}
1121: A.~K. Zvonkin and L.~A. Levin.
1122: \newblock The complexity of finite objects and the development of the concepts
1123:   of information and randomness by means of the theory of algorithms.
1124: \newblock {\em Russian Mathematical Surveys}, 25(6):83--124, 1970.
1125: 
1126: \end{thebibliography}
1127: }
1128: 
1129: \end{document}
1130: %---------------------End-of-UniPriors.tex--------------------%
1131: