0305:cs0305052/cs0305052

1:

2: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

3: %% On the Existence and Convergence Computable Universal Priors %%

4: %%     Marcus Hutter: Start: 01.08.02  LastEdit: 29.05.03    %%

5: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

6:

7: %-------------------------------%

8: %   Document-Style              %

9: %-------------------------------%

10: \documentclass[12pt,twoside]{article} % 12pt article class, two-sided layout

11: \pagestyle{myheadings} % running heads are taken from \markboth below

12: \markboth{\sc Marcus Hutter, Technical Report IDSIA-05-03

13: }{\sc Computable Universal Priors} % \markboth{left head}{right head}: author/report id and short title

14: \setcounter{tocdepth}{4} \setcounter{secnumdepth}{2} % TOC lists down to \paragraph level; numbering stops at \subsection

15: \topmargin=0mm  \oddsidemargin=5mm \evensidemargin=5mm % page offsets, identical for odd and even pages

16: \textwidth=15cm \textheight=22cm % size of the text block

17: \sloppy % document-wide: tolerate loose lines rather than overfull boxes

18:

19: %-------------------------------%

20: %       My Math-Spacings        %

21: %-------------------------------%

22: \def\,{\mskip 3mu} \def\>{\mskip 4mu plus 2mu minus 4mu} \def\;{\mskip 5mu plus 5mu} \def\!{\mskip-3mu} % explicit math spaces get literal mu values, so they do not follow the \thin/\med/\thickmuskip changes below

23: \def\dispmuskip{\thinmuskip= 3mu plus 0mu minus 2mu \medmuskip=  4mu plus 2mu minus 2mu \thickmuskip=5mu plus 5mu minus 2mu} % wider automatic math spacing, switched on by the display wrappers below

24: \def\textmuskip{\thinmuskip= 0mu                    \medmuskip=  1mu plus 1mu minus 1mu \thickmuskip=2mu plus 3mu minus 1mu} % tighter automatic math spacing for in-text formulas

25: \textmuskip % make the tight spacing the default from here on

26: \def\beq{\dispmuskip\begin{equation}}    \def\eeq{\end{equation}\textmuskip} % numbered equation: display spacing inside, text spacing restored after

27: \def\beqn{\dispmuskip\begin{displaymath}}\def\eeqn{\end{displaymath}\textmuskip} % unnumbered display, same spacing switch

28: \def\bqa{\dispmuskip\begin{eqnarray}}    \def\eqa{\end{eqnarray}\textmuskip} % numbered eqnarray, same spacing switch

29: \def\bqan{\dispmuskip\begin{eqnarray*}}  \def\eqan{\end{eqnarray*}\textmuskip} % unnumbered eqnarray, same spacing switch

30:

31: %-------------------------------%

32: %   Macro-Definitions           %

33: %-------------------------------%

34: \newtheorem{theorem}{Theorem} % defines the counter shared by the environments below

35: \newtheorem{corollary}[theorem]{Corollary} % numbered with the theorem counter

36: \newtheorem{lemma}[theorem]{Lemma} % numbered with the theorem counter

37: \newtheorem{definition}[theorem]{Definition} % numbered with the theorem counter

38:

39: \newenvironment{keywords}{\centerline{\bf\small

40: Keywords}\vspace{-1ex}\begin{quote}\small}{\par\end{quote}\vskip

41: 1ex} % keywords environment: centred bold "Keywords" heading followed by a small-type quote block

42: \newtheorem{tablex}[theorem]{Table} % table captions are numbered with the theorem counter

43: \newtheorem{figurex}[equation]{Figure} % figure captions are numbered with the equation counter

44:

45: \def\ftheorem#1#2#3{\begin{theorem}[#2]\label{#1} #3 \end{theorem} } % #1 = label, #2 = title, #3 = body

46: \def\fcorollary#1#2#3{\begin{corollary}[#2]\label{#1} #3 \end{corollary} } % #1 = label, #2 = title, #3 = body

47: \def\flemma#1#2#3{\begin{lemma}[#2]\label{#1} #3 \end{lemma} } % #1 = label, #2 = title, #3 = body

48: \def\fdefinition#1#2#3{\begin{definition}[#2]\label{#1} #3 \end{definition} } % #1 = label, #2 = title, #3 = body

49: \def\ftablex#1#2#3{\begin{tablex}[#2]\label{#1} #3 \end{tablex} } % #1 = label, #2 = title, #3 = body

50: \def\ffigurex#1#2#3#4{{#4}\begin{figurex}[#2]\label{#1}#3\end{figurex}} % #4 is typeset first, then the caption: #1 = label, #2 = title, #3 = body

51:

52: \def\idx#1{\index{#1}#1} % \idx{name}: index entry for name, and name is also typeset in the text

53: \def\indxs#1#2{\index{#1!#2}\index{#2!#1}} % \indxs{a}{b}: index entries a!b and b!a; nothing is typeset

54: \def\paragraph#1{\vspace{1ex}\noindent{\bf{#1.}}} % overrides \paragraph: unnumbered bold run-in heading with a trailing dot

55: \def\paranodot#1{\vspace{1ex}\noindent{\bf{#1}}} % same as \paragraph above but without the trailing dot

56: \def\myparskip{\vspace{1.5ex plus 1ex minus 1ex}\noindent} % stretchable vertical gap, then an unindented paragraph start

57: \def\ff{\Longrightarrow} % implication arrow

58: \def\gdw{\Longleftrightarrow} % equivalence arrow

59: \def\toinfty#1{\stackrel{#1\to\infty}{\longrightarrow}} % long arrow with "#1 -> infinity" stacked above

60: \def\nq{\hspace{-1em}} % negative 1em skip

61: \def\qed{\hspace*{\fill}$\Box\quad$} % box pushed to the right margin

62: \def\odt{{\textstyle{1\over 2}}} % text-style fraction 1/2

63: \def\odf{{\textstyle{1\over 4}}} % text-style fraction 1/4

64: \def\eps{\varepsilon}                   % for small positive number

65: \def\epstr{\epsilon}                    % for empty string

66: \def\blank{{\,_\sqcup\,}}                 % blank position

67: \def\pfx{`}                              % prefix code

68: \def\qmbox#1{{\quad\mbox{#1}\quad}} % text inside math with a \quad on either side

69: \def\argmax{\mathop{\rm arg\,max}}          % arg max operator

70: \def\argmin{\mathop{\rm arg\,min}}          % arg min operator

71:

72: \def\eqm{\stackrel\times=}             % '=' with \times stacked above (presumably equality up to a multiplicative constant)

73: \def\leqm{\stackrel\times\leq} % '\leq' with \times stacked above

74: \def\geqm{\stackrel\times\geq} % '\geq' with \times stacked above; the Notation paragraph defines f \geqm g as g=O(f)

75:

76: \def\odn{{\textstyle{1\over n}}} % text-style fraction 1/n

77: \def\v#1{{\bf #1}} % bold argument; overrides the standard \v accent command

78: \def\l{{l}}                             % length of string or program (overrides the standard \l letter command)

79: \def\M{{\cal M}}                        % Set of prob. distributions

80: \def\X{{\cal X}}                        % input/perception set/alphabet

81: \def\Y{{\cal Y}}                        % output/action set/alphabet

82: \def\R{{\cal R}}                        % reward set subset of reals

83: \def\F{{\cal F}}                        % Generic performance measure

84: \def\I{{\cal I}}                        % some set

85: \def\S{{\cal S}}                        % some set (overrides the standard \S section-sign command)

86: \def\Q{{\cal Q}} % calligraphic Q

87: \def\E{{\bf E}}                         % Expectation value

88: \def\P{{\bf P}}                         % bold P, presumably probability (overrides the standard \P paragraph-sign command)

89: \def\B{\{0,1\}}                        % Binary set (or \set{B})

90: \def\MM{M}                              % Solomonoff's prior

91: \def\th{\theta} % shorthand for \theta (overrides the standard \th letter command)

92:

93: \def\Set#1{{\if#1Q{I\!\!\!#1}\else\if#1Z{Z\!\!\!Z}\else{I\!\!#1}\fi\fi}} % imitation blackboard bold: overlaps an I with the letter (Z is overlapped with a second Z, Q gets extra kerning)

94: \def\lb{\log} % alias for \log

95: \def\sumprime{\mathop{{\sum\nolimits'}}} % primed sum sign used as an operator

96:

97: \begin{document}

98: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

99: %                      T i t l e - P a g e                      %

100: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

101:

102: \title{\vskip -15mm\normalsize\sc Technical Report \hfill IDSIA-05-03

103: \vskip 2mm\bf\LARGE\hrule height5pt \vskip 3mm

104: \sc On the Existence and Convergence \\ of Computable Universal Priors\thanks{%

105: This work was supported by SNF grant 2000-61847.00 to J\"{u}rgen

106: Schmidhuber.}

107: \vskip 6mm \hrule height2pt \vskip 5mm}

108: \author{{\bf Marcus Hutter}\\[3mm]

109: \normalsize IDSIA, Galleria 2, CH-6928\ Manno-Lugano, Switzerland\\

110: \normalsize marcus@idsia.ch \hspace{8.5ex} http://www.idsia.ch/$^{_{_\sim}}\!$marcus}

111: \date{29 May 2003}

112: \maketitle

113:

114: \begin{abstract}

115: \noindent Solomonoff unified Occam's razor and Epicurus' principle

116: of multiple explanations to one elegant, formal, universal theory

117: of inductive inference, which initiated the field of algorithmic

118: information theory. His central result is that the posterior of

119: his universal semimeasure $\MM$ converges rapidly to the true

120: sequence generating posterior $\mu$, if the latter is computable.

121: Hence, $\MM$ is eligible as a universal predictor in case of unknown

122: $\mu$.

123: %

124: We investigate the existence and convergence of computable

125: universal (semi)measures for a hierarchy of computability classes:

126: finitely computable, estimable, enumerable, and approximable.

127: For instance, $\MM$ is known to be enumerable, but not finitely

128: computable, and to dominate all enumerable semimeasures.

129: %

130: We define seven classes of (semi)measures based on these four

131: computability concepts. Each class may or may not contain a

132: (semi)measure which dominates all elements of another class. The

133: analysis of these 49 cases can be reduced to four basic cases, two

134: of them being new. The results hold for discrete and continuous

135: semimeasures.

136: %

137: We also investigate more closely the types of convergence, possibly

138: implied by universality: in difference and in ratio, with probability

139: 1, in mean sum, and for Martin-L{\"o}f random sequences.

140: %

141: We introduce a generalized concept of randomness for individual

142: sequences and use it to exhibit difficulties regarding these

143: issues.

144: \end{abstract}

145:

146: \begin{keywords}

147: Sequence prediction;

148: Algorithmic Information Theory;

149: Solomonoff's prior;

150: universal probability;

151: mixture distributions;

152: posterior convergence;

153: computability concepts;

154: Martin-L{\"o}f randomness.

155: \end{keywords}

156:

157: \pagebreak

158: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

159: \section{Introduction}\label{secIntro}

160: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

161:

162: All induction problems can be phrased as sequence prediction

163: tasks. This is, for instance, obvious for time series prediction,

164: but also includes classification tasks. Having observed data $x_{t'}$

165: at times $t'<t$, the task is to predict the $t$-th symbol $x_t$

166: from sequence $x=x_1...x_{t-1}$.

167: %

168: The key concept to attack general induction problems is

169: Occam's razor and to a lesser extent Epicurus' principle of

170: multiple explanations. The former/latter may be interpreted as to

171: keep the simplest/all theories consistent with the observations

172: $x_1...x_{t-1}$ and to use these theories to predict $x_t$.

173: %

174: Solomonoff \cite{Solomonoff:64,Solomonoff:78} formalized and

175: combined both principles in his universal prior $\MM(x)$ which

176: assigns high/low probability to simple/complex environments, hence

177: implementing Occam and Epicurus.

178: %

179: Solomonoff's \cite{Solomonoff:78} central result is that if

180: the probability $\mu(x_t|x_1...x_{t-1})$ of observing $x_t$ at

181: time $t$, given past observations $x_1...x_{t-1}$ is

182: a computable function, then the

183: universal posterior

184: $\MM(x_t|x_1...x_{t-1})$

185: converges rapidly for $t\to\infty$ to the true posterior

186: $\mu(x_t|x_1...x_{t-1})$, hence

187: $\MM$ represents a universal predictor in case of unknown $\mu$.

188:

189: One representation of $\MM$ is as a weighted sum of

190: {\em all} enumerable ``defective'' probability measures, called

191: semimeasures (see Definition \ref{defSemi}).

192: %

193: The (from this representation obvious) dominance $\MM(x)\geq

194: const.\times\mu(x)$ for all computable $\mu$ is the central

195: ingredient in the convergence proof.

196: %

197: %General mixture distributions

198: What is so special about the class of all enumerable

199: semimeasures $\M_{enum}^{semi}$? The larger we choose $\M$ the

200: less restrictive is the essential assumption that $\M$ should

201: contain the true distribution $\mu$.

202: %

203: Why not restrict to the still rather general class of estimable or

204: finitely computable (semi)measures? For {\em every} countable

205: class $\M$ and $\xi_\M(x):=\sum_{\nu\in\M} w_\nu \nu(x)$ with

206: $w_\nu>0$, the important dominance $\xi_\M(x)\geq w_\nu

207: \nu(x)\,\forall\nu\in\M$ is satisfied. The question is what

208: properties does $\xi_\M$ possess. The distinguishing property of

209: $\MM=\xi_{\M_{enum}^{semi}}$ is that it is itself

210: an element of $\M_{enum}^{semi}$.

211: %

212: On the other hand, for prediction $\xi_\M\in\M$ is not by itself

213: an important property. What matters is  whether $\xi_\M$ is

214: computable (in one of the senses defined) to avoid

215: getting into the (un)realm of non-constructive math.

216:

217: %Goal of this work

218: The intention of this work is to investigate the existence,

219: computability and convergence of universal (semi)measures for

220: various computability classes: finitely computable $\subset$

221: estimable $\subset$ enumerable $\subset$ approximable (see

222: Definition \ref{defCompFunc}). For instance, $\MM(x)$ is

223: enumerable, but not finitely computable. The research in this work

224: was motivated by recent generalizations of Kolmogorov complexity

225: and Solomonoff's prior by Schmidhuber \cite{Schmidhuber:02gtm} to

226: approximable (and others not here discussed) cases.

227:

228: %------------------------------%

229: \paragraph{Contents}

230: %------------------------------%

231: In Section \ref{secCC} we review various computability concepts

232: and discuss their relation.

233: %

234: In Section \ref{secUniM} we define the prefix Kolmogorov

235: complexity $K$, the concept of (semi)measures, Solomonoff's

236: universal prior $\MM$, and explain its universality.

237: %

238: Section \ref{secUSP} summarizes Solomonoff's major convergence

239: result, discusses general mixture distributions and the important

240: universality property -- multiplicative dominance.

241: %

242: In Section \ref{secUSM} we define seven classes of (semi)measures

243: based on four computability concepts. Each class may or may not

244: contain a (semi)measure which dominates all elements of another

245: class. We reduce the analysis of these 49 cases to four basic

246: cases. Domination (essentially by $\MM$) is known to be true for

247: two cases. The two new cases do not allow for domination.

248: %

249: In Section \ref{secConv} we investigate more closely the type of

250: convergence implied by universality. We summarize the result on

251: posterior convergence in difference $(\xi-\mu\to 0)$ and improve

252: the previous result \cite{Li:97} on the convergence in ratio

253: $\xi/\mu\to 1$ by showing rapid convergence without use

254: of Martingales.

255: %

256: In Section \ref{secMLconv} we investigate whether convergence for

257: all Martin-L{\"o}f random sequences could hold. We define a

258: generalized concept of randomness for individual sequences and use

259: it to show that proofs based on universality cannot decide this

260: question.

261: %

262: Section \ref{secConc} concludes the paper. Proofs will be

263: presented elsewhere.

264:

265: %------------------------------%

266: \paragraph{Notation}

267: %------------------------------%

268: %Strings

269: We denote strings of length $n$ over finite alphabet $\X$ by

270: $x=x_1x_2...x_n$ with $x_t\in\X$ and further abbreviate

271: $x_{1:n}:=x_1x_2...x_{n-1}x_n$ and $x_{<n}:=x_1... x_{n-1}$,

272: $\epstr$ for the empty string, $\l(x)$ for the length of string $x$,

273: and $\omega=x_{1:\infty}$ for infinite sequences.

274: %

275: % Asymptotic notation

276: We abbreviate $\lim_{n\to\infty}[f(n)-g(n)]=0$ by

277: $f(n)\toinfty{n}g(n)$ and say $f$ converges to $g$, without

278: implying that $\lim_{n\to\infty}g(n)$ itself exists. We write

279: $f(x)\geqm  g(x)$ for $g(x)=O(f(x))$.

280:

281: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

282: \section{Computability Concepts}\label{secCC}

283: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

284: We define several computability concepts weaker than those captured

285: by halting Turing machines.

286:

287: %------------------------------%

288: \fdefinition{defCompFunc}{Computable functions}{

289: %------------------------------%

290: We consider functions $f:\Set{N}\to\Set{R}$:

291: \begin{itemize}

292: \item[]

293: $\nq f$ is {\em finitely computable} or {\em recursive} {\it iff}

294: there are Turing machines $T_{1/2}$ with output interpreted as natural

295: numbers and $f(x)={T_1(x)\over T_2(x)}$,

296: \item[]

297: $\nq f$ is {\em approximable} {\it iff}

298: there is a finitely computable function $\phi(\cdot,\cdot)$ with

299: $\lim_{t\to\infty}\phi(x,t)=f(x)$.

300: \item[]

301: $\nq f$ is {\em lower semi-computable} or {\em enumerable} {\it

302: iff} additionally $\phi(x,t)\leq\phi(x,t+1)$.

303: \item[]

304: $\nq f$ is {\em upper semi-computable} or {\em co-enumerable} {\it

305: iff} $[-f]$ is lower semi-computable.

306: %additionally $\phi(x,t)\geq\phi(x,t+1)$.

307: \item[]

308: $\nq f$ is {\em semi-computable} {\it iff} $f$ is lower- {\it or}

309: upper semi-computable.

310: \item[]

311: $\nq f$ is {\em estimable} {\it iff} $f$ is lower- {\it and} upper

312: semi-computable.

313: \end{itemize}

314: }%------------------------------%

315:

316: \noindent If $f$ is estimable we can finitely compute an

317: $\eps$-approximation of $f$ by upper and lower semi-computing $f$

318: and terminating when differing by less than $\eps$. This means

319: that there is a Turing machine which, given $x$ and $\eps$,

320: finitely computes $\hat y$ such that $|\hat y-f(x)|<\eps$.

321: Moreover it gives an interval estimate $f(x)\in[\hat y-\eps,\hat

322: y+\eps]$. An estimable integer-valued function is finitely

323: computable (take any $\eps<1$).

324: %

325: Note that if $f$ is only approximable or semi-computable we can

326: still come arbitrarily close to $f(x)$ but we cannot devise a

327: terminating algorithm which produces an $\eps$-approximation. In

328: the case of lower/upper semi-computability we can at least

329: finitely compute lower/upper bounds to $f(x)$. In case of

330: approximability, the weakest computability form, even this

331: capability is lost.

332: %

333: In analogy to lower/upper semi-computability one may think of

334: notions like lower/upper estimability but they are easily shown to

335: coincide with estimability. The following implications are valid:

336:

337: \begin{center}\small

338: \fbox{\parbox{11ex}{recursive=\\ finitely\\ computable}}

339: $\Rightarrow$

340: \fbox{\parbox{9ex}{estimable}}

341: %

342: \parbox{26ex}{\raisebox{-3ex}{$\Rightarrow$} \fbox{

343: \parbox{17ex}{enumerable=\\lower semi-\\ computable}}

344: \raisebox{-3ex}{$\Rightarrow$} \\[2ex]

345: \raisebox{3ex}{$\Rightarrow$} \fbox{

346: \parbox{17ex}{co-enumerable=\\ upper semi-\\

347: computable}} \raisebox{3ex}{$\Rightarrow$}}

348: \fbox{\parbox{11ex}{semi-\\ computable}}

349: $\Rightarrow$

350: \fbox{approximable}

351: \end{center}

352:

353: \noindent In the following we use the term computable synonymously

354: with finitely computable, but sometimes also generically for some of

355: the computability forms of Definition \ref{defCompFunc}.

356: %

357: What we call {\em estimable} is often just called {\em

358: computable}, but it makes sense to separate the concepts of

359: finite computability and estimability in this work, since the

360: former is conceptually easier and some previous results have only

361: been proved for this case.

362:

363: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

364: \section{The Universal Prior $\MM$}\label{secUniM}

365: %\subsection{Solomonoff's Universal Prior}

366: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

367: \index{Turing machine!universal}

368: \index{Turing machine!prefix}

369: \index{tape!unidirectional}

370: \index{tape!bidirectional}%

371: \index{semimeasure!universal}

372: %

373: % Universal prior

374: The prefix Kolmogorov complexity $K(x)$ is defined as the length

375: of the shortest binary program $p\in\B^*$ for which a universal prefix

376: Turing machine $U$ (with binary program tape and $\X$ary output

377: tape) outputs string $x\in\X^*$, and similarly $K(x|y)$ in case of side

378: information $y$ \cite{Li:97}:

379: \beqn

380:   K(x)=\min\{\l(p):U(p)=x\},\qquad

381:   K(x|y)=\min\{\l(p):U(p,y)=x\}

382: \eeqn

383: Solomonoff

384: \cite{Solomonoff:64,Solomonoff:78}

385: (with a flaw fixed by Levin \cite{Zvonkin:70})

386: defined (earlier) the closely related

387: quantity, the universal prior $\MM(x)$.

388: %

389: It is defined as the

390: probability that the output of a universal Turing machine starts

391: with $x$ when provided with \idx{fair coin flips} on the input

392: tape. Formally, $\MM$ can be defined as

393: \beq\label{Mdef}

394:   \MM(x)\;:=\;\sum_{p\;:\;U(p)=x*}\nq 2^{-\l(p)}

395: \eeq

396: where the sum is over all so-called minimal programs $p$ for which

397: $U$ outputs a string starting with $x$ (indicated by the $*$).

398: %

399: Before we can discuss the stochastic properties of $\MM$ we

400: need the concept of (semi)measures for strings.

401:

402: \index{semimeasure!enumerable}

403: %------------------------------%

404: \fdefinition{defSemi}{Continuous (Semi)measures}{

405: %------------------------------%

406: $\mu(x)$ denotes the probability that a sequence starts

407: with string $x$. We call $\mu\geq 0$ a (continuous) semimeasure if

408: $\mu(\epstr)\leq 1$ and $\mu(x)\geq\mu(x0)+\mu(x1)$, and a

409: (probability) measure if equality holds.

410: }%------------------------------%

411:

412: \noindent We have $\MM(x0)+\MM(x1)<\MM(x)$ because there are

413: programs $p$, which output $x$, neither followed by $0$ nor $1$.

414: They just stop after printing $x$ or continue forever without any

415: further output. Together with $\MM(\epstr)=1$ this shows that $\MM$

416: is a semimeasure, but {\it not} a probability measure. We can now

417: state the fundamental property of $\MM$ \cite{Solomonoff:78}:

418:

419: %------------------------------%

420: \ftheorem{thUniM}{Universality of $\MM$}{

421: %------------------------------%

422: The universal prior $\MM$ is an enumerable semimeasure which

423: multiplicatively dominates all enumerable semimeasures in the

424: sense that $\MM(x) \;\geqm\; 2^{-K(\rho)}\cdot \rho(x)$

425: for all enumerable semimeasures $\rho$. $\MM$ is enumerable, but not

426: estimable or finitely computable.

427: }%------------------------------%

428: \indxs{multiplicative}{majorization}

429: \indxs{probability distribution}{computable}

430:

431: % Explanation

432: \noindent The Kolmogorov complexity of a function like $\rho$ is

433: defined as the length of the shortest self-delimiting code of a

434: Turing machine computing this function in the sense of Definition

435: \ref{defCompFunc}. Up to a multiplicative constant, $\MM$ assigns higher

436: probability to all $x$ than any other computable probability

437: distribution.

438:

439: % Normalization of $\MM$

440: It is possible to normalize $\MM$ to a true probability measure

441: $\MM_{norm}$ \cite{Solomonoff:78,Li:97} with dominance still being

442: true, but at the expense of giving up enumerability ($\MM_{norm}$

443: is still approximable). $\MM$ is more convenient when studying

444: algorithmic questions, but a true probability measure like

445: $\MM_{norm}$ is more convenient when studying stochastic questions.

446:

447: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

448: \section{Universal Sequence Prediction}\label{secUSP}

449: %\subsection{Solomonoff's Universal Sequence Prediction Scheme}

450: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

451:

452: % Occam & Epicurus in $\MM =2^-K$

453: In which sense does $\MM$ incorporate Occam's razor and Epicurus'

454: principle of multiple explanations? Since the shortest programs

455: $p$ dominate the sum in $\MM$, $\MM(x)$ is roughly equal to

456: $2^{-K(x)}$ ($\MM(x)=2^{-K(x)+O(K(\l(x)))}$), i.e.\

457: $\MM$ assigns high probability to simple

458: strings. More useful is to think of $x$ as being the observed

459: history. We see from (\ref{Mdef}) that every program $p$

460: consistent with history $x$ is allowed to contribute to $\MM$

461: (Epicurus). On the other hand shorter programs give significantly

462: larger contribution (Occam). How does all this affect prediction?

463: If $\MM(x)$ describes our (subjective) prior belief in $x$, then

464: $\MM(y|x):=\MM(xy)/\MM(x)$ must be our posterior belief in $y$.

465: %

466: From the symmetry of algorithmic information $K(xy)\approx

467: K(y|x)+K(x)$, and $\MM(x)\approx 2^{-K(x)}$ and $\MM(xy)\approx

468: 2^{-K(xy)}$ we get $\MM(y|x)\approx 2^{-K(y|x)}$. This tells us

469: that $\MM$ predicts $y$ with high probability iff $y$ has an easy

470: explanation, given $x$ (Occam \& Epicurus).

471:

472: % Caution

473: The above qualitative discussion should not create the impression

474: that $\MM(x)$ and $2^{-K(x)}$ always lead to predictors of

475: comparable quality. Indeed in the online/incremental setting,

476: $K(y)=O(1)$ invalidates the consideration above. The proof of

477: (\ref{eukdist}) below, for instance, depends on $\MM$ being a

478: semimeasure and the chain rule being exactly true, neither of them is

479: satisfied by $2^{-K(x)}$. See \cite{Hutter:03unimdl} for a more

480: detailed analysis.

481:

482: % Solomonoff's universal sequence prediction

483: \index{sequence prediction!Solomonoff} Sequence

484: prediction algorithms try to predict the continuation $x_t\in\X$

485: of a given sequence $x_1...x_{t-1}$.

486: %

487: We assume that the true sequence is

488: drawn from a computable

489: probability distribution $\mu$, i.e.\ the true (objective)

490: probability of $x_{1:t}$ is $\mu(x_{1:t})$. The probability of

491: $x_t$ given $x_{<t}$ hence is

492: $\mu(x_t|x_{<t})=\mu(x_{1:t})/\mu(x_{<t})$.

493: %

494: Solomonoff's \cite{Solomonoff:78} central result is that $\MM$

495: converges to $\mu$. More precisely, for binary alphabet, he showed that

496: \beq\label{eukdist}

497:   \sum_{t=1}^\infty

498:   \nq\nq\;\sum_{\qquad x_{<t}\in\B^{t-1}}\nq\nq\;

499:   \mu(x_{<t}) \Big(\MM(0|x_{<t})-\mu(0|x_{<t})\Big)^2

500:   \;\leq\;

501:   {\odt}\ln 2\!\cdot\!K(\mu)+O(1) \;<\; \infty.

502: \eeq

503: The infinite sum can only be finite if the difference

504: $\MM(0|x_{<t})-\mu(0|x_{<t})$ tends to zero for $t\to\infty$ with

505: $\mu$ probability $1$ (see Definition \ref{defConv}$(i)$ and

506: \cite{Hutter:01alpha} or Section \ref{secConv} for general

507: alphabet). This holds for {\it any} computable probability

508: distribution $\mu$. The reason for the astonishing property of a

509: single (universal) function to converge to {\it any} computable

510: probability distribution lies in the fact that the sets of

511: $\mu$-random sequences differ for different $\mu$. Past data

512: $x_{<t}$ are exploited to get a (with $t\to\infty$) improving

513: estimate $\MM(x_t|x_{<t})$ of $\mu(x_t|x_{<t})$.

514:

515:

516: % Bayes-mixtures

517: The universality property (Theorem \ref{thUniM}) is the central

518: ingredient in the proof of (\ref{eukdist}). The proof

519: involves the construction of a semimeasure $\xi$

520: whose dominance is obvious. The hard part is to show its

521: enumerability and equivalence to $\MM$.

522: Let $\M$ be the (countable) set of all enumerable semimeasures

523: and define

524: \beq\label{xidef}

525:   \xi(x):=\sum_{\nu\in\M}2^{-K(\nu)}\nu(x).

526: \eeq

527: Then dominance

528: \beq\label{xidom}

529:  \xi(x)\geq 2^{-K(\nu)}\nu(x)\quad\forall\,\nu\in\M

530: \eeq

531: is obvious. Is $\xi$ lower semi-computable? To answer this

532: question one has to be more precise. Levin \cite{Zvonkin:70} has

533: shown that the set of {\em all} lower semi-computable semimeasures

534: is enumerable (with repetitions). For this (ordered multi) set

535: $\M=\M_{enum}^{semi}:=\{\nu_1,\nu_2,\nu_3,...\}$ and

536: $K(\nu_i):=K(i)$ one can easily see that $\xi$ is lower

537: semi-computable. Finally proving $\MM(x)\eqm\xi(x)$ also

538: establishes universality of $\MM$ (see \cite{Solomonoff:78,Li:97}

539: for details).

540:

541: The advantage of $\xi$ over $\MM$ is that it immediately

542: generalizes to arbitrary weighted sums of (semi)measures

543: for arbitrary countable $\M$.

544:

545: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

546: \section{Universal (Semi)Measures}\label{secUSM}

547: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

548:

549: What is so special about the set of all enumerable

550: semimeasures $\M_{enum}^{semi}$? The larger we choose $\M$ the less restrictive

551: is the assumption that $\M$ should contain the true distribution

552: $\mu$, which will be essential throughout the paper.

553: %

554: Why not restrict to the still rather general class of estimable

555: or finitely computable (semi)measures? It is clear that for every

556: countable set $\M$,

557: \beq\label{defxi}

558:   \xi(x):=\xi_\M(x):=\sum_{\nu\in\M} w_\nu \nu(x)

559:   \qmbox{with} \sum_{\nu\in\M}w_\nu\leq 1 \qmbox{and} w_\nu>0

560: \eeq

561: dominates all $\nu\in\M$. This dominance is

562: necessary for the desired convergence $\xi\to\mu$ similarly to

563: (\ref{eukdist}). The question is what properties $\xi$ possesses.

564: The distinguishing property of $\M_{enum}^{semi}$ is that $\xi$ is

565: itself an element of $\M_{enum}^{semi}$. When concerned with

566: predictions, $\xi_\M\in\M$ is not by itself an important property;

567: what matters is whether $\xi$ is computable in one of the senses of Definition

568: \ref{defCompFunc}. We define

569: \bqan

570:  \M_1\geqm\M_2 & :\Leftrightarrow &

571:  \mbox{there is an element of $\M_1$ which dominates all elements of

572:  $\M_2$} \\

573:  & :\Leftrightarrow &

574: \exists\rho\!\in\!\M_1\;\forall\nu\!\in\!\M_2\;\exists w_\nu\!>\!0

575: \;\forall x:\rho(x)\!\geq\!w_\nu\nu(x).

576: \eqan

577: $\geqm $ is transitive (but not necessarily reflexive) in the

578: sense that $\M_1 \geqm \M_2 \geqm \M_3$ implies $\M_1 \geqm \M_3$

579: and $\M_0 \supseteq \M_1 \geqm \M_2 \supseteq \M_3$ implies $\M_0

580: \geqm \M_3$.

581: %

582: For the computability concepts introduced in Section \ref{secCC}

583: we have the following proper set inclusions

584: \beqn

585: \begin{array}{ccccccc}

586:   \M_{comp}^{msr}  & \subset & \M_{est}^{msr}  & \equiv  & \M_{enum}^{msr}  & \subset & \M_{appr}^{msr} \\

587:         \cap       &         &      \cap       &         &       \cap       &         &     \cap        \\

588:   \M_{comp}^{semi} & \subset & \M_{est}^{semi} & \subset & \M_{enum}^{semi} & \subset & \M_{appr}^{semi}

589: \end{array}

590: \eeqn

591: %

592: where $\M^{msr}_c$ stands for the set of all probability measures

593: of appropriate computability type $c\in\{$comp=finitely

594: computable, est=estimable, enum=enumerable,

595: appr=approximable$\}$, and similarly for semimeasures

596: $\M^{semi}_c$. From an enumeration of a measure $\rho$ one can

597: construct a co-enumeration by exploiting

598: $\rho(x_{1:n})=1-\sum_{y_{1:n}\neq x_{1:n}}\rho(y_{1:n})$. This

599: shows that every enumerable measure is also co-enumerable, hence

600: estimable, which proves the identity $\equiv$ above.

601:

602: With this notation, Theorem \ref{thUniM} implies

603: $\M_{enum}^{semi}\geqm\M_{enum}^{semi}$. Transitivity allows us to

604: conclude, for instance, that

605: $\M_{appr}^{semi}\geqm\M_{comp}^{msr}$, i.e.\ that there is an

606: approximable semimeasure which dominates all computable measures.

607:

608: The standard ``diagonalization'' way of proving

609: $\M_1\stackrel\times{\not\geq}\M_2$ is to take an arbitrary

610: $\mu\in\M_1$ and ``increase'' it to $\rho$ such that

611: $\mu\stackrel\times{\not\geq}\rho$ and show that $\rho\in\M_2$.

612: There are $7\times 7$ combinations of (semi)measures $\M_1$ with

613: $\M_2$ for which $\M_1\geqm\M_2$ could be true or false. There are

614: four basic cases, explicated in the following theorem, from which

615: all 49 combinations displayed in Table \ref{tabUniSMsr}

616: follow by transitivity.

617:

618: %------------------------------%

619: \ftheorem{thNoUniApp}{Universal (semi)measures}{

620: %------------------------------%

621: A semimeasure $\rho$ is said to be universal for $\M$ if it

622: multiplicatively dominates all elements of $\M$ in the sense

623: $\forall\nu\exists w_\nu>0:\rho(x)\geq w_\nu\nu(x)\forall x$. The

624: following holds true:

625: \begin{itemize}

626: \item[$o)$]

627: $\exists\rho:\{\rho\}\geqm\M$: For every countable set

628: of (semi)measures $\M$, there is a (semi)measure which dominates

629: all elements of $\M$.

630: \item[$i)$]

631: $\M_{enum}^{semi}\geqm\M_{enum}^{semi}$:

632: The class of enumerable semimeasures {\em contains}

633: a universal element.

634: \item[$ii)$]

635: $\M_{appr}^{msr}\geqm\M_{enum}^{semi}$:

636: There {\em is} an approximable measure which dominates all enumerable

637: semimeasures.

638: \item[$iii)$]

639: $\M_{est}^{semi}\stackrel\times{\not\geq}\M_{comp}^{msr}$: There is

640: {\em no} estimable semimeasure which dominates all computable

641: measures.

642: \item[$iv)$]

643: $\M_{appr}^{semi}\stackrel\times{\not\geq}\M_{appr}^{msr}$: There is

644: {\em no} approximable semimeasure which dominates all approximable

645: measures.

646: \end{itemize}

647: }%------------------------------%

648:

649: \begin{table}[thb]

650: \ftablex{tabUniSMsr}{Existence of universal (semi)measures}{%

651: The entry in row $r$ and column $c$ indicates whether there is an

652: $r$-able (semi)measure $\rho$ for the set $\M$ which contains all

653: $c$-able (semi)measures, where $r,c\in\{$comput, estimat, enumer,

654: approxim$\}$. Enumerable measures are estimable. This is the

655: reason why the enum. row and column in case of measures is

656: missing. The superscript indicates from which part of Theorem

657: \ref{thNoUniApp} the answer follows. For the bold face entries

658: directly, for the others using transitivity of $\geqm $.

659: \begin{center}

660: \begin{tabular}{|c|c||c|c|c|c||c|c|c|}\hline

661:       $\nwarrow$ &  $\M$ & \multicolumn{4}{c||}{semimeasure} & \multicolumn{3}{c|}{measure}\\ \hline

662: $\rho$&$\searrow$& comp.      & est.       & enum.         & appr.     & comp.         & est.       & appr.        \\ \hline\hline

663:       s  & comp. & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & no$^{iii}$    & no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}

664:       e  & est.  & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & {\bf no}$^{\bf iii}$& no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}

665:       m  & enum. & yes$^{i}$  & yes$^{i}$  & {\bf yes}$^{\bf i}$ & no$^{iv}$ & yes$^{i}$     & yes$^{i}$  & no$^{iv}$    \\ \cline{2-9}

666:       i  &appr.  & yes$^{i}$  & yes$^{i}$  & yes$^{i}$     & no$^{iv}$ & yes$^{i}$     & yes$^{i}$  & {\bf no}$^{\bf iv}$\\ \hline\hline

667:       m  & comp. & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & no$^{iii}$    & no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}

668:       s  & est.  & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & no$^{iii}$    & no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}

669:       r  &appr.  & yes$^{ii}$ & yes$^{ii}$ & {\bf yes}$^{\bf ii}$& no$^{iv}$ & yes$^{ii}$    & yes$^{ii}$ & no$^{iv}$    \\ \hline

670: \end{tabular}

671: \end{center}}

672: \end{table}

673:

674: \noindent If we ask for a universal (semi)measure which at least satisfies

675: the weakest form of computability, namely being approximable, we

676: see that the largest dominated set among the 7 sets defined above

677: is the set of enumerable semimeasures. This is the reason why

678: $\M_{enum}^{semi}$ plays a special role.

679: On the other hand, $\M_{enum}^{semi}$ is not the largest set

680: dominated by an approximable semimeasure, and indeed no such

681: largest set exists. One may, hence, ask for ``natural'' larger

682: sets $\M$. One such set, namely the set of cumulatively enumerable

683: semimeasures $\M_{CEM}$, has recently been discovered by

684: Schmidhuber \cite{Schmidhuber:02gtm}, for which even

685: $\xi_{CEM}\in\M_{CEM}$ holds.

686:

687: \noindent Theorem \ref{thNoUniApp} also holds for {\em discrete

688: (semi)measures} $P$ defined as follows:

689:

690: \index{semimeasure!enumerable}

691: %------------------------------%

692: \fdefinition{defDSemi}{Discrete (Semi)measures}{

693: %------------------------------%

694: $P(x)$ denotes the probability of $x\in\Set N$. We call

695: $P:\Set{N}\to[0,1]$ a discrete (semi)measure if $\sum_{x\in\Set{N}}

696: P(x)\stackrel{(<)}=1$.

697: }%------------------------------%

698:

699: \noindent Theorem \ref{thNoUniApp}

700: %$(o)$ is elementary,

701: $(i)$ is Levin's major result \cite[Th.4.3.1 \& Th.4.5.1]{Li:97}, %

702: $(ii)$ is due to Solomonoff \cite{Solomonoff:78}. %

703: The proof of

704: $\M_{comp}^{semi}\stackrel\times{\not\geq}\M_{comp}^{semi}$ in

705: \cite[p249]{Li:97} contains minor errors and is not extensible to

706: $(iii)$, and the proof in \cite[p276]{Li:97} only applies to

707: infinite alphabet and not to the binary/finite case considered

708: here. A complete proof of $(o)$--$(iv)$ for discrete and continuous

709: (semi)measures is given elsewhere.

710:

711: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

712: \section{Posterior Convergence}\label{secConv}

713: %\subsection{Definition of Random Sequences}

714: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

715:

716: We have investigated in detail the computational properties of

717: various mixture distributions $\xi$. A mixture $\xi_\M$

718: multiplicatively dominates all distributions in $\M$. We have

719: mentioned that dominance implies posterior convergence. In this

720: section we present in more detail what dominance implies and what it does

721: not.

722:

723: Convergence of $\xi(x_t|x_{<t})$ to $\mu(x_t|x_{<t})$ with

724: $\mu$-probability 1 tells us that $\xi(x_t|x_{<t})$ is close to

725: $\mu(x_t|x_{<t})$ for sufficiently large $t$ and ``most''

726: sequences $x_{1:\infty}$. It says nothing about the speed of

727: convergence, nor whether convergence is true for any {\em particular}

728: sequence (of measure 0). Convergence {\em in mean sum} defined

729: below is intended to capture the rate of convergence, while

730: Martin-L\"{o}f randomness is used to capture convergence

731: properties for individual sequences.

732:

733: Martin-L\"{o}f randomness is a very important concept of

734: randomness of individual sequences, which is closely related to

735: Kolmogorov complexity and Solomonoff's universal prior. Levin gave

736: a characterization equivalent to Martin-L\"{o}f's original

737: definition \cite{Levin:73random}:

738:

739: %------------------------------%

740: \ftheorem{defML}{Martin-L\"{o}f random sequences}{

741: %------------------------------%

742: A sequence $x_{1:\infty}$ is $\mu$-Martin-L\"{o}f random

743: ($\mu$.M.L.) iff there is a constant $c$ such that

744: $\MM(x_{1:n})\leq c\cdot \mu(x_{1:n})$ for all $n$.

745: }%------------------------------%

746:

747: \noindent

748: One can show that a $\mu$.M.L.\ random sequence $x_{1:\infty}$

749: passes {\em all} thinkable effective randomness tests, e.g.\ the

750: law of large numbers, the law of the iterated logarithm, etc.

751: In particular, the set of all $\mu$.M.L. random sequences has

752: $\mu$-measure 1.

753: %

754: The following generalization is natural when considering general

755: Bayes-mixtures $\xi$ as in this work:

756:

757: %------------------------------%

758: \fdefinition{defmuMr}{$\mu/\xi$-random sequences}{

759: %------------------------------%

760: A sequence $x_{1:\infty}$ is called $\mu/\xi$-random

761: ($\mu.\xi$.r.) iff there is a constant $c$ such that

762: $\xi(x_{1:n})\leq c\cdot \mu(x_{1:n})$ for all $n$.

763: }%------------------------------%

764:

765: Typically, $\xi$ is a mixture over some $\M$ as defined in

766: (\ref{xidef}), in which case the reverse inequality

767: $\xi(x)\geqm\mu(x)$ is also true (for all $x$). For finite $\M$ or

768: if $\xi\in\M$, the definition of $\mu/\xi$-randomness depends only

769: on $\M$, and not on the specific weights used in $\xi$. For

770: $\M=\M_{enum}^{semi}$, $\mu/\xi$-randomness is just $\mu$.M.L.\

771: randomness. The larger $\M$, the more patterns are recognized as

772: non-random.

773: %($\M_{enum}^{semi}\supset\M_\Theta$).

774: Roughly speaking, those regularities characterized by some

775: $\nu\in\M$ are recognized by $\mu/\xi$-randomness, i.e.\ for

776: $\M\subset\M_{enum}^{semi}$ some $\mu/\xi$-random strings may not

777: be M.L.\ random.

778: %

779: Other randomness concepts, e.g.\ those by Schnorr, Ko, van

780: Lambalgen, Lutz, Kurtz, von Mises, Wald, and Church (see

781: \cite{Wang:96,Lambalgen:87,Schnorr:71}), could possibly also be

782: characterized in terms of $\mu/\xi$-randomness for particular

783: choices of $\cal M$.

784:

785: %------------------------------%

786: %\paragraph{Convergence of Random Sequences}%\label{secConvRSeq}

787: %------------------------------%

788: \indxs{random sequence}{convergence} A classical (non-random)

789: real-valued sequence $a_t$ is defined to converge to $a_*$, short

790: $a_t\to a_*$ if $\forall\eps>0\exists t_0\forall t\geq

791: t_0:|a_t-a_*|<\eps$. We are interested in convergence properties

792: of random sequences $z_t(\omega)$ for $t\to\infty$ (e.g.\

793: $z_t(\omega)=\xi(\omega_t|\omega_{<t})-\mu(\omega_t|\omega_{<t})$).

794: %

795: We denote $\mu$-expectations by $\E$. The expected value of a

796: function $f:\X^t\to\Set R$, dependent on $x_{1:t}$, independent of

797: $x_{t+1:\infty}$, and possibly undefined on a set of $\mu$-measure

798: 0, is $\E[f] =

799: \sumprime_{\!x_{1:t}\in\X^t}\mu(x_{1:t})f(x_{1:t})$. The prime

800: denotes that the sum is restricted to $x_{1:t}$ with

801: $\mu(x_{1:t})\neq 0$. Similarly we use $\P[..]$ to denote the

802: $\mu$-probability of event $[..]$.

803: %

804: We define four convergence concepts for random sequences.

805:

806: \index{convergence!with probability 1}%

807: \index{convergence!in the mean}

808: \index{convergence!in mean sum}

809: \index{convergence!in probability}

810: \index{convergence!Martin-L\"of}

811: \index{convergence!$\M$}

812: %------------------------------%

813: \fdefinition{defConv}{Convergence of random sequences}{

814: %------------------------------%

815: Let $z_1(\omega),z_2(\omega),...$ be a sequence of real-valued

816: random variables. $z_t$ is said to

817: converge for $t\to\infty$ to random variable $z_*(\omega)$

818: \begin{itemize}\itemindent8ex

819: \item[$i)$] with probability 1 (w.p.1) $:\Leftrightarrow$

820:   $\P[\{\omega:z_t\to z_*\}]=1$,

821: %  \\ $\Leftrightarrow$

822: %  $\forall\eps:\P[\sup_{s\geq t}|z_t-z_s|\geq\eps]\to 0$ for $t\to\infty$,

823: \item[$ii)$] in mean sum (i.m.s.) $:\Leftrightarrow$

824: $\sum_{t=1}^\infty\E[(z_t-z_*)^2]<\infty$,

825: \item[$iii)$] for every $\mu$-Martin-L{\"o}f random sequence ($\mu$.M.L.) $:\Leftrightarrow$ \\

826: \hspace*{8ex}$\forall\omega:$ $[\exists c\forall n:

827: \MM(\omega_{1:n})\leq c\mu(\omega_{1:n})]$

828:   implies $z_t(\omega)\to z_*(\omega)$ for $t\to\infty$,

829: \item[$iv)$] for every $\mu/\xi$-random sequence ($\mu.\xi$.r.) $:\Leftrightarrow$ \\

830: \hspace*{8ex}$\forall\omega:$ $[\exists c\forall n:

831: \xi(\omega_{1:n})\leq c\mu(\omega_{1:n})]$

832:   implies $z_t(\omega)\to z_*(\omega)$ for $t\to\infty$.

833: \end{itemize}

834: }%------------------------------%

835:

836: \noindent In statistics, $(i)$ is the ``default'' characterization of

837: convergence of random sequences.

838: %

839: Convergence i.m.s.\ $(ii)$ is very strong: it

840: provides a rate of convergence in the sense that the expected

841: number of times $t$ in which $z_t$ deviates more than $\eps$ from

842: $z_*$ is finite and bounded by

843: $\sum_{t=1}^\infty\E[(z_t-z_*)^2]/\eps^2$.

844: Nothing can be said for {\em which} $t$ these deviations occur.

845: If, additionally, $|z_t-z_*|$ were monotone decreasing, then

846: $|z_t-z_*|=o(t^{-1/2})$ could be concluded.

847: %

848: $(iii)$ uses Martin-L\"{o}f's notion of randomness of {\em individual}

849: sequences to define convergence M.L. Since this work

850: deals with general Bayes-mixtures $\xi$, we generalized in $(iv)$

851: the definition of convergence M.L.\ based on $\MM$ to

852: convergence $\mu.\xi$.r.\ based on $\xi$ in a natural way.

853: %

854: One can show that convergence i.m.s.\ implies convergence w.p.1.

855: Also convergence M.L.\ implies convergence w.p.1.

856: \index{random sequence!convergence relations}

857: \index{convergence!relations}

858: %

859: Universality of $\xi$ implies the following posterior convergence results:

860:

861: %------------------------------%

862: %\paragraph{Convergence of $\xi$ to $\mu$}\label{subsecConv}

863: %------------------------------%

864: \index{convergence!$\xi$ to $\mu$}

865:

866: %------------------------------%

867: \ftheorem{thConv}{Convergence of $\xi$ to $\mu$}{

868: %------------------------------%

869: Let there be sequences $x_1x_2...$ over a finite alphabet $\X$

870: drawn with probability $\mu(x_{1:n})\in\M$ for the first $n$

871: symbols, where $\mu$ is a measure. The universal posterior

872: probability $\xi(x_t|x_{<t})$

873: of the next symbol $x_t$ given $x_{<t}$ %defined in (\ref{xidefsp})

874: is related to the true posterior probability $\mu(x_t|x_{<t})$

875: in the following way:\vspace{-1ex}

876: \beqn

877:    \sum_{t=1}^n\E{\textstyle\left[\left(\sqrt{{\xi(x_t|x_{<t})

878:           \over\mu(x_t|x_{<t})}}-1\right)^2\right]} \;\leq\;

879:    \sum_{t=1}^n\E\bigg[\sum_{x'_t}

880:         \left(\sqrt{\xi(x'_t|x_{<t})}-\sqrt{\mu(x'_t|x_{<t})}\right)^2\bigg]

881:         \;\leq\; \ln{w_\mu^{-1}} \;<\; \infty

882: \eeqn

883: where $w_\mu$ is the weight (\ref{defxi}) of $\mu$ in $\xi$.

884: }%------------------------------%

885:

886: \noindent Theorem \ref{thConv} implies

887: \beqn

888:  \mbox{$\sqrt{\xi(x'_t|x_{<t})} \to \sqrt{\mu(x'_t|x_{<t})}$

889:  for any $x'_t$ and

890:  $\sqrt{{\xi(x_t|x_{<t})\over\mu(x_t|x_{<t})}} \to 1$, both

891:  i.m.s.\ for $t\to\infty$}.

892: \eeqn

893: %

894: %Gacs Martingale proof

895: \indxs{semi-martingale}{convergence}\index{martingales}%

896: \noindent The latter strengthens the result

897: $\xi(x_t|x_{<t})/\mu(x_t|x_{<t})\to 1$ w.p.1 derived by G\'acs in

898: \cite[Th.5.2.2]{Li:97} in that it also provides the ``speed'' of

899: convergence.

900:

901: Note also the subtle difference between the two convergence

902: results. For {\em any} sequence $x'_{1:\infty}$ (possibly constant

903: and not necessarily $\mu$-random),

904: $\mu(x'_t|x_{<t})-\xi(x'_t|x_{<t})$ converges to zero w.p.1

905: (referring to $x_{1:\infty}$), but no statement is possible for

906: $\xi(x'_t|x_{<t})/\mu(x'_t|x_{<t})$, since

907: $\lim\,\inf\mu(x'_t|x_{<t})$ could be zero. On the other hand, if

908: we stay {\em on} the $\mu$-random sequence ($x'_{1:\infty} =

909: x_{1:\infty}$), we have $\xi(x_t|x_{<t})/\mu(x_t|x_{<t})

910: \to 1$ (whether $\inf\mu(x_t|x_{<t})$ tends to zero or not does

911: not matter).

912: %

913: Indeed, it is easy to see that $\xi(1|0_{<t})/\mu(1|0_{<t})\propto

914: t\to\infty$ diverges for $\M=\{\mu,\nu\}$, $\mu(1|x_{<t}):=\odt

915: t^{-3}$ and $\nu(1|x_{<t}):=\odt t^{-2}$, although $0_{1:\infty}$ is

916: $\mu$-random. % \cite{Hutter:01op}. No longer there.

917: %

918:

919: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

920: \section{Convergence in Martin-L{\"o}f Sense}\label{secMLconv}

921: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

922:

923: An interesting open question is whether $\xi$ converges to $\mu$

924: (in difference or ratio) individually for all Martin-L\"{o}f

925: random sequences. Clearly, convergence $\mu$.M.L. may at most fail

926: for a set of sequences with $\mu$-measure zero. A convergence

927: M.L.\ result would be particularly interesting and natural for

928: Solomonoff's universal prior $M$, since M.L.\ randomness can be

929: defined in terms of $\MM$ (see Theorem \ref{defML}). Attempts to

930: convert the bounds in Theorem \ref{thConv} to effective

931: $\mu$.M.L.\ randomness tests fail, since $M(x_t|x_{<t})$ is not

932: enumerable. The proof given of $M/\mu\stackrel{M.L.}\longrightarrow 1$

933: in \cite[Th.5.2.2]{Li:97} and \cite[Th.10]{Vitanyi:00} is

934: incomplete.$\!$\footnote{The formulation of their Theorem is quite

935: misleading in general: ``{\it Let $\mu$ be a positive recursive

936: measure. If the length of $y$ is fixed and the length of $x$ grows

937: to infinity, then $M(y|x)/\mu(y|x)\to 1$ with $\mu$-probability

938: one. The infinite sequences $\omega$ with prefixes $x$ satisfying

939: the displayed asymptotics are precisely [`$\Rightarrow$' {\em and}

940: `$\Leftarrow$'] the $\mu$-random sequences.}'' First, for

941: off-sequence $y$ convergence w.p.1 does not hold ($xy$ must be

942: demanded to be a prefix of $\omega$). Second, the proof of

943: `$\Leftarrow$' is loopy (see main text). Last, `$\Rightarrow$' is

944: given without proof and is probably wrong. Also the assertion in

945: \cite[Th.5.2.1]{Li:97} that $S_t:=\E\sum_{x'_t}

946: (\mu(x'_t|x_{<t})-M(x'_t|x_{<t}))^2$ converges to zero faster than

947: $1/t$ cannot be made, since $S_t$ may not decrease monotonically.}

948: The implication ``$\MM(x_{1:n})\leq c\cdot\mu(x_{1:n})\forall

949: n\Rightarrow \lim_{n\to\infty}\MM(x_{1:n})/\mu(x_{1:n})$ exists''

950: has been used, but not proven, and may indeed be wrong.

951:

952: Vovk \cite{Vovk:87} shows for two finitely computable

953: semimeasures $\mu$ and $\rho$ and $x_{1:\infty}$ being $\mu$

954: {\em and} $\rho$ M.L.\ random that

955: \beqn

956: \sum_{t=1}^\infty\sum_{x'_t}\left(\sqrt{\mu(x'_t|x_{<t})}-\sqrt{\rho(x'_t|x_{<t})}\right)^2<\infty

957: \qmbox{and}

958: \sum_{t=1}^\infty\left({\rho(x_t|x_{<t})\over\mu(x_t|x_{<t})}-1\right)^2<\infty.

959: \eeqn

960: If $\MM$ were recursive, then this would imply posterior

961: $\MM\to\mu$ and $\MM/\mu\to 1$ for every $\mu$.M.L.\ random

962: sequence $x_{1:\infty}$, since {\em every} sequence is $\MM$.M.L.\

963: random. Since $\MM$ is {\em not} recursive, Vovk's theorem cannot

964: be applied and it is not obvious how to generalize it. So the

965: question of individual convergence remains open. More generally,

966: one may ask whether $\xi_\M\to\mu$ for every $\mu/\xi$-random

967: sequence. It turns out that this is true for some $\M$, but false for others.

968:

969: %------------------------------%

970: \ftheorem{thMLConv}{$\mu/\xi$-convergence of $\xi$ to $\mu$}{

971: %------------------------------%

972: Let $\X=\B$ be binary and

973: $\M_\Theta:=\{\mu_\th:\mu_\th(1|x_{<t})=\th\,\forall t,\;

974: \th\in\Theta\}$ be the set of Bernoulli($\th$) distributions

975: with parameters $\th\in\Theta$. Let $\Theta_D$ be a countable

976: dense subset of $[0,1]$, e.g.\ $[0,1]\cap\Set Q$ and let $\Theta_G$

977: be a countable subset of $[0,1]$ with a gap in the sense that

978: there exist $0<\th_0<\th_1<1$ such that

979: $[\th_0,\th_1]\cap\Theta_G=\{\th_0,\th_1\}$, e.g.\

980: $\Theta_G=\{\odf,\odt\}$ or $\Theta_G=([0,{1\over

981: 4}]\cup[{1\over 2},1])\cap\Set Q$. Then

982: \begin{itemize}

983: \item[$i)$] If $x_{1:\infty}$ is $\mu/\xi_{\M_{\Theta_D}}$ random with

984: $\mu\in\M_{\Theta_D}$, then $\xi_{\M_{\Theta_D}}(x_t|x_{<t})\to\mu(x_t|x_{<t})$,

985: \item[$ii)$] There are $\mu\in\M_{\Theta_G}$ and $\mu/\xi_{\M_{\Theta_G}}\!\!$

986: random $x_{1:\infty}$ for which

987: $\xi_{\M_{\Theta_G}}\!\!(x_t|x_{<t})\not\to\mu(x_t|x_{<t})$.

988: \end{itemize}\vspace{-1ex}

989: }%------------------------------%

990:

991: \noindent Our original/main motivation for studying

992: $\mu/\xi$-randomness is the implication of Theorem \ref{thMLConv}

993: that $\MM\stackrel{\mbox{\tiny M.L.}}\longrightarrow\mu$ cannot be

994: decided from $M$ being a mixture distribution or from the

995: universality property (Theorem \ref{thUniM}) alone. Further

996: structural properties of $\M_{enum}^{semi}$ have to be employed.

997: For Bernoulli sequences, convergence $\mu.\xi_{\M_\Theta}$.r.\ is

998: related to denseness of $\M_\Theta$. Maybe a denseness

999: characterization of $\M_{enum}^{semi}$ can solve the question of

1000: convergence M.L.\ of $M$. The property $\MM\in\M_{enum}^{semi}$ is

1001: also not sufficient to resolve this question, since there are

1002: $\M\ni\xi$ for which $\xi\stackrel{\mu.\xi.r}\longrightarrow\mu$

1003: and $\M\ni\xi$ for which

1004: $\xi\not\stackrel{\mu.\xi.r}\longrightarrow\mu$. Theorem

1005: \ref{thMLConv} can be generalized to i.i.d.\ sequences over

1006: general finite alphabet $\X$.

1007:

1008: The idea to prove $(ii)$ is to construct a sequence $x_{1:\infty}$

1009: which is $\mu_{\th_0}/\xi$-random {\em and} $\mu_{\th_1}/\xi$-random

1010: for $\th_0\neq\th_1$. This is possible if and only if $\Theta$

1011: contains a gap and $\th_0$ and $\th_1$ are the boundaries of the

1012: gap. Obviously $\xi$ cannot converge to $\th_0$ {\em and} $\th_1$,

1013: thus proving $\M$-non-convergence. For no $\th\in[0,1]$ will this

1014: $x_{1:\infty}$ be $\mu_\th$ M.L.-random. Finally, the proof of

1015: Theorem \ref{thMLConv}

1016: makes essential use of the mixture representation of $\xi$, as

1017: opposed to the proof of Theorem \ref{thConv} which only needs

1018: dominance $\xi\geqm\M$.

1019:

1020: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1021: \section{Conclusions}\label{secConc}

1022: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1023:

1024: For a hierarchy of four computability definitions, we completed

1025: the classification of the existence of computable (semi)measures

1026: dominating all computable (semi)measures. Dominance is an important

1027: property of a prior, since it implies rapid convergence of the

1028: corresponding posterior with probability one.

1029: %

1030: A strengthening would be convergence for all Martin-L{\"o}f (M.L.)

1031: random sequences. This seems natural, since M.L.\ randomness can

1032: be defined in terms of Solomonoff's prior $M$, so there is a close

1033: connection.

1034: %

1035: Contrary to what was believed before, the question of posterior

1036: convergence $M/\mu\to 1$ for all M.L.\ random sequences is still

1037: open. We introduced a new flexible notion of $\mu/\xi$-randomness

1038: which contains Martin-L{\"o}f randomness as a special case. Though

1039: this notion may have a wider range of application, the main

1040: purpose for its introduction was to show that standard proof

1041: attempts of $M/\mu\stackrel{M.L.}\longrightarrow 1$ based on

1042: dominance only must fail. This follows from the

1043: derived result that the validity of $\xi/\mu\to 1$ for

1044: $\mu/\xi$-random sequences depends on the Bayes mixture $\xi$.

1045:

1046: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1047: %         Bibliography        %

1048: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1049: {\small

1050: \begin{thebibliography}{Wan96}

1051:

1052: \bibitem[Hut01]{Hutter:01alpha}

1053: M.~Hutter.

1054: \newblock Convergence and error bounds of universal prediction for general

1055:   alphabet.

1056: \newblock {\em Proceedings of the 12th European Conference on Machine Learning

1057:   (ECML-2001)}, pages 239--250, 2001.

1058:

1059: \bibitem[Hut03]{Hutter:03unimdl}

1060: M.~Hutter.

1061: \newblock Sequence prediction based on monotone complexity.

1062: \newblock Technical Report IDSIA-09-03, 2003.

1063:

1064: \bibitem[Lam87]{Lambalgen:87}

1065: {M. van} Lambalgen.

1066: \newblock {\em Random Sequences}.

1067: \newblock PhD thesis, University of Amsterdam, 1987.

1068:

1069: \bibitem[Lev73]{Levin:73random}

1070: L.~A. Levin.

1071: \newblock On the notion of a random sequence.

1072: \newblock {\em Soviet Math. Dokl.}, 14(5):1413--1416, 1973.

1073:

1074: \bibitem[LV97]{Li:97}

1075: M.~Li and P.~M.~B. Vit\'anyi.

1076: \newblock {\em An introduction to {Kolmogorov} complexity and its

1077:   applications}.

1078: \newblock Springer, 2nd edition, 1997.

1079:

1080: \bibitem[Sch71]{Schnorr:71}

1081: C.~P. Schnorr.

1082: \newblock {\em Zuf{\"a}lligkeit und Wahrscheinlichkeit}.

1083: \newblock Springer, Berlin, 1971.

1084:

1085: \bibitem[Sch02]{Schmidhuber:02gtm}

1086: J.~Schmidhuber.

1087: \newblock Hierarchies of generalized {Kolmogorov} complexities and

1088:   nonenumerable universal measures computable in the limit.

1089: \newblock {\em International Journal of Foundations of Computer Science},

1090:   13(4):587--612, 2002.

1091:

1092: \bibitem[Sol64]{Solomonoff:64}

1093: R.~J. Solomonoff.

1094: \newblock A formal theory of inductive inference: Part 1 and 2.

1095: \newblock {\em Inform. Control}, 7:1--22, 224--254, 1964.

1096:

1097: \bibitem[Sol78]{Solomonoff:78}

1098: R.~J. Solomonoff.

1099: \newblock Complexity-based induction systems: comparisons and convergence

1100:   theorems.

1101: \newblock {\em IEEE Trans. Inform. Theory}, IT-24:422--432, 1978.

1102:

1103: \bibitem[VL00]{Vitanyi:00}

1104: P.~M. Vit{\'a}nyi and M.~Li.

1105: \newblock Minimum description length induction, {B}ayesianism, and {K}olmogorov

1106:   complexity.

1107: \newblock {\em IEEE Trans. on Information Theory}, 46(2):446--464, 2000.

1108:

1109: \bibitem[Vov87]{Vovk:87}

1110: V.~G. Vovk.

1111: \newblock On a randomness criterion.

1112: \newblock {\em DOKLADY: Russian Academy of Sciences Doklady. Mathematics

1113:   (formerly Soviet Mathematics--Doklady)}, 35(3):656--660, 1987.

1114:

1115: \bibitem[Wan96]{Wang:96}

1116: Y.~Wang.

1117: \newblock {\em Randomness and Complexity}.

1118: \newblock PhD thesis, 1996.

1119:

1120: \bibitem[ZL70]{Zvonkin:70}

1121: A.~K. Zvonkin and L.~A. Levin.

1122: \newblock The complexity of finite objects and the development of the concepts

1123:   of information and randomness by means of the theory of algorithms.

1124: \newblock {\em Russian Mathematical Surveys}, 25(6):83--124, 1970.

1125:

1126: \end{thebibliography}

1127: }

1128:

1129: \end{document}

1130: %---------------------End-of-UniPriors.tex--------------------%

1131: