0503:cs0503026/cs0503026

1:

2: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

3: %%        On Generalized Computable Universal Priors         %%

4: %%                 and their Convergence                     %%

5: %%             Marcus Hutter: Start: 01.08.02                %%

6: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

7:

8: \newif\ifjournal\journalfalse   % journal style versus no-style

9:

10: %-------------------------------%

11: %   Document-Style              %

12: %-------------------------------%

13: \ifjournal

14: \documentclass{elsart}

15: \usepackage{latexsym}

16: \sloppy

17:

18: \else

19:

20: \documentclass[12pt,twoside]{article}

21: \usepackage{latexsym}

22:

23: \pagestyle{myheadings}

24: \markboth{\sc Marcus Hutter, Technical Report, IDSIA-05-05

25: }{\sc Computable Universal Priors}

26: \setcounter{tocdepth}{4} \setcounter{secnumdepth}{2}

27: \topmargin=0mm  \oddsidemargin=5mm \evensidemargin=5mm

28: \textwidth=15cm \textheight=22cm

29: \sloppy

30: \fi

31:

32: %-------------------------------%

33: %       My Math-Spacings        %

34: %-------------------------------%

35: \ifjournal

36: \def\beq{\begin{equation}}    \def\eeq{\end{equation}}

37: \def\beqn{\begin{displaymath}}\def\eeqn{\end{displaymath}}

38: \def\bqa{\begin{eqnarray}}    \def\eqa{\end{eqnarray}}

39: \def\bqan{\begin{eqnarray*}}  \def\eqan{\end{eqnarray*}}

40: \else

41: \def\,{\mskip 3mu} \def\>{\mskip 4mu plus 2mu minus 4mu} \def\;{\mskip 5mu plus 5mu} \def\!{\mskip-3mu}

42: \def\dispmuskip{\thinmuskip= 3mu plus 0mu minus 2mu \medmuskip=  4mu plus 2mu minus 2mu \thickmuskip=5mu plus 5mu minus 2mu}

43: \def\textmuskip{\thinmuskip= 0mu                    \medmuskip=  1mu plus 1mu minus 1mu \thickmuskip=2mu plus 3mu minus 1mu}

44: \textmuskip

45: \def\eqsp{\vspace{0ex}}

46: \def\beq{\dispmuskip\eqsp\begin{equation}}    \def\eeq{\eqsp\end{equation}\textmuskip}

47: \def\beqn{\dispmuskip\eqsp\begin{displaymath}}\def\eeqn{\eqsp\end{displaymath}\textmuskip}

48: \def\bqa{\dispmuskip\eqsp\begin{eqnarray}}    \def\eqa{\eqsp\end{eqnarray}\textmuskip}

49: \def\bqan{\dispmuskip\eqsp\begin{eqnarray*}}  \def\eqan{\eqsp\end{eqnarray*}\textmuskip}

50: \fi

51:

52: %-------------------------------%

53: %   Macro-Definitions           %

54: %-------------------------------%

55: \ifjournal

56: \def\cal{\mathcal}

57: \else

58: \newenvironment{keyword}{\centerline{\bf\small

59: Keywords}\vspace{-1ex}\begin{quote}\small}{\par\end{quote}\vskip 1ex}

60: \fi

61: \newtheorem{theorem}{Theorem}

62: \newtheorem{corollary}[theorem]{Corollary}

63: \newtheorem{lemma}[theorem]{Lemma}

64: \newtheorem{definition}[theorem]{Definition}

65: \newtheorem{tablex}[theorem]{Table}

66: \newtheorem{figurex}[equation]{Figure}

67:

68: \def\ftheorem#1#2#3{\begin{theorem}[#2]\label{#1} #3 \end{theorem} }

69: \def\fcorollary#1#2#3{\begin{corollary}[#2]\label{#1} #3 \end{corollary} }

70: \def\flemma#1#2#3{\begin{lemma}[#2]\label{#1} #3 \end{lemma} }

71: \def\fdefinition#1#2#3{\begin{definition}[#2]\label{#1} #3 \end{definition} }

72: \def\ftablex#1#2#3{\begin{tablex}[#2]\label{#1} #3 \end{tablex} }

73: \def\ffigurex#1#2#3#4{{#4}\begin{figurex}[#2]\label{#1}#3\end{figurex}}

74:

75: \ifjournal

76: \def\paradot#1{{\itshape{#1.}}}

77: \def\paranodot#1{{\itshape{#1}}}

78: \else

79: \def\myparskip{\vspace{1.5ex plus 0.5ex minus 0.5ex}\noindent}

80: \def\paradot#1{\myparskip{\bfseries\boldmath{#1.}}}

81: \def\paranodot#1{\myparskip{\bfseries\boldmath{#1}}}

82: \fi

83: \def\toinfty#1{\stackrel{#1\to\infty}{\longrightarrow}}

84: \def\nq{\hspace{-1em}}

85: \def\qed{\hspace*{\fill}$\Box\quad$}

86: \def\odn{{\textstyle{1\over n}}}

87: \def\odt{{\textstyle{1\over 2}}}

88: \def\odf{{\textstyle{1\over 4}}}

89: \def\eps{\varepsilon}                   % for small positive number

90: \def\epstr{\epsilon}                    % for empty string

91: \def\qmbox#1{{\quad\mbox{#1}\quad}}

92: \def\argmax{\mathop{\rm arg\,max}}          % maxarg

93: \def\argmin{\mathop{\rm arg\,min}}          % minarg

94: \def\geqm{\unrhd}

95: \def\ngeqm{{\not\unrhd}}

96: \def\v#1{{\bf #1}}

97: \def\l{{\ell}}                          % length of string or program

98: \def\M{{\cal M}}                        % Set of prob. distributions

99: \def\X{{\cal X}}                        % input/perception set/alphabet

100: \def\Y{{\cal Y}}                        % output/action set/alphabet

101: \def\R{{\cal R}}                        % reward set subset of reals

102: \def\F{{\cal F}}                        % Generic performance measure

103: \def\I{{\cal I}}                        % some set

104: \def\S{{\cal S}}                        % some set

105: \def\Q{{\cal Q}}

106: \def\E{{\bf E}}                         % Expectation value

107: \def\P{{\bf P}}                         % Probability

108: \def\B{\{0,1\}}                        % Binary set (or \SetB)

109: \def\Km{K\!m}

110: \def\MM{M}                              % Solomonoff's prior

111: \def\th{\theta}

112: \def\e{{\rm e}}                        % natural e

113: \def\SetN{I\!\!N} \def\SetQ{I\!\!\!Q} \def\SetR{I\!\!R} \def\SetZ{Z\!\!\!Z}

114: \def\lb{\log}

115: \def\sumprime{\mathop{{\sum\nolimits'}}}

116: \def\text#1{\mbox{\scriptsize{#1}}}    % if not using amstex

117:

118: \begin{document}

119: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

120: %                      T i t l e - P a g e                      %

121: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

122:

123: \ifjournal

124:

125: \begin{frontmatter}

126: \title{On Generalized Computable Universal Priors and their Convergence}

127: \author{Marcus Hutter}

128: \address{IDSIA, Galleria 2, CH-6928 Manno-Lugano, Switzerland \\

129: marcus@idsia.ch \hspace{9ex} http://www.idsia.ch/$^{_{_\sim}}\!$marcus}

130:

131: \thanks{A preliminary version appeared

132:   in the proceedings of the ALT 2003 conference \cite{Hutter:03unipriors}.

133:   This work was supported by SNF grant 2000-61847.00 to J\"urgen Schmidhuber.}

134:

135: \else

136:

137: \title{\vskip -25mm\normalsize\sc Technical Report \hfill IDSIA-05-05

138: \vskip 2mm\bf\LARGE\hrule height5pt \vskip 3mm

139: \sc On Generalized Computable Universal Priors and their Convergence%

140: \thanks{A preliminary version appeared

141:   in the proceedings of the ALT 2003 conference \cite{Hutter:03unipriors}.\newline

142:   \hspace*{4ex}This work was supported by SNF grant 2000-61847.00 to J\"urgen Schmidhuber.}

143: \vskip 2mm \hrule height2pt \vskip 5mm}

144: \author{{\bf Marcus Hutter}\\[3mm]

145: \normalsize IDSIA, Galleria 2, CH-6928\ Manno-Lugano, Switzerland\\

146: \normalsize marcus@idsia.ch \hspace{8.5ex} http://www.idsia.ch/$^{_{_\sim}}\!$marcus}

147: \date{11 March 2005}

148: \maketitle

149:

150: \fi

151:

152: \begin{abstract}

153: \noindent Solomonoff unified Occam's razor and Epicurus' principle

154: of multiple explanations to one elegant, formal, universal theory

155: of inductive inference, which initiated the field of algorithmic

156: information theory. His central result is that the posterior of

157: the universal semimeasure $\MM$ converges rapidly to the true

158: sequence generating posterior $\mu$, if the latter is computable.

159: Hence, $M$ is eligible as a universal predictor in case of unknown

160: $\mu$. The first part of the paper investigates the existence and

161: convergence of computable universal (semi)measures for a hierarchy

162: of computability classes: recursive, estimable, enumerable, and

163: approximable. For instance, $\MM$ is known to be enumerable, but

164: not estimable, and to dominate all enumerable semimeasures. We

165: present proofs for discrete and continuous semimeasures. The

166: second part investigates more closely the types of convergence,

167: possibly implied by universality: in difference and in ratio, with

168: probability 1, in mean sum, and for Martin-L{\"o}f random

169: sequences. We introduce a generalized concept of randomness for

170: individual sequences and use it to exhibit difficulties regarding

171: these issues. In particular, we show that convergence fails

172: (holds) on generalized-random sequences in gappy (dense) Bernoulli

173: classes.

174: \end{abstract}

175:

176: \begin{keyword}

177: Sequence prediction;

178: Algorithmic Information Theory;

179: Solomonoff's prior;

180: universal probability;

181: mixture distributions;

182: posterior convergence;

183: computability concepts;

184: Martin-L{\"o}f randomness.

185: \end{keyword}

186:

187: \ifjournal\end{frontmatter}\else\pagebreak\fi

188:

189: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

190: \section{Introduction}\label{secIntro}

191: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

192:

193: %Induction and Occam's razor

194: All induction problems can be phrased as sequence prediction

195: tasks. This is, for instance, obvious for time-series prediction,

196: but also includes classification tasks. Having observed data $x_t$

197: at times $t<n$, the task is to predict the $n$-th symbol $x_n$ from

198: sequence $x=x_1...x_{n-1}$.

199: %

200: The key concept to attack general induction problems is {\em

201: Occam's razor} (simplicity) principle, which says that ``{\em

202: Entities should not be multiplied beyond necessity}.'' and to a

203: lesser extent Epicurus' principle of multiple explanations. The

204: former/latter may be interpreted as to keep the simplest/all

205: theories consistent with the observations $x_1...x_{t-1}$ and to

206: use these theories to predict $x_t$.

207: %

208: %Kolmogorov complexity

209: Kolmogorov (and others) defined the complexity of a string as the

210: length of its shortest description on a universal Turing machine.

211: The Kolmogorov complexity $K$ is an excellent universal complexity

212: measure, suitable for quantifying Occam's razor. There is (only)

213: one disadvantage: $K$ is not computable.

214:

215: % computability concepts

216: More precisely, a function $f$ is said to be {\em recursive} (or

217: {\em finitely computable}) if there exists a Turing machine that,

218: given $x$, computes $f(x)$ and then halts. Some functions are not

219: recursive but still {\em approximable} (or {\em limit-computable})

220: in the sense that there is a nonhalting Turing machine with an

221: infinite ($x$-dependent) output sequence $y_1,y_2,y_3,...$ and

222: $\lim_{t\to\infty}y_t=f(x)$. If additionally the output sequence

223: is monotone increasing/decreasing, then $f$ is said to be {\em

224: lower/upper semicomputable} (or {\em enumerable/co-enumerable}).

225: Finally we call $f$ {\em estimable} if some Turing machine, given

226: $x$ and a precision $\eps$, finitely computes an

227: $\eps$-approximation of $f(x)$.

228: %

229: The major algorithmic property of $K$ is that it is co-enumerable,

230: but not recursive.

231:

232: %Solomonoff's universal prior

233: More suitable for predictions is Solomonoff's

234: \cite{Solomonoff:64,Solomonoff:78} {\em universal prior} $\MM(x)$

235: defined as the probability that the output of a universal monotone Turing

236: machine $U$ starts with string $x$ when provided with fair

237: coin flips on the input tape. $\MM(x)$ is enumerable and roughly

238: $2^{-K(x)}$, hence implementing Occam's and also Epicurus'

239: principles.

240:

241: %Universal sequence prediction (dominance and convergence)

242: Assume now that strings $x$ are sampled from a probability

243: distribution $\mu$, i.e.\ the probability of a string starting

244: with $x$ shall be $\mu(x)$.

245: %

246: The probability of observing $x_t$ at time $t$, given past

247: observations $x_1...x_{t-1}$ is

248: $\mu(x_t|x_1...x_{t-1})=\mu(x_1...x_t)/\mu(x_1...x_{t-1})$.

249: %

250: Solomonoff's \cite{Solomonoff:78} central result is that the

251: universal posterior

252: $\MM(x_t|x_1...x_{t-1})=\MM(x_1...x_t)/\MM(x_1...x_{t-1})$

253: converges rapidly to the true (objective) posterior probability

254: $\mu(x_t|x_1...x_{t-1})$, if $\mu$ is an estimable measure, hence

255: $\MM$ can be used for predictions in case of unknown $\mu$.

256: %

257: One representation of $\MM$ is as a $2^{-K(\mu)}$-weighted sum of

258: {\em all} enumerable ``defective'' probability measures, called

259: semimeasures.

260: %

261: The (from this representation obvious) dominance $\MM(x) \geq

262: 2^{-K(\mu)}\mu(x)$ for all enumerable $\mu$ is the central

263: ingredient in the convergence proof.

264:

265: %General mixture distributions

266: Dominance and convergence immediately generalize to arbitrary

267: weighted sums of (semi)measures of some arbitrary countable set

268: $\M$.

269: %

270: So what is so special about the class of all enumerable

271: semimeasures $\M_{enum}^{semi}$? The larger we choose $\M$ the

272: less restrictive is the essential assumption that $\M$ should

273: contain the true distribution $\mu$.

274: %

275: Why not restrict to the still rather general class of estimable or

276: recursive (semi)measures? For {\em every} countable

277: class $\M$ and $\xi_\M(x):=\sum_{\nu\in\M} w_\nu \nu(x)$ with

278: $w_\nu>0$, the important dominance $\xi_\M(x)\geq w_\nu

279: \nu(x)\,\forall\nu\in\M$ is satisfied. The question is what

280: properties $\xi_\M$ possesses. The distinguishing property of

281: $\M_{enum}^{semi}$ is that $\MM=\xi_{\M_{enum}^{semi}}$ is itself

282: an element of $\M_{enum}^{semi}$.

283: %

284: On the other hand, for prediction, $\xi_\M\in\M$ is not by itself

285: an important property. What matters is  whether $\xi_\M$ is

286: computable (in one of the senses we defined above) to avoid

287: getting into the (un)realm of non-constructive math.

288:

289: %1st goal of this work

290: Our first contribution is to classify the existence of generalized

291: computable (semi)measures.

292: %

293: From \cite{Zvonkin:70} we know that there is an enumerable

294: semimeasure (namely $\MM$) that dominates all enumerable

295: semimeasures in $\M_{enum}^{semi}$. We show that there is {\em no}

296: estimable semimeasure that dominates all recursive measures (also

297: mentioned in \cite{Zvonkin:70}), and there is {\em no}

298: approximable semimeasure that dominates all approximable measures.

299: From this it follows that for a universal (semi)measure that at

300: least satisfies the weakest form of computability, namely being

301: approximable, the largest dominated class among the classes

302: considered in this work is the class of enumerable semimeasures.

303: This is the distinguishing property of $\M_{enum}^{semi}$ and

304: $\MM$.

305: %

306: This investigation was motivated by recent

307: generalizations of Kolmogorov complexity and Solomonoff's prior by

308: Schmidhuber \cite{Schmidhuber:00toe,Schmidhuber:02gtm}.

309:

310: %2nd goal of this work

311: The second contribution is to investigate more closely the types of

312: convergence, possibly implied by universality: in difference and

313: in ratio, with probability 1, in mean sum, and for Martin-L{\"o}f

314: random sequences.

315: %

316: We introduce a generalized concept of randomness for individual

317: sequences and use it to exhibit difficulties regarding these

318: issues. More concretely, we consider countable classes $\M$ of

319: Bernoulli environments and show that $\xi_\M$ converges to $\mu$

320: on all generalized random sequences if and only if the class is

321: dense.

322:

323: %------------------------------%

324: \paradot{Contents}

325: %------------------------------%

326: In Section~\ref{secCC} we review various computability concepts

327: and discuss their relation.

328: %

329: In Section~\ref{secUniM} we define the prefix Kolmogorov

330: complexity $K$, the concept of (semi)measures, Solomonoff's

331: universal prior $\MM$, and explain its universality.

332: %

333: Section~\ref{secUSP} summarizes Solomonoff's major convergence

334: result, discusses general mixture distributions and the important

335: universality property -- multiplicative dominance.

336: %

337: In Section~\ref{secUSM} we define seven classes of (semi)measures

338: based on four computability concepts. Each class may or may not

339: contain a (semi)measure that dominates all elements of another

340: class. We reduce the analysis of these 49 cases to four basic

341: cases. Domination (essentially by $\MM$) is known to be true for

342: two cases. The other two cases do not allow for domination.

343: %

344: In Section~\ref{secConv} we investigate more closely the type of

345: convergence implied by universality. We summarize the result on

346: posterior convergence in difference $(\xi-\mu\to 0)$ and improve

347: the previous result \cite{Li:97} on the convergence in ratio

348: $\xi/\mu\to 1$ by showing rapid convergence without use

349: of martingales.

350: %

351: In Section~\ref{secMLconv} we investigate whether convergence for

352: all Martin-L{\"o}f random sequences could hold. We define a

353: generalized concept of randomness for individual sequences and use

354: it to show that proofs based on universality cannot decide this

355: question.

356: %

357: Section~\ref{secConc} concludes the paper.

358:

359: %------------------------------%

360: \paradot{Notation}

361: %------------------------------%

362: %Strings

363: We denote strings of length $n$ over finite alphabet $\X$ by

364: $x=x_1x_2...x_n$ with $x_t\in\X$ and further abbreviate

365: $x_{1:n}:=x_1x_2...x_{n-1}x_n$ and $x_{<n}:=x_1... x_{n-1}$,

366: $\epstr$ for the empty string, $\l(x)$ for the length of string $x$,

367: and $\omega=x_{1:\infty}$ for infinite sequences.

368: We write $xy$ for the concatenation of string $x$ with $y$.

369: %

370: % Asymptotic notation

371: We abbreviate $\lim_{n\to\infty}[f(n)-g(n)]=0$ by

372: $f(n)\toinfty{n}g(n)$ and say $f$ converges to $g$, without

373: implying that $\lim_{n\to\infty}g(n)$ itself exists. We write

374: $f(x)\geqm  g(x)$ for $g(x)=O(f(x))$, i.e.\ if $\exists c>0:

375: f(x)\geq c g(x)\forall x$.

376:

377: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

378: \section{Computability Concepts}\label{secCC}

379: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

380: % computability concepts

381: We define several computability concepts weaker than can be captured

382: by halting Turing machines.

383:

384: %------------------------------%

385: \fdefinition{defCompFunc}{Computable functions}{

386: %------------------------------%

387: We consider functions $f:\SetN\to\SetR$:

388: \begin{itemize}\ifjournal\parskip=0ex\parsep=0ex\itemsep=0.5ex\fi

389: \item[]

390: $\nq f$ is {\em recursive} or {\em finitely computable} {\it iff}

391: there are Turing machines $T_{1/2}$ with output interpreted as natural

392: numbers and $f(x)={T_1(x)\over T_2(x)}$,

393: \item[]

394: $\nq f$ is {\em approximable} or {\em limit-computable} {\it iff}

395: $\exists$ recursive $\phi(\cdot,\cdot)$ with

396: $\lim_{t\to\infty}\phi(x,t)=f(x)$.

397: \item[]

398: $\nq f$ is {\em enumerable} or {\em lower semicomputable} {\it

399: iff} additionally $\phi(x,t)\leq\phi(x,t+1)$.

400: \item[]

401: $\nq f$ is {\em co-enumerable} or {\em upper semicomputable} {\it

402: iff} $[-f]$ is lower semicomputable.

403: \item[]

404: $\nq f$ is {\em semicomputable} {\it iff} $f$ is lower- {\it or}

405: upper semicomputable.

406: \item[]

407: $\nq f$ is {\em estimable} {\it iff} $f$ is lower- {\it and} upper

408: semicomputable.

409: \end{itemize}

410: }%------------------------------%

411:

412: \noindent If $f$ is estimable we can finitely compute an

413: $\eps$-approximation of $f$ by upper and lower semicomputing $f$

414: and terminating when differing by less than $\eps$. This means

415: that there is a Turing machine which, given $x$ and $\eps$,

416: finitely computes $\hat y\in\SetQ$ such that $|\hat y-f(x)|<\eps$.

417: Moreover it gives an interval estimate $f(x)\in[\hat y-\eps,\hat

418: y+\eps]$. An estimable integer-valued function is recursive (take

419: any $\eps<\odt$).

420: %

421: Note that if $f$ is only approximable or semicomputable we can

422: still come arbitrarily close to $f(x)$ but we cannot devise a

423: terminating algorithm that produces an $\eps$-approximation. In

424: the case of lower/upper semicomputability we can at least

425: finitely compute lower/upper bounds to $f(x)$. In case of

426: approximability, the weakest computability form, even this

427: capability is lost.

428:

429: \begin{center}\small

430: \fbox{\parbox{11ex}{recursive=\\ finitely\\ computable}}

431: $\Rightarrow$

432: \fbox{\parbox{9ex}{estimable}}

433: %

434: \parbox{26ex}{\raisebox{-3ex}{$\Rightarrow$} \fbox{

435: \parbox{17ex}{enumerable=\\lower semi-\\ computable}}

436: \raisebox{-3ex}{$\Rightarrow$} \\[2ex]

437: \raisebox{3ex}{$\Rightarrow$} \fbox{

438: \parbox{17ex}{co-enumerable=\\ upper semi-\\

439: computable}} \raisebox{3ex}{$\Rightarrow$}}

440: \fbox{\parbox{11ex}{semi-\\ computable}}

441: $\Rightarrow$

442: \fbox{\parbox{18ex}{approximable=\\ limit-computable}}

443: \end{center}

444:

445: \noindent What we call {\em estimable/recursive/finitely

446: computable} is often just called {\em computable}, but it makes

447: sense to separate the concepts in this work, since finite

448: computability is conceptually easier and some previous results

449: have only been proved for this case. Sometimes we use

450: the word {\em computable} generically for some of the

451: computability forms of Definition~\ref{defCompFunc}.

452:

453: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

454: \section{The Universal Prior $\MM$}\label{secUniM}

455: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

456:

457: % Universal prior

458: The prefix Kolmogorov complexity $K(x)$ is defined as the length

459: of the shortest binary (prefix) program $p\in\B^*$ for which a

460: universal prefix Turing machine $U$ (with binary program tape and

461: $\X$ary output tape) outputs string $x\in\X^*$, and similarly

462: $K(x|y)$ in case of side information $y$

463: \cite{Kolmogorov:65,Levin:74,Gacs:74,Chaitin:75}:

464: \beqn

465:   K(x)=\min\{\l(p):U(p)=x\},\qquad

466:   K(x|y)=\min\{\l(p):U(p,y)=x\}

467: \eeqn

468: Solomonoff \cite[Eq.(7)]{Solomonoff:64} defined (earlier) the

469: closely related quantity, the universal posterior

470: $\MM(y|x)=M(xy)/M(x)$.

471: %

472: The universal prior $M(x)$ can be defined as the probability that

473: the output of a universal monotone Turing machine $U$ starts with

474: $x$ when provided with fair coin flips on the input tape.

475: Formally, $\MM$ can be defined as

476: \beq\label{Mdef}

477:   \MM(x)\;:=\;\sum_{p\;:\;U(p)=x*}\nq 2^{-\l(p)}

478: \eeq

479: where the sum is over minimal programs $p$ for which $U$ outputs a

480: string starting with $x$. The so-called minimal programs are

481: defined similarly to the prefix programs, but $U$ need not

482: halt, which is indicated by the $*$. Minimal programs are those

483: which are to the left of the input head at the moment when $U$ wrote the

484: last bit of $x$ \cite{Li:97,Hutter:04uaibook}.

485: %

486: Before we can discuss the stochastic properties of $\MM$ we

487: need the concept of (semi)measures for strings.

488:

489: %------------------------------%

490: \fdefinition{defSemi}{Continuous (Semi)measures}{

491: %------------------------------%

492: $\mu(x)$ denotes the probability that a sequence starts

493: with string $x$. We call $\mu\geq 0$ a (continuous) semimeasure if

494: $\mu(\epstr)\leq 1$ and $\mu(x)\geq\sum_{a\in\X}\mu(xa)$, and a

495: (probability) measure if equalities hold.

496: }%------------------------------%

497:

498: % motivation of nomenclature

499: \noindent The reason for calling $\mu$ with the above property a

500: probability measure is that it satisfies Kolmogorov's axioms of

501: probability in the following sense: The sample space is

502: $\X^\infty$ with elements

503: $\omega=\omega_1\omega_2\omega_3...\in\X^\infty$ being infinite

504: sequences over alphabet $\X$. The set of events (the

505: $\sigma$-algebra) is defined as the

506: set generated from the cylinder sets

507: $\Gamma_{x_{1:n}}:=\{\omega:\omega_{1:n}=x_{1:n}\}$ by countable

508: union and complement. A probability

509: measure $\mu$ is uniquely defined by giving its values

510: $\mu(\Gamma_{x_{1:n}})$ on the cylinder sets, which we abbreviate

511: by $\mu(x_{1:n})$. We will also call $\mu$ a measure, or even more

512: loosely a probability distribution.

513:

514: \noindent We have $\sum_{a\in\X}\MM(xa)<\MM(x)$ because there are

515: programs $p$ that output $x$, not followed by any $a\in\X$.

516: They just stop after printing $x$ or continue forever without any

517: further output. Together with $\MM(\epstr)=1$ this shows that $\MM$

518: is a semimeasure, but {\it not} a probability measure. We can now

519: state the fundamental property of $\MM$ \cite{Zvonkin:70,Solomonoff:78}:

520:

521: %------------------------------%

522: \ftheorem{thUniM}{Universality of $\MM$}{

523: %------------------------------%

524: The universal prior $\MM$ is an enumerable semimeasure that

525: multiplicatively dominates all enumerable semimeasures in the

526: sense that $\MM(x) \;\geqm\; 2^{-K(\rho)}\cdot \rho(x)$

527: for all enumerable semimeasures $\rho$. $\MM$ is enumerable, but not

528: estimable (nor recursive).

529: }%------------------------------%

530:

531: % Explanation

532: \noindent The Kolmogorov complexity of a function like $\rho$ is

533: defined as the length of the shortest self-delimiting code of a

534: Turing machine computing this function in the sense of Definition

535: \ref{defCompFunc}. Up to a multiplicative constant, $\MM$ assigns higher

536: probability to all $x$ than any other computable probability

537: distribution.

538:

539: % Normalization of $\MM$

540: It is possible to normalize $\MM$ to a true probability measure

541: $\MM_{norm}$ \cite{Solomonoff:78,Li:97} with dominance still being

542: true, but at the expense of giving up enumerability ($\MM_{norm}$

543: is still approximable). $\MM$ is more convenient when studying

544: algorithmic questions, but a true probability measure like

545: $\MM_{norm}$ is more convenient when studying stochastic questions.

546:

547: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

548: \section{Universal Sequence Prediction}\label{secUSP}

549: %\subsection{Solomonoff's Universal Sequence Prediction Scheme}

550: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

551:

552: % Occam & Epicurus in $\MM =2^-K$

553: In which sense does $\MM$ incorporate Occam's razor and Epicurus'

554: principle of multiple explanations? Since the shortest programs

555: $p$ dominate the sum in $M$, $\MM(x)$ is roughly equal to

556: $2^{-K(x)}$ ($\MM(x)=2^{-K(x)+O(K(\l(x)))}$), i.e.\

557: $\MM$ assigns high probability to simple

558: strings. More useful is to think of $x$ as being the observed

559: history. We see from (\ref{Mdef}) that every program $p$

560: consistent with history $x$ is allowed to contribute to $\MM$

561: (Epicurus). On the other hand, shorter programs give significantly

562: larger contribution (Occam). How does all this affect prediction?

563: If $\MM(x)$ describes our (subjective) prior belief in $x$, then

564: $\MM(y|x):=\MM(xy)/\MM(x)$ must be our posterior belief in $y$.

565: %

566: From the symmetry of algorithmic information $K(xy)\approx

567: K(y|x)+K(x)$, and $\MM(x)\approx 2^{-K(x)}$ and $\MM(xy)\approx

568: 2^{-K(xy)}$ we get $\MM(y|x)\approx 2^{-K(y|x)}$. This tells us

569: that $\MM$ predicts $y$ with high probability iff $y$ has an easy

570: explanation, given $x$ (Occam \& Epicurus).

571:

572: % Caution

573: The above qualitative discussion should not create the impression

574: that $\MM(x)$ and $2^{-K(x)}$ always lead to predictors of

575: comparable quality. Indeed, in the online/incremental setting,

576: $K(y)=O(1)$ invalidates the consideration above. The proof of

577: (\ref{eukdist}) below, for instance, depends on $\MM$ being a

578: semimeasure and the chain rule being exactly true, neither of them is

579: satisfied by $2^{-K(x)}$. See \cite{Hutter:03unimdl} for a

580: detailed analysis.

581:

582: % Solomonoff's universal sequence prediction

583: Sequence prediction algorithms try to predict the continuation

584: $x_t\in\X$ of a given sequence $x_1...x_{t-1}$.

585: %

586: The following bound shows that $M$ predicts computable sequences well:

587: \beq\label{eqDetMbnd}

588:   \sum_{t=1}^\infty(1\!-\!\MM(x_t|x_{<t}))^2 \;\leq\;

589:   -\odt \sum_{t=1}^\infty\ln \MM(x_t|x_{<t}) \;=\;

590:   -\odt\ln\MM(x_{1:\infty}) \;\leq\;

591:   \odt\ln 2\cdot \Km(x_{1:\infty}),

592: \eeq

593: where the monotone complexity

594: $\Km(x_{1:\infty})=\min\{\l(p):U(p)=x_{1:\infty}\}$ is defined as

595: the length of the shortest (nonhalting) program computing

596: $x_{1:\infty}$ \cite{Zvonkin:70,Levin:73random}. In the first

597: inequality we have used $(1-a)^2\leq-\odt\ln a$ for $0\leq a\leq

598: 1$. In the equality we exchanged the sum with the logarithm and

599: eliminated the resulting product by the chain rule. In the last inequality

600: we used $\MM(x)\geq 2^{-\Km(x)}$, which follows from

601: (\ref{Mdef}) by dropping all terms in $\sum_p$ except for the

602: shortest $p$ computing $x$. If $x_{1:\infty}$ is a computable

603: sequence, then $\Km(x_{1:\infty})$ is finite, which implies

604: $\MM(x_t|x_{<t})\to 1$

605: ($\sum_{t=1}^\infty(1-a_t)^2<\infty\Rightarrow a_t\to 1$). This

606: means that if the environment is a computable sequence

607: (whichsoever, e.g.\ the digits of $\pi$ or $e$ in $\X$ary

608: representation), after having seen the first few digits, $\MM$

609: correctly predicts the next digit with high probability, i.e.\ it

610: recognizes the structure of the sequence.

611:

612: Assume now that the true sequence is

613: drawn from a computable

614: probability distribution $\mu$, i.e.\ the true (objective)

615: probability of $x_{1:t}$ is $\mu(x_{1:t})$. The probability of

616: $x_t$ given $x_{<t}$ hence is

617: $\mu(x_t|x_{<t})=\mu(x_{1:t})/\mu(x_{<t})$.

618: %

619: Solomonoff's \cite{Solomonoff:78} central result is that $\MM$

620: converges to $\mu$. More precisely, for binary alphabet, he showed that

621: \beq\label{eukdist}

622:   \sum_{t=1}^\infty

623:   \nq\nq\;\sum_{\qquad x_{<t}\in\B^{t-1}}\nq\nq\;

624:   \mu(x_{<t}) \Big(\MM(0|x_{<t})-\mu(0|x_{<t})\Big)^2

625:   \;\leq\;

626:   {\odt}\ln 2\!\cdot\!K(\mu)+O(1) \;<\; \infty.

627: \eeq

628: The infinite sum can only be finite if the difference

629: $\MM(0|x_{<t})-\mu(0|x_{<t})$ tends to zero for $t\to\infty$ with

630: $\mu$-probability $1$ (see Definition~\ref{defConv}$(i)$ and

631: \cite{Hutter:01alpha} or Section~\ref{secConv} for general

632: alphabet). This holds for {\it any} computable probability

633: distribution $\mu$. The reason for the astonishing property of a

634: single (universal) function to converge to {\it any} computable

635: probability distribution lies in the fact that the set of

636: $\mu$-random sequences differs for different $\mu$. Past data

637: $x_{<t}$ are exploited to get a (with $t\to\infty$) improving

638: estimate $\MM(x_t|x_{<t})$ of $\mu(x_t|x_{<t})$.

639:

640: % Bayes mixtures

641: The universality property (Theorem~\ref{thUniM}) is the central

642: ingredient in the proof of (\ref{eukdist}). The proof

643: involves the construction of a semimeasure $\xi$

644: whose dominance is obvious. The hard part is to show its

645: enumerability and equivalence to $\MM$.

646: Let $\M$ be the (countable) set of all enumerable semimeasures

647: and define

648: \beq\label{xidef}

649:   \xi(x):=\sum_{\nu\in\M}2^{-K(\nu)}\nu(x).

650: \eeq

651: Then dominance

652: \beq\label{xidom}

653:  \xi(x)\geq 2^{-K(\nu)}\nu(x)\quad\forall\,\nu\in\M

654: \eeq

655: is obvious. Is $\xi$ lower semicomputable? To answer this

656: question one has to be more precise. Levin \cite{Zvonkin:70} has

657: shown that the set of {\em all} lower semicomputable semimeasures

658: is enumerable (with repetitions). For this (ordered multi) set

659: $\M=\M_{enum}^{semi}:=\{\nu_1,\nu_2,\nu_3,...\}$ and

660: $K(\nu_i):=K(i)$ one can easily see that $\xi$ is lower

661: semicomputable. Finally proving $\MM(x)\geqm\xi(x)$ also

662: establishes universality of $\MM$ (see \cite{Solomonoff:78,Li:97}

663: for details).

664:

665: The advantage of $\xi$ over $\MM$ is that it immediately

666: generalizes to arbitrary weighted sums of (semi)measures

667: for arbitrary countable $\M$.

668:

669: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

670: \section{Universal (Semi)Measures}\label{secUSM}

671: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

672:

673: What is so special about the set of all enumerable

674: semimeasures $\M_{enum}^{semi}$? The larger we choose $\M$ the less restrictive

675: is the assumption that $\M$ should contain the true distribution

676: $\mu$, which will be essential throughout the paper.

677: %

678: Why not restrict to the still rather general class of estimable

679: or recursive (semi)measures? It is clear that for every

680: countable (multi)set $\M$, the universal or mixture distribution

681: \beq\label{defxi}

682:   \xi(x):=\xi_\M(x):=\sum_{\nu\in\M} w_\nu \nu(x)

683:   \qmbox{with} \sum_{\nu\in\M}w_\nu\leq 1 \qmbox{and} w_\nu>0

684: \eeq

685: dominates all $\nu\in\M$. This dominance is

686: necessary for the desired convergence $\xi\to\mu$ similarly to

687: (\ref{eukdist}). The question is what properties $\xi$ possesses.

688: The distinguishing property of $\M_{enum}^{semi}$ is that $\xi$ is

689: itself an element of $\M_{enum}^{semi}$. When concerned with

690: predictions, the important property is not $\xi_\M\in\M$ by itself,

691: but whether $\xi$ is computable in one of the senses of Definition

692: \ref{defCompFunc}. We define

693: \bqan

694:  \M_1\geqm\M_2 & :\Leftrightarrow &

695:  \mbox{there is an element of $\M_1$ that dominates all elements of

696:  $\M_2$} \\

697:  & :\Leftrightarrow &

698: \exists\rho\!\in\!\M_1\;\forall\nu\!\in\!\M_2\;\exists w_\nu\!>\!0

699: \;\forall x:\rho(x)\!\geq\!w_\nu\nu(x).

700: \eqan

701: $\geqm $ is transitive (but not necessarily reflexive) in the

702: sense that $\M_1 \geqm \M_2 \geqm \M_3$ implies $\M_1 \geqm \M_3$

703: and $\M_0 \supseteq \M_1 \geqm \M_2 \supseteq \M_3$ implies $\M_0

704: \geqm \M_3$.

705: %

706: For the computability concepts introduced in Section~\ref{secCC}

707: we have the following proper set inclusions

708: \beqn

709: \begin{array}{ccccccc}

710:   \M_{rec}^{msr}  & \subset & \M_{est}^{msr}  & \equiv  & \M_{enum}^{msr}  & \subset & \M_{appr}^{msr} \\

711:         \cap       &         &      \cap       &         &       \cap       &         &     \cap        \\

712:   \M_{rec}^{semi} & \subset & \M_{est}^{semi} & \subset & \M_{enum}^{semi} & \subset & \M_{appr}^{semi}

713: \end{array}

714: \eeqn

715: %

716: where $\M^{msr}_c$ stands for the set of all probability measures

717: of appropriate computability type $c\in\{$rec=recursive, est=estimable, enum=enumerable,

718: appr=approximable$\}$, and similarly for semimeasures

719: $\M^{semi}_c$. From an enumeration of a measure $\rho$ one can

720: construct a co-enumeration by exploiting

721: $\rho(x_{1:n})=1-\sum_{y_{1:n}\neq x_{1:n}}\rho(y_{1:n})$. This

722: shows that every enumerable measure is also co-enumerable, hence

723: estimable, which proves the identity $\equiv$ above.

724:

725: With this notation, Theorem~\ref{thUniM} implies

726: $\M_{enum}^{semi}\geqm\M_{enum}^{semi}$. Transitivity allows one to

727: conclude, for instance, that

728: $\M_{appr}^{semi}\geqm\M_{rec}^{msr}$, i.e.\ that there is an

729: approximable semimeasure that dominates all recursive measures.

730:

731: The standard ``diagonalization'' way of proving

732: $\M_1\ngeqm\M_2$ is to take an arbitrary

733: $\mu\in\M_1$ and ``increase'' it to $\rho$ such that

734: $\mu\ngeqm\rho$ and show that $\rho\in\M_2$.

735: There are $7\times 7$ combinations of (semi)measures $\M_1$ with

736: $\M_2$ for which $\M_1\geqm\M_2$ could be true or false. There are

737: four basic cases, explicated in the following theorem, from which

738: the other 45 combinations displayed in Table~\ref{tabUniSMsr}

739: follow by transitivity.

740:

741: %------------------------------%

742: \ftheorem{thNoUniApp}{Universal (semi)measures}{

743: %------------------------------%

744: A semimeasure $\rho$ is said to be universal for $\M$ if it

745: multiplicatively dominates all elements of $\M$ in the sense

746: $\forall\nu\in\M\,\exists w_\nu>0:\rho(x)\geq w_\nu\nu(x)\,\forall x$. The

747: following holds true:

748: \begin{list}{}{\parsep=1ex}

749: \item[$o)$]

750: $\exists\rho:\{\rho\}\geqm\M$: For every countable set

751: of (semi)measures $\M$, there is a (semi)measure that dominates

752: all elements of $\M$.

753: \item[$i)$]

754: $\M_{enum}^{semi}\geqm\M_{enum}^{semi}$:

755: The class of enumerable semimeasures {\em contains}

756: a universal element.

757: \item[$ii)$]

758: $\M_{appr}^{msr}\geqm\M_{enum}^{semi}$:

759: There {\em is} an approximable measure that dominates all enumerable

760: semimeasures.

761: \item[$iii)$]

762: $\M_{est}^{semi}\ngeqm\M_{rec}^{msr}$: There is

763: {\em no} estimable semimeasure that dominates all recursive

764: measures.

765: \item[$iv)$]

766: $\M_{appr}^{semi}\ngeqm\M_{appr}^{msr}$: There is

767: {\em no} approximable semimeasure that dominates all approximable

768: measures.

769: \end{list}

770: }%------------------------------%

771:

772: \begin{table}[thb]

773: \ftablex{tabUniSMsr}{Existence of universal (semi)measures}{%

774: The entry in row $r$ and column $c$ indicates whether there is an

775: $r$-able (semi)measure $\rho$ dominating the set $\M$ that contains all

776: $c$-able (semi)measures, where $r,c\in\{$recurs, estimat, enumer,

777: approxim$\}$. Enumerable measures are estimable. This is the

778: reason why the enum.\ row and column in case of measures are

779: missing. The superscript indicates from which part of Theorem

780: \ref{thNoUniApp} the answer follows. For the bold face entries

781: directly, for the others using transitivity of $\geqm $.

782: \begin{center}

783: \begin{tabular}{|c|c||c|c|c|c||c|c|c|}\hline

784:       $\nwarrow$ &  $\M$ & \multicolumn{4}{c||}{semimeasure} & \multicolumn{3}{c|}{measure}\\ \hline

785: $\rho$&$\searrow$& rec.      & est.       & enum.         & appr.     & rec.          & est.       & appr.        \\ \hline\hline

786:       s  & rec. & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & no$^{iii}$    & no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}

787:       e  & est.  & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & {\bf no}$^{\bf iii}$& no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}

788:       m  & enum. & yes$^{i}$  & yes$^{i}$  & {\bf yes}$^{\bf i}$ & no$^{iv}$ & yes$^{i}$     & yes$^{i}$  & no$^{iv}$    \\ \cline{2-9}

789:       i  &appr.  & yes$^{i}$  & yes$^{i}$  & yes$^{i}$     & no$^{iv}$ & yes$^{i}$     & yes$^{i}$  & {\bf no}$^{\bf iv}$\\ \hline\hline

790:       m  & rec.  & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & no$^{iii}$    & no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}

791:       s  & est.  & no$^{iii}$ & no$^{iii}$ & no$^{iii}$    & no$^{iv}$ & no$^{iii}$    & no$^{iii}$ & no$^{iv}$    \\ \cline{2-9}

792:       r  &appr.  & yes$^{ii}$ & yes$^{ii}$ & {\bf yes}$^{\bf ii}$& no$^{iv}$ & yes$^{ii}$    & yes$^{ii}$ & no$^{iv}$    \\ \hline

793: \end{tabular}

794: \end{center}}

795: \end{table}

796:

797: \noindent If we ask for a universal (semi)measure that at least

798: satisfies the weakest form of computability, namely being

799: approximable, we see that the largest dominated set among the 7

800: sets defined above is the set of enumerable semimeasures. This is

801: the reason why $\M_{enum}^{semi}$ plays a special role. On the

802: other hand, $\M_{enum}^{semi}$ is not the largest set dominated by

803: an approximable semimeasure, and indeed no such largest set

804: exists. One may, hence, ask for ``natural'' larger sets $\M$. One

805: such set, namely the set of cumulatively enumerable semimeasures

806: $\M_{\text{CEM}}$, has recently been discovered by Schmidhuber

807: \cite{Schmidhuber:00toe,Schmidhuber:02gtm}, for which even

808: $\xi_{\text{CEM}}\in\M_{\text{CEM}}$ holds.

809:

810: \noindent Theorem~\ref{thNoUniApp} also holds for {\em discrete

811: (semi)measures} $P$ defined as follows:

812:

813: %------------------------------%

814: \fdefinition{defDSemi}{Discrete (semi)measures}{

815: %------------------------------%

816: $P(x)$ denotes the probability of $x\in\SetN$. We call

817: $P:\SetN\to[0,1]$ a discrete (semi)measure if $\sum_{x\in\SetN}

818: P(x)\stackrel{(<)}=1$.

819: }%------------------------------%

820: %

821: Theorem~\ref{thNoUniApp}

822: $(i)$ is Levin's major result \cite[Thm.4.3.1 \& Thm.4.5.1]{Li:97}, %

823: and $(ii)$ is due to Solomonoff \cite{Solomonoff:78}. %

824: The proof of $\M_{rec}^{semi}\ngeqm\M_{rec}^{semi}$ in

825: \cite[p249]{Li:97} contains minor errors and is not extensible to

826: $(iii)$, and the proof in \cite[p276]{Li:97} only applies to

827: infinite alphabet and not to the binary/finite case considered

828: here. $\M_{est}^{semi}\ngeqm\M_{est}^{semi}$

829: is mentioned in \cite{Zvonkin:70} without proof.

830: %

831: A direct proof of $(iv)$ can be found in \cite{Hutter:04uaibook}.

832: %

833: Here, we reduce $(iv)$ to $(iii)$ by exploiting the following

834: elementary fact (well-known for integer-valued functions, see

835: e.g.\ \cite[p634]{Simpson:77}):

836:

837: %------------------------------%

838: \flemma{lemOracle}{Approximable = $H$-estimable}{

839: %------------------------------%

840: A function is approximable iff it is estimable with the help of

841: the halting oracle.

842: }%------------------------------%

843:

844: %------------------------------%

845: \paradot{Proof}

846: %------------------------------%

847: With $H$-computable we mean, computable with the help of the

848: halting oracle, or equivalently, computable under extra input of

849: the halting sequence $h=h_{1:\infty}\in\B^\infty$, where $h_n=1$

850: $:\Leftrightarrow$ $U(n)$ halts.

851:

852: Assume $f$ is approximable, i.e.\ $\forall\eps\exists y,m:

853: R(m,y,\eps)$, where relation $R(m,y,\eps):=[\forall n\geq

854: m:|f_n(x)-y|<\eps]$ and recursive $f_n\to f$. Fix $\eps>0$.

855: Search (dovetail) for $m\in\SetN$ and $y$ ($\in\odt\eps\SetZ$ is

856: sufficient) such that $R(m,y,\eps)=$true. $R$ is

857: co-enumerable, hence $H$-decidable, hence $y$ can be $H$-computed,

858: hence $f$ is $H$-estimable, since $f(x)=y\pm O(\eps)$.

859:

860: Now assume that $f$ is $H$-estimable, i.e.\ $\exists T\in$TM

861: $\forall\eps,x:|T(x,\eps,h)-f(x)|<\eps$. Since $h$ is

862: enumerable, $T$ and hence $f$ are approximable. More formally,

863: let $h_n^t=1$ $:\Leftrightarrow$ $U(n)$ halts within $t$ steps.

864: Then $g(x,\eps) := T(x,\eps,h) = T(x,\eps,\lim_{t\to\infty}h^t) =

865: \lim_{t\to\infty}T(x,\eps,h^t)$ is approximable, where the

866: exchange of limits holds, since $T$ only reads $n_{x\eps}<\infty$

867: bits of $h$ and $h_{1:n_{x\eps}}=h^t_{1:n_{x\eps}}$ for

868: sufficiently large $t$. \qed

869:

870:

871: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

872: \section{Proof of Theorem~\ref{thNoUniApp}}\label{secProof}

873: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

874:

875: We first prove the theorem for discrete (semi)measures $P$ (Definition

876: \ref{defDSemi}), since it contains the essential ideas in a

877: cleaner form. We then present the proof for continuous

878: (semi)measures $\mu$ (Definition~\ref{defSemi}). We present proofs

879: for binary alphabet $\X=\B$ only. The proofs naturally generalize from

880: binary to arbitrary finite alphabet. $\arg\min_x f(x)$ is the $x$

881: that minimizes $f(x)$. Ties are broken in an arbitrary but

882: computable way (e.g.\ by taking the smallest $x$).

883:

884: %------------------------------%

885: \paradot{Proof (discrete case)}\\%

886: %------------------------------%

887: \paranodot{(o)} $Q(x):=\sum_{P\in\M}w_P P(x)$

888: with $w_P>0$ obviously dominates all $P\in\M$ (with constant

889: $w_P$). With $\sum_P w_P=1$ and all $P$ being discrete

890: (semi)measures also $Q$ is a discrete (semi)measure.

891:

892: \paranodot{(i)} See \cite[Thm.4.3.1]{Li:97}.

893:

894: \paranodot{(ii)} Let $P$ be the universal element in

895: $\M_{enum}^{semi}$ and $\alpha:=\sum_x P(x)$. We normalize $P$ by

896: $Q(x):={1\over\alpha}P(x)$. Since $\alpha\leq 1$ we have $Q(x)\geq

897: P(x)$. Hence $Q\geq P\geqm\M_{enum}^{semi}$. As a

898: ratio between two enumerable functions, $Q$ is still approximable,

899: hence $\M_{appr}^{msr}\geqm\M_{enum}^{semi}$.

900:

901: \paranodot{(iii)}

902: Let $P\in\M_{rec}^{semi}$. We partition $\SetN$ into chunks

903: $I_n:=\{2^{n-1},...,2^n-1\}$ ($n\geq 1$) of increasing size. With

904: $x_n:=\arg\min_{x\in I_n}P(x)$ we define $Q(x_n):={1\over

905: n(n+1)}\forall n$ and $Q(x):=0$ for all other $x$. Exploiting that

906: a minimum is smaller than an average and that $P$ is a

907: semimeasure, we get

908: \beqn

909: P(x_n)=\min_{x\in I_n}P(x)\leq{1\over|I_n|}\sum_{x\in

910: I_n}P(x)\leq{1\over|I_n|}={1\over 2^{n-1}}= {n(n+1)\over

911: 2^{n-1}} Q(x_n)

912: \eeqn

913: Since ${n(n+1)\over 2^{n-1}}\to 0$ for $n\to\infty$, $P$ cannot

914: dominate $Q$ ($P\ngeqm Q$). With $P$ also $Q$

915: is recursive. Since $P$ was an arbitrary recursive semimeasure

916: and $Q$ is a recursive measure ($\sum Q(x)=\sum[{1\over

917: n(n+1)}]=\sum[{1\over n}-{1\over n+1}]=1$) this implies

918: $\M_{rec}^{semi}\ngeqm\M_{rec}^{msr}$.

919:

920: Assume now that there is an estimable semimeasure

921: $S\geqm\M_{rec}^{msr}$. We construct a recursive semimeasure

922: $P\geqm S$ as follows. Choose an initial $\eps>0$ and finitely

923: compute an $\eps$-approximation $\hat S$ of $S(x)$. If $\hat

924: S>2\eps$ define $P(x):=\odt\hat S$, else halve $\eps$ and repeat

925: the process. Since $S(x)>0$ (otherwise it could not dominate,

926: e.g.\ $T(x):={1\over x(x+1)}\in\M_{rec}^{msr}$) the loop

927: terminates after finite time. So $P$ is recursive. Inserting $\hat

928: S=2P(x)$ and $\eps<\odt\hat S=P(x)$ into $|S(x)-\hat S|<\eps$ we

929: get $|S(x)-2P(x)|<P(x)$, which implies $S(x)\geq P(x)$ and

930: $S(x)\leq 3P(x)$. The former implies $\sum_x P(x)\leq \sum_x

931: S(x)\leq 1$, i.e.\ $P$ is a semimeasure. The latter implies

932: $P\geq{1\over 3}S\geqm\M_{rec}^{msr}$. Hence $P$ is a recursive

933: semimeasure dominating all recursive measures, which contradicts

934: what we have proven in the first half of $(iii)$. Hence the

935: assumption on $S$ was wrong which establishes

936: $\M_{est}^{semi}\ngeqm\M_{rec}^{msr}$.

937:

938: \paranodot{(iv)} From $(iii)$ we know that

939: $\M_{est}^{semi}\ngeqm\M_{est}^{msr}$. The proof and hence result

940: remains valid under the halting oracle, i.e.\

941: $\M_{H\text{-}est}^{semi}\ngeqm\M_{H\text{-}est}^{msr}$. By Lemma

942: \ref{lemOracle}, the $H$-estimable functions/(semi)measures coincide

943: with the approximable functions/(semi)measures, hence

944: $\M_{appr}^{semi}\ngeqm\M_{appr}^{msr}$. \qed

945:

946: %------------------------------%

947: \paradot{Proof (continuous case)}\\%

948: %------------------------------%

949: The major difference to the discrete case is that one also has to

950: take care that $\rho(x)\stackrel{(>)}=\rho(x0)+\rho(x1)$, $x\in\B^*$, is

951: respected. On the other hand, the chunking $I_n:=\B^n$ is more

952: natural here.

953:

954: \paranodot{(o)} $\rho(x):=\sum_{\nu\in\M}w_\nu \nu(x)$ with $w_\nu>0$

955: obviously dominates all $\nu\in\M$ (with domination constant

956: $w_\nu$). With $\sum_\nu w_\nu=1$ and all $\nu$ being

957: (semi)measures also $\rho$ is a (semi)measure.

958:

959: \paranodot{(i)} See \cite[Thm.4.5.1]{Li:97}.

960:

961: \paranodot{(ii)} Let $\xi$ be a universal element in $\M_{enum}^{semi}$.

962: We define \cite{Solomonoff:78}

963: \beqn

964:   \xi_{norm}(x_{1:n}) \;:=\;

965:   \prod_{t=1}^n{\xi(x_{1:t}) \over \xi(x_{<t}0)+\xi(x_{<t}1)}.

966: \eeqn

967: By induction one can show that $\xi_{norm}$ is a measure and

968: that $\xi_{norm}(x)\geq\xi(x)\forall x$, hence

969: $\xi_{norm}\geq\xi\geqm\M_{enum}^{semi}$. As a ratio

970: of enumerable functions, $\xi_{norm}$ is still approximable, hence

971: $\M_{appr}^{msr}\geqm\M_{enum}^{semi}$.

972:

973: \paranodot{(iii)} Analogous to the discrete case we could start by

974: recursively defining $x_k^*:=\arg\min_{x_k}\mu(x_{<k}^*x_k)$ for

975: $\mu\in\M_{rec}^{semi}$. See \cite{Hutter:03unipriors} for a proof

976: along this line.

977: %

978: Simpler is to directly consider $\mu\in\M_{est}^{semi}$ and to

979: compute $x^*_{1:\infty}$ recursively by computing some

980: $\eps$-approximation $e(x_k|x^*_{<k})$ of $\mu(x_k|x^*_{<k})$ and

981: define $x^*_k=\arg\min_{x_k}e(x_k|x^*_{<k})$, which implies

982: $\mu(x^*_k|x^*_{<k})\leq\odt+\eps$. Finally we define measure

983: $\rho$ by $\rho(x_{1:k}^*)=1\forall k$ and $\rho(x)=0$ for all $x$

984: that are not prefixes of $x_{1:\infty}^*$.

985: %

986: Hence

987: $\mu(x_{1:n}^*)\leq(\odt+\eps)^n=(\odt+\eps)^n\rho(x_{1:n}^*)$,

988: which demonstrates that $\mu$ does not dominate $\rho$ for

989: $\eps<\odt$. Since $\mu\in\M_{est}^{semi}$ was arbitrary and

990: $\rho$ is a recursive measure, this implies

991: $\M_{est}^{semi}\ngeqm\M_{rec}^{msr}$.

992:

993: \paranodot{(iv)} Identical to discrete case. \qed

994:

995: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

996: \section{Posterior Convergence}\label{secConv}

997: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

998:

999: We investigated in detail the computational properties of

1000: various mixture distributions $\xi$. A mixture $\xi_\M$

1001: multiplicatively dominates all distributions in $\M$. We

1002: mentioned that dominance implies posterior convergence. In this

1003: section we present in more detail what dominance implies and what

1004: not.

1005:

1006: Convergence of $\xi(x_t|x_{<t})$ to $\mu(x_t|x_{<t})$ with

1007: $\mu$-probability 1 tells us that $\xi(x_t|x_{<t})$ is close to

1008: $\mu(x_t|x_{<t})$ for sufficiently large $t$ on `most'

1009: sequences $x_{1:\infty}$. It says nothing about the speed of

1010: convergence, nor whether convergence is true for any {\em particular}

1011: sequence (of measure 0). Convergence {\em in mean sum} defined

1012: below is intended to capture the rate of convergence, while

1013: Martin-L\"{o}f randomness is used to capture convergence

1014: properties for individual sequences.

1015:

1016: Martin-L\"{o}f randomness is a very important concept of

1017: randomness of individual sequences, which is closely related to

1018: Kolmogorov complexity and Solomonoff's universal prior. Levin gave

1019: a characterization equivalent to Martin-L\"{o}f's original

1020: definition \cite{Levin:73random}:

1021:

1022: %------------------------------%

1023: \ftheorem{defML}{Martin-L\"{o}f random sequences}{

1024: %------------------------------%

1025: A sequence $x_{1:\infty}$ is $\mu$-Martin-L\"{o}f random

1026: ($\mu$.M.L.) iff there is a constant $c$ such that

1027: $\MM(x_{1:n})\leq c\cdot \mu(x_{1:n})$ for all $n$.

1028: }%------------------------------%

1029:

1030: \noindent An  equivalent formulation for estimable $\mu$ is:

1031: \beq\label{KmMLr}

1032:   x_{1:\infty} \mbox{ is $\mu$.M.L.-random}

1033:   \quad\Leftrightarrow\quad

1034:   \Km(x_{1:n})= -\log\mu(x_{1:n})+O(1) \;\forall n

1035: \eeq

1036: Theorem~\ref{defML} follows from

1037: (\ref{KmMLr}) by exponentiation, ``using $2^{-\Km}\approx\MM$''

1038: and noting that $\MM\geqm\mu$ follows from universality of $\MM$.

1039: Consider the special case of $\mu$ being a fair coin, i.e.\

1040: $\mu(x_{1:n})=2^{-n}$, then $x_{1:\infty}$ is M.L.\ random {\em

1041: iff} $\Km(x_{1:n})=n+O(1)$, i.e.\ if $x_{1:n}$ is incompressible.

1042: For general $\mu$, $-\lb\mu(x_{1:n})$ is the length of the

1043: Shannon-Fano code of $x_{1:n}$, hence $x_{1:\infty}$ is

1044: $\mu$.M.L.-random {\em iff} the Shannon-Fano code is optimal.

1045:

1046: One can show that a $\mu$.M.L.-random sequence $x_{1:\infty}$

1047: passes {\em all} thinkable effective randomness tests, e.g.\ the

1048: law of large numbers, the law of the iterated logarithm, etc.

1049: In particular, the set of all $\mu$.M.L.-random sequences has

1050: $\mu$-measure 1.

1051: %

1052: The following generalization is natural when considering general

1053: Bayes mixtures $\xi$ as in this work:

1054:

1055: %------------------------------%

1056: \fdefinition{defmuMr}{$\mu/\xi$-random sequences}{

1057: %------------------------------%

1058: A sequence $x_{1:\infty}$ is called $\mu/\xi$-random

1059: ($\mu.\xi$.r.) iff there is a constant $c$ such that

1060: $\xi(x_{1:n})\leq c\cdot \mu(x_{1:n})$ for all $n$.

1061: }%------------------------------%

1062:

1063: Typically, $\xi$ is a mixture over some $\M$ as defined in

1064: (\ref{defxi}), in which case the reverse inequality

1065: $\xi(x)\geqm\mu(x)$ is also true (for all $x$). For finite $\M$ or

1066: if $\xi\in\M$, the definition of $\mu/\xi$-randomness depends only

1067: on $\M$, and not on the specific weights $w_\nu$ used in $\xi$. For

1068: $\M=\M_{enum}^{semi}$, $\mu/\xi$-randomness is just

1069: $\mu$.M.L.-randomness. The larger $\M$, the more patterns are

1070: recognized as nonrandom.

1071: Roughly speaking, those regularities characterized by some

1072: $\nu\in\M$ are recognized by $\mu/\xi$-randomness, i.e.\ for

1073: $\M\subset\M_{enum}^{semi}$ some $\mu/\xi$-random strings may not

1074: be M.L.\ random.

1075: %

1076: Other randomness concepts, e.g.\ those by Schnorr, Ko, van

1077: Lambalgen, Lutz, Kurtz, von Mises, Wald, and Church (see

1078: \cite{Wang:96,Lambalgen:87,Schnorr:71}), could possibly also be

1079: characterized in terms of $\mu/\xi$-randomness for particular

1080: choices of $\cal M$.

1081:

1082: %------------------------------%

1083: %\paradot{Convergence of Random Sequences}%\label{secConvRSeq}

1084: %------------------------------%

1085: A classical (nonrandom)

1086: real-valued sequence $a_t$ is defined to converge to $a_*$, short

1087: $a_t\to a_*$ if $\forall\eps\exists t_0\forall t\geq

1088: t_0:|a_t-a_*|<\eps$. We are interested in convergence properties

1089: of random sequences $z_t(\omega)$ for $t\to\infty$ (e.g.\

1090: $z_t(\omega)=\xi(\omega_t|\omega_{<t})-\mu(\omega_t|\omega_{<t})$).

1091: %

1092: We denote $\mu$-expectations by $\E$. The expected value of a

1093: function $f:\X^t\to\SetR$, dependent on $x_{1:t}$, independent of

1094: $x_{t+1:\infty}$, and possibly undefined on a set of $\mu$-measure

1095: 0, is $\E[f] =

1096: \sumprime_{\!x_{1:t}\in\X^t}\mu(x_{1:t})f(x_{1:t})$. The prime

1097: denotes that the sum is restricted to $x_{1:t}$ with

1098: $\mu(x_{1:t})\neq 0$. Similarly we use $\P[..]$ to denote the

1099: $\mu$-probability of event $[..]$.

1100: %

1101: We define four convergence concepts for random sequences.

1102:

1103: %------------------------------%

1104: \fdefinition{defConv}{Convergence of random sequences}{

1105: %------------------------------%

1106: Let $z_1(\omega),z_2(\omega),...$ be a sequence of real-valued

1107: random variables. $z_t$ is said to

1108: converge for $t\to\infty$ to (random variable) $z_*$

1109: \begin{list}{}{\itemsep=1ex\leftmargin=8ex}

1110: \item[$i)$] with probability 1 (w.p.1) $:\Leftrightarrow$

1111:   $\P[\{\omega:z_t\to z_*\}]=1$,

1112: \item[$ii)$] in mean sum (i.m.s.) $:\Leftrightarrow$

1113: $\sum_{t=1}^\infty\E[(z_t-z_*)^2]<\infty$,

1114: \item[$iii)$] for every $\mu$-Martin-L{\"o}f random sequence ($\mu$.M.L.) $:\Leftrightarrow$ \\

1115: $\forall\omega:$ If $[\exists c\forall n:

1116: \MM(\omega_{1:n})\leq c\mu(\omega_{1:n})]$

1117:   then $z_t(\omega)\to z_*(\omega)$ for $t\to\infty$,

1118: \item[$iv)$] for every $\mu/\xi$-random sequence ($\mu.\xi$.r.) $:\Leftrightarrow$ \\

1119: $\forall\omega:$ If $[\exists c\forall n:

1120: \xi(\omega_{1:n})\leq c\mu(\omega_{1:n})]$

1121:   then $z_t(\omega)\to z_*(\omega)$ for $t\to\infty$.

1122: \end{list}

1123: }%------------------------------%

1124:

1125: \noindent In statistics, $(i)$ is the ``default'' characterization of

1126: convergence of random sequences.

1127: %

1128: Convergence i.m.s.\ $(ii)$ is very strong: it provides a rate of

1129: convergence in the sense that the expected number of times $t$ in

1130: which $z_t$ deviates more than $\eps$ from $z_*$ is finite and

1131: bounded by $c/\eps^2$ and the probability that the number of

1132: $\eps$-deviations exceeds $c\over\eps^2\delta$ is smaller than

1133: $\delta$, where $c:=\sum_{t=1}^\infty\E[(z_t-z_*)^2]$.

1134: Nothing can be said for {\em which} $t$ these deviations occur.

1135: If, additionally, $|z_t-z_*|$ were monotone decreasing, then

1136: $|z_t-z_*|=o(t^{-1/2})$ could be concluded.

1137: %

1138: $(iii)$ uses Martin-L\"{o}f's notion of randomness of {\em individual}

1139: sequences to define convergence M.L. Since this work

1140: deals with general Bayes mixtures $\xi$, we generalized in $(iv)$

1141: the definition of convergence M.L.\ based on $\MM$ to

1142: convergence $\mu.\xi$.r.\ based on $\xi$ in a natural way.

1143: %

1144: One can show that convergence i.m.s.\ implies convergence w.p.1.

1145: Also convergence M.L.\ implies convergence w.p.1.

1146: %

1147: Universality of $\xi$ implies the following posterior convergence results:

1148:

1149: %------------------------------%

1150: %\paradot{Convergence of $\xi$ to $\mu$}\label{subsecConv}

1151: %------------------------------%

1152:

1153: %------------------------------%

1154: \ftheorem{thConv}{Convergence of $\xi$ to $\mu$}{

1155: %------------------------------%

1156: Let there be sequences $x_1x_2...$ over a finite alphabet $\X$

1157: drawn with probability $\mu(x_{1:n})\in\M$ for the first $n$

1158: symbols, where $\mu$ is a measure and $\M$ a countable set of

1159: (semi)measures. The universal/mixture posterior probability

1160: $\xi(x_t|x_{<t})$

1161: of the next symbol $x_t$ given $x_{<t}$

1162: is related to the true posterior probability $\mu(x_t|x_{<t})$

1163: in the following way:\vspace{-1ex}

1164: \beqn

1165:    \sum_{t=1}^n\E{\textstyle\left[\left(\sqrt{{\xi(x_t|x_{<t})

1166:           \over\mu(x_t|x_{<t})}}-1\right)^2\right]} \;\leq\;

1167:    \sum_{t=1}^n\E\bigg[\sum_{x'_t}

1168:         \left(\sqrt{\xi(x'_t|x_{<t})}-\sqrt{\mu(x'_t|x_{<t})}\right)^2\bigg]

1169:         \;\leq\; \ln{w_\mu^{-1}} \;<\; \infty

1170: \eeqn

1171: where $w_\mu$ is the weight (\ref{defxi}) of $\mu$ in $\xi$.

1172: }%------------------------------%

1173:

1174: \noindent Theorem~\ref{thConv} implies

1175: \beqn

1176:  \mbox{$\sqrt{\xi(x'_t|x_{<t})} \to \sqrt{\mu(x'_t|x_{<t})}$

1177:  for any $x'_t$ and

1178:  $\sqrt{{\xi(x_t|x_{<t})\over\mu(x_t|x_{<t})}} \to 1$, both

1179:  i.m.s.\ for $t\to\infty$}.

1180: \eeqn

1181: %

1182: \noindent The latter strengthens the result

1183: $\xi(x_t|x_{<t})/\mu(x_t|x_{<t})\to 1$ w.p.1 derived by G\'acs

1184: \cite[Thm.5.2.2]{Li:97} in that it also provides the ``speed'' of

1185: convergence.

1186:

1187: Note also the subtle difference between the two convergence

1188: results. For {\em any} sequence $x'_{1:\infty}$ (possibly constant

1189: and not necessarily $\mu$-random),

1190: $\mu(x'_t|x_{<t})-\xi(x'_t|x_{<t})$ converges to zero w.p.1

1191: (referring to $x_{1:\infty}$), but no statement is possible for

1192: $\xi(x'_t|x_{<t})/\mu(x'_t|x_{<t})$, since

1193: $\lim\,\inf\mu(x'_t|x_{<t})$ could be zero. On the other hand, if

1194: we stay {\em on}-sequence ($x'_{1:\infty} =

1195: x_{1:\infty}$), we have $\xi(x_t|x_{<t})/\mu(x_t|x_{<t})

1196: \to 1$ w.p.1 (whether $\inf\mu(x_t|x_{<t})$ tends to zero or not does

1197: not matter).

1198: %

1199: Indeed, it is easy to give an example where

1200: $\xi(x'_t|x_{<t})/\mu(x'_t|x_{<t})$ diverges. If we choose

1201: \beqn

1202:   \M=\{\mu_1,\mu_2\},\quad

1203:   \mu\!\equiv\!\mu_1,\quad

1204:   \mu_1(1|x_{<t})=\odt t^{-3} \qmbox{and}

1205:   \mu_2(1|x_{<t})=\odt t^{-2}

1206: \eeqn

1207: the contribution of $\mu_2$ to $\xi$ causes $\xi$ to fall

1208: off like $\mu_2 \sim t^{-2}$, much slower than $\mu \sim

1209: t^{-3}$ causing the quotient to diverge:

1210: \bqan

1211: \mu_1(0_{1:n}) &\!=\!& \prod_{t=1}^n(1-\odt

1212: t^{-3})\stackrel{n\to\infty}\longrightarrow c_1=0.450...>0

1213: \;\Rightarrow\; 0_{1:\infty}\;\mbox{is a

1214: $\mu$-random sequence},

1215: \\

1216: \mu_2(0_{1:n}) &\!=\!& \prod_{t=1}^n(1\!-\!\odt

1217: t^{-2})\stackrel{n\to\infty}\longrightarrow c_2=0.358...>0

1218: \;\Rightarrow\; \xi(0_{1:n})

1219: \to w_1c_1+w_2c_2=:c_\xi>0

1220: \\

1221: \xi(0_{<t}1) &\!=\!&

1222: w_1\mu_1(1|0_{<t})\mu_1(0_{<t})+w_2\mu_2(1|0_{<t})\mu_2(0_{<t})\to

1223: \odt w_2c_2 t^{-2}

1224: \eqan

1225: \beqn

1226: \Rightarrow \quad\xi(1|0_{<t})= {\xi(0_{<t}1)\over \xi(0_{<t})}

1227: \rightarrow {w_2c_2\over 2c_\xi}t^{-2}

1228: \quad\Rightarrow\quad

1229: {\xi(1|0_{<t})\over\mu(1|0_{<t})}\to {w_2c_2\over c_\xi}t\to\infty\quad \mbox{diverges}.

1230: \eeqn

1231:

1232: %------------------------------%

1233: \paradot{Proof}

1234: %------------------------------%

1235: For a probability distribution $y_i\geq 0$ with $\sum_i y_i=1$ and a

1236: semi-distribution $z_i\geq 0$ with $\sum_i z_i\leq 1$ and

1237: $i\in\{1,...,N\}$, the Hellinger distance $h(\vec

1238: y,\vec z):=\sum_i(\sqrt{y_i}-\sqrt{z_i})^2$ is upper bounded by the relative

1239: entropy $d(\vec

1240: y,\vec z)=\sum_i y_i\ln{y_i\over z_i}$ (and $0\ln{0\over z}:=0$).

1241: %

1242: This can be seen as follows: For arbitrary $0\leq y\leq 1$ and

1243: $0\leq z\leq 1$ we define

1244: \bqan

1245:   f(y,z) &:=& y\ln{y\over z}-(\sqrt{y}-\sqrt{z})^2+z-y =

1246:   2y g(\sqrt{z/y})

1247: \\

1248:   \qmbox{with}

1249:   g(t) &:=& -\ln t+t-1\geq 0.

1250: \eqan

1251: This shows $f\geq 0$,

1252: and hence $\sum_i f(y_i,z_i)\geq 0$, which implies

1253: \beqn

1254:   \sum_i y_i\ln{y_i\over z_i}-\sum_i(\sqrt{y_i}-\sqrt{z_i})^2 \geq

1255:   \sum_i y_i- \sum_i z_i \geq 1-1 = 0.

1256: \eeqn

1257: The (conditional) $\mu$-expectations of a function $f:\X^t\to\SetR$ are defined as

1258: \beqn

1259:  \E[f]=\sumprime_{x_{1:t}\in\X^t}\!\!\mu(x_{1:t})f(x_{1:t})

1260:  \qmbox{and}

1261:   \E_t[f]:=\E[f|x_{<t}]=\sumprime_{x_t\in\X}\mu(x_t|x_{<t})f(x_{1:t}),

1262: \eeqn

1263: where $\sumprime$ sums over all $x_t$ or $x_{1:t}$ for which

1264: $\mu(x_{1:t})\neq 0$.

1265: If we insert

1266: $\X=\{1,...,N\}$,

1267: $N=|\X|$,

1268: $i=x_t$,

1269: $y_i=\mu_t:=\mu(x_t|x_{<t})$, and

1270: $z_i=\xi_t:=\xi(x_t|x_{<t})$

1271: into $h$ and $d$ we get (w.p.1)

1272: \beqn\label{distdD}

1273:   h_t(x_{<t}) \;:=\; \textstyle \sum_{x_t}

1274:   (\sqrt{\mu_t}\!-\!\sqrt{\xi_t})^2 \qquad \leq \qquad

1275:   d_t(x_{<t}) \;:=\; \textstyle

1276:   \sum_{x_t}\mu_t\ln{\mu_t \over \xi_t} =

1277:   \E_t[\ln{\mu_t\over\xi_t}].

1278: \eeqn

1279: %

1280: Taking the expectation $\E$ and the sum $\sum_{t=1}^n$ we get

1281: \beq\label{entropyapp}

1282:   \sum_{t=1}^n

1283:   \E[d_t(x_{<t})] =

1284:   \sum_{t=1}^n\E[\E_t[

1285:   \ln{\mu_t\over\xi_t}]] =

1286:   \E[

1287:   \ln \prod_{t=1}^n{\mu_t\over\xi_t}] =

1288:   \E[

1289:   \ln{\mu(x_{1:n}) \over \xi(x_{1:n})}] \leq

1290:   \ln{w_\mu^{-1}}

1291: \eeq

1292: where we have used $\E[\E_t[..]]=\E[..]$ and exchanged the $t$-sum

1293: with the expectation $\E$, which transforms to a product inside

1294: the logarithm. In the last equality we have used the chain rule for

1295: $\mu$ and $\xi$. Using universality $\xi(x_{1:n})\geq

1296: w_\mu\mu(x_{1:n})$ yields the final inequality. Finally

1297: \beqn

1298:   \E_t\bigg[\Big(\sqrt{\xi_t\over \mu_t}-1\Big)^2\bigg] =

1299:   \sum_{x_t}\!'\mu_t

1300:   \Big(\sqrt{\xi_t\over \mu_t}-1\Big)^2  =

1301:   \sum_{x_t}\!'(\sqrt{\xi_t}-\sqrt{\mu_t})^2 \leq

1302:   h_t(x_{<t})\leq

1303:   d_t(x_{<t}).

1304: \eeqn

1305: Taking the expectation $\E$ and the sum $\sum_{t=1}^n$ and

1306: chaining the result with (\ref{entropyapp}) yields Theorem

1307: \ref{thConv}. \qed

1308:

1309: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1310: \section{Convergence in Martin-L{\"o}f Sense}\label{secMLconv}

1311: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1312:

1313: An interesting open question is whether $\xi$ converges to $\mu$

1314: (in difference or ratio) individually for all Martin-L\"{o}f

1315: random sequences. Clearly, convergence $\mu$.M.L.\ may at most fail

1316: for a set of sequences with $\mu$-measure zero. A convergence

1317: M.L.\ result would be particularly interesting and natural for

1318: Solomonoff's universal prior $M$, since M.L.\ randomness can be

1319: defined in terms of $\MM$ (see Theorem~\ref{defML}). Attempts to

1320: convert the bounds in Theorem~\ref{thConv} to effective

1321: $\mu$.M.L.-randomness tests fail, since $M(x_t|x_{<t})$ is not

1322: enumerable. The proof of $M/\mu\stackrel{M.L.}\longrightarrow 1$

1323: given in \cite[Thm.5.2.2]{Li:97} and \cite[Thm.10]{Vitanyi:00} is

1324: incomplete.$\!$\footnote{The formulation of their theorem is quite

1325: misleading in general: ``{\it Let $\mu$ be a positive recursive

1326: measure. If the length of $y$ is fixed and the length of $x$ grows

1327: to infinity, then $M(y|x)/\mu(y|x)\to 1$ with $\mu$-probability

1328: one. The infinite sequences $\omega$ with prefixes $x$ satisfying

1329: the displayed asymptotics are precisely [`$\Rightarrow$' {\em and}

1330: `$\Leftarrow$'] the $\mu$-random sequences.}'' First, for

1331: off-sequence $y$ convergence w.p.1 does not hold ($xy$ must be

1332: demanded to be a prefix of $\omega$). Second, the proof of

1333: `$\Leftarrow$' has gaps (see main text). Last, `$\Rightarrow$' is

1334: given without proof and is wrong \cite{Hutter:04mlconvx}. Also the assertion

1335: in \cite[Thm.5.2.1]{Li:97} that $S_t:=\E\sum_{x'_t}

1336: (\mu(x'_t|x_{<t})-M(x'_t|x_{<t}))^2$ converges to zero faster than

1337: $1/t$ cannot be made, since $S_t$ does not decrease

1338: monotonically \cite[Prob.2.7]{Hutter:04uaibook}. For example, for

1339: $a_t:=1/\sqrt{t}$ if $t$ is a cube and 0 otherwise, we have

1340: $\sum_{t=1}^\infty a_t<\infty$, but $a_t\neq o(1/t)$.} The

1341: implication ``$\MM(x_{1:n})\leq c\cdot\mu(x_{1:n})\forall

1342: n\Rightarrow \lim_{n\to\infty}\MM(x_{1:n})/\mu(x_{1:n})$ exists''

1343: has been used, but not proven, and is indeed generally

1344: wrong \cite{Hutter:04mlconvx}.

1345: %

1346: Theorem~\ref{defML} only implies

1347: $\sup_n\MM(x_{1:n})/\mu(x_{1:n})<\infty$ for M.L.\ random

1348: sequences $x_{1:\infty}$, and \cite[pp.\ 324--325]{Doob:53}

1349: implies only that $\lim_{n\to\infty}\MM(x_{1:n})/\mu(x_{1:n})$ exists

1350: w.p.1, and not $\mu$.M.L.

1351: %

1352: Vovk \cite{Vovk:87} shows that for two estimable

1353: semimeasures $\mu$ and $\rho$ and $x_{1:\infty}$ being $\mu$

1354: {\em and} $\rho$ M.L.\ random

1355: \beqn

1356: \sum_{t=1}^\infty\sum_{x'_t}\left(\sqrt{\mu(x'_t|x_{<t})}-\sqrt{\rho(x'_t|x_{<t})}\right)^2<\infty

1357: \qmbox{and}

1358: \sum_{t=1}^\infty\left({\rho(x_t|x_{<t})\over\mu(x_t|x_{<t})}-1\right)^2<\infty.

1359: \eeqn

1360: If $\MM$ were estimable, then this would imply posterior

1361: $\MM\to\mu$ and $\MM/\mu\to 1$ for every $\mu$.M.L.-random

1362: sequence $x_{1:\infty}$, since {\em every} sequence is $\MM$.M.L.\

1363: random. Since $\MM$ is {\em not} estimable, Vovk's theorem cannot

1364: be applied and it is not obvious how to generalize it. So the

1365: question of individual convergence remains open. More generally,

1366: one may ask whether $\xi_\M\to\mu$ for every $\mu/\xi$-random

1367: sequence. It turns out that this is true for some $\M$, but false for others.

1368:

1369: %------------------------------%

1370: \ftheorem{thMLConv}{$\mu/\xi$-convergence of $\xi$ to $\mu$}{

1371: %------------------------------%

1372: Let $\X=\B$ be binary and

1373: $\M_\Theta:=\{\mu_\th:\mu_\th(1|x_{<t})=\th\,\forall t,\;

1374: \th\in\Theta\}$ be the set of Bernoulli($\th$) distributions

1375: with parameters $\th\in\Theta$. Let $\Theta_D$ be a countable

1376: dense subset of $[0,1]$, e.g.\ $[0,1]\cap\SetQ$, and let $\Theta_G$

1377: be a countable subset of $[0,1]$ with a gap in the sense that

1378: there exist $0<\th_0<\th_1<1$ such that

1379: $[\th_0,\th_1]\cap\Theta_G=\{\th_0,\th_1\}$, e.g.\

1380: $\Theta_G=\{\odf,\odt\}$ or $\Theta_G=([0,{1\over

1381: 4}]\cup[{1\over 2},1])\cap\SetQ$. Then

1382: \begin{list}{}{\ifjournal\itemsep=1ex\fi}

1383: \item[$i)$] If $x_{1:\infty}$ is $\mu/\xi_{\M_{\Theta_D}}$ random with

1384: $\mu\in\M_{\Theta_D}$, then $\xi_{\M_{\Theta_D}}(x_t|x_{<t})\to\mu(x_t|x_{<t})$,

1385: \item[$ii)$] There are $\mu\in\M_{\Theta_G}$ and $\mu/\xi_{\M_{\Theta_G}}\!\!$

1386: random $x_{1:\infty}$ for which

1387: $\xi_{\M_{\Theta_G}}\!\!(x_t|x_{<t})\not\to\mu(x_t|x_{<t})\!\!$

1388: \end{list}

1389: }%------------------------------%

1390:

1391: \noindent Our original/main motivation for studying

1392: $\mu/\xi$-randomness is the implication of Theorem~\ref{thMLConv}

1393: that $\MM\stackrel{\mbox{\tiny M.L.}}\longrightarrow\mu$ cannot be

1394: decided from $M$ being a mixture distribution or from the

1395: universality property (Theorem~\ref{thUniM}) alone. Further

1396: structural properties of $\M_{enum}^{semi}$ have to be employed.

1397: For Bernoulli sequences, convergence $\mu.\xi_{\M_\Theta}$.r.\ is

1398: related to denseness of $\M_\Theta$. Maybe a denseness

1399: characterization of $\M_{enum}^{semi}$ can solve the question of

1400: convergence M.L.\ of $M$. The property $\MM\in\M_{enum}^{semi}$ is

1401: also not sufficient to resolve this question, since there are

1402: $\M\ni\xi$ for which $\xi\stackrel{\mu.\xi.r}\longrightarrow\mu$

1403: and $\M\ni\xi$ for which

1404: $\xi\not\stackrel{\mu.\xi.r}\longrightarrow\mu$. Theorem

1405: \ref{thMLConv} can be generalized to i.i.d.\ sequences over

1406: general finite alphabet $\X$.

1407:

1408: The idea to prove $(ii)$ is to construct a sequence $x_{1:\infty}$

1409: that is $\mu_{\th_0}/\xi$-random {\em and} $\mu_{\th_1}/\xi$-random

1410: for $\th_0\neq\th_1$. This is possible if and only if $\Theta$

1411: contains a gap and $\th_0$ and $\th_1$ are the boundaries of the

1412: gap. Obviously $\xi$ cannot converge to $\th_0$ {\em and} $\th_1$,

1413: thus proving non-convergence. For no $\th\in[0,1]$ will this

1414: $x_{1:\infty}$ be $\mu_\th$ M.L.-random. Finally, the proof of

1415: Theorem~\ref{thMLConv}

1416: makes essential use of the mixture representation of $\xi$, as

1417: opposed to the proof of Theorem~\ref{thConv} which only needs

1418: dominance $\xi\geqm\M$.

1419:

1420: An example for $(ii)$ is $\M=\{\mu_0,\mu_1\}$,

1421: $\mu_0(1|x_{<t})=\mu_1(0|x_{<t})={1\over 4}$,

1422: $x_{1:\infty}=(01)^\infty=01010101...$ $\Rightarrow$ $\mu_0(x_{1:2n})=

1423: \mu_1(x_{1:2n})=\xi(x_{1:2n})=({1\over 4})^n({3\over 4})^n$

1424: $\Rightarrow$ $x_{1:\infty}$ is

1425: $\mu_0/\xi$-random {\em and}

1426: $\mu_1/\xi$-random, but

1427: $\mu_0(x_{2n}|x_{<2n})={1\over 4}$,

1428: $\mu_0(x_{2n+1}|x_{1:2n})={3\over 4}$,

1429: $\mu_1(x_{2n}|x_{<2n})={3\over 4}$,

1430: $\mu_1(x_{2n+1}|x_{1:2n})={1\over 4}$ and

1431: $\xi(x_{2n}|x_{<2n})={3\over 8}$,

1432: $\xi(x_{2n+1}|x_{1:2n})={1\over 2}$ for $w_0=w_1=\odt$

1433: $\Rightarrow$ $\xi(x_n|x_{<n})\not\to\mu_{0/1}(x_n|x_{<n})$.

1434:

1435: %------------------------------%

1436: \paradot{Proof}

1437: %------------------------------%

1438: Let $\X=\B$ and $\M=\{\mu_\th:\th\in\Theta\}$ with countable

1439: $\Theta\subset[0,1]$ and

1440: $\mu_\th(1|x_{1:n})=\th=1-\mu_\th(0|x_{1:n})$, which implies

1441: \beqn

1442:   \mu_\th(x_{1:n}) = \th^{n_1}(1-\th)^{n-n_1},\qquad

1443:   n_1:=x_1\!+...+\!x_n, \qquad

1444:   \hat\th\equiv\hat\th_n:={n_1\over n}

1445: \eeqn

1446: $\hat\th$ depends on $n$; all other used/defined $\th$ will be

1447: independent of $n$. We assume $\th_{\!\cdot\cdot}\in\Theta$, where

1448: $..$ stands for some (possibly empty) index, and

1449: $\ddot\th\in[0,1]$ (possibly $\not\in\Theta$), where $\ddot{}$

1450: stands for some superscript, i.e.\ $\mu_{\th_{\!\cdot\cdot}}$ and

1451: $w_{\th_{\!\cdot\cdot}}$ make sense, whereas $\mu_{\ddot\th}$ and

1452: $w_{\ddot\th}$ do not. $\xi$ is defined in the standard way as

1453: \beq\label{MLxiuni}

1454:   \xi(x_{1:n})=\sum_{\th\in\Theta}w_\th\mu_\th(x_{1:n})

1455:   \quad\Rightarrow\quad

1456:   \xi(x_{1:n})\geq w_\th \mu_\th(x_{1:n}),

1457: \eeq

1458: where $\sum_\th w_\th=1$ and $w_\th>0\,\forall\th$.

1459: In the following let $\mu=\mu_{\th_0}\in\M$ be the true environment.

1460: \beq\label{MLmuMr}

1461:   \omega=x_{1:\infty} \mbox{ is } \mu/\xi\mbox{-random}

1462:   \quad\Leftrightarrow\quad

1463:   \exists c_\omega : {\xi(x_{1:n})\leq c_\omega\!\cdot\!\mu_{\th_0}(x_{1:n})}

1464:   \;\forall n

1465: \eeq

1466: For binary alphabet it is sufficient to establish whether

1467: $\xi(1|x_{1:n}) \toinfty{n} \th_0\equiv\mu(1|x_{1:n})$ for

1468: $\mu/\xi$-random $x_{1:\infty}$ in order to decide

1469: $\xi(x_n|x_{<n})\to\mu(x_n|x_{<n})$.

1470: We need the following posterior

1471: representation of $\xi$:

1472: \beq\label{MLpw}

1473:   \xi(1|x_{1:n})=\sum_{\th\in\Theta}w_n^\th \mu_\th(1|x_{1:n}),\quad

1474:   w_n^\th:=w_\th{\mu_\th(x_{1:n})\over\xi(x_{1:n})}

1475:   \leq {w_\th\over w_{\th_0}}{\mu_\th(x_{1:n})\over\mu_{\th_0}(x_{1:n})},\quad

1476:   \sum_{\th\in\Theta}w_n^\th=1

1477: \eeq

1478: The ratio $\mu_\th/\mu_{\th_0}$ can be represented as follows:

1479: \beq\label{MLmuRatio}

1480:   {\mu_\th(x_{1:n})\over\mu_{\th_0}(x_{1:n})}

1481:   = {\th^{n_1}(1\!-\!\th)^{n-n_1}\over \th_0^{n_1}(1\!-\!\th_0)^{n-n_1}}

1482:   = \left[\bigg({\th\over\th_0}\bigg)^{\hat\th_n}

1483:           \bigg({1\!-\!\th\over 1\!-\!\th_0}\bigg)^{1-\hat\th_n}\right]^n

1484:   = \mbox{\Large\e}^{\,\displaystyle n[D(\hat\th_n||\th_0)\!-\!D(\hat\th_n||\th)]}

1485: \eeq

1486: \beqn

1487:   \qmbox{where}\textstyle

1488:   D(\hat\th||\th) = \hat\th\ln{\hat\th\over\th} +

1489:                     (1\!-\!\hat\th)\ln{1-\hat\th\over 1-\th}

1490: \eeqn

1491: is the relative entropy between $\hat\th$ and $\th$, which is

1492: continuous in $\hat\th$ and $\th$, and is $0$ if and only if

1493: $\hat\th=\th$. We also need the following implication for sets

1494: $\Omega\subseteq\Theta$:

1495: \bqa \nonumber

1496:   & & \mbox{If}\quad

1497:   w_n^\th\leq w_\th g_\th(n)\toinfty{n} 0 \qmbox{and}

1498:   g_\th(n)\leq c\;\forall\th\!\in\!\Omega,

1499: \\ \label{MLsumconv}

1500:   & & \mbox{then}\quad

1501:   \sum_{\th\in\Omega}w_n^\th \mu_\th(1|x_{1:n}) \;\leq\;

1502:   \sum_{\th\in\Omega}w_n^\th \toinfty{n} 0,

1503: \eqa

1504: which easily follows from boundedness $\sum_\th w_n^\th\leq 1$ and

1505: $\mu_\th\leq 1$ \cite[Lem.5.28$ii$]{Hutter:04uaibook}. We now

1506: prove Theorem~\ref{thMLConv}. We leave the special considerations

1507: necessary when $0,1\in\Theta$ to the reader and assume,

1508: henceforth, $0,1\not\in\Theta$.

1509:

1510: %------------------------------%

1511: {\bf (i)} Let $\Theta$ be a countable dense subset of $(0,1)$ and

1512: $x_{1:\infty}$ be $\mu/\xi$-random. Using (\ref{MLxiuni}) and

1513: (\ref{MLmuMr}) in (\ref{MLmuRatio}) for $\th\in\Theta$ to be

1514: determined later we can bound

1515: \beq\label{MLenbnd2}

1516:   \e^{n[D(\hat\th_n||\th_0)-D(\hat\th_n||\th)]}

1517:   = {\mu_\th(x_{1:n})\over\mu_{\th_0}(x_{1:n})}

1518:   \leq {c_\omega\over w_\th}

1519:   =: c<\infty

1520: \eeq

1521: Let us assume that $\hat\th\equiv\hat\th_n\not\to\th_0$. This

1522: implies that there exists a cluster point $\tilde\th\neq\th_0$ of

1523: the sequence $\hat\th_n$, i.e.\ $\hat\th_n$ is infinitely often in an

1524: $\eps$-neighborhood of $\tilde\th$, e.g.\ $D(\hat\th_n||\tilde\th)\leq\eps$

1525: for infinitely many $n$. $\tilde\th\in[0,1]$ may be outside $\Theta$.

1526: Since $\tilde\th\neq\th_0$ this implies that $\hat\th_n$ must be ``far''

1527: away from $\th_0$ infinitely often. For instance, for $\eps={1\over

1528: 4}(\tilde\th-\th_0)^2$, using $D(\hat\th||\tilde\th)+D(\hat\th||\th_0)

1529: \geq (\tilde\th-\th_0)^2$, we get $D(\hat\th||\th_0)\geq 3\eps$. We

1530: now choose $\th\in\Theta$ so near to $\tilde\th$ that

1531: $|D(\hat\th||\th)-D(\hat\th||\tilde\th)|\leq\eps$ (here we use

1532: denseness of $\Theta$). Chaining all inequalities we get

1533: $D(\hat\th||\th_0)-D(\hat\th||\th)\geq 3\eps-\eps-\eps=\eps>0$.

1534: This, together with (\ref{MLenbnd2}) implies $\e^{n\eps}\leq c$ for

1535: infinitely many $n$ which is impossible. Hence, the assumption

1536: $\hat\th_n\not\to\th_0$ was wrong.

1537:

1538: Now, $\hat\th_n\to\th_0$ implies that for arbitrary

1539: $\th\neq\th_0$, $\th\in\Theta$ there exists $\delta_\th>0$ such that

1540: for sufficiently large $n$ we have $D(\hat\th_n||\th)\geq 2\delta_\th$

1541: (since $D(\th_0||\th)\neq 0$) and $D(\hat\th_n||\th_0)\leq\delta_\th$.

1542: This implies

1543: \beqn\label{MLwto0}

1544:   w_n^\th \;\leq\; {w_\th\over w_{\th_0}}

1545:   \e^{n[D(\hat\th_n||\th_0)\!-\!D(\hat\th_n||\th)]}

1546:   \;\leq\; {w_\th\over w_{\th_0}} \e^{-n\delta_\th}

1547:   \;\toinfty{n}\; 0,

1548: \eeqn

1549: where we have used (\ref{MLpw}) and (\ref{MLmuRatio}) in the first

1550: inequality and the second inequality holds for sufficiently large

1551: $n$. Hence $\sum_{\th\neq\th_0} w_n^\th\to 0$ by (\ref{MLsumconv})

1552: and $w_n^{\th_0}\to 1$ by normalization (\ref{MLpw}), which finally gives

1553: \beqn

1554:   \xi(1|x_{1:n})=w_n^{\th_0} \mu_{\th_0}(1|x_{1:n}) +

1555:   \sum_{\th\neq\th_0}w_n^\th \mu_\th(1|x_{1:n}) \;\toinfty{n}

1556:   \mu_{\th_0}(1|x_{1:n}).

1557: \eeqn

1558:

1559: %------------------------------%

1560: {\bf (ii)} We first consider the case $\Theta=\{\th_0,\th_1\}$:

1561: Let us choose $\bar\th$ ($=\ln({1-\th_0\over

1562: 1-\th_1})/\ln({\th_1\over\th_0}{1-\th_0\over 1-\th_1})

1563: \not\in\Theta$) in the (KL) middle of $\th_0$ and $\th_1$ such

1564: that

1565: \beq\label{MLMid}

1566:   D(\bar\th||\th_0)=D(\bar\th||\th_1), \qquad

1567:   0 < \th_0 < \bar\th < \th_1 < 1,

1568: \eeq

1569: \beqn

1570:   \mbox{and choose $x_{1:\infty}$ such that $\hat\th_n:={n_1\over n}$

1571:   satisfies $|\hat\th_n-\bar\th|\leq{1\over n}

1572:   \quad(\Rightarrow\;\hat\th_n\toinfty{n}\bar\th)$}

1573: \eeqn

1574: We will show that $x_{1:\infty}$

1575: is $\mu_{\th_0}/\xi$-random {\em and} $\mu_{\th_1}/\xi$-random.

1576: Obviously no $\xi$ can converge to $\th_0$

1577: {\em and} $\th_1$, thus proving $\M$-non-convergence.

1578: ($x_{1:\infty}$ is obviously not $\mu_{\th_{0/1}}$ M.L.-random,

1579: since the relative frequency $\hat\th_n\not\to\th_{0/1}$.

1580: $x_{1:\infty}$ is not even $\mu_{\bar\th}$ M.L.-random, since

1581: $\hat\th_n$ converges too fast ($\sim\odn$). $x_{1:\infty}$ is

1582: indeed very regular, whereas ${n_1\over n}$ of a truly

1583: $\mu_{\bar\th}$ M.L.-random sequence has fluctuations of the order

1584: $1/\sqrt n$. The fast convergence is necessary for

1585: double $\mu/\xi$-randomness.

1586: %

1587: The reason that $x_{1:\infty}$ is $\mu/\xi$-random, but not M.L.-random is

1588: that $\mu/\xi$-randomness is a weaker concept than M.L.-randomness for

1589: $\M\subset\M_{enum}^{semi}$. Only regularities characterized by

1590: $\nu\in\M$ are recognized by $\mu/\xi$-randomness.)

1591:

1592: In the following we assume that $n$ is sufficiently large

1593: such that $\th_0\leq\hat\th_n\leq\th_1$.  We need

1594: \beq\label{MLDD}

1595:   |D(\hat\th||\th)-D(\bar\th||\th)| \leq c|\hat\th-\bar\th|

1596:   \quad\forall\,\th,\hat\th,\bar\th\in[\th_0,\th_1]

1597:   \qmbox{with} \textstyle c:=\ln\!{\th_1(1-\th_0)\over\th_0(1-\th_1)} < \infty

1598: \eeq

1599: which follows for $\hat\th\geq\bar\th$ (similarly

1600: $\hat\th\leq\bar\th$) from

1601: \beqn

1602:   D(\hat\th||\th)-D(\bar\th||\th) = \int_{\bar\th}^{\hat\th}

1603:   [{\textstyle\ln{\th'\over\th}-\ln{1-\th'\over 1-\th}}]d\th'

1604:   \leq \int_{\bar\th}^{\hat\th}

1605:   [{\textstyle\ln{\th_1\over\th_0}-\ln{1-\th_1\over 1-\th_0}}]d\th'

1606:   = c\!\cdot\!(\hat\th-\bar\th)

1607: \eeqn

1608: where we have increased $\th'$ to $\th_1$ and decreased $\th$ to

1609: $\th_0$ in the inequality. Using (\ref{MLDD}) in (\ref{MLmuRatio})

1610: twice we get

1611: \beq\label{MLmu01}

1612:   {\mu_{\th_1}(x_{1:n})\over\mu_{\th_0}(x_{1:n})}

1613:   =

1614:   \e^{n[D(\hat\th_n||\th_0)-D(\hat\th_n||\th_1)]}

1615:   \leq

1616:   \e^{n[D(\bar\th||\th_0)+c|\hat\th_n-\bar\th|-

1617:        D(\bar\th||\th_1)+c|\hat\th_n-\bar\th|]}

1618:   \leq

1619:   \e^{2c}

1620: \eeq

1621: where we have used (\ref{MLMid}) in the last inequality. Now,

1622: (\ref{MLmu01}) and (\ref{MLpw}) lead to

1623: \beq\label{MLwgeq0}

1624:   w_n^{\th_0}

1625:   = w_{\th_0}{\mu_{\th_0}(x_{1:n})\over\xi(x_{1:n})}

1626:   = [1+{w_{\th_1}\over w_{\th_0}}{\mu_{\th_1}(x_{1:n})\over\mu_{\th_0}(x_{1:n})}]^{-1}

1627:   \geq [1+{w_{\th_1}\over w_{\th_0}}\e^{2c}]^{-1}=:c_0>0,

1628: \eeq

1629: which shows that $x_{1:\infty}$ is $\mu_{\th_0}/\xi$-random by

1630: (\ref{MLmuMr}). Exchanging $\th_0\leftrightarrow\th_1$ in

1631: (\ref{MLmu01}) and (\ref{MLwgeq0}) we similarly get

1632: $w_n^{\th_1}\geq c_1>0$, which implies (using

1633: $w_n^{\th_0}+w_n^{\th_1}=1$)

1634: \beq\label{MLnonconv2}

1635:   \xi(1|x_{1:n})=

1636:   \sum_{\th\in\{\th_0,\th_1\}}w_n^\th \mu_\th(1|x_{1:n})

1637:   = w_n^{\th_0}\!\cdot\!\th_0 + w_n^{\th_1}\!\cdot\!\th_1

1638:   \neq \th_0 = \mu_{\th_0}(1|x_{1:n}).

1639: \eeq

1640: This shows $\xi(1|x_{1:n}) \;\;\not\!\!\!\toinfty{n}

1641: \mu(1|x_{1:n})$.

1642: One can show that $\xi(1|x_{1:n})$ does not only not converge to

1643: $\th_0$ (and $\th_1$), but that it does not converge at all. The

1644: fast convergence demand $|\hat\th_n-\bar\th|\leq\odn$ on

1645: $x_{1:\infty}$ can be weakened to

1646: $\hat\th_n\leq\bar\th+O(\odn)\,\forall n$ and

1647: $\hat\th_n\geq\bar\th-O(\odn)$ for infinitely many $n$, then

1648: $x_{1:\infty}$ is still $\mu_{\th_0}/\xi$-random, and

1649: $w_n^{\th_1}\geq c_1'>0$ for infinitely many $n$, which is

1650: sufficient to prove $\xi\not\to\mu$.

1651:

1652: We now consider general $\Theta$ with gap in the sense that there exist

1653: $0<\th_0<\th_1<1$ with

1654: $[\th_0,\th_1]\cap\Theta=\{\th_0,\th_1\}$: We show

1655: that all $\th\neq\th_0,\th_1$ give asymptotically no contribution

1656: to $\xi(1|x_{1:n})$, i.e.\ (\ref{MLnonconv2}) still applies. Let

1657: $\th\in\Theta\setminus\{\th_0,\th_1\}$; all other definitions as

1658: before. Then

1659: $\delta_\th:=D(\bar\th||\th)-D(\bar\th||\th_{0/1})>0$, since

1660: $\th$ is farther than $\th_{0/1}$ away from $\bar\th$

1661: ($|\th-\bar\th|>|\th_{0/1}-\bar\th|$). Similarly to (\ref{MLmu01}) with

1662: $\th$ instead of $\th_1$ we get

1663: \beqn

1664:   {\mu_\th(x_{1:n})\over\mu_{\th_0}(x_{1:n})}

1665:   = \e^{n[D(\hat\th_n||\th_0)-D(\hat\th_n||\th)]}

1666:   \leq \e^{2c}\!\cdot\!

1667:     \e^{n[D(\bar\th||\th_0)-D(\bar\th||\th)]}

1668:   = \e^{2c}\e^{-n\delta_\th}

1669:   \toinfty{n} 0

1670: \eeqn

1671: Hence $w_n^\th\leq{w_\th\over w_{\th_0}}\e^{2c}\e^{-n\delta_\th}\to

1672: 0$ from (\ref{MLpw}) and

1673: $\eps_n:=\sum_{\th\in\Theta\setminus\{\th_0,\th_1\}}

1674: w_n^\th\mu_\th(1|x_{1:n})\toinfty{n} 0$ from (\ref{MLsumconv}).

1675: Hence $

1676:   \xi(1|x_{1:n})

1677:   = w_n^{\th_0}\cdot\th_0 + w_n^{\th_1}\cdot\th_1 + \eps_n

1678:   \neq \th_0 = \mu_{\th_0}(1|x_{1:n})

1679: $

1680: for sufficiently large $n$, since $\eps_n\to 0$, $w_n^{\th_1}\geq c'_1>0$

1681: and $\th_0\neq\th_1$.

1682: \qed

1683:

1684: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1685: \section{Conclusions}\label{secConc}

1686: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1687:

1688: For a hierarchy of four computability definitions, we completed

1689: the classification of the existence of computable (semi)measures

1690: dominating all computable (semi)measures. Dominance is an important

1691: property of a prior, since it implies rapid convergence of the

1692: corresponding posterior with probability one.

1693: %

1694: A strengthening would be convergence for all Martin-L{\"o}f (M.L.)

1695: random sequences. This seems natural, since M.L.\ randomness can

1696: be defined in terms of Solomonoff's prior $M$, so there is a close

1697: connection.

1698: %

1699: Contrary to what was believed before, the question of posterior

1700: convergence $M/\mu\to 1$ for all M.L.\ random sequences is still

1701: open. Some exciting progress has been made recently in

1702: \cite{Hutter:04mlconvx}, partially answering this question.

1703: %

1704: We introduced a new flexible notion of

1705: $\mu/\xi$-randomness which contains Martin-L{\"o}f randomness as a

1706: special case. Though this notion may have a wider range of

1707: application, the main purpose for its introduction was to show

1708: that standard proof attempts of

1709: $M/\mu\stackrel{M.L.}\longrightarrow 1$ based on dominance only

1710: must fail. This follows from the derived result that the validity

1711: of $\xi/\mu\to 1$ for $\mu/\xi$-random sequences depends on the

1712: Bayes mixture $\xi$.

1713:

1714: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1715: %         Bibliography        %

1716: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1717:

1718: \begin{small}

1719: \begin{thebibliography}{Hut03b}

1720:

1721: \bibitem[Cha75]{Chaitin:75}

1722: G.~J. Chaitin.

1723: \newblock A theory of program size formally identical to information theory.

1724: \newblock {\em Journal of the ACM}, 22(3):329--340, 1975.

1725:

1726: \bibitem[Doo53]{Doob:53}

1727: J.~L. Doob.

1728: \newblock {\em Stochastic Processes}.

1729: \newblock Wiley, New York, 1953.

1730:

1731: \bibitem[G{\'a}c74]{Gacs:74}

1732: P.~G{\'a}cs.

1733: \newblock On the symmetry of algorithmic information.

1734: \newblock {\em Soviet Mathematics Doklady}, 15:1477--1480, 1974.

1735:

1736: \bibitem[HM04]{Hutter:04mlconvx}

1737: M.~Hutter and An.~A. Muchnik.

1738: \newblock Universal convergence of semimeasures on individual random sequences.

1739: \newblock In {\em Proc. 15th International Conf. on Algorithmic Learning Theory

1740:   ({ALT-2004})}, volume 3244 of {\em LNAI}, pages 234--248, Padova, 2004.

1741:   Springer, Berlin.

1742:

1743: \bibitem[Hut01]{Hutter:01alpha}

1744: M.~Hutter.

1745: \newblock Convergence and error bounds for universal prediction of nonbinary

1746:   sequences.

1747: \newblock In {\em Proc. 12th European Conf. on Machine Learning (ECML-2001)},

1748:   volume 2167 of {\em LNAI}, pages 239--250, Freiburg, 2001. Springer, Berlin.

1749:

1750: \bibitem[Hut03a]{Hutter:03unipriors}

1751: M.~Hutter.

1752: \newblock On the existence and convergence of computable universal priors.

1753: \newblock In {\em Proc. 14th International Conf. on Algorithmic Learning Theory

1754:   ({ALT-2003})}, volume 2842 of {\em LNAI}, pages 298--312, Sapporo, 2003.

1755:   Springer, Berlin.

1756:

1757: \bibitem[Hut03b]{Hutter:03unimdl}

1758: M.~Hutter.

1759: \newblock Sequence prediction based on monotone complexity.

1760: \newblock In {\em Proc. 16th Annual Conf. on Learning Theory ({COLT-2003})},

1761:   volume 2777 of {\em LNAI}, pages 506--521, Washington, DC, 2003. Springer,

1762:   Berlin.

1763:

1764: \bibitem[Hut04]{Hutter:04uaibook}

1765: M.~Hutter.

1766: \newblock {\em Universal Artificial Intelligence: Sequential Decisions based on

1767:   Algorithmic Probability}.

1768: \newblock Springer, Berlin, 2004.

1769: \newblock 300 pages, http://www.idsia.ch/$_{^{\sim}}$marcus/ai/uaibook.htm.

1770:

1771: \bibitem[Kol65]{Kolmogorov:65}

1772: A.~N. Kolmogorov.

1773: \newblock Three approaches to the quantitative definition of information.

1774: \newblock {\em Problems of Information Transmission}, 1(1):1--7, 1965.

1775:

1776: \bibitem[Lam87]{Lambalgen:87}

1777: {M. van} Lambalgen.

1778: \newblock {\em Random Sequences}.

1779: \newblock PhD thesis, University of Amsterdam, 1987.

1780:

1781: \bibitem[Lev73]{Levin:73random}

1782: L.~A. Levin.

1783: \newblock On the notion of a random sequence.

1784: \newblock {\em Soviet Mathematics Doklady}, 14(5):1413--1416, 1973.

1785:

1786: \bibitem[Lev74]{Levin:74}

1787: L.~A. Levin.

1788: \newblock Laws of information conservation (non-growth) and aspects of the

1789:   foundation of probability theory.

1790: \newblock {\em Problems of Information Transmission}, 10(3):206--210, 1974.

1791:

1792: \bibitem[LV97]{Li:97}

1793: M.~Li and P.~M.~B. Vit\'anyi.

1794: \newblock {\em An Introduction to {K}olmogorov Complexity and its

1795:   Applications}.

1796: \newblock Springer, Berlin, 2nd edition, 1997.

1797:

1798: \bibitem[Sch71]{Schnorr:71}

1799: C.~P. Schnorr.

1800: \newblock {\em Zuf{\"a}lligkeit und Wahrscheinlichkeit}.

1801: \newblock Springer, Berlin, 1971.

1802:

1803: \bibitem[Sch00]{Schmidhuber:00toe}

1804: J.~Schmidhuber.

1805: \newblock Algorithmic theories of everything.

1806: \newblock Report IDSIA-20-00, quant-ph/0011122, {IDSIA}, Manno (Lugano),

1807:   Switzerland, 2000.

1808:

1809: \bibitem[Sch02]{Schmidhuber:02gtm}

1810: J.~Schmidhuber.

1811: \newblock Hierarchies of generalized {Kolmogorov} complexities and

1812:   nonenumerable universal measures computable in the limit.

1813: \newblock {\em International Journal of Foundations of Computer Science},

1814:   13(4):587--612, 2002.

1815:

1816: \bibitem[Sim77]{Simpson:77}

1817: S.~G. Simpson.

1818: \newblock Degrees of unsolvability: A survey of results.

1819: \newblock In J.~Barwise, editor, {\em Handbook of Mathematical Logic}, pages

1820:   631--652. North-Holland, Amsterdam, 1977.

1821:

1822: \bibitem[Sol64]{Solomonoff:64}

1823: R.~J. Solomonoff.

1824: \newblock A formal theory of inductive inference: Parts 1 and 2.

1825: \newblock {\em Information and Control}, 7:1--22 and 224--254, 1964.

1826:

1827: \bibitem[Sol78]{Solomonoff:78}

1828: R.~J. Solomonoff.

1829: \newblock Complexity-based induction systems: Comparisons and convergence

1830:   theorems.

1831: \newblock {\em IEEE Transactions on Information Theory}, IT-24:422--432, 1978.

1832:

1833: \bibitem[VL00]{Vitanyi:00}

1834: P.~M.~B. Vit\'anyi and M.~Li.

1835: \newblock Minimum description length induction, {B}ayesianism, and {K}olmogorov

1836:   complexity.

1837: \newblock {\em IEEE Transactions on Information Theory}, 46(2):446--464, 2000.

1838:

1839: \bibitem[Vov87]{Vovk:87}

1840: V.~G. Vovk.

1841: \newblock On a randomness criterion.

1842: \newblock {\em Soviet Mathematics Doklady}, 35(3):656--660, 1987.

1843:

1844: \bibitem[Wan96]{Wang:96}

1845: Y.~Wang.

1846: \newblock {\em Randomness and Complexity}.

1847: \newblock PhD thesis, Universit{\"a}t Heidelberg, 1996.

1848:

1849: \bibitem[ZL70]{Zvonkin:70}

1850: A.~K. Zvonkin and L.~A. Levin.

1851: \newblock The complexity of finite objects and the development of the concepts

1852:   of information and randomness by means of the theory of algorithms.

1853: \newblock {\em Russian Mathematical Surveys}, 25(6):83--124, 1970.

1854:

1855: \end{thebibliography}

1856: \end{small}

1857: \end{document}

1858:

1859: %---------------------End-of-UniPriorx.tex--------------------%

1860: