0607:cs0607136/cs0607136

1: % Last changed: 28 Jul 2006

2: % Spell checked: 28 Jul 2006

3: % 1438 lines, 39 KB

4: \newif\ifJOURNAL

5: \JOURNALfalse

6: \newif\ifCONF

7: \CONFfalse

8: \newif\ifarXiv

9: \arXivfalse

10: \newif\ifWP

11: \WPfalse

12: \newif\ifFULL

13: \FULLfalse

14:

15: \newif\ifLATIN

16: \LATINfalse

17:

18: %\JOURNALtrue		% choose JOURNAL, CONF, arXiv, WP, or FULL

19: %\CONFtrue

20: \arXivtrue

21: %\WPtrue

22: %\FULLtrue		% this version is not for publication and contains extra remarks and questions

23:

24: %\LATINtrue		% LATIN means that the Cyrillic references should be set in Latin

25: \ifarXiv\LATINtrue\fi	% for submitting to arXiv

26:

27: \newif\ifnotJOURNAL	% derivative conditional

28: \notJOURNALtrue

29: \ifJOURNAL\notJOURNALfalse\fi

30:

31: \newif\ifnotarXiv	% derivative conditional

32: \notarXivtrue

33: \ifarXiv\notarXivfalse\fi

34:

35: \newif\ifTR		% derivative conditionals (TR = arXiv or WP)

36: \TRfalse

37: \ifarXiv\TRtrue\fi

38: \ifWP\TRtrue\fi

39: \newif\ifnotTR

40: \notTRtrue

41: \ifarXiv\notTRfalse\fi

42: \ifWP\notTRfalse\fi

43:

44: \newif\ifnotLATIN	% derivative conditional

45: \notLATINtrue

46: \ifLATIN\notLATINfalse\fi

47:

48: \ifJOURNAL

49:   \newcommand{\DFI}{vovk/etal:2005AIStatslocal}		% former \GTPVIII

50:   \newcommand{\DFII}{vovk/etal:2005ALT}			% former \GTPX

51:   \newcommand{\DFIII}{vovk:2005ALT-DF03}		% former \GTPXIII

52:   \newcommand{\DFIV}{vovk:2005ALT-DF04}			% former \GTPXIV

53:   \newcommand{\DFV}{DF05arXiv}				% former \GTPXI

54:   \newcommand{\DFVI}{DF06arXiv}				% former \GTPXVI

55: \fi

56: \ifarXiv

57:   \newcommand{\DFI}{DF01arXiv}		% former \GTPVIII

58:   \newcommand{\DFII}{DF02arXiv}		% former \GTPX

59:   \newcommand{\DFIII}{DF03arXiv}	% former \GTPXIII

60:   \newcommand{\DFIV}{DF04arXiv}		% former \GTPXIV

61:   \newcommand{\DFV}{DF05arXiv}		% former \GTPXI

62:   \newcommand{\DFVI}{DF06arXiv}		% former \GTPXVI

63:   \newcommand{\DFVII}{DF07arXiv}	% former \GTPXVII

64:   \newcommand{\DFVIII}{DF08arXiv}

65: \fi

66: \ifWP

67:   \newcommand{\DFI}{GTP8}		% former \GTPVIII

68:   \newcommand{\DFII}{GTP10}		% former \GTPX

69:   \newcommand{\DFIII}{GTP13}		% former \GTPXIII

70:   \newcommand{\DFIV}{GTP14}		% former \GTPXIV

71:   \newcommand{\DFV}{GTP11}		% former \GTPXI

72:   \newcommand{\DFVI}{GTP16}		% former \GTPXVI

73:   \newcommand{\DFVII}{GTP17}		% former \GTPXVII

74:   \newcommand{\DFVIII}{DF08arXiv}

75: \fi

76: \ifFULL

77:   \newcommand{\DFI}{DF01arXiv}		% former \GTPVIII

78:   \newcommand{\DFII}{DF02arXiv}		% former \GTPX

79:   \newcommand{\DFIII}{DF03arXiv}	% former \GTPXIII

80:   \newcommand{\DFIV}{DF04arXiv}		% former \GTPXIV

81:   \newcommand{\DFV}{DF05arXiv}		% former \GTPXI

82:   \newcommand{\DFVI}{DF06arXiv}		% former \GTPXVI

83:   \newcommand{\DFVII}{DF07arXiv}	% former \GTPXVII

84:   \newcommand{\DFVIII}{DF08arXiv}

85: \fi

86:

87: \ifnotLATIN

88:   \newcommand{\KolmogorovTikhomirov}{kolmogorov/tikhomirov:1959}

89:   \newcommand{\Tikhomirov}{tikhomirov:1987}

90: \fi

91: \ifLATIN

92:   \newcommand{\KolmogorovTikhomirov}{kolmogorov/tikhomirov:1959latin}

93:   \newcommand{\Tikhomirov}{tikhomirov:1987latin}

94: \fi

95:

96: \ifJOURNAL

97: \documentclass{article}

98: \usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}

99: \newcommand{\Extra}[1]{}

100: \fi

101:

102: \ifCONF

103: \documentclass{article}

104: \usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}

105: \newcommand{\Extra}[1]{}

106: \fi

107:

108: \ifarXiv

109: \documentclass{article}

110: \usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}

111: \newcommand{\Extra}[1]{}

112: \fi

113:

114: \ifWP

115: \documentclass{gtarticle}

116: \usepackage{amsmath,amsfonts,amssymb,latexsym,epsfig,graphicx}

117: \renewcommand{\Extra}[1]{#1}

118: \fi

119:

120: \ifFULL

121: \documentclass{article}

122: \usepackage{amsmath,amsfonts,amssymb,latexsym,color,epigraph,graphicx,eepic}

123: \newcommand{\Extra}[1]{\red{#1}}

124: \newcommand{\red}[1]{\textcolor{red}{#1}}

125: \newcommand{\blue}[1]{\textcolor{blue}{#1}}

126: \newcommand{\bluebegin}{\begingroup\color{blue}}

127: \newcommand{\blueend}{\endgroup}

128: \newcommand{\redbegin}{\begingroup\color{red}}

129: \newcommand{\redend}{\endgroup}

130: \fi

131:

132: \emergencystretch=5mm

133: \tolerance=400

134: \allowdisplaybreaks[4]

135:

136: \newcommand{\Vladimir}{Vladimir}

137: \newcommand{\DOT}{.}

138:

139: \ifnotLATIN

140: \input{OT2enc.def}

141: \newenvironment{cyr}

142: {\fontencoding{OT2}\fontfamily{wncyr}\fontseries{m}\fontshape{n}\selectfont}

143: {\fontencoding{OT1}\fontfamily{tir}\selectfont}

144: \usepackage{CJK}

145: \fi

146:

147: \newcommand{\st}{\mathrel{\!|\!}}

148: \newcommand{\givn}{\mathrel{|}}

149: \newcommand{\D}{\,\mathrm{d}}

150: \newcommand{\dd}{\mathrm{d}}

151:

152: \newcommand{\III}{\mathbb{I}}

153: \newcommand{\PPP}{\mathcal{P}}		% all probability measures

154:

155: \newcommand{\BL}{\mathrm{BL}}		% bounded Lipschitz

156:

157: \newcommand{\diam}{\mathop{\mathrm{diam}}\nolimits}

158:

159: \newcommand{\bbbp}{\mathbb{P}}		% auxiliary (probability)

160: \newcommand{\Prob}{\mathop{\bbbp}\nolimits}

161: \newcommand{\bbbe}{\mathbb{E}}		% auxiliary (expectation)

162: \newcommand{\Expect}{\mathop{\bbbe}\nolimits}

163:

164: \newcommand{\cpc}{\mathop{\mathrm{cpc}}\nolimits}

165: \newcommand{\ucpc}{\mathop{\overline{\mathstrut\mathrm{cpc}}}\nolimits}

166: \newcommand{\lcpc}{\mathop{\underline{\mathstrut\mathrm{cpc}}}\nolimits}

167:

168: \newcommand{\bbbr}{\mathbb{R}}		% the real numbers

169:

170: \newtheorem{lemma}{Lemma}

171: \newtheorem{proposition}{Proposition}

172: \newtheorem{corollary}{Corollary}

173: \newtheorem{remark}{Remark}

174: \newtheorem{theorem}{Theorem}

175: \newenvironment{proof}

176:   {\trivlist\item[\hskip\labelsep\textbf{Proof}]}

177:   {\endtrivlist}

178:

179: \newenvironment{Proof}[1]

180:   {\trivlist\item[\hskip\labelsep\textbf{Proof #1\,}]}

181:   {\endtrivlist}

182: \newcommand{\boxforqed}{\rule{.3em}{1.5ex}}

183: \newcommand{\qedtext}{\unskip\nobreak\hfil

184:   \penalty50\hskip1em\null\nobreak\hfil\boxforqed

185:   \parfillskip=0pt\finalhyphendemerits=0\endgraf}

186: %\newcommand{\qedmath}{\eqno\boxforqed}

187: \newcommand{\qedmath}{\tag*{\boxforqed}}

188: \newenvironment{remark*}

189:   {\trivlist\item[\hskip\labelsep{\bfseries Remark}]\relax}

190:   {\endtrivlist}

191:

192: \ifJOURNAL

193: \title{Competing with Markov prediction strategies}

194: \author{Vladimir Vovk\\[5mm]

195:  Computer Learning Research Centre\\

196:   Department of Computer Science\\

197:   Royal Holloway, University of London,

198:   Egham, Surrey TW20 0EX, UK\\

199:   \texttt{vovk@cs.rhul.ac.uk}}

200: \fi

201:

202: \ifCONF

203: \title{Competing with Markov prediction strategies}

204: \author{Vladimir Vovk\\[5mm]

205:  Computer Learning Research Centre\\

206:   Department of Computer Science\\

207:   Royal Holloway, University of London,

208:   Egham, Surrey TW20 0EX, UK\\

209:   \texttt{vovk@cs.rhul.ac.uk}}

210: \fi

211:

212: \ifarXiv

213: \title{Competing with Markov prediction strategies}

214: \author{Vladimir Vovk\\

215: \texttt{vovk{\rm@}cs.rhul.ac.uk}\\

216: \texttt{http://vovk.net}}

217: \fi

218:

219: \ifWP

220: \title{Competing with Markov prediction strategies}

221: \author{Vladimir Vovk}

222: \newcommand{\No}{20}

223: % For the two dates option: uncomment the next 2 lines

224: % \twodatestrue

225: % \newcommand{\firstposted}{July 13, 2006}

226: \fi

227:

228: \ifFULL

229: \title{Competing with Markov prediction strategies}

230: \author{Vladimir Vovk\\

231: \texttt{vovk{\rm@}cs.rhul.ac.uk}\\

232: \texttt{http://vovk.net}}

233: \fi

234:

235: \begin{document}

236: \maketitle

237: \begin{abstract}

238:   Assuming that the loss function is convex in the prediction,

239:   we construct a prediction strategy

240:   universal for the class of Markov prediction strategies,

241:   not necessarily continuous.

242:   Allowing randomization,

243:   we remove the requirement of convexity.

244: \end{abstract}

245:

246: \section{Introduction}

247: \label{sec:introduction}

248:

249: This paper belongs to the area of research

250: known as universal prediction of individual sequences

251: (see \cite{cesabianchi/lugosi:2006} for a review):

252: the predictor's goal is to compete with a wide benchmark class of prediction strategies.

253: In the previous papers \cite{\DFVII} and \cite{\DFVIII}

254: we constructed prediction strategies

255: competitive with the important classes of Markov and stationary,

256: respectively,

257: continuous prediction strategies.

258: In this paper we consider competing against possibly discontinuous strategies.

259: Our main results assert the existence of prediction strategies

260: competitive with the Markov strategies.

261:

262: This paper's idea of transition from continuous to general benchmark classes

263: was motivated by Skorokhod's topology for the space $D$

264: of ``c\`adl\`ag'' functions, most of which are discontinuous.

265: Skorokhod's idea was to allow small deformations not only along the vertical axis

266: but also along the horizontal axis when defining neighborhoods.

267: Skorokhod's topology was metrized by Kolmogorov so that it became a separable space

268: (\cite{billingsley:1968}, Appendix III; \cite{shiryaev:1989latin}, p.~913),

269: which allows us to apply one of the numerous algorithms for prediction with expert advice

270: (Kalnishkan and Vyugin's Weak Aggregating Algorithm in this paper)

271: to construct a universal algorithm.

272:

273: In Section \ref{sec:results} we give the main definitions and state our main results,

274: Theorems \ref{thm:deterministic} and \ref{thm:randomized};

275: their proofs are given in Sections \ref{sec:proof-deterministic} and \ref{sec:proof-randomized},

276: respectively.

277:

278: \section{Main results}

279: \label{sec:results}

280:

281: The \emph{game of prediction} between two players,

282: called Predictor and Reality,

283: is played according to the following protocol

284: (of \emph{perfect information},

285: in the sense that either player can see the other player's moves made so far).

286:

287: \bigskip

288:

289: \noindent

290: \textsc{Prediction protocol}\nopagebreak

291: \begin{tabbing}

292:   \qquad\=\qquad\=\qquad\kill

293:   FOR $n=1,2,\dots$:\\

294:   \> Reality announces $x_n\in\mathbf{X}$.\\

295:   \> Predictor announces $\gamma_n\in\Gamma$.\\

296:   \> Reality announces $y_n\in\mathbf{Y}$.\\

297:   END FOR.

298: \end{tabbing}

299:

300: \noindent

301: The game proceeds in rounds numbered by the positive integers $n$.

302: At the beginning of each round $n=1,2,\ldots$ Predictor is given some \emph{signal} $x_n$

303: relevant to predicting the following \emph{observation} $y_n$.

304: The signal is taken from the \emph{signal space} $\mathbf{X}$

305: and the observation from the \emph{observation space} $\mathbf{Y}$.

306: Predictor then announces his prediction $\gamma_n$,

307: taken from the \emph{prediction space} $\Gamma$,

308: and the prediction's quality in light of the actual observation

309: is measured by a \emph{loss function}

310: $\lambda:\Gamma\times\mathbf{Y}\to\bbbr$.

311:

312: We will always assume that the signal space $\mathbf{X}$,

313: the prediction space $\Gamma$,

314: and the observation space $\mathbf{Y}$

315: are non-empty sets;

316: $\mathbf{X}$ and $\Gamma$ will often be equipped with additional structures.

317:

318: \subsection*{Markov-universal prediction strategies: deterministic case}

319:

320: Predictor's strategies in the prediction protocol will be called

321: \emph{prediction strategies}.

322: Formally such a strategy is a function

323: \begin{equation*}

324:   D:

325:   \bigcup_{n=1}^{\infty}

326:   \left(

327:     \mathbf{X}\times\mathbf{Y}

328:   \right)^{n-1}

329:   \times

330:   \mathbf{X}

331:   \to

332:   \Gamma;

333: \end{equation*}

334: it maps each history $(x_1,y_1,\ldots,x_{n-1},y_{n-1},x_n)$ to the chosen prediction.

335: In this paper we will be especially interested

336: in \emph{Markov strategies},

337: which are functions $D:\mathbf{X}\to\Gamma$;

338: intuitively,

339: $D(x_n)$ is the recommended prediction on round $n$.

340: The restriction to Markov strategies

341: is not a severe one,

342: since the signal $x_n$ can encode as much of the past as we want

343: (cf.\ \cite{kolmogorov:1931}, footnote 1);

344: in particular, $x_n$ can contain information about the previous observations

345: $y_1,\ldots,y_{n-1}$.

346: In this paper

347: Markov prediction strategies will also be called \emph{prediction rules}

348: (as in \cite{\DFVII};

349: in a more general context, however, it would be risky to omit ``Markov''

350: since ``prediction rule'' is too easy to confuse with ``prediction strategy'').

351:

352: For both our theorems we will need the notion of ``approximation''

353: to a signal $x\in\mathbf{X}$;

354: intuitively, the ``$m$-approximation'' of $x$ is another signal $\phi_m(x)$

355: which is as close to $x$ as possible but carries only $m$ bits of information.

356: If $\mathbf{X}=[0,1]$,

357: a reasonable definition of $\phi_m(x)$ would be to take the binary expansion of $x$

358: but remove all the binary digits starting from the $(m+1)$th after the binary point.

359: In general,

360: we will have to equip $\mathbf{X}$ with an ``approximation structure'';

361: we will do this following Kolmogorov and Tikhomirov

362: (\cite{\Tikhomirov}, Section 2,

363: \cite{shiryaev:1989latin}, p.~913% this is p.~49 of 80 in the file

364: \ifFULL\bluebegin, \cite{tikhomirov:1976}\blueend\fi).

365:

366: Consider a sequence of mappings $\phi_m:\mathbf{X}\to\mathbf{X}$,

367: $m=1,2,\ldots$,

368: such that each $\phi_m$ is idempotent,

369: in the sense $\phi_m(\phi_m(x))=\phi_m(x)$ for all $x\in\mathbf{X}$,

370: and $\phi_m(\mathbf{X})$ contains $2^m$ elements.

371: (Such mappings are coding-theory analogues of projections in linear algebra

372: and contractions in topology;

373: $\phi_m(x)$ can be thought of as the result of encoding $x$,

374: sending it over an $m$-bit channel,

375: and restoring $x$ as well as possible at the receiving end.)

376: It is the sequence $\phi=\{\phi_m\st m=1,2,\ldots\}$

377: that will be referred to as an \emph{approximation structure}.

378:

379: If $\mathbf{X}$ is a totally bounded (say, compact) metric space,

380: there is an approximation structure $\phi$ such that

381: \begin{equation}\label{eq:fine}

382:   \lim_{m\to\infty}

383:   \rho

384:   \left(

385:     x,

386:     \phi_m(x)

387:   \right)

388:   =

389:   0

390: \end{equation}

391: uniformly in $x\in\mathbf{X}$.

392: (We often let $\rho$ stand for the metric in various metric spaces,

393: always clear from the context.)

394: In fact,

395: the \emph{$m$th Kolmogorov diameter}

396: \begin{equation*}

397:   \mathcal{K}_m(\mathbf{X})

398:   :=

399:   \frac12

400:   \inf_{\phi}

401:   \sup_{x\in\mathbf{X}}

402:   \diam

403:   \left(

404:     \phi_m^{-1}(\phi_m(x))

405:   \right)

406: \end{equation*}

407: of $\mathbf{X}$ is essentially the inverse function

408: to the $\epsilon$-entropy $\mathcal{H}_{\epsilon}(\mathbf{X})$.

409: See \cite{\KolmogorovTikhomirov}

410: for precise values and estimates of $\mathcal{K}_m(\mathbf{X})$

411: for numerous totally bounded metric spaces $\mathbf{X}$.

412:

413: A prediction strategy is \emph{Markov-universal} for a loss function $\lambda$

414: and an approximation structure $\phi$

415: if it guarantees that

416: for any prediction rule $D$ and any $m=1,2,\ldots$

417: there exists a number $N_{D,m}$ such that for any $N\ge N_{D,m}$

418: and any sequence $x_1,y_1,x_2,y_2,\ldots$ of Reality's moves

419: its responses $\gamma_n$ satisfy

420: \begin{equation*} % \label{eq:dominates-deterministic}

421:   \frac1N

422:   \sum_{n=1}^N

423:   \lambda

424:   (\gamma_n,y_n)

425:   \le

426:   \frac1N

427:   \sum_{n=1}^N

428:   \lambda

429:   \Bigl(

430:     D(\phi_m(x_n)),y_n

431:   \Bigr)

432:   +

433:   2^{-m}.

434: \end{equation*}

435: \begin{theorem}\label{thm:deterministic}

436:   Suppose $\mathbf{X}$ is equipped with an approximation structure $\phi$,

437:   $\Gamma$ is a closed convex subset of a separable Banach space,

438:   and the loss function $\lambda(\gamma,y)$

439:   is bounded, convex in the variable $\gamma\in\Gamma$,

440:   and uniformly continuous in $\gamma\in\Gamma$

441:   uniformly in $y\in\mathbf{Y}$.

442:   There exists a prediction strategy that is Markov-universal for $\lambda$ and $\phi$.

443: \end{theorem}

444: A Markov-universal prediction strategy will be constructed in the next section.

445: Theorem \ref{thm:deterministic} says that, under its conditions,

446: \begin{equation}\label{eq:simpler}

447:   \limsup_{N\to\infty}

448:   \left(

449:     \frac1N

450:     \sum_{n=1}^N

451:     \lambda

452:     (\gamma_n,y_n)

453:     -

454:     \frac1N

455:     \sum_{n=1}^N

456:     \lambda

457:     \Bigl(

458:       D(\phi_m(x_n)),y_n

459:     \Bigr)

460:   \right)

461:   \le

462:   0

463: \end{equation}

464: uniformly in $x_1,y_1,x_2,y_2,\ldots$

465: for all $m=1,2,\ldots$ and all $D:\mathbf{X}\to\Gamma$.

466: % This statement is cruder than Theorem \ref{thm:deterministic} itself

467: % but slightly simpler.

468:

469: If $\mathbf{X}$ is a compact metric space and (\ref{eq:fine})

470: holds uniformly in $x\in\mathbf{X}$,

471: (\ref{eq:simpler}) implies

472: \begin{equation*}

473:   \limsup_{N\to\infty}

474:   \left(

475:     \frac1N

476:     \sum_{n=1}^N

477:     \lambda

478:     (\gamma_n,y_n)

479:     -

480:     \frac1N

481:     \sum_{n=1}^N

482:     \lambda

483:     (D(x_n),y_n)

484:   \right)

485:   \le

486:   0

487: \end{equation*}

488: for all continuous prediction rules $D$;

489: this is close to Theorem 1 in \cite{\DFVII}.

490: The advance of this paper as compared to \cite{\DFVII} is that our main results

491: do not assume that $D$ is continuous.

492:

493: \subsection*{Markov-universal prediction strategies: randomized case}

494:

495: When the loss function $\lambda(\gamma,y)$ is not required to be convex in $\gamma$,

496: the conclusion of Theorem \ref{thm:deterministic} may become false

497: (\cite{kalnishkan/vyugin:2005}, Theorem 2).

498: The situation changes if we consider randomized prediction strategies.

499:

500: A \emph{randomized prediction strategy} is a function

501: \begin{equation*}

502:   D:

503:   \bigcup_{n=1}^{\infty}

504:   (\mathbf{X}\times\mathbf{Y})^{n-1}\times\mathbf{X}

505:   \to

506:   \PPP(\Gamma)

507: \end{equation*}

508: mapping the past to the probability measures on the prediction space.

509: In other words, this is a strategy for Predictor

510: in the extended game of prediction with the prediction space $\PPP(\Gamma)$.

511: A \emph{Markov randomized prediction strategy},

512: or \emph{randomized prediction rule} for brevity,

513: is a function $D:\mathbf{X}\to\PPP(\Gamma)$.

514:

515: We will say that a randomized prediction strategy outputting $\gamma_n$

516: is \emph{Markov-universal} for a loss function $\lambda$ and an approximation structure $\phi$ if,

517: for any randomized prediction rule $D$ and any $m=1,2,\ldots$,

518: there exists $N_{D,m}$ such that,

519: for any sequence $x_{1},y_{1},x_{2},y_{2},\ldots$ of Reality's moves,

520: \begin{equation}\label{eq:dominates-randomized}

521:   \sup_{N\ge N_{D,m}}

522:   \left(

523:     \frac1N

524:     \sum_{n=1}^N

525:     \lambda(g_{n},y_n)

526:     -

527:     \frac1N

528:     \sum_{n=1}^N

529:     \lambda(d_{n},y_n)

530:   \right)

531:   \le

532:   2^{-m}

533: \end{equation}

534: with probability at least $1-2^{-m}$,

535: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent random variables

536: distributed as

537: \begin{equation}\label{eq:distributed}

538:   g_{n}

539:   \sim

540:   \gamma_n,

541:   \enspace

542:   d_{n}

543:   \sim

544:   D(\phi_m(x_n)),

545:   \quad

546:   n=1,2,\ldots\,.

547: \end{equation}

548: Intuitively,

549: the word ``probability'' after (\ref{eq:dominates-randomized})

550: refers only to the prediction strategies' internal randomization;

551: it is not assumed that Reality behaves stochastically.

552: We will use this definition only in the case

553: where the loss function $\lambda$ is continuous in the prediction,

554: and so (\ref{eq:dominates-randomized}) will indeed be an event

555: having a probability.

556: \begin{theorem}\label{thm:randomized}

557:   Suppose the signal space $\mathbf{X}$ is equipped with an approximation structure $\phi$,

558:   $\Gamma$ is a separable topological space,

559:   and the loss function $\lambda$ is bounded

560:   and such that the set of functions $\{\lambda(\cdot,y)\st y\in\mathbf{Y}\}$

561:   is equicontinuous.

562:   There exists a randomized prediction strategy

563:   that is Markov-universal for $\lambda$ and $\phi$.

564: \end{theorem}

565: A Markov-universal prediction strategy is constructed in Section \ref{sec:proof-randomized}.

566: The randomized version of (\ref{eq:simpler}),

567: immediately following from Theorem \ref{thm:randomized},

568: is

569: \begin{equation*}

570:   \limsup_{N\to\infty}

571:   \left(

572:     \frac1N

573:     \sum_{n=1}^N

574:     \lambda

575:     (g_n,y_n)

576:     -

577:     \frac1N

578:     \sum_{n=1}^N

579:     \lambda

580:     (d_n,y_n)

581:   \right)

582:   \le

583:   0

584:   \quad

585:   \text{a.s.},

586: \end{equation*}

587: for all $m=1,2,\ldots$ and all $D:\mathbf{X}\to\PPP(\Gamma)$,

588: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent

589: and distributed as (\ref{eq:distributed}).

590: \ifFULL\bluebegin

591: If $\mathbf{X}$ is a compact metric space and (\ref{eq:fine})

592: holds uniformly in $x$,

593: one might be able to obtain the following analogue of Theorem 2 in \cite{\DFVII}:

594: for continuous prediction rules $D$,

595: \begin{equation*}

596:   \limsup_{N\to\infty}

597:   \left(

598:     \frac1N

599:     \sum_{n=1}^N

600:     \lambda

601:     (g_n,y_n)

602:     -

603:     \frac1N

604:     \sum_{n=1}^N

605:     \lambda

606:     (d_n,y_n)

607:   \right)

608:   \le

609:   0

610:   \quad

611:   \text{a.s.},

612: \end{equation*}

613: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent

614: and distributed as

615: \begin{equation*}

616:   g_{n}

617:   \sim

618:   \gamma_n,

619:   \enspace

620:   d_{n}

621:   \sim

622:   D(x_n),

623:   \quad

624:   n=1,2,\ldots\,.

625: \end{equation*}

626: \blueend\fi

627:

628: \section{Proof of Theorem \ref{thm:deterministic}}

629: \label{sec:proof-deterministic}

630:

631: Let us fix a dense countable subset $\Gamma^*$ of $\Gamma$.

632: We will say that a function $D:\mathbf{X}\to\Gamma$

633: is \emph{$m$-elementary} if $D(\mathbf{X})\subseteq\Gamma^*$

634: and $D(x)$ depends on $x$ only via $\phi_m(x)$;

635: a function is \emph{elementary} if it is $m$-elementary for some $m$.

636: There are countably many elementary functions;

637: let us enumerate them as $D_1,D_2,\ldots$\,.

638: We will refer to these functions as \emph{experts}.

639: We will apply a special case of Kalnishkan and Vyugin's

640: \cite{kalnishkan/vyugin:2005}

641: Weak Aggregating Algorithm (WAA) to the sequence of experts

642: (as in \cite{\DFVIII}).

643:

644: Let $q_1,q_2,\ldots$ be a sequence of positive numbers summing to 1,

645: $\sum_{k=1}^{\infty}q_k=1$.

646: Define

647: \begin{equation*}

648:   l_n^{(k)}

649:   :=

650:   \lambda

651:   \left(

652:     D_k(x_n),y_n

653:   \right),

654:   \quad

655:   L_N^{(k)}

656:   :=

657:   \sum_{n=1}^N

658:   l_n^{(k)}

659: \end{equation*}

660: to be the instantaneous loss of the $k$th expert $D_k$ on the $n$th round

661: and his cumulative loss over the first $N$ rounds.

662: For all $n,k=1,2,\ldots$ define

663: \begin{equation*}

664:   w_n^{(k)}

665:   :=

666:   q_k

667:   \beta_n^{L_{n-1}^{(k)}},

668:   \quad

669:   \beta_n

670:   :=

671:   \exp

672:   \left(

673:     -\frac{1}{\sqrt{n}}

674:   \right)

675: \end{equation*}

676: ($w_n^{(k)}$ are the weights of the experts to use on round $n$)

677: and

678: \begin{equation*}

679:   p_n^{(k)}

680:   :=

681:   \frac

682:   {w_n^{(k)}}

683:   {\sum_{k=1}^{\infty}w_n^{(k)}}

684: \end{equation*}

685: (the normalized weights;

686: it is obvious that the denominator is positive and finite).

687: The WAA's prediction on round $n$ is

688: \begin{equation}\label{eq:WAA}

689:   \gamma_n

690:   :=

691:   \sum_{k=1}^{\infty}

692:   p_n^{(k)}

693:   D_k(x_n).

694: \end{equation}

695: To make this series convergent,

696: we may take $q_k:=2^{-k}$ and reorder $D_k$ so that

697: $\sup_x\left\|D_k(x)\right\|\le k$ for all $k$.

698: In this case we will automatically have $\gamma_n\in\Gamma$ since

699: \begin{multline}\label{eq:convergence-to-0}

700:   \gamma_n

701:   -

702:   \sum_{k=1}^K

703:   \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}

704:   D_k(x_n)\\

705:   =

706:   \sum_{k=1}^K

707:   \left(

708:     1

709:     -

710:     \frac{1}{\sum_{k=1}^K p_n^{(k)}}

711:   \right)

712:   p_n^{(k)}

713:   D_k(x_n)

714:   +

715:   \sum_{k=K+1}^{\infty}

716:   p_n^{(k)}

717:   D_k(x_n)

718:   \to

719:   0

720: \end{multline}

721: as $K\to\infty$.

722:

723: Let $l_n:=\lambda(\gamma_n,y_n)$ be the WAA's loss on round $n$

724: and

725: $

726:   L_N

727:   :=

728:   \sum_{n=1}^N

729:   l_n

730: $

731: be its cumulative loss over the first $N$ rounds.

732: \begin{lemma}[\cite{kalnishkan/vyugin:2005}, Lemma 9]\label{lem:9}

733:   The WAA guarantees that, for all $N=1,2,\ldots$,

734:   \begin{equation}\label{eq:lemma9}

735:     L_N

736:     \le

737:     \sum_{n=1}^N

738:     \sum_{k=1}^{\infty}

739:     p_n^{(k)}

740:     l_n^{(k)}

741:     -

742:     \sum_{n=1}^N

743:     \log_{\beta_n}

744:     \sum_{k=1}^{\infty}

745:     p_n^{(k)}

746:     \beta_n^{l_n^{(k)}}

747:     +

748:     \log_{\beta_N}

749:     \sum_{k=1}^{\infty}

750:     q_k

751:     \beta_N^{L_N^{(k)}}.

752:   \end{equation}

753: \end{lemma}

754: The first two terms on the right-hand side of (\ref{eq:lemma9})

755: are sums over the first $N$ rounds of different kinds of mean of the experts' losses

756: (see, e.g., \cite{hardy/etal:1952}, Chapter III,

757: for a general definition of the mean);

758: we will see later that they nearly cancel each other out.

759: If those two terms are ignored,

760: the remaining part of (\ref{eq:lemma9}) is identical

761: (except that $\beta$ now depends on $n$)

762: to the main property of the ``Aggregating Algorithm''

763: (see, e.g., \cite{vovk:2001competitive}, Lemma 1).

764: All infinite series in (\ref{eq:lemma9}) are trivially convergent.

765:

766: In the proof of Lemma \ref{lem:9} we will use the following property

767: of ``countable convexity'' of $\lambda$:

768: \begin{equation}\label{eq:countable-convexity}

769:   l_n\le\sum_{k=1}^{\infty}p_n^{(k)}l_n^{(k)}.

770: \end{equation}

771: This property follows from (\ref{eq:convergence-to-0}) and

772: \begin{equation*}

773:   \lambda

774:   \left(

775:     \sum_{k=1}^K

776:     \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}

777:     D_k(x_n),

778:     y_n

779:   \right)

780:   \le

781:   \sum_{k=1}^K

782:   \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}

783:   \lambda

784:   \left(

785:     D_k(x_n),

786:     y_n

787:   \right)

788: \end{equation*}

789: if we let $K\to\infty$.

790:

791: \begin{Proof}{of Lemma \ref{lem:9}}

792:   The proof is by induction on $N$.

793:   For $N=1$,

794:   (\ref{eq:lemma9}) follows from the countable convexity (\ref{eq:countable-convexity})

795:   and $p_1^{(k)}=q_k$.

796:   Assuming (\ref{eq:lemma9}),

797:   we obtain

798:   \begin{multline*}

799:     L_{N+1}

800:     =

801:     L_N + l_{N+1}

802:     \le

803:     L_N

804:     +

805:     \sum_{k=1}^{\infty}

806:     p_{N+1}^{(k)}

807:     l_{N+1}^{(k)}\\

808:     \le

809:     \sum_{n=1}^{N+1}

810:     \sum_{k=1}^{\infty}

811:     p_n^{(k)}

812:     l_n^{(k)}

813:     -

814:     \sum_{n=1}^N

815:     \log_{\beta_n}

816:     \sum_{k=1}^{\infty}

817:     p_n^{(k)}

818:     \beta_n^{l_n^{(k)}}

819:     +

820:     \log_{\beta_N}

821:     \sum_{k=1}^{\infty}

822:     q_k

823:     \beta_N^{L_N^{(k)}}

824:   \end{multline*}

825:   (the first ``$\le$'' again used the countable convexity (\ref{eq:countable-convexity})).

826:   Therefore,

827:   it remains to prove

828:   \begin{equation*}

829:     \log_{\beta_N}

830:     \sum_{k=1}^{\infty}

831:     q_k

832:     \beta_N^{L_N^{(k)}}

833:     \le

834:     -\log_{\beta_{N+1}}

835:     \sum_{k=1}^{\infty}

836:     p_{N+1}^{(k)}

837:     \beta_{N+1}^{l_{N+1}^{(k)}}

838:     +

839:     \log_{\beta_{N+1}}

840:     \sum_{k=1}^{\infty}

841:     q_k

842:     \beta_{N+1}^{L_{N+1}^{(k)}}.

843:   \end{equation*}

844:   By the definition of $p_n^{(k)}$

845:   this can be rewritten as

846:   \begin{equation*}

847:     \log_{\beta_N}

848:     \sum_{k=1}^{\infty}

849:     q_k

850:     \beta_N^{L_N^{(k)}}

851:     \le

852:     -\log_{\beta_{N+1}}

853:     \frac

854:     {

855:       \sum_{k=1}^{\infty}

856:       q_k

857:       \beta_{N+1}^{L_{N}^{(k)}}

858:       \beta_{N+1}^{l_{N+1}^{(k)}}

859:     }

860:     {

861:       \sum_{k=1}^{\infty}

862:       q_k

863:       \beta_{N+1}^{L_{N}^{(k)}}

864:     }

865:     +

866:     \log_{\beta_{N+1}}

867:     \sum_{k=1}^{\infty}

868:     q_k

869:     \beta_{N+1}^{L_{N+1}^{(k)}},

870:   \end{equation*}

871:   which after cancellation becomes

872:   \begin{equation}\label{eq:to-check}

873:     \log_{\beta_N}

874:     \sum_{k=1}^{\infty}

875:     q_k

876:     \beta_N^{L_N^{(k)}}

877:     \le

878:     \log_{\beta_{N+1}}

879:     \sum_{k=1}^{\infty}

880:     q_k

881:     \beta_{N+1}^{L_{N}^{(k)}}.

882:   \end{equation}

883:   The last inequality follows from the general result

884:   about comparison of different means

885:   (\cite{hardy/etal:1952}, Theorem 85),

886:   but we can also check it directly

887:   (following \cite{kalnishkan/vyugin:2005}).

888:   Let $\beta_{N+1}=\beta_N^a$,

889:   where $0<a<1$.

890:   Then (\ref{eq:to-check}) can be rewritten as

891:   \begin{equation*}

892:     \left(

893:       \sum_{k=1}^{\infty}

894:       q_k

895:       \beta_N^{L_N^{(k)}}

896:     \right)^a

897:     \ge

898:     \sum_{k=1}^{\infty}

899:     q_k

900:     \beta_{N}^{aL_{N}^{(k)}},

901:   \end{equation*}

902:   and the last inequality follows from the concavity of the function $t\mapsto t^a$.

903:   \qedtext

904: \end{Proof}

905:

906: \begin{lemma}[\cite{kalnishkan/vyugin:2005}, Lemma 5]

907:   Let $L$ be an upper bound on $\left|\lambda\right|$.

908:   The WAA guarantees that, for all $N$ and $K$,

909:   \begin{equation}\label{eq:lemma5}

910:     L_N

911:     \le

912:     L_N^{(K)}

913:     +

914:     \left(

915:       L^2 e^L + \ln\frac{1}{q_K}

916:     \right)

917:     \sqrt{N}.

918:   \end{equation}

919: \end{lemma}

920: \begin{proof}

921:   From (\ref{eq:lemma9}),

922:   we obtain:

923:   \begin{align*}

924:     L_N

925:     &\le

926:     \sum_{n=1}^N

927:     \sum_{k=1}^{\infty}

928:     p_n^{(k)}

929:     l_n^{(k)}

930:     +

931:     \sum_{n=1}^N

932:     \sqrt{n}

933:     \ln

934:     \sum_{k=1}^{\infty}

935:     p_n^{(k)}

936:     \exp

937:     \left(

938:       -\frac{l_n^{(k)}}{\sqrt{n}}

939:     \right)

940:     +

941:     \log_{\beta_N}

942:     q_K

943:     +

944:     L_N^{(K)}\\

945:     &\le

946:     \sum_{n=1}^N

947:     \sum_{k=1}^{\infty}

948:     p_n^{(k)}

949:     l_n^{(k)}

950:     +

951:     \sum_{n=1}^N

952:     \sqrt{n}

953:     \left(

954:       \sum_{k=1}^{\infty}

955:       p_n^{(k)}

956:       \left(

957:         1

958:         -

959:         \frac{l_n^{(k)}}{\sqrt{n}}

960:         +

961:         \frac{\left(l_n^{(k)}\right)^2}{2n}

962:         e^L

963:       \right)

964:       -

965:       1

966:     \right)\\

967:     &\quad{}+

968:     \log_{\beta_N}

969:     q_K

970:     +

971:     L_N^{(K)}\\

972:     &=

973:     L_N^{(K)}

974:     +

975:     \frac12

976:     \sum_{n=1}^N

977:     \frac{1}{\sqrt{n}}

978:     \sum_{k=1}^{\infty}

979:     p_n^{(k)}

980:     \left(l_n^{(k)}\right)^2

981:     e^L

982:     +

983:     \sqrt{N}\ln\frac{1}{q_K}\\

984:     &\le

985:     L_N^{(K)}

986:     +

987:     \frac{L^2e^L}{2}

988:     \sum_{n=1}^N

989:     \frac{1}{\sqrt{n}}

990:     +

991:     \sqrt{N}\ln\frac{1}{q_K}

992:     \le

993:     L_N^{(K)}

994:     +

995:     \frac{L^2e^L}{2}

996:     \int_0^N

997:     \frac{\D t}{\sqrt{t}}

998:     +

999:     \sqrt{N}\ln\frac{1}{q_K}\\

1000:     &=

1001:     L_N^{(K)}

1002:     +

1003:     L^2e^L\sqrt{N}

1004:     +

1005:     \sqrt{N}\ln\frac{1}{q_K}

1006:   \end{align*}

1007:   (in the second ``$\le$'' we used the inequalities $e^t\le1+t+\frac{t^2}{2}e^{\left|t\right|}$

1008:   and $\ln t\le t-1$).

1009:   \qedtext

1010: \end{proof}

1011:

1012: \begin{remark*}

1013:   There is no term $e^L$ in \cite{kalnishkan/vyugin:2005}

1014:   since that paper only considers non-negative loss functions.

1015:   (Notice that even without assuming non-negativity

1016:   this term is very crude and can be easily improved.)

1017: \end{remark*}

1018:

1019: Now it is easy to prove Theorem \ref{thm:deterministic}.

1020: The definition of Markov-universality can be restated as follows:

1021: a prediction strategy outputting $\gamma_n$ is Markov-universal

1022: if and only if

1023: for any prediction rule $D$, any $m=1,2,\ldots$,

1024: and any $\epsilon>0$

1025: there exists $N_{D,m,\epsilon}$ such that, for any $N\ge N_{D,m,\epsilon}$

1026: and any $x_1,y_1,x_2,y_2,\ldots$,

1027: \begin{equation}\label{eq:dominates-deterministic-version}

1028:   \frac1N

1029:   \sum_{n=1}^N

1030:   \lambda

1031:   (\gamma_n,y_n)

1032:   \le

1033:   \frac1N

1034:   \sum_{n=1}^N

1035:   \lambda

1036:   \Bigl(

1037:     D(\phi_m(x_n)),y_n

1038:   \Bigr)

1039:   +

1040:   \epsilon.

1041: \end{equation}

1042: Let $\gamma_n$ be output by the WAA

1043: and let us consider any prediction rule $D$,

1044: any $m\in\{1,2,\ldots\}$, and any $\epsilon>0$.

1045: Choose $\delta>0$ such that

1046: $\left|\lambda(\gamma,y)-\lambda(\gamma',y)\right|<\epsilon/2$

1047: whenever $\rho(\gamma,\gamma')<\delta$

1048: and choose an $m$-elementary expert $D_K$ such that,

1049: for all $x\in\phi_m(\mathbf{X})$,

1050: $\rho(D(x),D_{K}(x))<\delta$.

1051:

1052: From (\ref{eq:lemma5}) we obtain

1053: \begin{multline}\label{eq:chain}

1054:   \frac1N

1055:   \sum_{n=1}^N

1056:   \lambda(\gamma_n,y_n)

1057:   -

1058:   \frac1N

1059:   \sum_{n=1}^N

1060:   \lambda

1061:   \Bigl(

1062:     D(\phi_m(x_n)),y_n

1063:   \Bigr)\\

1064:   \le

1065:   \frac1N

1066:   \sum_{n=1}^N

1067:   \lambda(\gamma_n,y_n)

1068:   -

1069:   \frac1N

1070:   \sum_{n=1}^N

1071:   \lambda

1072:   \Bigl(

1073:     D_{K}(\phi_m(x_n)),y_n

1074:   \Bigr)

1075:   +

1076:   \frac{\epsilon}{2}\\

1077:   =

1078:   \frac1N

1079:   \sum_{n=1}^N

1080:   \lambda(\gamma_n,y_n)

1081:   -

1082:   \frac1N

1083:   \sum_{n=1}^N

1084:   \lambda

1085:   \Bigl(

1086:     D_{K}(x_n),y_n

1087:   \Bigr)

1088:   +

1089:   \frac{\epsilon}{2}\\

1090:   \le

1091:   \left(

1092:     L^2e^L + \ln\frac{1}{q_{K}}

1093:   \right)

1094:   \frac{1}{\sqrt{N}}

1095:   +

1096:   \frac{\epsilon}{2};

1097: \end{multline}

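1098: now (\ref{eq:dominates-deterministic-version}) is obvious: the first addend in the last expression of (\ref{eq:chain}) does not depend on the data and is at most $\epsilon/2$ for large enough $N$.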

1099:

1100: \section{Proof of Theorem \ref{thm:randomized}}

1101: \label{sec:proof-randomized}

1102:

1103: \ifFULL\bluebegin

1104:   Unfortunately,

1105:   Theorem \ref{thm:deterministic} cannot be applied

1106:   to the extended game of prediction with the prediction space $\PPP(\Gamma)$ directly:

1107:   the theorem assumes that $\Gamma$ is a subset of a Banach space,

1108:   whereas,

1109:   even assuming $\Gamma$ compact,

1110:   the dual to an infinite-dimensional Banach space is never even metrizable

1111:   in the weak$^*$ topology

1112:   (\cite{rudin:1991}, 3.16).

1113:   The proof of Theorem \ref{thm:deterministic}, however,

1114:   still works for the new game.

1115: \blueend\fi

1116:

1117: A convenient pseudo-metric on $\Gamma$ can be defined by

1118: \begin{equation*}

1119:   \rho(g,g')

1120:   :=

1121:   \sup

1122:   \left\{

1123:     \left|\lambda(g,y)

1124:     -

1125:     \lambda(g',y)\right|

1126:     \st

1127:     y\in\mathbf{Y}

1128:   \right\},

1129:   \quad

1130:   g,g'\in\Gamma

1131: \end{equation*}

1132: (cf.\ \cite{dudley:2002}, Corollary 11.3.4).

1133: Let us redefine $\Gamma$ as the quotient space obtained from the original $\Gamma$

1134: by identifying $g$ and $g'$ for which $\rho(g,g')=0$

1135: (\cite{engelking:1989}, Section 2.4);

1136: in other words,

1137: we will not distinguish predictions that always lead to identical losses.

1138: Now $\rho$ becomes a metric on $\Gamma$.

1139: Let $\Gamma^*$ be a countable dense subset of the original topological space $\Gamma$

1140: (which is separable as a subset of a separable Banach space);

1141: the condition of equicontinuity implies that $\Gamma^*$

1142: (formally defined as the set of equivalence classes

1143: containing elements of the original $\Gamma^*$)

1144: remains a dense subset in $\Gamma$ equipped with the metric $\rho$.

1145: % We can see that $\Gamma$ remains a separable space.

1146:

1147: We define the norm of a function $f:\Gamma\to\bbbr$ as

1148: \begin{equation*}

1149:   \left\|f\right\|_{\BL}

1150:   :=

1151:   \sup_{g,g'\in\Gamma:g\ne g'}

1152:   \frac{\left|f(g)-f(g')\right|}{\rho(g,g')}

1153:   +

1154:   \sup_{g\in\Gamma}

1155:   \left|f(g)\right|;

1156: \end{equation*}

1157: this norm is finite for bounded Lipschitz functions

1158: (which form a Banach space under this norm:

1159: see \cite{dudley:2002}, Section 11.2).

1160: Notice that

1161: \begin{equation}\label{eq:BL-for-lambda}

1162:   \left\|\lambda\right\|_{\BL}

1163:   :=

1164:   \sup_{y\in\mathbf{Y}}

1165:   \left\|\lambda(\cdot,y)\right\|_{\BL}

1166:   <

1167:   \infty.

1168: \end{equation}

1169:

1170: Next define

1171: \begin{equation}\label{eq:expected-loss}

1172:   \lambda(\gamma,y)

1173:   :=

1174:   \int_{\Gamma}

1175:   \lambda(g,y)

1176:   \gamma(\dd g),

1177: \end{equation}

1178: where $\gamma$ is a probability measure on $\Gamma$.

1179: This is the loss function in a new game of prediction

1180: with the prediction space $\PPP(\Gamma)$;

1181: it is linear and, therefore, convex in $\gamma$.

1182: (In general,

1183: the role of randomization in this paper

1184: is to make the loss function convex in the prediction.)

1185:

1186: As a metric on $\PPP(\Gamma)$ we will take the Fortet--Mourier metric

1187: (\cite{dudley:2002}, Section 11.3)

1188: defined as

1189: \begin{equation*}

1190:   \beta(\gamma,\gamma')

1191:   :=

1192:   \sup_{f:\left\|f\right\|_{\BL}\le1}

1193:   \left|

1194:     \int_{\Gamma}

1195:     f

1196:     \D

1197:     (\gamma-\gamma')

1198:   \right|.

1199: \end{equation*}

1200: The topology on $\PPP(\Gamma)$ induced by this metric

1201: is called the \emph{topology of weak convergence}

1202: (\cite{billingsley:1968};

1203: weak convergence is called simply ``convergence'' in \cite{dudley:2002};

1204: for the proof of equivalence of several natural definitions

1205: of the topology of weak convergence,

1206: see \cite{dudley:2002}, Theorem 11.3.3).

1207:

1208: Let us check that the loss function (\ref{eq:expected-loss}) is also

1209: bounded Lipschitz, in the sense of (\ref{eq:BL-for-lambda}):

1210: if $\gamma,\gamma'\in\PPP(\Gamma)$ and $y\in\mathbf{Y}$,

1211: \begin{equation*}

1212:   \left|

1213:     \lambda(\gamma,y)

1214:     -

1215:     \lambda(\gamma',y)

1216:   \right|

1217:   =

1218:   \left|

1219:     \int_{\Gamma}

1220:     \lambda(g,y)

1221:     (\gamma-\gamma')

1222:     (\dd g)

1223:   \right|

1224:   \le

1225:   \left\|\lambda\right\|_{\BL}

1226:   \beta(\gamma,\gamma').

1227: \end{equation*}

1228:

1229: It is easy to see that the space $\PPP(\Gamma)$ with metric $\beta$ is separable:

1230: e.g., the set of probability measures concentrated on finite subsets of $\Gamma^*$

1231: and taking rational values is dense in $\PPP(\Gamma)$

1232: (cf.\ \cite{billingsley:1968}, Appendix III).

1233: Let us enumerate the elements of a dense countable set in $\PPP(\Gamma)$

1234: as $D_1,D_2,\ldots$;

1235: as in the previous section,

1236: we will use the WAA to merge all \emph{experts} $D_k$.

1237:

1238: The convergence of the mixture (\ref{eq:WAA}) to a probability measure on $\Gamma$

1239: is now obvious.

1240: The countable convexity (\ref{eq:countable-convexity})

1241: now holds with equality,

1242: \begin{equation*}

1243:   \lambda

1244:   \left(

1245:     \sum_{k=1}^{\infty}

1246:     p_n^{(k)}

1247:     D_k(x_n),

1248:     y_n

1249:   \right)

1250:   =

1251:   \sum_{k=1}^{\infty}

1252:   p_n^{(k)}

1253:   \lambda

1254:   \left(

1255:     D_k(x_n),

1256:     y_n

1257:   \right),

1258: \end{equation*}

1259: and follows from the general fact that

1260: \begin{equation*}

1261:   \int f \D \sum_{k=1}^{\infty} p_k P_k

1262:   =

1263:   \sum_{k=1}^{\infty}

1264:   p_k

1265:   \int f \D P_k

1266: \end{equation*}

1267: for bounded Borel $f:\Gamma\to\bbbr$,

1268: positive $p_1,p_2,\ldots$ summing to $1$,

1269: and $P_1,P_2,\ldots\in\PPP(\Gamma)$

1270: (this is obviously true for simple $f$

1271: and follows for arbitrary integrable $f$ from the definition of Lebesgue integral:

1272: see, e.g., \cite{dudley:2002}, Section 4.1).

1273:

1274: Therefore, it is easy to check

1275: that the chain (\ref{eq:chain}) still works

1276: (with $\PPP(\Gamma)$ equipped with metric $\beta$)

1277: and we can rephrase the previous section's result as follows.

1278: For any randomized prediction rule $D$, any $m=1,2,\ldots$,

1279: and any $\epsilon>0$

1280: there exists $N_{D,m,\epsilon}$ such that, for any $N\ge N_{D,m,\epsilon}$

1281: and any $x_1,y_1,x_2,y_2,\ldots$,

1282: the WAA's predictions $\gamma_n\in\PPP(\Gamma)$

1283: are guaranteed to satisfy

1284: \begin{equation}\label{eq:mean}

1285:   \frac1N

1286:   \sum_{n=1}^N

1287:   \lambda

1288:   (\gamma_n,y_n)

1289:   \le

1290:   \frac1N

1291:   \sum_{n=1}^N

1292:   \lambda

1293:   \Bigl(

1294:     D(\phi_m(x_n)),y_n

1295:   \Bigr)

1296:   +

1297:   \frac{\epsilon}{2}

1298: \end{equation}

1299: (cf.\  (\ref{eq:dominates-deterministic-version})).

1300:

1301: The loss function is bounded in absolute value

1302: by a constant $L$,

1303: and so the law of the iterated logarithm

1304: (in Kolmogorov's finitary form,

1305: \cite{kolmogorov:1929}, the end of the introductory section;

1306: the condition that the cumulative variance tends to infinity

1307: is easy to get rid of:

1308: see, e.g., \cite{shafer/vovk:2001}, (5.8))

1309: implies that for any $\delta>0$ there exists $N_{\delta}$

1310: such that the conjunction of

1311: \begin{equation*}

1312:   \sup_{N\ge N_{\delta}}

1313:   \left|

1314:     \sum_{n=1}^N

1315:     \bigl(

1316:       \lambda(g_n,y_n)

1317:       -

1318:       \lambda(\gamma_n,y_n)

1319:     \bigr)

1320:   \right|

1321:   \le

1322:   \sqrt{2.01 L^2 N\ln\ln N}

1323: \end{equation*}

1324: and

1325: \begin{equation*}

1326:   \sup_{N\ge N_{\delta}}

1327:   \left|

1328:     \sum_{n=1}^N

1329:     \bigl(

1330:       \lambda(d_n,y_n)

1331:       -

1332:       \lambda(D(\phi_m(x_n)),y_n)

1333:     \bigr)

1334:   \right|

1335:   \le

1336:   \sqrt{2.01 L^2 N\ln\ln N}

1337: \end{equation*}

1338: holds with probability at least $1-\delta$.

1339: Combining the last two inequalities with (\ref{eq:mean})

1340: we can see that for any randomized prediction rule $D$, any $m=1,2,\ldots$,

1341: any $\epsilon>0$, and any $\delta>0$

1342: there exists $N_{D,m,\epsilon,\delta}$ such that,

1343: for any $x_1,y_1,x_2,y_2,\ldots$,

1344: the WAA's responses $\gamma_n\in\PPP(\Gamma)$ to $x_1,y_1,x_2,y_2,\ldots$

1345: are guaranteed to satisfy

1346: \begin{equation*}

1347:   \sup_{N\ge N_{D,m,\epsilon,\delta}}

1348:   \left(

1349:     \frac1N

1350:     \sum_{n=1}^N

1351:     \lambda(g_n,y_n)

1352:     -

1353:     \frac1N

1354:     \sum_{n=1}^N

1355:     \lambda(d_n,y_n)

1356:   \right)

1357:   \le

1358:   \epsilon

1359: \end{equation*}

1360: with probability at least $1-\delta$.

1361: This is equivalent to the WAA (applied to $D_1,D_2,\ldots$)

1362: being a Markov-universal randomized prediction strategy.

1363:

1364: \section{Conclusion}

1365: \label{sec:conclusion}

1366:

1367: An interesting theoretical problem

1368: is to state more explicit versions

1369: of Theorems \ref{thm:deterministic} and \ref{thm:randomized}:

1370: for example,

1371: to give an explicit expression for $N_{D,m}$.

1372:

1373: The field of lossy compression is now well developed,

1374: and it would be interesting to apply our prediction algorithms

1375: (perhaps with the Weak Aggregating Algorithm replaced

1376: by an algorithm based on, say, gradient descent \cite{cesabianchi/lugosi:2006}

1377: or defensive forecasting \cite{\DFVII})

1378: to the approximation structures induced by popular lossy compression algorithms.

1379:

1380: \subsection*{Acknowledgments}

1381:

1382: This work was partially supported by MRC (grant S505/65).

1383:

1384: \begin{thebibliography}{10}

1385: \bibitem{billingsley:1968}

1386: Patrick Billingsley.

1387: \newblock {\em Convergence of Probability Measures}.

1388: \newblock Wiley, New York, 1968.

1389:

1390: \bibitem{cesabianchi/lugosi:2006}

1391: Nicol\`o Cesa-Bianchi and G\'abor Lugosi.

1392: \newblock {\em Prediction, Learning, and Games}.

1393: \newblock Cambridge University Press, Cambridge, 2006.

1394:

1395: \bibitem{dudley:2002}

1396: Richard~M. Dudley.

1397: \newblock {\em Real Analysis and Probability}, volume~74 of {\em Cambridge

1398:   Studies in Advanced Mathematics}.

1399: \newblock Cambridge University Press, Cambridge, England, revised edition,

1400:   2002.

1401:

1402: \bibitem{engelking:1989}

1403: Ryszard Engelking.

1404: \newblock {\em General Topology}, volume~6 of {\em Sigma Series in Pure

1405:   Mathematics}.

1406: \newblock Heldermann, Berlin, second edition, 1989.

1407:

1408: \bibitem{hardy/etal:1952}

1409: G\DOT{}~H\DOT{} Hardy, John~E\DOT{} Littlewood, and George P\'olya.

1410: \newblock {\em Inequalities}.

1411: \newblock Cambridge University Press, Cambridge, second edition, 1952.

1412:

1413: \bibitem{kalnishkan/vyugin:2005}

1414: Yuri Kalnishkan and Michael~V\DOT{} Vyugin.

1415: \newblock The {W}eak {A}ggregating {A}lgorithm and weak mixability.

1416: \newblock In Peter Auer and Ron Meir, editors, {\em Proceedings of the

1417:   Eighteenth Annual Conference on Learning Theory}, volume 3559 of {\em Lecture

1418:   Notes in Computer Science}, pages 188--203, Berlin, 2005. Springer.

1419: \newblock The journal version is being prepared for the Special Issue of

1420:   \emph{Journal of Machine Learning Research} devoted to COLT'2005; all

1421:   references are to the journal version.

1422:

1423: \bibitem{kolmogorov:1929}

1424: Andrei~N\DOT{} Kolmogorov.

1425: \newblock {\"U}ber das {G}esetz des iterierten {L}ogarithmus.

1426: \newblock {\em Mathematische Annalen}, 101:126--135, 1929.

1427:

1428: \bibitem{kolmogorov:1931}

1429: Andrei~N\DOT{} Kolmogorov.

1430: \newblock {\"U}ber die analytischen {M}ethoden in der

1431:   {W}ahrscheinlichkeitsrechnung.

1432: \newblock {\em Mathematische Annalen}, 104:415--458, 1931.

1433:

1434: \bibitem{kolmogorov/tikhomirov:1959latin}

1435: Andrei~N\DOT{} Kolmogorov and Vladimir~M\DOT{} Tikhomirov.

1436: \newblock $\epsilon$-entropy and $\epsilon$-capacity of sets in functional

1437:   spaces (in {R}ussian).

1438: \newblock {\em Uspekhi Matematicheskikh Nauk}, 14(2):3--86, 1959.

1439:

1440: \bibitem{shafer/vovk:2001}

1441: Glenn Shafer and \Vladimir{} Vovk.

1442: \newblock {\em Probability and Finance: It's Only a Game!}

1443: \newblock Wiley, New York, 2001.

1444:

1445: \bibitem{shiryaev:1989latin}

1446: Albert~N\DOT{} Shiryaev.

1447: \newblock Kolmogorov: life and creative activities.

1448: \newblock {\em Annals of Probability}, 17:866--944, 1989.

1449:

1450: \bibitem{tikhomirov:1987latin}

1451: Vladimir~M\DOT{} Tikhomirov.

1452: \newblock $\epsilon$-entropy and $\epsilon$-capacity (in {R}ussian).

1453: \newblock In Yury~V\DOT{} Prokhorov and Albert~N\DOT{} Shiryaev, editors, {\em

1454:   Kolmogorov. Teoriya In\-for\-ma\-tsii i Teoriya Algoritmov}, pages 262--269.

1455:   Nauka, Moscow, 1987.

1456:

1457: \bibitem{vovk:2001competitive}

1458: Vladimir Vovk.

1459: \newblock Competitive on-line statistics.

1460: \newblock {\em International Statistical Review}, 69:213--248, 2001.

1461:

1462: \bibitem{DF08arXiv}

1463: \Vladimir{} Vovk.

1464: \newblock Competing with stationary prediction strategies.

1465: \newblock Technical Report \texttt{arXiv:cs.LG/0607067}, \texttt{arXiv.org}

1466:   e-Print archive, July 2006.

1467:

1468: \bibitem{DF07arXiv}

1469: \Vladimir{} Vovk.

1470: \newblock Predictions as statements and decisions.

1471: \newblock Technical Report \texttt{arXiv:cs.LG/0606093}, \texttt{arXiv.org}

1472:   e-Print archive, June 2006.

1473:

1474: \end{thebibliography}

1475: \end{document}

1476: