cs0607136/cs0607136
1: % Last changed: 28 Jul 2006
2: % Spell checked: 28 Jul 2006
3: % 1438 lines, 39 KB
4: \newif\ifJOURNAL	% output-variant switches; one of them is selected below
5: \JOURNALfalse
6: \newif\ifCONF	% presumably the conference version -- TODO confirm
7: \CONFfalse
8: \newif\ifarXiv
9: \arXivfalse
10: \newif\ifWP
11: \WPfalse
12: \newif\ifFULL
13: \FULLfalse
14: 
15: \newif\ifLATIN	% switch for transliterating Cyrillic references (see \LATINtrue below)
16: \LATINfalse
17: 
18: %\JOURNALtrue		% choose JOURNAL, arXiv, WP, or FULL
19: %\CONFtrue
20: \arXivtrue		% currently selected variant
21: %\WPtrue
22: %\FULLtrue		% this version is not for publication and contains extra remarks and questions
23: 
24: %\LATINtrue		% LATIN means that the Cyrillic references should be set in Latin
25: \ifarXiv\LATINtrue\fi	% for submitting to arXiv
26: 
27: \newif\ifnotJOURNAL	% derivative conditional: true unless JOURNAL is selected
28: \notJOURNALtrue
29: \ifJOURNAL\notJOURNALfalse\fi
30: 
31: \newif\ifnotarXiv	% derivative conditional: true unless arXiv is selected
32: \notarXivtrue
33: \ifarXiv\notarXivfalse\fi
34: 
35: \newif\ifTR		% derivative conditionals (TR = arXiv or WP)
36: \TRfalse
37: \ifarXiv\TRtrue\fi
38: \ifWP\TRtrue\fi
39: \newif\ifnotTR		% complement of \ifTR: false when arXiv or WP is selected
40: \notTRtrue
41: \ifarXiv\notTRfalse\fi
42: \ifWP\notTRfalse\fi
43: 
44: \newif\ifnotLATIN	% derivative conditional: complement of \ifLATIN (must follow the \LATINtrue settings above)
45: \notLATINtrue
46: \ifLATIN\notLATINfalse\fi
47: 
48: \ifJOURNAL	% bibliography keys per variant. NOTE(review): \DFVII and \DFVIII are not defined in this branch although the body cites them -- confirm the keys before building with \JOURNALtrue
49:   \newcommand{\DFI}{vovk/etal:2005AIStatslocal}		% former \GTPVIII
50:   \newcommand{\DFII}{vovk/etal:2005ALT}			% former \GTPX
51:   \newcommand{\DFIII}{vovk:2005ALT-DF03}		% former \GTPXIII
52:   \newcommand{\DFIV}{vovk:2005ALT-DF04}			% former \GTPXIV
53:   \newcommand{\DFV}{DF05arXiv}				% former \GTPXI
54:   \newcommand{\DFVI}{DF06arXiv}				% former \GTPXVI
55: \fi	% NOTE(review): there is no \ifCONF branch, so none of the \DF... keys exist when CONF is selected
56: \ifarXiv
57:   \newcommand{\DFI}{DF01arXiv}		% former \GTPVIII
58:   \newcommand{\DFII}{DF02arXiv}		% former \GTPX
59:   \newcommand{\DFIII}{DF03arXiv}	% former \GTPXIII
60:   \newcommand{\DFIV}{DF04arXiv}		% former \GTPXIV
61:   \newcommand{\DFV}{DF05arXiv}		% former \GTPXI
62:   \newcommand{\DFVI}{DF06arXiv}		% former \GTPXVI
63:   \newcommand{\DFVII}{DF07arXiv}	% former \GTPXVII
64:   \newcommand{\DFVIII}{DF08arXiv}
65: \fi
66: \ifWP
67:   \newcommand{\DFI}{GTP8}		% former \GTPVIII
68:   \newcommand{\DFII}{GTP10}		% former \GTPX
69:   \newcommand{\DFIII}{GTP13}		% former \GTPXIII
70:   \newcommand{\DFIV}{GTP14}		% former \GTPXIV
71:   \newcommand{\DFV}{GTP11}		% former \GTPXI
72:   \newcommand{\DFVI}{GTP16}		% former \GTPXVI
73:   \newcommand{\DFVII}{GTP17}		% former \GTPXVII
74:   \newcommand{\DFVIII}{DF08arXiv}
75: \fi
76: \ifFULL	% same keys as the arXiv branch
77:   \newcommand{\DFI}{DF01arXiv}		% former \GTPVIII
78:   \newcommand{\DFII}{DF02arXiv}		% former \GTPX
79:   \newcommand{\DFIII}{DF03arXiv}	% former \GTPXIII
80:   \newcommand{\DFIV}{DF04arXiv}		% former \GTPXIV
81:   \newcommand{\DFV}{DF05arXiv}		% former \GTPXI
82:   \newcommand{\DFVI}{DF06arXiv}		% former \GTPXVI
83:   \newcommand{\DFVII}{DF07arXiv}	% former \GTPXVII
84:   \newcommand{\DFVIII}{DF08arXiv}
85: \fi
86: 
87: \ifnotLATIN	% bibliography keys of the two Cyrillic references: original entries
88:   \newcommand{\KolmogorovTikhomirov}{kolmogorov/tikhomirov:1959}
89:   \newcommand{\Tikhomirov}{tikhomirov:1987}
90: \fi
91: \ifLATIN	% the same two references, keys with the "latin" suffix
92:   \newcommand{\KolmogorovTikhomirov}{kolmogorov/tikhomirov:1959latin}
93:   \newcommand{\Tikhomirov}{tikhomirov:1987latin}
94: \fi
95: 
96: \ifJOURNAL	% each variant picks its own class and packages and defines \Extra
97: \documentclass{article}
98: \usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}
99: \newcommand{\Extra}[1]{}	% \Extra{...} discards its argument in this variant
100: \fi
101: 
102: \ifCONF
103: \documentclass{article}
104: \usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}
105: \newcommand{\Extra}[1]{}	% \Extra{...} discards its argument in this variant
106: \fi
107: 
108: \ifarXiv
109: \documentclass{article}
110: \usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}
111: \newcommand{\Extra}[1]{}	% \Extra{...} discards its argument in this variant
112: \fi
113: 
114: \ifWP
115: \documentclass{gtarticle}
116: \usepackage{amsmath,amsfonts,amssymb,latexsym,epsfig,graphicx}
117: \renewcommand{\Extra}[1]{#1}	% \Extra{...} prints its argument. NOTE(review): \renewcommand needs an existing \Extra -- presumably supplied by gtarticle; confirm
118: \fi
119: 
120: \ifFULL
121: \documentclass{article}
122: \usepackage{amsmath,amsfonts,amssymb,latexsym,color,epigraph,graphicx,eepic}
123: \newcommand{\Extra}[1]{\red{#1}}	% \Extra{...} prints its argument in red (\red is defined on the next line; it is only expanded at use time)
124: \newcommand{\red}[1]{\textcolor{red}{#1}}
125: \newcommand{\blue}[1]{\textcolor{blue}{#1}}
126: \newcommand{\bluebegin}{\begingroup\color{blue}}	% \bluebegin ... \blueend colours a longer passage blue
127: \newcommand{\blueend}{\endgroup}
128: \newcommand{\redbegin}{\begingroup\color{red}}	% \redbegin ... \redend colours a longer passage red
129: \newcommand{\redend}{\endgroup}
130: \fi
131: 
132: \emergencystretch=5mm	% line-breaking parameters: extra stretch allowed in a final pass
133: \tolerance=400	% maximum badness accepted for a line
134: \allowdisplaybreaks[4]	% amsmath: permit page breaks inside multi-line displays
135: 
136: \newcommand{\Vladimir}{Vladimir}	% not used in the visible part of the file
137: \newcommand{\DOT}{.}	% not used in the visible part of the file
138: 
139: \ifnotLATIN	% font support for references kept in Cyrillic (skipped when LATIN is on)
140: \input{OT2enc.def}
141: \newenvironment{cyr}
142: {\fontencoding{OT2}\fontfamily{wncyr}\fontseries{m}\fontshape{n}\selectfont}
143: {\fontencoding{OT1}\fontfamily{tir}\selectfont}
144: \usepackage{CJK}
145: \fi
146: 
147: \newcommand{\st}{\mathrel{\!|\!}}	% tightened vertical bar as a relation; used in set-builder notation
148: \newcommand{\givn}{\mathrel{|}}	% vertical bar as a relation, with normal relation spacing
149: \newcommand{\D}{\,\mathrm{d}}	% upright differential d preceded by a thin space (for integrals)
150: \newcommand{\dd}{\mathrm{d}}	% upright d without the leading space
151: 
152: \newcommand{\III}{\mathbb{I}}
153: \newcommand{\PPP}{\mathcal{P}}		% all probability measures
154: 
155: \newcommand{\BL}{\mathrm{BL}}		% bounded Lipschitz
156: 
157: \newcommand{\diam}{\mathop{\mathrm{diam}}\nolimits}	% diameter operator, sub/superscripts set at the side
158: 
159: \newcommand{\bbbp}{\mathbb{P}}		% auxiliary (probability)
160: \newcommand{\Prob}{\mathop{\bbbp}\nolimits}
161: \newcommand{\bbbe}{\mathbb{E}}		% auxiliary (expectation)
162: \newcommand{\Expect}{\mathop{\bbbe}\nolimits}
163: 
164: \newcommand{\cpc}{\mathop{\mathrm{cpc}}\nolimits}
165: \newcommand{\ucpc}{\mathop{\overline{\mathstrut\mathrm{cpc}}}\nolimits}	% overlined variant of \cpc
166: \newcommand{\lcpc}{\mathop{\underline{\mathstrut\mathrm{cpc}}}\nolimits}	% underlined variant of \cpc
167: 
168: \newcommand{\bbbr}{\mathbb{R}}		% the real numbers
169: 
170: \newtheorem{lemma}{Lemma}	% each theorem-like environment below has its own independent counter
171: \newtheorem{proposition}{Proposition}
172: \newtheorem{corollary}{Corollary}
173: \newtheorem{remark}{Remark}
174: \newtheorem{theorem}{Theorem}
175: \newenvironment{proof}	% hand-made proof environment with a bold run-in "Proof" label
176:   {\trivlist\item[\hskip\labelsep\textbf{Proof}]}
177:   {\endtrivlist}
178: 
179: \newenvironment{Proof}[1]	% as proof, but the argument is appended to the label, e.g. \begin{Proof}{of Lemma 1}
180:   {\trivlist\item[\hskip\labelsep\textbf{Proof #1\,}]}
181:   {\endtrivlist}
182: \newcommand{\boxforqed}{\rule{.3em}{1.5ex}}	% the end-of-proof mark: a solid rule
183: \newcommand{\qedtext}{\unskip\nobreak\hfil
184:   \penalty50\hskip1em\null\nobreak\hfil\boxforqed
185:   \parfillskip=0pt\finalhyphendemerits=0\endgraf}
186: %\newcommand{\qedmath}{\eqno\boxforqed}
187: \newcommand{\qedmath}{\tag*{\boxforqed}}	% end-of-proof mark for a proof that ends in a display (uses amsmath's \tag*)
188: \newenvironment{remark*}	% unnumbered remark with a bold run-in label
189:   {\trivlist\item[\hskip\labelsep{\bfseries Remark}]\relax}
190:   {\endtrivlist}
191: 
192: \ifJOURNAL	% title block per variant; the title is the same in every branch
193: \title{Competing with Markov prediction strategies}
194: \author{Vladimir Vovk\\[5mm]
195:  Computer Learning Research Centre\\
196:   Department of Computer Science\\
197:   Royal Holloway, University of London,
198:   Egham, Surrey TW20 0EX, UK\\
199:   \texttt{vovk@cs.rhul.ac.uk}}
200: \fi
201: 
202: \ifCONF	% identical to the JOURNAL title block
203: \title{Competing with Markov prediction strategies}
204: \author{Vladimir Vovk\\[5mm]
205:  Computer Learning Research Centre\\
206:   Department of Computer Science\\
207:   Royal Holloway, University of London,
208:   Egham, Surrey TW20 0EX, UK\\
209:   \texttt{vovk@cs.rhul.ac.uk}}
210: \fi
211: 
212: \ifarXiv	% short form: name, e-mail and home page only
213: \title{Competing with Markov prediction strategies}
214: \author{Vladimir Vovk\\
215: \texttt{vovk{\rm@}cs.rhul.ac.uk}\\
216: \texttt{http://vovk.net}}
217: \fi
218: 
219: \ifWP
220: \title{Competing with Markov prediction strategies}
221: \author{Vladimir Vovk}
222: \newcommand{\No}{20}	% presumably the working-paper number read by gtarticle -- TODO confirm
223: % For the two dates option: uncomment the next 2 lines
224: % \twodatestrue
225: % \newcommand{\firstposted}{July 13, 2006}
226: \fi
227: 
228: \ifFULL	% identical to the arXiv title block
229: \title{Competing with Markov prediction strategies}
230: \author{Vladimir Vovk\\
231: \texttt{vovk{\rm@}cs.rhul.ac.uk}\\
232: \texttt{http://vovk.net}}
233: \fi
234: 
235: \begin{document}
236: \maketitle
237: \begin{abstract}
238:   Assuming that the loss function is convex in the prediction,
239:   we construct a prediction strategy
240:   universal for the class of Markov prediction strategies,
241:   not necessarily continuous.
242:   Allowing randomization,
243:   we remove the requirement of convexity.
244: \end{abstract}
245: 
246: \section{Introduction}
247: \label{sec:introduction}
248: 
249: This paper belongs to the area of research
250: known as universal prediction of individual sequences
251: (see \cite{cesabianchi/lugosi:2006} for a review):
252: the predictor's goal is to compete with a wide benchmark class of prediction strategies.
253: In the previous papers \cite{\DFVII} and \cite{\DFVIII}
254: we constructed prediction strategies
255: competitive with the important classes of Markov and stationary,
256: respectively,
257: continuous prediction strategies.
258: In this paper we consider competing against possibly discontinuous strategies.
259: Our main results assert the existence of prediction strategies
260: competitive with the Markov strategies.
261: 
262: This paper's idea of transition from continuous to general benchmark classes
263: was motivated by Skorokhod's topology for the space $D$
264: of ``c\`adl\`ag'' functions, most of which are discontinuous.
265: Skorokhod's idea was to allow small deformations not only along the vertical axis
266: but also along the horizontal axis when defining neighborhoods.
267: Skorokhod's topology was metrized by Kolmogorov so that it became a separable space
268: (\cite{billingsley:1968}, Appendix III; \cite{shiryaev:1989latin}, p.~913),
269: which allows us to apply one of the numerous algorithms for prediction with expert advice
270: (Kalnishkan and Vyugin's Weak Aggregating Algorithm in this paper)
271: to construct a universal algorithm.
272: 
273: In Section \ref{sec:results} we give the main definitions and state our main results,
274: Theorems \ref{thm:deterministic} and \ref{thm:randomized};
275: their proofs are given in Sections \ref{sec:proof-deterministic} and \ref{sec:proof-randomized},
276: respectively.
277: 
278: \section{Main results}
279: \label{sec:results}
280: 
281: The \emph{game of prediction} between two players,
282: called Predictor and Reality,
283: is played according to the following protocol
284: (of \emph{perfect information},
285: in the sense that either player can see the other player's moves made so far).
286: 
287: \bigskip
288: 
289: \noindent
290: \textsc{Prediction protocol}\nopagebreak
291: \begin{tabbing}
292:   \qquad\=\qquad\=\qquad\kill
293:   FOR $n=1,2,\dots$:\\
294:   \> Reality announces $x_n\in\mathbf{X}$.\\
295:   \> Predictor announces $\gamma_n\in\Gamma$.\\
296:   \> Reality announces $y_n\in\mathbf{Y}$.\\
297:   END FOR.
298: \end{tabbing}
299: 
300: \noindent
301: The game proceeds in rounds numbered by the positive integers $n$.
302: At the beginning of each round $n=1,2,\ldots$ Predictor is given some \emph{signal} $x_n$
303: relevant to predicting the following \emph{observation} $y_n$.
304: The signal is taken from the \emph{signal space} $\mathbf{X}$
305: and the observation from the \emph{observation space} $\mathbf{Y}$.
306: Predictor then announces his prediction $\gamma_n$,
307: taken from the \emph{prediction space} $\Gamma$,
308: and the prediction's quality in light of the actual observation
309: is measured by a \emph{loss function}
310: $\lambda:\Gamma\times\mathbf{Y}\to\bbbr$.
311: 
312: We will always assume that the signal space $\mathbf{X}$,
313: the prediction space $\Gamma$,
314: and the observation space $\mathbf{Y}$
315: are non-empty sets;
316: $\mathbf{X}$ and $\Gamma$ will often be equipped with additional structures.
317: 
318: \subsection*{Markov-universal prediction strategies: deterministic case}
319: 
320: Predictor's strategies in the prediction protocol will be called
321: \emph{prediction strategies}.
322: Formally such a strategy is a function
323: \begin{equation*}
324:   D:
325:   \bigcup_{n=1}^{\infty}
326:   \left(
327:     \mathbf{X}\times\mathbf{Y}
328:   \right)^{n-1}
329:   \times
330:   \mathbf{X}
331:   \to
332:   \Gamma;
333: \end{equation*}
334: it maps each history $(x_1,y_1,\ldots,x_{n-1},y_{n-1},x_n)$ to the chosen prediction.
335: In this paper we will be especially interested
336: in \emph{Markov strategies},
337: which are functions $D:\mathbf{X}\to\Gamma$;
338: intuitively,
339: $D(x_n)$ is the recommended prediction on round $n$.
340: The restriction to Markov strategies
341: is not a severe one,
342: since the signal $x_n$ can encode as much of the past as we want
343: (cf.\ \cite{kolmogorov:1931}, footnote 1);
344: in particular, $x_n$ can contain information about the previous observations
345: $y_1,\ldots,y_{n-1}$.
346: In this paper
347: Markov prediction strategies will also be called \emph{prediction rules}
348: (as in \cite{\DFVII};
349: in a more general context, however, it would be risky to omit ``Markov''
350: since ``prediction rule'' is too easy to confuse with ``prediction strategy'').
351: 
352: For both our theorems we will need the notion of ``approximation''
353: to a signal $x\in\mathbf{X}$;
354: intuitively, the ``$m$-approximation'' of $x$ is another signal $\phi_m(x)$
355: which is as close to $x$ as possible but carries only $m$ bits of information.
356: If $\mathbf{X}=[0,1]$,
357: a reasonable definition of $\phi_m(x)$ would be to take the binary expansion of $x$
358: but remove all the binary digits starting from the $(m+1)$th after the binary point.
359: In general,
360: we will have to equip $\mathbf{X}$ with an ``approximation structure'';
361: we will do this following Kolmogorov and Tikhomirov
362: (\cite{\Tikhomirov}, Section 2,
363: \cite{shiryaev:1989latin}, p.~913% this is p.~49 of 80 in the file
364: \ifFULL\bluebegin, \cite{tikhomirov:1976}\blueend\fi).
365: 
366: Consider a sequence of mappings $\phi_m:\mathbf{X}\to\mathbf{X}$,
367: $m=1,2,\ldots$,
368: such that each $\phi_m$ is idempotent,
369: in the sense $\phi_m(\phi_m(x))=\phi_m(x)$ for all $x\in\mathbf{X}$,
370: and $\phi_m(\mathbf{X})$ contains $2^m$ elements.
371: (Such mappings are coding-theory analogues of projections in linear algebra
372: and contractions in topology;
373: $\phi_m(x)$ can be thought of as the result of encoding $x$,
374: sending it over an $m$-bit channel,
375: and restoring $x$ as well as possible at the receiving end.)
376: It is the sequence $\phi=\{\phi_m\st m=1,2,\ldots\}$
377: that will be referred to as an \emph{approximation structure}.
378: 
379: If $\mathbf{X}$ is a totally bounded (say, compact) metric space,
380: there is an approximation structure $\phi$ such that
381: \begin{equation}\label{eq:fine}
382:   \lim_{m\to\infty}
383:   \rho
384:   \left(
385:     x,
386:     \phi_m(x)
387:   \right)
388:   =
389:   0
390: \end{equation}
391: uniformly in $x\in\mathbf{X}$.
392: (We often let $\rho$ stand for the metric in various metric spaces,
393: always clear from the context.)
394: In fact,
395: the \emph{$m$th Kolmogorov diameter}
396: \begin{equation*}
397:   \mathcal{K}_m(\mathbf{X})
398:   :=
399:   \frac12
400:   \inf_{\phi}
401:   \sup_{x\in\mathbf{X}}
402:   \diam
403:   \left(
404:     \phi_m^{-1}(\phi_m(x))
405:   \right)
406: \end{equation*}
407: of $\mathbf{X}$ is essentially the inverse function
408: to the $\epsilon$-entropy $\mathcal{H}_{\epsilon}(\mathbf{X})$.
409: See \cite{\KolmogorovTikhomirov}
410: for precise values and estimates of $\mathcal{K}_m(\mathbf{X})$
411: for numerous totally bounded metric spaces $\mathbf{X}$.
412: 
413: A prediction strategy is \emph{Markov-universal} for a loss function $\lambda$
414: and an approximation structure $\phi$
415: if it guarantees that
416: for any prediction rule $D$ and any $m=1,2,\ldots$
417: there exists a number $N_{D,m}$ such that for any $N\ge N_{D,m}$
418: and any sequence $x_1,y_1,x_2,y_2,\ldots$ of Reality's moves
419: its responses $\gamma_n$ satisfy
420: \begin{equation*} % \label{eq:dominates-deterministic}
421:   \frac1N
422:   \sum_{n=1}^N
423:   \lambda
424:   (\gamma_n,y_n)
425:   \le
426:   \frac1N
427:   \sum_{n=1}^N
428:   \lambda
429:   \Bigl(
430:     D(\phi_m(x_n)),y_n
431:   \Bigr)
432:   +
433:   2^{-m}.
434: \end{equation*}
435: \begin{theorem}\label{thm:deterministic}
436:   Suppose $\mathbf{X}$ is equipped with an approximation structure $\phi$,
437:   $\Gamma$ is a closed convex subset of a separable Banach space,
438:   and the loss function $\lambda(\gamma,y)$
439:   is bounded, convex in the variable $\gamma\in\Gamma$,
440:   and uniformly continuous in $\gamma\in\Gamma$
441:   uniformly in $y\in\mathbf{Y}$.
442:   There exists a prediction strategy that is Markov-universal for $\lambda$ and $\phi$.
443: \end{theorem}
444: A Markov-universal prediction strategy will be constructed in the next section.
445: Theorem \ref{thm:deterministic} says that, under its conditions,
446: \begin{equation}\label{eq:simpler}
447:   \limsup_{N\to\infty}
448:   \left(
449:     \frac1N
450:     \sum_{n=1}^N
451:     \lambda
452:     (\gamma_n,y_n)
453:     -
454:     \frac1N
455:     \sum_{n=1}^N
456:     \lambda
457:     \Bigl(
458:       D(\phi_m(x_n)),y_n
459:     \Bigr)
460:   \right)
461:   \le
462:   0
463: \end{equation}
464: uniformly in $x_1,y_1,x_2,y_2,\ldots$
465: for all $m=1,2,\ldots$ and all $D:\mathbf{X}\to\Gamma$.
466: % This statement is cruder than Theorem \ref{thm:deterministic} itself
467: % but slightly simpler.
468: 
469: If $\mathbf{X}$ is a compact metric space and (\ref{eq:fine})
470: holds uniformly in $x\in\mathbf{X}$,
471: (\ref{eq:simpler}) implies
472: \begin{equation*}
473:   \limsup_{N\to\infty}
474:   \left(
475:     \frac1N
476:     \sum_{n=1}^N
477:     \lambda
478:     (\gamma_n,y_n)
479:     -
480:     \frac1N
481:     \sum_{n=1}^N
482:     \lambda
483:     (D(x_n),y_n)
484:   \right)
485:   \le
486:   0
487: \end{equation*}
488: for all continuous prediction rules $D$;
489: this is close to Theorem 1 in \cite{\DFVII}.
490: The advance of this paper as compared to \cite{\DFVII} is that our main results
491: do not assume that $D$ is continuous.
492: 
493: \subsection*{Markov-universal prediction strategies: randomized case}
494: 
495: When the loss function $\lambda(\gamma,y)$ is not required to be convex in $\gamma$,
496: the conclusion of Theorem \ref{thm:deterministic} may become false
497: (\cite{kalnishkan/vyugin:2005}, Theorem 2).
498: The situation changes if we consider randomized prediction strategies.
499: 
500: A \emph{randomized prediction strategy} is a function
501: \begin{equation*}
502:   D:
503:   \bigcup_{n=1}^{\infty}
504:   (\mathbf{X}\times\mathbf{Y})^{n-1}\times\mathbf{X}
505:   \to
506:   \PPP(\Gamma)
507: \end{equation*}
508: mapping the past to the probability measures on the prediction space.
509: In other words, this is a strategy for Predictor
510: in the extended game of prediction with the prediction space $\PPP(\Gamma)$.
511: A \emph{Markov randomized prediction strategy},
512: or \emph{randomized prediction rule} for brevity,
513: is a function $D:\mathbf{X}\to\PPP(\Gamma)$.
514: 
515: We will say that a randomized prediction strategy outputting $\gamma_n$
516: is \emph{Markov-universal} for a loss function $\lambda$ and an approximation structure $\phi$ if,
517: for any randomized prediction rule $D$ and any $m=1,2,\ldots$,
518: there exists $N_{D,m}$ such that,
519: for any sequence $x_{1},y_{1},x_{2},y_{2},\ldots$ of Reality's moves,
520: \begin{equation}\label{eq:dominates-randomized}
521:   \sup_{N\ge N_{D,m}}
522:   \left(
523:     \frac1N
524:     \sum_{n=1}^N
525:     \lambda(g_{n},y_n)
526:     -
527:     \frac1N
528:     \sum_{n=1}^N
529:     \lambda(d_{n},y_n)
530:   \right)
531:   \le
532:   2^{-m}
533: \end{equation}
534: with probability at least $1-2^{-m}$,
535: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent random variables
536: distributed as
537: \begin{equation}\label{eq:distributed}
538:   g_{n}
539:   \sim
540:   \gamma_n,
541:   \enspace
542:   d_{n}
543:   \sim
544:   D(\phi_m(x_n)),
545:   \quad
546:   n=1,2,\ldots\,.
547: \end{equation}
548: Intuitively,
549: the word ``probability'' after (\ref{eq:dominates-randomized})
550: refers only to the prediction strategies' internal randomization;
551: it is not assumed that Reality behaves stochastically.
552: We will use this definition only in the case
553: where the loss function $\lambda$ is continuous in the prediction,
554: and so (\ref{eq:dominates-randomized}) will indeed be an event
555: having a probability.
556: \begin{theorem}\label{thm:randomized}
557:   Suppose the signal space $\mathbf{X}$ is equipped with an approximation structure $\phi$,
558:   $\Gamma$ is a separable topological space,
559:   and the loss function $\lambda$ is bounded
560:   and such that the set of functions $\{\lambda(\cdot,y)\st y\in\mathbf{Y}\}$
561:   is equicontinuous.
562:   There exists a randomized prediction strategy
563:   that is Markov-universal for $\lambda$ and $\phi$.
564: \end{theorem}
565: A Markov-universal prediction strategy is constructed in Section \ref{sec:proof-randomized}.
566: The randomized version of (\ref{eq:simpler}),
567: immediately following from Theorem \ref{thm:randomized},
568: is
569: \begin{equation*}
570:   \limsup_{N\to\infty}
571:   \left(
572:     \frac1N
573:     \sum_{n=1}^N
574:     \lambda
575:     (g_n,y_n)
576:     -
577:     \frac1N
578:     \sum_{n=1}^N
579:     \lambda
580:     (d_n,y_n)
581:   \right)
582:   \le
583:   0
584:   \quad
585:   \text{a.s.},
586: \end{equation*}
587: for all $m=1,2,\ldots$ and all $D:\mathbf{X}\to\PPP(\Gamma)$,
588: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent
589: and distributed as (\ref{eq:distributed}).
590: \ifFULL\bluebegin
591: If $\mathbf{X}$ is a compact metric space and (\ref{eq:fine})
592: holds uniformly in $x$,
593: one might be able to obtain the following analogue of Theorem 2 in \cite{\DFVII}:
594: for continuous prediction rules $D$,
595: \begin{equation*}
596:   \limsup_{N\to\infty}
597:   \left(
598:     \frac1N
599:     \sum_{n=1}^N
600:     \lambda
601:     (g_n,y_n)
602:     -
603:     \frac1N
604:     \sum_{n=1}^N
605:     \lambda
606:     (d_n,y_n)
607:   \right)
608:   \le
609:   0
610:   \quad
611:   \text{a.s.},
612: \end{equation*}
613: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent
614: and distributed as
615: \begin{equation*}
616:   g_{n}
617:   \sim
618:   \gamma_n,
619:   \enspace
620:   d_{n}
621:   \sim
622:   D(x_n),
623:   \quad
624:   n=1,2,\ldots\,.
625: \end{equation*}
626: \blueend\fi
627: 
628: \section{Proof of Theorem \ref{thm:deterministic}}
629: \label{sec:proof-deterministic}
630: 
631: Let us fix a dense countable subset $\Gamma^*$ of $\Gamma$.
632: We will say that a function $D:\mathbf{X}\to\Gamma$
633: is \emph{$m$-elementary} if $D(\mathbf{X})\subseteq\Gamma^*$
634: and $D(x)$ depends on $x$ only via $\phi_m(x)$;
635: a function is \emph{elementary} if it is $m$-elementary for some $m$.
636: There are countably many elementary functions;
637: let us enumerate them as $D_1,D_2,\ldots$\,.
638: We will refer to these functions as \emph{experts}.
639: We will apply a special case of Kalnishkan and Vyugin's
640: \cite{kalnishkan/vyugin:2005}
641: Weak Aggregating Algorithm (WAA) to the sequence of experts
642: (as in \cite{\DFVIII}).
643: 
644: Let $q_1,q_2,\ldots$ be a sequence of positive numbers summing to 1,
645: $\sum_{k=1}^{\infty}q_k=1$.
646: Define
647: \begin{equation*}
648:   l_n^{(k)}
649:   :=
650:   \lambda
651:   \left(
652:     D_k(x_n),y_n
653:   \right),
654:   \quad
655:   L_N^{(k)}
656:   :=
657:   \sum_{n=1}^N
658:   l_n^{(k)}
659: \end{equation*}
660: to be the instantaneous loss of the $k$th expert $D_k$ on the $n$th round
661: and his cumulative loss over the first $N$ rounds.
662: For all $n,k=1,2,\ldots$ define
663: \begin{equation*}
664:   w_n^{(k)}
665:   :=
666:   q_k
667:   \beta_n^{L_{n-1}^{(k)}},
668:   \quad
669:   \beta_n
670:   :=
671:   \exp
672:   \left(
673:     -\frac{1}{\sqrt{n}}
674:   \right)
675: \end{equation*}
676: ($w_n^{(k)}$ are the weights of the experts to use on round $n$)
677: and
678: \begin{equation*}
679:   p_n^{(k)}
680:   :=
681:   \frac
682:   {w_n^{(k)}}
683:   {\sum_{j=1}^{\infty}w_n^{(j)}}
684: \end{equation*}
685: (the normalized weights;
686: it is obvious that the denominator is positive and finite).
687: The WAA's prediction on round $n$ is
688: \begin{equation}\label{eq:WAA}
689:   \gamma_n
690:   :=
691:   \sum_{k=1}^{\infty}
692:   p_n^{(k)}
693:   D_k(x_n).
694: \end{equation}
695: To make this series convergent,
696: we may take $q_k:=2^{-k}$ and reorder $D_k$ so that
697: $\sup_x\left\|D_k(x)\right\|\le k$ for all $k$.
698: In this case we will automatically have $\gamma_n\in\Gamma$ since
699: \begin{multline}\label{eq:convergence-to-0}
700:   \gamma_n
701:   -
702:   \sum_{k=1}^K
703:   \frac{p_n^{(k)}}{\sum_{j=1}^K p_n^{(j)}}
704:   D_k(x_n)\\
705:   =
706:   \sum_{k=1}^K
707:   \left(
708:     1
709:     -
710:     \frac{1}{\sum_{j=1}^K p_n^{(j)}}
711:   \right)
712:   p_n^{(k)}
713:   D_k(x_n)
714:   +
715:   \sum_{k=K+1}^{\infty}
716:   p_n^{(k)}
717:   D_k(x_n)
718:   \to
719:   0
720: \end{multline}
721: as $K\to\infty$.
722: 
723: Let $l_n:=\lambda(\gamma_n,y_n)$ be the WAA's loss on round $n$
724: and
725: $
726:   L_N
727:   :=
728:   \sum_{n=1}^N
729:   l_n
730: $
731: be its cumulative loss over the first $N$ rounds.
732: \begin{lemma}[\cite{kalnishkan/vyugin:2005}, Lemma 9]\label{lem:9}
733:   The WAA guarantees that, for all $N=1,2,\ldots$,
734:   \begin{equation}\label{eq:lemma9}
735:     L_N
736:     \le
737:     \sum_{n=1}^N
738:     \sum_{k=1}^{\infty}
739:     p_n^{(k)}
740:     l_n^{(k)}
741:     -
742:     \sum_{n=1}^N
743:     \log_{\beta_n}
744:     \sum_{k=1}^{\infty}
745:     p_n^{(k)}
746:     \beta_n^{l_n^{(k)}}
747:     +
748:     \log_{\beta_N}
749:     \sum_{k=1}^{\infty}
750:     q_k
751:     \beta_N^{L_N^{(k)}}.
752:   \end{equation}
753: \end{lemma}
754: The first two terms on the right-hand side of (\ref{eq:lemma9})
755: are sums over the first $N$ rounds of different kinds of mean of the experts' losses
756: (see, e.g., \cite{hardy/etal:1952}, Chapter III,
757: for a general definition of the mean);
758: we will see later that they nearly cancel each other out.
759: If those two terms are ignored,
760: the remaining part of (\ref{eq:lemma9}) is identical
761: (except that $\beta$ now depends on $n$)
762: to the main property of the ``Aggregating Algorithm''
763: (see, e.g., \cite{vovk:2001competitive}, Lemma 1).
764: All infinite series in (\ref{eq:lemma9}) are trivially convergent.
765: 
766: In the proof of Lemma \ref{lem:9} we will use the following property
767: of ``countable convexity'' of $\lambda$:
768: \begin{equation}\label{eq:countable-convexity}
769:   l_n\le\sum_{k=1}^{\infty}p_n^{(k)}l_n^{(k)}.
770: \end{equation}
771: This property follows from (\ref{eq:convergence-to-0}) and
772: \begin{equation*}
773:   \lambda
774:   \left(
775:     \sum_{k=1}^K
776:     \frac{p_n^{(k)}}{\sum_{j=1}^K p_n^{(j)}}
777:     D_k(x_n),
778:     y_n
779:   \right)
780:   \le
781:   \sum_{k=1}^K
782:   \frac{p_n^{(k)}}{\sum_{j=1}^K p_n^{(j)}}
783:   \lambda
784:   \left(
785:     D_k(x_n),
786:     y_n
787:   \right)
788: \end{equation*}
789: if we let $K\to\infty$.
790: 
791: \begin{Proof}{of Lemma \ref{lem:9}}
792:   The proof is by induction on $N$.
793:   For $N=1$,
794:   (\ref{eq:lemma9}) follows from the countable convexity (\ref{eq:countable-convexity})
795:   and $p_1^{(k)}=q_k$.
796:   Assuming (\ref{eq:lemma9}),
797:   we obtain
798:   \begin{multline*}
799:     L_{N+1}
800:     =
801:     L_N + l_{N+1}
802:     \le
803:     L_N
804:     +
805:     \sum_{k=1}^{\infty}
806:     p_{N+1}^{(k)}
807:     l_{N+1}^{(k)}\\
808:     \le
809:     \sum_{n=1}^{N+1}
810:     \sum_{k=1}^{\infty}
811:     p_n^{(k)}
812:     l_n^{(k)}
813:     -
814:     \sum_{n=1}^N
815:     \log_{\beta_n}
816:     \sum_{k=1}^{\infty}
817:     p_n^{(k)}
818:     \beta_n^{l_n^{(k)}}
819:     +
820:     \log_{\beta_N}
821:     \sum_{k=1}^{\infty}
822:     q_k
823:     \beta_N^{L_N^{(k)}}
824:   \end{multline*}
825:   (the first ``$\le$'' again used the countable convexity (\ref{eq:countable-convexity})).
826:   Therefore,
827:   it remains to prove
828:   \begin{equation*}
829:     \log_{\beta_N}
830:     \sum_{k=1}^{\infty}
831:     q_k
832:     \beta_N^{L_N^{(k)}}
833:     \le
834:     -\log_{\beta_{N+1}}
835:     \sum_{k=1}^{\infty}
836:     p_{N+1}^{(k)}
837:     \beta_{N+1}^{l_{N+1}^{(k)}}
838:     +
839:     \log_{\beta_{N+1}}
840:     \sum_{k=1}^{\infty}
841:     q_k
842:     \beta_{N+1}^{L_{N+1}^{(k)}}.
843:   \end{equation*}
844:   By the definition of $p_n^{(k)}$
845:   this can be rewritten as
846:   \begin{equation*}
847:     \log_{\beta_N}
848:     \sum_{k=1}^{\infty}
849:     q_k
850:     \beta_N^{L_N^{(k)}}
851:     \le
852:     -\log_{\beta_{N+1}}
853:     \frac
854:     {
855:       \sum_{k=1}^{\infty}
856:       q_k
857:       \beta_{N+1}^{L_{N}^{(k)}}
858:       \beta_{N+1}^{l_{N+1}^{(k)}}
859:     }
860:     {
861:       \sum_{k=1}^{\infty}
862:       q_k
863:       \beta_{N+1}^{L_{N}^{(k)}}
864:     }
865:     +
866:     \log_{\beta_{N+1}}
867:     \sum_{k=1}^{\infty}
868:     q_k
869:     \beta_{N+1}^{L_{N+1}^{(k)}},
870:   \end{equation*}
871:   which after cancellation becomes
872:   \begin{equation}\label{eq:to-check}
873:     \log_{\beta_N}
874:     \sum_{k=1}^{\infty}
875:     q_k
876:     \beta_N^{L_N^{(k)}}
877:     \le
878:     \log_{\beta_{N+1}}
879:     \sum_{k=1}^{\infty}
880:     q_k
881:     \beta_{N+1}^{L_{N}^{(k)}}.
882:   \end{equation}
883:   The last inequality follows from the general result
884:   about comparison of different means
885:   (\cite{hardy/etal:1952}, Theorem 85),
886:   but we can also check it directly
887:   (following \cite{kalnishkan/vyugin:2005}).
888:   Let $\beta_{N+1}=\beta_N^a$,
889:   where $0<a<1$.
890:   Then (\ref{eq:to-check}) can be rewritten as
891:   \begin{equation*}
892:     \left(
893:       \sum_{k=1}^{\infty}
894:       q_k
895:       \beta_N^{L_N^{(k)}}
896:     \right)^a
897:     \ge
898:     \sum_{k=1}^{\infty}
899:     q_k
900:     \beta_{N}^{aL_{N}^{(k)}},
901:   \end{equation*}
902:   and the last inequality follows from the concavity of the function $t\mapsto t^a$.
903:   \qedtext
904: \end{Proof}
905: 
906: \begin{lemma}[\cite{kalnishkan/vyugin:2005}, Lemma 5]
907:   Let $L$ be an upper bound on $\left|\lambda\right|$.
908:   The WAA guarantees that, for all $N$ and $K$,
909:   \begin{equation}\label{eq:lemma5}
910:     L_N
911:     \le
912:     L_N^{(K)}
913:     +
914:     \left(
915:       L^2 e^L + \ln\frac{1}{q_K}
916:     \right)
917:     \sqrt{N}.
918:   \end{equation}
919: \end{lemma}
920: \begin{proof}
921:   From (\ref{eq:lemma9}),
922:   we obtain:
923:   \begin{align*}
924:     L_N
925:     &\le
926:     \sum_{n=1}^N
927:     \sum_{k=1}^{\infty}
928:     p_n^{(k)}
929:     l_n^{(k)}
930:     +
931:     \sum_{n=1}^N
932:     \sqrt{n}
933:     \ln
934:     \sum_{k=1}^{\infty}
935:     p_n^{(k)}
936:     \exp
937:     \left(
938:       -\frac{l_n^{(k)}}{\sqrt{n}}
939:     \right)
940:     +
941:     \log_{\beta_N}
942:     q_K
943:     +
944:     L_N^{(K)}\\
945:     &\le
946:     \sum_{n=1}^N
947:     \sum_{k=1}^{\infty}
948:     p_n^{(k)}
949:     l_n^{(k)}
950:     +
951:     \sum_{n=1}^N
952:     \sqrt{n}
953:     \left(
954:       \sum_{k=1}^{\infty}
955:       p_n^{(k)}
956:       \left(
957:         1
958:         -
959:         \frac{l_n^{(k)}}{\sqrt{n}}
960:         +
961:         \frac{\left(l_n^{(k)}\right)^2}{2n}
962:         e^L
963:       \right)
964:       -
965:       1
966:     \right)\\
967:     &\quad{}+
968:     \log_{\beta_N}
969:     q_K
970:     +
971:     L_N^{(K)}\\
972:     &=
973:     L_N^{(K)}
974:     +
975:     \frac12
976:     \sum_{n=1}^N
977:     \frac{1}{\sqrt{n}}
978:     \sum_{k=1}^{\infty}
979:     p_n^{(k)}
980:     \left(l_n^{(k)}\right)^2
981:     e^L
982:     +
983:     \sqrt{N}\ln\frac{1}{q_K}\\
984:     &\le
985:     L_N^{(K)}
986:     +
987:     \frac{L^2e^L}{2}
988:     \sum_{n=1}^N
989:     \frac{1}{\sqrt{n}}
990:     +
991:     \sqrt{N}\ln\frac{1}{q_K}
992:     \le
993:     L_N^{(K)}
994:     +
995:     \frac{L^2e^L}{2}
996:     \int_0^N
997:     \frac{\D t}{\sqrt{t}}
998:     +
999:     \sqrt{N}\ln\frac{1}{q_K}\\
1000:     &=
1001:     L_N^{(K)}
1002:     +
1003:     L^2e^L\sqrt{N}
1004:     +
1005:     \sqrt{N}\ln\frac{1}{q_K}
1006:   \end{align*}
1007:   (in the second ``$\le$'' we used the inequalities $e^t\le1+t+\frac{t^2}{2}e^{\left|t\right|}$
1008:   and $\ln t\le t-1$).
1009:   \qedtext
1010: \end{proof}
1011: 
1012: \begin{remark*}
1013:   There is no term $e^L$ in \cite{kalnishkan/vyugin:2005}
1014:   since that paper only considers non-negative loss functions.
1015:   (Notice that even without assuming non-negativity
1016:   this term is very crude and can be easily improved.)
1017: \end{remark*}
1018: 
1019: Now it is easy to prove Theorem \ref{thm:deterministic}.
1020: The definition of Markov-universality can be restated as follows:
1021: a prediction strategy outputting $\gamma_n$ is Markov-universal
1022: if and only if
1023: for any prediction rule $D$, any $m=1,2,\ldots$,
1024: and any $\epsilon>0$
1025: there exists $N_{D,m,\epsilon}$ such that, for any $N\ge N_{D,m,\epsilon}$
1026: and any $x_1,y_1,x_2,y_2,\ldots$,
1027: \begin{equation}\label{eq:dominates-deterministic-version}
1028:   \frac1N
1029:   \sum_{n=1}^N
1030:   \lambda
1031:   (\gamma_n,y_n)
1032:   \le
1033:   \frac1N
1034:   \sum_{n=1}^N
1035:   \lambda
1036:   \Bigl(
1037:     D(\phi_m(x_n)),y_n
1038:   \Bigr)
1039:   +
1040:   \epsilon.
1041: \end{equation}
1042: Let $\gamma_n$ be output by the WAA
1043: and let us consider any prediction rule $D$,
1044: any $m\in\{1,2,\ldots\}$, and any $\epsilon>0$.
1045: Choose $\delta>0$ such that
1046: $\left|\lambda(\gamma,y)-\lambda(\gamma',y)\right|<\epsilon/2$
1047: whenever $\rho(\gamma,\gamma')<\delta$
1048: and choose an $m$-elementary expert $D_K$ such that,
1049: for all $x\in\phi_m(\mathbf{X})$,
1050: $\rho(D(x),D_{K}(x))<\delta$.
1051: 
1052: From (\ref{eq:lemma5}) we obtain
1053: \begin{multline}\label{eq:chain}
1054:   \frac1N
1055:   \sum_{n=1}^N
1056:   \lambda(\gamma_n,y_n)
1057:   -
1058:   \frac1N
1059:   \sum_{n=1}^N
1060:   \lambda
1061:   \Bigl(
1062:     D(\phi_m(x_n)),y_n
1063:   \Bigr)\\
1064:   \le
1065:   \frac1N
1066:   \sum_{n=1}^N
1067:   \lambda(\gamma_n,y_n)
1068:   -
1069:   \frac1N
1070:   \sum_{n=1}^N
1071:   \lambda
1072:   \Bigl(
1073:     D_{K}(\phi_m(x_n)),y_n
1074:   \Bigr)
1075:   +
1076:   \frac{\epsilon}{2}\\
1077:   =
1078:   \frac1N
1079:   \sum_{n=1}^N
1080:   \lambda(\gamma_n,y_n)
1081:   -
1082:   \frac1N
1083:   \sum_{n=1}^N
1084:   \lambda
1085:   \Bigl(
1086:     D_{K}(x_n),y_n
1087:   \Bigr)
1088:   +
1089:   \frac{\epsilon}{2}\\
1090:   \le
1091:   \left(
1092:     L^2e^L + \ln\frac{1}{q_{K}}
1093:   \right)
1094:   \frac{1}{\sqrt{N}}
1095:   +
1096:   \frac{\epsilon}{2};
1097: \end{multline}
1098: now (\ref{eq:dominates-deterministic-version}) is obvious.
1099: 
1100: \section{Proof of Theorem \ref{thm:randomized}}
1101: \label{sec:proof-randomized}
1102: 
1103: \ifFULL\bluebegin
1104:   Unfortunately,
1105:   Theorem \ref{thm:deterministic} cannot be applied
1106:   to the extended game of prediction with the prediction space $\PPP(\Gamma)$ directly:
1107:   the theorem assumes that $\Gamma$ is a subset of a Banach space,
1108:   whereas,
1109:   even assuming $\Gamma$ compact,
1110:   the dual to an infinite-dimensional Banach space is never even metrizable
1111:   in the weak$^*$ topology
1112:   (\cite{rudin:1991}, 3.16).
1113:   The proof of Theorem \ref{thm:deterministic}, however,
1114:   still works for the new game.
1115: \blueend\fi
1116: 
1117: A convenient pseudo-metric on $\Gamma$ can be defined by
1118: \begin{equation*}
1119:   \rho(g,g')
1120:   :=
1121:   \sup
1122:   \left\{
1123:     \left|\lambda(g,y)
1124:     -
1125:     \lambda(g',y)\right|
1126:     \st
1127:     y\in\mathbf{Y}
1128:   \right\},
1129:   \quad
1130:   g,g'\in\Gamma
1131: \end{equation*}
1132: (cf.\ \cite{dudley:2002}, Corollary 11.3.4).
1133: Let us redefine $\Gamma$ as the quotient space obtained from the original $\Gamma$
1134: by identifying $g$ and $g'$ for which $\rho(g,g')=0$
1135: (\cite{engelking:1989}, Section 2.4);
1136: in other words,
1137: we will not distinguish predictions that always lead to identical losses.
1138: Now $\rho$ becomes a metric on $\Gamma$.
1139: Let $\Gamma^*$ be a countable dense subset of the original topological space $\Gamma$
1140: (which is separable as a subset of a separable Banach space);
1141: the condition of equicontinuity implies that $\Gamma^*$
1142: (formally defined as the set of equivalence classes
1143: containing elements of the original $\Gamma^*$)
1144: remains a dense subset in $\Gamma$ equipped with the metric $\rho$.
1145: % We can see that $\Gamma$ remains a separable space.
1146: 
1147: We define the norm of a function $f:\Gamma\to\bbbr$ as
1148: \begin{equation*}
1149:   \left\|f\right\|_{\BL}
1150:   :=
1151:   \sup_{g,g'\in\Gamma:g\ne g'}
1152:   \frac{\left|f(g)-f(g')\right|}{\rho(g,g')}
1153:   +
1154:   \sup_{g\in\Gamma}
1155:   \left|f(g)\right|;
1156: \end{equation*}
1157: this norm is finite for bounded Lipschitz functions
1158: (which form a Banach space under this norm:
1159: see \cite{dudley:2002}, Section 11.2).
1160: Notice that
1161: \begin{equation}\label{eq:BL-for-lambda}
1162:   \left\|\lambda\right\|_{\BL}
1163:   :=
1164:   \sup_{y\in\mathbf{Y}}
1165:   \left\|\lambda(\cdot,y)\right\|_{\BL}
1166:   <
1167:   \infty.
1168: \end{equation}
1169: 
1170: Next define
1171: \begin{equation}\label{eq:expected-loss}
1172:   \lambda(\gamma,y)
1173:   :=
1174:   \int_{\Gamma}
1175:   \lambda(g,y)
1176:   \gamma(\dd g),
1177: \end{equation}
1178: where $\gamma$ is a probability measure on $\Gamma$.
1179: This is the loss function in a new game of prediction
1180: with the prediction space $\PPP(\Gamma)$;
1181: it is linear and, therefore, convex in $\gamma$.
1182: (In general,
1183: the role of randomization in this paper
1184: is to make the loss function convex in the prediction.)
1185: 
1186: As a metric on $\PPP(\Gamma)$ we will take the Fortet--Mourier metric
1187: (\cite{dudley:2002}, Section 11.3)
1188: defined as
1189: \begin{equation*}
1190:   \beta(\gamma,\gamma')
1191:   :=
1192:   \sup_{f:\left\|f\right\|_{\BL}\le1}
1193:   \left|
1194:     \int_{\Gamma}
1195:     f
1196:     \D
1197:     (\gamma-\gamma')
1198:   \right|.
1199: \end{equation*}
1200: The topology on $\PPP(\Gamma)$ induced by this metric
1201: is called the \emph{topology of weak convergence}
1202: (\cite{billingsley:1968};
1203: weak convergence is called simply ``convergence'' in \cite{dudley:2002};
1204: for the proof of equivalence of several natural definitions
1205: of the topology of weak convergence,
1206: see \cite{dudley:2002}, Theorem 11.3.3).
1207: 
1208: Let us check that the loss function (\ref{eq:expected-loss}) is also
1209: bounded Lipschitz, in the sense of (\ref{eq:BL-for-lambda}):
1210: if $\gamma,\gamma'\in\PPP(\Gamma)$ and $y\in\mathbf{Y}$,
1211: \begin{equation*}
1212:   \left|
1213:     \lambda(\gamma,y)
1214:     -
1215:     \lambda(\gamma',y)
1216:   \right|
1217:   =
1218:   \left|
1219:     \int_{\Gamma}
1220:     \lambda(g,y)
1221:     (\gamma-\gamma')
1222:     (\dd g)
1223:   \right|
1224:   \le
1225:   \left\|\lambda\right\|_{\BL}
1226:   \beta(\gamma,\gamma').
1227: \end{equation*}
1228: 
1229: It is easy to see that the space $\PPP(\Gamma)$ with metric $\beta$ is separable:
1230: e.g., the set of probability measures concentrated on finite subsets of $\Gamma^*$
1231: and taking rational values is dense in $\PPP(\Gamma)$
1232: (cf.\ \cite{billingsley:1968}, Appendix III).
1233: Let us enumerate the elements of a dense countable set in $\PPP(\Gamma)$
1234: as $D_1,D_2,\ldots$;
1235: as in the previous section,
1236: we will use the WAA to merge all \emph{experts} $D_k$.
1237: 
1238: The convergence of the mixture (\ref{eq:WAA}) to a probability measure on $\Gamma$
1239: is now obvious.
1240: The countable convexity (\ref{eq:countable-convexity})
1241: now holds with equality,
1242: \begin{equation*}
1243:   \lambda
1244:   \left(
1245:     \sum_{k=1}^{\infty}
1246:     p_n^{(k)}
1247:     D_k(x_n),
1248:     y_n
1249:   \right)
1250:   =
1251:   \sum_{k=1}^{\infty}
1252:   p_n^{(k)}
1253:   \lambda
1254:   \left(
1255:     D_k(x_n),
1256:     y_n
1257:   \right),
1258: \end{equation*}
1259: and follows from the general fact that
1260: \begin{equation*}
1261:   \int f \D \sum_{k=1}^{\infty} p_k P_k
1262:   =
1263:   \sum_{k=1}^{\infty}
1264:   p_k
1265:   \int f \D P_k
1266: \end{equation*}
1267: for bounded Borel $f:\Gamma\to\bbbr$,
1268: positive $p_1,p_2,\ldots$ summing to $1$,
1269: and $P_1,P_2,\ldots\in\PPP(\Gamma)$
1270: (this is obviously true for simple $f$
1271: and follows for arbitrary integrable $f$ from the definition of the Lebesgue integral:
1272: see, e.g., \cite{dudley:2002}, Section 4.1).
1273: 
1274: Therefore, it is easy to check
1275: that the chain (\ref{eq:chain}) still works
1276: (with $\PPP(\Gamma)$ equipped with metric $\beta$)
1277: and we can rephrase the previous section's result as follows.
1278: For any randomized prediction rule $D$, any $m=1,2,\ldots$,
1279: and any $\epsilon>0$
1280: there exists $N_{D,m,\epsilon}$ such that, for any $N\ge N_{D,m,\epsilon}$
1281: and any $x_1,y_1,x_2,y_2,\ldots$,
1282: the WAA's predictions $\gamma_n\in\PPP(\Gamma)$
1283: are guaranteed to satisfy
1284: \begin{equation}\label{eq:mean}
1285:   \frac1N
1286:   \sum_{n=1}^N
1287:   \lambda
1288:   (\gamma_n,y_n)
1289:   \le
1290:   \frac1N
1291:   \sum_{n=1}^N
1292:   \lambda
1293:   \Bigl(
1294:     D(\phi_m(x_n)),y_n
1295:   \Bigr)
1296:   +
1297:   \frac{\epsilon}{2}
1298: \end{equation}
1299: (cf.\  (\ref{eq:dominates-deterministic-version})).
1300: 
1301: The loss function is bounded in absolute value
1302: by a constant $L$,
1303: and so the law of the iterated logarithm
1304: (in Kolmogorov's finitary form,
1305: \cite{kolmogorov:1929}, the end of the introductory section;
1306: the condition that the cumulative variance tends to infinity
1307: is easy to get rid of:
1308: see, e.g., \cite{shafer/vovk:2001}, (5.8))
1309: implies that for any $\delta>0$ there exists $N_{\delta}$
1310: such that the conjunction of
1311: \begin{equation*}
1312:   \sup_{N\ge N_{\delta}}
1313:   \left|
1314:     \sum_{n=1}^N
1315:     \bigl(
1316:       \lambda(g_n,y_n)
1317:       -
1318:       \lambda(\gamma_n,y_n)
1319:     \bigr)
1320:   \right|
1321:   \le
1322:   \sqrt{2.01 L^2 N\ln\ln N}
1323: \end{equation*}
1324: and
1325: \begin{equation*}
1326:   \sup_{N\ge N_{\delta}}
1327:   \left|
1328:     \sum_{n=1}^N
1329:     \bigl(
1330:       \lambda(d_n,y_n)
1331:       -
1332:       \lambda(D(\phi_m(x_n)),y_n)
1333:     \bigr)
1334:   \right|
1335:   \le
1336:   \sqrt{2.01 L^2 N\ln\ln N}
1337: \end{equation*}
1338: holds with probability at least $1-\delta$.
1339: Combining the last two inequalities with (\ref{eq:mean})
1340: we can see that for any randomized prediction rule $D$, any $m=1,2,\ldots$,
1341: any $\epsilon>0$, and any $\delta>0$
1342: there exists $N_{D,m,\epsilon,\delta}$ such that,
1343: for any $x_1,y_1,x_2,y_2,\ldots$,
1344: the WAA's responses $\gamma_n\in\PPP(\Gamma)$ to $x_1,y_1,x_2,y_2,\ldots$
1345: are guaranteed to satisfy
1346: \begin{equation*}
1347:   \sup_{N\ge N_{D,m,\epsilon,\delta}}
1348:   \left(
1349:     \frac1N
1350:     \sum_{n=1}^N
1351:     \lambda(g_n,y_n)
1352:     -
1353:     \frac1N
1354:     \sum_{n=1}^N
1355:     \lambda(d_n,y_n)
1356:   \right)
1357:   \le
1358:   \epsilon
1359: \end{equation*}
1360: with probability at least $1-\delta$.
1361: This is equivalent to the WAA (applied to $D_1,D_2,\ldots$)
1362: being a Markov-universal randomized prediction strategy.
1363: 
1364: \section{Conclusion}
1365: \label{sec:conclusion}
1366: 
1367: An interesting theoretical problem
1368: is to state more explicit versions
1369: of Theorems \ref{thm:deterministic} and \ref{thm:randomized}:
1370: for example,
1371: to give an explicit expression for $N_{D,m,\epsilon}$.
1372: 
1373: The field of lossy compression is now well developed,
1374: and it would be interesting to apply our prediction algorithms
1375: (perhaps with the Weak Aggregating Algorithm replaced
1376: by an algorithm based on, say, gradient descent \cite{cesabianchi/lugosi:2006}
1377: or defensive forecasting \cite{\DFVII})
1378: to the approximation structures induced by popular lossy compression algorithms.
1379: 
1380: \subsection*{Acknowledgments}
1381: 
1382: This work was partially supported by MRC (grant S505/65).
1383: 
1384: \begin{thebibliography}{10}
1385: \bibitem{billingsley:1968}
1386: Patrick Billingsley.
1387: \newblock {\em Convergence of Probability Measures}.
1388: \newblock Wiley, New York, 1968.
1389: 
1390: \bibitem{cesabianchi/lugosi:2006}
1391: Nicol\`o Cesa-Bianchi and G\'abor Lugosi.
1392: \newblock {\em Prediction, Learning, and Games}.
1393: \newblock Cambridge University Press, Cambridge, 2006.
1394: 
1395: \bibitem{dudley:2002}
1396: Richard~M. Dudley.
1397: \newblock {\em Real Analysis and Probability}, volume~74 of {\em Cambridge
1398:   Studies in Advanced Mathematics}.
1399: \newblock Cambridge University Press, Cambridge, England, revised edition,
1400:   2002.
1401: 
1402: \bibitem{engelking:1989}
1403: Ryszard Engelking.
1404: \newblock {\em General Topology}, volume~6 of {\em Sigma Series in Pure
1405:   Mathematics}.
1406: \newblock Heldermann, Berlin, second edition, 1989.
1407: 
1408: \bibitem{hardy/etal:1952}
1409: G\DOT{}~H\DOT{} Hardy, John~E\DOT{} Littlewood, and George P\'olya.
1410: \newblock {\em Inequalities}.
1411: \newblock Cambridge University Press, Cambridge, second edition, 1952.
1412: 
1413: \bibitem{kalnishkan/vyugin:2005}
1414: Yuri Kalnishkan and Michael~V\DOT{} Vyugin.
1415: \newblock The {W}eak {A}ggregating {A}lgorithm and weak mixability.
1416: \newblock In Peter Auer and Ron Meir, editors, {\em Proceedings of the
1417:   Eighteenth Annual Conference on Learning Theory}, volume 3559 of {\em Lecture
1418:   Notes in Computer Science}, pages 188--203, Berlin, 2005. Springer.
1419: \newblock The journal version is being prepared for the Special Issue of
1420:   \emph{Journal of Machine Learning Research} devoted to COLT'2005; all
1421:   references are to the journal version.
1422: 
1423: \bibitem{kolmogorov:1929}
1424: Andrei~N\DOT{} Kolmogorov.
1425: \newblock {\"U}ber das {G}esetz des iterierten {L}ogarithmus.
1426: \newblock {\em Mathematische Annalen}, 101:126--135, 1929.
1427: 
1428: \bibitem{kolmogorov:1931}
1429: Andrei~N\DOT{} Kolmogorov.
1430: \newblock {\"U}ber die analytischen {M}ethoden in der
1431:   {W}ahrscheinlichkeitsrechnung.
1432: \newblock {\em Mathematische Annalen}, 104:415--458, 1931.
1433: 
1434: \bibitem{kolmogorov/tikhomirov:1959latin}
1435: Andrei~N\DOT{} Kolmogorov and Vladimir~M\DOT{} Tikhomirov.
1436: \newblock $\epsilon$-entropy and $\epsilon$-capacity of sets in functional
1437:   spaces (in {R}ussian).
1438: \newblock {\em Uspekhi Matematicheskikh Nauk}, 14(2):3--86, 1959.
1439: 
1440: \bibitem{shafer/vovk:2001}
1441: Glenn Shafer and \Vladimir{} Vovk.
1442: \newblock {\em Probability and Finance: It's Only a Game!}
1443: \newblock Wiley, New York, 2001.
1444: 
1445: \bibitem{shiryaev:1989latin}
1446: Albert~N\DOT{} Shiryaev.
1447: \newblock Kolmogorov: life and creative activities.
1448: \newblock {\em Annals of Probability}, 17:866--944, 1989.
1449: 
1450: \bibitem{tikhomirov:1987latin}
1451: Vladimir~M\DOT{} Tikhomirov.
1452: \newblock $\epsilon$-entropy and $\epsilon$-capacity (in {R}ussian).
1453: \newblock In Yury~V\DOT{} Prokhorov and Albert~N\DOT{} Shiryaev, editors, {\em
1454:   Kolmogorov. Teoriya In\-for\-ma\-tsii i Teoriya Algoritmov}, pages 262--269.
1455:   Nauka, Moscow, 1987.
1456: 
1457: \bibitem{vovk:2001competitive}
1458: Vladimir Vovk.
1459: \newblock Competitive on-line statistics.
1460: \newblock {\em International Statistical Review}, 69:213--248, 2001.
1461: 
1462: \bibitem{DF08arXiv}
1463: \Vladimir{} Vovk.
1464: \newblock Competing with stationary prediction strategies.
1465: \newblock Technical Report \texttt{arXiv:cs.LG/0607067}, \texttt{arXiv.org}
1466:   e-Print archive, July 2006.
1467: 
1468: \bibitem{DF07arXiv}
1469: \Vladimir{} Vovk.
1470: \newblock Predictions as statements and decisions.
1471: \newblock Technical Report \texttt{arXiv:cs.LG/0606093}, \texttt{arXiv.org}
1472:   e-Print archive, June 2006.
1473: 
1474: \end{thebibliography}
1475: \end{document}
1476: