1: % Last changed: 28 Jul 2006
2: % Spell checked: 28 Jul 2006
3: % 1438 lines, 39 KB
% ----------------------------------------------------------------------
% Version switches.  \newif creates every switch in the false state, so
% only the version being compiled has to be turned on below.
% ----------------------------------------------------------------------
\newif\ifJOURNAL
\newif\ifCONF
\newif\ifarXiv
\newif\ifWP
\newif\ifFULL

\newif\ifLATIN % when true, the Cyrillic references are set in Latin

% Choose the version: JOURNAL, CONF, arXiv, WP, or FULL.
%\JOURNALtrue
%\CONFtrue
\arXivtrue
%\WPtrue
%\FULLtrue % this version is not for publication and contains extra remarks and questions

%\LATINtrue % uncomment to set the Cyrillic references in Latin in any version
\ifarXiv\LATINtrue\fi % for submitting to arXiv

% ----------------------------------------------------------------------
% Derived switches (computed from the ones above; do not set by hand).
% ----------------------------------------------------------------------
\newif\ifnotJOURNAL % negation of \ifJOURNAL
\ifJOURNAL\notJOURNALfalse\else\notJOURNALtrue\fi

\newif\ifnotarXiv % negation of \ifarXiv
\ifarXiv\notarXivfalse\else\notarXivtrue\fi

\newif\ifTR    % TR = arXiv or WP
\newif\ifnotTR % negation of \ifTR
\notTRtrue
\ifarXiv\TRtrue\notTRfalse\fi
\ifWP\TRtrue\notTRfalse\fi

\newif\ifnotLATIN % negation of \ifLATIN
\ifLATIN\notLATINfalse\else\notLATINtrue\fi
47:
% Bibliography keys for the earlier papers of the series.  The macros
% \DFI ... \DFVIII expand to the key to be used inside \cite{...}; which
% key is used depends on the version being compiled.
\ifJOURNAL
\newcommand{\DFI}{vovk/etal:2005AIStatslocal} % former \GTPVIII
\newcommand{\DFII}{vovk/etal:2005ALT} % former \GTPX
\newcommand{\DFIII}{vovk:2005ALT-DF03} % former \GTPXIII
\newcommand{\DFIV}{vovk:2005ALT-DF04} % former \GTPXIV
\newcommand{\DFV}{DF05arXiv} % former \GTPXI
\newcommand{\DFVI}{DF06arXiv} % former \GTPXVI
\fi
\ifarXiv
\newcommand{\DFI}{DF01arXiv} % former \GTPVIII
\newcommand{\DFII}{DF02arXiv} % former \GTPX
\newcommand{\DFIII}{DF03arXiv} % former \GTPXIII
\newcommand{\DFIV}{DF04arXiv} % former \GTPXIV
\newcommand{\DFV}{DF05arXiv} % former \GTPXI
\newcommand{\DFVI}{DF06arXiv} % former \GTPXVI
\newcommand{\DFVII}{DF07arXiv} % former \GTPXVII
\newcommand{\DFVIII}{DF08arXiv}
\fi
\ifWP
\newcommand{\DFI}{GTP8} % former \GTPVIII
\newcommand{\DFII}{GTP10} % former \GTPX
\newcommand{\DFIII}{GTP13} % former \GTPXIII
\newcommand{\DFIV}{GTP14} % former \GTPXIV
\newcommand{\DFV}{GTP11} % former \GTPXI
\newcommand{\DFVI}{GTP16} % former \GTPXVI
\newcommand{\DFVII}{GTP17} % former \GTPXVII
\newcommand{\DFVIII}{DF08arXiv}
\fi
\ifFULL
\newcommand{\DFI}{DF01arXiv} % former \GTPVIII
\newcommand{\DFII}{DF02arXiv} % former \GTPX
\newcommand{\DFIII}{DF03arXiv} % former \GTPXIII
\newcommand{\DFIV}{DF04arXiv} % former \GTPXIV
\newcommand{\DFV}{DF05arXiv} % former \GTPXI
\newcommand{\DFVI}{DF06arXiv} % former \GTPXVI
\newcommand{\DFVII}{DF07arXiv} % former \GTPXVII
\newcommand{\DFVIII}{DF08arXiv}
\fi
% Fallbacks.  The JOURNAL branch defines neither \DFVII nor \DFVIII, and
% there is no CONF branch, although the text cites \DFVII and \DFVIII.
% \providecommand leaves the definitions made above untouched and gives
% every key that is still undefined its arXiv value, so that all versions
% compile.
\providecommand{\DFI}{DF01arXiv}
\providecommand{\DFII}{DF02arXiv}
\providecommand{\DFIII}{DF03arXiv}
\providecommand{\DFIV}{DF04arXiv}
\providecommand{\DFV}{DF05arXiv}
\providecommand{\DFVI}{DF06arXiv}
\providecommand{\DFVII}{DF07arXiv}
\providecommand{\DFVIII}{DF08arXiv}
86:
% Bibliography keys for the two references that exist both in Cyrillic
% and in Latin form; the "latin" entries are used when \ifLATIN is true.
\ifLATIN
  \newcommand{\KolmogorovTikhomirov}{kolmogorov/tikhomirov:1959latin}
  \newcommand{\Tikhomirov}{tikhomirov:1987latin}
\else
  \newcommand{\KolmogorovTikhomirov}{kolmogorov/tikhomirov:1959}
  \newcommand{\Tikhomirov}{tikhomirov:1987}
\fi
95:
% Document class, packages and the \Extra macro for each version.
% \Extra{<text>} discards its argument in the JOURNAL, CONF and arXiv
% versions, prints it unchanged in the WP version, and prints it in red
% in the FULL version.
\ifJOURNAL
\documentclass{article}
\usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}
\newcommand{\Extra}[1]{}
\fi

\ifCONF
\documentclass{article}
\usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}
\newcommand{\Extra}[1]{}
\fi

\ifarXiv
\documentclass{article}
\usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}
\newcommand{\Extra}[1]{}
\fi

\ifWP
\documentclass{gtarticle}
\usepackage{amsmath,amsfonts,amssymb,latexsym,epsfig,graphicx}
% \renewcommand fails when the macro does not exist yet.  Whether
% gtarticle predefines \Extra cannot be seen from this file, so make
% sure it exists before redefining it.
\providecommand{\Extra}[1]{}
\renewcommand{\Extra}[1]{#1}
\fi

\ifFULL
\documentclass{article}
\usepackage{amsmath,amsfonts,amssymb,latexsym,color,epigraph,graphicx,eepic}
% \Extra refers to \red, which is defined on the next line; this is fine
% because \red is only expanded when \Extra is used.
\newcommand{\Extra}[1]{\red{#1}}
\newcommand{\red}[1]{\textcolor{red}{#1}}
\newcommand{\blue}[1]{\textcolor{blue}{#1}}
% \bluebegin...\blueend and \redbegin...\redend colour a stretch of text
% by opening and closing a group.
\newcommand{\bluebegin}{\begingroup\color{blue}}
\newcommand{\blueend}{\endgroup}
\newcommand{\redbegin}{\begingroup\color{red}}
\newcommand{\redend}{\endgroup}
\fi
131:
% Line-breaking parameters, and permission for multi-line displays
% to break across pages.
\setlength{\emergencystretch}{5mm}
\tolerance=400
\allowdisplaybreaks[4]

% Small text macros; they are not used in the part of the file seen
% here (presumably they serve the bibliography -- to be confirmed).
\newcommand*{\Vladimir}{Vladimir}
\newcommand*{\DOT}{.}

% Cyrillic support, needed only when the references are not set in Latin.
\ifnotLATIN
  \input{OT2enc.def}
  % cyr: select the wncyr family in the OT2 encoding on entry;
  % the closing code selects OT1/tir.
  \newenvironment{cyr}
    {\fontencoding{OT2}\fontfamily{wncyr}\fontseries{m}\fontshape{n}\selectfont}
    {\fontencoding{OT1}\fontfamily{tir}\selectfont}
  \usepackage{CJK}
\fi
146:
% --- Relations and differentials ---------------------------------------
\newcommand*{\st}{\mathrel{\!|\!}} % tight "such that" bar in set notation
\newcommand*{\givn}{\mathrel{|}}   % "given" bar
\newcommand*{\D}{\,\mathrm{d}}     % upright d preceded by a thin space
\newcommand*{\dd}{\mathrm{d}}      % upright d without the space

% --- Blackboard-bold and calligraphic symbols ---------------------------
\newcommand*{\III}{\mathbb{I}}
\newcommand*{\PPP}{\mathcal{P}}  % all probability measures
\newcommand*{\bbbr}{\mathbb{R}}  % the real numbers
\newcommand*{\bbbp}{\mathbb{P}}  % auxiliary (probability)
\newcommand*{\bbbe}{\mathbb{E}}  % auxiliary (expectation)

% --- Upright names -------------------------------------------------------
\newcommand*{\BL}{\mathrm{BL}}   % bounded Lipschitz

% --- Operators (sub/superscripts are set at the side, not as limits) ----
\newcommand*{\diam}{\mathop{\mathrm{diam}}\nolimits}
\newcommand*{\Prob}{\mathop{\bbbp}\nolimits}
\newcommand*{\Expect}{\mathop{\bbbe}\nolimits}
\newcommand*{\cpc}{\mathop{\mathrm{cpc}}\nolimits}
% Over- and underlined variants; \mathstrut fixes the height of the rule.
\newcommand*{\ucpc}{\mathop{\overline{\mathstrut\mathrm{cpc}}}\nolimits}
\newcommand*{\lcpc}{\mathop{\underline{\mathstrut\mathrm{cpc}}}\nolimits}
169:
% Theorem-like environments; each one has its own counter.
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{remark}{Remark}

% proof: a trivlist item labelled "Proof" in bold.
\newenvironment{proof}%
  {\trivlist\item[\hskip\labelsep\textbf{Proof}]}%
  {\endtrivlist}

% Proof{<what>}: like proof, with the label "Proof <what>".
\newenvironment{Proof}[1]%
  {\trivlist\item[\hskip\labelsep\textbf{Proof #1\,}]}%
  {\endtrivlist}

% remark*: an unnumbered remark, labelled "Remark" in bold.
\newenvironment{remark*}%
  {\trivlist\item[\hskip\labelsep{\bfseries Remark}]\relax}%
  {\endtrivlist}

% End-of-proof mark: a solid box.
\newcommand{\boxforqed}{\rule{.3em}{1.5ex}}
% \qedtext ends the current paragraph and pushes the box to the right
% margin, allowing it to drop to a line of its own when it does not fit.
\newcommand{\qedtext}{%
  \unskip\nobreak\hfil
  \penalty50\hskip1em\null\nobreak\hfil\boxforqed
  \parfillskip=0pt\finalhyphendemerits=0\endgraf}
% \qedmath places the box as the (unparenthesised) tag of a display.
%\newcommand{\qedmath}{\eqno\boxforqed}
\newcommand{\qedmath}{\tag*{\boxforqed}}
191:
% The title is the same in every version.
\title{Competing with Markov prediction strategies}

% JOURNAL and CONF: author with full postal address.
\ifJOURNAL
  \author{Vladimir Vovk\\[5mm]
  Computer Learning Research Centre\\
  Department of Computer Science\\
  Royal Holloway, University of London,
  Egham, Surrey TW20 0EX, UK\\
  \texttt{vovk@cs.rhul.ac.uk}}
\fi

\ifCONF
  \author{Vladimir Vovk\\[5mm]
  Computer Learning Research Centre\\
  Department of Computer Science\\
  Royal Holloway, University of London,
  Egham, Surrey TW20 0EX, UK\\
  \texttt{vovk@cs.rhul.ac.uk}}
\fi

% arXiv and FULL: author with e-mail and web page only.
\ifarXiv
  \author{Vladimir Vovk\\
  \texttt{vovk{\rm@}cs.rhul.ac.uk}\\
  \texttt{http://vovk.net}}
\fi

% WP: bare author name plus the working-paper number \No.
\ifWP
  \author{Vladimir Vovk}
  \newcommand{\No}{20}
  % For the two dates option: uncomment the next 2 lines
  % \twodatestrue
  % \newcommand{\firstposted}{July 13, 2006}
\fi

\ifFULL
  \author{Vladimir Vovk\\
  \texttt{vovk{\rm@}cs.rhul.ac.uk}\\
  \texttt{http://vovk.net}}
\fi
234:
235: \begin{document}
236: \maketitle
237: \begin{abstract}
238: Assuming that the loss function is convex in the prediction,
239: we construct a prediction strategy
240: universal for the class of Markov prediction strategies,
241: not necessarily continuous.
242: Allowing randomization,
243: we remove the requirement of convexity.
244: \end{abstract}
245:
246: \section{Introduction}
247: \label{sec:introduction}
248:
249: This paper belongs to the area of research
250: known as universal prediction of individual sequences
251: (see \cite{cesabianchi/lugosi:2006} for a review):
252: the predictor's goal is to compete with a wide benchmark class of prediction strategies.
253: In the previous papers \cite{\DFVII} and \cite{\DFVIII}
254: we constructed prediction strategies
255: competitive with the important classes of Markov and stationary,
256: respectively,
257: continuous prediction strategies.
258: In this paper we consider competing against possibly discontinuous strategies.
259: Our main results assert the existence of prediction strategies
260: competitive with the Markov strategies.
261:
262: This paper's idea of transition from continuous to general benchmark classes
263: was motivated by Skorokhod's topology for the space $D$
264: of ``c\`adl\`ag'' functions, most of which are discontinuous.
265: Skorokhod's idea was to allow small deformations not only along the vertical axis
266: but also along the horizontal axis when defining neighborhoods.
267: Skorokhod's topology was metrized by Kolmogorov so that it became a separable space
268: (\cite{billingsley:1968}, Appendix III; \cite{shiryaev:1989latin}, p.~913),
269: which allows us to apply one of the numerous algorithms for prediction with expert advice
270: (Kalnishkan and Vyugin's Weak Aggregating Algorithm in this paper)
271: to construct a universal algorithm.
272:
273: In Section \ref{sec:results} we give the main definitions and state our main results,
274: Theorems \ref{thm:deterministic} and \ref{thm:randomized};
275: their proofs are given in Sections \ref{sec:proof-deterministic} and \ref{sec:proof-randomized},
276: respectively.
277:
278: \section{Main results}
279: \label{sec:results}
280:
281: The \emph{game of prediction} between two players,
282: called Predictor and Reality,
283: is played according to the following protocol
284: (of \emph{perfect information},
285: in the sense that either player can see the other player's moves made so far).
286:
287: \bigskip
288:
289: \noindent
290: \textsc{Prediction protocol}\nopagebreak
291: \begin{tabbing}
292: \qquad\=\qquad\=\qquad\kill
293: FOR $n=1,2,\dots$:\\
294: \> Reality announces $x_n\in\mathbf{X}$.\\
295: \> Predictor announces $\gamma_n\in\Gamma$.\\
296: \> Reality announces $y_n\in\mathbf{Y}$.\\
297: END FOR.
298: \end{tabbing}
299:
300: \noindent
301: The game proceeds in rounds numbered by the positive integers $n$.
302: At the beginning of each round $n=1,2,\ldots$ Predictor is given some \emph{signal} $x_n$
303: relevant to predicting the following \emph{observation} $y_n$.
304: The signal is taken from the \emph{signal space} $\mathbf{X}$
305: and the observation from the \emph{observation space} $\mathbf{Y}$.
306: Predictor then announces his prediction $\gamma_n$,
307: taken from the \emph{prediction space} $\Gamma$,
308: and the prediction's quality in light of the actual observation
309: is measured by a \emph{loss function}
310: $\lambda:\Gamma\times\mathbf{Y}\to\bbbr$.
311:
312: We will always assume that the signal space $\mathbf{X}$,
313: the prediction space $\Gamma$,
314: and the observation space $\mathbf{Y}$
315: are non-empty sets;
316: $\mathbf{X}$ and $\Gamma$ will often be equipped with additional structures.
317:
318: \subsection*{Markov-universal prediction strategies: deterministic case}
319:
320: Predictor's strategies in the prediction protocol will be called
321: \emph{prediction strategies}.
322: Formally such a strategy is a function
323: \begin{equation*}
324: D:
325: \bigcup_{n=1}^{\infty}
326: \left(
327: \mathbf{X}\times\mathbf{Y}
328: \right)^{n-1}
329: \times
330: \mathbf{X}
331: \to
332: \Gamma;
333: \end{equation*}
334: it maps each history $(x_1,y_1,\ldots,x_{n-1},y_{n-1},x_n)$ to the chosen prediction.
335: In this paper we will be especially interested
336: in \emph{Markov strategies},
337: which are functions $D:\mathbf{X}\to\Gamma$;
338: intuitively,
339: $D(x_n)$ is the recommended prediction on round $n$.
340: The restriction to Markov strategies
341: is not a severe one,
342: since the signal $x_n$ can encode as much of the past as we want
343: (cf.\ \cite{kolmogorov:1931}, footnote 1);
344: in particular, $x_n$ can contain information about the previous observations
345: $y_1,\ldots,y_{n-1}$.
346: In this paper
347: Markov prediction strategies will also be called \emph{prediction rules}
348: (as in \cite{\DFVII};
349: in a more general context, however, it would be risky to omit ``Markov''
350: since ``prediction rule'' is too easy to confuse with ``prediction strategy'').
351:
352: For both our theorems we will need the notion of ``approximation''
353: to a signal $x\in\mathbf{X}$;
354: intuitively, the ``$m$-approximation'' of $x$ is another signal $\phi_m(x)$
355: which is as close to $x$ as possible but carries only $m$ bits of information.
356: If $\mathbf{X}=[0,1]$,
357: a reasonable definition of $\phi_m(x)$ would be to take the binary expansion of $x$
but remove all the binary digits starting from the $(m+1)$th after the binary point.
359: In general,
360: we will have to equip $\mathbf{X}$ with an ``approximation structure'';
361: we will do this following Kolmogorov and Tikhomirov
362: (\cite{\Tikhomirov}, Section 2,
363: \cite{shiryaev:1989latin}, p.~913% this is p.~49 of 80 in the file
364: \ifFULL\bluebegin, \cite{tikhomirov:1976}\blueend\fi).
365:
366: Consider a sequence of mappings $\phi_m:\mathbf{X}\to\mathbf{X}$,
367: $m=1,2,\ldots$,
368: such that each $\phi_m$ is idempotent,
369: in the sense $\phi_m(\phi_m(x))=\phi_m(x)$ for all $x\in\mathbf{X}$,
370: and $\phi_m(\mathbf{X})$ contains $2^m$ elements.
371: (Such mappings are coding-theory analogues of projections in linear algebra
372: and contractions in topology;
373: $\phi_m(x)$ can be thought of as the result of encoding $x$,
374: sending it over an $m$-bit channel,
375: and restoring $x$ as well as possible at the receiving end.)
376: It is the sequence $\phi=\{\phi_m\st m=1,2,\ldots\}$
377: that will be referred to as an \emph{approximation structure}.
378:
379: If $\mathbf{X}$ is a totally bounded (say, compact) metric space,
380: there is an approximation structure $\phi$ such that
381: \begin{equation}\label{eq:fine}
382: \lim_{m\to\infty}
383: \rho
384: \left(
385: x,
386: \phi_m(x)
387: \right)
388: =
389: 0
390: \end{equation}
391: uniformly in $x\in\mathbf{X}$.
392: (We often let $\rho$ stand for the metric in various metric spaces,
393: always clear from the context.)
394: In fact,
395: the \emph{$m$th Kolmogorov diameter}
396: \begin{equation*}
397: \mathcal{K}_m(\mathbf{X})
398: :=
399: \frac12
400: \inf_{\phi}
401: \sup_{x\in\mathbf{X}}
402: \diam
403: \left(
404: \phi_m^{-1}(\phi_m(x))
405: \right)
406: \end{equation*}
407: of $\mathbf{X}$ is essentially the inverse function
408: to the $\epsilon$-entropy $\mathcal{H}_{\epsilon}(\mathbf{X})$.
409: See \cite{\KolmogorovTikhomirov}
410: for precise values and estimates of $\mathcal{K}_m(\mathbf{X})$
411: for numerous totally bounded metric spaces $\mathbf{X}$.
412:
413: A prediction strategy is \emph{Markov-universal} for a loss function $\lambda$
414: and an approximation structure $\phi$
415: if it guarantees that
416: for any prediction rule $D$ and any $m=1,2,\ldots$
417: there exists a number $N_{D,m}$ such that for any $N\ge N_{D,m}$
418: and any sequence $x_1,y_1,x_2,y_2,\ldots$ of Reality's moves
419: its responses $\gamma_n$ satisfy
420: \begin{equation*} % \label{eq:dominates-deterministic}
421: \frac1N
422: \sum_{n=1}^N
423: \lambda
424: (\gamma_n,y_n)
425: \le
426: \frac1N
427: \sum_{n=1}^N
428: \lambda
429: \Bigl(
430: D(\phi_m(x_n)),y_n
431: \Bigr)
432: +
433: 2^{-m}.
434: \end{equation*}
435: \begin{theorem}\label{thm:deterministic}
436: Suppose $\mathbf{X}$ is equipped with an approximation structure $\phi$,
437: $\Gamma$ is a closed convex subset of a separable Banach space,
438: and the loss function $\lambda(\gamma,y)$
439: is bounded, convex in the variable $\gamma\in\Gamma$,
440: and uniformly continuous in $\gamma\in\Gamma$
441: uniformly in $y\in\mathbf{Y}$.
There exists a prediction strategy that is Markov-universal for $\lambda$ and $\phi$.
443: \end{theorem}
444: A Markov-universal prediction strategy will be constructed in the next section.
445: Theorem \ref{thm:deterministic} says that, under its conditions,
446: \begin{equation}\label{eq:simpler}
447: \limsup_{N\to\infty}
448: \left(
449: \frac1N
450: \sum_{n=1}^N
451: \lambda
452: (\gamma_n,y_n)
453: -
454: \frac1N
455: \sum_{n=1}^N
456: \lambda
457: \Bigl(
458: D(\phi_m(x_n)),y_n
459: \Bigr)
460: \right)
461: \le
462: 0
463: \end{equation}
464: uniformly in $x_1,y_1,x_2,y_2,\ldots$
465: for all $m=1,2,\ldots$ and all $D:\mathbf{X}\to\Gamma$.
466: % This statement is cruder than Theorem \ref{thm:deterministic} itself
467: % but slightly simpler.
468:
469: If $\mathbf{X}$ is a compact metric space and (\ref{eq:fine})
470: holds uniformly in $x\in\mathbf{X}$,
471: (\ref{eq:simpler}) implies
472: \begin{equation*}
473: \limsup_{N\to\infty}
474: \left(
475: \frac1N
476: \sum_{n=1}^N
477: \lambda
478: (\gamma_n,y_n)
479: -
480: \frac1N
481: \sum_{n=1}^N
482: \lambda
483: (D(x_n),y_n)
484: \right)
485: \le
486: 0
487: \end{equation*}
488: for all continuous prediction rules $D$;
489: this is close to Theorem 1 in \cite{\DFVII}.
490: The advance of this paper as compared to \cite{\DFVII} is that our main results
491: do not assume that $D$ is continuous.
492:
493: \subsection*{Markov-universal prediction strategies: randomized case}
494:
495: When the loss function $\lambda(\gamma,y)$ is not required to be convex in $\gamma$,
496: the conclusion of Theorem \ref{thm:deterministic} may become false
497: (\cite{kalnishkan/vyugin:2005}, Theorem 2).
498: The situation changes if we consider randomized prediction strategies.
499:
500: A \emph{randomized prediction strategy} is a function
501: \begin{equation*}
502: D:
503: \bigcup_{n=1}^{\infty}
504: (\mathbf{X}\times\mathbf{Y})^{n-1}\times\mathbf{X}
505: \to
506: \PPP(\Gamma)
507: \end{equation*}
508: mapping the past to the probability measures on the prediction space.
509: In other words, this is a strategy for Predictor
510: in the extended game of prediction with the prediction space $\PPP(\Gamma)$.
511: A \emph{Markov randomized prediction strategy},
512: or \emph{randomized prediction rule} for brevity,
513: is a function $D:\mathbf{X}\to\PPP(\Gamma)$.
514:
515: We will say that a randomized prediction strategy outputting $\gamma_n$
516: is \emph{Markov-universal} for a loss function $\lambda$ and an approximation structure $\phi$ if,
517: for any randomized prediction rule $D$ and any $m=1,2,\ldots$,
518: there exists $N_{D,m}$ such that,
519: for any sequence $x_{1},y_{1},x_{2},y_{2},\ldots$ of Reality's moves,
520: \begin{equation}\label{eq:dominates-randomized}
521: \sup_{N\ge N_{D,m}}
522: \left(
523: \frac1N
524: \sum_{n=1}^N
525: \lambda(g_{n},y_n)
526: -
527: \frac1N
528: \sum_{n=1}^N
529: \lambda(d_{n},y_n)
530: \right)
531: \le
532: 2^{-m}
533: \end{equation}
534: with probability at least $1-2^{-m}$,
535: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent random variables
536: distributed as
537: \begin{equation}\label{eq:distributed}
538: g_{n}
539: \sim
540: \gamma_n,
541: \enspace
542: d_{n}
543: \sim
544: D(\phi_m(x_n)),
545: \quad
546: n=1,2,\ldots\,.
547: \end{equation}
548: Intuitively,
549: the word ``probability'' after (\ref{eq:dominates-randomized})
550: refers only to the prediction strategies' internal randomization;
551: it is not assumed that Reality behaves stochastically.
552: We will use this definition only in the case
553: where the loss function $\lambda$ is continuous in the prediction,
554: and so (\ref{eq:dominates-randomized}) will indeed be an event
555: having a probability.
556: \begin{theorem}\label{thm:randomized}
557: Suppose the signal space $\mathbf{X}$ is equipped with an approximation structure $\phi$,
558: $\Gamma$ is a separable topological space,
559: and the loss function $\lambda$ is bounded
560: and such that the set of functions $\{\lambda(\cdot,y)\st y\in\mathbf{Y}\}$
561: is equicontinuous.
562: There exists a randomized prediction strategy
563: that is Markov-universal for $\lambda$ and $\phi$.
564: \end{theorem}
A Markov-universal randomized prediction strategy is constructed in Section~\ref{sec:proof-randomized}.
566: The randomized version of (\ref{eq:simpler}),
567: immediately following from Theorem \ref{thm:randomized},
568: is
569: \begin{equation*}
570: \limsup_{N\to\infty}
571: \left(
572: \frac1N
573: \sum_{n=1}^N
574: \lambda
575: (g_n,y_n)
576: -
577: \frac1N
578: \sum_{n=1}^N
579: \lambda
580: (d_n,y_n)
581: \right)
582: \le
583: 0
584: \quad
585: \text{a.s.},
586: \end{equation*}
for all $m=1,2,\ldots$ and all $D:\mathbf{X}\to\PPP(\Gamma)$,
588: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent
589: and distributed as (\ref{eq:distributed}).
590: \ifFULL\bluebegin
If $\mathbf{X}$ is a compact metric space and (\ref{eq:fine})
592: holds uniformly in $x$,
593: one might be able to obtain the following analogue of Theorem 2 in \cite{\DFVII}:
594: for continuous prediction rules $D$,
595: \begin{equation*}
596: \limsup_{N\to\infty}
597: \left(
598: \frac1N
599: \sum_{n=1}^N
600: \lambda
601: (g_n,y_n)
602: -
603: \frac1N
604: \sum_{n=1}^N
605: \lambda
606: (d_n,y_n)
607: \right)
608: \le
609: 0
610: \quad
611: \text{a.s.},
612: \end{equation*}
613: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent
614: and distributed as
615: \begin{equation*}
616: g_{n}
617: \sim
618: \gamma_n,
619: \enspace
620: d_{n}
621: \sim
622: D(x_n),
623: \quad
624: n=1,2,\ldots\,.
625: \end{equation*}
626: \blueend\fi
627:
628: \section{Proof of Theorem \ref{thm:deterministic}}
629: \label{sec:proof-deterministic}
630:
631: Let us fix a dense countable subset $\Gamma^*$ of $\Gamma$.
632: We will say that a function $D:\mathbf{X}\to\Gamma$
633: is \emph{$m$-elementary} if $D(\mathbf{X})\subseteq\Gamma^*$
634: and $D(x)$ depends on $x$ only via $\phi_m(x)$;
635: a function is \emph{elementary} if it is $m$-elementary for some $m$.
636: There are countably many elementary functions;
637: let us enumerate them as $D_1,D_2,\ldots$\,.
638: We will refer to these functions as \emph{experts}.
639: We will apply a special case of Kalnishkan and Vyugin's
640: \cite{kalnishkan/vyugin:2005}
641: Weak Aggregating Algorithm (WAA) to the sequence of experts
642: (as in \cite{\DFVIII}).
643:
644: Let $q_1,q_2,\ldots$ be a sequence of positive numbers summing to 1,
645: $\sum_{k=1}^{\infty}q_k=1$.
646: Define
647: \begin{equation*}
648: l_n^{(k)}
649: :=
650: \lambda
651: \left(
652: D_k(x_n),y_n
653: \right),
654: \quad
655: L_N^{(k)}
656: :=
657: \sum_{n=1}^N
658: l_n^{(k)}
659: \end{equation*}
660: to be the instantaneous loss of the $k$th expert $D_k$ on the $n$th round
661: and his cumulative loss over the first $N$ rounds.
662: For all $n,k=1,2,\ldots$ define
663: \begin{equation*}
664: w_n^{(k)}
665: :=
666: q_k
667: \beta_n^{L_{n-1}^{(k)}},
668: \quad
669: \beta_n
670: :=
671: \exp
672: \left(
673: -\frac{1}{\sqrt{n}}
674: \right)
675: \end{equation*}
676: ($w_n^{(k)}$ are the weights of the experts to use on round $n$)
677: and
678: \begin{equation*}
679: p_n^{(k)}
680: :=
681: \frac
682: {w_n^{(k)}}
683: {\sum_{k=1}^{\infty}w_n^{(k)}}
684: \end{equation*}
685: (the normalized weights;
686: it is obvious that the denominator is positive and finite).
687: The WAA's prediction on round $n$ is
688: \begin{equation}\label{eq:WAA}
689: \gamma_n
690: :=
691: \sum_{k=1}^{\infty}
692: p_n^{(k)}
693: D_k(x_n).
694: \end{equation}
695: To make this series convergent,
696: we may take $q_k:=2^{-k}$ and reorder $D_k$ so that
697: $\sup_x\left\|D_k(x)\right\|\le k$ for all $k$.
698: In this case we will automatically have $\gamma_n\in\Gamma$ since
699: \begin{multline}\label{eq:convergence-to-0}
700: \gamma_n
701: -
702: \sum_{k=1}^K
703: \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}
704: D_k(x_n)\\
705: =
706: \sum_{k=1}^K
707: \left(
708: 1
709: -
710: \frac{1}{\sum_{k=1}^K p_n^{(k)}}
711: \right)
712: p_n^{(k)}
713: D_k(x_n)
714: +
715: \sum_{k=K+1}^{\infty}
716: p_n^{(k)}
717: D_k(x_n)
718: \to
719: 0
720: \end{multline}
721: as $K\to\infty$.
722:
723: Let $l_n:=\lambda(\gamma_n,y_n)$ be the WAA's loss on round $n$
724: and
725: $
726: L_N
727: :=
728: \sum_{n=1}^N
729: l_n
730: $
731: be its cumulative loss over the first $N$ rounds.
732: \begin{lemma}[\cite{kalnishkan/vyugin:2005}, Lemma 9]\label{lem:9}
733: The WAA guarantees that, for all $N=1,2,\ldots$,
734: \begin{equation}\label{eq:lemma9}
735: L_N
736: \le
737: \sum_{n=1}^N
738: \sum_{k=1}^{\infty}
739: p_n^{(k)}
740: l_n^{(k)}
741: -
742: \sum_{n=1}^N
743: \log_{\beta_n}
744: \sum_{k=1}^{\infty}
745: p_n^{(k)}
746: \beta_n^{l_n^{(k)}}
747: +
748: \log_{\beta_N}
749: \sum_{k=1}^{\infty}
750: q_k
751: \beta_N^{L_N^{(k)}}.
752: \end{equation}
753: \end{lemma}
754: The first two terms on the right-hand side of (\ref{eq:lemma9})
755: are sums over the first $N$ rounds of different kinds of mean of the experts' losses
756: (see, e.g., \cite{hardy/etal:1952}, Chapter III,
757: for a general definition of the mean);
758: we will see later that they nearly cancel each other out.
759: If those two terms are ignored,
760: the remaining part of (\ref{eq:lemma9}) is identical
761: (except that $\beta$ now depends on $n$)
762: to the main property of the ``Aggregating Algorithm''
763: (see, e.g., \cite{vovk:2001competitive}, Lemma 1).
764: All infinite series in (\ref{eq:lemma9}) are trivially convergent.
765:
766: In the proof of Lemma \ref{lem:9} we will use the following property
767: of ``countable convexity'' of $\lambda$:
768: \begin{equation}\label{eq:countable-convexity}
769: l_n\le\sum_{k=1}^{\infty}p_n^{(k)}l_n^{(k)}.
770: \end{equation}
771: This property follows from (\ref{eq:convergence-to-0}) and
772: \begin{equation*}
773: \lambda
774: \left(
775: \sum_{k=1}^K
776: \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}
777: D_k(x_n),
778: y_n
779: \right)
780: \le
781: \sum_{k=1}^K
782: \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}
783: \lambda
784: \left(
785: D_k(x_n),
786: y_n
787: \right)
788: \end{equation*}
789: if we let $K\to\infty$.
790:
791: \begin{Proof}{of Lemma \ref{lem:9}}
792: The proof is by induction on $N$.
793: For $N=1$,
794: (\ref{eq:lemma9}) follows from the countable convexity (\ref{eq:countable-convexity})
795: and $p_1^{(k)}=q_k$.
796: Assuming (\ref{eq:lemma9}),
797: we obtain
798: \begin{multline*}
799: L_{N+1}
800: =
801: L_N + l_{N+1}
802: \le
803: L_N
804: +
805: \sum_{k=1}^{\infty}
806: p_{N+1}^{(k)}
807: l_{N+1}^{(k)}\\
808: \le
809: \sum_{n=1}^{N+1}
810: \sum_{k=1}^{\infty}
811: p_n^{(k)}
812: l_n^{(k)}
813: -
814: \sum_{n=1}^N
815: \log_{\beta_n}
816: \sum_{k=1}^{\infty}
817: p_n^{(k)}
818: \beta_n^{l_n^{(k)}}
819: +
820: \log_{\beta_N}
821: \sum_{k=1}^{\infty}
822: q_k
823: \beta_N^{L_N^{(k)}}
824: \end{multline*}
825: (the first ``$\le$'' again used the countable convexity (\ref{eq:countable-convexity})).
826: Therefore,
827: it remains to prove
828: \begin{equation*}
829: \log_{\beta_N}
830: \sum_{k=1}^{\infty}
831: q_k
832: \beta_N^{L_N^{(k)}}
833: \le
834: -\log_{\beta_{N+1}}
835: \sum_{k=1}^{\infty}
836: p_{N+1}^{(k)}
837: \beta_{N+1}^{l_{N+1}^{(k)}}
838: +
839: \log_{\beta_{N+1}}
840: \sum_{k=1}^{\infty}
841: q_k
842: \beta_{N+1}^{L_{N+1}^{(k)}}.
843: \end{equation*}
844: By the definition of $p_n^{(k)}$
845: this can be rewritten as
846: \begin{equation*}
847: \log_{\beta_N}
848: \sum_{k=1}^{\infty}
849: q_k
850: \beta_N^{L_N^{(k)}}
851: \le
852: -\log_{\beta_{N+1}}
853: \frac
854: {
855: \sum_{k=1}^{\infty}
856: q_k
857: \beta_{N+1}^{L_{N}^{(k)}}
858: \beta_{N+1}^{l_{N+1}^{(k)}}
859: }
860: {
861: \sum_{k=1}^{\infty}
862: q_k
863: \beta_{N+1}^{L_{N}^{(k)}}
864: }
865: +
866: \log_{\beta_{N+1}}
867: \sum_{k=1}^{\infty}
868: q_k
869: \beta_{N+1}^{L_{N+1}^{(k)}},
870: \end{equation*}
871: which after cancellation becomes
872: \begin{equation}\label{eq:to-check}
873: \log_{\beta_N}
874: \sum_{k=1}^{\infty}
875: q_k
876: \beta_N^{L_N^{(k)}}
877: \le
878: \log_{\beta_{N+1}}
879: \sum_{k=1}^{\infty}
880: q_k
881: \beta_{N+1}^{L_{N}^{(k)}}.
882: \end{equation}
883: The last inequality follows from the general result
884: about comparison of different means
885: (\cite{hardy/etal:1952}, Theorem 85),
886: but we can also check it directly
887: (following \cite{kalnishkan/vyugin:2005}).
888: Let $\beta_{N+1}=\beta_N^a$,
889: where $0<a<1$.
890: Then (\ref{eq:to-check}) can be rewritten as
891: \begin{equation*}
892: \left(
893: \sum_{k=1}^{\infty}
894: q_k
895: \beta_N^{L_N^{(k)}}
896: \right)^a
897: \ge
898: \sum_{k=1}^{\infty}
899: q_k
900: \beta_{N}^{aL_{N}^{(k)}},
901: \end{equation*}
902: and the last inequality follows from the concavity of the function $t\mapsto t^a$.
903: \qedtext
904: \end{Proof}
905:
906: \begin{lemma}[\cite{kalnishkan/vyugin:2005}, Lemma 5]
907: Let $L$ be an upper bound on $\left|\lambda\right|$.
908: The WAA guarantees that, for all $N$ and $K$,
909: \begin{equation}\label{eq:lemma5}
910: L_N
911: \le
912: L_N^{(K)}
913: +
914: \left(
915: L^2 e^L + \ln\frac{1}{q_K}
916: \right)
917: \sqrt{N}.
918: \end{equation}
919: \end{lemma}
920: \begin{proof}
921: From (\ref{eq:lemma9}),
922: we obtain:
923: \begin{align*}
924: L_N
925: &\le
926: \sum_{n=1}^N
927: \sum_{k=1}^{\infty}
928: p_n^{(k)}
929: l_n^{(k)}
930: +
931: \sum_{n=1}^N
932: \sqrt{n}
933: \ln
934: \sum_{k=1}^{\infty}
935: p_n^{(k)}
936: \exp
937: \left(
938: -\frac{l_n^{(k)}}{\sqrt{n}}
939: \right)
940: +
941: \log_{\beta_N}
942: q_K
943: +
944: L_N^{(K)}\\
945: &\le
946: \sum_{n=1}^N
947: \sum_{k=1}^{\infty}
948: p_n^{(k)}
949: l_n^{(k)}
950: +
951: \sum_{n=1}^N
952: \sqrt{n}
953: \left(
954: \sum_{k=1}^{\infty}
955: p_n^{(k)}
956: \left(
957: 1
958: -
959: \frac{l_n^{(k)}}{\sqrt{n}}
960: +
961: \frac{\left(l_n^{(k)}\right)^2}{2n}
962: e^L
963: \right)
964: -
965: 1
966: \right)\\
967: &\quad{}+
968: \log_{\beta_N}
969: q_K
970: +
971: L_N^{(K)}\\
972: &=
973: L_N^{(K)}
974: +
975: \frac12
976: \sum_{n=1}^N
977: \frac{1}{\sqrt{n}}
978: \sum_{k=1}^{\infty}
979: p_n^{(k)}
980: \left(l_n^{(k)}\right)^2
981: e^L
982: +
983: \sqrt{N}\ln\frac{1}{q_K}\\
984: &\le
985: L_N^{(K)}
986: +
987: \frac{L^2e^L}{2}
988: \sum_{n=1}^N
989: \frac{1}{\sqrt{n}}
990: +
991: \sqrt{N}\ln\frac{1}{q_K}
992: \le
993: L_N^{(K)}
994: +
995: \frac{L^2e^L}{2}
996: \int_0^N
997: \frac{\D t}{\sqrt{t}}
998: +
999: \sqrt{N}\ln\frac{1}{q_K}\\
1000: &=
1001: L_N^{(K)}
1002: +
1003: L^2e^L\sqrt{N}
1004: +
1005: \sqrt{N}\ln\frac{1}{q_K}
1006: \end{align*}
1007: (in the second ``$\le$'' we used the inequalities $e^t\le1+t+\frac{t^2}{2}e^{\left|t\right|}$
1008: and $\ln t\le t-1$).
1009: \qedtext
1010: \end{proof}
1011:
1012: \begin{remark*}
1013: There is no term $e^L$ in \cite{kalnishkan/vyugin:2005}
1014: since that paper only considers non-negative loss functions.
1015: (Notice that even without assuming non-negativity
1016: this term is very crude and can be easily improved.)
1017: \end{remark*}
1018:
1019: Now it is easy to prove Theorem \ref{thm:deterministic}.
1020: The definition of Markov-universality can be restated as follows:
1021: a prediction strategy outputting $\gamma_n$ is Markov-universal
1022: if and only if
1023: for any prediction rule $D$, any $m=1,2,\ldots$,
1024: and any $\epsilon>0$
1025: there exists $N_{D,m,\epsilon}$ such that, for any $N\ge N_{D,m,\epsilon}$
1026: and any $x_1,y_1,x_2,y_2,\ldots$,
1027: \begin{equation}\label{eq:dominates-deterministic-version}
1028: \frac1N
1029: \sum_{n=1}^N
1030: \lambda
1031: (\gamma_n,y_n)
1032: \le
1033: \frac1N
1034: \sum_{n=1}^N
1035: \lambda
1036: \Bigl(
1037: D(\phi_m(x_n)),y_n
1038: \Bigr)
1039: +
1040: \epsilon.
1041: \end{equation}
1042: Let $\gamma_n$ be output by the WAA
1043: and let us consider any prediction rule $D$,
1044: any $m\in\{1,2,\ldots\}$, and any $\epsilon>0$.
1045: Choose $\delta>0$ such that
1046: $\left|\lambda(\gamma,y)-\lambda(\gamma',y)\right|<\epsilon/2$
1047: whenever $\rho(\gamma,\gamma')<\delta$
1048: and choose an $m$-elementary expert $D_K$ such that,
1049: for all $x\in\phi_m(\mathbf{X})$,
1050: $\rho(D(x),D_{K}(x))<\delta$.
1051:
1052: From (\ref{eq:lemma5}) we obtain
1053: \begin{multline}\label{eq:chain}
1054: \frac1N
1055: \sum_{n=1}^N
1056: \lambda(\gamma_n,y_n)
1057: -
1058: \frac1N
1059: \sum_{n=1}^N
1060: \lambda
1061: \Bigl(
1062: D(\phi_m(x_n)),y_n
1063: \Bigr)\\
1064: \le
1065: \frac1N
1066: \sum_{n=1}^N
1067: \lambda(\gamma_n,y_n)
1068: -
1069: \frac1N
1070: \sum_{n=1}^N
1071: \lambda
1072: \Bigl(
1073: D_{K}(\phi_m(x_n)),y_n
1074: \Bigr)
1075: +
1076: \frac{\epsilon}{2}\\
1077: =
1078: \frac1N
1079: \sum_{n=1}^N
1080: \lambda(\gamma_n,y_n)
1081: -
1082: \frac1N
1083: \sum_{n=1}^N
1084: \lambda
1085: \Bigl(
1086: D_{K}(x_n),y_n
1087: \Bigr)
1088: +
1089: \frac{\epsilon}{2}\\
1090: \le
1091: \left(
1092: L^2e^L + \ln\frac{1}{q_{K}}
1093: \right)
1094: \frac{1}{\sqrt{N}}
1095: +
1096: \frac{\epsilon}{2};
1097: \end{multline}
1098: now (\ref{eq:dominates-deterministic-version}) is obvious.
1099:
1100: \section{Proof of Theorem \ref{thm:randomized}}
1101: \label{sec:proof-randomized}
1102:
1103: \ifFULL\bluebegin
1104: Unfortunately,
1105: Theorem \ref{thm:deterministic} cannot be applied
1106: to the extended game of prediction with the prediction space $\PPP(\Gamma)$ directly:
1107: the theorem assumes that $\Gamma$ is a subset of a Banach space,
1108: whereas,
1109: even assuming $\Gamma$ compact,
1110: the dual to an infinite-dimensional Banach space is never even metrizable
1111: in the weak$^*$ topology
1112: (\cite{rudin:1991}, 3.16).
1113: The proof of Theorem \ref{thm:deterministic}, however,
1114: still works for the new game.
1115: \blueend\fi
1116:
1117: A convenient pseudo-metric on $\Gamma$ can be defined by
1118: \begin{equation*}
1119: \rho(g,g')
1120: :=
1121: \sup
1122: \left\{
1123: \left|\lambda(g,y)
1124: -
1125: \lambda(g',y)\right|
1126: \st
1127: y\in\mathbf{Y}
1128: \right\},
1129: \quad
1130: g,g'\in\Gamma
1131: \end{equation*}
1132: (cf.\ \cite{dudley:2002}, Corollary 11.3.4).
1133: Let us redefine $\Gamma$ as the quotient space obtained from the original $\Gamma$
1134: by identifying $g$ and $g'$ for which $\rho(g,g')=0$
1135: (\cite{engelking:1989}, Section 2.4);
1136: in other words,
1137: we will not distinguish predictions that always lead to identical losses.
1138: Now $\rho$ becomes a metric on $\Gamma$.
1139: Let $\Gamma^*$ be a countable dense subset of the original topological space $\Gamma$
1140: (which is separable as a subset of a separable Banach space);
1141: the condition of equicontinuity implies that $\Gamma^*$
1142: (formally defined as the set of equivalence classes
1143: containing elements of the original $\Gamma^*$)
1144: remains a dense subset in $\Gamma$ equipped with the metric $\rho$.
1145: % We can see that $\Gamma$ remains a separable space.
1146:
1147: We define the norm of a function $f:\Gamma\to\bbbr$ as
1148: \begin{equation*}
1149: \left\|f\right\|_{\BL}
1150: :=
1151: \sup_{g,g'\in\Gamma:g\ne g'}
1152: \frac{\left|f(g)-f(g')\right|}{\rho(g,g')}
1153: +
1154: \sup_{g\in\Gamma}
1155: \left|f(g)\right|;
1156: \end{equation*}
1157: this norm is finite for bounded Lipschitz functions
1158: (which form a Banach space under this norm:
1159: see \cite{dudley:2002}, Section 11.2).
1160: Notice that
1161: \begin{equation}\label{eq:BL-for-lambda}
1162: \left\|\lambda\right\|_{\BL}
1163: :=
1164: \sup_{y\in\mathbf{Y}}
1165: \left\|\lambda(\cdot,y)\right\|_{\BL}
1166: <
1167: \infty.
1168: \end{equation}
1169:
1170: Next define
1171: \begin{equation}\label{eq:expected-loss}
1172: \lambda(\gamma,y)
1173: :=
1174: \int_{\Gamma}
1175: \lambda(g,y)
1176: \gamma(\dd g),
1177: \end{equation}
1178: where $\gamma$ is a probability measure on $\Gamma$.
1179: This is the loss function in a new game of prediction
1180: with the prediction space $\PPP(\Gamma)$;
1181: it is linear and, therefore, convex in $\gamma$.
1182: (In general,
1183: the role of randomization in this paper
1184: is to make the loss function convex in the prediction.)
1185:
1186: As a metric on $\PPP(\Gamma)$ we will take the Fortet--Mourier metric
1187: (\cite{dudley:2002}, Section 11.3)
1188: defined as
1189: \begin{equation*}
1190: \beta(\gamma,\gamma')
1191: :=
1192: \sup_{f:\left\|f\right\|_{\BL}\le1}
1193: \left|
1194: \int_{\Gamma}
1195: f
1196: \D
1197: (\gamma-\gamma')
1198: \right|.
1199: \end{equation*}
1200: The topology on $\PPP(\Gamma)$ induced by this metric
1201: is called the \emph{topology of weak convergence}
1202: (\cite{billingsley:1968};
1203: weak convergence is called simply ``convergence'' in \cite{dudley:2002};
1204: for the proof of equivalence of several natural definitions
1205: of the topology of weak convergence,
1206: see \cite{dudley:2002}, Theorem 11.3.3).
1207:
1208: Let us check that the loss function (\ref{eq:expected-loss}) is also
1209: bounded Lipschitz, in the sense of (\ref{eq:BL-for-lambda}):
1210: if $\gamma,\gamma'\in\PPP(\Gamma)$ and $y\in\mathbf{Y}$,
1211: \begin{equation*}
1212: \left|
1213: \lambda(\gamma,y)
1214: -
1215: \lambda(\gamma',y)
1216: \right|
1217: =
1218: \left|
1219: \int_{\Gamma}
1220: \lambda(g,y)
1221: (\gamma-\gamma')
1222: (\dd g)
1223: \right|
1224: \le
1225: \left\|\lambda\right\|_{\BL}
1226: \beta(\gamma,\gamma').
1227: \end{equation*}
1228:
1229: It is easy to see that the space $\PPP(\Gamma)$ with metric $\beta$ is separable:
1230: e.g., the set of probability measures concentrated on finite subsets of $\Gamma^*$
1231: and taking rational values is dense in $\PPP(\Gamma)$
1232: (cf.\ \cite{billingsley:1968}, Appendix III).
1233: Let us enumerate the elements of a dense countable set in $\PPP(\Gamma)$
1234: as $D_1,D_2,\ldots$;
1235: as in the previous section,
1236: we will use the WAA to merge all \emph{experts} $D_k$.
1237:
1238: The convergence of the mixture (\ref{eq:WAA}) to a probability measure on $\Gamma$
1239: is now obvious.
1240: The countable convexity (\ref{eq:countable-convexity})
1241: now holds with equality,
1242: \begin{equation*}
1243: \lambda
1244: \left(
1245: \sum_{k=1}^{\infty}
1246: p_n^{(k)}
1247: D_k(x_n),
1248: y_n
1249: \right)
1250: =
1251: \sum_{k=1}^{\infty}
1252: p_n^{(k)}
1253: \lambda
1254: \left(
1255: D_k(x_n),
1256: y_n
1257: \right),
1258: \end{equation*}
1259: and follows from the general fact that
1260: \begin{equation*}
1261: \int f \D \Bigl(\sum_{k=1}^{\infty} p_k P_k\Bigr)
1262: =
1263: \sum_{k=1}^{\infty}
1264: p_k
1265: \int f \D P_k
1266: \end{equation*}
1267: for bounded Borel $f:\Gamma\to\bbbr$,
1268: positive $p_1,p_2,\ldots$ summing to $1$,
1269: and $P_1,P_2,\ldots\in\PPP(\Gamma)$
1270: (this is obviously true for simple $f$
1271: and follows for arbitrary integrable $f$ from the definition of Lebesgue integral:
1272: see, e.g., \cite{dudley:2002}, Section 4.1).
1273:
1274: Therefore, it is easy to check
1275: that the chain (\ref{eq:chain}) still works
1276: (with $\PPP(\Gamma)$ equipped with metric $\beta$)
1277: and we can rephrase the previous section's result as follows.
1278: For any randomized prediction rule $D$, any $m=1,2,\ldots$,
1279: and any $\epsilon>0$
1280: there exists $N_{D,m,\epsilon}$ such that, for any $N\ge N_{D,m,\epsilon}$
1281: and any $x_1,y_1,x_2,y_2,\ldots$,
1282: the WAA's predictions $\gamma_n\in\PPP(\Gamma)$
1283: are guaranteed to satisfy
1284: \begin{equation}\label{eq:mean}
1285: \frac1N
1286: \sum_{n=1}^N
1287: \lambda
1288: (\gamma_n,y_n)
1289: \le
1290: \frac1N
1291: \sum_{n=1}^N
1292: \lambda
1293: \Bigl(
1294: D(\phi_m(x_n)),y_n
1295: \Bigr)
1296: +
1297: \frac{\epsilon}{2}
1298: \end{equation}
1299: (cf.\ (\ref{eq:dominates-deterministic-version})).
1300:
1301: The loss function is bounded in absolute value
1302: by a constant $L$,
1303: and so the law of the iterated logarithm
1304: (in Kolmogorov's finitary form,
1305: \cite{kolmogorov:1929}, the end of the introductory section;
1306: the condition that the cumulative variance tends to infinity
1307: is easy to get rid of:
1308: see, e.g., \cite{shafer/vovk:2001}, (5.8))
1309: implies that for any $\delta>0$ there exists $N_{\delta}$
1310: such that the conjunction of
1311: \begin{equation*}
1312: \sup_{N\ge N_{\delta}}
1313: \left|
1314: \sum_{n=1}^N
1315: \bigl(
1316: \lambda(g_n,y_n)
1317: -
1318: \lambda(\gamma_n,y_n)
1319: \bigr)
1320: \right|
1321: \le
1322: \sqrt{2.01 L^2 N\ln\ln N}
1323: \end{equation*}
1324: and
1325: \begin{equation*}
1326: \sup_{N\ge N_{\delta}}
1327: \left|
1328: \sum_{n=1}^N
1329: \bigl(
1330: \lambda(d_n,y_n)
1331: -
1332: \lambda(D(\phi_m(x_n)),y_n)
1333: \bigr)
1334: \right|
1335: \le
1336: \sqrt{2.01 L^2 N\ln\ln N}
1337: \end{equation*}
1338: holds with probability at least $1-\delta$.
1339: Combining the last two inequalities with (\ref{eq:mean})
1340: we can see that for any randomized prediction rule $D$, any $m=1,2,\ldots$,
1341: any $\epsilon>0$, and any $\delta>0$
1342: there exists $N_{D,m,\epsilon,\delta}$ such that,
1343: for any $x_1,y_1,x_2,y_2,\ldots$,
1344: the WAA's responses $\gamma_n\in\PPP(\Gamma)$ to $x_1,y_1,x_2,y_2,\ldots$
1345: are guaranteed to satisfy
1346: \begin{equation*}
1347: \sup_{N\ge N_{D,m,\epsilon,\delta}}
1348: \left(
1349: \frac1N
1350: \sum_{n=1}^N
1351: \lambda(g_n,y_n)
1352: -
1353: \frac1N
1354: \sum_{n=1}^N
1355: \lambda(d_n,y_n)
1356: \right)
1357: \le
1358: \epsilon
1359: \end{equation*}
1360: with probability at least $1-\delta$.
1361: This is equivalent to the WAA (applied to $D_1,D_2,\ldots$)
1362: being a Markov-universal randomized prediction strategy.
1363:
1364: \section{Conclusion}
1365: \label{sec:conclusion}
1366:
1367: An interesting theoretical problem
1368: is to state more explicit versions
1369: of Theorems \ref{thm:deterministic} and \ref{thm:randomized}:
1370: for example,
1371: to give an explicit expression for $N_{D,m}$.
1372:
1373: The field of lossy compression is now well developed,
1374: and it would be interesting to apply our prediction algorithms
1375: (perhaps with the Weak Aggregating Algorithm replaced
1376: by an algorithm based on, say, gradient descent \cite{cesabianchi/lugosi:2006}
1377: or defensive forecasting \cite{\DFVII})
1378: to the approximation structures induced by popular lossy compression algorithms.
1379:
1380: \subsection*{Acknowledgments}
1381:
1382: This work was partially supported by MRC (grant S505/65).
1383:
1384: \begin{thebibliography}{10}
1385: \bibitem{billingsley:1968}
1386: Patrick Billingsley.
1387: \newblock {\em Convergence of Probability Measures}.
1388: \newblock Wiley, New York, 1968.
1389:
1390: \bibitem{cesabianchi/lugosi:2006}
1391: Nicol\`o Cesa-Bianchi and G\'abor Lugosi.
1392: \newblock {\em Prediction, Learning, and Games}.
1393: \newblock Cambridge University Press, Cambridge, 2006.
1394:
1395: \bibitem{dudley:2002}
1396: Richard~M. Dudley.
1397: \newblock {\em Real Analysis and Probability}, volume~74 of {\em Cambridge
1398: Studies in Advanced Mathematics}.
1399: \newblock Cambridge University Press, Cambridge, England, revised edition,
1400: 2002.
1401:
1402: \bibitem{engelking:1989}
1403: Ryszard Engelking.
1404: \newblock {\em General Topology}, volume~6 of {\em Sigma Series in Pure
1405: Mathematics}.
1406: \newblock Heldermann, Berlin, second edition, 1989.
1407:
1408: \bibitem{hardy/etal:1952}
1409: G\DOT{}~H\DOT{} Hardy, John~E\DOT{} Littlewood, and George P\'olya.
1410: \newblock {\em Inequalities}.
1411: \newblock Cambridge University Press, Cambridge, second edition, 1952.
1412:
1413: \bibitem{kalnishkan/vyugin:2005}
1414: Yuri Kalnishkan and Michael~V\DOT{} Vyugin.
1415: \newblock The {W}eak {A}ggregating {A}lgorithm and weak mixability.
1416: \newblock In Peter Auer and Ron Meir, editors, {\em Proceedings of the
1417: Eighteenth Annual Conference on Learning Theory}, volume 3559 of {\em Lecture
1418: Notes in Computer Science}, pages 188--203, Berlin, 2005. Springer.
1419: \newblock The journal version is being prepared for the Special Issue of
1420: \emph{Journal of Machine Learning Research} devoted to COLT'2005; all
1421: references are to the journal version.
1422:
1423: \bibitem{kolmogorov:1929}
1424: Andrei~N\DOT{} Kolmogorov.
1425: \newblock {\"U}ber das {G}esetz des iterierten {L}ogarithmus.
1426: \newblock {\em Mathematische Annalen}, 101:126--135, 1929.
1427:
1428: \bibitem{kolmogorov:1931}
1429: Andrei~N\DOT{} Kolmogorov.
1430: \newblock {\"U}ber die analytischen {M}ethoden in der
1431: {W}ahrscheinlichkeitsrechnung.
1432: \newblock {\em Mathematische Annalen}, 104:415--458, 1931.
1433:
1434: \bibitem{kolmogorov/tikhomirov:1959latin}
1435: Andrei~N\DOT{} Kolmogorov and Vladimir~M\DOT{} Tikhomirov.
1436: \newblock $\epsilon$-entropy and $\epsilon$-capacity of sets in functional
1437: spaces (in {R}ussian).
1438: \newblock {\em Uspekhi Matematicheskikh Nauk}, 14(2):3--86, 1959.
1439:
1440: \bibitem{shafer/vovk:2001}
1441: Glenn Shafer and \Vladimir{} Vovk.
1442: \newblock {\em Probability and Finance: It's Only a Game!}
1443: \newblock Wiley, New York, 2001.
1444:
1445: \bibitem{shiryaev:1989latin}
1446: Albert~N\DOT{} Shiryaev.
1447: \newblock Kolmogorov: life and creative activities.
1448: \newblock {\em Annals of Probability}, 17:866--944, 1989.
1449:
1450: \bibitem{tikhomirov:1987latin}
1451: Vladimir~M\DOT{} Tikhomirov.
1452: \newblock $\epsilon$-entropy and $\epsilon$-capacity (in {R}ussian).
1453: \newblock In Yury~V\DOT{} Prokhorov and Albert~N\DOT{} Shiryaev, editors, {\em
1454: Kolmogorov. Teoriya In\-for\-ma\-tsii i Teoriya Algoritmov}, pages 262--269.
1455: Nauka, Moscow, 1987.
1456:
1457: \bibitem{vovk:2001competitive}
1458: Vladimir Vovk.
1459: \newblock Competitive on-line statistics.
1460: \newblock {\em International Statistical Review}, 69:213--248, 2001.
1461:
1462: \bibitem{DF08arXiv}
1463: \Vladimir{} Vovk.
1464: \newblock Competing with stationary prediction strategies.
1465: \newblock Technical Report \texttt{arXiv:cs.LG/0607067}, \texttt{arXiv.org}
1466: e-Print archive, July 2006.
1467:
1468: \bibitem{DF07arXiv}
1469: \Vladimir{} Vovk.
1470: \newblock Predictions as statements and decisions.
1471: \newblock Technical Report \texttt{arXiv:cs.LG/0606093}, \texttt{arXiv.org}
1472: e-Print archive, June 2006.
1473:
1474: \end{thebibliography}
1475: \end{document}
1476: