cs0607067/cs0607067
1: % Last changed: 13 Jul 2006
2: % Spell checked: 13 Jul 2006
3: % 2120 lines, 68 KB
4: \newif\ifJOURNAL
5: \JOURNALfalse
6: \newif\ifCONF
7: \CONFfalse
8: \newif\ifarXiv
9: \arXivfalse
10: \newif\ifWP
11: \WPfalse
12: \newif\ifFULL
13: \FULLfalse
14: 
15: \newif\ifLATIN
16: \LATINfalse
17: 
18: %\JOURNALtrue		% choose JOURNAL, arXiv, WP, or FULL
19: %\CONFtrue
20: \arXivtrue
21: %\WPtrue
22: %\FULLtrue		% this version is not for publication and contains extra remarks and questions
23: 
24: %\LATINtrue		% LATIN means that the Cyrillic references should be set in Latin
25: \ifarXiv\LATINtrue\fi	% for submitting to arXiv
26: 
27: \newif\ifnotJOURNAL	% derivative conditional
28: \notJOURNALtrue
29: \ifJOURNAL\notJOURNALfalse\fi
30: 
31: \newif\ifnotarXiv	% derivative conditional
32: \notarXivtrue
33: \ifarXiv\notarXivfalse\fi
34: 
35: \newif\ifTR		% derivative conditionals (TR = arXiv or WP)
36: \TRfalse
37: \ifarXiv\TRtrue\fi
38: \ifWP\TRtrue\fi
39: \newif\ifnotTR
40: \notTRtrue
41: \ifarXiv\notTRfalse\fi
42: \ifWP\notTRfalse\fi
43: 
44: \newif\ifnotLATIN	% derivative conditional
45: \notLATINtrue
46: \ifLATIN\notLATINfalse\fi
47: 
48: \ifJOURNAL
49:   \newcommand{\GTPVII}{vovk/shafer:2005RSS}
50:   \newcommand{\GTPVIII}{vovk/etal:2005AIStatslocal}
51:   \newcommand{\GTPX}{vovk/etal:2005ALT}
52:   \newcommand{\GTPXI}{GTP11arXiv-local}
53:   \newcommand{\GTPXIII}{vovk:2005ALT-GTP13}
54:   \newcommand{\GTPXIV}{vovk:2005ALT-GTP14}
55:   \newcommand{\GTPXVI}{GTP16arXiv-local}
56: \fi
57: \ifarXiv
58:   \newcommand{\GTPVII}{GTP7}
59:   \newcommand{\GTPVIII}{GTP8arXiv}
60:   \newcommand{\GTPX}{GTP10arXiv}
61:   \newcommand{\GTPXI}{GTP11arXiv}
62:   \newcommand{\GTPXIII}{GTP13arXiv}
63:   \newcommand{\GTPXIV}{GTP14arXiv}
64:   \newcommand{\GTPXVII}{GTP17arXiv}
65: \fi
66: \ifWP
67:   \newcommand{\GTPVII}{GTP7}
68:   \newcommand{\GTPVIII}{GTP8}
69:   \newcommand{\GTPX}{GTP10}
70:   \newcommand{\GTPXI}{GTP11}
71:   \newcommand{\GTPXIII}{GTP13}
72:   \newcommand{\GTPXIV}{GTP14}
73:   \newcommand{\GTPXVII}{GTP17}
74: \fi
75: \ifFULL
76:   \newcommand{\GTPVII}{GTP7}
77:   \newcommand{\GTPVIII}{GTP8arXiv}
78:   \newcommand{\GTPX}{GTP10arXiv}
79:   \newcommand{\GTPXI}{GTP11arXiv}
80:   \newcommand{\GTPXIII}{GTP13arXiv}
81:   \newcommand{\GTPXIV}{GTP14arXiv}
82: \fi
83: 
84: \ifnotLATIN
85:   \newcommand{\KolmogorovTikhomirov}{kolmogorov/tikhomirov:1959}
86:   \newcommand{\KolmogorovCRfull}{kolmogorov:1941CR}
87:   \newcommand{\KolmogorovStationary}{kolmogorov:1941}
88: \fi
89: \ifLATIN
90:   \newcommand{\KolmogorovTikhomirov}{kolmogorov/tikhomirov:1959latin}
91:   \newcommand{\KolmogorovCRfull}{kolmogorov:1941CR-latin}
92:   \newcommand{\KolmogorovStationary}{kolmogorov:1941-latin}
93: \fi
94: 
95: \ifJOURNAL
96: \documentclass[toc]{article}
97: \usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}
98: \newcommand{\Extra}[1]{}
99: \fi
100: 
101: \ifCONF
102: \documentclass[toc]{article}
103: \usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}
104: \newcommand{\Extra}[1]{}
105: \fi
106: 
107: \ifarXiv
108: \documentclass[toc]{article}
109: \usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}
110: \newcommand{\Extra}[1]{}
111: \fi
112: 
113: \ifWP
114: \documentclass[toc]{gtarticle}
115: \usepackage{amsmath,amsfonts,amssymb,latexsym,epsfig,graphicx}
116: \providecommand{\Extra}[1]{}\renewcommand{\Extra}[1]{#1} % working-paper version prints the \Extra text unchanged; \providecommand first, so that \renewcommand cannot fail if gtarticle does not already define \Extra (class not visible here)
117: \fi
118: 
119: \ifFULL
120: \documentclass{article}
121: \usepackage{amsmath,amsfonts,amssymb,latexsym,color,epigraph,graphicx,eepic}
122: \newcommand{\Extra}[1]{\red{#1}}
123: \newcommand{\red}[1]{\textcolor{red}{#1}}
124: \newcommand{\blue}[1]{\textcolor{blue}{#1}}
125: \newcommand{\bluebegin}{\begingroup\color{blue}}
126: \newcommand{\blueend}{\endgroup}
127: \newcommand{\redbegin}{\begingroup\color{red}}
128: \newcommand{\redend}{\endgroup}
129: \fi
130: 
131: \emergencystretch=5mm
132: \tolerance=400
133: \allowdisplaybreaks[4]
134: 
135: \newcommand{\Vladimir}{Vladimir}
136: \newcommand{\DOT}{.}
137: 
138: \ifnotLATIN
139: \input{OT2enc.def}
140: \newenvironment{cyr}
141: {\fontencoding{OT2}\fontfamily{wncyr}\fontseries{m}\fontshape{n}\selectfont}
142: {\fontencoding{OT1}\fontfamily{tir}\selectfont}
143: \usepackage{CJK}
144: \fi
145: 
146: \newcommand{\st}{\mathrel{\!|\!}}
147: \newcommand{\givn}{\mathrel{|}}
148: \newcommand{\D}{\,\mathrm{d}}
149: \newcommand{\dd}{\mathrm{d}}
150: 
151: \newcommand{\K}{\mathcal{K}}		% capital
152: \newcommand{\kkk}{\mathbf{k}}		% kernel
153: \newcommand{\ccc}{\mathbf{c}}		% constant
154: \newcommand{\III}{\mathbb{I}}
155: \newcommand{\CCC}{\mathcal{C}}		% class of prediction rules
156: \newcommand{\FFF}{\mathcal{F}}		% function space
157: \newcommand{\GGG}{\mathcal{G}}		% function space
158: \newcommand{\HHH}{\mathcal{H}}		% Hilbert space
159: \newcommand{\PPP}{\mathcal{P}}		% all probability measures
160: \newcommand{\SSS}{\mathcal{S}}		% Sobolev space
161: 
162: \newcommand{\Int}{\mathop{\mathrm{Int}}\nolimits}
163: 
164: \newcommand{\bbbp}{\mathbb{P}}		% auxiliary (probability)
165: \newcommand{\Prob}{\mathop{\bbbp}\nolimits}
166: \newcommand{\bbbe}{\mathbb{E}}		% auxiliary (expectation)
167: \newcommand{\Expect}{\mathop{\bbbe}\nolimits}
168: 
169: \newcommand{\cpc}{\mathop{\mathrm{cpc}}\nolimits}
170: \newcommand{\ucpc}{\mathop{\overline{\mathstrut\mathrm{cpc}}}\nolimits}
171: \newcommand{\lcpc}{\mathop{\underline{\mathstrut\mathrm{cpc}}}\nolimits}
172: 
173: \newcommand{\bbbr}{\mathbb{R}}		% the real numbers
174: 
175: \newtheorem{lemma}{Lemma}
176: \newtheorem{proposition}{Proposition}
177: \newtheorem{corollary}{Corollary}
178: \newtheorem{remark}{Remark}
179: \newtheorem{theorem}{Theorem}
180: \newenvironment{proof}
181:   {\trivlist\item[\hskip\labelsep\textbf{Proof}]}
182:   {\endtrivlist}
183: 
184: \newenvironment{Proof}[1]
185:   {\trivlist\item[\hskip\labelsep\textbf{Proof #1\,}]}
186:   {\endtrivlist}
187: \newcommand{\boxforqed}{\rule{.3em}{1.5ex}}
188: \newcommand{\qedtext}{\begingroup\unskip\nobreak\hfil % end-of-proof mark for text paragraphs: push \boxforqed flush right on the last line,
189:   \penalty50\hskip1em\null\nobreak\hfil\boxforqed % or onto a line of its own (via \penalty50) when it does not fit
190:   \parfillskip=0pt\finalhyphendemerits=0\endgraf\endgroup} % group keeps the \parfillskip and \finalhyphendemerits changes local to this one paragraph
191: %\newcommand{\qedmath}{\eqno\boxforqed}
192: \newcommand{\qedmath}{\tag*{\boxforqed}}
193: \newenvironment{remark*}
194:   {\trivlist\item[\hskip\labelsep{\bfseries Remark}]\relax}
195:   {\endtrivlist}
196: 
197: \ifJOURNAL
198: \title{Competing with stationary prediction strategies}
199: \author{Vladimir Vovk\\[5mm]
200:  Computer Learning Research Centre\\
201:   Department of Computer Science\\
202:   Royal Holloway, University of London,
203:   Egham, Surrey TW20 0EX, UK\\
204:   \texttt{vovk@cs.rhul.ac.uk}}
205: \fi
206: 
207: \ifCONF
208: \title{Competing with stationary prediction strategies}
209: \author{Vladimir Vovk\\[5mm]
210:  Computer Learning Research Centre\\
211:   Department of Computer Science\\
212:   Royal Holloway, University of London,
213:   Egham, Surrey TW20 0EX, UK\\
214:   \texttt{vovk@cs.rhul.ac.uk}}
215: \fi
216: 
217: \ifarXiv
218: \title{Competing with stationary prediction strategies}%\\(draft: comments welcome)}
219: \author{Vladimir Vovk\\
220: \texttt{vovk{\rm@}cs.rhul.ac.uk}\\
221: \texttt{http://vovk.net}}
222: \fi
223: 
224: \ifWP
225: \title{Competing with stationary prediction strategies}
226: \author{Vladimir Vovk}
227: \newcommand{\No}{18}
228: % For the two dates option: uncomment the next 2 lines
229: % \twodatestrue
230: % \newcommand{\firstposted}{July 13, 2006}
231: \fi
232: 
233: \ifFULL
234: \title{Competing with stationary prediction strategies}
235: \author{Vladimir Vovk\\
236: \texttt{vovk{\rm@}cs.rhul.ac.uk}\\
237: \texttt{http://vovk.net}}
238: \fi
239: 
240: \begin{document}
241: \maketitle
242: \begin{abstract}
243:   In this paper we introduce the class of stationary prediction strategies
244:   and construct a prediction algorithm
245:   that asymptotically performs as well as the best continuous stationary strategy.
246:   We make mild compactness assumptions but no stochastic assumptions
247:   about the environment.
248:   In particular,
249:   no assumption of stationarity is made about the environment,
250:   and the stationarity of the considered strategies
251:   only means that they do not depend explicitly on time;
252:   we argue that it is natural to consider only stationary strategies
253:   even for highly non-stationary environments.
254: \end{abstract}
255: 
256: \section{Introduction}
257: \label{sec:introduction}
258: 
259: This paper belongs to the area of learning theory
260: that has been variously referred to as prediction with expert advice,
261: competitive on-line prediction,
262: prediction of individual sequences,
263: and universal on-line learning;
264: see \cite{cesabianchi/lugosi:2006} for a review.
265: There are many proof techniques known in this field;
266: this paper is based on Kalnishkan and Vyugin's Weak Aggregating Algorithm
267: \cite{kalnishkan/vyugin:2005},
268: but it is possible that some of the numerous other techniques
269: could be used instead.
270: 
271: In Section \ref{sec:results} we give the main definitions
272: and state our main results, Theorems \ref{thm:deterministic-compact}--\ref{thm:randomized};
273: their proofs are given
274: in Sections \ref{sec:proof-deterministic-compact}--\ref{sec:proof-randomized}.
275: In Section \ref{sec:stationarity}
276: we informally discuss the notion of stationarity,
277: and Section \ref{sec:conclusion} concludes.
278: 
279: \section{Main results}
280: \label{sec:results}
281: 
282: The \emph{game of prediction} between Predictor and Reality
283: is played according to the following protocol
284: (of \emph{perfect information},
285: in the sense that either player can see the other player's moves made so far).
286: 
287: \bigskip
288: 
289: \noindent
290: \textsc{Prediction protocol}\nopagebreak
291: \begin{tabbing}
292:   \qquad\=\qquad\=\qquad\kill
293:   Reality announces $(\ldots,x_{-1},y_{-1},x_0,y_0)\in(\mathbf{X}\times\mathbf{Y})^{\infty}$.\\
294:   FOR $n=1,2,\dots$:\\
295:   \> Reality announces $x_n\in\mathbf{X}$.\\
296:   \> Predictor announces $\gamma_n\in\Gamma$.\\
297:   \> Reality announces $y_n\in\mathbf{Y}$.\\
298:   END FOR.
299: \end{tabbing}
300: 
301: \noindent
302: After Reality's first move the game proceeds in rounds numbered by the positive integers $n$.
303: At the beginning of each round $n=1,2,\ldots$ Predictor is given some signal $x_n$
304: relevant to predicting the following observation $y_n$.
305: The signal is taken from the \emph{signal space} $\mathbf{X}$
306: and the observations from the \emph{observation space} $\mathbf{Y}$.
307: Predictor then announces his prediction $\gamma_n$,
308: taken from the \emph{prediction space} $\Gamma$,
309: and the prediction's quality in light of the actual observation
310: is measured by a \emph{loss function}
311: $\lambda:\Gamma\times\mathbf{Y}\to\bbbr$.
312: % (The prediction protocol with a fixed loss function
313: % will sometimes be referred to as a \emph{prediction game},
314: % or \emph{game of prediction}.)
315: At the beginning of the game Reality chooses the infinite past,
316: $(x_n,y_n)$ for all $n\le0$.
317: 
318: In the games of prediction traditionally considered in machine learning
319: there is no infinite past.
320: This situation is modeled in our framework by extending the signal space and observation space
321: by new elements ${?}\in\mathbf{X}$ and ${?}\in\mathbf{Y}$,
322: defining $\lambda(\gamma,{?})$ arbitrarily,
323: and making Reality announce the infinite past
324: $(\ldots,x_{-1},y_{-1},x_0,y_0)=(\ldots,{?},{?},{?},{?})$
325: and refrain from announcing $x_n={?}$ or $y_n={?}$ afterwards
326: (intuitively, $?$ corresponds to ``no feedback from Reality'').
327: 
328: We will always assume that the signal space $\mathbf{X}$,
329: the prediction space $\Gamma$,
330: and the observation space $\mathbf{Y}$
331: are non-empty topological spaces
332: and that the loss function $\lambda$ is continuous.
333: Moreover,
334: we are mainly interested in the case
335: where $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$ are locally compact metric spaces,
336: the prime examples being Euclidean spaces and their open and closed subsets.
337: Our first results will be stated for the case
338: where all three spaces $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$ are compact.
339: 
340: \begin{remark*}
341:   Our results can be easily extended to the case
342:   where the loss on the $n$th round is allowed to depend,
343:   in addition to $\gamma_n$ and $y_n$,
344:   on the past $\ldots,x_{n-1},y_{n-1},x_n$.
345:   This would, however, complicate the notation.
346: \end{remark*}
347: 
348: Predictor's strategies in the prediction protocol will be called
349: \emph{prediction strategies}
350: (or \emph{prediction algorithms},
351: when they are defined explicitly and we want to emphasize this).
352: Mathematically such a strategy is a function
353: $D:(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}\times\{1,2,\ldots\}\to\Gamma$;
354: it maps each history $(\ldots,x_{n-1},y_{n-1},x_n)$
355: and the current time $n$ to the chosen prediction.
356: In this paper we will only be interested in continuous prediction strategies $D$
357: (according to the traditional point of view \cite{martin-lof:1970},
358: going back to Brouwer,
359: only continuous prediction strategies can be computable;
360: although it should be mentioned that nowadays
361: there are influential definitions of computability
362: \cite{blum/etal:1989,blum/etal:1998}
363: not requiring continuity).
364: An especially natural class of strategies
365: is formed by the \emph{stationary prediction strategies}
366: $D:(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}\to\Gamma$,
367: which do not depend on time explicitly;
368: since the origin of time is usually chosen arbitrarily,
369: this appears a reasonable restriction
370: (see Section \ref{sec:stationarity} for a further discussion).
371: 
372: \subsection*{Universal prediction strategies: compact deterministic case}
373: 
374: In this and the next subsection we will assume that the spaces $\mathbf{X},\Gamma,\mathbf{Y}$
375: are all compact.
376: A prediction strategy is \emph{CS universal} for a loss function $\lambda$ if
377: its predictions $\gamma_n$ satisfy
378: \begin{equation}\label{eq:dominates-deterministic-compact}
379:   \limsup_{N\to\infty}
380:   \Biggl(
381:     \frac1N
382:     \sum_{n=1}^N
383:     \lambda
384:     (\gamma_n,y_n)
385:     {}-
386:     \frac1N
387:     \sum_{n=1}^N
388:     \lambda
389:     \bigl(
390:       D(\ldots,x_{n-1},y_{n-1},x_n),y_n
391:     \bigr)
392:   \Biggr)
393:   \le
394:   0
395: \end{equation}
396: for any continuous stationary prediction strategy $D$
397: and any biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$\,.
398: (``CS'' refers to the continuity and stationarity of the prediction strategies
399: we are competing with.)
400: \begin{theorem}\label{thm:deterministic-compact}
401:   Suppose $\mathbf{X}$ and $\mathbf{Y}$ are compact metric spaces,
402:   $\Gamma$ is a compact convex subset of a Banach space,
403:   and the loss function $\lambda(\gamma,y)$ is continuous in $(\gamma,y)$
404:   and convex in the variable $\gamma\in\Gamma$.
405:   There exists a CS universal prediction algorithm.
406: \end{theorem}
407: A CS universal prediction algorithm will be constructed in the next section.
408: 
409: \subsection*{Universal prediction strategies: compact randomized case}
410: 
411: When the loss function $\lambda(\gamma,y)$ is not convex in $\gamma$,
412: two difficulties appear:
413: \begin{itemize}
414: \item
415:   the conclusion of Theorem \ref{thm:deterministic-compact} becomes false
416:   if the convexity requirement is removed
417:   (\cite{kalnishkan/vyugin:2005}, Theorem 2);
418: \item
419:   in some cases the notion of a continuous prediction strategy becomes vacuous:
420:   e.g., there are no non-constant continuous stationary prediction strategies
421:   when $\Gamma=\{0,1\}$
422:   and $(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}$ is connected
423:   (the latter condition is equivalent to $\mathbf{X}$ and $\mathbf{Y}$
424:   being connected---see \cite{engelking:1989}, Theorem 6.1.15).
425: \end{itemize}
426: To overcome these difficulties,
427: we consider randomized prediction strategies.
428: The proof of Theorem \ref{thm:deterministic-compact}
429: will give a universal, in a natural sense,
430: randomized prediction algorithm;
431: on the other hand,
432: there will be a vast supply of continuous stationary prediction strategies.
433: 
434: \begin{remark*}
435:   In fact,
436:   the second difficulty is more apparent than real:
437:   for example, in the binary case ($\mathbf{Y}=\{0,1\}$)
438:   there are many non-trivial continuous prediction strategies
439:   in the canonical form of the prediction game \cite{vovk:1990}
440:   with the prediction space redefined as the boundary of the set of superpredictions
441:   \cite{kalnishkan/vyugin:2005}.
442: \end{remark*}
443: 
444: A \emph{randomized prediction strategy} is a function
445: $D:(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}\times\{1,2,\ldots\}\to\PPP(\Gamma)$
446: mapping the past complemented by the current time
447: to the probability measures on the prediction space;
448: $\PPP(\Gamma)$ is always equipped with the topology of weak convergence
449: (\cite{billingsley:1968};
450: this topology is also discussed, in the compact case,
451: in Section \ref{sec:proof-randomized-compact} below).
452: In other words, this is a prediction strategy
453: in the extended game of prediction with the prediction space $\PPP(\Gamma)$.
454: Analogously,
455: a \emph{stationary randomized prediction strategy} is a function
456: $D:(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}\to\PPP(\Gamma)$.
457: 
458: Let us say that a randomized prediction strategy outputting $\gamma_n$
459: is \emph{CS universal} for a loss function $\lambda$ if,
460: for any continuous stationary randomized prediction strategy $D$
461: and any biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$,
462: \begin{equation}\label{eq:dominates-randomized-compact}
463:   \limsup_{N\to\infty}
464:   \left(
465:     \frac1N
466:     \sum_{n=1}^N
467:     \lambda(g_{n},y_n)
468:     -
469:     \frac1N
470:     \sum_{n=1}^N
471:     \lambda(d_{n},y_n)
472:   \right)
473:   \le
474:   0
475:   \enspace
476:   \textrm{a.s.},
477: \end{equation}
478: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent random variables
479: distributed as
480: \begin{align}
481:   g_{n}
482:   &\sim
483:   \gamma_n\label{eq:distributed-1},\\
484:   d_{n}
485:   &\sim
486:   D(\ldots,x_{n-1},y_{n-1},x_n),\label{eq:distributed-2}
487: \end{align}
488: $n=1,2,\ldots$\,.
489: Intuitively,
490: the ``a.s.''\ in (\ref{eq:dominates-randomized-compact})
491: refers to the prediction strategies' internal randomization.
492: \begin{theorem}\label{thm:randomized-compact}
493:   Let $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$ be compact metric spaces
494:   and $\lambda$ be a continuous loss function.
495:   There exists a CS universal randomized prediction algorithm.
496: \end{theorem}
497: 
498: \ifFULL\bluebegin
499:   Let $\Sigma:=(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}$ be a metric space.
500:   For any discrete (e.g., finite) subset $\{\sigma_1,\sigma_2,\ldots\}$ of $\Sigma$
501:   and any sequence $\gamma_n\in\PPP(\Gamma)$ of probability measures on $\Gamma$
502:   there exists a continuous stationary randomized prediction strategy $D$
503:   such that $D(\sigma_n)=\gamma_n$ for all $n$
504:   (indeed, it suffices to set $D(\sigma):=\sum_n\phi_n(\sigma)\gamma_n$,
505:   where $\phi_n:\Sigma\to[0,1]$, $n=1,2,\ldots$,
506:   are continuous functions with disjoint supports
507:   such that $\phi_n(\sigma_n)=1$ for all $n$).
508:   Therefore, there is no shortage of continuous stationary randomized prediction strategies.
509: \blueend\fi
510: 
511: \subsection*{Simple reductions to the compact case}
512: 
513: In the following two subsections we will discuss the case
514: where the signal, prediction, and observation spaces
515: are not required to be compact.
516: The goal of this subsection is to show that the compact case
517: is not as special as it may seem,
518: as far as Theorem \ref{thm:randomized-compact} is concerned.
519: The rest of the paper does not depend on this subsection.
520: 
521: In general,
522: we might consider $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$
523: together with their fixed compactifications
524: $\overline{\mathbf{X}}$, $\overline{\Gamma}$, and $\overline{\mathbf{Y}}$
525: (without loss of generality we can and will assume that
526: $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$
527: are dense in their compactifications,
528: and then the compactifications will be the closures of the original spaces,
529: which explains our notation).
530: \ifFULL\bluebegin
531:   Problem in the case of Theorem \ref{thm:deterministic-compact}:
532:   $\overline{\Gamma}$ may cease to be a compact convex subset of a Banach space.
533: \blueend\fi
534: Let us suppose that $\lambda$ is bounded and continuous,
535: and, moreover, can be continuously extended to the product
536: $\overline{\Gamma}\times\overline{\mathbf{Y}}$
537: of the compactifications;
538: such an extension is then unique and will also be denoted $\lambda$.
539: 
540: If $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$
541: are Euclidean spaces their natural compactifications
542: might be chosen as Aleksandrov's one-point compactification
543: (\cite{engelking:1989}, Theorem 3.5.11),
544: the corresponding projective space
545: (with $\bbbr\mathrm{P}^L$ being the compactification of $\bbbr^L$),
546: or the corresponding closed unit ball
547: (with the interior of the closed unit ball in $\bbbr^L$
548: identified with $\bbbr^L$
549: by mapping a vector $v$ of length $l\in[0,1)$ in the former set
550: to the vector $(\tan(\pi l/2))v$).
551: The Stone--\v{C}ech compactification
552: (\cite{engelking:1989}, Section 3.6)
553: will usually be too large:
554: we will want our compactifications to be metrizable.
555: 
556: Theorem \ref{thm:randomized-compact} will remain true
557: if instead of assuming $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$ to be metric compacts
558: we assume that
559: $\overline{\mathbf{X}}$, $\overline{\Gamma}$, and $\overline{\mathbf{Y}}$
560: are metric compacts
561: and if in the definition of CS universality (\ref{eq:dominates-randomized-compact})
562: we only consider continuous stationary prediction strategies
563: that have a continuous extension to
564: $(\overline{\mathbf{X}}\times\overline{\mathbf{Y}})^{\infty}\times\overline{\mathbf{X}}$.
565: 
566: \ifFULL\bluebegin
567:   As an example,
568:   suppose $\mathbf{X}$ is a Euclidean space
569:   and consider a prediction strategy
570:   $D(\ldots,x_{n-1},y_{n-1},x_{n})$ that only depends on $x_n$.
571:   Then $D$ can be extended to the compactification of $\mathbf{X}$ if it:
572:   tends to a limit as $\left\|x\right\|\to\infty$
573:   (in the case of Aleksandrov's compactification);
574:   tends to a limit in every direction
575:   (in the case of the closed unit ball);
576:   tends to a limit in every direction
577:   with the limits in opposite directions coinciding
578:   (in the case of the projective space).
579: \blueend\fi
580: 
581: \begin{remark*}
582:   An elegant way to avoid considering compactifications
583:   would be to assume that $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$
584:   are metrizable proximity spaces
585:   (see \cite{engelking:1989}, Section 8.4, or \cite{naimpally/warrack:1970},
586:   where \cite{engelking:1989}'s ``proximity spaces'' are called ``separated proximity spaces'')
587:   and to consider only proximity prediction strategies.
588:   By Smirnov's theorem (\cite{engelking:1989}, Theorem 8.4.13 and also Theorem 8.4.9;
589:   \cite{naimpally/warrack:1970}, Theorem 7.7)
590:   a proximity space can be identified with the corresponding topological space
591:   equipped with a compactification.
592:   Assuming that the loss function $\lambda$ is a bounded proximity function,
593:   it can be uniquely continuously extended to the compactification
594:   $\overline{\Gamma}\times\overline{\mathbf{Y}}$
595:   (\cite{naimpally/warrack:1970}, Theorem 7.10),
596:   and every proximity stationary prediction strategy can be identified
597:   with a continuous function on the compactification
598:   $(\overline{\mathbf{X}}\times\overline{\mathbf{Y}})^{\infty}\times\overline{\mathbf{X}}$
599:   (by the same theorem).
600:   To ensure that the compactifications are metrizable,
601:   it is sufficient to assume that the proximity spaces are second-countable
602:   (i.e., have countable proximity weights;
603:   see \cite{naimpally/warrack:1970}, Theorem 8.14,
604:   and \cite{engelking:1989}, Theorem 4.2.8).
605:   We chose the slightly clumsier language of compactifications
606:   because the notion of a topological space is much more familiar
607:   than that of a proximity space.
608: \end{remark*}
609: 
610: \subsection*{Universal prediction strategies: deterministic case}
611: 
612: Let us say that a set in a topological space is \emph{precompact}
613: if its closure is compact.
614: In Euclidean spaces,
615: precompactness means boundedness.
616: In this and the next subsection we drop the assumption of compactness
617: of $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$,
618: and so we have to redefine the notion of CS universality.
619: 
620: A prediction strategy outputting $\gamma_n\in\Gamma$
621: is \emph{CS universal}
622: for a loss function $\lambda$ if,
623: for any continuous stationary prediction strategy $D$
624: and for any biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$,
625: \begin{multline}\label{eq:dominates-deterministic}
626:   \bigl(
627:     \{\ldots,x_{-1},x_0,x_1,\ldots\}
628:     \text{ and }
629:     \{\ldots,y_{-1},y_0,y_1,\ldots\}
630:     \text{ are precompact}
631:   \bigr)\\
632:   \Longrightarrow
633:   \limsup_{N\to\infty}
634:   \Biggl(
635:     \frac1N
636:     \sum_{n=1}^N
637:     \lambda(\gamma_n,y_n)
638:     -
639:     \frac1N
640:     \sum_{n=1}^N
641:     \lambda
642:     \bigl(
643:       D(\ldots,x_{n-1},y_{n-1},x_n),y_n
644:     \bigr)
645:   \Biggr)
646:   \le
647:   0.
648: \end{multline}
649: The intuition behind the antecedent of (\ref{eq:dominates-deterministic}),
650: in the Euclidean case,
651: is that the prediction algorithm
652: knows that $\left\|x_n\right\|$ and $\left\|y_n\right\|$ are bounded
653: but does not know an upper bound in advance.
654: 
655: Let us say that the loss function $\lambda$ is \emph{large at infinity}
656: if, for all $y^*\in\mathbf{Y}$,
657: \begin{equation*}
658:   \lim_{\substack{y\to y^*\\\gamma\to\infty}}
659:   \lambda(\gamma,y)
660:   =
661:   \infty
662: \end{equation*}
663: (in the sense that for each constant $M$
664: there exists a neighborhood $O_{y^*}\ni y^*$ and compact $C\subseteq\Gamma$ such that
665: $\lambda\left(\Gamma\setminus C,O_{y^*}\right)\subseteq(M,\infty)$).
666: Intuitively, we require that faraway $\gamma\in\Gamma$
667: should be poor predictions for nearby $y^*\in\mathbf{Y}$.
668: This assumption is satisfied for most of the usual loss functions
669: used in competitive on-line prediction.
670: \ifFULL\bluebegin
671:   (A notable exception is the \emph{log-loss game},
672:   where $\Gamma=(0,1)$, $\mathbf{Y}=\{0,1\}$,
673:   and $\lambda(\gamma,y)=-y\ln\gamma-(1-y)\ln(1-\gamma)$;
674:   for the log-loss game our construction still works
675:   if we replace the WAA of \cite{kalnishkan/vyugin:2005}
676:   by the AA of \cite{vovk:1990} in the proof.)
677: \blueend\fi
678: \begin{theorem}\label{thm:deterministic}
679:   Suppose $\mathbf{X}$ and $\mathbf{Y}$ are locally compact metric spaces,
680:   $\Gamma$ is a convex subset of a Banach space,
681:   and the loss function $\lambda(\gamma,y)$ is continuous,
682:   large at infinity, and convex in the variable $\gamma\in\Gamma$.
683:   There exists a CS universal prediction algorithm.
684: \end{theorem}
685: To have a specific example in mind,
686: the reader might check that $\mathbf{X}=\bbbr^{K}$, $\Gamma=\mathbf{Y}=\bbbr^{L}$,
687: and $\lambda(\gamma,y):=\left\|y-\gamma\right\|$
688: satisfy the conditions of the theorem.
689: 
690: \subsection*{Universal prediction strategies: randomized case}
691: 
692: We say that a randomized prediction strategy
693: outputting randomized predictions $\gamma_n$
694: is \emph{CS universal} if,
695: for any continuous stationary randomized prediction strategy $D$
696: and for any biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$,
697: \begin{multline}\label{eq:dominates-randomized}
698:   \bigl(
699:     \{\ldots,x_{-1},x_0,x_1,\ldots\}
700:     \text{ and }
701:     \{\ldots,y_{-1},y_0,y_1,\ldots\}
702:     \text{ are precompact}
703:   \bigr)\\
704:   \Longrightarrow
705:   \left(
706:     \limsup_{N\to\infty}
707:     \left(
708:       \frac1N
709:       \sum_{n=1}^N
710:       \lambda(g_{n},y_n)
711:       -
712:       \frac1N
713:       \sum_{n=1}^N
714:       \lambda(d_{n},y_n)
715:     \right)
716:     \le
717:     0
718:     \enspace
719:     \textrm{a.s.}
720:   \right),
721: \end{multline}
722: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent random variables
723: distributed according to (\ref{eq:distributed-1})--(\ref{eq:distributed-2}).
724: \begin{theorem}\label{thm:randomized}
725:   Let $\mathbf{X}$ and $\mathbf{Y}$ be locally compact metric spaces,
726:   $\Gamma$ be a metric space,
727:   and $\lambda$ be a continuous and large at infinity loss function.
728:   There exists a CS universal randomized prediction algorithm.
729: \end{theorem}
730: 
731: \section{Proof of Theorem \ref{thm:deterministic-compact}}
732: \label{sec:proof-deterministic-compact}
733: 
734: In the rest of the paper
735: we will be using the notation $\Sigma$ for $(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}$.
736: By Tikhonov's theorem (\cite{engelking:1989}, Theorem 3.2.4)
737: this is a compact space;
738: it is also metrizable
739: (\cite{engelking:1989}, Theorem 4.2.2).
740: Another standard piece of notation throughout the rest of the paper
741: will be $\sigma_n:=(\ldots,x_{n-1},y_{n-1},x_n)\in\Sigma$.
742: Remember that $\lambda$, as a continuous function on a compact set,
743: is bounded below and above (\cite{engelking:1989}, Theorem 3.10.6).
744: 
745: Let $\Gamma^{\Sigma}$ be the set of all continuous functions
746: from $\Sigma$ to $\Gamma$
747: with the \emph{topology of uniform convergence},
748: generated by the metric
749: \begin{equation*}
750:   \hat\rho(D_1,D_2)
751:   :=
752:   \sup_{\sigma\in\Sigma}
753:   \rho
754:   \bigl(
755:     D_1(\sigma),D_2(\sigma)
756:   \bigr),
757: \end{equation*}
758: $\rho$ being the metric in $\Gamma$
759: (induced by the norm in the containing Banach space).
760: Since the topological space $\Gamma^{\Sigma}$ is separable
761: (\cite{engelking:1989}, Corollary 4.2.18
762: in combination with Theorem 4.2.8),
763: we can choose a dense sequence $D_1,D_2,\ldots$ in $\Gamma^{\Sigma}$.
764: 
765: \begin{remark*}
766:   The topology in $\Gamma^{\Sigma}$ is defined via a metric,
767:   and this is one of the very few places in this paper where we need a specific metric
768:   (for brevity we often talk about ``metric spaces'',
769:   but this can always be replaced by ``metrizable topological spaces'').
770:   Without using the metric,
771:   we could say that the topology in $\Gamma^{\Sigma}$ is the compact-open topology
772:   (\cite{engelking:1989}, Section 3.4).
773:   Since $\Sigma$ is compact,
774:   the compact-open topology on $\Gamma^{\Sigma}$
775:   coincides with the topology of uniform convergence
776:   (\cite{engelking:1989}, Theorem 4.2.17).
777:   The separability of $\Gamma^{\Sigma}$ now follows
778:   from \cite{engelking:1989}, Theorem 3.4.16 in combination with Theorem 4.2.8.
779: \end{remark*}
780: 
781: The next step is to apply Kalnishkan and Vyugin's
782: \cite{kalnishkan/vyugin:2005}
783: Weak Aggregating Algorithm (WAA) to this sequence.
784: We cannot just refer to \cite{kalnishkan/vyugin:2005}
785: and will have to redo their derivation of the WAA's main property
786: since Kalnishkan and Vyugin only consider the case
787: of finitely many ``experts'' $D_k$
788: and finite $\mathbf{Y}$.
789: (Although in other respects
790: we will not need their algorithm in full generality
791: and so slightly simplify it.)
792: 
793: Let $q_1,q_2,\ldots$ be a sequence of positive numbers summing to 1,
794: $\sum_{k=1}^{\infty}q_k=1$.
795: Define
796: \begin{equation*}
797:   l_n^{(k)}
798:   :=
799:   \lambda
800:   \left(
801:     D_k(\sigma_n),y_n
802:   \right),
803:   \quad
804:   L_N^{(k)}
805:   :=
806:   \sum_{n=1}^N
807:   l_n^{(k)}
808: \end{equation*}
809: to be the instantaneous loss of the $k$th expert $D_k$ on the $n$th round
810: and his cumulative loss over the first $N$ rounds.
811: For all $n,k=1,2,\ldots$ define
812: \begin{equation*}
813:   w_n^{(k)}
814:   :=
815:   q_k
816:   \beta_n^{L_{n-1}^{(k)}},
817:   \quad
818:   \beta_n
819:   :=
820:   \exp
821:   \left(
822:     -\frac{1}{\sqrt{n}}
823:   \right)
824: \end{equation*}
825: ($w_n^{(k)}$ are the weights of the experts to use on round $n$)
826: and
827: \begin{equation*}
828:   p_n^{(k)}
829:   :=
830:   \frac
831:   {w_n^{(k)}}
832:   {\sum_{k=1}^{\infty}w_n^{(k)}}
833: \end{equation*}
834: (the normalized weights;
835: it is obvious that the denominator is positive and finite).
836: The WAA's prediction on round $n$ is
837: \begin{equation}\label{eq:WAA}
838:   \gamma_n
839:   :=
840:   \sum_{k=1}^{\infty}
841:   p_n^{(k)}
842:   D_k(\sigma_n)
843: \end{equation}
844: (the series is convergent in the Banach space
845: since the compactness of $\Gamma$ implies
846: $\sup_{\gamma\in\Gamma}\left\|\gamma\right\|<\infty$,
847: and $\gamma_n\in\Gamma$ since
848: \begin{multline}\label{eq:convergence-to-0}
849:   \gamma_n
850:   -
851:   \sum_{k=1}^K
852:   \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}
853:   D_k(\sigma_n)\\
854:   =
855:   \sum_{k=1}^K
856:   \left(
857:     1
858:     -
859:     \frac{1}{\sum_{k=1}^K p_n^{(k)}}
860:   \right)
861:   p_n^{(k)}
862:   D_k(\sigma_n)
863:   +
864:   \sum_{k=K+1}^{\infty}
865:   p_n^{(k)}
866:   D_k(\sigma_n)
867:   \to
868:   0
869: \end{multline}
870: as $K\to\infty$).
871: 
872: Let $l_n:=\lambda(\gamma_n,y_n)$ be the WAA's loss on round $n$
873: and
874: $
875:   L_N
876:   :=
877:   \sum_{n=1}^N
878:   l_n
879: $
880: be its cumulative loss over the first $N$ rounds.
881: \begin{lemma}[\cite{kalnishkan/vyugin:2005}, Lemma 9]\label{lem:9}
882:   The WAA guarantees that, for all $N$,
883:   \begin{equation}\label{eq:lemma9}
884:     L_N
885:     \le
886:     \sum_{n=1}^N
887:     \sum_{k=1}^{\infty}
888:     p_n^{(k)}
889:     l_n^{(k)}
890:     -
891:     \sum_{n=1}^N
892:     \log_{\beta_n}
893:     \sum_{k=1}^{\infty}
894:     p_n^{(k)}
895:     \beta_n^{l_n^{(k)}}
896:     +
897:     \log_{\beta_N}
898:     \sum_{k=1}^{\infty}
899:     q_k
900:     \beta_N^{L_N^{(k)}}.
901:   \end{equation}
902: \end{lemma}
903: The first two terms on the right-hand side of (\ref{eq:lemma9})
904: are sums over the first $N$ rounds of different kinds of mean of the experts' losses
905: (see, e.g., \cite{hardy/etal:1952}, Chapter III,
906: for a general definition of the mean);
907: we will see later that they nearly cancel each other out.
908: If those two terms are ignored,
909: the remaining part of (\ref{eq:lemma9}) is identical
910: (except that $\beta$ now depends on $n$)
911: to the main property of the ``Aggregating Algorithm''
912: (see, e.g., \cite{vovk:2001competitive}, Lemma 1).
913: All infinite series in (\ref{eq:lemma9}) are trivially convergent.
914: \begin{Proof}{of Lemma \ref{lem:9}}
915:   The proof is by induction on $N$.
916:   Assuming (\ref{eq:lemma9}),
917:   we obtain
918:   \begin{multline*}
919:     L_{N+1}
920:     =
921:     L_N + l_{N+1}
922:     \le
923:     L_N
924:     +
925:     \sum_{k=1}^{\infty}
926:     p_{N+1}^{(k)}
927:     l_{N+1}^{(k)}\\
928:     \le
929:     \sum_{n=1}^{N+1}
930:     \sum_{k=1}^{\infty}
931:     p_n^{(k)}
932:     l_n^{(k)}
933:     -
934:     \sum_{n=1}^N
935:     \log_{\beta_n}
936:     \sum_{k=1}^{\infty}
937:     p_n^{(k)}
938:     \beta_n^{l_n^{(k)}}
939:     +
940:     \log_{\beta_N}
941:     \sum_{k=1}^{\infty}
942:     q_k
943:     \beta_N^{L_N^{(k)}}
944:   \end{multline*}
945:   (the first ``$\le$'' used the ``countable convexity''
946:   $l_n\le\sum_{k=1}^{\infty}p_n^{(k)}l_n^{(k)}$,
947:   which follows from (\ref{eq:convergence-to-0}) and
948:   \begin{equation*}
949:     \lambda
950:     \left(
951:       \sum_{k=1}^K
952:       \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}
953:       D_k(\sigma_n),
954:       y_n
955:     \right)
956:     \le
957:     \sum_{k=1}^K
958:     \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}
959:     \lambda
960:     \left(
961:       D_k(\sigma_n),
962:       y_n
963:     \right)
964:   \end{equation*}
965:   if we let $K\to\infty$ and use the continuity of $\lambda$).
966:   Therefore,
967:   it remains to prove
968:   \begin{equation*}
969:     \log_{\beta_N}
970:     \sum_{k=1}^{\infty}
971:     q_k
972:     \beta_N^{L_N^{(k)}}
973:     \le
974:     -\log_{\beta_{N+1}}
975:     \sum_{k=1}^{\infty}
976:     p_{N+1}^{(k)}
977:     \beta_{N+1}^{l_{N+1}^{(k)}}
978:     +
979:     \log_{\beta_{N+1}}
980:     \sum_{k=1}^{\infty}
981:     q_k
982:     \beta_{N+1}^{L_{N+1}^{(k)}}.
983:   \end{equation*}
984:   By the definition of $p_n^{(k)}$
985:   this can be rewritten as
986:   \begin{equation*}
987:     \log_{\beta_N}
988:     \sum_{k=1}^{\infty}
989:     q_k
990:     \beta_N^{L_N^{(k)}}
991:     \le
992:     -\log_{\beta_{N+1}}
993:     \frac
994:     {
995:       \sum_{k=1}^{\infty}
996:       q_k
997:       \beta_{N+1}^{L_{N}^{(k)}}
998:       \beta_{N+1}^{l_{N+1}^{(k)}}
999:     }
1000:     {
1001:       \sum_{k=1}^{\infty}
1002:       q_k
1003:       \beta_{N+1}^{L_{N}^{(k)}}
1004:     }
1005:     +
1006:     \log_{\beta_{N+1}}
1007:     \sum_{k=1}^{\infty}
1008:     q_k
1009:     \beta_{N+1}^{L_{N+1}^{(k)}},
1010:   \end{equation*}
1011:   which after cancellation becomes
1012:   \begin{equation}\label{eq:to-check}
1013:     \log_{\beta_N}
1014:     \sum_{k=1}^{\infty}
1015:     q_k
1016:     \beta_N^{L_N^{(k)}}
1017:     \le
1018:     \log_{\beta_{N+1}}
1019:     \sum_{k=1}^{\infty}
1020:     q_k
1021:     \beta_{N+1}^{L_{N}^{(k)}}.
1022:   \end{equation}
1023:   The last inequality follows from the general result
1024:   about comparison of different means
1025:   (\cite{hardy/etal:1952}, Theorem 85),
1026:   but we can also check it directly
1027:   (following \cite{kalnishkan/vyugin:2005}).
1028:   Let $\beta_{N+1}=\beta_N^a$,
1029:   where $0<a<1$.
1030:   Then (\ref{eq:to-check}) can be rewritten as
1031:   \begin{equation*}
1032:     \left(
1033:       \sum_{k=1}^{\infty}
1034:       q_k
1035:       \beta_N^{L_N^{(k)}}
1036:     \right)^a
1037:     \ge
1038:     \sum_{k=1}^{\infty}
1039:     q_k
1040:     \beta_{N}^{aL_{N}^{(k)}},
1041:   \end{equation*}
1042:   and the last inequality follows from the concavity of the function $t\mapsto t^a$.
1043:   \qedtext
1044: \end{Proof}
1045: 
1046: \begin{lemma}[\cite{kalnishkan/vyugin:2005}, Lemma 5]
1047:   Let $L$ be an upper bound on $\left|\lambda\right|$.
1048:   The WAA guarantees that, for all $N$ and $K$,
1049:   \begin{equation}\label{eq:lemma5}
1050:     L_N
1051:     \le
1052:     L_N^{(K)}
1053:     +
1054:     \left(
1055:       L^2 e^L + \ln\frac{1}{q_K}
1056:     \right)
1057:     \sqrt{N}.
1058:   \end{equation}
1059: \end{lemma}
1060: (There is no term $e^L$ in \cite{kalnishkan/vyugin:2005}
1061: since it only considers non-negative loss functions.)
1062: \begin{proof}
1063:   From (\ref{eq:lemma9}),
1064:   we obtain:
1065:   \begin{align*}
1066:     L_N
1067:     &\le
1068:     \sum_{n=1}^N
1069:     \sum_{k=1}^{\infty}
1070:     p_n^{(k)}
1071:     l_n^{(k)}
1072:     +
1073:     \sum_{n=1}^N
1074:     \sqrt{n}
1075:     \ln
1076:     \sum_{k=1}^{\infty}
1077:     p_n^{(k)}
1078:     \exp
1079:     \left(
1080:       -\frac{l_n^{(k)}}{\sqrt{n}}
1081:     \right)
1082:     +
1083:     \log_{\beta_N}
1084:     q_K
1085:     +
1086:     L_N^{(K)}\\
1087:     &\le
1088:     \sum_{n=1}^N
1089:     \sum_{k=1}^{\infty}
1090:     p_n^{(k)}
1091:     l_n^{(k)}
1092:     +
1093:     \sum_{n=1}^N
1094:     \sqrt{n}
1095:     \left(
1096:       \sum_{k=1}^{\infty}
1097:       p_n^{(k)}
1098:       \left(
1099:         1
1100:         -
1101:         \frac{l_n^{(k)}}{\sqrt{n}}
1102:         +
1103:         \frac{\left(l_n^{(k)}\right)^2}{2n}
1104:         e^L
1105:       \right)
1106:       -
1107:       1
1108:     \right)\\
1109:     &\quad{}+
1110:     \log_{\beta_N}
1111:     q_K
1112:     +
1113:     L_N^{(K)}\\
1114:     &=
1115:     L_N^{(K)}
1116:     +
1117:     \frac12
1118:     \sum_{n=1}^N
1119:     \frac{1}{\sqrt{n}}
1120:     \sum_{k=1}^{\infty}
1121:     p_n^{(k)}
1122:     \left(l_n^{(k)}\right)^2
1123:     e^L
1124:     +
1125:     \sqrt{N}\ln\frac{1}{q_K}\\
1126:     &\le
1127:     L_N^{(K)}
1128:     +
1129:     \frac{L^2e^L}{2}
1130:     \sum_{n=1}^N
1131:     \frac{1}{\sqrt{n}}
1132:     +
1133:     \sqrt{N}\ln\frac{1}{q_K}
1134:     \le
1135:     L_N^{(K)}
1136:     +
1137:     \frac{L^2e^L}{2}
1138:     \int_0^N
1139:     \frac{\D t}{\sqrt{t}}
1140:     +
1141:     \sqrt{N}\ln\frac{1}{q_K}\\
1142:     &\le
1143:     L_N^{(K)}
1144:     +
1145:     L^2e^L\sqrt{N}
1146:     +
1147:     \sqrt{N}\ln\frac{1}{q_K}
1148:   \end{align*}
1149:   (in the second ``$\le$'' we used the inequalities $e^t\le1+t+\frac{t^2}{2}e^{\left|t\right|}$
1150:   and $\ln t\le t-1$).
1151:   \qedtext
1152: \end{proof}
1153: 
1154: Now it is easy to prove Theorem \ref{thm:deterministic-compact}.
1155: Let $\gamma_n$ be the predictions output by the WAA.
1156: Consider any continuous stationary prediction strategy $D$.
1157: Since every continuous function on a metric compact is uniformly continuous
1158: (\cite{engelking:1989}, Theorem 4.3.32),
1159: for any $\epsilon>0$ we can find $\delta>0$ such that
1160: $\left|\lambda(\gamma_1,y)-\lambda(\gamma_2,y)\right|<\epsilon$
1161: whenever $\rho(\gamma_1,\gamma_2)<\delta$.
1162: We can further find $K$ such that $\hat\rho(D_K,D)<\delta$,
1163: and (\ref{eq:lemma5}) then gives,
1164: for all biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$,
1165: \begin{multline*}
1166:   \limsup_{N\to\infty}
1167:   \Biggl(
1168:     \frac1N
1169:     \sum_{n=1}^N
1170:     \lambda(\gamma_n,y_n)
1171:     -
1172:     \frac1N
1173:     \sum_{n=1}^N
1174:     \lambda(D(\sigma_n),y_n)
1175:   \Biggr)\\
1176:   \le
1177:   \limsup_{N\to\infty}
1178:   \Biggl(
1179:     \frac1N
1180:     \sum_{n=1}^N
1181:     \lambda(\gamma_n,y_n)
1182:     -
1183:     \frac1N
1184:     \sum_{n=1}^N
1185:     \lambda(D_K(\sigma_n),y_n)
1186:   \Biggr)
1187:   +
1188:   \epsilon\\
1189:   \le
1190:   \limsup_{N\to\infty}
1191:   \left(
1192:     L^2e^L + \ln\frac{1}{q_K}
1193:   \right)
1194:   \frac{1}{\sqrt{N}}
1195:   +
1196:   \epsilon
1197:   =
1198:   \epsilon;
1199: \end{multline*}
1200: since $\epsilon$ can be arbitrarily small
1201: the WAA is CS universal.
1202: 
1203: \section{Proof of Theorem \ref{thm:randomized-compact}}
1204: \label{sec:proof-randomized-compact}
1205: 
1206: Let us first recall some useful facts about the probability measures
1207: on a metric compact $\Omega$
1208: (we will be following \cite{\GTPXVII}).
1209: The Banach space of all continuous real-valued functions on $\Omega$
1210: with the usual pointwise addition and scalar action
1211: and the sup norm will be denoted $C(\Omega)$.
1212: By one of the Riesz representation theorems
1213: (\cite{dudley:2002}, 7.4.1; see also 7.1.1),
1214: the mapping $\mu\mapsto I_{\mu}$,
1215: where
1216: $
1217:   I_{\mu}(f):=\int_{\Omega}f\D\mu
1218: $,
1219: is a linear isometry
1220: between the set of all finite Borel signed measures $\mu$ on $\Omega$
1221: with the total variation norm
1222: and the dual space $C'(\Omega)$ to $C(\Omega)$
1223: with the standard dual norm
1224: (\cite{rudin:1991}, Chapter 4).
1225: We will identify the finite Borel signed measures $\mu$ on $\Omega$
1226: with the corresponding $I_{\mu}\in C'(\Omega)$.
1227: This makes the set $\PPP(\Omega)$ of probability measures on $\Omega$
1228: a convex closed subset of $C'(\Omega)$.
1229: 
1230: We will be interested, however,
1231: in a different topology on $C'(\Omega)$,
1232: the weakest topology for which all evaluation functionals
1233: $\mu\in C'(\Omega)\mapsto\mu(f)$, $f\in C(\Omega)$,
1234: are continuous.
1235: This topology is known as the \emph{weak${}^*$ topology}
1236: (\cite{rudin:1991}, 3.14),
1237: and the topology inherited by $\PPP(\Omega)$
1238: is known as the \emph{topology of weak convergence}
1239: (\cite{billingsley:1968}, Appendix III).
1240: The point mass $\delta_{\omega}$, $\omega\in\Omega$,
1241: is defined to be the probability measure concentrated at $\omega$,
1242: $\delta_{\omega}(\{\omega\})=1$.
1243: The simple example of a sequence of point masses $\delta_{\omega_n}$
1244: such that $\omega_n\to\omega$ as $n\to\infty$ and $\omega_n\ne\omega$ for all $n$
1245: shows that the topology of weak convergence is different from the dual norm topology:
1246: $\delta_{\omega_n}\to\delta_{\omega}$ holds in one but does not hold in the other.
1247: 
1248: It is not difficult to check that $\PPP(\Omega)$ remains a closed subset of $C'(\Omega)$
1249: in the weak${}^*$ topology
1250: (\cite{bourbaki:integration}, III.2.7, Proposition 7).
1251: By the Banach--Alaoglu theorem
1252: (\cite{rudin:1991}, 3.15)
1253: $\PPP(\Omega)$ is compact in the topology of weak convergence
1254: (this is a special case of Prokhorov's theorem,
1255: \cite{billingsley:1968}, Appendix III, Theorem 6).
1256: In the rest of this paper,
1257: $\PPP(\Omega)$
1258: (and all other spaces of probability measures)
1259: are always equipped with the topology of weak convergence.
1260: 
1261: Since $\Omega$ is a metric compact,
1262: $\PPP(\Omega)$ is also metrizable
1263: (by the well-known Prokhorov metric:
1264: \cite{billingsley:1968}, Appendix III, Theorem 6).
1265: 
1266: Define
1267: \begin{equation}\label{eq:expected-loss}
1268:   \lambda(\gamma,y)
1269:   :=
1270:   \int_{\Gamma}
1271:   \lambda(g,y)
1272:   \gamma(\dd g),
1273: \end{equation}
1274: where $\gamma$ is a probability measure on $\Gamma$.
1275: This is the loss function in a new game of prediction
1276: with the prediction space $\PPP(\Gamma)$;
1277: it is convex in $\gamma$.
1278: 
1279: Let us check that the loss function (\ref{eq:expected-loss}) is continuous.
1280: If $\gamma_n\to\gamma$ and $y_n\to y$
1281: for some $(\gamma,y)\in\PPP(\Gamma)\times\mathbf{Y}$,
1282: \begin{equation*}
1283:   \left|
1284:     \lambda(\gamma_n,y_n)
1285:     -
1286:     \lambda(\gamma,y)
1287:   \right|
1288:   \le
1289:   \left|
1290:     \lambda(\gamma_n,y_n)
1291:     -
1292:     \lambda(\gamma_n,y)
1293:   \right|
1294:   +
1295:   \left|
1296:     \lambda(\gamma_n,y)
1297:     -
1298:     \lambda(\gamma,y)
1299:   \right|
1300:   \to
1301:   0
1302: \end{equation*}
1303: (the first addend tends to zero because of the uniform continuity
1304: of $\lambda:\Gamma\times\mathbf{Y}\to\bbbr$
1305: and the second addend by the definition of the topology of weak convergence).
1306: 
1307: Unfortunately,
1308: Theorem \ref{thm:deterministic-compact} cannot be applied
1309: to the new game of prediction directly:
1310: the theorem assumes that $\Gamma$ is a subset of a Banach space,
1311: whereas the dual to an infinite-dimensional Banach space is never even metrizable
1312: in the weak$^*$ topology
1313: (\cite{rudin:1991}, 3.16).
1314: The proof of Theorem \ref{thm:deterministic-compact}, however,
1315: still works for the new game.
1316: 
1317: It is clear that the mixture (\ref{eq:WAA}) is a probability measure.
1318: The result of the previous section is still true,
1319: and the randomized prediction strategy (\ref{eq:WAA})
1320: produces $\gamma_n\in\PPP(\Gamma)$ that are guaranteed to satisfy
1321: \begin{equation}\label{eq:mean}
1322:   \limsup_{N\to\infty}
1323:   \left(
1324:     \frac1N
1325:     \sum_{n=1}^N
1326:     \lambda(\gamma_n,y_n)
1327:     -
1328:     \frac1N
1329:     \sum_{n=1}^N
1330:     \lambda(D(\sigma_n),y_n)
1331:   \right)
1332:   \le
1333:   0,
1334: \end{equation}
1335: for any continuous stationary randomized prediction strategy $D$.
1336: The loss function is bounded in absolute value
1337: by a constant $L$,
1338: and so the law of the iterated logarithm
1339: (see, e.g., \cite{shafer/vovk:2001}, (5.8))
1340: implies that
1341: \begin{align}
1342:   \limsup_{N\to\infty}
1343:   \frac
1344:   {
1345:     \left|
1346:       \sum_{n=1}^N
1347:       \bigl(
1348:         \lambda(g_n,y_n)
1349:         -
1350:         \lambda(\gamma_n,y_n)
1351:       \bigr)
1352:     \right|
1353:   }
1354:   {
1355:     \sqrt{2L^2N\ln\ln N}
1356:   }
1357:   &\le
1358:   1,\label{eq:LIL-1}\\
1359:   \limsup_{N\to\infty}
1360:   \frac
1361:   {
1362:     \left|
1363:       \sum_{n=1}^N
1364:       \bigl(
1365:         \lambda(d_n,y_n)
1366:         -
1367:         \lambda(D(\sigma_n),y_n)
1368:       \bigr)
1369:     \right|
1370:   }
1371:   {
1372:     \sqrt{2L^2N\ln\ln N}
1373:   }
1374:   &\le
1375:   1\label{eq:LIL-2}
1376: \end{align}
1377: with probability one.
1378: Combining the last two inequalities with (\ref{eq:mean}) gives
1379: \begin{equation*}
1380:   \limsup_{N\to\infty}
1381:   \left(
1382:     \frac1N
1383:     \sum_{n=1}^N
1384:     \lambda(g_n,y_n)
1385:     -
1386:     \frac1N
1387:     \sum_{n=1}^N
1388:     \lambda(d_n,y_n)
1389:   \right)
1390:   \le
1391:   0
1392:   \enspace
1393:   \textrm{a.s.}
1394: \end{equation*}
1395: Therefore, the WAA (applied to $D_1,D_2,\ldots$)
1396: is a universal continuous randomized prediction strategy.
1397: 
1398: \section{Proof of Theorem \ref{thm:deterministic}}
1399: \label{sec:proof-deterministic}
1400: 
1401: In view of Theorem \ref{thm:deterministic-compact},
1402: we only need to get rid of the assumption of compactness
1403: of $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$.
1404: 
1405: \subsection*{Game of removal}
1406: 
1407: The proofs of Theorems \ref{thm:deterministic} and \ref{thm:randomized}
1408: will be based on the following game
1409: (an abstract version of the ``doubling trick'',
1410: \cite{cesabianchi/lugosi:2006})
1411: played in a topological space $X$:
1412: 
1413: \bigskip
1414: 
1415: \noindent
1416: \textsc{Game of removal $G(X)$}\nopagebreak
1417: \begin{tabbing}
1418:   \qquad\=\qquad\=\qquad\kill
1419:   FOR $n=1,2,\dots$:\\
1420:   \> Remover announces compact $K_n\subseteq X$.\\
1421:   \> Evader announces $p_n\notin K_n$.\\
1422:   END FOR.
1423: \end{tabbing}
1424: \textbf{Winner:}
1425: Evader if the set $\left\{p_1,p_2,\ldots\right\}$ is precompact;
1426: Remover otherwise.
1427: 
1428: \bigskip
1429: 
1430: \noindent
1431: Intuitively,
1432: the goal of Evader is to avoid being removed to infinity.
1433: Without loss of generality
1434: we will assume that Remover always announces a non-decreasing sequence of compact sets:
1435: $K_1\subseteq K_2\subseteq\cdots$.
1436: \begin{lemma}[Gruenhage]\label{lem:Gruenhage}
1437:   Remover has a winning strategy in $G(X)$
1438:   if $X$ is a locally compact and paracompact space.
1439: \end{lemma}
1440: \begin{proof}
1441:   We will follow the proof of Theorem 4.1 in \cite{gruenhage:2006}
1442:   (the easy direction).
1443:   If $X$ is locally compact and $\sigma$-compact,
1444:   there exists a non-decreasing sequence $K_1\subseteq K_2\subseteq\cdots$
1445:   of compact sets covering $X$,
1446:   and each $K_n$ can be extended to compact $K^*_n$
1447:   so that $\Int K^*_n\supseteq K_n$
1448:   (\cite{engelking:1989}, Theorem 3.3.2).
1449:   Remover will obviously win $G(X)$ choosing $K^*_1,K^*_2,\ldots$ as his moves.
1450: 
1451:   If $X$ is the sum of locally compact $\sigma$-compact spaces $X_s$, $s\in S$,
1452:   Remover plays, for each $s\in S$, the strategy described in the previous paragraph
1453:   on the subsequence of Evader's moves belonging to $X_s$.
1454:   If Evader chooses $p_n\in X_s$ for infinitely many $X_s$,
1455:   those $X_s$ will form an open cover of the closure of $\{p_1,p_2,\ldots\}$
1456:   without a finite subcover.
1457:   If $p_n$ are chosen from only finitely many $X_s$,
1458:   there will be infinitely many $p_n$ chosen from some $X_s$,
1459:   and the result of the previous paragraph can be applied.
1460:   It remains to remember that each locally compact paracompact
1461:   can be represented as the sum of locally compact $\sigma$-compact subsets
1462:   (\cite{engelking:1989}, Theorem 5.1.27).
1463:   \qedtext
1464: \end{proof}
1465: 
1466: \subsection*{Large at infinity loss functions}
1467: 
1468: We will need the following useful property of large at infinity loss functions.
1469: \begin{lemma}\label{lem:loss}
1470:   Let $\lambda$ be a loss function that is large at infinity.
1471:   For each compact set $B\subseteq\mathbf{Y}$ and each constant $M$
1472:   there exists a compact set $C\subseteq\Gamma$ such that
1473:   \begin{equation}\label{eq:loss}
1474:     \forall\gamma\notin C,y\in B:
1475:     \quad
1476:     \lambda(\gamma,y)
1477:     >
1478:     M.
1479:   \end{equation}
1480: \end{lemma}
1481: \begin{proof}
1482:   For each point $y^*\in B$
1483:   fix a neighborhood $O_{y^*}\ni y^*$
1484:   and a compact set $C(y^*)\subseteq\Gamma$ such that
1485:   $\lambda\left(\Gamma\setminus C(y^*),O_{y^*}\right)\subseteq(M,\infty)$.
1486:   Since the sets $O_{y^*}$ form an open cover of $B$,
1487:   we can find this cover's finite subcover
1488:   $\{O_{y^*_1},\ldots,O_{y^*_n}\}$.
1489:   It is clear that
1490:   \begin{equation*}
1491:     C
1492:     :=
1493:     \bigcup_{j=1,\ldots,n}
1494:     C
1495:     \left(
1496:       y^*_j
1497:     \right)
1498:   \end{equation*}
1499:   satisfies (\ref{eq:loss}).
1500:   \qedtext
1501: \end{proof}
1502: In fact,
1503: the only property of large at infinity loss functions that we will be using
1504: is that in the conclusion of Lemma \ref{lem:loss}.
1505: In particular, it implies the following lemma.
1506: \begin{lemma}\label{lem:C-det}
1507:   Under the conditions of Theorem \ref{thm:deterministic},
1508:   for each compact set $B\subseteq\mathbf{Y}$
1509:   there exists a compact convex set $C=C(B)\subseteq\Gamma$
1510:   such that for each continuous stationary prediction strategy
1511:   $D:\Sigma\to\Gamma$
1512:   there exists a continuous stationary prediction strategy
1513:   $D':\Sigma\to C$
1514:   that dominates $D$ in the sense
1515:   \begin{equation}\label{eq:prediction-type}
1516:     \forall\sigma\in\Sigma,y\in B:
1517:     \quad
1518:     \lambda(D'(\sigma),y)
1519:     \le
1520:     \lambda(D(\sigma),y).
1521:   \end{equation}
1522: \end{lemma}
1523: \ifFULL\bluebegin
1524:   In fact,
1525:   we only need Lemmas \ref{lem:C-det} and \ref{lem:C-rand}
1526:   for $D':A\to C$.
1527: \blueend\fi
1528: \begin{proof}
1529:   Without loss of generality $B$ is assumed non-empty.
1530:   Fix any $\gamma_0\in\Gamma$.
1531:   Let
1532:   \begin{equation*}
1533:     M_1
1534:     :=
1535:     \sup_{y\in B}
1536:     \lambda(\gamma_0,y),
1537:   \end{equation*}
1538:   let $C_1\subseteq\Gamma$ be a compact set such that  
1539:   \begin{equation*}
1540:     \forall \gamma\notin C_1,y\in B:
1541:     \quad
1542:     \lambda(\gamma,y)
1543:     >
1544:     M_1+1,
1545:   \end{equation*}
1546:   let
1547:   \begin{equation*}
1548:     M_2
1549:     :=
1550:     \sup_{(\gamma,y)\in C_1\times B}
1551:     \lambda(\gamma,y),
1552:   \end{equation*}
1553:   and let $C_2\subseteq\Gamma$ be a compact set such that  
1554:   \begin{equation*}
1555:     \forall\gamma\notin C_2,y\in B:
1556:     \quad
1557:     \lambda(\gamma,y)
1558:     >
1559:     M_2+1.
1560:   \end{equation*}
1561:   It is obvious that $M_1\le M_2$ and $\gamma_0\in C_1\subseteq C_2$.
1562:   We can and will assume $C_2$ convex
1563:   (see \cite{rudin:1991}, Theorem 3.20(c)).
1564: 
1565:   Let us now check that $C_1$ lies inside the interior of $C_2$.
1566:   Indeed, for any fixed $y\in B$ and $\gamma\in C_1$,
1567:   we have $\lambda(\gamma,y)\le M_2$;
1568:   since $\lambda(\gamma',y)>M_2+1$ for all $\gamma'\notin C_2$,
1569:   some neighborhood of $\gamma$ will lie completely in $C_2$.
1570: 
1571:   Let $D:\Sigma\to\Gamma$
1572:   be a continuous stationary prediction strategy.
1573:   We will show that (\ref{eq:prediction-type}) holds
1574:   for some continuous stationary prediction strategy $D'$
1575:   taking values in the compact convex set $C(B):=C_2$.
1576:   Namely,
1577:   we define
1578:   \begin{multline*}
1579:     D'(\sigma)
1580:     :=\\
1581:     \begin{cases}
1582:       D(\sigma) & \text{if $D(\sigma)\in C_1$}\\
1583:       \frac{\rho(D(\sigma),\Gamma\setminus C_2)}{\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)} D(\sigma)
1584:       +\frac{\rho(D(\sigma),C_1)}{\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)} \gamma_0
1585:       & \text{if $D(\sigma)\in C_2\setminus C_1$}\\
1586:       \gamma_0 & \text{if $D(\sigma)\in \Gamma\setminus C_2$}
1587:     \end{cases}
1588:   \end{multline*}
1589:   where $\rho$ is the metric on $\Gamma$;
1590:   the denominator $\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)$
1591:   is positive since already $\rho(D(\sigma),C_1)$ is positive.
1592:   Since $C_2$ is convex,
1593:   we can see that $D'$ indeed takes values in $C_2$.
1594:   The only points $\sigma$ at which the continuity of $D'$ is not obvious
1595:   are those for which $D(\sigma)$ lies on the boundary of $C_1$:
1596:   in this case
1597:   one has to use the fact that $C_1$ is covered by the interior of $C_2$.
1598: 
1599:   It remains to check (\ref{eq:prediction-type});
1600:   the only non-trivial case is $D(\sigma)\in C_2\setminus C_1$.
1601:   By the convexity of $\lambda(\gamma,y)$ in $\gamma$,
1602:   the inequality in (\ref{eq:prediction-type}) will follow from
1603:   \begin{multline*}
1604:     \frac{\rho(D(\sigma),\Gamma\setminus C_2)}{\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)}
1605:     \lambda(D(\sigma),y)\\
1606:     +\frac{\rho(D(\sigma),C_1)}{\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)}
1607:     \lambda(\gamma_0,y)
1608:     \le
1609:     \lambda(D(\sigma),y),
1610:   \end{multline*}
1611:   i.e.,
1612:   \begin{equation*}
1613:     \lambda(\gamma_0,y)
1614:     \le
1615:     \lambda(D(\sigma),y).
1616:   \end{equation*}
1617:   Since the left-hand side of the last inequality is at most $M_1$
1618:   and its right-hand side exceeds $M_1+1$,
1619:   it holds true.
1620:   \qedtext
1621: \end{proof}
1622: \begin{remark*}
1623:   If the loss function is allowed to depend on the infinite past,
1624:   the $\sigma$s in Lemma \ref{lem:C-det} will have to be restricted
1625:   to a compact set $A\subseteq\Sigma$
1626:   and the compact set $C$ will depend not only on $B$ but also on $A$
1627:   (see Lemma 18 of \cite{\GTPXVII}).
1628: \end{remark*}
1629: 
1630: \subsection*{The proof}
1631: 
1632: For each compact $B\subseteq\mathbf{Y}$
1633: fix a compact convex $C(B)\subseteq\Gamma$ as in Lemma \ref{lem:C-det}.
1634: Predictor's strategy ensuring (\ref{eq:dominates-deterministic})
1635: is constructed from Remover's winning strategy in $G(\mathbf{X}\times\mathbf{Y})$
1636: (see Lemma \ref{lem:Gruenhage};
1637: metric spaces are paracompact by the Stone theorem,
1638: \cite{engelking:1989}, Theorem 5.1.3)
1639: and from Predictor's strategies $\SSS(A,B)$ outputting predictions
1640: \begin{equation}\label{eq:gamma}
1641:   \gamma_n\in C(B)
1642: \end{equation}
1643: and ensuring the consequent of (\ref{eq:dominates-deterministic})
1644: for all continuous
1645: \begin{equation}\label{eq:DABC}
1646:   D:(A\times B)^{\infty}\times A\to C(B)
1647: \end{equation}
1648: under the assumption that $(x_n,y_n)\in A\times B$
1649: for given compact $A\subseteq\mathbf{X}$ and $B\subseteq\mathbf{Y}$
1650: (the existence of such $\SSS(A,B)$
1651: is asserted in Theorem \ref{thm:deterministic-compact}).
1652: Remover's moves are assumed to be of the form $A\times B$
1653: for compact $A\subseteq\mathbf{X}$ and $B\subseteq\mathbf{Y}$.
1654: Predictor is simultaneously playing the game of removal
1655: $G(\mathbf{X}\times\mathbf{Y})$ as Evader.
1656: 
1657: At the beginning of the game of prediction
1658: Predictor asks Remover to make his first move $A_1\times B_1$ in the game of removal;
1659: without loss of generality
1660: we assume that $A_1\times B_1$ contains all $(x_n,y_n)$, $n\le0$
1661: (there is nothing to prove if $\{(x_n,y_n)\st n\le0\}$ is not precompact).
1662: Predictor then plays the game of prediction using the strategy $\SSS(A_1,B_1)$
1663: until Reality chooses $(x_n,y_n)\notin A_1\times B_1$
1664: (forever if Reality never chooses such $(x_n,y_n)$).
1665: As soon as such $(x_n,y_n)$ is chosen,
1666: Predictor announces $(x_n,y_n)$ in the game of removal
1667: and notes Remover's response $A_2\times B_2$.
1668: He then continues playing the game of prediction using the strategy $\SSS(A_2,B_2)$
1669: until Reality chooses $(x_n,y_n)\notin A_2\times B_2$,
1670: etc.
1671: 
1672: Let us check that this strategy for Predictor
1673: will always ensure (\ref{eq:dominates-deterministic}).
1674: If Reality chooses $(x_n,y_n)$ outside Predictor's current $A_k\times B_k$
1675: finitely often,
1676: the consequent of (\ref{eq:dominates-deterministic}) will be satisfied
1677: for all continuous stationary $D:\Sigma\to C(B_K)$
1678: ($B_K$ being the second factor of Remover's last move $A_K\times B_K$)
1679: and so, by Lemma \ref{lem:C-det},
1680: for all continuous stationary $D:\Sigma\to\Gamma$.
1681: If Reality chooses $(x_n,y_n)$ outside Predictor's current $A_k\times B_k$
1682: infinitely often,
1683: the set of $(x_n,y_n)$, $n=1,2,\ldots$, will not be precompact,
1684: and so the antecedent of (\ref{eq:dominates-deterministic}) will be violated.
1685: 
1686: \section{Proof of Theorem \ref{thm:randomized}}
1687: \label{sec:proof-randomized}
1688: 
1689: When $\gamma$ ranges over $\PPP(C)$
1690: (identified with the subset of $\PPP(\Gamma)$
1691: consisting of the measures concentrated on $C$)
1692: for a compact $C\subseteq\Gamma$,
1693: the loss function (\ref{eq:expected-loss}),
1694: as we have seen, is continuous.
1695: The following analogue of Lemma \ref{lem:C-det} will be useful.
1696: \begin{lemma}\label{lem:C-rand}
1697:   Under the conditions of Theorem \ref{thm:randomized},
1698:   for each compact set $B\subseteq\mathbf{Y}$
1699:   there exists a compact convex set $C=C(B)\subseteq\Gamma$
1700:   such that for each continuous stationary randomized prediction strategy
1701:   $D:\Sigma\to\PPP(\Gamma)$
1702:   there exists a continuous stationary randomized prediction strategy
1703:   $D':\Sigma\to\PPP(C)$
1704:   such that (\ref{eq:prediction-type}) holds
1705:   ($D'$ dominates $D$ ``on average'').
1706: \end{lemma}
1707: (In fact, this lemma is not needed
1708: for the proof of Theorem \ref{thm:randomized} as we stated it,
1709: but it will imply that $\gamma_n$ dominate $D(\sigma_n)$ on average,
1710: for any continuous stationary randomized prediction strategy $D$:
1711: see (\ref{eq:stage-K}).)
1712: \begin{proof}
1713:   Define $\gamma_0$, $M_1$, $C_1$, $M_2$, and $C_2$
1714:   as in the proof of Lemma \ref{lem:C-det}.
1715:   Fix a continuous function $f_1:\Gamma\to[0,1]$ such that $f_1=1$ on $C_1$
1716:   and $f_1=0$ on $\Gamma\setminus C_2$
1717:   (such an $f_1$ exists by the Tietze--Uryson theorem,
1718:   \cite{engelking:1989}, Theorem 2.1.8).
1719:   Set $f_2:=1-f_1$.
1720:   Let $D:\Sigma\to\PPP(\Gamma)$ be a continuous stationary randomized prediction strategy.
1721:   For each $\sigma\in\Sigma$,
1722:   split $D(\sigma)$ into two measures on $\Gamma$
1723:   absolutely continuous with respect to $D(\sigma)$:
1724:   $D_1(\sigma)$ with Radon--Nikodym density $f_1$
1725:   and $D_2(\sigma)$ with Radon--Nikodym density $f_2$;
1726:   set
1727:   \begin{equation*}
1728:     D'(\sigma)
1729:     :=
1730:     D_1(\sigma)
1731:     +
1732:     \left|D_2(\sigma)\right|
1733:     \delta_{\gamma_0}
1734:   \end{equation*}
1735:   (letting $\left|P\right|:=P(\Gamma)$ for $P$ a measure on $\Gamma$).
1736:   It is clear that the stationary randomized prediction strategy $D'$ is continuous
1737:   (in the topology of weak convergence, as usual),
1738:   takes values in $\PPP(C_2)$,
1739:   and
1740:   \begin{multline*}
1741:     \lambda(D'(\sigma),y)
1742:     =
1743:     \int_{\Gamma}
1744:       \lambda(\gamma,y)
1745:       f_1(\gamma)
1746:     D(\sigma)(\dd\gamma)
1747:     +
1748:     \lambda(\gamma_0,y)
1749:     \int_{\Gamma}
1750:       f_2(\gamma)
1751:     D(\sigma)(\dd\gamma)\\
1752:     \le
1753:     \int_{\Gamma}
1754:       \lambda(\gamma,y)
1755:       f_1(\gamma)
1756:     D(\sigma)(\dd\gamma)
1757:     +
1758:     \int_{\Gamma}
1759:       M_1
1760:       f_2(\gamma)
1761:     D(\sigma)(\dd\gamma)\\
1762:     \le
1763:     \int_{\Gamma}
1764:       \lambda(\gamma,y)
1765:       f_1(\gamma)
1766:     D(\sigma)(\dd\gamma)
1767:     +
1768:     \int_{\Gamma}
1769:       \lambda(\gamma,y)
1770:       f_2(\gamma)
1771:     D(\sigma)(\dd\gamma)
1772:     =
1773:     \lambda(D(\sigma),y)
1774:   \end{multline*}
1775:   for all $(\sigma,y)\in\Sigma\times B$.
1776:   So we can take $C(B):=C_2$.
1777:   \qedtext
1778: \end{proof}
1779: Fix one of the mappings $B\mapsto C(B)$
1780: whose existence is asserted by the lemma.
1781: 
1782: We will prove that the prediction strategy of the previous section
1783: with (\ref{eq:gamma}) replaced by
1784: $
1785:   \gamma_n\in\PPP(C(B))
1786: $
1787: and (\ref{eq:DABC}) replaced by
1788: \begin{equation*}
1789:   D:(A\times B)^{\infty}\times A\to\PPP(C(B))
1790: \end{equation*}
1791: is CS universal.
1792: Let $D:\Sigma\to\PPP(\Gamma)$ be a continuous stationary randomized prediction strategy,
1793: i.e., a continuous stationary prediction strategy
1794: in the new game of prediction with loss function (\ref{eq:expected-loss}).
1795: Let $(A_K,B_K)$ be Remover's last move
1796: (if Remover makes infinitely many moves,
1797: the antecedent of (\ref{eq:dominates-randomized}) is false,
1798: and there is nothing to prove),
1799: and let $D':\Sigma\to\PPP(C(B_K))$ be a continuous stationary randomized prediction strategy
1800: satisfying (\ref{eq:prediction-type}) with $B:=B_K$.
1801: From some $n$ on
1802: our randomized prediction algorithm produces $\gamma_n\in\PPP(\Gamma)$
1803: concentrated on $C(B_K)$,
1804: and they will satisfy
1805: \begin{multline}\label{eq:stage-K}
1806:   \limsup_{N\to\infty}
1807:   \left(
1808:     \frac1N
1809:     \sum_{n=1}^N
1810:     \lambda(\gamma_n,y_n)
1811:     -
1812:     \frac1N
1813:     \sum_{n=1}^N
1814:     \lambda(D(\sigma_n),y_n)
1815:   \right)\\
1816:   \le
1817:   \limsup_{N\to\infty}
1818:   \left(
1819:     \frac1N
1820:     \sum_{n=1}^N
1821:     \lambda(\gamma_n,y_n)
1822:     -
1823:     \frac1N
1824:     \sum_{n=1}^N
1825:     \lambda(D'(\sigma_n),y_n)
1826:   \right)
1827:   \le
1828:   0.
1829: \end{multline}
1830: This is an interesting property
1831: but slightly different from what Theorem \ref{thm:randomized} asserts.
1832: 
1833: According to the proof of Lemma \ref{lem:C-rand},
1834: we can, and we will, assume that $D'(\sigma_n)$
1835: generates outcomes $d'_n$ in two steps:
1836: first $d_n$ is generated from $D(\sigma_n)$,
1837: and then it is replaced by $\gamma_0$ with probability $f_2(d_n)$.
1838: The loss function is bounded in absolute value
1839: on the compact set
1840: $C(B_K)\times B_K$ by a constant $L$.
1841: From the law of the iterated logarithm
1842: (see (\ref{eq:LIL-1}) and (\ref{eq:LIL-2}))
1843: applied to the losses of $\gamma_n$ and $d'_n$
1844: we now obtain,
1845: instead of (\ref{eq:stage-K}),
1846: \begin{multline*}
1847:   \limsup_{N\to\infty}
1848:   \left(
1849:     \frac1N
1850:     \sum_{n=1}^N
1851:     \lambda(g_n,y_n)
1852:     -
1853:     \frac1N
1854:     \sum_{n=1}^N
1855:     \lambda(d_n,y_n)
1856:   \right)\\
1857:   \le
1858:   \limsup_{N\to\infty}
1859:   \left(
1860:     \frac1N
1861:     \sum_{n=1}^N
1862:     \lambda(g_n,y_n)
1863:     -
1864:     \frac1N
1865:     \sum_{n=1}^N
1866:     \lambda(d'_n,y_n)
1867:   \right)\\
1868:   =
1869:   \limsup_{N\to\infty}
1870:   \left(
1871:     \frac1N
1872:     \sum_{n=1}^N
1873:     \lambda(\gamma_n,y_n)
1874:     -
1875:     \frac1N
1876:     \sum_{n=1}^N
1877:     \lambda(D'(\sigma_n),y_n)
1878:   \right)
1879:   \le
1880:   0
1881:   \enspace
1882:   \textrm{a.s.};
1883: \end{multline*}
1884: it remains to compare this with (\ref{eq:dominates-randomized}).
1885: 
1886: \section{Stationarity and continuity}
1887: \label{sec:stationarity}
1888: 
1889: As we said earlier,
1890: the assumption of stationarity is very natural
1891: for prediction strategies:
1892: it just means that the arbitrary origin of time is not taken into account
1893: (in the spirit of the invariance principle in statistics;
1894: see, e.g., \cite{lehmann:1986}, Section 6.1).
1895: Stationary strategies can detect and make use of all kinds of trends
1896: and one-off phenomena;
1897: e.g.,
1898: they can perform well when the rate of environment change is constantly increasing
1899: (as in our own environment).
1900: There need not be stationarity in the environment.
1901: 
1902: Interestingly,
1903: our prediction algorithms are continuous (or can be made continuous)
1904: but not stationary.
1905: First we discuss the continuity
1906: of the prediction algorithms
1907: constructed in the proofs of our four theorems.
1908: \begin{description}
1909: \item[Theorem \ref{thm:deterministic-compact}]
1910:   It is easy to check that the WAA is continuous;
1911:   by the Weierstrass $M$-test,
1912:   (\ref{eq:WAA}) converges uniformly
1913:   and so its sum is continuous.
1914: \item[Theorem \ref{thm:randomized-compact}]
1915:   To check that $\gamma_n$ is a continuous function of
1916:   $\sigma_n$ in the topology of weak convergence,
1917:   we only need to check that $\int f\D\gamma_n$ is a continuous function of $\sigma_n$
1918:   for each $f\in C(\Gamma)$.
1919:   This again follows from the Weierstrass $M$-test.
1920: \item[Theorem \ref{thm:deterministic}]
1921:   As described,
1922:   Predictor's strategy is not continuous
1923:   since his behavior changes suddenly when Reality outputs $(x_n,y_n)$
1924:   outside his current $A_k\times B_k$,
1925:   but it is clear that it can be ``smoothed around the edges''
1926:   to ensure continuity.
1927: \item[Theorem \ref{thm:randomized}]
1928:   The situation is analogous to Theorem \ref{thm:deterministic}.
1929: \end{description}
1930: 
1931: For concreteness,
1932: we will discuss stationarity only in the case of Theorem \ref{thm:deterministic-compact}.
1933: We know that the WAA is a prediction strategy that is continuous
1934: as a function of the type $\Sigma\times\{1,2,\ldots\}\to\Gamma$.
1935: It is not stationary
1936: (i.e., we cannot get rid of the $\{1,2,\ldots\}$)
1937: because it has to keep track of the experts' losses
1938: since the beginning of the game of prediction.
1939: Stationary strategies can depend on time only in a limited way:
1940: e.g., in terms of our own environment,
1941: they can depend on the time of day or the season.
1942: But the WAA's dependence is much heavier:
1943: it has to know precisely the time that has elapsed since the beginning.
1944: 
1945: Let us now check that
1946: there are no universal continuous stationary prediction strategies
1947: under the conditions of Theorem \ref{thm:deterministic-compact}.
1948: Suppose $\Gamma$ is such that there exists a continuous $f:\Gamma\to\Gamma$
1949: without fixed points
1950: (i.e., $f(\gamma)\ne\gamma$ for all $\gamma\in\Gamma$;
1951: we can take, e.g., a circle as $\Gamma$).
1952: If $D$ were a universal continuous stationary strategy,
1953: we could define another continuous stationary strategy $D'(\sigma):=f(D(\sigma))$
1954: and make Reality collude with $D'$
1955: (i.e., output $y_n$ leading to a significantly smaller loss for $D'$;
1956: this can be done for an appropriate choice of $\lambda$,
1957: and in fact can be done for all usual $\lambda$).
1958: 
1959: \iffalse
1960: In conclusion let us check that,
1961: for a wide class of loss function $\lambda$
1962: there are no universal continuous stationary prediction strategies.
1963: Indeed,
1964: suppose that for some $\gamma_1,\gamma_2\in\Gamma$ and $y_1,y_2\in\mathbf{Y}$,
1965: \begin{align*}
1966:   \lambda(\gamma_1,y_1)
1967:   &<
1968:   \lambda(\gamma_2,y_1)\\
1969:   \lambda(\gamma_2,y_2)
1970:   &<
1971:   \lambda(\gamma_1,y_2)\\
1972:   \inf_{\gamma\in\Gamma}
1973:   \max_{i=1,2}
1974:   \left(
1975:     \lambda(\gamma,y_i)
1976:     -
1977:     \lambda(\gamma_i,y_i)
1978:   \right)
1979:   &>
1980:   0
1981: \end{align*}
1982: (the first condition means that $\gamma_1$ is the ``right'' prediction for $y_1$,
1983: the second condition that $\gamma_2$ is the ``right'' prediction for $y_2$,
1984: and the third condition is that no $\gamma\in\Gamma$
1985: can simultaneously compete with $\gamma_1$ on $y_1$ and with $\gamma_2$ on $y_2$);
1986: this is a mild condition satisfied for the standard loss functions.
1987: Did not work.
1988: \fi
1989: 
1990: \subsection*{Stationary Reality}
1991: 
1992: A standard problem in probability theory is where Reality
1993: is governed by a stationary probability measure;
1994: of course, only stationary prediction strategies are considered.
1995: In this subsection we will list several references
1996: for this problem,
1997: considering, for simplicity, only the case where the signals $x_n$ are absent
1998: (formally, we assume that $\mathbf{X}$ is a one-element set
1999: and omit the $x_n$, which now do not carry any information, from our notation).
2000: 
2001: The problem of prediction has been studied extensively
2002: for both strictly stationary sequences of observations
2003: and wide sense stationary sequences
2004: (the definitions and a general discussion of ``strict sense'' and ``wide sense'' concepts
2005: can be found in \cite{doob:1953}, Chapter 2, Sections 8 and 3).
2006: We will first assume that $\ldots,y_{-1},y_0,y_1,\ldots$
2007: form a wide sense stationary sequence of random variables
2008: and then a strictly stationary sequence.
2009: 
2010: The natural mode of prediction for wide sense stationary sequences
2011: is linear prediction.
2012: The problem of linear prediction
2013: (not necessarily one-step-ahead, as in this paper)
2014: of wide sense stationary sequences
2015: was posed and solved by Kolmogorov
2016: \cite{kolmogorov:1939,\KolmogorovCRfull,\KolmogorovStationary};
2017: later but independently this was done by Wiener
2018: \cite{wiener:1949}.
2019: 
2020: Kolmogorov and Wiener assumed the probability distribution of the observations known.
2021: There are many efficient ways to estimate the spectral density of this probability distribution
2022: (in terms of which the optimal linear predictor is expressed);
2023: see, e.g., \cite{anderson:1971}, Chapter 9, for a review.
2024: (An early idea of spectral estimation was proposed by Einstein in 1914:
2025: see \cite{newton:2002}, p.~363.)
2026: 
2027: The problem of existence of universal prediction strategies
2028: for strictly stationary and ergodic sequences of observations
2029: was posed by Cover \cite{cover:1975},
2030: and such strategies were found by Ornstein \cite{ornstein:1978}
2031: for finite $\mathbf{Y}$
2032: and Algoet \cite{algoet:1992} for $\mathbf{Y}$ a Polish space.
2033: Papers \cite{gyorfi/etal:1999,gyorfi/lugosi:2001,nobel:2003}
2034: construct such strategies
2035: using techniques very similar to those of this paper.
2036: 
2037: \section{Conclusion}
2038: \label{sec:conclusion}
2039: 
2040: An interesting direction of further research
2041: is to obtain non-asymptotic versions of our results.
2042: If the benchmark class of continuous stationary prediction strategies
2043: is compact,
2044: loss bounds can be given in terms of $\epsilon$-entropy
2045: \cite{\KolmogorovTikhomirov}.
2046: In general,
2047: one can give loss bounds in terms of a nested family
2048: of compact sets
2049: whose union is dense in the set of continuous stationary prediction strategies
2050: (in analogy with Vapnik and Chervonenkis's principle
2051: of structural risk minimization \cite{vapnik:1998}).
2052: 
2053: \ifFULL\bluebegin
2054:   It would be interesting to explore unconditional continuous predictive complexity
2055:   in the simplest case without $x$s and with $\mathbf{Y}=\{0,1\}$
2056:   (and with the log loss or the square loss function).
2057: \blueend\fi
2058: 
2059: \subsection*{Acknowledgments}
2060: 
2061: I am grateful to Yura Kalnishkan and Ilia Nouretdinov
2062: for useful comments.
2063: The construction of CS universal prediction strategies
2064: is based on Alex Smola's and G\'abor Lugosi's suggestions.
2065: This work was partially supported by MRC (grant S505/65).
2066: 
2067: \begin{thebibliography}{10}
2068: 
2069: \bibitem{algoet:1992}
2070: Paul~H\DOT{} Algoet.
2071: \newblock Universal schemes for prediction, gambling and portfolio selection.
2072: \newblock {\em Annals of Probability}, 20:901--941, 1992.
2073: \newblock Corrections: 23:474--478, 1995.
2074: 
2075: \bibitem{anderson:1971}
2076: T\DOT{}~W\DOT{} Anderson.
2077: \newblock {\em The Statistical Analysis of Time Series}.
2078: \newblock Wiley, New York, 1971.
2079: \newblock Wiley Classics Library edition: 1994.
2080: 
2081: \bibitem{billingsley:1968}
2082: Patrick Billingsley.
2083: \newblock {\em Convergence of Probability Measures}.
2084: \newblock Wiley, New York, 1968.
2085: 
2086: \bibitem{blum/etal:1998}
2087: Lenore Blum, Felipe Cucker, Michael Shub, and Steve Smale.
2088: \newblock {\em Complexity and Real Computation}.
2089: \newblock Springer, New York, 1998.
2090: 
2091: \bibitem{blum/etal:1989}
2092: Lenore Blum, Michael Shub, and Steve Smale.
2093: \newblock On a theory of computation and complexity over the real numbers:
2094:   {NP}-completeness, recursive functions and universal machines.
2095: \newblock {\em Bulletin of the American Mathematical Society}, 21:1--46, 1989.
2096: 
2097: \bibitem{bourbaki:integration}
2098: Nicolas Bourbaki.
2099: \newblock {\em \'El\'ements de math\'ematique, Livre VI, Int\'egration, Chapitres
2100:   1 \`a 4}.
2101: \newblock Hermann, Paris, first edition, 1952.
2102: 
2103: \bibitem{cesabianchi/lugosi:2006}
2104: Nicol\`o Cesa-Bianchi and G\'abor Lugosi.
2105: \newblock {\em Prediction, Learning, and Games}.
2106: \newblock Cambridge University Press, Cambridge, 2006.
2107: 
2108: \bibitem{cover:1975}
2109: Tom~M\DOT{} Cover.
2110: \newblock Open problems in information theory.
2111: \newblock In {\em Moscow Information Theory Workshop}, New York, 1975. IEEE
2112:   Press.
2113: 
2114: \bibitem{doob:1953}
2115: Joseph~L\DOT{} Doob.
2116: \newblock {\em Stochastic Processes}.
2117: \newblock Wiley, New York, 1953.
2118: 
2119: \bibitem{dudley:2002}
2120: Richard~M\DOT{} Dudley.
2121: \newblock {\em Real Analysis and Probability}, volume~74 of {\em Cambridge
2122:   Studies in Advanced Mathematics}.
2123: \newblock Cambridge University Press, Cambridge, England, 2002.
2124: \newblock Originally published in 1989.
2125: 
2126: \bibitem{engelking:1989}
2127: Ryszard Engelking.
2128: \newblock {\em General Topology}, volume~6 of {\em Sigma Series in Pure
2129:   Mathematics}.
2130: \newblock Heldermann, Berlin, second edition, 1989.
2131: 
2132: \bibitem{gruenhage:2006}
2133: Gary Gruenhage.
2134: \newblock The story of a topological game.
2135: \newblock {\em Rocky Mountain Journal of Mathematics}, 2006.
2136: \newblock To appear.
2137: 
2138: \bibitem{gyorfi/lugosi:2001}
2139: L\'aszl\'o Gy\"orfi and G\'abor Lugosi.
2140: \newblock Strategies for sequential prediction of stationary time series.
2141: \newblock In Moshe Dror, Pierre L'Ecuyer, and Ferenc Szidarovszky, editors,
2142:   {\em Modeling Uncertainty: An Examination of its Theory, Methods, and
2143:   Applications}. Kluwer, 2001.
2144: 
2145: \bibitem{gyorfi/etal:1999}
2146: L\'aszl\'o Gy\"orfi, G\'abor Lugosi, and G\DOT{} Morvai.
2147: \newblock A simple randomized algorithm for consistent sequential prediction of
2148:   ergodic time series.
2149: \newblock {\em IEEE Transactions on Information Theory}, 45:2642--2650, 1999.
2150: 
2151: \bibitem{hardy/etal:1952}
2152: G\DOT{}~H\DOT{} Hardy, John~E\DOT{} Littlewood, and George P\'olya.
2153: \newblock {\em Inequalities}.
2154: \newblock Cambridge University Press, Cambridge, second edition, 1952.
2155: 
2156: \bibitem{kalnishkan/vyugin:2005}
2157: Yuri Kalnishkan and Michael~V\DOT{} Vyugin.
2158: \newblock The {W}eak {A}ggregating {A}lgorithm and weak mixability.
2159: \newblock In Peter Auer and Ron Meir, editors, {\em Proceedings of the
2160:   Eighteenth Annual Conference on Learning Theory}, volume 3559 of {\em Lecture
2161:   Notes in Computer Science}, pages 188--203, Berlin, 2005. Springer.
2162: \newblock The journal version is being prepared for the Special Issue of
2163:   \emph{Journal of Machine Learning Research} devoted to COLT'2005; all
2164:   references are to the journal version.
2165: 
2166: \bibitem{kolmogorov:1939}
2167: Andrei~N\DOT{} Kolmogorov.
2168: \newblock Sur l'interpolation et extrapolation des suites stationnaires.
2169: \newblock {\em Comptes rendus des S\'eances de l'Acad\'emie des Sciences},
2170:   208:2043--2045, 1939.
2171: 
2172: \bibitem{kolmogorov:1941CR-latin}
2173: Andrei~N\DOT{} Kolmogorov.
2174: \newblock Interpolation and extrapolation of stationary random sequences (in
2175:   {R}ussian).
2176: \newblock {\em Izvestiya AN SSSR. Mathematics series}, 5:3--14, 1941.
2177: 
2178: \bibitem{kolmogorov:1941-latin}
2179: Andrei~N\DOT{} Kolmogorov.
2180: \newblock Stationary sequences in {H}ilbert space (in {R}ussian).
2181: \newblock {\em Byulleten' MGU. Mathematics}, 2(6):1--40, 1941.
2182: 
2183: \bibitem{kolmogorov/tikhomirov:1959latin}
2184: Andrei~N\DOT{} Kolmogorov and Vladimir~M\DOT{} Tikhomirov.
2185: \newblock $\epsilon$-entropy and $\epsilon$-capacity of sets in functional
2186:   spaces (in {R}ussian).
2187: \newblock {\em Uspekhi Matematicheskikh Nauk}, 14(2):3--86, 1959.
2188: 
2189: \bibitem{lehmann:1986}
2190: E\DOT{}~L\DOT{} Lehmann.
2191: \newblock {\em Testing Statistical Hypotheses}.
2192: \newblock Springer, New York, second edition, 1986.
2193: 
2194: \bibitem{martin-lof:1970}
2195: Per Martin-L\"of.
2196: \newblock {\em Notes on Constructive Mathematics}.
2197: \newblock Almqvist \& Wiksell, Stockholm, 1970.
2198: 
2199: \bibitem{naimpally/warrack:1970}
2200: Som~A\DOT{} Naimpally and Brian~D\DOT{} Warrack.
2201: \newblock {\em Proximity Spaces}, volume~59 of {\em Cambridge Tracts in
2202:   Mathematics and Mathematical Physics}.
2203: \newblock Cambridge University Press, London, 1970.
2204: 
2205: \bibitem{newton:2002}
2206: H\DOT{}~Joseph Newton.
2207: \newblock A conversation with {E}manuel {P}arzen.
2208: \newblock {\em Statistical Science}, 17:357--378, 2002.
2209: 
2210: \bibitem{nobel:2003}
2211: Andrew~B\DOT{} Nobel.
2212: \newblock On optimal sequential prediction for general processes.
2213: \newblock {\em IEEE Transactions on Information Theory}, 49:83--98, 2003.
2214: 
2215: \bibitem{ornstein:1978}
2216: D\DOT{}~S\DOT{} Ornstein.
2217: \newblock Guessing the next output of a stationary process.
2218: \newblock {\em Israel Journal of Mathematics}, 30:292--296, 1978.
2219: 
2220: \bibitem{rudin:1991}
2221: Walter Rudin.
2222: \newblock {\em Functional Analysis}.
2223: \newblock McGraw-Hill, Boston, second edition, 1991.
2224: 
2225: \bibitem{shafer/vovk:2001}
2226: Glenn Shafer and \Vladimir{} Vovk.
2227: \newblock {\em Probability and Finance: It's Only a Game!}
2228: \newblock Wiley, New York, 2001.
2229: 
2230: \bibitem{vapnik:1998}
2231: Vladimir~N\DOT{} Vapnik.
2232: \newblock {\em Statistical Learning Theory}.
2233: \newblock Wiley, New York, 1998.
2234: 
2235: \bibitem{vovk:1990}
2236: \Vladimir{} Vovk.
2237: \newblock Aggregating strategies.
2238: \newblock In Mark Fulk and John Case, editors, {\em Proceedings of the Third
2239:   Annual Workshop on Computational Learning Theory}, pages 371--383, San Mateo,
2240:   CA, 1990. Morgan Kaufmann.
2241: 
2242: \bibitem{vovk:2001competitive}
2243: Vladimir Vovk.
2244: \newblock Competitive on-line statistics.
2245: \newblock {\em International Statistical Review}, 69:213--248, 2001.
2246: 
2247: \bibitem{GTP17arXiv}
2248: \Vladimir{} Vovk.
2249: \newblock Predictions as statements and decisions.
2250: \newblock Technical Report \texttt{arXiv:cs.LG/0606093}, \texttt{arXiv.org}
2251:   e-Print archive, June 2006.
2252: 
2253: \bibitem{wiener:1949}
2254: Norbert Wiener.
2255: \newblock {\em Extrapolation, Interpolation, and Smoothing of Stationary Time
2256:   Series with Engineering Applications}.
2257: \newblock Technology Press of the Massachusetts Institute of Technology,
2258:   Cambridge, MA, 1949.
2259: \newblock Reprinted from a secret 1942 publication.
2260: 
2261: \end{thebibliography}
2262: 
2263: \ifWP
2264:   \DFlastpage
2265: \fi
2266: \end{document}
2267: