1: % Last changed: 13 Jul 2006
2: % Spell checked: 13 Jul 2006
3: % 2120 lines, 68 KB
% ---------------------------------------------------------------------
% Build-variant switches.  Every primary switch starts out false; the
% variant to build is then turned on in the selection block below.
% ---------------------------------------------------------------------
\newif\ifJOURNAL \JOURNALfalse
\newif\ifCONF    \CONFfalse
\newif\ifarXiv   \arXivfalse
\newif\ifWP      \WPfalse
\newif\ifFULL    \FULLfalse

% LATIN: the Cyrillic references are set in Latin.
\newif\ifLATIN   \LATINfalse

% Selection block: choose JOURNAL, arXiv, WP, or FULL.
%\JOURNALtrue
%\CONFtrue
\arXivtrue
%\WPtrue
%\FULLtrue % this version is not for publication and contains extra remarks and questions

%\LATINtrue % uncomment to set the Cyrillic references in Latin
\ifarXiv \LATINtrue \fi % for submitting to arXiv

% Derived switches, each computed once from the primary ones above.
\newif\ifnotJOURNAL % complement of JOURNAL
\ifJOURNAL \notJOURNALfalse \else \notJOURNALtrue \fi

\newif\ifnotarXiv % complement of arXiv
\ifarXiv \notarXivfalse \else \notarXivtrue \fi

\newif\ifTR    % TR = arXiv or WP
\newif\ifnotTR % complement of TR
\TRfalse \notTRtrue
\ifarXiv \TRtrue \notTRfalse \fi
\ifWP    \TRtrue \notTRfalse \fi

\newif\ifnotLATIN % complement of LATIN
\ifLATIN \notLATINfalse \else \notLATINtrue \fi
47:
% Per-variant values of the \GTP... macros (presumably the citation keys
% of the companion papers; confirm against their uses later in the file).
% The starred \newcommand is used because none of these macros takes
% arguments.
% NOTE(review): the JOURNAL branch defines \GTPXVI, whereas the arXiv and
% WP branches define \GTPXVII; the FULL branch defines neither, and there
% is no CONF branch at all.  Confirm which of these macros are actually
% used further down.
\ifJOURNAL
  \newcommand*{\GTPVII}{vovk/shafer:2005RSS}
  \newcommand*{\GTPVIII}{vovk/etal:2005AIStatslocal}
  \newcommand*{\GTPX}{vovk/etal:2005ALT}
  \newcommand*{\GTPXI}{GTP11arXiv-local}
  \newcommand*{\GTPXIII}{vovk:2005ALT-GTP13}
  \newcommand*{\GTPXIV}{vovk:2005ALT-GTP14}
  \newcommand*{\GTPXVI}{GTP16arXiv-local}
\fi
\ifarXiv
  \newcommand*{\GTPVII}{GTP7}
  \newcommand*{\GTPVIII}{GTP8arXiv}
  \newcommand*{\GTPX}{GTP10arXiv}
  \newcommand*{\GTPXI}{GTP11arXiv}
  \newcommand*{\GTPXIII}{GTP13arXiv}
  \newcommand*{\GTPXIV}{GTP14arXiv}
  \newcommand*{\GTPXVII}{GTP17arXiv}
\fi
\ifWP
  \newcommand*{\GTPVII}{GTP7}
  \newcommand*{\GTPVIII}{GTP8}
  \newcommand*{\GTPX}{GTP10}
  \newcommand*{\GTPXI}{GTP11}
  \newcommand*{\GTPXIII}{GTP13}
  \newcommand*{\GTPXIV}{GTP14}
  \newcommand*{\GTPXVII}{GTP17}
\fi
\ifFULL
  \newcommand*{\GTPVII}{GTP7}
  \newcommand*{\GTPVIII}{GTP8arXiv}
  \newcommand*{\GTPX}{GTP10arXiv}
  \newcommand*{\GTPXI}{GTP11arXiv}
  \newcommand*{\GTPXIII}{GTP13arXiv}
  \newcommand*{\GTPXIV}{GTP14arXiv}
\fi
83:
% Values of the \Kolmogorov... macros (presumably the citation keys of
% the Kolmogorov references): the "latin" variants are selected when the
% LATIN switch is on, the plain ones otherwise.
\ifLATIN
  \newcommand*{\KolmogorovTikhomirov}{kolmogorov/tikhomirov:1959latin}
  \newcommand*{\KolmogorovCRfull}{kolmogorov:1941CR-latin}
  \newcommand*{\KolmogorovStationary}{kolmogorov:1941-latin}
\else
  \newcommand*{\KolmogorovTikhomirov}{kolmogorov/tikhomirov:1959}
  \newcommand*{\KolmogorovCRfull}{kolmogorov:1941CR}
  \newcommand*{\KolmogorovStationary}{kolmogorov:1941}
\fi
94:
% Document class, packages and the \Extra macro for each build variant.
% \Extra{...} wraps the extra remarks: its argument is discarded in every
% variant except FULL, where it is printed in red.
\ifJOURNAL
\documentclass[toc]{article}
\usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}
\newcommand{\Extra}[1]{}
\fi

\ifCONF
\documentclass[toc]{article}
\usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}
\newcommand{\Extra}[1]{}
\fi

\ifarXiv
\documentclass[toc]{article}
\usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}
\newcommand{\Extra}[1]{}
\fi

\ifWP
\documentclass[toc]{gtarticle}
\usepackage{amsmath,amsfonts,amssymb,latexsym,epsfig,graphicx}
% Nothing visible in this file defines \Extra before this point, so a bare
% \renewcommand would stop with "\Extra undefined".  \providecommand
% creates the macro if it is missing; \renewcommand then sets the body
% whether or not the class already supplied one.
% The argument is discarded, as in the other variants meant for
% publication (only FULL prints the extra remarks).
\providecommand{\Extra}[1]{}
\renewcommand{\Extra}[1]{}
\fi

\ifFULL
\documentclass{article}
\usepackage{amsmath,amsfonts,amssymb,latexsym,color,epigraph,graphicx,eepic}
% \Extra uses \red, which is defined on the next line; this is fine
% because \red is only expanded when \Extra is used.
\newcommand{\Extra}[1]{\red{#1}}
% Colour helpers: one-argument forms and begin/end pairs for longer spans.
\newcommand{\red}[1]{\textcolor{red}{#1}}
\newcommand{\blue}[1]{\textcolor{blue}{#1}}
\newcommand{\bluebegin}{\begingroup\color{blue}}
\newcommand{\blueend}{\endgroup}
\newcommand{\redbegin}{\begingroup\color{red}}
\newcommand{\redend}{\endgroup}
\fi
130:
% Line-breaking and page-breaking parameters.
\setlength{\emergencystretch}{5mm}
\tolerance=400
\allowdisplaybreaks[4] % amsmath; 4 is the most permissive level

% Small text macros.
\newcommand*{\Vladimir}{Vladimir}
\newcommand*{\DOT}{.}

% Cyrillic support, set up only when the LATIN switch is off.
% The cyr environment switches to the OT2/wncyr font at its start; its
% end code selects OT1/tir.
\ifLATIN\else
  \input{OT2enc.def}
  \newenvironment{cyr}%
    {\fontencoding{OT2}\fontfamily{wncyr}\fontseries{m}\fontshape{n}\selectfont}%
    {\fontencoding{OT1}\fontfamily{tir}\selectfont}
  \usepackage{CJK}
\fi
145:
% ---- Relations and differentials ------------------------------------
% \st and \givn: a vertical bar typeset as a relation (\st with the
% surrounding space tightened by \!).
\newcommand*{\st}{\mathrel{\!|\!}}
\newcommand*{\givn}{\mathrel{|}}
% Upright d: \D is preceded by a thin space, \dd is not.
\newcommand*{\D}{\,\mathrm{d}}
\newcommand*{\dd}{\mathrm{d}}

% ---- Letters with a fixed meaning -----------------------------------
\newcommand*{\K}{\mathcal{K}}   % capital
\newcommand*{\kkk}{\mathbf{k}}  % kernel
\newcommand*{\ccc}{\mathbf{c}}  % constant
\newcommand*{\III}{\mathbb{I}}
\newcommand*{\CCC}{\mathcal{C}} % class of prediction rules
\newcommand*{\FFF}{\mathcal{F}} % function space
\newcommand*{\GGG}{\mathcal{G}} % function space
\newcommand*{\HHH}{\mathcal{H}} % Hilbert space
\newcommand*{\PPP}{\mathcal{P}} % all probability measures
\newcommand*{\SSS}{\mathcal{S}} % Sobolev space

% ---- Operators (all without limits) ---------------------------------
\newcommand*{\Int}{\mathop{\mathrm{Int}}\nolimits}

\newcommand*{\bbbp}{\mathbb{P}} % auxiliary (probability)
\newcommand*{\Prob}{\mathop{\bbbp}\nolimits}
\newcommand*{\bbbe}{\mathbb{E}} % auxiliary (expectation)
\newcommand*{\Expect}{\mathop{\bbbe}\nolimits}

% cpc, plus its overlined and underlined variants; the \mathstrut gives
% the two bars a uniform height and depth.
\newcommand*{\cpc}{\mathop{\mathrm{cpc}}\nolimits}
\newcommand*{\ucpc}{\mathop{\overline{\mathstrut\mathrm{cpc}}}\nolimits}
\newcommand*{\lcpc}{\mathop{\underline{\mathstrut\mathrm{cpc}}}\nolimits}

\newcommand*{\bbbr}{\mathbb{R}} % the real numbers
174:
% Theorem-like environments.  No shared counter is given, so each
% environment is numbered independently of the others.
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{remark}{Remark}
\newtheorem{theorem}{Theorem}
% proof: an unnumbered trivlist item whose label is a bold ``Proof''.
\newenvironment{proof}
{\trivlist\item[\hskip\labelsep\textbf{Proof}]}
{\endtrivlist}

% Proof: like proof, but the mandatory argument is appended to the bold
% label (followed by a thin space).
\newenvironment{Proof}[1]
{\trivlist\item[\hskip\labelsep\textbf{Proof #1\,}]}
{\endtrivlist}
% \boxforqed: the end-of-proof mark, a solid rule .3em wide and 1.5ex high.
\newcommand{\boxforqed}{\rule{.3em}{1.5ex}}
% \qedtext: ends the current paragraph with \boxforqed pushed to the right
% margin (\parfillskip is set to 0pt for this paragraph); the \penalty50
% offers a line break before the mark.
\newcommand{\qedtext}{\unskip\nobreak\hfil
\penalty50\hskip1em\null\nobreak\hfil\boxforqed
\parfillskip=0pt\finalhyphendemerits=0\endgraf}
% \qedmath: places \boxforqed as an unparenthesized equation tag (\tag*).
% The earlier \eqno-based definition is kept below for reference.
%\newcommand{\qedmath}{\eqno\boxforqed}
\newcommand{\qedmath}{\tag*{\boxforqed}}
% remark*: unnumbered remark with a bold ``Remark'' label; it is separate
% from the numbered remark environment declared above.
\newenvironment{remark*}
{\trivlist\item[\hskip\labelsep{\bfseries Remark}]\relax}
{\endtrivlist}
196:
% Title and author block, one per build variant.
% JOURNAL: full postal affiliation and e-mail address.
\ifJOURNAL
\title{Competing with stationary prediction strategies}
\author{Vladimir Vovk\\[5mm]
Computer Learning Research Centre\\
Department of Computer Science\\
Royal Holloway, University of London,
Egham, Surrey TW20 0EX, UK\\
\texttt{vovk@cs.rhul.ac.uk}}
\fi

% CONF: same title and author block as JOURNAL.
\ifCONF
\title{Competing with stationary prediction strategies}
\author{Vladimir Vovk\\[5mm]
Computer Learning Research Centre\\
Department of Computer Science\\
Royal Holloway, University of London,
Egham, Surrey TW20 0EX, UK\\
\texttt{vovk@cs.rhul.ac.uk}}
\fi

% arXiv: name, e-mail address (with the @ set in roman) and web page only.
\ifarXiv
\title{Competing with stationary prediction strategies}%\\(draft: comments welcome)}
\author{Vladimir Vovk\\
\texttt{vovk{\rm@}cs.rhul.ac.uk}\\
\texttt{http://vovk.net}}
\fi

% WP: name only; \No is set to 18 (presumably the working-paper number
% read by the gtarticle class -- confirm against the class file).
\ifWP
\title{Competing with stationary prediction strategies}
\author{Vladimir Vovk}
\newcommand{\No}{18}
% For the two dates option: uncomment the next 2 lines
% \twodatestrue
% \newcommand{\firstposted}{July 13, 2006}
\fi

% FULL: same author block as arXiv.
\ifFULL
\title{Competing with stationary prediction strategies}
\author{Vladimir Vovk\\
\texttt{vovk{\rm@}cs.rhul.ac.uk}\\
\texttt{http://vovk.net}}
\fi
239:
240: \begin{document}
241: \maketitle
242: \begin{abstract}
243: In this paper we introduce the class of stationary prediction strategies
244: and construct a prediction algorithm
245: that asymptotically performs as well as the best continuous stationary strategy.
246: We make mild compactness assumptions but no stochastic assumptions
247: about the environment.
248: In particular,
249: no assumption of stationarity is made about the environment,
250: and the stationarity of the considered strategies
251: only means that they do not depend explicitly on time;
252: we argue that it is natural to consider only stationary strategies
253: even for highly non-stationary environments.
254: \end{abstract}
255:
256: \section{Introduction}
257: \label{sec:introduction}
258:
259: This paper belongs to the area of learning theory
260: that has been variously referred to as prediction with expert advice,
261: competitive on-line prediction,
262: prediction of individual sequences,
263: and universal on-line learning;
264: see \cite{cesabianchi/lugosi:2006} for a review.
265: There are many proof techniques known in this field;
266: this paper is based on Kalnishkan and Vyugin's Weak Aggregating Algorithm
267: \cite{kalnishkan/vyugin:2005},
268: but it is possible that some of the numerous other techniques
269: could be used instead.
270:
271: In Section \ref{sec:results} we give the main definitions
272: and state our main results, Theorems \ref{thm:deterministic-compact}--\ref{thm:randomized};
273: their proofs are given
274: in Sections \ref{sec:proof-deterministic-compact}--\ref{sec:proof-randomized}.
275: In Section \ref{sec:stationarity}
276: we informally discuss the notion of stationarity,
277: and Section \ref{sec:conclusion} concludes.
278:
279: \section{Main results}
280: \label{sec:results}
281:
282: The \emph{game of prediction} between Predictor and Reality
283: is played according to the following protocol
284: (of \emph{perfect information},
285: in the sense that either player can see the other player's moves made so far).
286:
287: \bigskip
288:
289: \noindent
290: \textsc{Prediction protocol}\nopagebreak
291: \begin{tabbing}
292: \qquad\=\qquad\=\qquad\kill
293: Reality announces $(\ldots,x_{-1},y_{-1},x_0,y_0)\in(\mathbf{X}\times\mathbf{Y})^{\infty}$.\\
294: FOR $n=1,2,\dots$:\\
295: \> Reality announces $x_n\in\mathbf{X}$.\\
296: \> Predictor announces $\gamma_n\in\Gamma$.\\
297: \> Reality announces $y_n\in\mathbf{Y}$.\\
298: END FOR.
299: \end{tabbing}
300:
301: \noindent
302: After Reality's first move the game proceeds in rounds numbered by the positive integers $n$.
303: At the beginning of each round $n=1,2,\ldots$ Predictor is given some signal $x_n$
304: relevant to predicting the following observation $y_n$.
305: The signal is taken from the \emph{signal space} $\mathbf{X}$
306: and the observations from the \emph{observation space} $\mathbf{Y}$.
307: Predictor then announces his prediction $\gamma_n$,
308: taken from the \emph{prediction space} $\Gamma$,
309: and the prediction's quality in light of the actual observation
310: is measured by a \emph{loss function}
311: $\lambda:\Gamma\times\mathbf{Y}\to\bbbr$.
312: % (The prediction protocol with a fixed loss function
313: % will sometimes be referred to as a \emph{prediction game},
314: % or \emph{game of prediction}.)
315: At the beginning of the game Reality chooses the infinite past,
316: $(x_n,y_n)$ for all $n\le0$.
317:
318: In the games of prediction traditionally considered in machine learning
319: there is no infinite past.
320: This situation is modeled in our framework by extending the signal space and observation space
321: by new elements ${?}\in\mathbf{X}$ and ${?}\in\mathbf{Y}$,
322: defining $\lambda(\gamma,{?})$ arbitrarily,
323: and making Reality announce the infinite past
324: $(\ldots,x_{-1},y_{-1},x_0,y_0)=(\ldots,{?},{?},{?},{?})$
325: and refrain from announcing $x_n={?}$ or $y_n={?}$ afterwards
326: (intuitively, $?$ corresponds to ``no feedback from Reality'').
327:
328: We will always assume that the signal space $\mathbf{X}$,
329: the prediction space $\Gamma$,
330: and the observation space $\mathbf{Y}$
331: are non-empty topological spaces
332: and that the loss function $\lambda$ is continuous.
333: Moreover,
334: we are mainly interested in the case
335: where $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$ are locally compact metric spaces,
336: the prime examples being Euclidean spaces and their open and closed subsets.
337: Our first results will be stated for the case
338: where all three spaces $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$ are compact.
339:
340: \begin{remark*}
341: Our results can be easily extended to the case
342: where the loss on the $n$th round is allowed to depend,
343: in addition to $\gamma_n$ and $y_n$,
344: on the past $\ldots,x_{n-1},y_{n-1},x_n$.
345: This would, however, complicate the notation.
346: \end{remark*}
347:
348: Predictor's strategies in the prediction protocol will be called
349: \emph{prediction strategies}
350: (or \emph{prediction algorithms},
351: when they are defined explicitly and we want to emphasize this).
352: Mathematically such a strategy is a function
353: $D:(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}\times\{1,2,\ldots\}\to\Gamma$;
354: it maps each history $(\ldots,x_{n-1},y_{n-1},x_n)$
355: and the current time $n$ to the chosen prediction.
356: In this paper we will only be interested in continuous prediction strategies $D$
357: (according to the traditional point of view \cite{martin-lof:1970},
358: going back to Brouwer,
359: only continuous prediction strategies can be computable;
360: although it should be mentioned that nowadays
361: there are influential definitions of computability
362: \cite{blum/etal:1989,blum/etal:1998}
363: not requiring continuity).
364: An especially natural class of strategies
365: is formed by the \emph{stationary prediction strategies}
366: $D:(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}\to\Gamma$,
367: which do not depend on time explicitly;
368: since the origin of time is usually chosen arbitrarily,
369: this appears a reasonable restriction
370: (see Section \ref{sec:stationarity} for a further discussion).
371:
372: \subsection*{Universal prediction strategies: compact deterministic case}
373:
In this and the next subsection we will assume that the spaces $\mathbf{X},\Gamma,\mathbf{Y}$
375: are all compact.
376: A prediction strategy is \emph{CS universal} for a loss function $\lambda$ if
377: its predictions $\gamma_n$ satisfy
378: \begin{equation}\label{eq:dominates-deterministic-compact}
379: \limsup_{N\to\infty}
380: \Biggl(
381: \frac1N
382: \sum_{n=1}^N
383: \lambda
384: (\gamma_n,y_n)
385: {}-
386: \frac1N
387: \sum_{n=1}^N
388: \lambda
389: \bigl(
390: D(\ldots,x_{n-1},y_{n-1},x_n),y_n
391: \bigr)
392: \Biggr)
393: \le
394: 0
395: \end{equation}
396: for any continuous stationary prediction strategy $D$
397: and any biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$\,.
398: (``CS'' refers to the continuity and stationarity of the prediction strategies
399: we are competing with.)
400: \begin{theorem}\label{thm:deterministic-compact}
401: Suppose $\mathbf{X}$ and $\mathbf{Y}$ are compact metric spaces,
402: $\Gamma$ is a compact convex subset of a Banach space,
403: and the loss function $\lambda(\gamma,y)$ is continuous in $(\gamma,y)$
404: and convex in the variable $\gamma\in\Gamma$.
405: There exists a CS universal prediction algorithm.
406: \end{theorem}
407: A CS universal prediction algorithm will be constructed in the next section.
408:
409: \subsection*{Universal prediction strategies: compact randomized case}
410:
411: When the loss function $\lambda(\gamma,y)$ is not convex in $\gamma$,
412: two difficulties appear:
413: \begin{itemize}
414: \item
415: the conclusion of Theorem \ref{thm:deterministic-compact} becomes false
416: if the convexity requirement is removed
417: (\cite{kalnishkan/vyugin:2005}, Theorem 2);
418: \item
419: in some cases the notion of a continuous prediction strategy becomes vacuous:
420: e.g., there are no non-constant continuous stationary prediction strategies
421: when $\Gamma=\{0,1\}$
422: and $(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}$ is connected
423: (the latter condition is equivalent to $\mathbf{X}$ and $\mathbf{Y}$
424: being connected---see \cite{engelking:1989}, Theorem 6.1.15).
425: \end{itemize}
426: To overcome these difficulties,
427: we consider randomized prediction strategies.
428: The proof of Theorem \ref{thm:deterministic-compact}
429: will give a universal, in a natural sense,
430: randomized prediction algorithm;
431: on the other hand,
432: there will be a vast supply of continuous stationary prediction strategies.
433:
434: \begin{remark*}
435: In fact,
436: the second difficulty is more apparent than real:
437: for example, in the binary case ($\mathbf{Y}=\{0,1\}$)
438: there are many non-trivial continuous prediction strategies
439: in the canonical form of the prediction game \cite{vovk:1990}
440: with the prediction space redefined as the boundary of the set of superpredictions
441: \cite{kalnishkan/vyugin:2005}.
442: \end{remark*}
443:
444: A \emph{randomized prediction strategy} is a function
445: $D:(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}\times\{1,2,\ldots\}\to\PPP(\Gamma)$
446: mapping the past complemented by the current time
447: to the probability measures on the prediction space;
448: $\PPP(\Gamma)$ is always equipped with the topology of weak convergence
449: (\cite{billingsley:1968};
450: this topology is also discussed, in the compact case,
451: in Section \ref{sec:proof-randomized-compact} below).
452: In other words, this is a prediction strategy
453: in the extended game of prediction with the prediction space $\PPP(\Gamma)$.
454: Analogously,
455: a \emph{stationary randomized prediction strategy} is a function
456: $D:(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}\to\PPP(\Gamma)$.
457:
458: Let us say that a randomized prediction strategy outputting $\gamma_n$
459: is \emph{CS universal} for a loss function $\lambda$ if,
460: for any continuous stationary randomized prediction strategy $D$
461: and any biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$,
462: \begin{equation}\label{eq:dominates-randomized-compact}
463: \limsup_{N\to\infty}
464: \left(
465: \frac1N
466: \sum_{n=1}^N
467: \lambda(g_{n},y_n)
468: -
469: \frac1N
470: \sum_{n=1}^N
471: \lambda(d_{n},y_n)
472: \right)
473: \le
474: 0
475: \enspace
476: \textrm{a.s.},
477: \end{equation}
478: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent random variables
479: distributed as
480: \begin{align}
481: g_{n}
482: &\sim
483: \gamma_n\label{eq:distributed-1},\\
484: d_{n}
485: &\sim
486: D(\ldots,x_{n-1},y_{n-1},x_n),\label{eq:distributed-2}
487: \end{align}
488: $n=1,2,\ldots$\,.
489: Intuitively,
490: the ``a.s.''\ in (\ref{eq:dominates-randomized-compact})
491: refers to the prediction strategies' internal randomization.
492: \begin{theorem}\label{thm:randomized-compact}
493: Let $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$ be compact metric spaces
494: and $\lambda$ be a continuous loss function.
495: There exists a CS universal randomized prediction algorithm.
496: \end{theorem}
497:
498: \ifFULL\bluebegin
Let $\Sigma:=(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}$ be a metric space.
500: For any discrete (e.g., finite) subset $\{\sigma_1,\sigma_2,\ldots\}$ of $\Sigma$
501: and any sequence $\gamma_n\in\PPP(\Gamma)$ of probability measures on $\Gamma$
502: there exists a continuous stationary randomized prediction strategy $D$
503: such that $D(\sigma_n)=\gamma_n$ for all $n$
504: (indeed, it suffices to set $D(\sigma):=\sum_n\phi_n(\sigma)\gamma_n$,
505: where $\phi_n:\Sigma\to[0,1]$, $n=1,2,\ldots$,
506: are continuous functions with disjoint supports
507: such that $\phi_n(\sigma_n)=1$ for all $n$).
508: Therefore, there is no shortage of continuous stationary randomized prediction strategies.
509: \blueend\fi
510:
511: \subsection*{Simple reductions to the compact case}
512:
513: In the following two subsections we will discuss the case
514: where the signal, prediction, and observation spaces
515: are not required to be compact.
516: The goal of this subsection is to show that the compact case
517: is not as special as it may seem,
518: as far as Theorem \ref{thm:randomized-compact} is concerned.
519: The rest of the paper does not depend on this subsection.
520:
521: In general,
522: we might consider $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$
523: together with their fixed compactifications
524: $\overline{\mathbf{X}}$, $\overline{\Gamma}$, and $\overline{\mathbf{Y}}$
525: (without loss of generality we can and will assume that
526: $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$
527: are dense in their compactifications,
528: and then the compactifications will be the closures of the original spaces,
529: which explains our notation).
530: \ifFULL\bluebegin
531: Problem in the case of Theorem \ref{thm:deterministic-compact}:
532: $\overline{\Gamma}$ may cease to be a compact convex subset of a Banach space.
533: \blueend\fi
534: Let us suppose that $\lambda$ is bounded and continuous,
535: and, moreover, can be continuously extended to the product
536: $\overline{\Gamma}\times\overline{\mathbf{Y}}$
537: of the compactifications;
538: such an extension is then unique and will also be denoted $\lambda$.
539:
540: If $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$
541: are Euclidean spaces their natural compactifications
542: might be chosen as Aleksandrov's one-point compactification
543: (\cite{engelking:1989}, Theorem 3.5.11),
544: the corresponding projective space
545: (with $\bbbr\mathrm{P}^L$ being the compactification of $\bbbr^L$),
546: or the corresponding closed unit ball
547: (with the interior of the closed unit ball in $\bbbr^L$
548: identified with $\bbbr^L$
549: by mapping a vector $v$ of length $l\in[0,1)$ in the former set
550: to the vector $(\tan(\pi l/2))v$).
551: The Stone--\v{C}ech compactification
552: (\cite{engelking:1989}, Section 3.6)
553: will usually be too large:
554: we will want our compactifications to be metrizable.
555:
556: Theorem \ref{thm:randomized-compact} will remain true
557: if instead of assuming $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$ to be metric compacts
558: we assume that
559: $\overline{\mathbf{X}}$, $\overline{\Gamma}$, and $\overline{\mathbf{Y}}$
560: are metric compacts
561: and if in the definition of CS universality (\ref{eq:dominates-randomized-compact})
562: we only consider continuous stationary prediction strategies
563: that have a continuous extension to
564: $(\overline{\mathbf{X}}\times\overline{\mathbf{Y}})^{\infty}\times\overline{\mathbf{X}}$.
565:
566: \ifFULL\bluebegin
567: As an example,
568: suppose $\mathbf{X}$ is a Euclidean space
569: and consider a prediction strategy
570: $D(\ldots,x_{n-1},y_{n-1},x_{n})$ that only depends on $x_n$.
571: Then $D$ can be extended to the compactification of $\mathbf{X}$ if it:
572: tends to a limit as $\left\|x\right\|\to\infty$
573: (in the case of Aleksandrov's compactification);
574: tends to a limit in every direction
575: (in the case of the closed unit ball);
576: tends to a limit in every direction
577: with the limits in opposite directions coinciding
578: (in the case of the projective space).
579: \blueend\fi
580:
581: \begin{remark*}
582: An elegant way to avoid considering compactifications
583: would be to assume that $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$
584: are metrizable proximity spaces
585: (see \cite{engelking:1989}, Section 8.4, or \cite{naimpally/warrack:1970},
586: where \cite{engelking:1989}'s ``proximity spaces'' are called ``separated proximity spaces'')
587: and to consider only proximity prediction strategies.
588: By Smirnov's theorem (\cite{engelking:1989}, Theorem 8.4.13 and also Theorem 8.4.9;
589: \cite{naimpally/warrack:1970}, Theorem 7.7)
590: a proximity space can be identified with the corresponding topological space
591: equipped with a compactification.
592: Assuming that the loss function $\lambda$ is a bounded proximity function,
593: it can be uniquely continuously extended to the compactification
594: $\overline{\Gamma}\times\overline{\mathbf{Y}}$
595: (\cite{naimpally/warrack:1970}, Theorem 7.10),
596: and every proximity stationary prediction strategy can be identified
597: with a continuous function on the compactification
598: $(\overline{\mathbf{X}}\times\overline{\mathbf{Y}})^{\infty}\times\overline{\mathbf{X}}$
599: (by the same theorem).
600: To ensure that the compactifications are metrizable,
601: it is sufficient to assume that the proximity spaces are second-countable
602: (i.e., have countable proximity weights;
603: see \cite{naimpally/warrack:1970}, Theorem 8.14,
604: and \cite{engelking:1989}, Theorem 4.2.8).
605: We chose the slightly clumsier language of compactifications
606: because the notion of a topological space is much more familiar
607: than that of a proximity space.
608: \end{remark*}
609:
610: \subsection*{Universal prediction strategies: deterministic case}
611:
612: Let us say that a set in a topological space is \emph{precompact}
613: if its closure is compact.
614: In Euclidean spaces,
615: precompactness means boundedness.
In this and the next subsection we drop the assumption of compactness
617: of $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$,
618: and so we have to redefine the notion of CS universality.
619:
A prediction strategy outputting $\gamma_n\in\Gamma$
621: is \emph{CS universal}
622: for a loss function $\lambda$ if,
623: for any continuous stationary prediction strategy $D$
624: and for any biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$,
625: \begin{multline}\label{eq:dominates-deterministic}
626: \bigl(
627: \{\ldots,x_{-1},x_0,x_1,\ldots\}
628: \text{ and }
629: \{\ldots,y_{-1},y_0,y_1,\ldots\}
630: \text{ are precompact}
631: \bigr)\\
632: \Longrightarrow
633: \limsup_{N\to\infty}
634: \Biggl(
635: \frac1N
636: \sum_{n=1}^N
637: \lambda(\gamma_n,y_n)
638: -
639: \frac1N
640: \sum_{n=1}^N
641: \lambda
642: \bigl(
643: D(\ldots,x_{n-1},y_{n-1},x_n),y_n
644: \bigr)
645: \Biggr)
646: \le
647: 0.
648: \end{multline}
649: The intuition behind the antecedent of (\ref{eq:dominates-deterministic}),
650: in the Euclidean case,
651: is that the prediction algorithm
652: knows that $\left\|x_n\right\|$ and $\left\|y_n\right\|$ are bounded
653: but does not know an upper bound in advance.
654:
655: Let us say that the loss function $\lambda$ is \emph{large at infinity}
656: if, for all $y^*\in\mathbf{Y}$,
657: \begin{equation*}
658: \lim_{\substack{y\to y^*\\\gamma\to\infty}}
659: \lambda(\gamma,y)
660: =
661: \infty
662: \end{equation*}
663: (in the sense that for each constant $M$
664: there exists a neighborhood $O_{y^*}\ni y^*$ and compact $C\subseteq\Gamma$ such that
665: $\lambda\left(\Gamma\setminus C,O_{y^*}\right)\subseteq(M,\infty)$).
666: Intuitively, we require that faraway $\gamma\in\Gamma$
667: should be poor predictions for nearby $y^*\in\mathbf{Y}$.
668: This assumption is satisfied for most of the usual loss functions
669: used in competitive on-line prediction.
670: \ifFULL\bluebegin
671: (A notable exception is the \emph{log-loss game},
672: where $\Gamma=(0,1)$, $\mathbf{Y}=\{0,1\}$,
673: and $\lambda(\gamma,y)=-y\ln\gamma-(1-y)\ln(1-\gamma)$;
674: for the log-loss game our construction still works
675: if we replace the WAA of \cite{kalnishkan/vyugin:2005}
676: by the AA of \cite{vovk:1990} in the proof.)
677: \blueend\fi
678: \begin{theorem}\label{thm:deterministic}
679: Suppose $\mathbf{X}$ and $\mathbf{Y}$ are locally compact metric spaces,
680: $\Gamma$ is a convex subset of a Banach space,
681: and the loss function $\lambda(\gamma,y)$ is continuous,
682: large at infinity, and convex in the variable $\gamma\in\Gamma$.
683: There exists a CS universal prediction algorithm.
684: \end{theorem}
685: To have a specific example in mind,
686: the reader might check that $\mathbf{X}=\bbbr^{K}$, $\Gamma=\mathbf{Y}=\bbbr^{L}$,
687: and $\lambda(\gamma,y):=\left\|y-\gamma\right\|$
688: satisfy the conditions of the theorem.
689:
690: \subsection*{Universal prediction strategies: randomized case}
691:
692: We say that a randomized prediction strategy
693: outputting randomized predictions $\gamma_n$
694: is \emph{CS universal} if,
695: for any continuous stationary randomized prediction strategy $D$
696: and for any biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$,
697: \begin{multline}\label{eq:dominates-randomized}
698: \bigl(
699: \{\ldots,x_{-1},x_0,x_1,\ldots\}
700: \text{ and }
701: \{\ldots,y_{-1},y_0,y_1,\ldots\}
702: \text{ are precompact}
703: \bigr)\\
704: \Longrightarrow
705: \left(
706: \limsup_{N\to\infty}
707: \left(
708: \frac1N
709: \sum_{n=1}^N
710: \lambda(g_{n},y_n)
711: -
712: \frac1N
713: \sum_{n=1}^N
714: \lambda(d_{n},y_n)
715: \right)
716: \le
717: 0
718: \enspace
719: \textrm{a.s.}
720: \right),
721: \end{multline}
722: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent random variables
723: distributed according to (\ref{eq:distributed-1})--(\ref{eq:distributed-2}).
724: \begin{theorem}\label{thm:randomized}
725: Let $\mathbf{X}$ and $\mathbf{Y}$ be locally compact metric spaces,
726: $\Gamma$ be a metric space,
727: and $\lambda$ be a continuous and large at infinity loss function.
728: There exists a CS universal randomized prediction algorithm.
729: \end{theorem}
730:
731: \section{Proof of Theorem \ref{thm:deterministic-compact}}
732: \label{sec:proof-deterministic-compact}
733:
734: In the rest of the paper
735: we will be using the notation $\Sigma$ for $(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}$.
736: By Tikhonov's theorem (\cite{engelking:1989}, Theorem 3.2.4)
737: this is a compact space;
738: it is also metrizable
739: (\cite{engelking:1989}, Theorem 4.2.2).
740: Another standard piece of notation throughout the rest of the paper
741: will be $\sigma_n:=(\ldots,x_{n-1},y_{n-1},x_n)\in\Sigma$.
742: Remember that $\lambda$, as a continuous function on a compact set,
743: is bounded below and above (\cite{engelking:1989}, Theorem 3.10.6).
744:
745: Let $\Gamma^{\Sigma}$ be the set of all continuous functions
746: from $\Sigma$ to $\Gamma$
747: with the \emph{topology of uniform convergence},
748: generated by the metric
749: \begin{equation*}
750: \hat\rho(D_1,D_2)
751: :=
752: \sup_{\sigma\in\Sigma}
753: \rho
754: \bigl(
755: D_1(\sigma),D_2(\sigma)
756: \bigr),
757: \end{equation*}
758: $\rho$ being the metric in $\Gamma$
759: (induced by the norm in the containing Banach space).
760: Since the topological space $\Gamma^{\Sigma}$ is separable
761: (\cite{engelking:1989}, Corollary 4.2.18
762: in combination with Theorem 4.2.8),
763: we can choose a dense sequence $D_1,D_2,\ldots$ in $\Gamma^{\Sigma}$.
764:
765: \begin{remark*}
766: The topology in $\Gamma^{\Sigma}$ is defined via a metric,
and this is one of the very few places in this paper where we need a specific metric
768: (for brevity we often talk about ``metric spaces'',
769: but this can always be replaced by ``metrizable topological spaces'').
770: Without using the metric,
771: we could say that the topology in $\Gamma^{\Sigma}$ is the compact-open topology
772: (\cite{engelking:1989}, Section 3.4).
773: Since $\Sigma$ is compact,
774: the compact-open topology on $\Gamma^{\Sigma}$
775: coincides with the topology of uniform convergence
776: (\cite{engelking:1989}, Theorem 4.2.17).
777: The separability of $\Gamma^{\Sigma}$ now follows
778: from \cite{engelking:1989}, Theorem 3.4.16 in combination with Theorem 4.2.8.
779: \end{remark*}
780:
781: The next step is to apply Kalnishkan and Vyugin's
782: \cite{kalnishkan/vyugin:2005}
783: Weak Aggregating Algorithm (WAA) to this sequence.
784: We cannot just refer to \cite{kalnishkan/vyugin:2005}
785: and will have to redo their derivation of the WAA's main property
786: since Kalnishkan and Vyugin only consider the case
787: of finitely many ``experts'' $D_k$
788: and finite $\mathbf{Y}$.
789: (Although in other respects
790: we will not need their algorithm in full generality
791: and so slightly simplify it.)
792:
793: Let $q_1,q_2,\ldots$ be a sequence of positive numbers summing to 1,
794: $\sum_{k=1}^{\infty}q_k=1$.
795: Define
796: \begin{equation*}
797: l_n^{(k)}
798: :=
799: \lambda
800: \left(
801: D_k(\sigma_n),y_n
802: \right),
803: \quad
804: L_N^{(k)}
805: :=
806: \sum_{n=1}^N
807: l_n^{(k)}
808: \end{equation*}
809: to be the instantaneous loss of the $k$th expert $D_k$ on the $n$th round
810: and his cumulative loss over the first $N$ rounds.
811: For all $n,k=1,2,\ldots$ define
812: \begin{equation*}
813: w_n^{(k)}
814: :=
815: q_k
816: \beta_n^{L_{n-1}^{(k)}},
817: \quad
818: \beta_n
819: :=
820: \exp
821: \left(
822: -\frac{1}{\sqrt{n}}
823: \right)
824: \end{equation*}
825: ($w_n^{(k)}$ are the weights of the experts to use on round $n$)
826: and
827: \begin{equation*}
828: p_n^{(k)}
829: :=
830: \frac
831: {w_n^{(k)}}
832: {\sum_{k=1}^{\infty}w_n^{(k)}}
833: \end{equation*}
834: (the normalized weights;
835: it is obvious that the denominator is positive and finite).
836: The WAA's prediction on round $n$ is
837: \begin{equation}\label{eq:WAA}
838: \gamma_n
839: :=
840: \sum_{k=1}^{\infty}
841: p_n^{(k)}
842: D_k(\sigma_n)
843: \end{equation}
844: (the series is convergent in the Banach space
845: since the compactness of $\Gamma$ implies
846: $\sup_{\gamma\in\Gamma}\left\|\gamma\right\|<\infty$,
847: and $\gamma_n\in\Gamma$ since
848: \begin{multline}\label{eq:convergence-to-0}
849: \gamma_n
850: -
851: \sum_{k=1}^K
852: \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}
853: D_k(\sigma_n)\\
854: =
855: \sum_{k=1}^K
856: \left(
857: 1
858: -
859: \frac{1}{\sum_{k=1}^K p_n^{(k)}}
860: \right)
861: p_n^{(k)}
862: D_k(\sigma_n)
863: +
864: \sum_{k=K+1}^{\infty}
865: p_n^{(k)}
866: D_k(\sigma_n)
867: \to
868: 0
869: \end{multline}
870: as $K\to\infty$).
871:
872: Let $l_n:=\lambda(\gamma_n,y_n)$ be the WAA's loss on round $n$
873: and
874: $
875: L_N
876: :=
877: \sum_{n=1}^N
878: l_n
879: $
880: be its cumulative loss over the first $N$ rounds.
881: \begin{lemma}[\cite{kalnishkan/vyugin:2005}, Lemma 9]\label{lem:9}
882: The WAA guarantees that, for all $N$,
883: \begin{equation}\label{eq:lemma9}
884: L_N
885: \le
886: \sum_{n=1}^N
887: \sum_{k=1}^{\infty}
888: p_n^{(k)}
889: l_n^{(k)}
890: -
891: \sum_{n=1}^N
892: \log_{\beta_n}
893: \sum_{k=1}^{\infty}
894: p_n^{(k)}
895: \beta_n^{l_n^{(k)}}
896: +
897: \log_{\beta_N}
898: \sum_{k=1}^{\infty}
899: q_k
900: \beta_N^{L_N^{(k)}}.
901: \end{equation}
902: \end{lemma}
903: The first two terms on the right-hand side of (\ref{eq:lemma9})
904: are sums over the first $N$ rounds of different kinds of mean of the experts' losses
905: (see, e.g., \cite{hardy/etal:1952}, Chapter III,
906: for a general definition of the mean);
907: we will see later that they nearly cancel each other out.
908: If those two terms are ignored,
909: the remaining part of (\ref{eq:lemma9}) is identical
910: (except that $\beta$ now depends on $n$)
911: to the main property of the ``Aggregating Algorithm''
912: (see, e.g., \cite{vovk:2001competitive}, Lemma 1).
913: All infinite series in (\ref{eq:lemma9}) are trivially convergent.
914: \begin{Proof}{of Lemma \ref{lem:9}}
915: The proof is by induction on $N$.
916: Assuming (\ref{eq:lemma9}),
917: we obtain
918: \begin{multline*}
919: L_{N+1}
920: =
921: L_N + l_{N+1}
922: \le
923: L_N
924: +
925: \sum_{k=1}^{\infty}
926: p_{N+1}^{(k)}
927: l_{N+1}^{(k)}\\
928: \le
929: \sum_{n=1}^{N+1}
930: \sum_{k=1}^{\infty}
931: p_n^{(k)}
932: l_n^{(k)}
933: -
934: \sum_{n=1}^N
935: \log_{\beta_n}
936: \sum_{k=1}^{\infty}
937: p_n^{(k)}
938: \beta_n^{l_n^{(k)}}
939: +
940: \log_{\beta_N}
941: \sum_{k=1}^{\infty}
942: q_k
943: \beta_N^{L_N^{(k)}}
944: \end{multline*}
945: (the first ``$\le$'' used the ``countable convexity''
946: $l_n\le\sum_{k=1}^{\infty}p_n^{(k)}l_n^{(k)}$,
947: which follows from (\ref{eq:convergence-to-0}) and
948: \begin{equation*}
949: \lambda
950: \left(
951: \sum_{k=1}^K
952: \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}
953: D_k(\sigma_n),
954: y_n
955: \right)
956: \le
957: \sum_{k=1}^K
958: \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}
959: \lambda
960: \left(
961: D_k(\sigma_n),
962: y_n
963: \right)
964: \end{equation*}
965: if we let $K\to\infty$).
966: Therefore,
967: it remains to prove
968: \begin{equation*}
969: \log_{\beta_N}
970: \sum_{k=1}^{\infty}
971: q_k
972: \beta_N^{L_N^{(k)}}
973: \le
974: -\log_{\beta_{N+1}}
975: \sum_{k=1}^{\infty}
976: p_{N+1}^{(k)}
977: \beta_{N+1}^{l_{N+1}^{(k)}}
978: +
979: \log_{\beta_{N+1}}
980: \sum_{k=1}^{\infty}
981: q_k
982: \beta_{N+1}^{L_{N+1}^{(k)}}.
983: \end{equation*}
984: By the definition of $p_n^{(k)}$
985: this can be rewritten as
986: \begin{equation*}
987: \log_{\beta_N}
988: \sum_{k=1}^{\infty}
989: q_k
990: \beta_N^{L_N^{(k)}}
991: \le
992: -\log_{\beta_{N+1}}
993: \frac
994: {
995: \sum_{k=1}^{\infty}
996: q_k
997: \beta_{N+1}^{L_{N}^{(k)}}
998: \beta_{N+1}^{l_{N+1}^{(k)}}
999: }
1000: {
1001: \sum_{k=1}^{\infty}
1002: q_k
1003: \beta_{N+1}^{L_{N}^{(k)}}
1004: }
1005: +
1006: \log_{\beta_{N+1}}
1007: \sum_{k=1}^{\infty}
1008: q_k
1009: \beta_{N+1}^{L_{N+1}^{(k)}},
1010: \end{equation*}
1011: which after cancellation becomes
1012: \begin{equation}\label{eq:to-check}
1013: \log_{\beta_N}
1014: \sum_{k=1}^{\infty}
1015: q_k
1016: \beta_N^{L_N^{(k)}}
1017: \le
1018: \log_{\beta_{N+1}}
1019: \sum_{k=1}^{\infty}
1020: q_k
1021: \beta_{N+1}^{L_{N}^{(k)}}.
1022: \end{equation}
1023: The last inequality follows from the general result
1024: about comparison of different means
1025: (\cite{hardy/etal:1952}, Theorem 85),
1026: but we can also check it directly
1027: (following \cite{kalnishkan/vyugin:2005}).
1028: Let $\beta_{N+1}=\beta_N^a$,
1029: where $0<a<1$.
1030: Then (\ref{eq:to-check}) can be rewritten as
1031: \begin{equation*}
1032: \left(
1033: \sum_{k=1}^{\infty}
1034: q_k
1035: \beta_N^{L_N^{(k)}}
1036: \right)^a
1037: \ge
1038: \sum_{k=1}^{\infty}
1039: q_k
1040: \beta_{N}^{aL_{N}^{(k)}},
1041: \end{equation*}
1042: and the last inequality follows from the concavity of the function $t\mapsto t^a$.
1043: \qedtext
1044: \end{Proof}
1045:
1046: \begin{lemma}[\cite{kalnishkan/vyugin:2005}, Lemma 5]
1047: Let $L$ be an upper bound on $\left|\lambda\right|$.
1048: The WAA guarantees that, for all $N$ and $K$,
1049: \begin{equation}\label{eq:lemma5}
1050: L_N
1051: \le
1052: L_N^{(K)}
1053: +
1054: \left(
1055: L^2 e^L + \ln\frac{1}{q_K}
1056: \right)
1057: \sqrt{N}.
1058: \end{equation}
1059: \end{lemma}
1060: (There is no term $e^L$ in \cite{kalnishkan/vyugin:2005}
1061: since it only considers non-negative loss functions.)
1062: \begin{proof}
1063: From (\ref{eq:lemma9}),
1064: we obtain:
1065: \begin{align*}
1066: L_N
1067: &\le
1068: \sum_{n=1}^N
1069: \sum_{k=1}^{\infty}
1070: p_n^{(k)}
1071: l_n^{(k)}
1072: +
1073: \sum_{n=1}^N
1074: \sqrt{n}
1075: \ln
1076: \sum_{k=1}^{\infty}
1077: p_n^{(k)}
1078: \exp
1079: \left(
1080: -\frac{l_n^{(k)}}{\sqrt{n}}
1081: \right)
1082: +
1083: \log_{\beta_N}
1084: q_K
1085: +
1086: L_N^{(K)}\\
1087: &\le
1088: \sum_{n=1}^N
1089: \sum_{k=1}^{\infty}
1090: p_n^{(k)}
1091: l_n^{(k)}
1092: +
1093: \sum_{n=1}^N
1094: \sqrt{n}
1095: \left(
1096: \sum_{k=1}^{\infty}
1097: p_n^{(k)}
1098: \left(
1099: 1
1100: -
1101: \frac{l_n^{(k)}}{\sqrt{n}}
1102: +
1103: \frac{\left(l_n^{(k)}\right)^2}{2n}
1104: e^L
1105: \right)
1106: -
1107: 1
1108: \right)\\
1109: &\quad{}+
1110: \log_{\beta_N}
1111: q_K
1112: +
1113: L_N^{(K)}\\
1114: &=
1115: L_N^{(K)}
1116: +
1117: \frac12
1118: \sum_{n=1}^N
1119: \frac{1}{\sqrt{n}}
1120: \sum_{k=1}^{\infty}
1121: p_n^{(k)}
1122: \left(l_n^{(k)}\right)^2
1123: e^L
1124: +
1125: \sqrt{N}\ln\frac{1}{q_K}\\
1126: &\le
1127: L_N^{(K)}
1128: +
1129: \frac{L^2e^L}{2}
1130: \sum_{n=1}^N
1131: \frac{1}{\sqrt{n}}
1132: +
1133: \sqrt{N}\ln\frac{1}{q_K}
1134: \le
1135: L_N^{(K)}
1136: +
1137: \frac{L^2e^L}{2}
1138: \int_0^N
1139: \frac{\D t}{\sqrt{t}}
1140: +
1141: \sqrt{N}\ln\frac{1}{q_K}\\
1142: &\le
1143: L_N^{(K)}
1144: +
1145: L^2e^L\sqrt{N}
1146: +
1147: \sqrt{N}\ln\frac{1}{q_K}
1148: \end{align*}
1149: (in the second ``$\le$'' we used the inequalities $e^t\le1+t+\frac{t^2}{2}e^{\left|t\right|}$
1150: and $\ln t\le t-1$).
1151: \qedtext
1152: \end{proof}
1153:
1154: Now it is easy to prove Theorem \ref{thm:deterministic-compact}.
1155: Let $\gamma_n$ be the predictions output by the WAA.
1156: Consider any continuous stationary prediction strategy $D$.
1157: Since every continuous function on a metric compact is uniformly continuous
1158: (\cite{engelking:1989}, Theorem 4.3.32),
1159: for any $\epsilon>0$ we can find $\delta>0$ such that
1160: $\left|\lambda(\gamma_1,y)-\lambda(\gamma_2,y)\right|<\epsilon$
1161: whenever $\rho(\gamma_1,\gamma_2)<\delta$.
1162: We can further find $K$ such that $\hat\rho(D_K,D)<\delta$,
1163: and (\ref{eq:lemma5}) then gives,
1164: for all biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$,
1165: \begin{multline*}
1166: \limsup_{N\to\infty}
1167: \Biggl(
1168: \frac1N
1169: \sum_{n=1}^N
1170: \lambda(\gamma_n,y_n)
1171: -
1172: \frac1N
1173: \sum_{n=1}^N
1174: \lambda(D(\sigma_n),y_n)
1175: \Biggr)\\
1176: \le
1177: \limsup_{N\to\infty}
1178: \Biggl(
1179: \frac1N
1180: \sum_{n=1}^N
1181: \lambda(\gamma_n,y_n)
1182: -
1183: \frac1N
1184: \sum_{n=1}^N
1185: \lambda(D_K(\sigma_n),y_n)
1186: \Biggr)
1187: +
1188: \epsilon\\
1189: \le
1190: \limsup_{N\to\infty}
1191: \left(
1192: L^2e^L + \ln\frac{1}{q_K}
1193: \right)
1194: \frac{1}{\sqrt{N}}
1195: +
1196: \epsilon
1197: =
1198: \epsilon;
1199: \end{multline*}
1200: since $\epsilon$ can be arbitrarily small
1201: the WAA is CS universal.
1202:
1203: \section{Proof of Theorem \ref{thm:randomized-compact}}
1204: \label{sec:proof-randomized-compact}
1205:
1206: Let us first recall some useful facts about the probability measures
1207: on a metric compact $\Omega$
1208: (we will be following \cite{\GTPXVII}).
1209: The Banach space of all continuous real-valued functions on $\Omega$
1210: with the usual pointwise addition and scalar action
1211: and the sup norm will be denoted $C(\Omega)$.
1212: By one of the Riesz representation theorems
1213: (\cite{dudley:2002}, 7.4.1; see also 7.1.1),
1214: the mapping $\mu\mapsto I_{\mu}$,
1215: where
1216: $
1217: I_{\mu}(f):=\int_{\Omega}f\D\mu
1218: $,
1219: is a linear isometry
1220: between the set of all finite Borel signed measures $\mu$ on $\Omega$
1221: with the total variation norm
1222: and the dual space $C'(\Omega)$ to $C(\Omega)$
1223: with the standard dual norm
1224: (\cite{rudin:1991}, Chapter 4).
1225: We will identify the finite Borel signed measures $\mu$ on $\Omega$
1226: with the corresponding $I_{\mu}\in C'(\Omega)$.
1227: This makes the set $\PPP(\Omega)$ of probability measures on $\Omega$
1228: a convex closed subset of $C'(\Omega)$.
1229:
1230: We will be interested, however,
1231: in a different topology on $C'(\Omega)$,
1232: the weakest topology for which all evaluation functionals
1233: $\mu\in C'(\Omega)\mapsto\mu(f)$, $f\in C(\Omega)$,
1234: are continuous.
1235: This topology is known as the \emph{weak${}^*$ topology}
1236: (\cite{rudin:1991}, 3.14),
1237: and the topology inherited by $\PPP(\Omega)$
1238: is known as the \emph{topology of weak convergence}
1239: (\cite{billingsley:1968}, Appendix III).
1240: The point mass $\delta_{\omega}$, $\omega\in\Omega$,
1241: is defined to be the probability measure concentrated at $\omega$,
1242: $\delta_{\omega}(\{\omega\})=1$.
1243: The simple example of a sequence of point masses $\delta_{\omega_n}$
1244: such that $\omega_n\to\omega$ as $n\to\infty$ and $\omega_n\ne\omega$ for all $n$
1245: shows that the topology of weak convergence is different from the dual norm topology:
1246: $\delta_{\omega_n}\to\delta_{\omega}$ holds in one but does not hold in the other.
1247:
1248: It is not difficult to check that $\PPP(\Omega)$ remains a closed subset of $C'(\Omega)$
1249: in the weak${}^*$ topology
1250: (\cite{bourbaki:integration}, III.2.7, Proposition 7).
1251: By the Banach--Alaoglu theorem
1252: (\cite{rudin:1991}, 3.15)
1253: $\PPP(\Omega)$ is compact in the topology of weak convergence
1254: (this is a special case of Prokhorov's theorem,
1255: \cite{billingsley:1968}, Appendix III, Theorem 6).
1256: In the rest of this paper,
1257: $\PPP(\Omega)$
1258: (and all other spaces of probability measures)
1259: are always equipped with the topology of weak convergence.
1260:
1261: Since $\Omega$ is a metric compact,
1262: $\PPP(\Omega)$ is also metrizable
1263: (by the well-known Prokhorov metric:
1264: \cite{billingsley:1968}, Appendix III, Theorem 6).
1265:
1266: Define
1267: \begin{equation}\label{eq:expected-loss}
1268: \lambda(\gamma,y)
1269: :=
1270: \int_{\Gamma}
1271: \lambda(g,y)
1272: \gamma(\dd g),
1273: \end{equation}
1274: where $\gamma$ is a probability measure on $\Gamma$.
1275: This is the loss function in a new game of prediction
1276: with the prediction space $\PPP(\Gamma)$;
1277: it is convex in $\gamma$.
1278:
1279: Let us check that the loss function (\ref{eq:expected-loss}) is continuous.
1280: If $\gamma_n\to\gamma$ and $y_n\to y$
1281: for some $(\gamma,y)\in\PPP(\Gamma)\times\mathbf{Y}$,
1282: \begin{equation*}
1283: \left|
1284: \lambda(\gamma_n,y_n)
1285: -
1286: \lambda(\gamma,y)
1287: \right|
1288: \le
1289: \left|
1290: \lambda(\gamma_n,y_n)
1291: -
1292: \lambda(\gamma_n,y)
1293: \right|
1294: +
1295: \left|
1296: \lambda(\gamma_n,y)
1297: -
1298: \lambda(\gamma,y)
1299: \right|
1300: \to
1301: 0
1302: \end{equation*}
1303: (the first addend tends to zero because of the uniform continuity
1304: of $\lambda:\Gamma\times\mathbf{Y}\to\bbbr$
1305: and the second addend by the definition of the topology of weak convergence).
1306:
1307: Unfortunately,
1308: Theorem \ref{thm:deterministic-compact} cannot be applied
1309: to the new game of prediction directly:
1310: the theorem assumes that $\Gamma$ is a subset of a Banach space,
1311: whereas the dual to an infinite-dimensional Banach space is never even metrizable
1312: in the weak$^*$ topology
1313: (\cite{rudin:1991}, 3.16).
1314: The proof of Theorem \ref{thm:deterministic-compact}, however,
1315: still works for the new game.
1316:
1317: It is clear that the mixture (\ref{eq:WAA}) is a probability measure.
1318: The result of the previous section is still true,
1319: and the randomized prediction strategy (\ref{eq:WAA})
1320: produces $\gamma_n\in\PPP(\Gamma)$ that are guaranteed to satisfy
1321: \begin{equation}\label{eq:mean}
1322: \limsup_{N\to\infty}
1323: \left(
1324: \frac1N
1325: \sum_{n=1}^N
1326: \lambda(\gamma_n,y_n)
1327: -
1328: \frac1N
1329: \sum_{n=1}^N
1330: \lambda(D(\sigma_n),y_n)
1331: \right)
1332: \le
1333: 0,
1334: \end{equation}
1335: for any continuous stationary randomized prediction strategy $D$.
1336: The loss function is bounded in absolute value
1337: by a constant $L$,
1338: and so the law of the iterated logarithm
1339: (see, e.g., \cite{shafer/vovk:2001}, (5.8))
1340: implies that
1341: \begin{align}
1342: \limsup_{N\to\infty}
1343: \frac
1344: {
1345: \left|
1346: \sum_{n=1}^N
1347: \bigl(
1348: \lambda(g_n,y_n)
1349: -
1350: \lambda(\gamma_n,y_n)
1351: \bigr)
1352: \right|
1353: }
1354: {
1355: \sqrt{2L^2N\ln\ln N}
1356: }
1357: &\le
1358: 1,\label{eq:LIL-1}\\
1359: \limsup_{N\to\infty}
1360: \frac
1361: {
1362: \left|
1363: \sum_{n=1}^N
1364: \bigl(
1365: \lambda(d_n,y_n)
1366: -
1367: \lambda(D(\sigma_n),y_n)
1368: \bigr)
1369: \right|
1370: }
1371: {
1372: \sqrt{2L^2N\ln\ln N}
1373: }
1374: &\le
1375: 1\label{eq:LIL-2}
1376: \end{align}
1377: with probability one.
1378: Combining the last two inequalities with (\ref{eq:mean}) gives
1379: \begin{equation*}
1380: \limsup_{N\to\infty}
1381: \left(
1382: \frac1N
1383: \sum_{n=1}^N
1384: \lambda(g_n,y_n)
1385: -
1386: \frac1N
1387: \sum_{n=1}^N
1388: \lambda(d_n,y_n)
1389: \right)
1390: \le
1391: 0
1392: \enspace
1393: \textrm{a.s.}
1394: \end{equation*}
1395: Therefore, the WAA (applied to $D_1,D_2,\ldots$)
1396: is a universal continuous randomized prediction strategy.
1397:
1398: \section{Proof of Theorem \ref{thm:deterministic}}
1399: \label{sec:proof-deterministic}
1400:
1401: In view of Theorem \ref{thm:deterministic-compact},
1402: we only need to get rid of the assumption of compactness
1403: of $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$.
1404:
1405: \subsection*{Game of removal}
1406:
1407: The proofs of Theorems \ref{thm:deterministic} and \ref{thm:randomized}
1408: will be based on the following game
1409: (an abstract version of the ``doubling trick'',
1410: \cite{cesabianchi/lugosi:2006})
1411: played in a topological space $X$:
1412:
1413: \bigskip
1414:
1415: \noindent
1416: \textsc{Game of removal $G(X)$}\nopagebreak
1417: \begin{tabbing}
1418: \qquad\=\qquad\=\qquad\kill
1419: FOR $n=1,2,\dots$:\\
1420: \> Remover announces compact $K_n\subseteq X$.\\
1421: \> Evader announces $p_n\notin K_n$.\\
1422: END FOR.
1423: \end{tabbing}
1424: \textbf{Winner:}
1425: Evader if the set $\left\{p_1,p_2,\ldots\right\}$ is precompact;
1426: Remover otherwise.
1427:
1428: \bigskip
1429:
1430: \noindent
1431: Intuitively,
the goal of Evader is to avoid being removed to infinity.
1433: Without loss of generality
1434: we will assume that Remover always announces a non-decreasing sequence of compact sets:
1435: $K_1\subseteq K_2\subseteq\cdots$.
1436: \begin{lemma}[Gruenhage]\label{lem:Gruenhage}
1437: Remover has a winning strategy in $G(X)$
1438: if $X$ is a locally compact and paracompact space.
1439: \end{lemma}
1440: \begin{proof}
1441: We will follow the proof of Theorem 4.1 in \cite{gruenhage:2006}
1442: (the easy direction).
1443: If $X$ is locally compact and $\sigma$-compact,
1444: there exists a non-decreasing sequence $K_1\subseteq K_2\subseteq\cdots$
1445: of compact sets covering $X$,
1446: and each $K_n$ can be extended to compact $K^*_n$
1447: so that $\Int K^*_n\supseteq K_n$
1448: (\cite{engelking:1989}, Theorem 3.3.2).
1449: Remover will obviously win $G(X)$ choosing $K^*_1,K^*_2,\ldots$ as his moves.
1450:
1451: If $X$ is the sum of locally compact $\sigma$-compact spaces $X_s$, $s\in S$,
1452: Remover plays, for each $s\in S$, the strategy described in the previous paragraph
1453: on the subsequence of Evader's moves belonging to $X_s$.
1454: If Evader chooses $p_n\in X_s$ for infinitely many $X_s$,
1455: those $X_s$ will form an open cover of the closure of $\{p_1,p_2,\ldots\}$
1456: without a finite subcover.
If $p_n$ are chosen from only finitely many $X_s$,
there will be infinitely many $p_n$ chosen from some $X_s$,
1459: and the result of the previous paragraph can be applied.
It remains to remember that each locally compact paracompact space
1461: can be represented as the sum of locally compact $\sigma$-compact subsets
1462: (\cite{engelking:1989}, Theorem 5.1.27).
1463: \qedtext
1464: \end{proof}
1465:
1466: \subsection*{Large at infinity loss functions}
1467:
1468: We will need the following useful property of large at infinity loss functions.
1469: \begin{lemma}\label{lem:loss}
1470: Let $\lambda$ be a loss function that is large at infinity.
1471: For each compact set $B\subseteq\mathbf{Y}$ and each constant $M$
1472: there exists a compact set $C\subseteq\Gamma$ such that
1473: \begin{equation}\label{eq:loss}
1474: \forall\gamma\notin C,y\in B:
1475: \quad
1476: \lambda(\gamma,y)
1477: >
1478: M.
1479: \end{equation}
1480: \end{lemma}
1481: \begin{proof}
1482: For each point $y^*\in B$
1483: fix a neighborhood $O_{y^*}\ni y^*$
1484: and a compact set $C(y^*)\subseteq\Gamma$ such that
1485: $\lambda\left(\Gamma\setminus C(y^*),O_{y^*}\right)\subseteq(M,\infty)$.
1486: Since the sets $O_{y^*}$ form an open cover of $B$,
1487: we can find this cover's finite subcover
1488: $\{O_{y^*_1},\ldots,O_{y^*_n}\}$.
1489: It is clear that
1490: \begin{equation*}
1491: C
1492: :=
1493: \bigcup_{j=1,\ldots,n}
1494: C
1495: \left(
y^*_j
1497: \right)
1498: \end{equation*}
1499: satisfies (\ref{eq:loss}).
1500: \qedtext
1501: \end{proof}
1502: In fact,
1503: the only property of large at infinity loss functions that we will be using
1504: is that in the conclusion of Lemma \ref{lem:loss}.
1505: In particular, it implies the following lemma.
1506: \begin{lemma}\label{lem:C-det}
1507: Under the conditions of Theorem \ref{thm:deterministic},
1508: for each compact set $B\subseteq\mathbf{Y}$
1509: there exists a compact convex set $C=C(B)\subseteq\Gamma$
1510: such that for each continuous stationary prediction strategy
1511: $D:\Sigma\to\Gamma$
1512: there exists a continuous stationary prediction strategy
1513: $D':\Sigma\to C$
1514: that dominates $D$ in the sense
1515: \begin{equation}\label{eq:prediction-type}
1516: \forall\sigma\in\Sigma,y\in B:
1517: \quad
1518: \lambda(D'(\sigma),y)
1519: \le
1520: \lambda(D(\sigma),y).
1521: \end{equation}
1522: \end{lemma}
1523: \ifFULL\bluebegin
1524: In fact,
1525: we only need Lemmas \ref{lem:C-det} and \ref{lem:C-rand}
1526: for $D':A\to C$.
1527: \blueend\fi
1528: \begin{proof}
1529: Without loss of generality $B$ is assumed non-empty.
1530: Fix any $\gamma_0\in\Gamma$.
1531: Let
1532: \begin{equation*}
1533: M_1
1534: :=
1535: \sup_{y\in B}
1536: \lambda(\gamma_0,y),
1537: \end{equation*}
1538: let $C_1\subseteq\Gamma$ be a compact set such that
1539: \begin{equation*}
1540: \forall \gamma\notin C_1,y\in B:
1541: \quad
1542: \lambda(\gamma,y)
1543: >
1544: M_1+1,
1545: \end{equation*}
1546: let
1547: \begin{equation*}
1548: M_2
1549: :=
1550: \sup_{(\gamma,y)\in C_1\times B}
1551: \lambda(\gamma,y),
1552: \end{equation*}
1553: and let $C_2\subseteq\Gamma$ be a compact set such that
1554: \begin{equation*}
1555: \forall\gamma\notin C_2,y\in B:
1556: \quad
1557: \lambda(\gamma,y)
1558: >
1559: M_2+1.
1560: \end{equation*}
1561: It is obvious that $M_1\le M_2$ and $\gamma_0\in C_1\subseteq C_2$.
1562: We can and will assume $C_2$ convex
1563: (see \cite{rudin:1991}, Theorem 3.20(c)).
1564:
1565: Let us now check that $C_1$ lies inside the interior of $C_2$.
1566: Indeed, for any fixed $y\in B$ and $\gamma\in C_1$,
1567: we have $\lambda(\gamma,y)\le M_2$;
1568: since $\lambda(\gamma',y)>M_2+1$ for all $\gamma'\notin C_2$,
1569: some neighborhood of $\gamma$ will lie completely in $C_2$.
1570:
1571: Let $D:\Sigma\to\Gamma$
1572: be a continuous stationary prediction strategy.
1573: We will show that (\ref{eq:prediction-type}) holds
1574: for some continuous stationary prediction strategy $D'$
1575: taking values in the compact convex set $C(B):=C_2$.
1576: Namely,
1577: we define
1578: \begin{multline*}
1579: D'(\sigma)
1580: :=\\
1581: \begin{cases}
1582: D(\sigma) & \text{if $D(\sigma)\in C_1$}\\
1583: \frac{\rho(D(\sigma),\Gamma\setminus C_2)}{\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)} D(\sigma)
1584: +\frac{\rho(D(\sigma),C_1)}{\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)} \gamma_0
1585: & \text{if $D(\sigma)\in C_2\setminus C_1$}\\
1586: \gamma_0 & \text{if $D(\sigma)\in \Gamma\setminus C_2$}
1587: \end{cases}
1588: \end{multline*}
1589: where $\rho$ is the metric on $\Gamma$;
1590: the denominator $\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)$
1591: is positive since already $\rho(D(\sigma),C_1)$ is positive.
1592: Since $C_2$ is convex,
1593: we can see that $D'$ indeed takes values in $C_2$.
The only points $\sigma$ at which the continuity of $D'$ is not obvious
1595: are those for which $D(\sigma)$ lies on the boundary of $C_1$:
1596: in this case
1597: one has to use the fact that $C_1$ is covered by the interior of $C_2$.
1598:
1599: It remains to check (\ref{eq:prediction-type});
1600: the only non-trivial case is $D(\sigma)\in C_2\setminus C_1$.
1601: By the convexity of $\lambda(\gamma,y)$ in $\gamma$,
1602: the inequality in (\ref{eq:prediction-type}) will follow from
1603: \begin{multline*}
1604: \frac{\rho(D(\sigma),\Gamma\setminus C_2)}{\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)}
1605: \lambda(D(\sigma),y)\\
1606: +\frac{\rho(D(\sigma),C_1)}{\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)}
1607: \lambda(\gamma_0,y)
1608: \le
1609: \lambda(D(\sigma),y),
1610: \end{multline*}
1611: i.e.,
1612: \begin{equation*}
1613: \lambda(\gamma_0,y)
1614: \le
1615: \lambda(D(\sigma),y).
1616: \end{equation*}
1617: Since the left-hand side of the last inequality is at most $M_1$
1618: and its right-hand side exceeds $M_1+1$,
1619: it holds true.
1620: \qedtext
1621: \end{proof}
1622: \begin{remark*}
1623: If the loss function is allowed to depend on the infinite past,
1624: the $\sigma$s in Lemma \ref{lem:C-det} will have to be restricted
1625: to a compact set $A\subseteq\Sigma$
1626: and the compact set $C$ will depend not only on $B$ but also on $A$
1627: (see Lemma 18 of \cite{\GTPXVII}).
1628: \end{remark*}
1629:
1630: \subsection*{The proof}
1631:
1632: For each compact $B\subseteq\mathbf{Y}$
1633: fix a compact convex $C(B)\subseteq\Gamma$ as in Lemma \ref{lem:C-det}.
1634: Predictor's strategy ensuring (\ref{eq:dominates-deterministic})
1635: is constructed from Remover's winning strategy in $G(\mathbf{X}\times\mathbf{Y})$
1636: (see Lemma \ref{lem:Gruenhage};
1637: metric spaces are paracompact by the Stone theorem,
1638: \cite{engelking:1989}, Theorem 5.1.3)
1639: and from Predictor's strategies $\SSS(A,B)$ outputting predictions
1640: \begin{equation}\label{eq:gamma}
1641: \gamma_n\in C(B)
1642: \end{equation}
1643: and ensuring the consequent of (\ref{eq:dominates-deterministic})
1644: for all continuous
1645: \begin{equation}\label{eq:DABC}
1646: D:(A\times B)^{\infty}\times A\to C(B)
1647: \end{equation}
1648: under the assumption that $(x_n,y_n)\in A\times B$
1649: for given compact $A\subseteq\mathbf{X}$ and $B\subseteq\mathbf{Y}$
1650: (the existence of such $\SSS(A,B)$
1651: is asserted in Theorem \ref{thm:deterministic-compact}).
1652: Remover's moves are assumed to be of the form $A\times B$
1653: for compact $A\subseteq\mathbf{X}$ and $B\subseteq\mathbf{Y}$.
1654: Predictor is simultaneously playing the game of removal
1655: $G(\mathbf{X}\times\mathbf{Y})$ as Evader.
1656:
1657: At the beginning of the game of prediction
1658: Predictor asks Remover to make his first move $A_1\times B_1$ in the game of removal;
1659: without loss of generality
1660: we assume that $A_1\times B_1$ contains all $(x_n,y_n)$, $n\le0$
1661: (there is nothing to prove if $\{(x_n,y_n)\st n\le0\}$ is not precompact).
1662: Predictor then plays the game of prediction using the strategy $\SSS(A_1,B_1)$
1663: until Reality chooses $(x_n,y_n)\notin A_1\times B_1$
1664: (forever if Reality never chooses such $(x_n,y_n)$).
1665: As soon as such $(x_n,y_n)$ is chosen,
1666: Predictor announces $(x_n,y_n)$ in the game of removal
1667: and notes Remover's response $(A_2,B_2)$.
1668: He then continues playing the game of prediction using the strategy $\SSS(A_2,B_2)$
1669: until Reality chooses $(x_n,y_n)\notin A_2\times B_2$,
1670: etc.
1671:
1672: Let us check that this strategy for Predictor
1673: will always ensure (\ref{eq:dominates-deterministic}).
1674: If Reality chooses $(x_n,y_n)$ outside Predictor's current $A_k\times B_k$
1675: finitely often,
1676: the consequent of (\ref{eq:dominates-deterministic}) will be satisfied
1677: for all continuous stationary $D:\Sigma\to C(B_K)$
1678: ($B_K$ being the second component of Remover's last move $(A_K,B_K)$)
1679: and so, by Lemma \ref{lem:C-det},
1680: for all continuous stationary $D:\Sigma\to\Gamma$.
1681: If Reality chooses $(x_n,y_n)$ outside Predictor's current $A_k\times B_k$
1682: infinitely often,
1683: the set of $(x_n,y_n)$, $n=1,2,\ldots$, will not be precompact,
1684: and so the antecedent of (\ref{eq:dominates-deterministic}) will be violated.
1685:
1686: \section{Proof of Theorem \ref{thm:randomized}}
1687: \label{sec:proof-randomized}
1688:
1689: When $\gamma$ ranges over $\PPP(C)$
1690: (identified with the subset of $\PPP(\Gamma)$
1691: consisting of the measures concentrated on $C$)
1692: for a compact $C\subseteq\Gamma$,
1693: the loss function (\ref{eq:expected-loss}),
1694: as we have seen, is continuous.
1695: The following analogue of Lemma \ref{lem:C-det} will be useful.
1696: \begin{lemma}\label{lem:C-rand}
1697: Under the conditions of Theorem \ref{thm:randomized},
1698: for each compact set $B\subseteq\mathbf{Y}$
1699: there exists a compact convex set $C=C(B)\subseteq\Gamma$
1700: such that for each continuous stationary randomized prediction strategy
1701: $D:\Sigma\to\PPP(\Gamma)$
1702: there exists a continuous stationary randomized prediction strategy
1703: $D':\Sigma\to\PPP(C)$
1704: such that (\ref{eq:prediction-type}) holds
1705: ($D'$ dominates $D$ ``on average'').
1706: \end{lemma}
1707: (In fact, this lemma is not needed
1708: for the proof of Theorem \ref{thm:randomized} as we stated it,
1709: but it will imply that $\gamma_n$ dominate $D(\sigma_n)$ on average,
1710: for any continuous stationary randomized prediction strategy $D$:
1711: see (\ref{eq:stage-K}).)
1712: \begin{proof}
1713: Define $\gamma_0$, $M_1$, $C_1$, $M_2$, and $C_2$
1714: as in the proof of Lemma \ref{lem:C-det}.
1715: Fix a continuous function $f_1:\Gamma\to[0,1]$ such that $f_1=1$ on $C_1$
1716: and $f_1=0$ on $\Gamma\setminus C_2$
1717: (such an $f_1$ exists by the Tietze--Uryson theorem,
1718: \cite{engelking:1989}, Theorem 2.1.8).
1719: Set $f_2:=1-f_1$.
1720: Let $D:\Sigma\to\PPP(\Gamma)$ be a continuous stationary randomized prediction strategy.
1721: For each $\sigma\in\Sigma$,
1722: split $D(\sigma)$ into two measures on $\Gamma$
1723: absolutely continuous with respect to $D(\sigma)$:
1724: $D_1(\sigma)$ with Radon--Nikodym density $f_1$
1725: and $D_2(\sigma)$ with Radon--Nikodym density $f_2$;
1726: set
1727: \begin{equation*}
1728: D'(\sigma)
1729: :=
1730: D_1(\sigma)
1731: +
1732: \left|D_2(\sigma)\right|
1733: \delta_{\gamma_0}
1734: \end{equation*}
1735: (letting $\left|P\right|:=P(\Gamma)$ for $P$ a measure on $\Gamma$).
1736: It is clear that the stationary randomized prediction strategy $D'$ is continuous
1737: (in the topology of weak convergence, as usual),
1738: takes values in $\PPP(C_2)$,
1739: and
1740: \begin{multline*}
1741: \lambda(D'(\sigma),y)
1742: =
1743: \int_{\Gamma}
1744: \lambda(\gamma,y)
1745: f_1(\gamma)
1746: D(\sigma)(\dd\gamma)
1747: +
1748: \lambda(\gamma_0,y)
1749: \int_{\Gamma}
1750: f_2(\gamma)
1751: D(\sigma)(\dd\gamma)\\
1752: \le
1753: \int_{\Gamma}
1754: \lambda(\gamma,y)
1755: f_1(\gamma)
1756: D(\sigma)(\dd\gamma)
1757: +
1758: \int_{\Gamma}
1759: M_1
1760: f_2(\gamma)
1761: D(\sigma)(\dd\gamma)\\
1762: \le
1763: \int_{\Gamma}
1764: \lambda(\gamma,y)
1765: f_1(\gamma)
1766: D(\sigma)(\dd\gamma)
1767: +
1768: \int_{\Gamma}
1769: \lambda(\gamma,y)
1770: f_2(\gamma)
1771: D(\sigma)(\dd\gamma)
1772: =
1773: \lambda(D(\sigma),y)
1774: \end{multline*}
1775: for all $(\sigma,y)\in\Sigma\times B$.
1776: So we can take $C(B):=C_2$.
1777: \qedtext
1778: \end{proof}
1779: Fix one of the mappings $B\mapsto C(B)$
1780: whose existence is asserted by the lemma.
1781:
1782: We will prove that the prediction strategy of the previous section
1783: with (\ref{eq:gamma}) replaced by
1784: $
1785: \gamma_n\in\PPP(C(B))
1786: $
1787: and (\ref{eq:DABC}) replaced by
1788: \begin{equation*}
1789: D:(A\times B)^{\infty}\times A\to\PPP(C(B))
1790: \end{equation*}
1791: is CS universal.
1792: Let $D:\Sigma\to\PPP(\Gamma)$ be a continuous stationary randomized prediction strategy,
1793: i.e., a continuous stationary prediction strategy
1794: in the new game of prediction with loss function (\ref{eq:expected-loss}).
1795: Let $(A_K,B_K)$ be Remover's last move
1796: (if Remover makes infinitely many moves,
1797: the antecedent of (\ref{eq:dominates-randomized}) is false,
1798: and there is nothing to prove),
1799: and let $D':\Sigma\to\PPP(C(B_K))$ be a continuous stationary randomized prediction strategy
1800: satisfying (\ref{eq:prediction-type}) with $B:=B_K$.
1801: From some $n$ on
1802: our randomized prediction algorithm produces $\gamma_n\in\PPP(\Gamma)$
1803: concentrated on $C(B_K)$,
1804: and they will satisfy
1805: \begin{multline}\label{eq:stage-K}
1806: \limsup_{N\to\infty}
1807: \left(
1808: \frac1N
1809: \sum_{n=1}^N
1810: \lambda(\gamma_n,y_n)
1811: -
1812: \frac1N
1813: \sum_{n=1}^N
1814: \lambda(D(\sigma_n),y_n)
1815: \right)\\
1816: \le
1817: \limsup_{N\to\infty}
1818: \left(
1819: \frac1N
1820: \sum_{n=1}^N
1821: \lambda(\gamma_n,y_n)
1822: -
1823: \frac1N
1824: \sum_{n=1}^N
1825: \lambda(D'(\sigma_n),y_n)
1826: \right)
1827: \le
1828: 0.
1829: \end{multline}
1830: This is an interesting property
1831: but slightly different from what Theorem \ref{thm:randomized} asserts.
1832:
1833: According to the proof of Lemma \ref{lem:C-rand},
1834: we can, and we will, assume that $D'(\sigma_n)$
1835: generates outcomes $d'_n$ in two steps:
1836: first $d_n$ is generated from $D(\sigma_n)$,
1837: and then it is replaced by $\gamma_0$ with probability $f_2(d_n)$.
1838: The loss function is bounded in absolute value
1839: on the compact set
1840: $C(B_K)\times B_K$ by a constant $L$.
1841: From the law of the iterated logarithm
1842: (see (\ref{eq:LIL-1}) and (\ref{eq:LIL-2}))
1843: applied to the losses of $\gamma_n$ and $d'_n$
1844: we now obtain,
1845: instead of (\ref{eq:stage-K}),
1846: \begin{multline*}
1847: \limsup_{N\to\infty}
1848: \left(
1849: \frac1N
1850: \sum_{n=1}^N
1851: \lambda(g_n,y_n)
1852: -
1853: \frac1N
1854: \sum_{n=1}^N
1855: \lambda(d_n,y_n)
1856: \right)\\
1857: \le
1858: \limsup_{N\to\infty}
1859: \left(
1860: \frac1N
1861: \sum_{n=1}^N
1862: \lambda(g_n,y_n)
1863: -
1864: \frac1N
1865: \sum_{n=1}^N
1866: \lambda(d'_n,y_n)
1867: \right)\\
1868: =
1869: \limsup_{N\to\infty}
1870: \left(
1871: \frac1N
1872: \sum_{n=1}^N
1873: \lambda(\gamma_n,y_n)
1874: -
1875: \frac1N
1876: \sum_{n=1}^N
1877: \lambda(D'(\sigma_n),y_n)
1878: \right)
1879: \le
1880: 0
1881: \enspace
1882: \textrm{a.s.};
1883: \end{multline*}
1884: it remains to compare this with (\ref{eq:dominates-randomized}).
1885:
1886: \section{Stationarity and continuity}
1887: \label{sec:stationarity}
1888:
1889: As we said earlier,
1890: the assumption of stationarity is very natural
1891: for prediction strategies:
1892: it just means that the arbitrary origin of time is not taken into account
1893: (in the spirit of the invariance principle in statistics;
1894: see, e.g., \cite{lehmann:1986}, Section 6.1).
1895: Stationary strategies can detect and make use of all kinds of trends
1896: and one-off phenomena;
1897: e.g.,
1898: they can perform well when the rate of environment change is constantly increasing
1899: (as in our own environment).
1900: There need not be stationarity in the environment.
1901:
1902: Interestingly,
1903: our prediction algorithms are continuous (or can be made continuous)
1904: but not stationary.
1905: First we discuss the continuity
1906: of the prediction algorithms
1907: constructed in the proofs of our four theorems.
1908: \begin{description}
1909: \item[Theorem \ref{thm:deterministic-compact}]
1910: It is easy to check that the WAA is continuous;
1911: by the Weierstrass $M$-test,
1912: (\ref{eq:WAA}) converges uniformly
1913: and so its sum is continuous.
1914: \item[Theorem \ref{thm:randomized-compact}]
1915: To check that $\gamma_n$ is a continuous function of
1916: $\sigma_n$ in the topology of weak convergence,
1917: we only need to check that $\int f\D\gamma_n$ is a continuous function of $\sigma_n$
1918: for each $f\in C(\Gamma)$.
1919: This again follows from the Weierstrass $M$-test.
1920: \item[Theorem \ref{thm:deterministic}]
1921: As described,
1922: Predictor's strategy is not continuous
1923: since his behavior changes suddenly when Reality outputs $(x_n,y_n)$
1924: outside his current $A_k\times B_k$,
1925: but it is clear that it can be ``smoothed around the edges''
1926: to ensure continuity.
1927: \item[Theorem \ref{thm:randomized}]
1928: The situation is analogous to Theorem \ref{thm:deterministic}.
1929: \end{description}
1930:
1931: For concreteness,
1932: we will discuss stationarity only in the case of Theorem \ref{thm:deterministic-compact}.
1933: We know that the WAA is a prediction strategy that is continuous
1934: as a function of the type $\Sigma\times\{1,2,\ldots\}\to\Gamma$.
1935: It is not stationary
1936: (i.e., we cannot get rid of the $\{1,2,\ldots\}$)
1937: because it has to keep track of the experts' losses
1938: since the beginning of the game of prediction.
1939: Stationary strategies can depend on time only in a limited way:
1940: e.g., in terms of our own environment,
1941: they can depend on the time of day or the season.
1942: But the WAA's dependence is much heavier:
1943: it has to know precisely the time that has elapsed since the beginning.
1944:
1945: Let us now check that
1946: there are no universal continuous stationary prediction strategies
1947: under the conditions of Theorem \ref{thm:deterministic-compact}.
1948: Suppose $\Gamma$ is such that there exists a continuous $f:\Gamma\to\Gamma$
1949: without fixed points
1950: (i.e., $f(\gamma)\ne\gamma$ for all $\gamma\in\Gamma$;
1951: we can take, e.g., a circle as $\Gamma$).
1952: If $D$ were a universal continuous stationary strategy,
1953: we could define another continuous stationary strategy $D'(\sigma):=f(D(\sigma))$
1954: and make Reality collude with $D'$
1955: (i.e., output $y_n$ leading to a significantly smaller loss for $D'$;
1956: this can be done for an appropriate choice of $\lambda$,
1957: and in fact can be done for all usual $\lambda$), contradicting the universality of $D$.
1958:
1959: \iffalse
1960: In conclusion let us check that,
1961: for a wide class of loss function $\lambda$
1962: there are no universal continuous stationary prediction strategies.
1963: Indeed,
1964: suppose that for some $\gamma_1,\gamma_2\in\Gamma$ and $y_1,y_2\in\mathbf{Y}$,
1965: \begin{align*}
1966: \lambda(\gamma_1,y_1)
1967: &<
1968: \lambda(\gamma_2,y_1)\\
1969: \lambda(\gamma_2,y_2)
1970: &<
1971: \lambda(\gamma_1,y_2)\\
1972: \inf_{\gamma\in\Gamma}
1973: \max_{i=1,2}
1974: \left(
1975: \lambda(\gamma,y_i)
1976: -
1977: \lambda(\gamma_i,y_i)
1978: \right)
1979: &>
1980: 0
1981: \end{align*}
1982: (the first condition means that $\gamma_1$ is the ``right'' prediction for $y_1$,
1983: the second condition that $\gamma_2$ is the ``right'' prediction for $y_2$,
1984: and the third condition is that no $\gamma\in\Gamma$
1985: can simultaneously compete with $\gamma_1$ on $y_1$ and with $\gamma_2$ on $y_2$);
1986: this is a mild condition satisfied for the standard loss functions.
1987: Did not work.
1988: \fi
1989:
1990: \subsection*{Stationary Reality}
1991:
1992: A standard problem in probability theory is where Reality
1993: is governed by a stationary probability measure;
1994: of course, only stationary prediction strategies are considered.
1995: In this subsection we will list several references
1996: for this problem,
1997: considering, for simplicity, only the case where the signals $x_n$ are absent
1998: (formally, we assume that $\mathbf{X}$ is a one-element set
1999: and omit the $x_n$, which now do not carry any information, from our notation).
2000:
2001: The problem of prediction has been studied extensively
2002: for both strictly stationary sequences of observations
2003: and wide sense stationary sequences
2004: (the definitions and a general discussion of ``strict sense'' and ``wide sense'' concepts
2005: can be found in \cite{doob:1953}, Chapter 2, Sections 8 and 3).
2006: We will first assume that $\ldots,y_{-1},y_0,y_1,\ldots$
2007: form a wide sense stationary sequence of random variables
2008: and then a strictly stationary sequence.
2009:
2010: The natural mode of prediction for wide sense stationary sequences
2011: is linear prediction.
2012: The problem of linear prediction
2013: (not necessarily one-step-ahead, as in this paper)
2014: of wide sense stationary sequences
2015: was posed and solved by Kolmogorov
2016: \cite{kolmogorov:1939,\KolmogorovCRfull,\KolmogorovStationary};
2017: later but independently this was done by Wiener
2018: \cite{wiener:1949}.
2019:
2020: Kolmogorov and Wiener assumed the probability distribution of the observations known.
2021: There are many efficient ways to estimate the spectral density of this probability distribution
2022: (in terms of which the optimal linear predictor is expressed);
2023: see, e.g., \cite{anderson:1971}, Chapter 9, for a review.
2024: (An early idea of spectral estimation was proposed by Einstein in 1914:
2025: see \cite{newton:2002}, p.~363.)
2026:
2027: The problem of existence of universal prediction strategies
2028: for strictly stationary and ergodic sequences of observations
2029: was posed by Cover \cite{cover:1975},
2030: and such strategies were found by Ornstein \cite{ornstein:1978}
2031: for finite $\mathbf{Y}$
2032: and Algoet \cite{algoet:1992} for $\mathbf{Y}$ a Polish space.
2033: Papers \cite{gyorfi/etal:1999,gyorfi/lugosi:2001,nobel:2003}
2034: construct such strategies
2035: using techniques very similar to those of this paper.
2036:
2037: \section{Conclusion}
2038: \label{sec:conclusion}
2039:
2040: An interesting direction of further research
2041: is to obtain non-asymptotic versions of our results.
2042: If the benchmark class of continuous stationary prediction strategies
2043: is compact,
2044: loss bounds can be given in terms of $\epsilon$-entropy
2045: \cite{\KolmogorovTikhomirov}.
2046: In general,
2047: one can give loss bounds in terms of a nested family
2048: of compact sets
2049: whose union is dense in the set of continuous stationary prediction strategies
2050: (in analogy with Vapnik and Chervonenkis's principle
2051: of structural risk minimization \cite{vapnik:1998}).
2052:
2053: \ifFULL\bluebegin
2054: It would be interesting to explore unconditional continuous predictive complexity
2055: in the simplest case without $x$s and with $\mathbf{Y}=\{0,1\}$
2056: (and with the log loss or the square loss function).
2057: \blueend\fi
2058:
2059: \subsection*{Acknowledgments}
2060:
2061: I am grateful to Yura Kalnishkan and Ilia Nouretdinov
2062: for useful comments.
2063: The construction of CS universal prediction strategies
2064: is based on Alex Smola's and G\'abor Lugosi's suggestions.
2065: This work was partially supported by MRC (grant S505/65).
2066:
2067: \begin{thebibliography}{10}
2068:
2069: \bibitem{algoet:1992}
2070: Paul~H\DOT{} Algoet.
2071: \newblock Universal schemes for prediction, gambling and portfolio selection.
2072: \newblock {\em Annals of Probability}, 20:901--941, 1992.
2073: \newblock Corrections: 23:474--478, 1995.
2074:
2075: \bibitem{anderson:1971}
2076: T\DOT{}~W\DOT{} Anderson.
2077: \newblock {\em The Statistical Analysis of Time Series}.
2078: \newblock Wiley, New York, 1971.
2079: \newblock Wiley Classics Library edition: 1994.
2080:
2081: \bibitem{billingsley:1968}
2082: Patrick Billingsley.
2083: \newblock {\em Convergence of Probability Measures}.
2084: \newblock Wiley, New York, 1968.
2085:
2086: \bibitem{blum/etal:1998}
2087: Lenore Blum, Felipe Cucker, Michael Shub, and Steve Smale.
2088: \newblock {\em Complexity and Real Computation}.
2089: \newblock Springer, New York, 1998.
2090:
2091: \bibitem{blum/etal:1989}
2092: Lenore Blum, Michael Shub, and Steve Smale.
2093: \newblock On a theory of computation and complexity over the real numbers:
2094: {NP}-completeness, recursive functions and universal machines.
2095: \newblock {\em Bulletin of the American Mathematical Society}, 21:1--46, 1989.
2096:
2097: \bibitem{bourbaki:integration}
2098: Nicolas Bourbaki.
2099: \newblock {\em \'El\'ements de math\'ematique, Livre VI, Int\'egration, Chapitres
2100: 1 \`a 4}.
2101: \newblock Hermann, Paris, first edition, 1952.
2102:
2103: \bibitem{cesabianchi/lugosi:2006}
2104: Nicol\`o Cesa-Bianchi and G\'abor Lugosi.
2105: \newblock {\em Prediction, Learning, and Games}.
2106: \newblock Cambridge University Press, Cambridge, 2006.
2107:
2108: \bibitem{cover:1975}
2109: Tom~M\DOT{} Cover.
2110: \newblock Open problems in information theory.
2111: \newblock In {\em Moscow Information Theory Workshop}, New York, 1975. IEEE
2112: Press.
2113:
2114: \bibitem{doob:1953}
2115: Joseph~L\DOT{} Doob.
2116: \newblock {\em Stochastic Processes}.
2117: \newblock Wiley, New York, 1953.
2118:
2119: \bibitem{dudley:2002}
2120: Richard~M\DOT{} Dudley.
2121: \newblock {\em Real Analysis and Probability}, volume~74 of {\em Cambridge
2122: Studies in Advanced Mathematics}.
2123: \newblock Cambridge University Press, Cambridge, England, 2002.
2124: \newblock Originally published in 1989.
2125:
2126: \bibitem{engelking:1989}
2127: Ryszard Engelking.
2128: \newblock {\em General Topology}, volume~6 of {\em Sigma Series in Pure
2129: Mathematics}.
2130: \newblock Heldermann, Berlin, second edition, 1989.
2131:
2132: \bibitem{gruenhage:2006}
2133: Gary Gruenhage.
2134: \newblock The story of a topological game.
2135: \newblock {\em Rocky Mountain Journal of Mathematics}, 2006.
2136: \newblock To appear.
2137:
2138: \bibitem{gyorfi/lugosi:2001}
2139: L\'aszl\'o Gy\"orfi and G\'abor Lugosi.
2140: \newblock Strategies for sequential prediction of stationary time series.
2141: \newblock In Moshe Dror, Pierre L'Ecuyer, and Ferenc Szidarovszky, editors,
2142: {\em Modeling Uncertainty: An Examination of its Theory, Methods, and
2143: Applications}. Kluwer, 2001.
2144:
2145: \bibitem{gyorfi/etal:1999}
2146: L\'aszl\'o Gy\"orfi, G\'abor Lugosi, and G\DOT{} Morvai.
2147: \newblock A simple randomized algorithm for consistent sequential prediction of
2148: ergodic time series.
2149: \newblock {\em IEEE Transactions on Information Theory}, 45:2642--2650, 1999.
2150:
2151: \bibitem{hardy/etal:1952}
2152: G\DOT{}~H\DOT{} Hardy, John~E\DOT{} Littlewood, and George P\'olya.
2153: \newblock {\em Inequalities}.
2154: \newblock Cambridge University Press, Cambridge, second edition, 1952.
2155:
2156: \bibitem{kalnishkan/vyugin:2005}
2157: Yuri Kalnishkan and Michael~V\DOT{} Vyugin.
2158: \newblock The {W}eak {A}ggregating {A}lgorithm and weak mixability.
2159: \newblock In Peter Auer and Ron Meir, editors, {\em Proceedings of the
2160: Eighteenth Annual Conference on Learning Theory}, volume 3559 of {\em Lecture
2161: Notes in Computer Science}, pages 188--203, Berlin, 2005. Springer.
2162: \newblock The journal version is being prepared for the Special Issue of
2163: \emph{Journal of Machine Learning Research} devoted to COLT'2005; all
2164: references are to the journal version.
2165:
2166: \bibitem{kolmogorov:1939}
2167: Andrei~N\DOT{} Kolmogorov.
2168: \newblock Sur l'interpolation et extrapolation des suites stationnaires.
2169: \newblock {\em Comptes rendus des S\'eances de l'Acad\'emie des Sciences},
2170: 208:2043--2045, 1939.
2171:
2172: \bibitem{kolmogorov:1941CR-latin}
2173: Andrei~N\DOT{} Kolmogorov.
2174: \newblock Interpolation and extrapolation of stationary random sequences (in
2175: {R}ussian).
2176: \newblock {\em Izvestiya AN SSSR. Mathematics series}, 5:3--14, 1941.
2177:
2178: \bibitem{kolmogorov:1941-latin}
2179: Andrei~N\DOT{} Kolmogorov.
2180: \newblock Stationary sequences in {H}ilbert space (in {R}ussian).
2181: \newblock {\em Byulleten' MGU. Mathematics}, 2(6):1--40, 1941.
2182:
2183: \bibitem{kolmogorov/tikhomirov:1959latin}
2184: Andrei~N\DOT{} Kolmogorov and Vladimir~M\DOT{} Tikhomirov.
2185: \newblock $\epsilon$-entropy and $\epsilon$-capacity of sets in functional
2186: spaces (in {R}ussian).
2187: \newblock {\em Uspekhi Matematicheskikh Nauk}, 14(2):3--86, 1959.
2188:
2189: \bibitem{lehmann:1986}
2190: E\DOT{}~L\DOT{} Lehmann.
2191: \newblock {\em Testing Statistical Hypotheses}.
2192: \newblock Springer, New York, second edition, 1986.
2193:
2194: \bibitem{martin-lof:1970}
2195: Per Martin-L\"of.
2196: \newblock {\em Notes on Constructive Mathematics}.
2197: \newblock Almqvist \& Wiksell, Stockholm, 1970.
2198:
2199: \bibitem{naimpally/warrack:1970}
2200: Som~A\DOT{} Naimpally and Brian~D\DOT{} Warrack.
2201: \newblock {\em Proximity Spaces}, volume~59 of {\em Cambridge Tracts in
2202: Mathematics and Mathematical Physics}.
2203: \newblock Cambridge University Press, London, 1970.
2204:
2205: \bibitem{newton:2002}
2206: H\DOT{}~Joseph Newton.
2207: \newblock A conversation with {E}manuel {P}arzen.
2208: \newblock {\em Statistical Science}, 17:357--378, 2002.
2209:
2210: \bibitem{nobel:2003}
2211: Andrew~B\DOT{} Nobel.
2212: \newblock On optimal sequential prediction for general processes.
2213: \newblock {\em IEEE Transactions on Information Theory}, 49:83--98, 2003.
2214:
2215: \bibitem{ornstein:1978}
2216: D\DOT{}~S\DOT{} Ornstein.
2217: \newblock Guessing the next output of a stationary process.
2218: \newblock {\em Israel Journal of Mathematics}, 30:292--296, 1978.
2219:
2220: \bibitem{rudin:1991}
2221: Walter Rudin.
2222: \newblock {\em Functional Analysis}.
2223: \newblock McGraw-Hill, Boston, second edition, 1991.
2224:
2225: \bibitem{shafer/vovk:2001}
2226: Glenn Shafer and \Vladimir{} Vovk.
2227: \newblock {\em Probability and Finance: It's Only a Game!}
2228: \newblock Wiley, New York, 2001.
2229:
2230: \bibitem{vapnik:1998}
2231: Vladimir~N\DOT{} Vapnik.
2232: \newblock {\em Statistical Learning Theory}.
2233: \newblock Wiley, New York, 1998.
2234:
2235: \bibitem{vovk:1990}
2236: \Vladimir{} Vovk.
2237: \newblock Aggregating strategies.
2238: \newblock In Mark Fulk and John Case, editors, {\em Proceedings of the Third
2239: Annual Workshop on Computational Learning Theory}, pages 371--383, San Mateo,
2240: CA, 1990. Morgan Kaufmann.
2241:
2242: \bibitem{vovk:2001competitive}
2243: Vladimir Vovk.
2244: \newblock Competitive on-line statistics.
2245: \newblock {\em International Statistical Review}, 69:213--248, 2001.
2246:
2247: \bibitem{GTP17arXiv}
2248: \Vladimir{} Vovk.
2249: \newblock Predictions as statements and decisions.
2250: \newblock Technical Report \texttt{arXiv:cs.LG/0606093}, \texttt{arXiv.org}
2251: e-Print archive, June 2006.
2252:
2253: \bibitem{wiener:1949}
2254: Norbert Wiener.
2255: \newblock {\em Extrapolation, Interpolation, and Smoothing of Stationary Time
2256: Series with Engineering Applications}.
2257: \newblock Technology Press of the Massachusetts Institute of Technology,
2258: Cambridge, MA, 1949.
2259: \newblock Reprinted from a secret 1942 publication.
2260:
2261: \end{thebibliography}
2262:
2263: \ifWP
2264: \DFlastpage
2265: \fi
2266: \end{document}
2267: