0607:cs0607067/cs0607067

1: % Last changed: 13 Jul 2006

2: % Spell checked: 13 Jul 2006

3: % 2120 lines, 68 KB

4: \newif\ifJOURNAL

5: \JOURNALfalse

6: \newif\ifCONF

7: \CONFfalse

8: \newif\ifarXiv

9: \arXivfalse

10: \newif\ifWP

11: \WPfalse

12: \newif\ifFULL

13: \FULLfalse

14:

15: \newif\ifLATIN

16: \LATINfalse

17:

18: %\JOURNALtrue		% choose JOURNAL, arXiv, WP, or FULL

19: %\CONFtrue

20: \arXivtrue

21: %\WPtrue

22: %\FULLtrue		% this version is not for publication and contains extra remarks and questions

23:

24: %\LATINtrue		% LATIN means that the Cyrillic references should be set in Latin

25: \ifarXiv\LATINtrue\fi	% for submitting to arXiv

26:

27: \newif\ifnotJOURNAL	% derivative conditional

28: \notJOURNALtrue

29: \ifJOURNAL\notJOURNALfalse\fi

30:

31: \newif\ifnotarXiv	% derivative conditional

32: \notarXivtrue

33: \ifarXiv\notarXivfalse\fi

34:

35: \newif\ifTR		% derivative conditionals (TR = arXiv or WP)

36: \TRfalse

37: \ifarXiv\TRtrue\fi

38: \ifWP\TRtrue\fi

39: \newif\ifnotTR

40: \notTRtrue

41: \ifarXiv\notTRfalse\fi

42: \ifWP\notTRfalse\fi

43:

44: \newif\ifnotLATIN	% derivative conditional

45: \notLATINtrue

46: \ifLATIN\notLATINfalse\fi

47:

48: \ifJOURNAL

49:   \newcommand{\GTPVII}{vovk/shafer:2005RSS}

50:   \newcommand{\GTPVIII}{vovk/etal:2005AIStatslocal}

51:   \newcommand{\GTPX}{vovk/etal:2005ALT}

52:   \newcommand{\GTPXI}{GTP11arXiv-local}

53:   \newcommand{\GTPXIII}{vovk:2005ALT-GTP13}

54:   \newcommand{\GTPXIV}{vovk:2005ALT-GTP14}

55:   \newcommand{\GTPXVI}{GTP16arXiv-local}

56: \fi

57: \ifarXiv

58:   \newcommand{\GTPVII}{GTP7}

59:   \newcommand{\GTPVIII}{GTP8arXiv}

60:   \newcommand{\GTPX}{GTP10arXiv}

61:   \newcommand{\GTPXI}{GTP11arXiv}

62:   \newcommand{\GTPXIII}{GTP13arXiv}

63:   \newcommand{\GTPXIV}{GTP14arXiv}

64:   \newcommand{\GTPXVII}{GTP17arXiv}

65: \fi

66: \ifWP

67:   \newcommand{\GTPVII}{GTP7}

68:   \newcommand{\GTPVIII}{GTP8}

69:   \newcommand{\GTPX}{GTP10}

70:   \newcommand{\GTPXI}{GTP11}

71:   \newcommand{\GTPXIII}{GTP13}

72:   \newcommand{\GTPXIV}{GTP14}

73:   \newcommand{\GTPXVII}{GTP17}

74: \fi

75: \ifFULL

76:   \newcommand{\GTPVII}{GTP7}

77:   \newcommand{\GTPVIII}{GTP8arXiv}

78:   \newcommand{\GTPX}{GTP10arXiv}

79:   \newcommand{\GTPXI}{GTP11arXiv}

80:   \newcommand{\GTPXIII}{GTP13arXiv}

81:   \newcommand{\GTPXIV}{GTP14arXiv}

82: \fi

83:

84: \ifnotLATIN

85:   \newcommand{\KolmogorovTikhomirov}{kolmogorov/tikhomirov:1959}

86:   \newcommand{\KolmogorovCRfull}{kolmogorov:1941CR}

87:   \newcommand{\KolmogorovStationary}{kolmogorov:1941}

88: \fi

89: \ifLATIN

90:   \newcommand{\KolmogorovTikhomirov}{kolmogorov/tikhomirov:1959latin}

91:   \newcommand{\KolmogorovCRfull}{kolmogorov:1941CR-latin}

92:   \newcommand{\KolmogorovStationary}{kolmogorov:1941-latin}

93: \fi

94:

95: \ifJOURNAL

96: \documentclass[toc]{article}

97: \usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}

98: \newcommand{\Extra}[1]{}

99: \fi

100:

101: \ifCONF

102: \documentclass[toc]{article}

103: \usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}

104: \newcommand{\Extra}[1]{}

105: \fi

106:

107: \ifarXiv

108: \documentclass[toc]{article}

109: \usepackage{amsmath,amsfonts,amssymb,latexsym,graphicx}

110: \newcommand{\Extra}[1]{}

111: \fi

112:

113: \ifWP

114: \documentclass[toc]{gtarticle}

115: \usepackage{amsmath,amsfonts,amssymb,latexsym,epsfig,graphicx}

116: \renewcommand{\Extra}[1]{#1}

117: \fi

118:

119: \ifFULL

120: \documentclass{article}

121: \usepackage{amsmath,amsfonts,amssymb,latexsym,color,epigraph,graphicx,eepic}

122: \newcommand{\Extra}[1]{\red{#1}}

123: \newcommand{\red}[1]{\textcolor{red}{#1}}

124: \newcommand{\blue}[1]{\textcolor{blue}{#1}}

125: \newcommand{\bluebegin}{\begingroup\color{blue}}

126: \newcommand{\blueend}{\endgroup}

127: \newcommand{\redbegin}{\begingroup\color{red}}

128: \newcommand{\redend}{\endgroup}

129: \fi

130:

131: \emergencystretch=5mm

132: \tolerance=400

133: \allowdisplaybreaks[4]

134:

135: \newcommand{\Vladimir}{Vladimir}

136: \newcommand{\DOT}{.}

137:

138: \ifnotLATIN

139: \input{OT2enc.def}

140: \newenvironment{cyr}

141: {\fontencoding{OT2}\fontfamily{wncyr}\fontseries{m}\fontshape{n}\selectfont}

142: {\fontencoding{OT1}\fontfamily{tir}\selectfont}

143: \usepackage{CJK}

144: \fi

145:

146: \newcommand{\st}{\mathrel{\!|\!}}

147: \newcommand{\givn}{\mathrel{|}}

148: \newcommand{\D}{\,\mathrm{d}}

149: \newcommand{\dd}{\mathrm{d}}

150:

151: \newcommand{\K}{\mathcal{K}}		% capital

152: \newcommand{\kkk}{\mathbf{k}}		% kernel

153: \newcommand{\ccc}{\mathbf{c}}		% constant

154: \newcommand{\III}{\mathbb{I}}

155: \newcommand{\CCC}{\mathcal{C}}		% class of prediction rules

156: \newcommand{\FFF}{\mathcal{F}}		% function space

157: \newcommand{\GGG}{\mathcal{G}}		% function space

158: \newcommand{\HHH}{\mathcal{H}}		% Hilbert space

159: \newcommand{\PPP}{\mathcal{P}}		% all probability measures

160: \newcommand{\SSS}{\mathcal{S}}		% Sobolev space

161:

162: \newcommand{\Int}{\mathop{\mathrm{Int}}\nolimits}

163:

164: \newcommand{\bbbp}{\mathbb{P}}		% auxiliary (probability)

165: \newcommand{\Prob}{\mathop{\bbbp}\nolimits}

166: \newcommand{\bbbe}{\mathbb{E}}		% auxiliary (expectation)

167: \newcommand{\Expect}{\mathop{\bbbe}\nolimits}

168:

169: \newcommand{\cpc}{\mathop{\mathrm{cpc}}\nolimits}

170: \newcommand{\ucpc}{\mathop{\overline{\mathstrut\mathrm{cpc}}}\nolimits}

171: \newcommand{\lcpc}{\mathop{\underline{\mathstrut\mathrm{cpc}}}\nolimits}

172:

173: \newcommand{\bbbr}{\mathbb{R}}		% the real numbers

174:

175: \newtheorem{lemma}{Lemma}

176: \newtheorem{proposition}{Proposition}

177: \newtheorem{corollary}{Corollary}

178: \newtheorem{remark}{Remark}

179: \newtheorem{theorem}{Theorem}

180: \newenvironment{proof}

181:   {\trivlist\item[\hskip\labelsep\textbf{Proof}]}

182:   {\endtrivlist}

183:

184: \newenvironment{Proof}[1]

185:   {\trivlist\item[\hskip\labelsep\textbf{Proof #1\,}]}

186:   {\endtrivlist}

187: \newcommand{\boxforqed}{\rule{.3em}{1.5ex}}

188: \newcommand{\qedtext}{\unskip\nobreak\hfil

189:   \penalty50\hskip1em\null\nobreak\hfil\boxforqed

190:   \parfillskip=0pt\finalhyphendemerits=0\endgraf}

191: %\newcommand{\qedmath}{\eqno\boxforqed}

192: \newcommand{\qedmath}{\tag*{\boxforqed}}

193: \newenvironment{remark*}

194:   {\trivlist\item[\hskip\labelsep{\bfseries Remark}]\relax}

195:   {\endtrivlist}

196:

197: \ifJOURNAL

198: \title{Competing with stationary prediction strategies}

199: \author{Vladimir Vovk\\[5mm]

200:  Computer Learning Research Centre\\

201:   Department of Computer Science\\

202:   Royal Holloway, University of London,

203:   Egham, Surrey TW20 0EX, UK\\

204:   \texttt{vovk@cs.rhul.ac.uk}}

205: \fi

206:

207: \ifCONF

208: \title{Competing with stationary prediction strategies}

209: \author{Vladimir Vovk\\[5mm]

210:  Computer Learning Research Centre\\

211:   Department of Computer Science\\

212:   Royal Holloway, University of London,

213:   Egham, Surrey TW20 0EX, UK\\

214:   \texttt{vovk@cs.rhul.ac.uk}}

215: \fi

216:

217: \ifarXiv

218: \title{Competing with stationary prediction strategies}%\\(draft: comments welcome)}

219: \author{Vladimir Vovk\\

220: \texttt{vovk{\rm@}cs.rhul.ac.uk}\\

221: \texttt{http://vovk.net}}

222: \fi

223:

224: \ifWP

225: \title{Competing with stationary prediction strategies}

226: \author{Vladimir Vovk}

227: \newcommand{\No}{18}

228: % For the two dates option: uncomment the next 2 lines

229: % \twodatestrue

230: % \newcommand{\firstposted}{July 13, 2006}

231: \fi

232:

233: \ifFULL

234: \title{Competing with stationary prediction strategies}

235: \author{Vladimir Vovk\\

236: \texttt{vovk{\rm@}cs.rhul.ac.uk}\\

237: \texttt{http://vovk.net}}

238: \fi

239:

240: \begin{document}

241: \maketitle

242: \begin{abstract}

243:   In this paper we introduce the class of stationary prediction strategies

244:   and construct a prediction algorithm

245:   that asymptotically performs as well as the best continuous stationary strategy.

246:   We make mild compactness assumptions but no stochastic assumptions

247:   about the environment.

248:   In particular,

249:   no assumption of stationarity is made about the environment,

250:   and the stationarity of the considered strategies

251:   only means that they do not depend explicitly on time;

252:   we argue that it is natural to consider only stationary strategies

253:   even for highly non-stationary environments.

254: \end{abstract}

255:

256: \section{Introduction}

257: \label{sec:introduction}

258:

259: This paper belongs to the area of learning theory

260: that has been variously referred to as prediction with expert advice,

261: competitive on-line prediction,

262: prediction of individual sequences,

263: and universal on-line learning;

264: see \cite{cesabianchi/lugosi:2006} for a review.

265: There are many proof techniques known in this field;

266: this paper is based on Kalnishkan and Vyugin's Weak Aggregating Algorithm

267: \cite{kalnishkan/vyugin:2005},

268: but it is possible that some of the numerous other techniques

269: could be used instead.

270:

271: In Section \ref{sec:results} we give the main definitions

272: and state our main results, Theorems \ref{thm:deterministic-compact}--\ref{thm:randomized};

273: their proofs are given

274: in Sections \ref{sec:proof-deterministic-compact}--\ref{sec:proof-randomized}.

275: In Section \ref{sec:stationarity}

276: we informally discuss the notion of stationarity,

277: and Section \ref{sec:conclusion} concludes.

278:

279: \section{Main results}

280: \label{sec:results}

281:

282: The \emph{game of prediction} between Predictor and Reality

283: is played according to the following protocol

284: (of \emph{perfect information},

285: in the sense that either player can see the other player's moves made so far).

286:

287: \bigskip

288:

289: \noindent

290: \textsc{Prediction protocol}\nopagebreak

291: \begin{tabbing}

292:   \qquad\=\qquad\=\qquad\kill

293:   Reality announces $(\ldots,x_{-1},y_{-1},x_0,y_0)\in(\mathbf{X}\times\mathbf{Y})^{\infty}$.\\

294:   FOR $n=1,2,\dots$:\\

295:   \> Reality announces $x_n\in\mathbf{X}$.\\

296:   \> Predictor announces $\gamma_n\in\Gamma$.\\

297:   \> Reality announces $y_n\in\mathbf{Y}$.\\

298:   END FOR.

299: \end{tabbing}

300:

301: \noindent

302: After Reality's first move the game proceeds in rounds numbered by the positive integers $n$.

303: At the beginning of each round $n=1,2,\ldots$ Predictor is given some signal $x_n$

304: relevant to predicting the following observation $y_n$.

305: The signal is taken from the \emph{signal space} $\mathbf{X}$

306: and the observations from the \emph{observation space} $\mathbf{Y}$.

307: Predictor then announces his prediction $\gamma_n$,

308: taken from the \emph{prediction space} $\Gamma$,

309: and the prediction's quality in light of the actual observation

310: is measured by a \emph{loss function}

311: $\lambda:\Gamma\times\mathbf{Y}\to\bbbr$.

312: % (The prediction protocol with a fixed loss function

313: % will sometimes be referred to as a \emph{prediction game},

314: % or \emph{game of prediction}.)

315: At the beginning of the game Reality chooses the infinite past,

316: $(x_n,y_n)$ for all $n\le0$.

317:

318: In the games of prediction traditionally considered in machine learning

319: there is no infinite past.

320: This situation is modeled in our framework by extending the signal space and observation space

321: by new elements ${?}\in\mathbf{X}$ and ${?}\in\mathbf{Y}$,

322: defining $\lambda(\gamma,{?})$ arbitrarily,

323: and making Reality announce the infinite past

324: $(\ldots,x_{-1},y_{-1},x_0,y_0)=(\ldots,{?},{?},{?},{?})$

325: and refrain from announcing $x_n={?}$ or $y_n={?}$ afterwards

326: (intuitively, $?$ corresponds to ``no feedback from Reality'').

327:

328: We will always assume that the signal space $\mathbf{X}$,

329: the prediction space $\Gamma$,

330: and the observation space $\mathbf{Y}$

331: are non-empty topological spaces

332: and that the loss function $\lambda$ is continuous.

333: Moreover,

334: we are mainly interested in the case

335: where $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$ are locally compact metric spaces,

336: the prime examples being Euclidean spaces and their open and closed subsets.

337: Our first results will be stated for the case

338: where all three spaces $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$ are compact.

339:

340: \begin{remark*}

341:   Our results can be easily extended to the case

342:   where the loss on the $n$th round is allowed to depend,

343:   in addition to $\gamma_n$ and $y_n$,

344:   on the past $\ldots,x_{n-1},y_{n-1},x_n$.

345:   This would, however, complicate the notation.

346: \end{remark*}

347:

348: Predictor's strategies in the prediction protocol will be called

349: \emph{prediction strategies}

350: (or \emph{prediction algorithms},

351: when they are defined explicitly and we want to emphasize this).

352: Mathematically such a strategy is a function

353: $D:(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}\times\{1,2,\ldots\}\to\Gamma$;

354: it maps each history $(\ldots,x_{n-1},y_{n-1},x_n)$

355: and the current time $n$ to the chosen prediction.

356: In this paper we will only be interested in continuous prediction strategies $D$

357: (according to the traditional point of view \cite{martin-lof:1970},

358: going back to Brouwer,

359: only continuous prediction strategies can be computable;

360: although it should be mentioned that nowadays

361: there are influential definitions of computability

362: \cite{blum/etal:1989,blum/etal:1998}

363: not requiring continuity).

364: An especially natural class of strategies

365: is formed by the \emph{stationary prediction strategies}

366: $D:(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}\to\Gamma$,

367: which do not depend on time explicitly;

368: since the origin of time is usually chosen arbitrarily,

369: this appears a reasonable restriction

370: (see Section \ref{sec:stationarity} for a further discussion).

371:

372: \subsection*{Universal prediction strategies: compact deterministic case}

373:

374: In this and the next subsection we will assume that the spaces $\mathbf{X},\Gamma,\mathbf{Y}$

375: are all compact.

376: A prediction strategy is \emph{CS universal} for a loss function $\lambda$ if

377: its predictions $\gamma_n$ satisfy

378: \begin{equation}\label{eq:dominates-deterministic-compact}

379:   \limsup_{N\to\infty}

380:   \Biggl(

381:     \frac1N

382:     \sum_{n=1}^N

383:     \lambda

384:     (\gamma_n,y_n)

385:     {}-

386:     \frac1N

387:     \sum_{n=1}^N

388:     \lambda

389:     \bigl(

390:       D(\ldots,x_{n-1},y_{n-1},x_n),y_n

391:     \bigr)

392:   \Biggr)

393:   \le

394:   0

395: \end{equation}

396: for any continuous stationary prediction strategy $D$

397: and any biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$\,.

398: (``CS'' refers to the continuity and stationarity of the prediction strategies

399: we are competing with.)

400: \begin{theorem}\label{thm:deterministic-compact}

401:   Suppose $\mathbf{X}$ and $\mathbf{Y}$ are compact metric spaces,

402:   $\Gamma$ is a compact convex subset of a Banach space,

403:   and the loss function $\lambda(\gamma,y)$ is continuous in $(\gamma,y)$

404:   and convex in the variable $\gamma\in\Gamma$.

405:   There exists a CS universal prediction algorithm.

406: \end{theorem}

407: A CS universal prediction algorithm will be constructed in the next section.

408:

409: \subsection*{Universal prediction strategies: compact randomized case}

410:

411: When the loss function $\lambda(\gamma,y)$ is not convex in $\gamma$,

412: two difficulties appear:

413: \begin{itemize}

414: \item

415:   the conclusion of Theorem \ref{thm:deterministic-compact} becomes false

416:   if the convexity requirement is removed

417:   (\cite{kalnishkan/vyugin:2005}, Theorem 2);

418: \item

419:   in some cases the notion of a continuous prediction strategy becomes vacuous:

420:   e.g., there are no non-constant continuous stationary prediction strategies

421:   when $\Gamma=\{0,1\}$

422:   and $(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}$ is connected

423:   (the latter condition is equivalent to $\mathbf{X}$ and $\mathbf{Y}$

424:   being connected---see \cite{engelking:1989}, Theorem 6.1.15).

425: \end{itemize}

426: To overcome these difficulties,

427: we consider randomized prediction strategies.

428: The proof of Theorem \ref{thm:deterministic-compact}

429: will give a universal, in a natural sense,

430: randomized prediction algorithm;

431: on the other hand,

432: there will be a vast supply of continuous stationary prediction strategies.

433:

434: \begin{remark*}

435:   In fact,

436:   the second difficulty is more apparent than real:

437:   for example, in the binary case ($\mathbf{Y}=\{0,1\}$)

438:   there are many non-trivial continuous prediction strategies

439:   in the canonical form of the prediction game \cite{vovk:1990}

440:   with the prediction space redefined as the boundary of the set of superpredictions

441:   \cite{kalnishkan/vyugin:2005}.

442: \end{remark*}

443:

444: A \emph{randomized prediction strategy} is a function

445: $D:(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}\times\{1,2,\ldots\}\to\PPP(\Gamma)$

446: mapping the past complemented by the current time

447: to the probability measures on the prediction space;

448: $\PPP(\Gamma)$ is always equipped with the topology of weak convergence

449: (\cite{billingsley:1968};

450: this topology is also discussed, in the compact case,

451: in Section \ref{sec:proof-randomized-compact} below).

452: In other words, this is a prediction strategy

453: in the extended game of prediction with the prediction space $\PPP(\Gamma)$.

454: Analogously,

455: a \emph{stationary randomized prediction strategy} is a function

456: $D:(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}\to\PPP(\Gamma)$.

457:

458: Let us say that a randomized prediction strategy outputting $\gamma_n$

459: is \emph{CS universal} for a loss function $\lambda$ if,

460: for any continuous stationary randomized prediction strategy $D$

461: and any biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$,

462: \begin{equation}\label{eq:dominates-randomized-compact}

463:   \limsup_{N\to\infty}

464:   \left(

465:     \frac1N

466:     \sum_{n=1}^N

467:     \lambda(g_{n},y_n)

468:     -

469:     \frac1N

470:     \sum_{n=1}^N

471:     \lambda(d_{n},y_n)

472:   \right)

473:   \le

474:   0

475:   \enspace

476:   \textrm{a.s.},

477: \end{equation}

478: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent random variables

479: distributed as

480: \begin{align}

481:   g_{n}

482:   &\sim

483:   \gamma_n\label{eq:distributed-1},\\

484:   d_{n}

485:   &\sim

486:   D(\ldots,x_{n-1},y_{n-1},x_n),\label{eq:distributed-2}

487: \end{align}

488: $n=1,2,\ldots$\,.

489: Intuitively,

490: the ``a.s.''\ in (\ref{eq:dominates-randomized-compact})

491: refers to the prediction strategies' internal randomization.

492: \begin{theorem}\label{thm:randomized-compact}

493:   Let $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$ be compact metric spaces

494:   and $\lambda$ be a continuous loss function.

495:   There exists a CS universal randomized prediction algorithm.

496: \end{theorem}

497:

498: \ifFULL\bluebegin

499:   Let $\Sigma:=(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}$ be a metric space.

500:   For any discrete (e.g., finite) subset $\{\sigma_1,\sigma_2,\ldots\}$ of $\Sigma$

501:   and any sequence $\gamma_n\in\PPP(\Gamma)$ of probability measures on $\Gamma$

502:   there exists a continuous stationary randomized prediction strategy $D$

503:   such that $D(\sigma_n)=\gamma_n$ for all $n$

504:   (indeed, it suffices to set $D(\sigma):=\sum_n\phi_n(\sigma)\gamma_n$,

505:   where $\phi_n:\Sigma\to[0,1]$, $n=1,2,\ldots$,

506:   are continuous functions with disjoint supports

507:   such that $\phi_n(\sigma_n)=1$ for all $n$).

508:   Therefore, there is no shortage of continuous stationary randomized prediction strategies.

509: \blueend\fi

510:

511: \subsection*{Simple reductions to the compact case}

512:

513: In the following two subsections we will discuss the case

514: where the signal, prediction, and observation spaces

515: are not required to be compact.

516: The goal of this subsection is to show that the compact case

517: is not as special as it may seem,

518: as far as Theorem \ref{thm:randomized-compact} is concerned.

519: The rest of the paper does not depend on this subsection.

520:

521: In general,

522: we might consider $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$

523: together with their fixed compactifications

524: $\overline{\mathbf{X}}$, $\overline{\Gamma}$, and $\overline{\mathbf{Y}}$

525: (without loss of generality we can and will assume that

526: $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$

527: are dense in their compactifications,

528: and then the compactifications will be the closures of the original spaces,

529: which explains our notation).

530: \ifFULL\bluebegin

531:   Problem in the case of Theorem \ref{thm:deterministic-compact}:

532:   $\overline{\Gamma}$ may cease to be a compact convex subset of a Banach space.

533: \blueend\fi

534: Let us suppose that $\lambda$ is bounded and continuous,

535: and, moreover, can be continuously extended to the product

536: $\overline{\Gamma}\times\overline{\mathbf{Y}}$

537: of the compactifications;

538: such an extension is then unique and will also be denoted $\lambda$.

539:

540: If $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$

541: are Euclidean spaces their natural compactifications

542: might be chosen as Aleksandrov's one-point compactification

543: (\cite{engelking:1989}, Theorem 3.5.11),

544: the corresponding projective space

545: (with $\bbbr\mathrm{P}^L$ being the compactification of $\bbbr^L$),

546: or the corresponding closed unit ball

547: (with the interior of the closed unit ball in $\bbbr^L$

548: identified with $\bbbr^L$

549: by mapping a vector $v$ of length $l\in[0,1)$ in the former set

550: to the vector $(\tan(\pi l/2))v$).

551: The Stone--\v{C}ech compactification

552: (\cite{engelking:1989}, Section 3.6)

553: will usually be too large:

554: we will want our compactifications to be metrizable.

555:

556: Theorem \ref{thm:randomized-compact} will remain true

557: if instead of assuming $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$ to be metric compacts

558: we assume that

559: $\overline{\mathbf{X}}$, $\overline{\Gamma}$, and $\overline{\mathbf{Y}}$

560: are metric compacts

561: and if in the definition of CS universality (\ref{eq:dominates-randomized-compact})

562: we only consider continuous stationary prediction strategies

563: that have a continuous extension to

564: $(\overline{\mathbf{X}}\times\overline{\mathbf{Y}})^{\infty}\times\overline{\mathbf{X}}$.

565:

566: \ifFULL\bluebegin

567:   As an example,

568:   suppose $\mathbf{X}$ is a Euclidean space

569:   and consider a prediction strategy

570:   $D(\ldots,x_{n-1},y_{n-1},x_{n})$ that only depends on $x_n$.

571:   Then $D$ can be extended to the compactification of $\mathbf{X}$ if it:

572:   tends to a limit as $\left\|x\right\|\to\infty$

573:   (in the case of Aleksandrov's compactification);

574:   tends to a limit in every direction

575:   (in the case of the closed unit ball);

576:   tends to a limit in every direction

577:   with the limits in opposite directions coinciding

578:   (in the case of the projective space).

579: \blueend\fi

580:

581: \begin{remark*}

582:   An elegant way to avoid considering compactifications

583:   would be to assume that $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$

584:   are metrizable proximity spaces

585:   (see \cite{engelking:1989}, Section 8.4, or \cite{naimpally/warrack:1970},

586:   where \cite{engelking:1989}'s ``proximity spaces'' are called ``separated proximity spaces'')

587:   and to consider only proximity prediction strategies.

588:   By Smirnov's theorem (\cite{engelking:1989}, Theorem 8.4.13 and also Theorem 8.4.9;

589:   \cite{naimpally/warrack:1970}, Theorem 7.7)

590:   a proximity space can be identified with the corresponding topological space

591:   equipped with a compactification.

592:   Assuming that the loss function $\lambda$ is a bounded proximity function,

593:   it can be uniquely continuously extended to the compactification

594:   $\overline{\Gamma}\times\overline{\mathbf{Y}}$

595:   (\cite{naimpally/warrack:1970}, Theorem 7.10),

596:   and every proximity stationary prediction strategy can be identified

597:   with a continuous function on the compactification

598:   $(\overline{\mathbf{X}}\times\overline{\mathbf{Y}})^{\infty}\times\overline{\mathbf{X}}$

599:   (by the same theorem).

600:   To ensure that the compactifications are metrizable,

601:   it is sufficient to assume that the proximity spaces are second-countable

602:   (i.e., have countable proximity weights;

603:   see \cite{naimpally/warrack:1970}, Theorem 8.14,

604:   and \cite{engelking:1989}, Theorem 4.2.8).

605:   We chose the slightly clumsier language of compactifications

606:   because the notion of a topological space is much more familiar

607:   than that of a proximity space.

608: \end{remark*}

609:

610: \subsection*{Universal prediction strategies: deterministic case}

611:

612: Let us say that a set in a topological space is \emph{precompact}

613: if its closure is compact.

614: In Euclidean spaces,

615: precompactness means boundedness.

616: In this and the next subsection we drop the assumption of compactness

617: of $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$,

618: and so we have to redefine the notion of CS universality.

619:

620: A prediction strategy outputting $\gamma_n\in\Gamma$

621: is \emph{CS universal}

622: for a loss function $\lambda$ if,

623: for any continuous stationary prediction strategy $D$

624: and for any biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$,

625: \begin{multline}\label{eq:dominates-deterministic}

626:   \bigl(

627:     \{\ldots,x_{-1},x_0,x_1,\ldots\}

628:     \text{ and }

629:     \{\ldots,y_{-1},y_0,y_1,\ldots\}

630:     \text{ are precompact}

631:   \bigr)\\

632:   \Longrightarrow

633:   \limsup_{N\to\infty}

634:   \Biggl(

635:     \frac1N

636:     \sum_{n=1}^N

637:     \lambda(\gamma_n,y_n)

638:     -

639:     \frac1N

640:     \sum_{n=1}^N

641:     \lambda

642:     \bigl(

643:       D(\ldots,x_{n-1},y_{n-1},x_n),y_n

644:     \bigr)

645:   \Biggr)

646:   \le

647:   0.

648: \end{multline}

649: The intuition behind the antecedent of (\ref{eq:dominates-deterministic}),

650: in the Euclidean case,

651: is that the prediction algorithm

652: knows that $\left\|x_n\right\|$ and $\left\|y_n\right\|$ are bounded

653: but does not know an upper bound in advance.

654:

655: Let us say that the loss function $\lambda$ is \emph{large at infinity}

656: if, for all $y^*\in\mathbf{Y}$,

657: \begin{equation*}

658:   \lim_{\substack{y\to y^*\\\gamma\to\infty}}

659:   \lambda(\gamma,y)

660:   =

661:   \infty

662: \end{equation*}

663: (in the sense that for each constant $M$

664: there exists a neighborhood $O_{y^*}\ni y^*$ and compact $C\subseteq\Gamma$ such that

665: $\lambda\left(\Gamma\setminus C,O_{y^*}\right)\subseteq(M,\infty)$).

666: Intuitively, we require that faraway $\gamma\in\Gamma$

667: should be poor predictions for nearby $y^*\in\mathbf{Y}$.

668: This assumption is satisfied for most of the usual loss functions

669: used in competitive on-line prediction.

670: \ifFULL\bluebegin

671:   (A notable exception is the \emph{log-loss game},

672:   where $\Gamma=(0,1)$, $\mathbf{Y}=\{0,1\}$,

673:   and $\lambda(\gamma,y)=-y\ln\gamma-(1-y)\ln(1-\gamma)$;

674:   for the log-loss game our construction still works

675:   if we replace the WAA of \cite{kalnishkan/vyugin:2005}

676:   by the AA of \cite{vovk:1990} in the proof.)

677: \blueend\fi

678: \begin{theorem}\label{thm:deterministic}

679:   Suppose $\mathbf{X}$ and $\mathbf{Y}$ are locally compact metric spaces,

680:   $\Gamma$ is a convex subset of a Banach space,

681:   and the loss function $\lambda(\gamma,y)$ is continuous,

682:   large at infinity, and convex in the variable $\gamma\in\Gamma$.

683:   There exists a CS universal prediction algorithm.

684: \end{theorem}

685: To have a specific example in mind,

686: the reader might check that $\mathbf{X}=\bbbr^{K}$, $\Gamma=\mathbf{Y}=\bbbr^{L}$,

687: and $\lambda(\gamma,y):=\left\|y-\gamma\right\|$

688: satisfy the conditions of the theorem.

689:

690: \subsection*{Universal prediction strategies: randomized case}

691:

692: We say that a randomized prediction strategy

693: outputting randomized predictions $\gamma_n$

694: is \emph{CS universal} if,

695: for any continuous stationary randomized prediction strategy $D$

696: and for any biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$,

697: \begin{multline}\label{eq:dominates-randomized}

698:   \bigl(

699:     \{\ldots,x_{-1},x_0,x_1,\ldots\}

700:     \text{ and }

701:     \{\ldots,y_{-1},y_0,y_1,\ldots\}

702:     \text{ are precompact}

703:   \bigr)\\

704:   \Longrightarrow

705:   \left(

706:     \limsup_{N\to\infty}

707:     \left(

708:       \frac1N

709:       \sum_{n=1}^N

710:       \lambda(g_{n},y_n)

711:       -

712:       \frac1N

713:       \sum_{n=1}^N

714:       \lambda(d_{n},y_n)

715:     \right)

716:     \le

717:     0

718:     \enspace

719:     \textrm{a.s.}

720:   \right),

721: \end{multline}

722: where $g_{1},g_{2},\ldots,d_{1},d_{2},\ldots$ are independent random variables

723: distributed according to (\ref{eq:distributed-1})--(\ref{eq:distributed-2}).

724: \begin{theorem}\label{thm:randomized}

725:   Let $\mathbf{X}$ and $\mathbf{Y}$ be locally compact metric spaces,

726:   $\Gamma$ be a metric space,

727:   and $\lambda$ be a continuous and large at infinity loss function.

728:   There exists a CS universal randomized prediction algorithm.

729: \end{theorem}

730:

731: \section{Proof of Theorem \ref{thm:deterministic-compact}}

732: \label{sec:proof-deterministic-compact}

733:

734: In the rest of the paper

735: we will be using the notation $\Sigma$ for $(\mathbf{X}\times\mathbf{Y})^{\infty}\times\mathbf{X}$.

736: By Tikhonov's theorem (\cite{engelking:1989}, Theorem 3.2.4)

737: this is a compact space;

738: it is also metrizable

739: (\cite{engelking:1989}, Theorem 4.2.2).

740: Another standard piece of notation throughout the rest of the paper

741: will be $\sigma_n:=(\ldots,x_{n-1},y_{n-1},x_n)\in\Sigma$.

742: Remember that $\lambda$, as a continuous function on a compact set,

743: is bounded below and above (\cite{engelking:1989}, Theorem 3.10.6).

744:

745: Let $\Gamma^{\Sigma}$ be the set of all continuous functions

746: from $\Sigma$ to $\Gamma$

747: with the \emph{topology of uniform convergence},

748: generated by the metric

749: \begin{equation*}

750:   \hat\rho(D_1,D_2)

751:   :=

752:   \sup_{\sigma\in\Sigma}

753:   \rho

754:   \bigl(

755:     D_1(\sigma),D_2(\sigma)

756:   \bigr),

757: \end{equation*}

758: $\rho$ being the metric in $\Gamma$

759: (induced by the norm in the containing Banach space).

760: Since the topological space $\Gamma^{\Sigma}$ is separable

761: (\cite{engelking:1989}, Corollary 4.2.18

762: in combination with Theorem 4.2.8),

763: we can choose a dense sequence $D_1,D_2,\ldots$ in $\Gamma^{\Sigma}$.

764:

765: \begin{remark*}

766:   The topology in $\Gamma^{\Sigma}$ is defined via a metric,

767:   and this is one of the very few places in this paper where we need a specific metric

768:   (for brevity we often talk about ``metric spaces'',

769:   but this can always be replaced by ``metrizable topological spaces'').

770:   Without using the metric,

771:   we could say that the topology in $\Gamma^{\Sigma}$ is the compact-open topology

772:   (\cite{engelking:1989}, Section 3.4).

773:   Since $\Sigma$ is compact,

774:   the compact-open topology on $\Gamma^{\Sigma}$

775:   coincides with the topology of uniform convergence

776:   (\cite{engelking:1989}, Theorem 4.2.17).

777:   The separability of $\Gamma^{\Sigma}$ now follows

778:   from \cite{engelking:1989}, Theorem 3.4.16 in combination with Theorem 4.2.8.

779: \end{remark*}

780:

781: The next step is to apply Kalnishkan and Vyugin's

782: \cite{kalnishkan/vyugin:2005}

783: Weak Aggregating Algorithm (WAA) to this sequence.

784: We cannot just refer to \cite{kalnishkan/vyugin:2005}

785: and will have to redo their derivation of the WAA's main property

786: since Kalnishkan and Vyugin only consider the case

787: of finitely many ``experts'' $D_k$

788: and finite $\mathbf{Y}$.

789: (Although in other respects

790: we will not need their algorithm in full generality

791: and so slightly simplify it.)

792:

793: Let $q_1,q_2,\ldots$ be a sequence of positive numbers summing to 1,

794: $\sum_{k=1}^{\infty}q_k=1$.

795: Define

796: \begin{equation*}

797:   l_n^{(k)}

798:   :=

799:   \lambda

800:   \left(

801:     D_k(\sigma_n),y_n

802:   \right),

803:   \quad

804:   L_N^{(k)}

805:   :=

806:   \sum_{n=1}^N

807:   l_n^{(k)}

808: \end{equation*}

809: to be the instantaneous loss of the $k$th expert $D_k$ on the $n$th round

810: and his cumulative loss over the first $N$ rounds.

811: For all $n,k=1,2,\ldots$ define

812: \begin{equation*}

813:   w_n^{(k)}

814:   :=

815:   q_k

816:   \beta_n^{L_{n-1}^{(k)}},

817:   \quad

818:   \beta_n

819:   :=

820:   \exp

821:   \left(

822:     -\frac{1}{\sqrt{n}}

823:   \right)

824: \end{equation*}

825: ($w_n^{(k)}$ are the weights of the experts to use on round $n$)

826: and

827: \begin{equation*}

828:   p_n^{(k)}

829:   :=

830:   \frac

831:   {w_n^{(k)}}

832:   {\sum_{k=1}^{\infty}w_n^{(k)}}

833: \end{equation*}

834: (the normalized weights;

835: it is obvious that the denominator is positive and finite).

836: The WAA's prediction on round $n$ is

837: \begin{equation}\label{eq:WAA}

838:   \gamma_n

839:   :=

840:   \sum_{k=1}^{\infty}

841:   p_n^{(k)}

842:   D_k(\sigma_n)

843: \end{equation}

844: (the series is convergent in the Banach space

845: since the compactness of $\Gamma$ implies

846: $\sup_{\gamma\in\Gamma}\left\|\gamma\right\|<\infty$,

847: and $\gamma_n\in\Gamma$ since

848: \begin{multline}\label{eq:convergence-to-0}

849:   \gamma_n

850:   -

851:   \sum_{k=1}^K

852:   \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}

853:   D_k(\sigma_n)\\

854:   =

855:   \sum_{k=1}^K

856:   \left(

857:     1

858:     -

859:     \frac{1}{\sum_{k=1}^K p_n^{(k)}}

860:   \right)

861:   p_n^{(k)}

862:   D_k(\sigma_n)

863:   +

864:   \sum_{k=K+1}^{\infty}

865:   p_n^{(k)}

866:   D_k(\sigma_n)

867:   \to

868:   0

869: \end{multline}

870: as $K\to\infty$).

871:

872: Let $l_n:=\lambda(\gamma_n,y_n)$ be the WAA's loss on round $n$

873: and

874: $

875:   L_N

876:   :=

877:   \sum_{n=1}^N

878:   l_n

879: $

880: be its cumulative loss over the first $N$ rounds.

881: \begin{lemma}[\cite{kalnishkan/vyugin:2005}, Lemma 9]\label{lem:9}

882:   The WAA guarantees that, for all $N$,

883:   \begin{equation}\label{eq:lemma9}

884:     L_N

885:     \le

886:     \sum_{n=1}^N

887:     \sum_{k=1}^{\infty}

888:     p_n^{(k)}

889:     l_n^{(k)}

890:     -

891:     \sum_{n=1}^N

892:     \log_{\beta_n}

893:     \sum_{k=1}^{\infty}

894:     p_n^{(k)}

895:     \beta_n^{l_n^{(k)}}

896:     +

897:     \log_{\beta_N}

898:     \sum_{k=1}^{\infty}

899:     q_k

900:     \beta_N^{L_N^{(k)}}.

901:   \end{equation}

902: \end{lemma}

903: The first two terms on the right-hand side of (\ref{eq:lemma9})

904: are sums over the first $N$ rounds of different kinds of mean of the experts' losses

905: (see, e.g., \cite{hardy/etal:1952}, Chapter III,

906: for a general definition of the mean);

907: we will see later that they nearly cancel each other out.

908: If those two terms are ignored,

909: the remaining part of (\ref{eq:lemma9}) is identical

910: (except that $\beta$ now depends on $n$)

911: to the main property of the ``Aggregating Algorithm''

912: (see, e.g., \cite{vovk:2001competitive}, Lemma 1).

913: All infinite series in (\ref{eq:lemma9}) are trivially convergent.

914: \begin{Proof}{of Lemma \ref{lem:9}}

915:   The proof is by induction on $N$.

916:   Assuming (\ref{eq:lemma9}),

917:   we obtain

918:   \begin{multline*}

919:     L_{N+1}

920:     =

921:     L_N + l_{N+1}

922:     \le

923:     L_N

924:     +

925:     \sum_{k=1}^{\infty}

926:     p_{N+1}^{(k)}

927:     l_{N+1}^{(k)}\\

928:     \le

929:     \sum_{n=1}^{N+1}

930:     \sum_{k=1}^{\infty}

931:     p_n^{(k)}

932:     l_n^{(k)}

933:     -

934:     \sum_{n=1}^N

935:     \log_{\beta_n}

936:     \sum_{k=1}^{\infty}

937:     p_n^{(k)}

938:     \beta_n^{l_n^{(k)}}

939:     +

940:     \log_{\beta_N}

941:     \sum_{k=1}^{\infty}

942:     q_k

943:     \beta_N^{L_N^{(k)}}

944:   \end{multline*}

945:   (the first ``$\le$'' used the ``countable convexity''

946:   $l_n\le\sum_{k=1}^{\infty}p_n^{(k)}l_n^{(k)}$,

947:   which follows from (\ref{eq:convergence-to-0}) and

948:   \begin{equation*}

949:     \lambda

950:     \left(

951:       \sum_{k=1}^K

952:       \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}

953:       D_k(\sigma_n),

954:       y_n

955:     \right)

956:     \le

957:     \sum_{k=1}^K

958:     \frac{p_n^{(k)}}{\sum_{k=1}^K p_n^{(k)}}

959:     \lambda

960:     \left(

961:       D_k(\sigma_n),

962:       y_n

963:     \right)

964:   \end{equation*}

965:   if we let $K\to\infty$).

966:   Therefore,

967:   it remains to prove

968:   \begin{equation*}

969:     \log_{\beta_N}

970:     \sum_{k=1}^{\infty}

971:     q_k

972:     \beta_N^{L_N^{(k)}}

973:     \le

974:     -\log_{\beta_{N+1}}

975:     \sum_{k=1}^{\infty}

976:     p_{N+1}^{(k)}

977:     \beta_{N+1}^{l_{N+1}^{(k)}}

978:     +

979:     \log_{\beta_{N+1}}

980:     \sum_{k=1}^{\infty}

981:     q_k

982:     \beta_{N+1}^{L_{N+1}^{(k)}}.

983:   \end{equation*}

984:   By the definition of $p_n^{(k)}$

985:   this can be rewritten as

986:   \begin{equation*}

987:     \log_{\beta_N}

988:     \sum_{k=1}^{\infty}

989:     q_k

990:     \beta_N^{L_N^{(k)}}

991:     \le

992:     -\log_{\beta_{N+1}}

993:     \frac

994:     {

995:       \sum_{k=1}^{\infty}

996:       q_k

997:       \beta_{N+1}^{L_{N}^{(k)}}

998:       \beta_{N+1}^{l_{N+1}^{(k)}}

999:     }

1000:     {

1001:       \sum_{k=1}^{\infty}

1002:       q_k

1003:       \beta_{N+1}^{L_{N}^{(k)}}

1004:     }

1005:     +

1006:     \log_{\beta_{N+1}}

1007:     \sum_{k=1}^{\infty}

1008:     q_k

1009:     \beta_{N+1}^{L_{N+1}^{(k)}},

1010:   \end{equation*}

1011:   which after cancellation becomes

1012:   \begin{equation}\label{eq:to-check}

1013:     \log_{\beta_N}

1014:     \sum_{k=1}^{\infty}

1015:     q_k

1016:     \beta_N^{L_N^{(k)}}

1017:     \le

1018:     \log_{\beta_{N+1}}

1019:     \sum_{k=1}^{\infty}

1020:     q_k

1021:     \beta_{N+1}^{L_{N}^{(k)}}.

1022:   \end{equation}

1023:   The last inequality follows from the general result

1024:   about comparison of different means

1025:   (\cite{hardy/etal:1952}, Theorem 85),

1026:   but we can also check it directly

1027:   (following \cite{kalnishkan/vyugin:2005}).

1028:   Let $\beta_{N+1}=\beta_N^a$,

1029:   where $0<a<1$.

1030:   Then (\ref{eq:to-check}) can be rewritten as

1031:   \begin{equation*}

1032:     \left(

1033:       \sum_{k=1}^{\infty}

1034:       q_k

1035:       \beta_N^{L_N^{(k)}}

1036:     \right)^a

1037:     \ge

1038:     \sum_{k=1}^{\infty}

1039:     q_k

1040:     \beta_{N}^{aL_{N}^{(k)}},

1041:   \end{equation*}

1042:   and the last inequality follows from the concavity of the function $t\mapsto t^a$.

1043:   \qedtext

1044: \end{Proof}

1045:

1046: \begin{lemma}[\cite{kalnishkan/vyugin:2005}, Lemma 5]

1047:   Let $L$ be an upper bound on $\left|\lambda\right|$.

1048:   The WAA guarantees that, for all $N$ and $K$,

1049:   \begin{equation}\label{eq:lemma5}

1050:     L_N

1051:     \le

1052:     L_N^{(K)}

1053:     +

1054:     \left(

1055:       L^2 e^L + \ln\frac{1}{q_K}

1056:     \right)

1057:     \sqrt{N}.

1058:   \end{equation}

1059: \end{lemma}

1060: (There is no term $e^L$ in \cite{kalnishkan/vyugin:2005}

1061: since it only considers non-negative loss functions.)

1062: \begin{proof}

1063:   From (\ref{eq:lemma9}),

1064:   we obtain:

1065:   \begin{align*}

1066:     L_N

1067:     &\le

1068:     \sum_{n=1}^N

1069:     \sum_{k=1}^{\infty}

1070:     p_n^{(k)}

1071:     l_n^{(k)}

1072:     +

1073:     \sum_{n=1}^N

1074:     \sqrt{n}

1075:     \ln

1076:     \sum_{k=1}^{\infty}

1077:     p_n^{(k)}

1078:     \exp

1079:     \left(

1080:       -\frac{l_n^{(k)}}{\sqrt{n}}

1081:     \right)

1082:     +

1083:     \log_{\beta_N}

1084:     q_K

1085:     +

1086:     L_N^{(K)}\\

1087:     &\le

1088:     \sum_{n=1}^N

1089:     \sum_{k=1}^{\infty}

1090:     p_n^{(k)}

1091:     l_n^{(k)}

1092:     +

1093:     \sum_{n=1}^N

1094:     \sqrt{n}

1095:     \left(

1096:       \sum_{k=1}^{\infty}

1097:       p_n^{(k)}

1098:       \left(

1099:         1

1100:         -

1101:         \frac{l_n^{(k)}}{\sqrt{n}}

1102:         +

1103:         \frac{\left(l_n^{(k)}\right)^2}{2n}

1104:         e^L

1105:       \right)

1106:       -

1107:       1

1108:     \right)\\

1109:     &\quad{}+

1110:     \log_{\beta_N}

1111:     q_K

1112:     +

1113:     L_N^{(K)}\\

1114:     &=

1115:     L_N^{(K)}

1116:     +

1117:     \frac12

1118:     \sum_{n=1}^N

1119:     \frac{1}{\sqrt{n}}

1120:     \sum_{k=1}^{\infty}

1121:     p_n^{(k)}

1122:     \left(l_n^{(k)}\right)^2

1123:     e^L

1124:     +

1125:     \sqrt{N}\ln\frac{1}{q_K}\\

1126:     &\le

1127:     L_N^{(K)}

1128:     +

1129:     \frac{L^2e^L}{2}

1130:     \sum_{n=1}^N

1131:     \frac{1}{\sqrt{n}}

1132:     +

1133:     \sqrt{N}\ln\frac{1}{q_K}

1134:     \le

1135:     L_N^{(K)}

1136:     +

1137:     \frac{L^2e^L}{2}

1138:     \int_0^N

1139:     \frac{\D t}{\sqrt{t}}

1140:     +

1141:     \sqrt{N}\ln\frac{1}{q_K}\\

1142:     &\le

1143:     L_N^{(K)}

1144:     +

1145:     L^2e^L\sqrt{N}

1146:     +

1147:     \sqrt{N}\ln\frac{1}{q_K}

1148:   \end{align*}

1149:   (in the second ``$\le$'' we used the inequalities $e^t\le1+t+\frac{t^2}{2}e^{\left|t\right|}$

1150:   and $\ln t\le t-1$).

1151:   \qedtext

1152: \end{proof}

1153:

1154: Now it is easy to prove Theorem \ref{thm:deterministic-compact}.

1155: Let $\gamma_n$ be the predictions output by the WAA.

1156: Consider any continuous stationary prediction strategy $D$.

1157: Since every continuous function on a metric compact is uniformly continuous

1158: (\cite{engelking:1989}, Theorem 4.3.32),

1159: for any $\epsilon>0$ we can find $\delta>0$ such that

1160: $\left|\lambda(\gamma_1,y)-\lambda(\gamma_2,y)\right|<\epsilon$

1161: whenever $\rho(\gamma_1,\gamma_2)<\delta$.

1162: We can further find $K$ such that $\hat\rho(D_K,D)<\delta$,

1163: and (\ref{eq:lemma5}) then gives,

1164: for all biinfinite $\ldots,x_{-1},y_{-1},x_{0},y_{0},x_{1},y_{1},\ldots$,

1165: \begin{multline*}

1166:   \limsup_{N\to\infty}

1167:   \Biggl(

1168:     \frac1N

1169:     \sum_{n=1}^N

1170:     \lambda(\gamma_n,y_n)

1171:     -

1172:     \frac1N

1173:     \sum_{n=1}^N

1174:     \lambda(D(\sigma_n),y_n)

1175:   \Biggr)\\

1176:   \le

1177:   \limsup_{N\to\infty}

1178:   \Biggl(

1179:     \frac1N

1180:     \sum_{n=1}^N

1181:     \lambda(\gamma_n,y_n)

1182:     -

1183:     \frac1N

1184:     \sum_{n=1}^N

1185:     \lambda(D_K(\sigma_n),y_n)

1186:   \Biggr)

1187:   +

1188:   \epsilon\\

1189:   \le

1190:   \limsup_{N\to\infty}

1191:   \left(

1192:     L^2e^L + \ln\frac{1}{q_K}

1193:   \right)

1194:   \frac{1}{\sqrt{N}}

1195:   +

1196:   \epsilon

1197:   =

1198:   \epsilon;

1199: \end{multline*}

1200: since $\epsilon$ can be arbitrarily small

1201: the WAA is CS universal.

1202:

1203: \section{Proof of Theorem \ref{thm:randomized-compact}}

1204: \label{sec:proof-randomized-compact}

1205:

1206: Let us first recall some useful facts about the probability measures

1207: on a metric compact $\Omega$

1208: (we will be following \cite{\GTPXVII}).

1209: The Banach space of all continuous real-valued functions on $\Omega$

1210: with the usual pointwise addition and scalar action

1211: and the sup norm will be denoted $C(\Omega)$.

1212: By one of the Riesz representation theorems

1213: (\cite{dudley:2002}, 7.4.1; see also 7.1.1),

1214: the mapping $\mu\mapsto I_{\mu}$,

1215: where

1216: $

1217:   I_{\mu}(f):=\int_{\Omega}f\D\mu

1218: $,

1219: is a linear isometry

1220: between the set of all finite Borel signed measures $\mu$ on $\Omega$

1221: with the total variation norm

1222: and the dual space $C'(\Omega)$ to $C(\Omega)$

1223: with the standard dual norm

1224: (\cite{rudin:1991}, Chapter 4).

1225: We will identify the finite Borel signed measures $\mu$ on $\Omega$

1226: with the corresponding $I_{\mu}\in C'(\Omega)$.

1227: This makes the set $\PPP(\Omega)$ of probability measures on $\Omega$

1228: a convex closed subset of $C'(\Omega)$.

1229:

1230: We will be interested, however,

1231: in a different topology on $C'(\Omega)$,

1232: the weakest topology for which all evaluation functionals

1233: $\mu\in C'(\Omega)\mapsto\mu(f)$, $f\in C(\Omega)$,

1234: are continuous.

1235: This topology is known as the \emph{weak${}^*$ topology}

1236: (\cite{rudin:1991}, 3.14),

1237: and the topology inherited by $\PPP(\Omega)$

1238: is known as the \emph{topology of weak convergence}

1239: (\cite{billingsley:1968}, Appendix III).

1240: The point mass $\delta_{\omega}$, $\omega\in\Omega$,

1241: is defined to be the probability measure concentrated at $\omega$,

1242: $\delta_{\omega}(\{\omega\})=1$.

1243: The simple example of a sequence of point masses $\delta_{\omega_n}$

1244: such that $\omega_n\to\omega$ as $n\to\infty$ and $\omega_n\ne\omega$ for all $n$

1245: shows that the topology of weak convergence is different from the dual norm topology:

1246: $\delta_{\omega_n}\to\delta_{\omega}$ holds in one but does not hold in the other.

1247:

1248: It is not difficult to check that $\PPP(\Omega)$ remains a closed subset of $C'(\Omega)$

1249: in the weak${}^*$ topology

1250: (\cite{bourbaki:integration}, III.2.7, Proposition 7).

1251: By the Banach--Alaoglu theorem

1252: (\cite{rudin:1991}, 3.15)

1253: $\PPP(\Omega)$ is compact in the topology of weak convergence

1254: (this is a special case of Prokhorov's theorem,

1255: \cite{billingsley:1968}, Appendix III, Theorem 6).

1256: In the rest of this paper,

1257: $\PPP(\Omega)$

1258: (and all other spaces of probability measures)

1259: are always equipped with the topology of weak convergence.

1260:

1261: Since $\Omega$ is a metric compact,

1262: $\PPP(\Omega)$ is also metrizable

1263: (by the well-known Prokhorov metric:

1264: \cite{billingsley:1968}, Appendix III, Theorem 6).

1265:

1266: Define

1267: \begin{equation}\label{eq:expected-loss}

1268:   \lambda(\gamma,y)

1269:   :=

1270:   \int_{\Gamma}

1271:   \lambda(g,y)

1272:   \gamma(\dd g),

1273: \end{equation}

1274: where $\gamma$ is a probability measure on $\Gamma$.

1275: This is the loss function in a new game of prediction

1276: with the prediction space $\PPP(\Gamma)$;

1277: it is convex in $\gamma$.

1278:

1279: Let us check that the loss function (\ref{eq:expected-loss}) is continuous.

1280: If $\gamma_n\to\gamma$ and $y_n\to y$

1281: for some $(\gamma,y)\in\PPP(\Gamma)\times\mathbf{Y}$,

1282: \begin{equation*}

1283:   \left|

1284:     \lambda(\gamma_n,y_n)

1285:     -

1286:     \lambda(\gamma,y)

1287:   \right|

1288:   \le

1289:   \left|

1290:     \lambda(\gamma_n,y_n)

1291:     -

1292:     \lambda(\gamma_n,y)

1293:   \right|

1294:   +

1295:   \left|

1296:     \lambda(\gamma_n,y)

1297:     -

1298:     \lambda(\gamma,y)

1299:   \right|

1300:   \to

1301:   0

1302: \end{equation*}

1303: (the first addend tends to zero because of the uniform continuity

1304: of $\lambda:\Gamma\times\mathbf{Y}\to\bbbr$

1305: and the second addend by the definition of the topology of weak convergence).

1306:

1307: Unfortunately,

1308: Theorem \ref{thm:deterministic-compact} cannot be applied

1309: to the new game of prediction directly:

1310: the theorem assumes that $\Gamma$ is a subset of a Banach space,

1311: whereas the dual to an infinite-dimensional Banach space is never even metrizable

1312: in the weak$^*$ topology

1313: (\cite{rudin:1991}, 3.16).

1314: The proof of Theorem \ref{thm:deterministic-compact}, however,

1315: still works for the new game.

1316:

1317: It is clear that the mixture (\ref{eq:WAA}) is a probability measure.

1318: The result of the previous section is still true,

1319: and the randomized prediction strategy (\ref{eq:WAA})

1320: produces $\gamma_n\in\PPP(\Gamma)$ that are guaranteed to satisfy

1321: \begin{equation}\label{eq:mean}

1322:   \limsup_{N\to\infty}

1323:   \left(

1324:     \frac1N

1325:     \sum_{n=1}^N

1326:     \lambda(\gamma_n,y_n)

1327:     -

1328:     \frac1N

1329:     \sum_{n=1}^N

1330:     \lambda(D(\sigma_n),y_n)

1331:   \right)

1332:   \le

1333:   0,

1334: \end{equation}

1335: for any continuous stationary randomized prediction strategy $D$.

1336: The loss function is bounded in absolute value

1337: by a constant $L$,

1338: and so the law of the iterated logarithm

1339: (see, e.g., \cite{shafer/vovk:2001}, (5.8))

1340: implies that

1341: \begin{align}

1342:   \limsup_{N\to\infty}

1343:   \frac

1344:   {

1345:     \left|

1346:       \sum_{n=1}^N

1347:       \bigl(

1348:         \lambda(g_n,y_n)

1349:         -

1350:         \lambda(\gamma_n,y_n)

1351:       \bigr)

1352:     \right|

1353:   }

1354:   {

1355:     \sqrt{2L^2N\ln\ln N}

1356:   }

1357:   &\le

1358:   1,\label{eq:LIL-1}\\

1359:   \limsup_{N\to\infty}

1360:   \frac

1361:   {

1362:     \left|

1363:       \sum_{n=1}^N

1364:       \bigl(

1365:         \lambda(d_n,y_n)

1366:         -

1367:         \lambda(D(\sigma_n),y_n)

1368:       \bigr)

1369:     \right|

1370:   }

1371:   {

1372:     \sqrt{2L^2N\ln\ln N}

1373:   }

1374:   &\le

1375:   1\label{eq:LIL-2}

1376: \end{align}

1377: with probability one.

1378: Combining the last two inequalities with (\ref{eq:mean}) gives

1379: \begin{equation*}

1380:   \limsup_{N\to\infty}

1381:   \left(

1382:     \frac1N

1383:     \sum_{n=1}^N

1384:     \lambda(g_n,y_n)

1385:     -

1386:     \frac1N

1387:     \sum_{n=1}^N

1388:     \lambda(d_n,y_n)

1389:   \right)

1390:   \le

1391:   0

1392:   \enspace

1393:   \textrm{a.s.}

1394: \end{equation*}

1395: Therefore, the WAA (applied to $D_1,D_2,\ldots$)

1396: is a CS universal randomized prediction algorithm.

1397:

1398: \section{Proof of Theorem \ref{thm:deterministic}}

1399: \label{sec:proof-deterministic}

1400:

1401: In view of Theorem \ref{thm:deterministic-compact},

1402: we only need to get rid of the assumption of compactness

1403: of $\mathbf{X}$, $\Gamma$, and $\mathbf{Y}$.

1404:

1405: \subsection*{Game of removal}

1406:

1407: The proofs of Theorems \ref{thm:deterministic} and \ref{thm:randomized}

1408: will be based on the following game

1409: (an abstract version of the ``doubling trick'',

1410: \cite{cesabianchi/lugosi:2006})

1411: played in a topological space $X$:

1412:

1413: \bigskip

1414:

1415: \noindent

1416: \textsc{Game of removal $G(X)$}\nopagebreak

1417: \begin{tabbing}

1418:   \qquad\=\qquad\=\qquad\kill

1419:   FOR $n=1,2,\dots$:\\

1420:   \> Remover announces compact $K_n\subseteq X$.\\

1421:   \> Evader announces $p_n\notin K_n$.\\

1422:   END FOR.

1423: \end{tabbing}

1424: \textbf{Winner:}

1425: Evader if the set $\left\{p_1,p_2,\ldots\right\}$ is precompact;

1426: Remover otherwise.

1427:

1428: \bigskip

1429:

1430: \noindent

1431: Intuitively,

1432: the goal of Evader is to avoid being removed to infinity.

1433: Without loss of generality

1434: we will assume that Remover always announces a non-decreasing sequence of compact sets:

1435: $K_1\subseteq K_2\subseteq\cdots$.

1436: \begin{lemma}[Gruenhage]\label{lem:Gruenhage}

1437:   Remover has a winning strategy in $G(X)$

1438:   if $X$ is a locally compact and paracompact space.

1439: \end{lemma}

1440: \begin{proof}

1441:   We will follow the proof of Theorem 4.1 in \cite{gruenhage:2006}

1442:   (the easy direction).

1443:   If $X$ is locally compact and $\sigma$-compact,

1444:   there exists a non-decreasing sequence $K_1\subseteq K_2\subseteq\cdots$

1445:   of compact sets covering $X$,

1446:   and each $K_n$ can be extended to compact $K^*_n$

1447:   so that $\Int K^*_n\supseteq K_n$

1448:   (\cite{engelking:1989}, Theorem 3.3.2).

1449:   Remover will obviously win $G(X)$ choosing $K^*_1,K^*_2,\ldots$ as his moves.

1450:

1451:   If $X$ is the sum of locally compact $\sigma$-compact spaces $X_s$, $s\in S$,

1452:   Remover plays, for each $s\in S$, the strategy described in the previous paragraph

1453:   on the subsequence of Evader's moves belonging to $X_s$.

1454:   If Evader chooses $p_n\in X_s$ for infinitely many $X_s$,

1455:   those $X_s$ will form an open cover of the closure of $\{p_1,p_2,\ldots\}$

1456:   without a finite subcover.

1457:   If $p_n$ are chosen from only finitely many $X_s$,

1458:   there will be infinitely many $p_n$ chosen from some $X_s$,

1459:   and the result of the previous paragraph can be applied.

1460:   It remains to remember that each locally compact paracompact space

1461:   can be represented as the sum of locally compact $\sigma$-compact subsets

1462:   (\cite{engelking:1989}, Theorem 5.1.27).

1463:   \qedtext

1464: \end{proof}

1465:

1466: \subsection*{Large at infinity loss functions}

1467:

1468: We will need the following useful property of large at infinity loss functions.

1469: \begin{lemma}\label{lem:loss}

1470:   Let $\lambda$ be a loss function that is large at infinity.

1471:   For each compact set $B\subseteq\mathbf{Y}$ and each constant $M$

1472:   there exists a compact set $C\subseteq\Gamma$ such that

1473:   \begin{equation}\label{eq:loss}

1474:     \forall\gamma\notin C,y\in B:

1475:     \quad

1476:     \lambda(\gamma,y)

1477:     >

1478:     M.

1479:   \end{equation}

1480: \end{lemma}

1481: \begin{proof}

1482:   For each point $y^*\in B$

1483:   fix a neighborhood $O_{y^*}\ni y^*$

1484:   and a compact set $C(y^*)\subseteq\Gamma$ such that

1485:   $\lambda\left(\Gamma\setminus C(y^*),O_{y^*}\right)\subseteq(M,\infty)$.

1486:   Since the sets $O_{y^*}$ form an open cover of $B$,

1487:   we can find this cover's finite subcover

1488:   $\{O_{y^*_1},\ldots,O_{y^*_n}\}$.

1489:   It is clear that

1490:   \begin{equation*}

1491:     C

1492:     :=

1493:     \bigcup_{j=1,\ldots,n}

1494:     C

1495:     \left(

1496:       y^*_j

1497:     \right)

1498:   \end{equation*}

1499:   satisfies (\ref{eq:loss}).

1500:   \qedtext

1501: \end{proof}

1502: In fact,

1503: the only property of large at infinity loss functions that we will be using

1504: is that in the conclusion of Lemma \ref{lem:loss}.

1505: In particular, it implies the following lemma.

1506: \begin{lemma}\label{lem:C-det}

1507:   Under the conditions of Theorem \ref{thm:deterministic},

1508:   for each compact set $B\subseteq\mathbf{Y}$

1509:   there exists a compact convex set $C=C(B)\subseteq\Gamma$

1510:   such that for each continuous stationary prediction strategy

1511:   $D:\Sigma\to\Gamma$

1512:   there exists a continuous stationary prediction strategy

1513:   $D':\Sigma\to C$

1514:   that dominates $D$ in the sense

1515:   \begin{equation}\label{eq:prediction-type}

1516:     \forall\sigma\in\Sigma,y\in B:

1517:     \quad

1518:     \lambda(D'(\sigma),y)

1519:     \le

1520:     \lambda(D(\sigma),y).

1521:   \end{equation}

1522: \end{lemma}

1523: \ifFULL\bluebegin

1524:   In fact,

1525:   we only need Lemmas \ref{lem:C-det} and \ref{lem:C-rand}

1526:   for $D':A\to C$.

1527: \blueend\fi

1528: \begin{proof}

1529:   Without loss of generality $B$ is assumed non-empty.

1530:   Fix any $\gamma_0\in\Gamma$.

1531:   Let

1532:   \begin{equation*}

1533:     M_1

1534:     :=

1535:     \sup_{y\in B}

1536:     \lambda(\gamma_0,y),

1537:   \end{equation*}

1538:   let $C_1\subseteq\Gamma$ be a compact set such that

1539:   \begin{equation*}

1540:     \forall \gamma\notin C_1,y\in B:

1541:     \quad

1542:     \lambda(\gamma,y)

1543:     >

1544:     M_1+1,

1545:   \end{equation*}

1546:   let

1547:   \begin{equation*}

1548:     M_2

1549:     :=

1550:     \sup_{(\gamma,y)\in C_1\times B}

1551:     \lambda(\gamma,y),

1552:   \end{equation*}

1553:   and let $C_2\subseteq\Gamma$ be a compact set such that

1554:   \begin{equation*}

1555:     \forall\gamma\notin C_2,y\in B:

1556:     \quad

1557:     \lambda(\gamma,y)

1558:     >

1559:     M_2+1.

1560:   \end{equation*}

1561:   It is obvious that $M_1\le M_2$ and $\gamma_0\in C_1\subseteq C_2$.

1562:   We can and will assume $C_2$ convex

1563:   (see \cite{rudin:1991}, Theorem 3.20(c)).

1564:

1565:   Let us now check that $C_1$ lies inside the interior of $C_2$.

1566:   Indeed, for any fixed $y\in B$ and $\gamma\in C_1$,

1567:   we have $\lambda(\gamma,y)\le M_2$;

1568:   since $\lambda(\gamma',y)>M_2+1$ for all $\gamma'\notin C_2$,

1569:   some neighborhood of $\gamma$ will lie completely in $C_2$.

1570:

1571:   Let $D:\Sigma\to\Gamma$

1572:   be a continuous stationary prediction strategy.

1573:   We will show that (\ref{eq:prediction-type}) holds

1574:   for some continuous stationary prediction strategy $D'$

1575:   taking values in the compact convex set $C(B):=C_2$.

1576:   Namely,

1577:   we define

1578:   \begin{multline*}

1579:     D'(\sigma)

1580:     :=\\

1581:     \begin{cases}

1582:       D(\sigma) & \text{if $D(\sigma)\in C_1$}\\

1583:       \frac{\rho(D(\sigma),\Gamma\setminus C_2)}{\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)} D(\sigma)

1584:       +\frac{\rho(D(\sigma),C_1)}{\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)} \gamma_0

1585:       & \text{if $D(\sigma)\in C_2\setminus C_1$}\\

1586:       \gamma_0 & \text{if $D(\sigma)\in \Gamma\setminus C_2$}

1587:     \end{cases}

1588:   \end{multline*}

1589:   where $\rho$ is the metric on $\Gamma$;

1590:   the denominator $\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)$

1591:   is positive since already $\rho(D(\sigma),C_1)$ is positive.

1592:   Since $C_2$ is convex,

1593:   we can see that $D'$ indeed takes values in $C_2$.

1594:   The only points $\sigma$ at which the continuity of $D'$ is not obvious

1595:   are those for which $D(\sigma)$ lies on the boundary of $C_1$:

1596:   in this case

1597:   one has to use the fact that $C_1$ is covered by the interior of $C_2$.

1598:

1599:   It remains to check (\ref{eq:prediction-type});

1600:   the only non-trivial case is $D(\sigma)\in C_2\setminus C_1$.

1601:   By the convexity of $\lambda(\gamma,y)$ in $\gamma$,

1602:   the inequality in (\ref{eq:prediction-type}) will follow from

1603:   \begin{multline*}

1604:     \frac{\rho(D(\sigma),\Gamma\setminus C_2)}{\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)}

1605:     \lambda(D(\sigma),y)\\

1606:     +\frac{\rho(D(\sigma),C_1)}{\rho(D(\sigma),C_1)+\rho(D(\sigma),\Gamma\setminus C_2)}

1607:     \lambda(\gamma_0,y)

1608:     \le

1609:     \lambda(D(\sigma),y),

1610:   \end{multline*}

1611:   i.e.,

1612:   \begin{equation*}

1613:     \lambda(\gamma_0,y)

1614:     \le

1615:     \lambda(D(\sigma),y).

1616:   \end{equation*}

1617:   Since the left-hand side of the last inequality is at most $M_1$

1618:   and its right-hand side exceeds $M_1+1$,

1619:   it holds true.

1620:   \qedtext

1621: \end{proof}

1622: \begin{remark*}

1623:   If the loss function is allowed to depend on the infinite past,

1624:   the $\sigma$s in Lemma \ref{lem:C-det} will have to be restricted

1625:   to a compact set $A\subseteq\Sigma$

1626:   and the compact set $C$ will depend not only on $B$ but also on $A$

1627:   (see Lemma 18 of \cite{\GTPXVII}).

1628: \end{remark*}

1629:

1630: \subsection*{The proof}

1631:

1632: For each compact $B\subseteq\mathbf{Y}$

1633: fix a compact convex $C(B)\subseteq\Gamma$ as in Lemma \ref{lem:C-det}.

1634: Predictor's strategy ensuring (\ref{eq:dominates-deterministic})

1635: is constructed from Remover's winning strategy in $G(\mathbf{X}\times\mathbf{Y})$

1636: (see Lemma \ref{lem:Gruenhage};

1637: metric spaces are paracompact by the Stone theorem,

1638: \cite{engelking:1989}, Theorem 5.1.3)

1639: and from Predictor's strategies $\SSS(A,B)$ outputting predictions

1640: \begin{equation}\label{eq:gamma}

1641:   \gamma_n\in C(B)

1642: \end{equation}

1643: and ensuring the consequent of (\ref{eq:dominates-deterministic})

1644: for all continuous

1645: \begin{equation}\label{eq:DABC}

1646:   D:(A\times B)^{\infty}\times A\to C(B)

1647: \end{equation}

1648: under the assumption that $(x_n,y_n)\in A\times B$

1649: for given compact $A\subseteq\mathbf{X}$ and $B\subseteq\mathbf{Y}$

1650: (the existence of such $\SSS(A,B)$

1651: is asserted in Theorem \ref{thm:deterministic-compact}).

1652: Remover's moves are assumed to be of the form $A\times B$

1653: for compact $A\subseteq\mathbf{X}$ and $B\subseteq\mathbf{Y}$.

1654: Predictor is simultaneously playing the game of removal

1655: $G(\mathbf{X}\times\mathbf{Y})$ as Evader.

1656:

1657: At the beginning of the game of prediction

1658: Predictor asks Remover to make his first move $A_1\times B_1$ in the game of removal;

1659: without loss of generality

1660: we assume that $A_1\times B_1$ contains all $(x_n,y_n)$, $n\le0$

1661: (there is nothing to prove if $\{(x_n,y_n)\st n\le0\}$ is not precompact).

1662: Predictor then plays the game of prediction using the strategy $\SSS(A_1,B_1)$

1663: until Reality chooses $(x_n,y_n)\notin A_1\times B_1$

1664: (forever if Reality never chooses such $(x_n,y_n)$).

1665: As soon as such $(x_n,y_n)$ is chosen,

1666: Predictor announces $(x_n,y_n)$ in the game of removal

1667: and notes Remover's response $A_2\times B_2$.

1668: He then continues playing the game of prediction using the strategy $\SSS(A_2,B_2)$

1669: until Reality chooses $(x_n,y_n)\notin A_2\times B_2$,

1670: etc.

1671:

1672: Let us check that this strategy for Predictor

1673: will always ensure (\ref{eq:dominates-deterministic}).

1674: If Reality chooses $(x_n,y_n)$ outside Predictor's current $A_k\times B_k$

1675: finitely often,

1676: the consequent of (\ref{eq:dominates-deterministic}) will be satisfied

1677: for all continuous stationary $D:\Sigma\to C(B_K)$

1678: ($B_K$ being the second factor of Remover's last move $A_K\times B_K$)

1679: and so, by Lemma \ref{lem:C-det},

1680: for all continuous stationary $D:\Sigma\to\Gamma$.

1681: If Reality chooses $(x_n,y_n)$ outside Predictor's current $A_k\times B_k$

1682: infinitely often,

1683: the set of $(x_n,y_n)$, $n=1,2,\ldots$, will not be precompact,

1684: and so the antecedent of (\ref{eq:dominates-deterministic}) will be violated.

1685:

1686: \section{Proof of Theorem \ref{thm:randomized}}

1687: \label{sec:proof-randomized}

1688:

1689: When $\gamma$ ranges over $\PPP(C)$

1690: (identified with the subset of $\PPP(\Gamma)$

1691: consisting of the measures concentrated on $C$)

1692: for a compact $C\subseteq\Gamma$,

1693: the loss function (\ref{eq:expected-loss}),

1694: as we have seen, is continuous.

1695: The following analogue of Lemma \ref{lem:C-det} will be useful.

1696: \begin{lemma}\label{lem:C-rand}

1697:   Under the conditions of Theorem \ref{thm:randomized},

1698:   for each compact set $B\subseteq\mathbf{Y}$

1699:   there exists a compact convex set $C=C(B)\subseteq\Gamma$

1700:   such that for each continuous stationary randomized prediction strategy

1701:   $D:\Sigma\to\PPP(\Gamma)$

1702:   there exists a continuous stationary randomized prediction strategy

1703:   $D':\Sigma\to\PPP(C)$

1704:   such that (\ref{eq:prediction-type}) holds

1705:   ($D'$ dominates $D$ ``on average'').

1706: \end{lemma}

1707: (In fact, this lemma is not needed

1708: for the proof of Theorem \ref{thm:randomized} as we stated it,

1709: but it will imply that $\gamma_n$ dominate $D(\sigma_n)$ on average,

1710: for any continuous stationary randomized prediction strategy $D$:

1711: see (\ref{eq:stage-K}).)

1712: \begin{proof}

1713:   Define $\gamma_0$, $M_1$, $C_1$, $M_2$, and $C_2$

1714:   as in the proof of Lemma \ref{lem:C-det}.

1715:   Fix a continuous function $f_1:\Gamma\to[0,1]$ such that $f_1=1$ on $C_1$

1716:   and $f_1=0$ on $\Gamma\setminus C_2$

1717:   (such an $f_1$ exists by the Tietze--Urysohn theorem,

1718:   \cite{engelking:1989}, Theorem 2.1.8).

1719:   Set $f_2:=1-f_1$.

1720:   Let $D:\Sigma\to\PPP(\Gamma)$ be a continuous stationary randomized prediction strategy.

1721:   For each $\sigma\in\Sigma$,

1722:   split $D(\sigma)$ into two measures on $\Gamma$

1723:   absolutely continuous with respect to $D(\sigma)$:

1724:   $D_1(\sigma)$ with Radon--Nikodym density $f_1$

1725:   and $D_2(\sigma)$ with Radon--Nikodym density $f_2$;

1726:   set

1727:   \begin{equation*}

1728:     D'(\sigma)

1729:     :=

1730:     D_1(\sigma)

1731:     +

1732:     \left|D_2(\sigma)\right|

1733:     \delta_{\gamma_0}

1734:   \end{equation*}

1735:   (letting $\left|P\right|:=P(\Gamma)$ for $P$ a measure on $\Gamma$).

1736:   It is clear that the stationary randomized prediction strategy $D'$ is continuous

1737:   (in the topology of weak convergence, as usual),

1738:   takes values in $\PPP(C_2)$,

1739:   and

1740:   \begin{multline*}

1741:     \lambda(D'(\sigma),y)

1742:     =

1743:     \int_{\Gamma}

1744:       \lambda(\gamma,y)

1745:       f_1(\gamma)

1746:     D(\sigma)(\dd\gamma)

1747:     +

1748:     \lambda(\gamma_0,y)

1749:     \int_{\Gamma}

1750:       f_2(\gamma)

1751:     D(\sigma)(\dd\gamma)\\

1752:     \le

1753:     \int_{\Gamma}

1754:       \lambda(\gamma,y)

1755:       f_1(\gamma)

1756:     D(\sigma)(\dd\gamma)

1757:     +

1758:     \int_{\Gamma}

1759:       M_1

1760:       f_2(\gamma)

1761:     D(\sigma)(\dd\gamma)\\

1762:     \le

1763:     \int_{\Gamma}

1764:       \lambda(\gamma,y)

1765:       f_1(\gamma)

1766:     D(\sigma)(\dd\gamma)

1767:     +

1768:     \int_{\Gamma}

1769:       \lambda(\gamma,y)

1770:       f_2(\gamma)

1771:     D(\sigma)(\dd\gamma)

1772:     =

1773:     \lambda(D(\sigma),y)

1774:   \end{multline*}

1775:   for all $(\sigma,y)\in\Sigma\times B$.

1776:   So we can take $C(B):=C_2$.

1777:   \qedtext

1778: \end{proof}

1779: Fix one of the mappings $B\mapsto C(B)$

1780: whose existence is asserted by the lemma.

1781:

1782: We will prove that the prediction strategy of the previous section

1783: with (\ref{eq:gamma}) replaced by

1784: $

1785:   \gamma_n\in\PPP(C(B))

1786: $

1787: and (\ref{eq:DABC}) replaced by

1788: \begin{equation*}

1789:   D:(A\times B)^{\infty}\times A\to\PPP(C(B))

1790: \end{equation*}

1791: is CS universal.

1792: Let $D:\Sigma\to\PPP(\Gamma)$ be a continuous stationary randomized prediction strategy,

1793: i.e., a continuous stationary prediction strategy

1794: in the new game of prediction with loss function (\ref{eq:expected-loss}).

1795: Let $(A_K,B_K)$ be Remover's last move

1796: (if Remover makes infinitely many moves,

1797: the antecedent of (\ref{eq:dominates-randomized}) is false,

1798: and there is nothing to prove),

1799: and let $D':\Sigma\to\PPP(C(B_K))$ be a continuous stationary randomized prediction strategy

1800: satisfying (\ref{eq:prediction-type}) with $B:=B_K$.

1801: From some $n$ on

1802: our randomized prediction algorithm produces $\gamma_n\in\PPP(\Gamma)$

1803: concentrated on $C(B_K)$,

1804: and they will satisfy

1805: \begin{multline}\label{eq:stage-K}

1806:   \limsup_{N\to\infty}

1807:   \left(

1808:     \frac1N

1809:     \sum_{n=1}^N

1810:     \lambda(\gamma_n,y_n)

1811:     -

1812:     \frac1N

1813:     \sum_{n=1}^N

1814:     \lambda(D(\sigma_n),y_n)

1815:   \right)\\

1816:   \le

1817:   \limsup_{N\to\infty}

1818:   \left(

1819:     \frac1N

1820:     \sum_{n=1}^N

1821:     \lambda(\gamma_n,y_n)

1822:     -

1823:     \frac1N

1824:     \sum_{n=1}^N

1825:     \lambda(D'(\sigma_n),y_n)

1826:   \right)

1827:   \le

1828:   0.

1829: \end{multline}

1830: This is an interesting property

1831: but slightly different from what Theorem \ref{thm:randomized} asserts.

1832:

1833: According to the proof of Lemma \ref{lem:C-rand},

1834: we can, and we will, assume that $D'(\sigma_n)$

1835: generates outcomes $d'_n$ in two steps:

1836: first $d_n$ is generated from $D(\sigma_n)$,

1837: and then it is replaced by $\gamma_0$ with probability $f_2(d_n)$.

1838: The loss function is bounded in absolute value

1839: on the compact set

1840: $C(B_K)\times B_K$ by a constant $L$.

1841: From the law of the iterated logarithm

1842: (see (\ref{eq:LIL-1}) and (\ref{eq:LIL-2}))

1843: applied to the losses of $\gamma_n$ and $d'_n$

1844: we now obtain,

1845: instead of (\ref{eq:stage-K}),

1846: \begin{multline*}

1847:   \limsup_{N\to\infty}

1848:   \left(

1849:     \frac1N

1850:     \sum_{n=1}^N

1851:     \lambda(g_n,y_n)

1852:     -

1853:     \frac1N

1854:     \sum_{n=1}^N

1855:     \lambda(d_n,y_n)

1856:   \right)\\

1857:   \le

1858:   \limsup_{N\to\infty}

1859:   \left(

1860:     \frac1N

1861:     \sum_{n=1}^N

1862:     \lambda(g_n,y_n)

1863:     -

1864:     \frac1N

1865:     \sum_{n=1}^N

1866:     \lambda(d'_n,y_n)

1867:   \right)\\

1868:   =

1869:   \limsup_{N\to\infty}

1870:   \left(

1871:     \frac1N

1872:     \sum_{n=1}^N

1873:     \lambda(\gamma_n,y_n)

1874:     -

1875:     \frac1N

1876:     \sum_{n=1}^N

1877:     \lambda(D'(\sigma_n),y_n)

1878:   \right)

1879:   \le

1880:   0

1881:   \enspace

1882:   \textrm{a.s.};

1883: \end{multline*}

1884: it remains to compare this with (\ref{eq:dominates-randomized}).

1885:

1886: \section{Stationarity and continuity}

1887: \label{sec:stationarity}

1888:

1889: As we said earlier,

1890: the assumption of stationarity is very natural

1891: for prediction strategies:

1892: it just means that the arbitrary origin of time is not taken into account

1893: (in the spirit of the invariance principle in statistics;

1894: see, e.g., \cite{lehmann:1986}, Section 6.1).

1895: Stationary strategies can detect and make use of all kinds of trends

1896: and one-off phenomena;

1897: e.g.,

1898: they can perform well when the rate of environment change is constantly increasing

1899: (as in our own environment).

1900: There need not be stationarity in the environment.

1901:

1902: Interestingly,

1903: our prediction algorithms are continuous (or can be made continuous)

1904: but not stationary.

1905: First we discuss the continuity

1906: of the prediction algorithms

1907: constructed in the proofs of our four theorems.

1908: \begin{description}

1909: \item[Theorem \ref{thm:deterministic-compact}]

1910:   It is easy to check that the WAA is continuous;

1911:   by the Weierstrass $M$-test,

1912:   (\ref{eq:WAA}) converges uniformly

1913:   and so its sum is continuous.

1914: \item[Theorem \ref{thm:randomized-compact}]

1915:   To check that $\gamma_n$ is a continuous function of

1916:   $\sigma_n$ in the topology of weak convergence,

1917:   we only need to check that $\int f\D\gamma_n$ is a continuous function of $\sigma_n$

1918:   for each $f\in C(\Gamma)$.

1919:   This again follows from the Weierstrass $M$-test.

1920: \item[Theorem \ref{thm:deterministic}]

1921:   As described,

1922:   Predictor's strategy is not continuous

1923:   since his behavior changes suddenly when Reality outputs $(x_n,y_n)$

1924:   outside his current $A_k\times B_k$,

1925:   but it is clear that it can be ``smoothed around the edges''

1926:   to ensure continuity.

1927: \item[Theorem \ref{thm:randomized}]

1928:   The situation is analogous to Theorem \ref{thm:deterministic}.

1929: \end{description}

1930:

1931: For concreteness,

1932: we will discuss stationarity only in the case of Theorem \ref{thm:deterministic-compact}.

1933: We know that the WAA is a prediction strategy that is continuous

1934: as a function of the type $\Sigma\times\{1,2,\ldots\}\to\Gamma$.

1935: It is not stationary

1936: (i.e., we cannot get rid of the $\{1,2,\ldots\}$)

1937: because it has to keep track of the experts' losses

1938: since the beginning of the game of prediction.

1939: Stationary strategies can depend on time only in a limited way:

1940: e.g., in terms of our own environment,

1941: they can depend on the time of day or the season.

1942: But the WAA's dependence is much heavier:

1943: it has to know precisely the time that has elapsed since the beginning.

1944:

1945: Let us now check that

1946: there are no universal continuous stationary prediction strategies

1947: under the conditions of Theorem \ref{thm:deterministic-compact}.

1948: Suppose $\Gamma$ is such that there exists a continuous $f:\Gamma\to\Gamma$

1949: without fixed points

1950: (i.e., $f(\gamma)\ne\gamma$ for all $\gamma\in\Gamma$;

1951: we can take, e.g., a circle as $\Gamma$).

1952: If $D$ were a universal continuous stationary strategy,

1953: we could define another continuous stationary strategy $D'(\sigma):=f(D(\sigma))$

1954: and make Reality collude with $D'$

1955: (i.e., output $y_n$ leading to a significantly smaller loss for $D'$;

1956: this can be done for an appropriate choice of $\lambda$,

1957: and in fact can be done for all usual $\lambda$).

1958:

1959: \iffalse

1960: In conclusion let us check that,

1961: for a wide class of loss function $\lambda$

1962: there are no universal continuous stationary prediction strategies.

1963: Indeed,

1964: suppose that for some $\gamma_1,\gamma_2\in\Gamma$ and $y_1,y_2\in\mathbf{Y}$,

1965: \begin{align*}

1966:   \lambda(\gamma_1,y_1)

1967:   &<

1968:   \lambda(\gamma_2,y_1)\\

1969:   \lambda(\gamma_2,y_2)

1970:   &<

1971:   \lambda(\gamma_1,y_2)\\

1972:   \inf_{\gamma\in\Gamma}

1973:   \max_{i=1,2}

1974:   \left(

1975:     \lambda(\gamma,y_i)

1976:     -

1977:     \lambda(\gamma_i,y_i)

1978:   \right)

1979:   &>

1980:   0

1981: \end{align*}

1982: (the first condition means that $\gamma_1$ is the ``right'' prediction for $y_1$,

1983: the second condition that $\gamma_2$ is the ``right'' prediction for $y_2$,

1984: and the third condition is that no $\gamma\in\Gamma$

1985: can simultaneously compete with $\gamma_1$ on $y_1$ and with $\gamma_2$ on $y_2$);

1986: this is a mild condition satisfied for the standard loss functions.

1987: Did not work.

1988: \fi

1989:

1990: \subsection*{Stationary Reality}

1991:

1992: A standard problem in probability theory is where Reality

1993: is governed by a stationary probability measure;

1994: of course, only stationary prediction strategies are considered.

1995: In this subsection we will list several references

1996: for this problem,

1997: considering, for simplicity, only the case where the signals $x_n$ are absent

1998: (formally, we assume that $\mathbf{X}$ is a one-element set

1999: and omit the $x_n$, which now do not carry any information, from our notation).

2000:

2001: The problem of prediction has been studied extensively

2002: for both strictly stationary sequences of observations

2003: and wide sense stationary sequences

2004: (the definitions and a general discussion of ``strict sense'' and ``wide sense'' concepts

2005: can be found in \cite{doob:1953}, Chapter 2, Sections 8 and 3).

2006: We will first assume that $\ldots,y_{-1},y_0,y_1,\ldots$

2007: form a wide sense stationary sequence of random variables

2008: and then a strictly stationary sequence.

2009:

2010: The natural mode of prediction for wide sense stationary sequences

2011: is linear prediction.

2012: The problem of linear prediction

2013: (not necessarily one-step-ahead, as in this paper)

2014: of wide sense stationary sequences

2015: was posed and solved by Kolmogorov

2016: \cite{kolmogorov:1939,\KolmogorovCRfull,\KolmogorovStationary};

2017: later but independently this was done by Wiener

2018: \cite{wiener:1949}.

2019:

2020: Kolmogorov and Wiener assumed the probability distribution of the observations known.

2021: There are many efficient ways to estimate the spectral density of this probability distribution

2022: (in terms of which the optimal linear predictor is expressed);

2023: see, e.g., \cite{anderson:1971}, Chapter 9, for a review.

2024: (An early idea of spectral estimation was proposed by Einstein in 1914:

2025: see \cite{newton:2002}, p.~363.)

2026:

2027: The problem of existence of universal prediction strategies

2028: for strictly stationary and ergodic sequences of observations

2029: was posed by Cover \cite{cover:1975},

2030: and such strategies were found by Ornstein \cite{ornstein:1978}

2031: for finite $\mathbf{Y}$

2032: and Algoet \cite{algoet:1992} for $\mathbf{Y}$ a Polish space.

2033: Papers \cite{gyorfi/etal:1999,gyorfi/lugosi:2001,nobel:2003}

2034: construct such strategies

2035: using techniques very similar to those of this paper.

2036:

2037: \section{Conclusion}

2038: \label{sec:conclusion}

2039:

2040: An interesting direction of further research

2041: is to obtain non-asymptotic versions of our results.

2042: If the benchmark class of continuous stationary prediction strategies

2043: is compact,

2044: loss bounds can be given in terms of $\epsilon$-entropy

2045: \cite{\KolmogorovTikhomirov}.

2046: In general,

2047: one can give loss bounds in terms of a nested family

2048: of compact sets

2049: whose union is dense in the set of continuous stationary prediction strategies

2050: (in analogy with Vapnik and Chervonenkis's principle

2051: of structural risk minimization \cite{vapnik:1998}).

2052:

2053: \ifFULL\bluebegin

2054:   It would be interesting to explore unconditional continuous predictive complexity

2055:   in the simplest case without $x$s and with $\mathbf{Y}=\{0,1\}$

2056:   (and with the log loss or the square loss function).

2057: \blueend\fi

2058:

2059: \subsection*{Acknowledgments}

2060:

2061: I am grateful to Yura Kalnishkan and Ilia Nouretdinov

2062: for useful comments.

2063: The construction of CS universal prediction strategies

2064: is based on Alex Smola's and G\'abor Lugosi's suggestions.

2065: This work was partially supported by MRC (grant S505/65).

2066:

2067: \begin{thebibliography}{10}

2068:

2069: \bibitem{algoet:1992}

2070: Paul~H\DOT{} Algoet.

2071: \newblock Universal schemes for prediction, gambling and portfolio selection.

2072: \newblock {\em Annals of Probability}, 20:901--941, 1992.

2073: \newblock Corrections: 23:474--478, 1995.

2074:

2075: \bibitem{anderson:1971}

2076: T\DOT{}~W\DOT{} Anderson.

2077: \newblock {\em The Statistical Analysis of Time Series}.

2078: \newblock Wiley, New York, 1971.

2079: \newblock Wiley Classics Library edition: 1994.

2080:

2081: \bibitem{billingsley:1968}

2082: Patrick Billingsley.

2083: \newblock {\em Convergence of Probability Measures}.

2084: \newblock Wiley, New York, 1968.

2085:

2086: \bibitem{blum/etal:1998}

2087: Lenore Blum, Felipe Cucker, Michael Shub, and Steve Smale.

2088: \newblock {\em Complexity and Real Computation}.

2089: \newblock Springer, New York, 1998.

2090:

2091: \bibitem{blum/etal:1989}

2092: Lenore Blum, Michael Shub, and Steve Smale.

2093: \newblock On a theory of computation and complexity over the real numbers:

2094:   {NP}-completeness, recursive functions and universal machines.

2095: \newblock {\em Bulletin of the American Mathematical Society}, 21:1--46, 1989.

2096:

2097: \bibitem{bourbaki:integration}

2098: Nicolas Bourbaki.

2099: \newblock {\em El\'ements de math\'ematique, Livre VI, Int\'egration, Chapitres

2100:   1 \`a 4}.

2101: \newblock Hermann, Paris, first edition, 1952.

2102:

2103: \bibitem{cesabianchi/lugosi:2006}

2104: Nicol\`o Cesa-Bianchi and G\'abor Lugosi.

2105: \newblock {\em Prediction, Learning, and Games}.

2106: \newblock Cambridge University Press, Cambridge, 2006.

2107:

2108: \bibitem{cover:1975}

2109: Tom~M\DOT{} Cover.

2110: \newblock Open problems in information theory.

2111: \newblock In {\em Moscow Information Theory Workshop}, New York, 1975. IEEE

2112:   Press.

2113:

2114: \bibitem{doob:1953}

2115: Joseph~L\DOT{} Doob.

2116: \newblock {\em Stochastic Processes}.

2117: \newblock Wiley, New York, 1953.

2118:

2119: \bibitem{dudley:2002}

2120: Richard~M\DOT{} Dudley.

2121: \newblock {\em Real Analysis and Probability}, volume~74 of {\em Cambridge

2122:   Studies in Advanced Mathematics}.

2123: \newblock Cambridge University Press, Cambridge, England, 2002.

2124: \newblock Originally published in 1989.

2125:

2126: \bibitem{engelking:1989}

2127: Ryszard Engelking.

2128: \newblock {\em General Topology}, volume~6 of {\em Sigma Series in Pure

2129:   Mathematics}.

2130: \newblock Heldermann, Berlin, second edition, 1989.

2131:

2132: \bibitem{gruenhage:2006}

2133: Gary Gruenhage.

2134: \newblock The story of a topological game.

2135: \newblock {\em Rocky Mountain Journal of Mathematics}, 2006.

2136: \newblock To appear.

2137:

2138: \bibitem{gyorfi/lugosi:2001}

2139: L\'aszl\'o Gy\"orfi and G\'abor Lugosi.

2140: \newblock Strategies for sequential prediction of stationary time series.

2141: \newblock In Moshe Dror, Pierre L'Ecuyer, and Ferenc Szidarovszky, editors,

2142:   {\em Modeling Uncertainty: An Examination of its Theory, Methods, and

2143:   Applications}. Kluwer, 2001.

2144:

2145: \bibitem{gyorfi/etal:1999}

2146: L\'aszl\'o Gy\"orfi, G\'abor Lugosi, and G\DOT{} Morvai.

2147: \newblock A simple randomized algorithm for consistent sequential prediction of

2148:   ergodic time series.

2149: \newblock {\em IEEE Transactions on Information Theory}, 45:2642--2650, 1999.

2150:

2151: \bibitem{hardy/etal:1952}

2152: G\DOT{}~H\DOT{} Hardy, John~E\DOT{} Littlewood, and George P\'olya.

2153: \newblock {\em Inequalities}.

2154: \newblock Cambridge University Press, Cambridge, second edition, 1952.

2155:

2156: \bibitem{kalnishkan/vyugin:2005}

2157: Yuri Kalnishkan and Michael~V\DOT{} Vyugin.

2158: \newblock The {W}eak {A}ggregating {A}lgorithm and weak mixability.

2159: \newblock In Peter Auer and Ron Meir, editors, {\em Proceedings of the

2160:   Eighteenth Annual Conference on Learning Theory}, volume 3559 of {\em Lecture

2161:   Notes in Computer Science}, pages 188--203, Berlin, 2005. Springer.

2162: \newblock The journal version is being prepared for the Special Issue of

2163:   \emph{Journal of Machine Learning Research} devoted to COLT'2005; all

2164:   references are to the journal version.

2165:

2166: \bibitem{kolmogorov:1939}

2167: Andrei~N\DOT{} Kolmogorov.

2168: \newblock Sur l'interpolation et extrapolation des suites stationnaires.

2169: \newblock {\em Comptes rendus des S\'eances de l'Acad\'emie des Sciences},

2170:   208:2043--2045, 1939.

2171:

2172: \bibitem{kolmogorov:1941CR-latin}

2173: Andrei~N\DOT{} Kolmogorov.

2174: \newblock Interpolation and extrapolation of stationary random sequences (in

2175:   {R}ussian).

2176: \newblock {\em Izvestiya AN SSSR. Mathematics series}, 5:3--14, 1941.

2177:

2178: \bibitem{kolmogorov:1941-latin}

2179: Andrei~N\DOT{} Kolmogorov.

2180: \newblock Stationary sequences in {H}ilbert space (in {R}ussian).

2181: \newblock {\em Byulleten' MGU. Mathematics}, 2(6):1--40, 1941.

2182:

2183: \bibitem{kolmogorov/tikhomirov:1959latin}

2184: Andrei~N\DOT{} Kolmogorov and Vladimir~M\DOT{} Tikhomirov.

2185: \newblock $\epsilon$-entropy and $\epsilon$-capacity of sets in functional

2186:   spaces (in {R}ussian).

2187: \newblock {\em Uspekhi Matematicheskikh Nauk}, 14(2):3--86, 1959.

2188:

2189: \bibitem{lehmann:1986}

2190: E\DOT{}~L\DOT{} Lehmann.

2191: \newblock {\em Testing Statistical Hypotheses}.

2192: \newblock Springer, New York, second edition, 1986.

2193:

2194: \bibitem{martin-lof:1970}

2195: Per Martin-L\"of.

2196: \newblock {\em Notes on Constructive Mathematics}.

2197: \newblock Almqvist \& Wiksell, Stockholm, 1970.

2198:

2199: \bibitem{naimpally/warrack:1970}

2200: Som~A\DOT{} Naimpally and Brian~D\DOT{} Warrack.

2201: \newblock {\em Proximity Spaces}, volume~59 of {\em Cambridge Tracts in

2202:   Mathematics and Mathematical Physics}.

2203: \newblock Cambridge University Press, London, 1970.

2204:

2205: \bibitem{newton:2002}

2206: H\DOT{}~Joseph Newton.

2207: \newblock A conversation with {E}manuel {P}arzen.

2208: \newblock {\em Statistical Science}, 17:357--378, 2002.

2209:

2210: \bibitem{nobel:2003}

2211: Andrew~B\DOT{} Nobel.

2212: \newblock On optimal sequential prediction for general processes.

2213: \newblock {\em IEEE Transactions on Information Theory}, 49:83--98, 2003.

2214:

2215: \bibitem{ornstein:1978}

2216: D\DOT{}~S\DOT{} Ornstein.

2217: \newblock Guessing the next output of a stationary process.

2218: \newblock {\em Israel Journal of Mathematics}, 30:292--296, 1978.

2219:

2220: \bibitem{rudin:1991}

2221: Walter Rudin.

2222: \newblock {\em Functional Analysis}.

2223: \newblock McGraw-Hill, Boston, second edition, 1991.

2224:

2225: \bibitem{shafer/vovk:2001}

2226: Glenn Shafer and \Vladimir{} Vovk.

2227: \newblock {\em Probability and Finance: It's Only a Game!}

2228: \newblock Wiley, New York, 2001.

2229:

2230: \bibitem{vapnik:1998}

2231: Vladimir~N\DOT{} Vapnik.

2232: \newblock {\em Statistical Learning Theory}.

2233: \newblock Wiley, New York, 1998.

2234:

2235: \bibitem{vovk:1990}

2236: \Vladimir{} Vovk.

2237: \newblock Aggregating strategies.

2238: \newblock In Mark Fulk and John Case, editors, {\em Proceedings of the Third

2239:   Annual Workshop on Computational Learning Theory}, pages 371--383, San Mateo,

2240:   CA, 1990. Morgan Kaufmann.

2241:

2242: \bibitem{vovk:2001competitive}

2243: Vladimir Vovk.

2244: \newblock Competitive on-line statistics.

2245: \newblock {\em International Statistical Review}, 69:213--248, 2001.

2246:

2247: \bibitem{GTP17arXiv}

2248: \Vladimir{} Vovk.

2249: \newblock Predictions as statements and decisions.

2250: \newblock Technical Report \texttt{arXiv:cs.LG/0606093}, \texttt{arXiv.org}

2251:   e-Print archive, June 2006.

2252:

2253: \bibitem{wiener:1949}

2254: Norbert Wiener.

2255: \newblock {\em Extrapolation, Interpolation, and Smoothing of Stationary Time

2256:   Series with Engineering Applications}.

2257: \newblock Technology Press of the Massachusetts Institute of Technology,

2258:   Cambridge, MA, 1949.

2259: \newblock Reprinted from a secret 1942 publication.

2260:

2261: \end{thebibliography}

2262:

2263: \ifWP

2264:   \DFlastpage

2265: \fi

2266: \end{document}

2267: