0810:0810.5288/GL2.tex

1:

2:

3: \documentclass[aos, reqno, preprint]{imsart}%

4: \RequirePackage{amsthm, amsmath, natbib, amsfonts, amssymb}%

5: \RequirePackage[OT1]{fontenc}%

6: \usepackage{graphicx, color}%

7: \usepackage{tikz}%

8: \usepackage{natbib}%

9:

10:

11: \numberwithin{equation}{section}%

12: \theoremstyle{plain}%

13: % \newtheorem{theorem}{Theorem}[section]

14:

15: \definecolor{darkblue}{rgb}{0.0,0.0,0.7}

16:

17: \RequirePackage[%

18: colorlinks = true,%

19: linkcolor = darkblue,%

20: citecolor = darkblue,%

21: urlcolor = darkblue, %

22: ]{hyperref}%

23:

24:

25: \hypersetup{%

26:   pdfauthor = {St\'ephane Ga\"iffas, Guillaume Lecu\'e},%

27:   pdftitle = {Adaptive estimation of the regression with an assumption

28:     free design},%

29:   pdfcreator = {pdflatex},%

30:   pdfproducer = {pdflatex}}

31:

32: \startlocaldefs

33:

34: \def \egal {\stackrel{{\rm def}}{=}}

35:

36: \newcommand \cA{{\cal A}}

37: \newcommand \cB{{\cal B}}

38: \newcommand \cC{{\cal C}}

39: \newcommand \cD{{\cal D}}

40: \newcommand \cE{{\cal E}}

41: \newcommand \cF{{\cal F}}

42: \newcommand \cG{{\cal G}}

43: \newcommand \cH{{\cal H}}

44: \newcommand \cI{{\cal I}}

45: \newcommand \cL{{\cal L}}

46: \newcommand \cM{{\cal M}}

47: \newcommand \cN{{\cal N}}

48: \newcommand \cO{{\cal O}}

49: \newcommand \cP{{\cal P}}

50: \newcommand \cR{{\cal R}}

51: \newcommand \cQ{{\cal Q}}

52: \newcommand \cS{{\cal S}}

53: \newcommand \cU{{\cal U}}

54: \newcommand \cX{{\cal X}}

55: \newcommand \cY{{\cal Y}}

56: \newcommand \cZ{{\cal Z}}

57: \newcommand{\smin}{s_{\min}}%

58: \newcommand{\smax}{s_{\max}}

59:

60: \newcommand \R{{\mathbb  R}}

61: \newcommand \E{{\mathbb  E}}

62: \newcommand \V{{\mathbb  V}}

63:

64: \newcommand{\T}{^{\top}}%

65: \newcommand{\var}{\text{Var}}%

66: \newcommand{\prodsca}[2]{\langle #1,#2 \rangle}%

67: \newcommand{\norm}[1]{\|#1\|}%

68: \newcommand{\ind}[1]{\mathbf 1_{#1}}%

69: \newcommand{\mb}{\mathbf}

70: \newcommand{\sumin}{\sum_{i=1}^n}

71: \newcommand{\sumim}{\sum_{i=1}^m}

72: \newcommand{\bs}{\boldsymbol}

73:

74: \newcommand{\grad}{\triangledown}

75:

76: \DeclareMathOperator*{\supp}{Supp}

77:

78: \DeclareMathOperator{\limInf}{liminf}

79: \DeclareMathOperator{\limSup}{limsup}

80:

81: \DeclareMathOperator*{\argmin}{argmin}

82: \DeclareMathOperator*{\argmax}{argmax}

83: \DeclareMathOperator{\pen}{pen}

84:

85: \DeclareMathOperator{\diag}{diag}

86: \DeclareMathOperator{\Span}{span}

87:

88:

89: \newcommand{\1}{{\rm 1}\kern-0.24em{\rm I}}

90: \newcommand{\hfn}{{\hat{f}_n}}

91: \renewcommand{\hat}{\widehat}

92:

93: % \newtheorem{theo}{Theorem}%

94: \newtheorem{theorem}{Theorem}%

95: \newtheorem{corollary}{Corollary}%

96: \newtheorem{lemma}{Lemma}%

97: \newtheorem{proposition}{Proposition}%

98: % \newtheorem*{assumption}{Assumption}%

99: \theoremstyle{remark}%

100: \newtheorem*{remark}{Remark}%

101: \newtheorem{definition}{Definition}%

102: \newtheorem*{assumption}{Assumption}%

103: \newtheorem{example}{Example}%

104:

105:

106: \endlocaldefs

107:

108:

109: % \linespread{1.4}

110:

111:

112: \begin{document}

113:

114: \begin{frontmatter}

115:

116:   \title{Aggregation of penalized empirical risk minimizers in

117:     regression}%

118:   \runtitle{Aggregation of penalized empirical risk minimizers}

119:

120:   \begin{aug}

121:     \author{\fnms{St\'ephane} \snm{Ga\"iffas}

122:       \ead[label=e1]{stephane.gaiffas@upmc.fr}} and

123:     \author{\fnms{ Guillaume} \snm{Lecu\'e}

124:       \ead[label=e2]{lecue@latp.univ-mrs.fr}}

125:

126:     \runauthor{S. Ga\"iffas and G. Lecu\'e} \affiliation{Universit\'e

127:       Paris~6 and CNRS, LATP Marseille}

128:

129:     \address{Universit\'e Paris 6  \\

130:       Laboratoire de Statistique Th\'eorique et Appliqu\'ee \\

131:       175 rue du Chevaleret \\

132:       75013 Paris \\

133:       \printead{e1}}

134:

135:     \address{Laboratoire d'Analyse, Topologie et Probabilit\'es\\

136:       Centre de Math\'ematiques et Informatique\\

137:       Technop\^ole de Ch\^ateau-Gombert\\

138: 	  39 rue F. Joliot Curie\\

139: 	  13453 Marseille Cedex 13\\

140: 	  France\\

141:       \printead{e2}}

142:   \end{aug}

143:

144:   \begin{abstract}

145:     We give a general result concerning the rates of convergence of

146:     penalized empirical risk minimizers (PERM) in the regression

147:     model. Then, we consider the problem of agnostic learning of the

148:     regression, and give in this context an oracle inequality and a

149:     lower bound for PERM over a finite class. These results hold for a

150:     general multivariate random design, the only assumption being the

151:     compactness of the support of its law (allowing discrete

152:     distributions for instance). Then, using these results, we

153:     construct adaptive estimators. We consider as examples adaptive

154:     estimation over anisotropic Besov spaces or reproducing kernel

155:     Hilbert spaces. Finally, we provide empirical evidence that

156:     aggregation leads to more stable estimators than more standard

157:     cross-validation or generalized cross-validation methods for the

158:     selection of the smoothing parameter, when the number of

159:     observations is small.

160:     % estimators which are Our aggregation

161:     % approach is motivated by a lower bound for PERM procedures over

162:     % a finite set of weak estimators, which proves that PERM

163:     % procedures are suboptimal compared to some exponential weighted

164:     % averaged schemes.

165:     % We propose an adaptive estimator of the multivariate regression

166:     % function $f_0$ from i.i.d. observations. Without assumption on

167:     % the law $P_X$ of the covariates, besides almost sure

168:     % boundedness, we prove that the standard rate $n^{-s / (2s + 1)}$

169:     % can be achieved by an adaptive estimator, where $n$ denotes the

170:     % sample size and $s$ the smoothness of $f_0$ measured in some

171:     % sense, including Besov smoothness. The assumption on the noise

172:     % is fairly general.

173:   \end{abstract}

174:

175: \begin{keyword}[class=AMS]

176:   \kwd[Primary ]{62G08}

177:   \kwd[; secondary ]{62H12}

178: \end{keyword}

179:

180: \begin{keyword}

181:   \kwd{Nonparametric regression, agnostic learning, aggregation,

182:     adaptive estimation, random design, anisotropic Besov space,

183:     reproducing kernel Hilbert spaces}

184: \end{keyword}

185:

186: \end{frontmatter}

187:

188:

189: \section{Introduction}

190: \label{sec:introduction}

191:

192: \subsection{Motivations}

193:

194: In this paper, we explore some statistical properties of penalized

195: empirical risk minimization (PERM) and aggregation procedures in the

196: regression model. From these properties, we will be able to obtain

197: results concerning adaptive estimation for several problems. Given a

198: data set $D_n$, we consider two problems. Let us define the norm

199: $\norm{g}^2 := \int g(x)^2 P_X(dx)$ where $P_X$ is the law of the

200: covariates and let $E[\cdot]$ be the expectation w.r.t. the joint law

201: of $D_n$. The first problem is the problem of estimation of the

202: regression function $f_0$. Namely, we aim at constructing some

203: procedure $\bar{f}_n$ satisfying

204: \begin{equation}

205:   \label{eq:RateOfConvergence}

206:   E \|\bar{f}_n - f_0 \|^2 \leq \psi(n)

207: \end{equation}

208: where $\psi(n)$, called the {\it rate of convergence}, is a quantity

209: we wish to be very small as $n$ increases. To get this kind of inequality,

210: it is well-known that one has to assume that $f_0$ belongs to a set

211: with a small complexity (cf., for instance, the ``No Free Lunch

212: theorem'' in \cite{DGL:96}). This is what we do in

213: Section~\ref{sec:pena_least_squares} below, where an assumption on the

214: complexity is considered, see Assumption ($C_\beta$) on the metric

215: entropy.

216:

217: However, this kind of ``a priori'' may not be fulfilled. That is why

218: the second problem, called {\it agnostic learning} has been introduced

219: (cf. \cite{H:92,KSSH:94} and references therein). For this problem, one is given a set $F$ of

220: functions. Without any assumption on $f_0$, we want to construct (from

221: the data) a procedure $\tilde{f}$ which has a risk as close as

222: possible to the smallest risk over $F$. Namely, we want to obtain {\it

223:   oracle inequalities}, that is inequalities of the form

224: \begin{equation*}

225:   E \| \tilde{f} - f_0 \|^2 \leq C \min_{f\in F} \|f - f_0 \|^2 +

226:   \phi(n,F),

227: \end{equation*}

228: where $C \geq 1$ and $\phi(n,F)$ is called the {\it residue}, which is

229: the quantity that we want to be small as $n$ increases.  When $F$ is

230: of finite cardinality $M$, the agnostic problem is called {\it

231:   aggregation problem} and the residue $\phi(n,F) = \phi(n,M)$ is

232: called {\it rate of aggregation}. The main difference between the

233: problems of estimation and aggregation is that we don't need any

234: assumption on $f_0$ for the second problem. Nevertheless, aggregation

235: methods have been widely used to construct adaptive procedures for the

236: estimation problem. That is the reason why we study aggregation

237: procedures in Section~\ref{sec:ERM_finite} below. We will use these

238: procedures in Section~\ref{sec:examples} to construct adaptive

239: estimators in several particular cases, such as adaptive estimation in

240: reproducing kernel Hilbert spaces (RKHS) or adaptive estimation over

241: anisotropic Besov spaces.

242:

243: In Section~\ref{sec:ERM_finite}, we also prove that the ``natural''

244: aggregation procedure, namely empirical risk minimization (ERM) (or

245: its penalized version), fails to achieve the optimal rate of

246: aggregation in this setup. This result motivates the use of an

247: aggregation procedure instead of the most common ERM. Moreover, we

248: provide empirical evidence in Section~\ref{sec:simulations} that

249: aggregation (with jackknife) is more stable than the classical

250: cross-validation or generalized cross-validation procedures when the

251: number of observations and the signal-to-noise ratio are small.

252:

253: The approach proposed in this paper allows us to give rates of

254: convergence for adaptive estimators over very general function sets,

255: such as anisotropic Besov spaces, with a very mild assumption on the

256: law of the covariates: all the results are stated under the sole

257: assumption that the law of the covariates has a compact support.

258:

259:

260: % We propose an adaptive estimator of the multivariate regression

261: % function $f_0$ from i.i.d. observations. This procedure has strong

262: % adaptation properties: it is adaptive for a very large range of

263: % smoothness classes, including Besov spaces, in the sense that it

264: % achieves the optimal convergence rate without assumption on the

265: % design (or covariates) distribution, besides almost sure

266: % boundedness. % Moreover, this estimator can reduce the dimension of the

267: % % problem, when a single-index assumption is satisfied.

268: % Adaptation is realized via aggregation of several so-called \emph{weak}

269: % estimators, that have in common this strong \emph{design adaptation}

270: % property. The explanatory variable $Y$ is not assumed to be bounded

271: % (we consider subgaussian noise), thus the setting considered here is

272: % more general than the so-called ``distribution free non-parametric

273: % estimation'', see for instance~\cite{kohler02}, which contains a very

274: % exhaustive and detailed presentation of methods that handle the

275: % situation where the knowledge about $P_X$ is very poor.

276:

277: % Adaptation is achieved via \emph{aggregation} (or aggregation) of penalized

278: % least squares estimators over general spaces. From a theoretical point

279: % of view, we use probability techniques coming from empirical process

280: % theory such as covering numbers, peeling and chaining. These

281: % techniques are technical recipes that allows to counterpart the

282: % massiveness of the smoothness classes considered in nonparametric

283: % statistics. On this topic, we refer to~\cite{kohler02},

284: % \cite{vandegeer88, van_de_geer00}, which contains tools and ideas of

285: % importance here (concerning penalized least squares). From a more

286: % practical point of view, the adaptive estimator presented here allows

287: % to mix estimators that are known to provide good results for certain

288: % types of curves. A simple example proposed here is the aggregation of

289: % smoothing splines (least squares with Sobolev penalty). Instead of

290: % selecting the smoothing parameter via GCV (generalized cross

291: % validation), which is of common use in practice, we suggest to apply a

292: % aggregation algorithm to estimators computed with different smoothing

293: % parameters. This allows to consider splines with different orders

294: % simultaneously, while cubic splines are often considered alone in

295: % applications. We show (see Section~\ref{sec:simulations}) that this

296: % provides a more stable procedure than GCV, and that it gives better

297: % results. Moreover, we provide here theoretical results for this

298: % adaptive method, while theoretical knowledge about GCV (concerning

299: % adaptive rates of convergence) is poor. Furthermore, we can mix

300: % smoothing splines with other estimators, like wavelet soft

301: % thresholding for instance (least squares with a particular Besov

302: % penalty). Such an estimator gives good results whathever the curve is:

303: % either a smooth curve, coming from econometric data for instance, or a

304: % signal with bumps or rapid oscillations. When the covariates are

305: % multivariate, we can even further mix purely nonparametric estimators

306: % (with curse of dimensionality) with semiparametric estimators that

307: % process the data using the single-index assumption. The resulting

308: % adaptive estimator provides good results, whether or not the data is

309: % well explained by a single-index model, and it is rate-optimal in both

310: % cases.

311:

312: % The main drawback of our aggregation strategy is that it has a higher

313: % computational cost than a single estimation technique with data-driven

314: % selection of smoothing parameters. But, the counterpart is that when

315: % we aggregate estimators, we do not need to test if some model is

316: % better than another. For instance, we do not test if a single-index

317: % model explains well the data, we just mix all the estimators (purely

318: % nonparametric and single-index) using our aggregation rule, and come

319: % up with an estimator that does a job which is close to the best among

320: % them, whatever the model is (it must be emphasized at this point that

321: % actually, the performance of the aggregate is much better than the

322: % best among them, this is discudded in Section~\ref{sec:simulations}

323: % below).

324:

325: % This general formulation of penalized least squares estimation

326: % includes several standard ones, for instance penalized splines (when

327: % $\mathcal F$ is a Sobolev class) or Besov-penalty least squares

328: % estimators, that are commonly considered in signal or image-processing

329: % papers, see for instance ????. As a consequence, the general upper

330: % bound stated in Theorem~\ref{thm:least_sq} provides directly the same

331: % general upper bound for such estimators, provided that the class

332: % $\mathcal F$ satisfies some complexity bound,

333: % see~\eqref{eq:covering_assumption}.

334:

335: % \texttt{balance entre le temps de calcul, mais pas de test a

336: %   faire....}

337:

338: % This idea was previously developped in the pioneering works of

339: % \texttt{citer Zhang a fond ici....} and concerning aggregation, we

340: % refer to see also the works by ?????

341:

342: % distribution free assumption non-parametric estimation This upper

343: % bound is stated without any assumption on the law of the covariates,

344: % besides boundedness.  . In particular, we do not need to assume that

345: % the law of the covariate have a density with respect to the Lebesgue

346: % measure. this upper bound is valid when the corovatiates are

347: % discrete, or satisfies an upper bound We prove that this estimator

348: % converges with the optimal rate of convergence general This

349: % estimator is based on multivariate penalized least squares

350: % estimates, and By We construct an adaptive estimator of the

351: % regression, We propose a new algorithm for the estimation of both

352: % the index and the link function in the single index model. Un beau

353: % abstract

354:

355:

356: \subsection{The model}

357: \label{sec:model}

358:

359: Let $(X, Y), (X_1, Y_1), \ldots, (X_n, Y_n)$, be independent and

360: identically distributed variables in $\mathbb R^d \times \mathbb

361: R$. We consider the regression model

362: \begin{equation}

363:   \label{eq:model}

364:   Y = f_0(X) + \sigma \varepsilon,

365: \end{equation}

366: where $f_0 : \mathbb R^d \rightarrow \mathbb R$ and $\varepsilon$ is

367: called noise. To simplify, we assume that the noise level $\sigma$ is

368: known. We denote by $P$ the probability distribution of $(X,Y)$ and by

369: $P_X$ the marginal distribution of $X$, also called the \emph{design} or

370: \emph{covariates} distribution. We denote by $P^n$ the joint

371: distribution of the sample

372: \begin{equation*}

373:   D_n := [ (X_i, Y_i) \;;\; 1 \leq i \leq n],

374: \end{equation*}

375: and by $P_n := P^n[\cdot | X^n]$ the joint distribution of the sample

376: $D_n$ conditional on the design $X^n := (X_1, \ldots, X_n)$.

377: The expectation w.r.t. $P_n$ is denoted by

378: $E_n$. The noise $\varepsilon$ is symmetrical and subgaussian

379: conditionally on $X$. Namely, we assume that there is $b_\varepsilon >

380: 0$ such that

381: \begin{equation}

382:   \label{eq:subgaussian}

383:   (G1)(b_\varepsilon): \quad E[\exp(t\varepsilon) | X] \leq

384:   \exp(b_\varepsilon^2t^2/2) \quad \forall t > 0

385: \end{equation}

386: which is equivalent (up to an appropriate choice for the constant

387: $b_\varepsilon$) to

388: \begin{equation*}

389:   \nonumber(G2)(b_\varepsilon) : P[\varepsilon > t | X] \leq

390:   \exp(-t^2/(2b_\varepsilon^2)) \quad \forall t > 0.

391: \end{equation*}

392: Assumption~\eqref{eq:subgaussian} is standard in nonparametric

393: regression, it includes the models of bounded and Gaussian

394: regression. An important fact, that will be used in the proofs, is

395: that for $\varepsilon_1,\ldots,\varepsilon_n$ independent and such

396: that $\varepsilon_i$ satisfies $(G1)(b_i)$ for any $i=1,\ldots,n$, the

397: random variable $\sum_{i=1}^n a_i \varepsilon_i$ satisfies $(G1)\big((\sum

398: a_i^2b_i^2)^{1/2}\big)$ for any $a_1,\ldots,a_n \in \R$ and thus the

399: concentration property $(G2)\big(\sqrt{2}(\sum a_i^2b_i^2)^{1/2}\big)$. Other

400: equivalent definitions of subgaussianity are, when $\varepsilon$ is

401: symmetrical, to assume that $E[ \exp(\varepsilon^2/b_\varepsilon^2) |

402: X ] \leq 2$ for some $b_\varepsilon > 0$, or $(E[ |\varepsilon|^p |

403: X])^{1/p} \leq b_\varepsilon \sqrt{p}$ for any $p \geq 1$.

404:

405: Concerning the design, we only assume that $X$ has a compact support,

406: and without loss of generality we can take its support equal to $[0,

407: 1]^d$. In particular we do not need $P_X$ to be absolutely continuous with

408: respect to the Lebesgue measure. Note that the problem of adaptive

409: estimation with such a general multivariate design is not common in

410: the literature. In the so-called ``distribution free nonparametric

411: estimation'' framework, when we want to obtain convergence rates and

412: not only the consistency of the estimators, it is, as far as we know,

413: always assumed that $|Y| \leq L$ a.s. for some constant $L > 0$, see

414: for instance~\cite{kohler02}, \cite{kohler_krzyzak01a},

415: \cite{kohler_krzyzak01b}, \cite{kohler00} and~\cite{kerk_picard07},

416: which is a setting less general than the one considered here.

417:

418: \begin{remark}

419:   The results presented here can be extended to subexponential noise,

420:   that is when $E[ \exp(|\varepsilon| / b_\varepsilon) | X] \leq 2$

421:   for some $b_\varepsilon > 0$, but it involves complications

422:   (chaining with an adaptive truncation argument in the proof of

423:   Theorem~\ref{thm:devia1} below, see for instance~\cite{BLM99}

424:   or~\cite{van_de_geer00}, among others) that we prefer to skip

425:   here. % It can also be seen that extra smoothness in the noise, that

426:   % is $E_n[ \exp(b |\varepsilon|^p) ] \leq 1$ with $p \geq 2$ does

427:   % not actually improve the results presented here (the rates of

428:   % convergence remains the same), but this problem is beyond the

429:   % scope of this paper.

430: \end{remark}

431:

432: % \begin{remark}

433: %   To avoid complications, we assume that the noise level

434: %   $\sigma(\cdot)$ is known, and such that $\sigma_0 < \sigma(X) \leq

435: %   \sigma_1$ a.s. for some $0 < \sigma_0 < \sigma_1$. If not, one can

436: %   replace penalized least squares by weighted penalized least squares

437: %   to handle heteroscedastic noise and one can do a slight modification

438: %   in the weights in the aggregation algorithm, see ??????

439: % \end{remark}

440:

441: %% \begin{definition}

442: %%   \label{def:orlicz}

443: %%   A \emph{Young} function is a convex, increasing function $\psi$ on

444: %%   $\mathbb R^+ \rightarrow \mathbb R^+$ such that $\psi(0) = 0$ and

445: %%   $\lim_{x \rightarrow +\infty}\psi(x) = +\infty$. We define the

446: %%   \emph{Orlicz seminorm} $\norm{\varepsilon}_\psi$ of a random variable

447: %%   $\varepsilon$ by

448: %%   \begin{equation*}

449: %%     \norm{\varepsilon}_\psi := \inf \{ c > 0 : E[ \psi(|\varepsilon| / c) ]

450: %%     \leq 1 \},

451: %%   \end{equation*}

452: %%   with usual convention $\norm{\varepsilon}_\psi = +\infty$ when the

453: %%   infimum is taken over an empty set. We define also

454: %%   \begin{equation*}

455: %%     \norm{\varepsilon}_{n, \psi} := \inf \{ c > 0 : E_n[ \psi(|\varepsilon| / c) ]

456: %%     \leq 1 \text{ a.s. }\}.

457: %%   \end{equation*}

458: %% \end{definition}                %

459:

460: % \begin{assumption}[Model assumption]

461: %   Throughout the paper, we assume that $E_n[\varepsilon] = 0$, and that

462: %   for some $p, B > 0$

463: %   \begin{equation*}

464: %     \norm{\varepsilon}_{n, \psi_p} \leq B

465: %   \end{equation*}

466: %  almost surely, where $\psi_p(x) := \exp(|x|^p) - 1$.

467: % \end{assumption}

468:

469: % This assumption on the model is very general. First, it includes

470: % most of the standard assumptions on the noise that are considered in

471: % nonparametric regression literature. For instance, when $p=2$, this

472: % noise assumption means that the noise is subgaussian conditionally

473: % on the design. It includes also noises which are, conditionally on

474: % the design, gaussian ($p=2$), double exponential ($p=1$) or bounded

475: % almost surely ($p=\infty$, bounded regression). Note that the

476: % statisticien does not need to know the parameter $p$.

477:

478: % If $\psi(x) = \exp(x^2) - 1$, then $\norm{\varepsilon}_\psi < +\infty$

479: % if and only if $\varepsilon$ is subgaussian, namely such that $E[ \exp(

480: % b \varepsilon^2) ] \leq B$ for some $b, B > 0$. In what follows, we

481: % assume that the noise $\varepsilon$ satisfies $\norm{\varepsilon}_{\psi_p}

482: % < +\infty$ for some $p > 0$, where $\psi(x) = |x|^p$. This

483: % assumption includes many standard noises, such as gaussian,

484: % subgaussian, or double exponential noise, among many others.

485:

486: % Moreover, we assume that the $\varepsilon_i$ are independent of $X^n :=

487: % (X_1, \ldots, X_n)$ for $1 \leq i \leq n$.

488:

489: % \section{Construction of the procedure}

490: % \label{sec:construction}

491:

492: % \begin{figure}[htbp]

493: %   \centering

494: %   \label{fig:split}

495: %   \begin{tikzpicture}

496: %     \begin{scope}[shape=rectangle,rounded corners,%

497: %       minimum size=0.8cm,fill=white]%

498: %       \tikzstyle{every node}=[draw,fill]%

499: %       \node (D_n) at (0,0) {whole sample $D_n$};%

500: %       \node (D_m) at (1.5, 1.5) {training sample $D_m$};%

501: %       \node (D_l) at (1.5, -1.5) {learning sample $D_{(m)}$};%

502: %       \node (weak) at (6, 1.5) {weak estimators $\{ \bar f_\lambda ;

503: %         \lambda \in \Lambda \}$};%

504: %       \node (weights) at (6, -1.5) {weights $\{ \hat \theta_\lambda

505: %         ; \lambda \in \Lambda \}$};%

506: %       \node (aggregate) at (9, 0) {aggregated estimator $\hat

507: %         {\mathsf f}$};%

508: %     \end{scope}

509: %     \draw[] (D_n) -- (D_m);%

510: %     \draw[->,very thick] (D_m) -- (weak);%

511: %     % -- (weak) -- (aggregate);%

512: %     \draw[] (D_n) -- (D_l);%

513: %     \draw[->,very thick] (D_l) -- (weights);%

514: %     \draw[->,very thick] (weak) -- (aggregate);%

515: %     \draw[->,very thick] (weights) -- (aggregate);%

516: %     % \draw[] (D_n) -- (q_1) -- (q_2) -| (q_E);%

517: % %     \draw[->,shorten >=2pt] (D_n) .. controls +(75:1.4cm) and

518: % %     +(105:1.4cm) .. node[above] {$x$} (D_n);

519: %   \end{tikzpicture}

520: %   \caption{Splitting the sample}

521: % \end{figure}

522:

523: \section{PERM over a large function set}

524: \label{sec:pena_least_squares}

525:

526: We consider the following problem of estimation: we fix a function

527: space $\mathcal F$ and we want to recover $f_0$ based on the sample

528: $D_n$ using the knowledge that $f_0 \in \mathcal F$. The set $\mathcal

529: F$ is endowed with a seminorm $|\cdot|_{\mathcal F}$. To fix the

530: ideas, when $d=1$, one can think for instance of the Sobolev space

531: $\mathcal F = W_2^s$ of functions such that $|f|_{\mathcal F}^2 = \int

532: f^{(s)}(t)^2 dt < +\infty$, where $s$ is a natural integer and

533: $f^{(s)}$ is the $s$-th derivative of $f$. In this case, the estimator

534: described below is the so-called \emph{smoothing spline estimator},

535: see for instance \cite{wahba90}. Several other examples are given in

536: Section~\ref{sec:examples} below.

537:

538: \subsection{Definition of the PERM}

539:

540: The idea of penalized empirical risk minimization is to strike a

541: balance between the goodness-of-fit of the estimator to the data and

542: its smoothness. The quantity $|f|_{\mathcal F}$ measures the

543: smoothness (or ``roughness'') of $f \in \mathcal F$ and the balance is

544: quantified by a parameter $h > 0$.

545: \begin{definition}[PERM]

546:   \label{def:perm}

547:   Let $\lambda = (h, \mathcal F)$ be fixed. We say that $\bar

548:   f_\lambda$ is a penalized empirical risk minimizer if it minimizes

549:   \begin{equation}

550:     \label{eq:pena_least_sq}

551:     R_n(f)  + \pen_\lambda(f)

552:   \end{equation}

553:   over $\mathcal F$, where $\pen_\lambda(f) := h^2 |f|_{\mathcal

554:     F}^\alpha$ for some $\alpha > 0$ and where

555:   \begin{equation*}

556:     R_n(f) := \norm{Y - f}_n^2 = \frac{1}{n} \sum_{i=1}^n (Y_i -

557:     f(X_i))^2

558:   \end{equation*}

559:   is the empirical risk of $f$ over the sample $D_n$.

560: \end{definition}

561:

562: The parameter $\alpha$ is a tuning parameter, which can be chosen

563: depending on the seminorm $|\cdot|_{\mathcal F}$, see the examples in

564: Section~\ref{sec:examples}. For simplicity, we shall always assume

565: that a PERM $\bar f_\lambda$ exists, since we can always find $\tilde

566: f_\lambda$ such that $R_n(\tilde f_\lambda) + \pen_{\lambda}(\tilde

567: f_\lambda) \leq \inf_{f \in \mathcal F} \{ R_n(f) + \pen_{\lambda}(f)

568: \} + 1 / n$ which satisfies the same upper bound from

569: Theorem~\ref{thm:least_sq} (see below) as a hypothetical $\bar

570: f_\lambda$. However, a minimizer may not be necessarily unique, but

571: this is not a problem for the theoretical results proposed below. PERM

572: has been studied in a tremendous number of papers; we only refer to

573: \cite{van_de_geer00, vdg07}, \cite{massart03} and \cite{kohler02},

574: which are the closest to the material proposed in this Section.

575:

576: In Theorem~\ref{thm:least_sq} below we propose a general upper bound

577: for PERM over a space $\mathcal F$ that satisfies the complexity

578: Assumption $(C_\beta)$ below. The proof of this upper bound involves a

579: result concerning the supremum of the empirical process $Z(f) :=

580: \sigma n^{-1/2} \sum_{i=1}^n f(X_i) \varepsilon_i$ over $f \in

581: \mathcal F$ which is given in Theorem~\ref{thm:devia1} below.

582:

583: % \subsection{Main definitions}

584:

585: \subsection{Some definitions and useful tools}

586:

587: Let $(E, \norm{\cdot})$ be a normed space. For $z \in E$, we denote by

588: $B(z, \delta)$ the ball centered at $z$ with radius $\delta$. We say

589: that $\{ z_1, \ldots, z_p \}$ is a $\delta$-cover of some set $A

590: \subset E$ if

591: \begin{equation*}

592:   A \subset \bigcup_{1 \leq i \leq p} B(z_i, \delta).

593: \end{equation*}

594: The \emph{$\delta$-covering number} $N(\delta, A, \norm{\cdot})$ is

595: the minimal size of a $\delta$-cover of~$A$ and

596: \begin{equation*}

597:   H(\delta, A, \norm{\cdot}) := \log N(\delta, A, \norm{\cdot})

598: \end{equation*}

599: is the \emph{$\delta$-entropy} of $A$. The main assumption in this

600: section concerns the complexity of the space $\mathcal F$, which is

601: quantified by a bound on the entropy of its unit ball $B_{\mathcal F}

602: := \{ f \in \mathcal F : |f|_{\mathcal F} \leq 1 \}$. We denote for

603: short $H_\infty(\delta, A) = H(\delta, A, \norm{\cdot}_\infty)$ where

604: $\norm{f}_\infty := \sup_{x \in [0,1]^d} |f(x)|$. We denote by $C([0,

605: 1]^d)$ the set of continuous functions on $[0, 1]^d$.

606: \begin{assumption}[$C_\beta$]

607:   We assume that $\mathcal F \subset C([0, 1]^d)$ and that there is a

608:   number $\beta \in (0, 2)$ such that for any $\delta > 0$, we have

609:   \begin{equation}

610:     % \label{eq:covering_assumption}

611:     H_\infty\big( \delta, B_{\mathcal F} \big)

612:     \leq D \delta^{-\beta}

613:     % \exp\Big( D \Big( \frac{R}{\delta} \Big)^{d/s} \Big),

614:   \end{equation}

615:   where $D > 0$ is independent of $\delta$.

616: \end{assumption}

617: This assumption entails that, for any radius $R > 0$, we have

618: \begin{equation*}

619:   H_\infty\big( \delta, B_{\mathcal F}(R) \big) \leq D

620:   \Big(\frac{R}{\delta}\Big)^{\beta}

621: \end{equation*}

622: where $B_{\mathcal F}(R) := \{ f \in \mathcal F : |f|_{\mathcal F}

623: \leq R \}$.

624: % that thatsince this assumption entails that for any ball Define the

625: % ball $\mathcal F(R) := \{ f \in \mathcal F : |f|_{\mathcal F} \leq R

626: % \}$.  The main assumption in this section concerns the complexity of

627: % the space $\mathcal F$, which is quantified by a bound on the

628: % entropy of its balls $\mathcal F(R)$. We denote for short

629: % $H_\infty(\delta, A) = H(\delta, A, \norm{\cdot}_\infty)$ where

630: % $\norm{f}_\infty := \sup_{x \in [0,1]^d} |f(x)|$. We denote by

631: % $C([0, 1]^d)$ the set of continuous functions on $[0, 1]^d$.

632: % \begin{assumption}[$C_\beta$]

633: %   We assume that $\mathcal F \subset C([0, 1]^d)$ and that there is a

634: %   number $\beta \in (0, 2)$ such that for any positive $\delta$ and

635: %   $R$, we have

636: %   \begin{equation}

637: %     % \label{eq:covering_assumption}

638: %     H_\infty\big( \delta, \mathcal F(R) \big)

639: %     \leq D \Big(\frac{R}{\delta}\Big)^{\beta}

640: %     % \exp\Big( D \Big( \frac{R}{\delta} \Big)^{d/s} \Big),

641: %   \end{equation}

642: %   where $D > 0$ is independent of $\delta$ and $R$.

643: % \end{assumption}

644: % \begin{remark}

645: Assumption~$(C_\beta)$ is satisfied by nearly all smoothness spaces

646: considered in nonparametric literature (at least when the smoothness

647: of the space is large enough compared to the dimension, see

648: below). The most general space that we consider in this paper and

649: which satisfies~$(C_\beta)$ is the anisotropic Besov space $B_{p,

650:   q}^{\bs s}$, where $\bs s = (s_1, \ldots, s_d)$ is a vector of

651: positive numbers. This space is precisely defined in

652: Appendix~\ref{sec:appendix_approximation}. Each $s_i$ corresponds to

653: the smoothness in the direction $e_i$, where $\{ e_1, \ldots, e_d \}$

654: is the canonical basis of $\mathbb R^d$. The computation of the

655: entropy of $B_{p, q}^{\bs s}$ can be found in~\cite{triebel06}; we

656: give more details in Appendix~\ref{sec:appendix_approximation}. If

657: $\bs {\bar s}$ is the harmonic mean of $\bs s$, namely

658:   \begin{equation}

659:     \label{eq:harmonic_mean}

660:     \frac{1}{\bs {\bar s}} := \frac{1}{d} \sum_{i=1}^d

661:     \frac{1}{s_i},

662:   \end{equation}

663:   then $B_{p, q}^{\bs s}$ satisfies~$(C_\beta)$ with $\beta = d / \bs

664:   {\bar s}$, given that $\bs {\bar s} > d / p$, which is the usual

665:   condition to have the embedding $B_{p, q}^{\bs s} \subset C([0,

666:   1]^d)$.

667: %\end{remark}

668:

669: \begin{remark}

670:   Under the restriction $\beta \in (0, 2)$, Dudley's entropy

671:   integral satisfies

672:   \begin{equation*}

673:     \int_0^{ {\rm diam}( B_{\mathcal F}, \|\cdot\|_\infty)}

674:     \sqrt{H_\infty(\delta, B_{\mathcal F})} d\delta<\infty,

675:   \end{equation*}

676:   where $\text{diam}(B_{\cF},\|\cdot\|_\infty)$ is the

677:   $L_\infty$-diameter of $B_{\mathcal F}$.  This is a standard

678:   assumption coming from empirical process theory. It is related to

679:   the so-called chaining argument, that we use in the proof of

680:   Theorem~\ref{thm:devia1}. However, in order to consider a larger

681:   space of functions $\mathcal F$, we could think of function spaces

682:   with a complexity $\beta \geq 2$. In this case, using a slightly

683:   different chaining argument (cf. \cite{vdVW:96}), the quantity

684:   appearing in the upper bound of some subgaussian process is of the

685:   type $\int_{c/\sqrt{n}}^{\text{diam}(B_{ \cF},\|\cdot\|_\infty)}

686:   \sqrt{H_\infty(\delta, B_{\mathcal F})} d\delta$ which converges

687:   whatever $\beta$ is. However, such considerations are beyond the

688:   scope of the paper and are to be considered in a future work.

689: \end{remark}

690:

691:

692: % if $\bs s / s \in \mu \mathbb \log Z_+^d$ for some $\mu > 0$

693:

694: %  $d / s $we denote by $s$ We give a precise overview of such results

695: % in Appendix~\ref{sec:appendix}.

696:

697: %  this condition is

698:

699: % If

700: % $\mathcal F = B_{p,\infty}^s([0,1]^d)$, where

701: % $B_{p,\infty}^s([0,1]^d)$ is the Besov space with smoothness $s$

702: % (see~\cite{devore_lorentz93} for precise definitions and properties of

703: % Besov spaces), then condition~\eqref{eq:covering_assumption} holds,

704: % see~\cite{birge_massart00}. This result is precisely recalled in

705: % Theorem~\ref{thm:birge_massart}, see in Appendix.

706:

707: % if $|\mathcal|$In certain cases, an appropriate choice of $\alpha$

708: % allows to simplify minimization of \eqref{eq:pena_least_sq}, see the

709: % examples given below. This definition includes several standard

710: % estimators: smoothing splines (take $\mathcal F$ as a Sobolev space)

711: % and when $\mathcal F$ is a Besov space, $\bar f_\lambda$ is related

712: % to other popular denoising techniques. This is explained in details

713: % later in the Section.

714:

715: \subsection{About the supremum of the process $Z(\cdot)$}

716: \label{sec:process_Z0}

717:

718: The beginning of the proof of Theorem~\ref{thm:least_sq} is, as usual

719: with the proof of upper bounds for $M$-estimators, based on an

720: inequality that links the empirical norm of estimation and the

721: empirical process of the model. This idea goes back to key

722: papers~\cite{vandegeer90} and \cite{birge_massart93}, see

723: also~\cite{van_de_geer00, vdg07} and \cite{massart03} for a detailed

724: presentation. In regression, it reads, if $\bar f$ is a PERM and if

725: $f_0 \in \mathcal F$:

726: \begin{align*}

727:   \norm{\bar f - f_0}_n^2 + \pen(\bar f) &\leq \frac{2}{\sqrt{n}}

728:   Z_n(\bar f - f_0) + \pen(f_0),

729: %   &\leq \sup_{f \in \mathcal F} \frac{2}{\sqrt{n}} Z_n(f - f_0) +

730: %   \pen(f_0),

731: \end{align*}

732: where

733: \begin{equation}

734:   \label{eq:Z_n_def}

735:   Z_n(f) := \frac{\sigma}{\sqrt{n}} \sum_{i=1}^n f(X_i) \varepsilon_i.

736: \end{equation}

737: This inequality explains why the next Theorem~\ref{thm:devia1} is the

738: main ingredient of the proof of Theorem~\ref{thm:least_sq}

739: below. Then, an important remark is that~\eqref{eq:subgaussian}

740: entails

741: \begin{equation}

742:   \label{eq:deviaZnf}

743:   P_n[Z_n(f) > z] \leq \exp\Big( \frac{-z^2}{2 b^2 \norm{f}_n^2}

744:   \Big)

745: \end{equation}

746: for any fixed $f$, $z > 0$ and $n \geq 1$, where $\norm{f}_n^2 :=

747: n^{-1} \sum_{i=1}^n f(X_i)^2$ and where we take for short $b := \sigma

748: b_\varepsilon$. This deviation inequality is at the core of the proof

749: of Theorem~\ref{thm:devia1} below. Let us introduce the

750: \emph{empirical ball} $B_n(f_0, \delta) := \{ f : \norm{f - f_0}_n

751: \leq \delta \}$ and let us recall that $P_n := P^n[\cdot | X^n]$ is

752: the joint law of the sample $D_n$ conditionally on the design $X^n =

753: (X_1, \ldots, X_n)$.

754:

755: \begin{theorem}

756:   \label{thm:devia1}

757:   Let $Z_n(\cdot)$ be the empirical process~\eqref{eq:Z_n_def} and

758:   assume that $(\mathcal F, |\cdot|_{\mathcal F})$ satisfies

759:   $(C_\beta)$. Then\textup, if $f_0 \in \mathcal F$\textup, we can

760:   find constants $z_1 > 0$ and $D_1 > 0$ such that\textup:

761:   \begin{align}

762:     \label{eq:deviaZ_n}

763:     P_n \Big[ \sup_{f \in \mathcal F \cap B_n(f_0, \delta)} \frac{

764:       Z_n(f - f_0) }{\norm{f - f_0}_n^{1 - \beta / 2} (1 +

765:       |f|_{\mathcal F})^{\beta / 2} } > z \Big] \leq \exp( - D_1 z^2

766:     \delta^{-\beta} )

767:   \end{align}

768:   for any $\delta > 0$ and $z \geq z_1$ \textup(we recall that $\beta

769:   \in (0, 2)$\textup).

770: \end{theorem}

771:

772: The proof of this Theorem is given in

773: Section~\ref{sec:proof_main_results}, it uses techniques from

774: empirical process theory such as peeling and chaining. It is a uniform

775: version of~\eqref{eq:deviaZnf}, localized around $f_0$ (for the

776: empirical norm). In this theorem, we use the ``weighting trick'' that

777: was introduced in~\cite{vandegeer90, van_de_geer00}: we divide

778: $Z_n(\cdot)$ by $\norm{f - f_0}_n$ and $|f|_{\mathcal F}$ in order to

779: counterbalance, respectively, the variance of $Z_n(\cdot)$ and the

780: massiveness of the class $\mathcal F$. This renormalization of the

781: empirical process is also at the core of the proof of

782: Theorem~\ref{thm:least_sq}.

783:

784: % \begin{remark}

785: %   There is no measurability problem in the inequality stated in

786: %   Theorem~\ref{thm:devia1} since the supremum holds over $\mathcal F$,

787: %   which is assumed to be included in the separable space $C([0,

788: %   1]^d)$.

789: % \end{remark}

790:

791: % is close to results given in~\cite{van_de_geer00}, where a general

792: % presentation of the use of empirical process techniques for

793: % nonparametric estimation is proposed.  See also~\cite{kohler02} for

794: % the situation where $|Y| \leq L$ almost surely for some constant $L

795: % > 0$ and~\cite{massart03} for a detailed presentation of the use of

796: % concentration inequalities in nonparametric statistics.

797:

798: % Thus, the proof relies on the study of the process $Z(\cdot)$. In

799: % Theorem~\ref{thm:devia1} below (see Section~\ref{sec:process_Z0}) we

800: % give a deviation inequality for the supremum of this process over a

801: % general space satisfying the complexity

802: % bound~\eqref{eq:covering_assumption}. This kind of result was

803: % previously used by~\cite{vandegeer90}, among many others, in order

804: % to derive upper bounds for least squares and penalized least squares

805: % estimators. See also~\cite{van_de_geer00}

806:

807:

808:

809: \subsection{Upper bound for the PERM}

810:

811: Theorem~\ref{thm:least_sq} below provides an upper bound for the mean

812: integrated squared error (MISE) of the PERM, both for integration

813: w.r.t. the empirical norm $\norm{f}_n^2 = n^{-1} \sum_{i=1}^n

814: f(X_i)^2$ and the norm $\norm{f}^2 := \int f(x)^2 P_X(dx)$.

815:

816: \begin{theorem}

817:   \label{thm:least_sq}

818:   Let $\mathcal F$ be a space of functions satisfying $(C_\beta)$.

819:   % endowed with a seminorm $|\cdot|_{\mathcal F}$ which satisfies the

820:   % covering preperty~\eqref{eq:covering_assumption} for some $s > d /

821:   % 2$.

822:   Let $\lambda = (h, \mathcal F)$ and $\bar f_{\lambda}$ be a PERM

823:   given by~\eqref{eq:pena_least_sq}, where $h$ satisfies

824:   \begin{equation}

825:     \label{eq:bandwidth}

826:     h = a n^{-1 / (2 + \beta)}

827:   \end{equation}

828:   for some constant $a > 0$ and where $\alpha > 2\beta / (\beta +

829:   2)$. If $f_0 \in \mathcal F$, we have\textup:

830:   \begin{equation*}

831:     E_n \norm{\bar f_{\lambda} - f_0}_n^2 \leq C_1(1 + |f_0|_{\mathcal

832:       F}^\alpha) n^{-2 / (2 + \beta)}

833:   \end{equation*}

834:   for $n$ large enough, where $C_1$ is a fixed constant depending on

835:   $a$, $\beta$, $\alpha$ and $b$. If we assume further that

836:   $\norm{\bar f_\lambda - f_0}_\infty \leq Q$ a.s. for some constant

837:   $Q > 0$, we have

838:   \begin{equation*}

839:     E^n \norm{\bar f_\lambda - f_0}^2 \leq C_2 (1 + |f_0|_{\mathcal

840:       F}^\alpha ) n^{-2 / (2 + \beta)}

841:   \end{equation*}

842:   for $n$ large enough, where $C_2$ is a fixed constant depending on

843:   $C_1$ and $Q$.

844: \end{theorem}

845:

846: % \begin{remark}

847: %   Theorem~\ref{thm:least_sq} improves previous results

848: %   by~\cite{kohler02}, see in particular Chapter~21, in several

849: %   ways. The class $\mathcal F$ here is very general, while it is a

850: %   Sobolev class in~\cite{kohler02}. We do not need to assume that $|Y|

851: %   \leq L$, and the rate in Theorem~\ref{thm:least_sq} corresponds to

852: %   the minimax optimal rate (for a Sobolev class for instance), since

853: %   there is not extra $\log n$ terms.

854: % \end{remark}

855:

856:

857: \begin{remark}

858:   Theorem~\ref{thm:least_sq} holds if we truncate $\bar f_\lambda$ by

859:   some constant $Q$ such that $\norm{f_0}_\infty \leq Q$. Such a

860:   truncation cannot be avoided in such a general regression

861:   setting. Indeed, the PERM is, without truncation, in general non

862:   consistent, see the example from Problem~20.4, p.~430

863:   in~\cite{kohler02}.

864: \end{remark}

865:

866: \begin{remark}

867:   Theorem~\ref{thm:least_sq} holds for any design law $P_X$, even for

868:   the degenerate case where $P_X = \delta_x$ for some fixed point $x

869:   \in [0,1]^d$, where $\delta$ is the Dirac probability measure. Of

870:   course, in this case, the rate $n^{-2 / (2 + \beta)}$ becomes

871:   suboptimal, since the estimation problem with such a $P_X$ is no

872:   longer ``truly nonparametric''. Indeed, for a discrete $P_X$ with

873:   finite support, it is proved in~\cite{hamers_kohler04} that the

874:   optimal rate is the parametric rate $1/n$ using a local averaging

875:   estimator.

876: \end{remark}

877:

878: % Several consequences of Theorem~\ref{thm:least_sq} are given in

879: % Section~\ref{sec:examples}, such as the convergence rates of the

880: % PERM in the anisotropic Besov space $B_{p, q}^{\bs s}$, the

881: % convergence rates for PERM in reproductive kernel Hilbert spaces,

882: % and several smoothing spline type estimators, such as the so-called

883: % thin plate spline, or an estimator that we call anisotropic spline

884: % smoother, which was, as far as we know, not previously considered in

885: % literature.

886:

887: \subsection{About the smoothing parameter $h$}

888: \label{sec:about_h}

889:

890: It is well-known that in practice, the choice of the parameter $h$ is

891: of first importance. From the theoretical point of view, in order to

892: make $\bar f_\lambda$ rate-optimal, $h$ must be of the order of a

893: quantity involving the complexity of $\mathcal F$: see

894: condition~\eqref{eq:bandwidth} on the bandwidth and the

895: Assumption~$(C_\beta)$. This problem is commonplace in nonparametric

896: statistics. Indeed, the role of the penalty

897: in~\eqref{eq:pena_least_sq} is to make the balance with the

898: massiveness of the space $\mathcal F$. Without this penalty, or if $h$

899: is too small, $\bar f_{\lambda}$ roughly interpolates the data, which

900: is not suitable when the aim is denoising (this phenomenon is called

901: \emph{overfitting}).

902:

903: Of course, the complexity parameter $\beta$ is unknown to the

904: statistician, and even worse, it does not necessarily make sense in

905: practice. So, several procedures are proposed to select $h$ based on

906: the data. The most popular are the leave-one-out cross validation (CV)

907: and the simpler generalized cross validation (GCV), which is often

908: used with smoothing spline estimators because of its computational

909: simplicity, see~\cite{wahba90} among others. Such methods are known to

910: provide good results in most cases. However, there is, as far as we

911: know, no convergence rate result for estimators based on CV or GCV

912: selection of smoothing parameters. In Section~\ref{sec:examples}

913: below, we propose an alternative approach. Indeed, instead of

914: selecting one particular $h$, we mix several estimators computed for

915: different $h$ in some grid using an aggregation algorithm. This

916: aggregation algorithm is described in Section~\ref{sec:ERM_finite}. We

917: show that this approach allows us to construct adaptive estimators with

918: optimal rates of convergence in several particular cases, see

919: Section~\ref{sec:examples}. Moreover, we show empirically in

920: Section~\ref{sec:simulations} that the aggregation approach is more

921: stable than CV or GCV when the number of observations is small.

922:

923:

924:

925:   % \begin{remark}

926:   %   An inspection of the proof of Theorem~\ref{thm:least_sq} shows

927:   %   that the term $o(h^2)$ is going to zero as $h$ goes to $0$

928:   %   faster than any power function of $m$.  When $h$ is of order

929:   %   $m^{-s/(2s + 1)}$, which is the best choice theoretically, we

930:   %   have

931: %   \begin{equation*}

932: %     \sup_{f \in \mathcal F(R)} E \norm{\bar f - f}_{L^2(P_X^m)}^2 \leq

933: %     (C_1 + 2 R^2)  m^{-2s / (2s + d)}

934: %   \end{equation*}

935: %   which is the standard minimax convergence rate over classes with

936: %   smoothness $s$, at least when $P_X$ has a density with respect to

937: %   the Lebesgue measure which is continuous and bounded away from

938: %   $0$.  Such smoothness classes include Sobolev balls (for $s >

939: %   d/2$) and Besov balls ...

940: % \end{remark}

941:

942: % \begin{remark}

943: %   In the proof of Theorem~\ref{thm:least_sq}, we do not use the

944: %   explicit form of the estimator $\bar f_{\mathcal F}$: we only need

945: %   the minimization property~\eqref{eq:pena_least_sq}. This entails

946: %   that the scheme of proof is quite generic, and could be used for

947: %   other estimators as well (namely, $M$-estimators.) This scheme of

948: %   proof was previously used in the key paper~\cite{vandegeer90}, see

949: %   also~\cite{van_de_geer00}. It relies on a deviation inequality for

950: %   the supremum of a particular empirical process over a smoothness

951: %   class $\mathcal F$, which is stated in Section~\ref{sec:process_Z0}

952: %   below.

953: % \end{remark}

954:

955: % We first prove that the ``natural'' aggregation procedure, namely

956: % empirical risk minimization (or its penalized version), fails to

957: % achieve the optimal rate of aggregation in this setup. This

958: % motivates the choice

959:

960: % In this section, we explore some statistical properties of penalized

961: % empirical risk minimization over a finite set of functions.

962:

963: % In general, given is a data set $D_n$, we can consider two

964: % problems. The first one is the problem of estimation treated in the

965: % previous sections.  Namely, we aim at constructing some procedure

966: % $\bar{f}$ satysfying

967: % \begin{equation}

968: %   \label{eq:RateOfConvergence}

969: %   E \|\bar{f}-f_0 \|^2 \leq \psi(n)

970: % \end{equation}

971: % where $\psi(n)$, called the {\it rate of convergence}, is a quantity

972: % we wish very small as $n$ increases. To get this kind of inequality,

973: % we have to assume $f_0$ to belong to a set with a small complexity (at

974: % least compact). That is the reason why we introduced Assumption

975: % ($C_\beta$) in Section~\ref{sec:pena_least_squares}. Actually, this

976: % kind of ``a priori'' may not be fulfilled. That is why the second

977: % problem, called {\it agnostic learning} has been introduced. For this

978: % problem, one is given a set $F$ of functions. Without any assumption

979: % on $f_0$, we want to construct (from the data) a procedure $\tilde{f}$

980: % which has a risk as close as possible to the smallest risk over

981: % $F$. Namely, we want to obtain {\it oracle inequalities}, that is

982: % inequalities of the form

983: % \begin{equation*}

984: %   E \| \tilde{f} - f_0 \|^2 \leq C \min_{f\in F} \|f - f_0 \|^2 +

985: %   \phi(n,F),

986: % \end{equation*}

987: % where $C \geq 1$ and $\phi(n,F)$ is called the {\it residue}, which

988: % is the quantity that we want to be small as $n$ increases.  When $F$

989: % is of finite cardinality $M$, the agnostic problem is called {\it

990: %   aggregation problem} and the residue $\phi(n,F) = \phi(n,M)$ is

991: % called {\it rate of aggregation}. The main difference between the

992: % problems of estimation and aggregation is that we don't need any

993: % assumption on $f_0$ for the second problem. Nevertheless,

994: % aggregation method have been widely used to construct adaptive

995: % procedures for the estimation problem.  That is the reason why we

996: % study aggregation procedures in this section. We will use these

997: % procedures to construct estimation procedures which will be adaptive

998: % to the complexity parameter $\beta$ introduced in Assumption

999: % ($C_\beta$).

1000:

1001:

1002: \section{PERM and aggregation over a finite set of functions}

1003: \label{sec:ERM_finite}

1004:

1005: Let us fix a set $F(\Lambda) := \{ f_\lambda : \lambda \in \Lambda \}$

1006: of arbitrary functions, and denote by $M = |\Lambda|$ its

1007: cardinality. % We will choose specific sets $F(\Lambda)$ in

1008: % Section~\ref{sec:examples}, but in this section it remains generic.

1009:

1010: \subsection{Suboptimality of PERM over a finite set}

1011:

1012: In this section, we prove that minimizing the empirical risk

1013: $R_n(\cdot)$ (or a penalized version) on $F(\Lambda)$ is a suboptimal

1014: aggregation procedure in the sense of~\cite{tsy:03}. According to

1015: \cite{tsy:03}, the optimal rate of aggregation in the Gaussian

1016: regression model is $(\log M) /n$. This means that it is the minimum

1017: price one has to pay in order to mimic the best function among a class

1018: of $M$ functions with $n$ observations. This rate is achieved by the

1019: aggregate with cumulative exponential weights, see~\cite{catbook:01}

1020: and~\cite{jrt:06}.

1021: % temperature parameter $T\geq 2 \max_{f\in F(\Lambda)} \| f_0 -

1022: % f\|_\infty^2 + 2\sigma^2$

1023: In Theorem~\ref{TheoWeaknessERMRegression} below, we prove that the

1024: usual PERM procedure cannot achieve this rate and thus, that it is

1025: suboptimal compared to the aggregation methods with exponential

1026: weights. The lower bounds for aggregation methods appearing in the

1027: literature (see~\cite{tsy:03, jrt:06, LecJMLR:06}) are usually based

1028: on minimax theory arguments. The one considered here is based on

1029: geometric considerations, and involves an explicit example that makes

1030: the PERM fail. For that, we consider the Gaussian regression model

1031: with uniform design.

1032: \begin{assumption}[G]

1033:   Assume that $\varepsilon$ is standard Gaussian and that $X$ is

1034:   univariate and uniformly distributed on $[0, 1]$.

1035: \end{assumption}

1036: % where the design is uniformly distributed on $[0,1]$. That is the

1037: % model \eqref{eq:model} where $X$ has a uniform distribution on

1038: % $[0,1]$ (we consider here the case $d=1$) where the noise

1039: % $\varepsilon$ is a standard normal Gaussian variable.

1040: \begin{theorem}

1041:   \label{TheoWeaknessERMRegression}

1042:   Let $M \geq 2$ be an integer and assume that \textup{(G)} holds. % In

1043:   % the gaussian regression model with a design uniformly distributed

1044:   % on $[0,1]$,

1045:   We can find a regression function $f_0$ and a family $F(\Lambda)$ of

1046:   cardinality $M$ such that, if one considers a penalization

1047:   satisfying $|\pen(f)| \leq C \sqrt{(\log M)/n}, \forall f \in

1048:   F(\Lambda)$ with $0\leq C <\sigma (24\sqrt{2}c^*)^{-1}$ \textup($c^*$ is

1049:   an absolute constant from the Sudakov minorization, see

1050:   Theorem~\ref{TheoSudakov} in

1051:   Appendix~\ref{sec:appendix_proba}\textup), the PERM procedure

1052:   defined by

1053:   \begin{equation*}

1054:     \tilde{f}_n \in \argmin_{f \in F(\Lambda)}( R_n(f) + \pen(f))

1055:   \end{equation*}

1056:   satisfies

1057:   \begin{equation*}

1058:     E^n \| \tilde{f}_n - f_0 \|^2 \geq \min_{f \in

1059:       F(\Lambda)} \| f - f_0 \|^2 + C_3 \sqrt{\frac{\log

1060:         M}{n}}

1061:   \end{equation*}

1062:   for any integer $n \geq 1$ and $M\geq M_0(\sigma)$ such that $n^{-1}

1063:   \log[(M-1)(M-2)] \leq 1/4$ where $C_3$ is an absolute constant.

1064: \end{theorem}

1065: This result tells us that, in some particular cases, the PERM cannot

1066: mimic the best element in a class of cardinality $M$ faster than

1067: $((\log M)/n)^{1/2}$. This rate is very far from the optimal one

1068: $(\log M)/n$.

1069:

1070: Let $F(\Lambda)$ be the set that we consider in the proof of

1071: Theorem~\ref{TheoWeaknessERMRegression} (see

1072: Section~\ref{sec:proof_main_results} below), and take $\pen(f) = 0$.

1073: Using Monte-Carlo (we do $5000$ loops), we compute the excess risk $E

1074: \| \tilde{f}_n - f_0 \|^2 - \min_{f \in F(\Lambda)} \| f - f_0 \|^2$

1075: of the ERM. In Figure~\ref{fig:subERM} below, we compare the excess

1076: risk and the bound $((\log M) / n)^{1/2}$ for several values of $M$

1077: and $n$. It turns out that, for this set $F(\Lambda)$, the lower bound

1078: $((\log M) / n)^{1/2}$ is indeed accurate for the excess

1079: risk. Actually, by using the classical symmetrization argument and the

1080: Dudley's entropy integral, it is easy to obtain an upper bound for the

1081: excess risk of the ERM of the order of $((\log M) / n)^{1/2}$ for any

1082: class $F(\Lambda)$ of cardinality $M$.

1083:

1084: \begin{figure}[htbp]

1085:   \centering

1086:   \includegraphics[width=4.3cm]{excess1.pdf}%

1087:   \includegraphics[width=4.3cm]{excess2.pdf}%

1088:   \includegraphics[width=4.3cm]{excess3.pdf}%

1089:   \caption{The excess risk of the ERM compared to $((\log M) /

1090:     n)^{1/2}$ for several values of $M$ and $n$

1091:     \textup($x$-axis\textup)}

1092:   \label{fig:subERM}

1093: \end{figure}

1094:

1095: \subsection{Aggregation}

1096: \label{sec:aggregation}

1097:

1098: % Let $F(\Lambda) = \{ f_\lambda : \lambda \in \Lambda \}$ be a finite

1099: % class of functions. In what follows, $ f_\lambda $ will be one of

1100: % the non-adaptive PERM defined in the previous section and

1101: % constructed with only a part of the data wich is assumed to be fixed

1102: % in this section.

1103: For each $ f_\lambda \in F(\Lambda)$, we compute a weight $\theta(

1104: f_\lambda) \in [0,1]$ such that $\sum_{\lambda \in \Lambda} \theta(

1105: f_{\lambda}) = 1$. These weights give a level of significance to each

1106: $ f_\lambda \in F(\Lambda)$.  The aggregated estimator is then the

1107: convex combination

1108: \begin{equation}

1109:   \label{eq:aggregate}

1110:   \hat {\mathsf f} := \sum_{\lambda \in \Lambda} \theta(f_\lambda)

1111:   f_\lambda,

1112: \end{equation}

1113: where the weight of $f \in F(\Lambda)$ is given by

1114: \begin{equation}

1115:   \label{eq:weights}

1116:   \theta(f) := \frac{\exp\big( - n R_{n}(f) / T

1117:     \big)}{\sum_{\lambda \in \Lambda} \exp\big(-n R_{n}(

1118:     f_\lambda)/T \big) },

1119: \end{equation}

1120: where $T > 0$ is the so-called \emph{temperature} parameter and where

1121: $R_n(f)$ is the empirical risk of $f$. This aggregation algorithm

1122: (with ``Gibbs'' or ``exponential'' weights) can also be found for

1123: instance in~\cite{catbook:01, leung_barron06, juditsky_etal05,

1124:   juditsky_nazin05, yang:00, yang04, LecAoS:07}. See

1125: also~\cite{gaiffas_lecue07} for adaptation by aggregation in a

1126: semiparametric model.

1127:

1128: The next theorem is an oracle inequality for the aggregation

1129: method~\eqref{eq:weights}. It will be useful to derive the adaptive

1130: upper bounds stated in Section~\ref{sec:examples} below.

1131: \begin{theorem}

1132:   \label{thm:oracle}

1133:   % We assume that the noise $\varepsilon$ is symmetric.

1134:   Assume that for any $f \in F(\Lambda)$, we have $\norm{f -

1135:     f_0}_\infty \leq Q$ for some $Q > 0$. For any $a > 0$, the

1136:   aggregation method~\eqref{eq:weights} satisfies

1137:   \begin{equation*}

1138:     E^n \norm{\hat {\mathsf f} - f_0}^2 \leq (1+ a) \min_{f \in

1139:       F(\Lambda)} \norm{f - f_0}^2 + (C + T) \frac{(\log

1140:       n)^{1/2} \log M}{n},

1141:   \end{equation*}

1142:   where $C$ is a constant depending on $a, Q$ and $\sigma$.

1143: \end{theorem}

1144: When $T$ is too large, the weights~\eqref{eq:weights} are close to the

1145: uniform law over the set of weak estimators, and of course, the

1146: resulting aggregate is inaccurate. When $T$ is too small, one weight

1147: is close to $1$, and the others close to $0$: in this situation, the

1148: aggregate does nearly the same job as the ERM procedure. This is not

1149: suitable since Theorem~\ref{TheoWeaknessERMRegression} told us that

1150: ERM is suboptimal. Hence, $T$ realizes a tradeoff between the ERM and the

1151: uniform weights procedure.

1152: % It is a $T$ is somehow a regularization parameter of this tradeoff.

1153: % the estimator obtained by empirical risk minimization (ERM). This

1154: % behavior can be also explained by

1155: % equation~\eqref{eq:oracle_minimization} in the proof of

1156: % Theorem~\ref{thm:oracle}. Indeed, the exponential

1157: % weights~\eqref{eq:weights} A counterpart of the oracle inequality is

1158: % Theorem~\ref{TheoWeaknessERMRegression}, where we show that any

1159: % penalized empirical risk minimization algorithm is suboptimal

1160: % compared to the cumulative version of the aggregation algorithm

1161: % (\ref{eq:aggregate}) . This result tell us that $T$ shall not be too

1162: % large, since when $T$ is large, the aggregation algorithm

1163: % (\ref{eq:aggregate}) is close to the empirical risk minimization,

1164: % which is suboptimal (see Theorem~\ref{TheoWeaknessERMRegression}).

1165: It can be simply chosen by minimization of the empirical risk. We know

1166: empirically that it provides good results, see~\cite{gaiffas_lecue07}.

1167: Namely, we select the temperature

1168: \begin{equation}

1169:   \label{Tslection}

1170:   \hat T := \argmin_{T \in \mathcal T} \sum_{i=1}^n \big( Y_i - \hat

1171:   {\mathsf f}^{(T)} (X_i) \big)^2,

1172: \end{equation}

1173: where $\hat {\mathsf f}^{(T)}$ is the aggregated

1174: estimator~\eqref{eq:aggregate} with temperature $T$ and where

1175: $\mathcal T$ is some set of temperatures. This is what we do in the

1176: empirical study conducted in Section~\ref{sec:simulations}.

1177:

1178: % The ERM already gives good results, but if $T$ is chosen carefully,

1179: % we expect to obtain an estimator which outperforms the ERM.

1180:

1181: % This fact is confirmed by the numerical study conducted in

1182: % Section~\ref{sec:numerical}, where the choice of $T$ is done using a

1183: % simple leave-one-out cross-validation algorithm over the whole

1184: % sample for aggregates obtained with several $T$.

1185:

1186: % We can understand the aggregation algorithm in the following way:

1187: % first, we compute the least squares of each weak estimators. This is

1188: % the most natural way of assessing the level of significance of some

1189: % estimator among the other ones. Then, we put a Gibbs law over the

1190: % set of weak estimators. The mass of each estimator relies on its

1191: % least squares (over the learning sample). Finally, the aggregate is

1192: % simply the mean expected estimator according to this law.  In

1193: % Section~\ref{sec:aggregation}, we propose an oracle inequality for

1194: % the aggregation algorithm (see Theorem~\ref{thm:oracle}), which is

1195: % the key result in the proof of the adaptive upper bound stated in

1196: % Theorem ?. The choice of the temperature parameter $T$ is discussed

1197: % in Section~\ref{sec:aggregation}.

1198:

1199:

1200: % Second, any penalized selection algorithm (ie, an algorithm that

1201: % selects a particular $\bar f_\lambda$ among $F(\Lambda)$ via a

1202: % penalized least squares minimization criterion) is suboptimal

1203: % compared to an aggregation procedure.

1204:

1205:

1206: % Another strategy for an adaptive choice of the smoothing parameter

1207: % $h$ in penalized least squares is complexity reguralization, which

1208: % was initiated by Vapnik, see~\cite{vapnik98}, and~\cite{kohler02},

1209: % among others. In \cite{kohler02}, the complexity regularization

1210: % approach is adopted to construct an adaptive estimator of the

1211: % regression. In this book, rates of convergence for the least squares

1212: % and penalized least squares estimators are given in the so-called

1213: % ``distribution free'' framework, where it is assumed that $|Y| \leq

1214: % L$ almost surely for a known positive constant $L$, and where there

1215: % is no assumption on $P_X$. In literature, the assumption $|Y| \leq

1216: % L$ is mandatory in order to derive rates of convergence in this

1217: % general setting for $P_X$. Note that this is also the standard

1218: % setting in learning theory. In Chapter~21 from~\cite{kohler02}, an

1219: % upper bound is obtained for the penalized least squares estimator,

1220: % in the case where $X$ is univariate and $\mathcal F$ is a Sobolev

1221: % space (smoothing splines). Herein, the convergence rate is shown to

1222: % be of order $(\log n)^2 n^{-2s/(2s + 1)}$ which is, up to the $(\log

1223: % n)^2$ term, optimal in this context. Thus, the results stated in

1224: % Section ???? improves upon complexity regularization in several

1225: % ways: the results are adaptive, holds in the multivariate case,

1226: % unbounded response $Y$ are taken into account, other spaces than the

1227: % Sobolev space can be considered and the rates are optimal (without

1228: % an extra logarithm).

1229:

1230: % \subsection{Oracle inequality}

1231:

1232:

1233:

1234:

1235: % \begin{remark} %[Why don't we use the standard aggregation algorithm?]

1236: %   The standard aggregation algorithm (with exponantial weights) in the

1237: %   regression model is somewhat different from the one considered

1238: %   here. Usually, the weights are a Gibbs law over the set of

1239: %   estimators, with potential equals to the least squares over the

1240: %   learning sample. Here, we considered a potential equals to the

1241: %   penalized least squares. This weighting scheme is somewhat tuned to

1242: %   the situation where the weak estimators (or \emph{weak learners})

1243: %   are penalized least squares. The reason is the following: actually,

1244: %   the aggregation estimator is a reguralized version of the empirical risk

1245: %   minimizer estimator (ERM). It does a better job than the ERM when

1246: %   the temperature parameter (which can be understood as a

1247: %   reguralization parameter) is not too large.

1248:

1249: %  is This allows to

1250: %   construct an adaptive estimator that does a better than more popular

1251: %   techniques for selecting the smoothness parameter $h$, such as the

1252: %   GCV technique, which provides satisfactory results is most cases.

1253: % \end{remark}

1254:

1255:

1256: % We recall that $m < n$ is the training sample size, which is a

1257: % fraction of $n$ \texttt{ATTENTION !} We recall that $D_m$, $D_{(m)}$

1258: % and $D_n$ stand for the training, the learning, and the whole sample

1259: % (respectively). We denote, repsectively, by $P^m$, $P^{(m)}$ and by

1260: % $P^n$ the corresponding empirical measures, and by $P_X^m$,

1261: % $P_X^{(m)}$ and $P_X^n$ the empirical measures for $X$. Moreover, for

1262: % short, we shall denote $\norm{f}^2 := \int f^2 d P_X$ and

1263: % $\norm{f}_n^2 = \int f^2 dP_X^n$, and we consider

1264: % $\prodsca{\cdot}{\cdot}$ and $\prodsca{\cdot}{\cdot}_n$ the associated

1265: % inner products. We define in the same way $\norm{f}_m$ and

1266: % $\norm{f}_{(m)}$. % In this section, we shall denote by $f_0$ the true

1267: % % regression function.

1268:

1269: % We denote $\bar f$ and $J(f)$ instead of $\bar f_{\lambda}$ and

1270: % $J_s(f)$. We recall that $\pen(f) = h^2 J(f)^2$, where $J(f)^2 = 1 +

1271: % \norm{f}_\infty^2 + \tilde J(f)$. We denote by $|A|$ the cardinal of a

1272: % finite set $A$. We denote $\varepsilon = (\varepsilon_1, \ldots,

1273: % \varepsilon_n)$, and by convention $\norm{\varepsilon}_n^2 = \sum_{1 \leq i

1274: %   \leq n} \varepsilon_i^2 / n$, with the same definition for $Y = (Y_1,

1275: % \ldots, Y_n)$.

1276:

1277:

1278: % The resulting estimator is \emph{adaptive}, as showed below in the

1279: % Section, and as shown numerically in Section~\ref{sec:simulations}.

1280:

1281:

1282: % shall \emph{adapt} both to the complexity of $\mathcal F$ where

1283: % $f_0$ belongs to, which is measured by some smoothness paramerer

1284: % $s$, see~\eqref{eq:covering_assumption}, and to the smoothness

1285: % parameter $h$. \texttt{pas terrible la derniere phrase, et mal dit}

1286:

1287: \section{Examples of adaptive results}

1288: \label{sec:examples}

1289:

1290:

1291: %\section{Examples of PERM over large function sets}

1292:

1293: In this section, we construct adaptive estimators for several

1294: regression problems using the tools from

1295: Sections~\ref{sec:pena_least_squares} and~\ref{sec:ERM_finite}. This

1296: involves, as usual with algorithms coming from statistical learning

1297: theory, a split of the sample into two parts (an exception can be

1298: found in~\cite{leung_barron06}). The main steps of the construction of

1299: adaptive estimators given in this section are:

1300: \begin{enumerate}

1301: \item split, at random, the whole sample $D_n$ into a \emph{training

1302:     sample}

1303: \begin{equation*}

1304:   D_m := [(X_i, Y_i) : 1 \leq i \leq m],

1305: \end{equation*}

1306: where $m < n$, and a \emph{learning sample}

1307: \begin{equation*}

1308:   D_{(m)} := [(X_i, Y_i) : m + 1 \leq i \leq n];

1309: \end{equation*}

1310: \item choose a set $\Lambda$ of parameters and compute, using the

1311:   training sample $D_m$, the corresponding class $F(\Lambda) = \{ \bar

1312:   f_\lambda : \lambda \in \Lambda \}$ of PERM (see

1313:   Definition~\ref{def:perm} in

1314:   Section~\ref{sec:pena_least_squares}). Each $\Lambda$ depends on the

1315:   considered problem of adaptive estimation, see below;

1316: \item using the learning sample $D_{(m)}$, compute the aggregation

1317:   weights and the aggregated estimator $\hat {\mathsf f}_n$,

1318:   respectively given by Equations~\eqref{eq:weights}

1319:   and~\eqref{eq:aggregate}.

1320: \end{enumerate}

1321:

1322: Then, using Theorem~\ref{thm:least_sq} (see

1323: Section~\ref{sec:pena_least_squares}) and Theorem~\ref{thm:oracle}

1324: (see Section~\ref{sec:ERM_finite}), we will derive adaptive upper

1325: bounds for estimators $\hat {\mathsf f}_n$ constructed in this

1326: way. Throughout the section, we shall assume the following.

1327:

1328: \begin{assumption}[Split size]

1329:   Let $\ell$ be the learning sample size, so that $\ell + m = n$. We shall

1330:   assume from now on, to simplify the presentation, that $\ell$ is a

1331:   fraction of $n$, typically $n/2$ or $n/4$.

1332: \end{assumption}

1333:

1334: \subsection{About the split, jackknife}

1335: \label{sec:jackknife}

1336:

1337: % \begin{remark}[Jackknife]

1338: The behavior of the aggregate $\hat {\mathsf f}_n$ can depend strongly

1339: on the split selected in Step~1, in particular when the number of

1340: observations is small. Hence, a good strategy is to jackknife: repeat,

1341: say, $J$ times Steps 1--3 to obtain aggregates $\{ \hat {\mathsf

1342:   f}_n^{(1)}, \ldots, \hat {\mathsf f}_n^{(J)} \}$, and compute the

1343: mean:

1344: \begin{equation*}

1345:   \hat {\mathsf f}_n := \frac{1}{J} \sum_{j=1}^J \hat {\mathsf

1346:     f}_n^{(j)}.

1347: \end{equation*}

1348: This jackknifed estimator provides better results than a single

1349: aggregate, see Section~\ref{sec:simulations} for an empirical study,

1350: where we show also that it gives more stable estimators than the ones

1351: involving cross-validation or generalized cross-validation. By

1352: convexity of $f \mapsto \norm{f - f_0}^2$, the jackknifed estimator

1353: satisfies the same upper bounds as a single aggregate: each of the

1354: adaptive upper bounds stated below also holds when we use the

1355: jackknife.

1356:

1357: For the set of weak estimators considered in this paper, the split of

1358: the data is not a theoretical artefact. Indeed, if one skips Step~1

1359: (compute $F(\Lambda)$ and $\hat {\mathsf f}_n$ using the whole sample

1360: $D_n$), then $\hat {\mathsf f}_n$ has a very poor performance. An

1361: empirical illustration of this phenomenon is given in

1362: Figure~\ref{fig:split_effect}. Herein, we show the aggregation

1363: weights~\eqref{eq:weights} when the data is split and when it is

1364: not split. We consider a univariate design and cubic smoothing

1365: splines. Namely, we compute the set $F(\Lambda)$ of PERM

1366: (see~\eqref{eq:pena_least_sq}) with $\mathcal F = \{ f \in L^2([0, 1])

1367: : \int f^{(2)}(t)^2 dt < +\infty \}$ and penalty $\pen(f) = h^2 \int

1368: f^{(2)}(t)^2 dt$, where $f^{(2)}$ stands for the second derivative of

1369: $f$. We do that for several smoothing parameters $h$ in a grid $H$, so

1370: that $\Lambda := \{ (h, \mathcal F) : h \in H \}$. We used the

1371: \texttt{smooth.spline} routine in the \texttt{R} software to compute

1372: $F(\Lambda)$.

1373: \begin{figure}[htbp]

1374:   \centering

1375:   \includegraphics[width=6cm]{weightssplit.pdf}%

1376:   \includegraphics[width=6cm]{weightsnosplit.pdf}%

1377:   \caption{Aggregation weights with split \textup(left\textup) and

1378:     without split \textup(right\textup) and smoothing parameter

1379:     obtained by cross-validation \textup(vertical line\textup)}

1380:   \label{fig:split_effect}

1381: \end{figure}

1382: In Figure~\ref{fig:split_effect}, the x-axis is related to the value

1383: of $h$: it is the value of the parameter \texttt{spar} from the

1384: \texttt{smooth.spline} routine. The vertical line is the value of

1385: \texttt{spar} selected by cross-validation. The conclusion from

1386: Figure~\ref{fig:split_effect} is that, when the data is not split,

1387: an overfitting phenomenon occurs: the aggregation algorithm does not

1388: work, since it does not concentrate around a value of

1389: \texttt{spar}. Of course, the resulting aggregated estimator has a

1390: very poor performance.

1391:

1392:

1393: % \subsection{Weak estimators\textup: penalized least squares}

1394: % Using the training sample, we compute a family

1395: % \begin{equation*}

1396: %   F(\Lambda) := \{ \bar f_\lambda : \lambda \in \Lambda \}

1397: % \end{equation*}

1398: % of \emph{weak} estimators of the regression $f_0$. Each of these

1399: % estimators depend on a parameter $\lambda$ which makes them work

1400: % based on the data ``as if'' $f_0$ had some prescribed

1401: % properties. The parameter $\lambda$ writes $\lambda = (h, \mathcal

1402: % F)$, where $h > 0$ is a smoothing parameter, and where $\mathcal F$

1403: % is a smoothness space of function endowed with a seminorm

1404: % $|\cdot|_{\mathcal F}$.  The estimator $\bar f_\lambda$ is not

1405: % adaptive, since it depends on the choice of the tuning parameters

1406: % $h$ and $\mathcal F$ (we recall that we write $\lambda = (h,

1407: % \mathcal F)$ for short).  An obvious

1408:

1409: % \begin{remark} (ne pas enlever cette remarque du tex

1410: %   The following criticism about data splitting is obvious: the weak

1411: %   estimators only use the training sample, which is smaller (typically

1412: %   two times smaller) than the whole sample, so each of them is less

1413: %   accurate than an estimator using the whole sample. This remark holds

1414: %   true when the learning sample is used to select one of them. If we

1415: %   do not select one of them, but mixes all of them according to the

1416: %   aggregation algorithm~(\ref{eq:aggregate}) for instance, then this

1417: %   is no more the case. We give an empirical evidence of this fact in

1418: %   Section~\ref{sec:simulations}, where we compare the CV (cross

1419: %   validation) and GCV (generalized cross validation) methods with our

1420: %   aggregation approach for the selection of the parameter $h$ in cubic

1421: %   spline estimation.

1422: % \end{remark}

1423:

1424:

1425: \subsection{How to derive the adaptive upper bounds}

1426: \label{sec:derive_adaptive}

1427:

1428: In every example considered below, the scheme to derive adaptive

1429: upper bounds is as follows. Say that $(\mathcal F_\beta : \beta \in

1430: B)$ is a set of embedded function classes ($\mathcal F_\beta \subset

1431: \mathcal F_{\beta'}$ if $\beta < \beta'$) where each $\mathcal

1432: F_\beta$ satisfies Assumption~$(C_\beta)$. Let $B_n$ be an appropriate

1433: discretization of $B$. Let $\hat {\mathsf f}_n$ be the aggregated

1434: estimator obtained using Steps~1--3 (see the beginning of the

1435: section), with parameter $\Lambda = \Lambda_n = \{ (n^{-2 / (2 +

1436:   \beta)}, \mathcal F_\beta) : \beta \in B_n \}$ and let $M_n$ be the

1437: cardinality of $F(\Lambda_n)$. Let $E^{m}$ and $E^{(m)}$ be the

1438: expectations with respect to, respectively, the joint laws of $D_m$ and

1439: $D_{(m)}$, so that, by independence, we have $E^n[\cdot] =

1440: E^m[E^{(m)}[\cdot]]$. Let $f_0 \in \mathcal F_{\beta_0}$ for some

1441: $\beta_0 \in B$. Using Theorem~\ref{thm:oracle}, we have

1442: \begin{align*}

1443:   E^{(m)} \norm{\hat {\mathsf f}_n - f_0}^2 &\leq C \min_{f \in

1444:     F(\Lambda_n)} \norm{f - f_0}^2 + \frac{C (\log

1445:     n)^{1/2} \log M_n}{n} \\

1446:   & \leq C \norm{\bar f_{\lambda_n} - f_0}^2 + \frac{C (\log n)^{1/2}

1447:     \log M_n}{n},

1448: \end{align*}

1449: where $\lambda_n = (n^{-2 / (2 + \beta_n)}, \mathcal F_{\beta_n})$,

1450: with $\beta_n \in B_n$ chosen such that $\mathcal F_{\beta_0} \subset

1451: \mathcal F_{\beta_n}$ and $n^{-2 / (2 + \beta_n)} \leq C_1 n^{-2 / (2

1452:   + \beta_0)}$. Then, integrating with respect to $E^{m}$ and using

1453: Theorem~\ref{thm:least_sq}, we have, if $M_n$ is no more than a power

1454: of $n$:

1455: \begin{align*}

1456:   E^n \norm{\hat {\mathsf f}_n - f_0}^2 &\leq C E^m \norm{\bar

1457:     f_{\lambda_n} - f_0}^2 + o(n^{-2 / (2 + \beta_0)}) \\

1458:   & \leq C_2 n^{-2 / (2 + \beta_n)} + o(n^{-2 / (2 + \beta_0)}) \leq

1459:   C_3 n^{-2 / (2 + \beta_0)}.

1460: \end{align*}

1461: This proves that, if $f_0 \in \mathcal F_{\beta_0}$ for some $\beta_0

1462: \in B$, we have $E^n \norm{\hat {\mathsf f}_n - f_0}^2 \leq C_3 n^{-2

1463:   / (2 + \beta_0)}$, thus $\hat {\mathsf f}_n$ is indeed adaptive over

1464: $(\mathcal F_\beta : \beta \in B)$.

1465:

1466:

1467: \subsection{Sobolev spaces, spline estimators}

1468: \label{sec:sobolev_spaces}

1469:

1470: When $\mathcal F$ is a Sobolev space, the

1471: PERM~\eqref{eq:pena_least_sq} with $\alpha = 2$ is a very popular

1472: smoothing technique: see, among others, \cite{wahba90} and

1473: \cite{green_silverman94}. The most simple example is when $d=1$ and

1474: \begin{equation*}

1475:   \mathcal F = W_2^s([0, 1]) := \Big\{ f \in L^2([0, 1]) :

1476:   |f|_{W_2^s}^2 := \int_0^1 f^{(s)}(t)^2 dt < \infty \Big\},

1477: \end{equation*}

1478: where $s$ is some natural integer and $f^{(s)}$ stands for the $s$-th

1479: derivative of $f$. In this case, the PERM is called a \emph{smoothing

1480:   spline}, since in this situation the unique minimizer

1481: of~\eqref{eq:pena_least_sq} is a spline, see for

1482: instance~\cite{wahba90} or~\cite{kohler02}. When $s = 2$ (cubic

1483: splines), the routine \texttt{smooth.spline} from the \texttt{R}

1484: software (and other software as well) neatly computes the

1485: solution to~\eqref{eq:pena_least_sq} using the B-spline basis, and

1486: chooses the parameter $h$ via generalized cross-validation (GCV). % Our

1487: % aggregation approach is an alternative to the selection of $h$ via

1488: % GCV, which is more stable when $n$ is small, see

1489: % Section~\ref{sec:simulations}.

1490:

1491: The $d$-dimensional case is easily understood with the definition of

1492: $W_2^s([0, 1]^d)$ as the space of functions $f \in L^2([0, 1]^d)$ with

1493: all derivatives of total order $s$ in $L^2([0,1]^d)$. Namely,

1494: \begin{equation*}

1495:   W_2^s([0, 1]^d) := \Big\{ f \in L^2([0, 1]^d) :

1496:   |f|_{W_2^s([0, 1]^d)}^2 < \infty \Big\},

1497: \end{equation*}

1498: where

1499: \begin{equation}

1500:   \label{eq:usual_roughness}

1501:   |f|_{W_2^s([0, 1]^d)}^2 := \sum_{\mathbf k \in \mathbb N_0^d :

1502:     |\mathbf k| = s} \frac{s

1503:     !}{\mathbf k !} \int_{[0,1]^d} ( D_{\mathbf k} f(x) )^2 dx,

1504: \end{equation}

1505: where for $\mathbf k = (k_1, \ldots, k_d)$ we use the notations

1506: $\mathbf k ! := \prod_{i=1}^d k_i !$ and $|\mathbf k| := \sum_{i=1}^d

1507: k_i$ and where $D_{\mathbf k}$ is the differential operator

1508: $\partial^s / (\partial x_1^{k_1} \cdots \partial x_d^{k_d})$. When $d > 1$,

1509: the PERM for the choice $\mathcal F = W_2^s([0, 1]^d)$ is called a

1510: \emph{thin plate spline}, see again for instance~\cite{wahba90}

1511: or~\cite{kohler02}, where the practical computation of such PERM is

1512: explained in detail. The usual assumption $s > d / 2$ gives the

1513: embedding $W_2^s([0, 1]^d) \subset C([0, 1]^d)$ and ensures that

1514: Assumption~$(C_\beta)$ holds, see~\cite{birman_solomjak67}. The

1515: situation where $s$ is not an integer is a particular case of what we

1516: do in Section~\ref{sec:anisotropic_besov} below. The case where

1517: $\mathcal F$ is a Sobolev space is actually a particular case of both

1518: the next sections. Indeed, it is well known (see~\cite{wahba90} for

1519: instance) that a Sobolev space is a Reproducing Kernel Hilbert Space

1520: (RKHS) for an appropriate kernel choice, and that it is also a Besov

1521: space $B_{2, 2}^s$.

1522:

1523: % \texttt{verifier le lien besov et sobolev multidim... dire que

1524: %   sobolev est un cas particuler du rkhs, et que c'est le bon point

1525: %   de vue pour le calcul des thin plates, citer le mec qui fait ca a

1526: %   la fin dans le bouquin.... }

1527:

1528: % Using the B-Spline basis (see~\cite{devore_lorentz93} for a precise

1529: % definition), the minimization~\eqref{eq:pena_least_sq} can be

1530: % written as a ridge regression problem, with a solution that can be

1531: % computed directly via the resolution of the corresponding linear

1532: % system.

1533:

1534:

1535: \subsection{Reproducing Kernel Hilbert Spaces}

1536: \label{sec:RKHS}

1537:

1538:

1539: Reproducing Kernel Hilbert Spaces (cf.~\cite{aronszajn50}), RKHS for

1540: short, provide a unified context for regularization in a wide variety

1541: of statistical models. Computational properties of estimators obtained

1542: by minimization of a functional over a RKHS make these function spaces

1543: very useful for statisticians. In this short section, we briefly

1544: recall some definitions and computational properties of RKHS.

1545:

1546: Let $\cX$ be an abstract space (in this paper, we take

1547: $\cX=[0,1]^d$). We say that $K:\cX\times\cX\longmapsto\mathbb{R}$ is a

1548: {\it reproducing kernel}, RK for short, if for any integer $p$ and any

1549: points $x_1,\ldots,x_p$ in $\cX$, the matrix $(K(x_i,x_j))_{1\leq

1550:   i,j\leq p}$ is symmetric positive definite. Let $K$ be a RK. The

1551: Hilbert space associated with $K$, called {\it Reproducing Kernel

1552:   Hilbert Space} and denoted by $\cH_K$, is the completion of the

1553: space of all the finite linear combinations $\sum_j a_j K(x_j,\cdot)$

1554: endowed with the inner product $\prodsca{\sum_j a_j

1555:   K(x_j,\cdot)}{\sum_k b_k K(y_k,\cdot)}_{K}=\sum_{j,k}a_j b_k

1556: K(x_j,y_k)$. We denote by $|\cdot|_K$ the associated norm on $\cH_K$.

1557:

1558: The representer theorem (see~\cite{kimeldorf_wahba71} for results on

1559: optimization in RKHS) is at the heart of minimization of functionals

1560: over RKHS. The solution of the minimization problem

1561: \begin{equation}

1562:   \label{eq:RKHS_estimator}

1563:   \bar{f} \in \argmin_{f \in \cH_K} \{ R_n(f) + h^2|f|_{\cH_K}^2 \}

1564: \end{equation}

1565: is the linear combination

1566: \begin{equation*}

1567:   \bar{f} (\cdot) = \sum_{i=1}^n \alpha_i K(X_i,\cdot),\mbox{ where }

1568:   \boldsymbol {\alpha} = (\alpha_i)_{1 \leq i \leq n} = (\mathbf K_X +

1569:   n h^2 \mathbf I_n)^{-1} \mathbf Y,

1570: \end{equation*}

1571: where $\mathbf K_X$ is the Gram matrix $(K(X_i,X_j))_{1\leq i,j\leq

1572:   n}$, where $\mathbf Y = (Y_1, \ldots, Y_n)$ and where $\mathbf I_n$

1573: is the identity matrix in $\mathbb R^n$. There are many different ways

1574: to simplify the computation of the coefficients $\boldsymbol{\alpha}$,

1575: see for instance~\cite{amato_antoniadis_pensky06}.

1576:

1577: In order to derive convergence rates for the estimator defined

1578: in~\eqref{eq:RKHS_estimator} from Theorem~\ref{thm:least_sq}, we use

1579: some results about covering numbers of RKHS obtained

1580: in~\cite{cucker_smale02} (other results on the entropy of RKHS can be

1581: found in \cite{SS:07,CS:98}). Let us now assume that $P_X$ is a Borel

1582: measure. If $K$ is a {\it Mercer kernel} (this is a continuous

1583: reproducing kernel), the RKHS associated with $K$ is the set

1584: \begin{equation*}

1585:   \label{eq:Mercer_kernel}

1586:   \cH_K=\Big\{f\in L_2(P_X): f=\sum_{j=1}^\infty a_j \psi_j \mbox{

1587:     s.t. } \sum_{j=1}^\infty \lambda_j^{-1} a_j^2 < \infty\Big\},

1588: \end{equation*}

1589: where $(\lambda_j)_{j\geq1}$ is the sequence of decreasing eigenvalues

1590: of the operator

1591: \begin{equation*}

1592:   L_K:\left\{\begin{array}{ccc}

1593:       L^2(P_X) & \longrightarrow & L^2(P_X)\\

1594:       f        & \longmapsto     & \int_\cX K(\cdot,y)f(y)dP_X(y)

1595:     \end{array} \right.

1596: \end{equation*}

1597: and $(\psi_j)_{j\geq1}$ is the sequence of corresponding

1598: eigenvectors. According to Proposition~9 and Theorem~D in

1599: \cite{cucker_smale02}, if for any $k\geq1$ the $k$-th eigenvalue of

1600: $L_K$ is such that

1601: \begin{equation}

1602:   \label{eq:rkhs_eigenvalue}

1603:   \lambda_k \leq C k^{-l}

1604: \end{equation}

1605: for some $C > 0$ and $l > 1/2$ then the entropy of $B_K(R) := \{f \in

1606: \cH_K : |f|_K \leq R\}$ satisfies for any $\delta > 0$:

1607: % the ball of radius $R$ of the RKHS $\cH_K$, denoted by

1608: \begin{equation*}

1609:   H_\infty(\delta, B_K(R)) \leq \Big(\frac{2 R C_l}{\delta}

1610:   \Big)^{1/l},

1611: \end{equation*}

1612: where $C_l$ is slightly greater than $6Cl^l$. In this case,

1613: Theorem~\ref{thm:least_sq} and the arguments from

1614: Section~\ref{sec:derive_adaptive} give the following result.

1615:

1616: \begin{corollary}[Adaptive upper bound for RKHS]

1617:   \label{cor:rkhs}

1618:   Let $\bar f$ be defined by~\eqref{eq:RKHS_estimator} with a

1619:   reproducing kernel $K$ such that the eigenvalues of the operator

1620:   $L_K$ satisfy~\eqref{eq:rkhs_eigenvalue}. Then, if $h = a n^{-l /

1621:     (2l + 1)}$ and $\norm{\bar f - f_0}_\infty \leq Q$, we have

1622:   \begin{equation*}

1623:     E^n \norm{\bar f - f_0}_{L^2(P_X)}^2 \leq C_2 (1 +

1624:     |f_0|^2_{\mathcal H_K}) n^{-2l / (2l + 1)}

1625:   \end{equation*}

1626:   when $n$ is large enough.

1627:

1628:   Now, let $L = [l_{\min}, l_{\max}]$ where $l_{\min} > 1/2$ and

1629:   $(\mathcal H_l : l \in L)$ be a family of nested RKHS. Assume that

1630:   the kernel of each $\mathcal H_l$

1631:   satisfies~\eqref{eq:rkhs_eigenvalue}. Let $\hat {\mathsf f}_n$ be

1632:   the aggregated estimator defined by Steps~1-3 with $\Lambda_n = \{

1633:   \lambda = (n^{-l / (2l + 1)}, \mathcal H_l) : l \in L_n \}$ and $L_n

1634:   := \{ l_{\min}, l_{\min} + (\log n)^{-1}, \ldots, l_{\max} \}$. We

1635:   have, if $f_0 \in \mathcal H_l$ for some $l \in L$,

1636:   \begin{equation*}

1637:     E^n \norm{\hat {\mathsf f}_n - f_0}_{L^2(P_X)}^2 \leq C_2 (1 +

1638:     |f_0|^2_{\mathcal H_l}) n^{-2l / (2l + 1)}

1639:   \end{equation*}

1640:   when $n$ is large enough.

1641: \end{corollary}

1642:

1643:

1644:

1645: \subsection{Anisotropic Besov spaces}

1646: \label{sec:anisotropic_besov}

1647:

1648:

1649: In nonparametric estimation literature, Besov spaces are of particular

1650: interest since they include functions with \emph{inhomogeneous

1651:   smoothness}, for instance functions with rapid oscillations or

1652: bumps. Roughly, these spaces are used in statistics when we want to

1653: prove theoretically that some adaptive estimator is able to recover

1654: the details of a function. When one considers a multivariate

1655: regression, the question of anisotropic smoothness naturally arises.

1656: Anisotropy means that the smoothness of $f_0$ differs from one

1657: coordinate to another. As far as we know, adaptive estimation of a multivariate

1658: curve with anisotropic smoothness was previously considered only in

1659: Gaussian white noise or density models, see~\cite{hoffmann_lepski02},

1660: \cite{kerk_lepski_picard01}, \cite{kerk_lepski_picard07},

1661: \cite{neumann00}.  There are no results concerning the adaptive

1662: estimation of the regression with anisotropic smoothness on a general

1663: random design.

1664:

1665: In this Section, we construct, using Steps~1-3, an adaptive estimator

1666: over anisotropic Besov spaces $B_{p, q}^{\bs s}$, where $\bs s = (s_1,

1667: \ldots, s_d)$ is the vector of smoothnesses. If $\{ e_1, \ldots, e_d

1668: \}$ is the canonical basis of $\mathbb R^d$, each $s_i$ is the

1669: smoothness in the direction $e_i$. A precise definition of $B_{p,

1670:   q}^{\bs s}$ is given in

1671: Appendix~\ref{sec:appendix_approximation}. Let $\bar {\bs s}$ be the harmonic

1672: mean of $\bs s$, see~\eqref{eq:harmonic_mean}. Let us introduce two

1673: vectors $\bs s^{\min}$ and $\bs s^{\max}$ in $\mathbb R_+^d$ with

1674: positive coordinates and harmonic means $\bar {\bs s}^{\min}$ and

1675: $\bar {\bs s}^{\max}$ respectively. Assume that $\bs s^{\min} \leq

1676: {\bs s}^{\max}$, which means that $s_i^{\min} \leq s_i^{\max}$ for any

1677: $i \in \{ 1, \ldots, d \}$ and assume that $\bar {\bs s}^{\min} > d /

1678: \min(p, 2)$. In view of Theorem~\ref{thm:anisotropic_entropy} and the

1679: embedding~\eqref{eq:anisotropic_embedding} (see

1680: Appendix~\ref{sec:appendix_approximation}), we know that Assumption

1681: $(C_\beta)$ holds for every $B_{p, \infty}^{\bs s}$ such that $\bs s

1682: \geq \bs s^{\min}$ with $\beta = d / \bar {\bs s}$ (and every $B_{p,

1683:   q}^{\bs s}$, since $B_{p, q}^{\bs s} \subset B_{p, \infty}^{\bs

1684:   s}$), where $\bar {\bs s}$ is the harmonic mean of $\bs s$. Consider

1685: the ``cube of smoothness''

1686: \begin{equation}

1687:   \label{eq:smoothness_cube}

1688:   \bs S := \prod_{i=1}^d [s_i^{\min}, s_i^{\max}],

1689: \end{equation}

1690: and consider the uniform discretization of this cube with step $(\log

1691: n)^{-1}$:

1692: \begin{equation}

1693:   \label{eq:discr_smoothness_cube}

1694:   \bs S_n := \prod_{i=1}^d  \big\{ s_i^{\min}

1695:   + k (\log n)^{-1} :1\leq k \leq [ (s_i^{\max} - s_i^{\min}) \log n ]

1696:   \big\},

1697: \end{equation}

1698: and the set of parameters

1699: \begin{equation*}

1700:   \Lambda(\bs S) := \{ \lambda = (n^{- \bar {\bs s} / (2 \bar {\bs s}

1701:     + d)}, B_{p, q}^{\bs s}) : \bs s \in \bs S_n \}.

1702: \end{equation*}

1703: Now, we compute, following Steps~1-3, the aggregated estimator $\hat

1704: {\mathsf f}_n^{\bs S}$ with set of parameters $\Lambda(\bs S)$ (see

1705: the beginning of the section). Following the arguments from

1706: Section~\ref{sec:derive_adaptive}, we can prove in the following

1707: Corollary~\ref{cor:anisotropic_besov_rate} that $\hat {\mathsf

1708:   f}_n^{\bs S}$ is adaptive over the whole range of anisotropic Besov

1709: spaces $\{ B_{p, q}^{\bs s} : \bs s \in \bs S \}$.

1710:

1711: % the want to construct an estimator which is adaptive

1712: % over the whole range of anisotropic Besov spaces $\{ B_{p, q}^{\bs s}

1713: % : \bs s \in \bs S \}$. This is done in two steps:

1714: % \begin{enumerate}

1715: % \item First, using the training sample, compute the family of PERM

1716: %   (see Definition~\ref{def:perm})

1717: % \begin{equation*}

1718: %   F(\bs S) := \{ \bar f_\lambda : \lambda \in \Lambda(\bs S) \}

1719: % \end{equation*}

1720: % where

1721:

1722: % where $s$ is the harmonic mean of $\bs s$. In

1723: % Definition~\ref{def:perm}, we can take $\alpha = p$, see Remark ???

1724: % above (\texttt{remarque sur les sequence spaces...}.

1725: % \item Then, consider $F(\bs S)$ as a family of weak estimators, and

1726: %   apply the aggregation algorithm on it. Namely, we compute the

1727: %   aggregate

1728: %   \begin{equation*}

1729: %     \hat {\mathsf f}_n^{\bs S} := \sum_{\lambda \in \Lambda(\bs S)}

1730: %     \theta(\bar f_\lambda ) \bar f_\lambda,

1731: %   \end{equation*}

1732: %   where the weights $\theta(\bar f)$ are given by~\eqref{eq:weights}.

1733: % \end{enumerate}

1734:

1735:

1736: % The adaptive upper bound stated in

1737: % Corollary~\ref{cor:anisotropic_besov_rate} follows from the arguments

1738: % from Section~\ref{sec:derive_adaptive}.

1739:

1740:

1741: % An immediate consequence of Theorem~\ref{thm:least_sq} is the

1742: % following convergence rate of the PERM in the anisotropic Besov space

1743: % $B_{p, \infty}^{\bs s}$ (see Section~\ref{sec:appendix_approximation}

1744: % for a definition) where we recall that

1745:

1746:

1747: \begin{corollary}

1748:   \label{cor:anisotropic_besov_rate}

1749:   Assume that $\norm{\bar f - f_0}_\infty \leq Q$ for every $\bar f

1750:   \in F(\bs S)$. If $f_0 \in B_{p, q}^{\bs s}$ for some $\bs s \in \bs S$,

1751:   then

1752:   \begin{equation*}

1753:     E^n \norm{\hat {\mathsf f}_n^{\bs S} - f_0}_{L^2(P_X)}^2 \leq C

1754:     n^{-2 \bar {\bs s} / (2 \bar {\bs s} + d)}

1755:   \end{equation*}

1756:   when $n$ is large enough, where $C$ is a constant depending on $\bs

1757:   S, d$ and $Q$.

1758:   % Let $\bar f_\lambda$ be the same as in Theorem~\ref{thm:least_sq}

1759:   % with $\mathcal F = B_{p, \infty}^{\bs s}$ and $h = a n^{-s / (2s +

1760:   %   d)}$ where $s$ is the harmonic mean of $\bs s$. Assume that $s >

1761:   % d / p$ and that $\norm{\bar f_\lambda - f_0}_\infty \leq Q$ and

1762:   % $\norm{\alpha_0}_\infty \leq Q$ for some constant $Q > 0$. Then,

1763:   % uniformly over the ball $B_{p,\infty}^{\bs s}(R) = \{ f :

1764:   % |f|_{B_{p,\infty}^{\bs s}} \leq R \}$, we have\textup:

1765:   % \begin{equation*}

1766:   %   \sup_{f_0 \in B_{p, \infty}^{\bs s}(R)} E \norm{\bar f_\lambda -

1767:   %     f_0}^2 \leq C_3 (1 + R^2) n^{-2s / (2s + d)}

1768:   % \end{equation*}

1769:   % when $n$ is large enough.

1770: \end{corollary}

1771:

1772: % Note that the same result holds for any $B_{p, q}^{\bs s}$ with $q >

1773: % 0$ because of the embedding $B_{p, q}^{\bs s} \subset B_{p,

1774: %   \infty}^{\bs s}$.

1775: In Corollary~\ref{cor:anisotropic_besov_rate} we recover the

1776: ``expected'' minimax rate $n^{-2 \bar {\bs s} / (2 \bar {\bs s} + d)}$

1777: of estimation of a $d$-dimensional curve in a Besov space. Note that

1778: there is no regular or sparse zone here, since the error of estimation

1779: is measured with the $L^2(P_X)$ norm. A minimax lower bound over $B_{p,

1780:   q}^{\bs s}$ can be easily obtained using standard arguments, such as

1781: the ones from~\cite{tsybakov03}, together with Bernstein estimates

1782: over $B_{p, q}^{\bs s}$ that can be found in~\cite{hochmuth02}. Note

1783: that the only assumption required on the design law in this corollary

1784: is the compactness of its support.

1785:

1786:

1787: % This theorem proves that $\hat {\mathsf f}_n^{\bs S}$ is adaptive over

1788: % the whole range of anisotropic Besov spaces $\{ B_{p, q}^{\bs s} : \bs

1789: % s \in \bs S \}$. Its proof, which can be found in

1790: % Section~\ref{sec:proof_main_results}, is an easy consequence of

1791: % Theorems~\ref{thm:least_sq} and~\ref{thm:oracle}, together with the

1792: % embedding and entropy properties of these spaces, which are given in

1793: % Appendix~\ref{sec:appendix_approximation}.

1794:

1795: % \texttt{rajouter en remarque estimation pour differents p, et le fait

1796: %   qu'on peut tronquer par Q, on peut aussi mettre $Q_1$ et $Q_2$, pas

1797: %   forcement le meme Q...}

1798:

1799:

1800:

1801:

1802:

1803:

1804:

1805:

1806:

1807:

1808:

1809: % % \subsection{About the practical computation of PERM estimators}

1810:

1811: % Although the practical computation of the PERM in the RKHS case is

1812: % very easy, see Section~\ref{sec:RKHS} (it is one of the reason that

1813: % makes it so popular), the computation of the other PERM proposed in

1814: % this section is less clear. For the Sobolev PERM (smoothing spline

1815: % type estimators) in the isotropic case, this is well

1816: % understood. Indeed, the computation of the thin-plate spline is a

1817: % particular case of RKHS, so its computation ..... see ???? Wahba ou

1818: % Gyorfi kohler ???  \texttt{rajouter des trucs ici}

1819:

1820: % In the Besov case, in particular the anisotropic case, the compu

1821:

1822: % \begin{equation*}

1823: %   \mathbf f_n = \mathbf A \boldsymbol \theta

1824: % \end{equation*}

1825:

1826: % \begin{equation*}

1827: %   |f|_{B_{p,q}^s}^q := \sum_{j \geq 0} \Big( 2^{j(s + d/2 - d/p)} \Big(

1828: %   \sum_{k \in K_j } \sum_{e \in E} |\beta_{e, j, \mathbf k}|^p

1829: %   \Big)^{1/p} \Big)^q

1830: % \end{equation*}

1831: % where $E := \{ 0, 1 \}^d - \{ (0, \ldots, 0) \}$ and

1832:

1833: % \begin{equation*}

1834: %   |f|_{B_{2, 2}^{s}}^2 := \sum_{j \geq 0} 2^{2js} \sum_{\mathbf k

1835: %     \in K_j} \sum_{e \in E} |\beta_{e,j,k}|^2

1836: % \end{equation*}

1837:

1838:

1839: % \begin{example}[Lasso and Elastic estimators]

1840: %   When the complexity parameter $s$ of the class $\cF$ of functions

1841: %   within the regression function belongs to is such that $s>d/2$,

1842: %   Theorem~\ref{thm:least_sq} provides convergence rate for the

1843: %   penalized least square estimator with the semi-norm of $f$ for

1844: %   penalty term (not only for the square of the semi-norm of $f$). We

1845: %   are going to apply this result to obtain convergence rates for the

1846: %   Lasso and Elastic estimators.

1847:

1848: %   Take $M\geq2$ and $f_1,\ldots,f_M$ some functions from $[0,1]^d$ to $\mathbb{R}$. Consider the span $\cF$, in $L^2([0,1]^d)$, of these functions. That is

1849: %   \begin{equation*}

1850: %     \cF={\rm Span}(f_1,\ldots,f_M).

1851: %   \end{equation*}For identifiability reason, we will assume the following algebra  assumption:

1852: %   \begin{assumption}

1853: %     The dimension of the linear subspace $\cF\subset L^2([0,1]^d)$ is $M$.

1854: %   \end{assumption}

1855: %   Any element $f\in\cF$ is then associated with a unique vector  $\theta\in \mathbb{R}^M$ such that $f=f_\theta :=\sum_{j=1}^M\theta_j f_j$. We are going to endowed the space $\cF$ with the norm

1856: %   \begin{equation}\label{eq:Elastic_Penality}

1857: %     |f_\theta|_\cF=\omega\|\theta\|_1+(1-\omega)\|\theta\|_2,

1858: %   \end{equation}where $\omega\in[0,1]$ and $\|\theta\|_p=\big(\sum_{j=1}^M|\theta_j|^p \big)^{1/p}, \forall p\geq1$.

1859: %   The penalized least squares estimator with the penalty term given by~\eqref{eq:Elastic_Penality} is called the {\it elastic estimator }. When $\omega=1$, the elastic estimator is the {\it Lasso estimator}.

1860:

1861: %   Within this framework, the set $\{f_1,\ldots,f_M\}$ is usually called the {\it dictionary}. When $M=d$, $f_j(x)=x_j$ (for any $x=(x_1,\ldots,x_d)\in [0,1]^d$ and $j\in\{1,\ldots,d\}$) and $f_0$ is assumed to belonging to $\cF$, model \eqref{eq:model} is the classical gaussian linear regression model

1862: %   \begin{equation}

1863: %     \label{eq:Model_Linear_Gaussian}

1864: %     \mathbf{Y}=\mathbf{X}\theta_0+\sigma(X)\boldsymbol{\varepsilon},

1865: %   \end{equation}where $\mathbf{Y}=(Y_1,\ldots,Y_n)^t$, $\mathbf{X}$ is the matrix $n\times d$ with lines $X_i^t,i=1,\ldots,n$, $\theta_0\in\mathbb{R}^d$ is such that $f_0=f_{\theta_0}$ and $\boldsymbol{\varepsilon}$ is the vector of noise $(\varepsilon_1,\ldots,\varepsilon_M)^t$. Lasso and Elastic estimators are usually studied in this framework.

1866:

1867: %   We are going to study elastic estimators for a general dictionary. We are not going to deal with the problem of {\it Sign consistency} of the Lasso estimator but only with the convergence rate of this estimator and of the more general elastic estimator. For that, we assume the classical geometric assumption on the dictionary:

1868: %   \begin{assumption}\label{As:Isometry_Gram_Matrix}

1869: %     Let $\Gamma=(\prodsca{f_i}{f_j})_{1\leq i,j\leq M}$ be the Gram matrix of the dictionary $\{f_1,\ldots,f_M\}$ for the inner product $\prodsca{f}{g}=\int_{[0,1]^d}fgdP_X$. We assume that, there exists an absolute constant $c>0$ such that for any vector $\theta\in \mathbb{R}^d$, we have \begin{equation*}\theta^t \Gamma \theta\geq c \|\theta\|_2^2.\end{equation*}

1870: %   \end{assumption}

1871:

1872: %   We don't need to split the sample thus we take $m=n$ observations to construct the estimators. We take $\bar{\theta}\in\mathbb{R}^M$ such that

1873: %   \begin{equation}

1874: %     f_{\bar{\theta}}\in \argmin_{f_\theta\in \cF} \big[\frac{1}{n}\sum_{i=1}^n(Y_i-f_\theta(X_i))^2+h^2 |f_\theta|_\cF\big]

1875: %   \end{equation}where the norm $|\cdot|_\cF$ is defined in equation~\eqref{eq:Elastic_Penality}. Assumption~\ref{As:Isometry_Gram_Matrix} yields $c\|\bar{\theta}-\theta_0\|_2^2\leq \|f_{\bar{\theta}}-f_0\|_2^2$. To obtain rates of convergence using Theorem~\ref{thm:least_sq}, we have to control the entropy of $L_\infty$-balls of the model $\cF$. It is easy to see that \begin{equation*}

1876: %     H(\delta,\cF(R),\|\cdot\|_\infty)\leq M \log\big(\frac{2MR}{\delta} \big), \mbox{ where } M=\max_{1\leq j \leq M}\|f_j\|_\infty.

1877: %   \end{equation*} \texttt{Il faut regarder pour quels $R/\delta$ le plus petit on applique cette inegalit\'e}. If we have $M$ such that $M \log\big(\frac{2MR}{\delta} \big)\leq D (R/\delta)^{d/s}$ then, applying Proposition~\ref{prop:least_sq} \texttt{si on pouvait se passer de tronquer les estimateurs dans la Proposition 1 ce serait bien ici. Voir Einmahl et Masson?}, the elastic estimator $\bar{\theta}$ with $h\geq a n^{s/(2s+d)}$ satisfies

1878: %   \begin{equation*}

1879: %   \mathbb{E}\|\bar{\theta}-\theta_0\|_2^2\leq C(\theta_0)h^2,

1880: % \end{equation*}where $C(\theta_0)\leq

1881: % C_1/c+2(\omega\|\theta\|_1+(1-\omega)\|\theta\|_2)/c$.

1882: % \end{example}

1883:

1884: % Usually, the ``roughness'' of a function $f \in W_s$ is measured by ,

1885: % consisting of a subsample of size $m < n$ of the whole sample $D_n$

1886: % (for more details about splitting the sample, see below.)

1887:

1888:

1889:

1890:

1891:

1892: % The next corollary is an approximation type result.  \texttt{resultat

1893: %   d'approximation ici}. Let $\tilde f_{(s, h)}$ be given by

1894: % \begin{equation*}

1895: %   \tilde f_{(s, h)} := \argmin_{\tilde f \in W_s} \big\{ \norm{f -

1896: %     \tilde f}_n^2 + \pen(\tilde f) \big\},

1897: % \end{equation*}

1898: % where $\pen(f)$ is given by~\eqref{eq:pen}. A consequence of

1899: % Theorem~\ref{thm:least_sq} is as follows.

1900:

1901: % \begin{corollary}

1902: %   \label{cor:spline_approx}

1903: %   Under the same assumptions as in Theorem~\ref{thm:least_sq}, we have

1904: %   \begin{equation*}

1905: %     E_T \norm{\tilde f - f}_{L^2(P_T)}^2 \leq C h^2,

1906: %   \end{equation*}

1907: %   where $E_T$ is the joint law of of $(T_1, \ldots, T_n)$.

1908: % \end{corollary}

1909:

1910:

1911:

1912: % Then, we use again

1913: % Lemma~\ref{thm:devia1}: we consider this time the event $\mathcal

1914: % B_{f_0}( z_1, \gamma_m)$, where $z_1 > 0$ is a fixed constant given by

1915: % Lemma~\ref{thm:devia1}. We have this time

1916: % \begin{equation}

1917: %   \label{eq:deviaB2}

1918: %   P\big[ \mathcal B_{f_0}( z_1, \gamma_m)^\complement \big] \leq \exp(

1919: %   -D_3 (\log m)^{-d(1 + d / s) / s} h^{-d/s} ),

1920: % \end{equation}

1921: % where $D_3 := D_1 z_1^2 (4\alpha)^{-d / (2s)}$ and in view

1922: % of~\eqref{eq:thm1trick}, we have

1923: % \begin{equation*}

1924: %   \norm{\bar f - f_0}_m^2 + \pen(\bar f) \leq 16 z_1^2 \alpha h^2

1925: % \end{equation*}

1926: % on $\mathcal B_{f_0}( z_1, \gamma_m)$. Then, if

1927: % \begin{equation*}

1928: %   \mathcal B := \mathcal B_{f_0}( (\log m)^{1 + d / (2s)}, 2 \sigma_1

1929: %   t_m ) \cap \mathcal B_{f_0}(z_1, \gamma_m),

1930: % \end{equation*}

1931: % we have $P[ \mathcal B^\complement ] = o(h^2)$. Putting all this

1932: % together, we obtain:

1933: % \begin{align*}

1934: %   E \norm{\bar f - f_0}^2 &\leq \frac{(Q+Q_n)^2}{m} + 10 z_0 h^2 +

1935: %   o(h^2) \\

1936: %   &+ E[ A_2 \ind{\norm{\varepsilon}_m \leq t_m} (\ind{e_m \leq b_m} +

1937: %   \ind{b_m \leq e_m} (\ind{\mathcal B} + \ind{\mathcal

1938: %     B^\complement} ) ) ] \\

1939: %   &\leq (10 z_0 + 16 z_1^2 \alpha + 1 + \norm{f_0}_\infty^2 +

1940: %   \tilde J(f_0)^2 ) h^2 + o(h^2).

1941: %   % &\leq \frac{(Q+Q_n)^2}{m} + 10 z_0 h^2 + o(h^2) + \pend(f_0) + 16

1942: %   % z_1^2 \alpha h^2 + \\

1943: %   % &+

1944: %   % &= E[ A_1 ] +  \\

1945: %   % &\leq ( 10 z_0 + 16 z_1^2 \alpha + 16 J_s(f)^2 ) h^2 + \Delta_n \\

1946: %   % &= \big( C + 16 (\norm{f}_\infty^2 + \smallint f^{(s)}(t)^2 dt)

1947: %   % \big) h^2 + \Delta_n

1948: % \end{align*}

1949: % % where $C := 10 z_0 + 16 (z_1^2 \alpha + 1)$ and where $\Delta_n :=

1950: % % P[ A_1 \geq 10 z_0 h^2 ] + P[\mathcal B^{\complement}]$ is such that

1951: % % $n^\beta \Delta_n$ goes to $0$ for any $\beta > 0$, in view

1952: % % of~\eqref{eq:deviaA1}, \eqref{eq:deviaB1} and~\eqref{eq:deviaB2}.

1953: % This concludes the proof of Theorem~\ref{thm:least_sq}.

1954: % \end{proof}

1955:

1956:

1957: % \subsection*{Proof of Corollary~\ref{cor:spline_approx}}

1958:

1959:

1960: % As in the proof of Theorem~\ref{thm:least_sq}, we have

1961: % \begin{equation*}

1962: %   E \norm{\bar f - f}^2 \leq 10 z_0 h^2 + 2 Q^2 P[A_1 \geq 10 z_0 h^2] +

1963: %   E[ A_2 ],

1964: % \end{equation*}

1965: % where in view of Lemma~\ref{lem:devia2} we have $ P[ A_1 \geq 10 z_0

1966: % h^2] \leq \exp( -n h^2)$, and by the definition of $\tilde f_{(s,

1967: %   h)}$, we have

1968: % \begin{equation*}

1969: %   \norm{f - \tilde f}_n^2 + \pen(\tilde f) \leq \norm{f - f^*}_n^2 +

1970: %   \pen(f^*) \quad \forall f^* \in W_s,

1971: % \end{equation*}

1972: % which gives $\norm{f - \tilde f}_n^2 + \pen(\tilde f) \leq \pen(f)$ if

1973: % $f^* = f \in W_s$. This concludes the proof of the corollary. \hfill

1974: % $\square$

1975:

1976:

1977: % But since the Cauchy-Schwarz inequality gives

1978: % \begin{equation*}

1979: %   0 \leq 2 \norm{Y - f}_n \norm{\bar f - f}_n + \pen(f) - \pen(\bar f)

1980: %   \leq 2 \sigma \norm{\bar f - f}_n + \pen(f) - \pen(\bar f),

1981: % \end{equation*}

1982: % we have that necessarily,

1983: % \begin{equation}

1984: %   \label{eq:trick_pen}

1985: %   \pen(\bar f) \leq 2 \sigma \norm{\bar f - f}_n + \pen(f).

1986: % \end{equation}

1987: % This gives

1988: % \begin{equation*}

1989: %   B \leq 4 (  )

1990: % \end{equation*}

1991:

1992: % We can rewrite it in the following way:

1993: % \begin{align*}

1994: %   \sqrt{n} \norm{\bar f - f}_n^{2k + 1 / (2k)} &\leq \frac{2

1995: %     \sqrt{n} \prodsca{Y - f}{\bar f - f}_n }{ \norm{\bar f - f}_n^{1

1996: %       - 1/(2k)} } + \frac{\sqrt{n} (\pen(f) - \pen(\bar f)) }{

1997: %     \norm{\bar f -

1998: %       f}_n^{1 - 1/(2k)} } \\

1999: %   &=: e_n + b_n.

2000: % \end{align*}

2001:

2002:

2003: %\section{Adaptation}

2004: %\label{sec:adaptation}

2005:

2006:

2007:

2008:

2009: % \subsection{Adaptative estimation over anisotropic Besov spaces}

2010:

2011: \section{Empirical study}

2012: \label{sec:simulations}

2013:

2014: In this section, we compare empirically our aggregation procedure with

2015: the popular cross-validation (CV) and generalized cross-validation

2016: (GCV) procedures for the selection of the smoothing parameter $h$ (see

2017: Section~\ref{sec:about_h}) in smoothing splines (we use the

2018: \texttt{smooth.spline} routine from the \texttt{R} software, see

2019: \url{http://www.r-project.org/}). Concerning CV, GCV and smoothing

2020: splines, we refer to~\cite{wahba90}

2021: and~\cite{green_silverman94}. Those routines provide satisfactory

2022: results in most cases, in particular for the examples of regression

2023: functions considered here. However, we show that when the sample size

2024: $n$ is small (less than 50), and when the noise level is high (we take

2025: root-signal-to-noise ratio equal to $2$), then our aggregation

2026: approach is more stable, see Figure~\ref{fig:mises} below. Here, we

2027: consider two examples of regression function, given, for $x \in [-1,

2028: 1]$, by:

2029: \begin{itemize}

2030: \item \texttt{hardsine}$(x) = 2 \sin(1 + x) \sin( 2 \pi x^2 + 1)$

2031: \item \texttt{oscsine}$(x) = (x+1) \sin(4 \pi x^2 )$.

2032: \end{itemize}

2033: We simply take $X$ uniformly distributed on $[-1, 1]$ and Gaussian

2034: noise with variance $\sigma^2$ chosen so that the root-signal-to-noise

2035: ratio is $2$. In Figure~\ref{fig:examples} we show typical simulations

2036: in this setting, where $n = 30$.

2037: \begin{figure}[htbp]

2038:   \centering

2039:   \includegraphics[width=6cm]{data1.pdf}%

2040:   \includegraphics[width=6cm]{data2.pdf}%

2041: %  \includegraphics[width=4.3cm]{n30r2agg.pdf}%

2042:   \caption{Examples of simulated data, for

2043:     $f_0$\texttt{=\textup{hardsine}} \textup(left\textup) and

2044:     $f_0$\texttt{=\textup{oscsine}} \textup(right\textup)}

2045:   \label{fig:examples}

2046: \end{figure}

2047:

2048: In Figure~\ref{fig:mises}, we show the MISEs $E\norm{\hat f_n -

2049:   f_0}_n^2$ computed by Monte Carlo using $1000$ simulations of the

2050: model. The tuning of the estimators in both examples is the following:

2051: for GCV, we simply use the \texttt{smooth.spline} routine with default

2052: selection of $h$ by GCV. For CV, we use the same routine, with the

2053: option \texttt{cv=TRUE} so that CV is used instead. For aggregation,

2054: we use Steps~1--3 (see Section~\ref{sec:examples}). Step~1 is done with

2055: $m=3n/4$ and $\ell = n/4$. For Step~2, we use the

2056: \texttt{smooth.spline} routine to compute a set of weak estimators,

2057: using the option \texttt{spar=x}, where \texttt{x} lies in the set $\{

2058: 0, 0.01, 0.02, \ldots, 1 \}$. The parameter \texttt{spar} is related to

2059: the value of the smoothing parameter $h$. For Step~3, we compute the

2060: weights with temperature given by~\eqref{Tslection} (over the training

2061: sample) and the set $\mathcal T = \{ 10, 20, \ldots, 100 \}$. Then, we

2062: repeat Steps~1--3 $J=100$ times and compute the jackknifed estimator,

2063: see Section~\ref{sec:jackknife}. This gives our aggregated estimator.

2064:

2065: In Figure~\ref{fig:mises}, we plot the MISEs (the mean of the $1000$

2066: MISEs obtained for each simulation) for sample sizes $n \in \{ 20, 30,

2067: 50, 100 \}$ and in Figure~\ref{fig:sd} we plot the corresponding

2068: standard deviations. The conclusion is that for small $n$, aggregation

2069: provides a more accurate and stable estimation than the GCV or

2070: CV. When $n$ is $100$ or larger, then the aggregation procedure has

2071: roughly the same accuracy as GCV or CV.

2072:

2073: \begin{figure}[htbp]

2074:   \centering

2075:   \includegraphics[width=6cm]{mises1.pdf}%

2076:   \includegraphics[width=6cm]{mises2.pdf}%

2077:   \caption{MISE for $f_0$\textup{=\texttt{hardsine}}

2078:     \textup(left\textup) and $f_0$\textup{=\texttt{oscsine}}

2079:     \textup(right\textup)}

2080:   \label{fig:mises}

2081: \end{figure}

2082:

2083: \begin{figure}[htbp]

2084:   \centering

2085:   \includegraphics[width=6cm]{sd1.pdf}%

2086:   \includegraphics[width=6cm]{sd2.pdf}%

2087:   \caption{Standard deviation of the MISE for

2088:     $f_0$\textup{=\texttt{hardsine}} \textup(left\textup) and

2089:     $f_0$\textup{=\texttt{oscsine}} \textup(right\textup)}

2090:   \label{fig:sd}

2091: \end{figure}

2092:

2093:

2094:

2095: % \begin{table*}[htbp]

2096: %   \caption{Estimated MISE \textup(using 1000

2097: %     replications\textup) and standard deviations \textup(between

2098: %     brackets\textup) for $f = \texttt{\textup{hardsine}}$}

2099: %   \begin{tabular}{lccc}

2100: %     \hline

2101: %     $n$ & GCV &  CV &  AGG \\ \hline

2102: %     $20$ & 0.224 (0.132)  & 0.233  (0.172) & \textbf{0.188}

2103: %     (\textbf{0.089}) \\

2104: %     $30$ &  0.177 (0.124)  & 0.153 (0.103) & \textbf{0.146}

2105: %     (\textbf{0.064}) \\

2106: %     $50$ & $6.56 \times 10^{-2}$ ($9.86 \times 10^{-2}$) & $\mathbf{4.75

2107: %       \times 10^{-2}}$ ($5.53 \times 10^{-2}$) & $4.84 \times 10^{-2}$

2108: %     ($\mathbf{5.29 \times 10^{-2}}$) \\

2109: %     $100$ & $2.82 \times 10^{-3}$ ($1.24 \times 10^{-2}$) &

2110: %     $\mathbf{2.60 \times 10^{-3}}$ ($\mathbf{1.07 \times 10^{-2}}$) &

2111: %     $2.89 \times 10^{-3}$ ($1.17 \times 10^{-2}$) \\

2112: %     \hline

2113: %     \hline

2114: %   \end{tabular}

2115: %   \label{tab:mises1}

2116: % \end{table*}

2117:

2118:

2119: % \begin{table*}[htbp]

2120: %   \caption{Estimated MISE \textup(using 1000

2121: %     replications\textup) and standard deviations \textup(between

2122: %     brackets\textup) for $f = \texttt{\textup{oscsine}}$}

2123: %   \begin{tabular}{lccc}

2124: %     \hline

2125: %     $n$ & GCV &  CV &  AGG \\ \hline

2126: %     $20$ & 0.235 (0.195) & 0.167 (0.094) & 0.123 (0.09) \\

2127: %     $30$ &  &  & \\

2128:

2129: %     0.07323741  ( 0.04325123 )

2130:

2131: %     $50$ & $6.56 \times 10^{-2}$ ($9.86 \times 10^{-2}$) & $\mathbf{4.75

2132: %       \times 10^{-2}}$ ($5.53 \times 10^{-2}$) & $4.84 \times 10^{-2}$

2133: %     ($\mathbf{5.29 \times 10^{-2}}$) \\

2134: %     $100$ & $2.82 \times 10^{-3}$ ($1.24 \times 10^{-2}$) &

2135: %     $\mathbf{2.60 \times 10^{-3}}$ ($\mathbf{1.07 \times 10^{-2}}$) &

2136: %     $2.89 \times 10^{-3}$ ($1.17 \times 10^{-2}$) \\

2137: %     \hline

2138: %     \hline

2139: %   \end{tabular}

2140: %   \label{tab:mises2}

2141: % \end{table*}

2142:

2143:

2144:

2145: \section{Proofs of the main results}

2146: \label{sec:proof_main_results}

2147:

2148: We recall that $P_n$ stands for the joint law of the training sample

2149: $D_n$ conditional on $X^n := (X_1, \ldots, X_n)$, that is $P_n :=

2150: P^n[\cdot | X^n]$.

2151:

2152: % Note that if $\{ z_1, \ldots, z_p \}$ is a $\delta$-cover of $A

2153: % \subset E$, where $(E, \norm{\cdot})$ is some normed space, we can

2154: % find a $2\delta$-cover of $A$ with same size $p$ which is included

2155: % in $A$. Thus, we shall always assume without loss of generality that

2156: % a $\delta$-cover is included in the space it covers.

2157:

2158: \begin{proof}[Proof of Theorem~\ref{thm:devia1}]

2159:   First, we use the \emph{peeling} argument: we decompose $B_n(f_0,

2160:   \delta)$ into the union of the sets $S_j$ for $j \geq 0$, where for

2161:   $\delta_j := \delta 2^{-j/\beta}$

2162:   \begin{equation*}

2163:     S_j := B_n(f_0, \delta_j ) - B_n(f_0, \delta_{j+1}),

2164:   \end{equation*}

2165:   and decompose $\mathcal F$ into the union of the sets

2166:   \begin{equation*}

2167:     B_\cF(2^{k/\beta}) - B_\cF(2^{(k-1)/\beta}) = \{ f \in \mathcal F

2168:     : 2^{(k-1) / \beta} < |f|_{\mathcal F} \leq 2^{k / \beta} \},

2169:  \end{equation*}

2170:  for $k \geq 1$, where $B_{\mathcal F}(2^{k/\beta}) = \{ f \in

2171:  \mathcal F : |f|_{\mathcal F} \leq 2^{k/\beta}\}$. This gives that the

2172:  left hand side of~\eqref{eq:deviaZ_n} is smaller than

2173:    \begin{align*}

2174:      \sum_{j \geq 0} & P_n\Big[ \sup_{ \substack{f \in S_j \text{

2175:            s.t. } \\ |f|_{\mathcal F} \leq 1} } \frac{ Z(f - f_0)

2176:      }{\norm{f - f_0}_n^{1 - \beta/2} (1 + |f|_{\mathcal F})^{\beta/2}

2177:      } > z \Big] \\

2178:      &+ \sum_{j \geq 0} \sum_{k \geq 1} P_n \Big[ \sup_{ f

2179:          \in S_j\cap B_{\mathcal F}(2^{k/\beta})} \frac{ Z(f - f_0) }{\norm{f -

2180:          f_0}_n^{1 - \beta/2} (1 + |f|_{\mathcal F})^{\beta/2} } > z

2181:      \Big],

2182:    \end{align*}

2183:    which is smaller than

2184:    \begin{equation*}

2185:      \sum_{j,k \geq 0}P_n \Big[ \sup_{f \in

2186:          B_n(f_0, \delta_j)\cap B_\cF(2^{k/\beta}) }

2187:      Z(f - f_0) > z(\delta, j, k) \Big] =: \sum_{j,k \geq 0}  P_{j, k},

2188:    \end{equation*}

2189:    where $z(\delta, j, k) := z \delta_j^{1 - \beta/2}

2190:    2^{k/2-1/2}$. Let us consider, for any $\delta > 0$, a minimal

2191:    $\delta$-covering $F(\delta, k)$ of the set $B_{\mathcal

2192:      F}(2^{k/\beta})$ for the

2193:    $\norm{\cdot}_\infty$-norm. Assumption~$(C_\beta)$ implies

2194:    \begin{equation*}

2195:      | F(\delta, k) | \leq \exp\big( D (2^{k/\beta} / \delta)^{\beta} \big)

2196:      = \exp( D 2^k \delta^{-\beta} ).

2197:    \end{equation*}

2198:    Moreover, without loss of generality, we can assume that $F(\delta,

2199:    k) \subset B_{\mathcal F}(2^{k/\beta})$. For any $i \in \mathbb N$

2200:    and $j, k$ fixed, we introduce

2201:    \begin{equation}

2202:      \label{eq:Fi}

2203:      F^{(i)} := F(\delta_{i,j}, k) \text{ where } \delta_{i,j} :=

2204:      \delta_j 2^{-i/\beta} = \delta 2^{-(i+j)/\beta},

2205:    \end{equation}

2206:    and, for any $f\in B_\cF(2^{k/\beta})$ we denote by $\pi_i(f)$ an

2207:    element of $F^{(i)}$ such that $\norm{\pi_i(f) - f}_\infty \leq

2208:    \delta_{i,j}$. We have

2209:    \begin{align*}

2210:      P_{j,k} &\leq P_n\Big[ \sup_{ f \in B_n(f_0, \delta_j)\cap

2211:        B_\cF(2^{k/\beta})} | Z(\pi_0(f) - f_0) | > z(\delta, j, k) / 2

2212:      \Big] \\ & + P_n \Big[ \sup_{ f \in B_n(f_0, \delta_j) \cap

2213:        B_\cF(2^{k/\beta})} | Z(f - \pi_0(f))| > z(\delta, j, k) / 2

2214:      \Big] \\ &=: P_{j,k,1} + P_{j,k,2}.

2215:    \end{align*}

2216:    First, we consider $P_{j,k,1}$:

2217:    \begin{align*}

2218:      P_{j,k,1} \leq P_n \Big[ \sup_{f \in F^{(0)} \cap B_n(f_0,

2219:        \delta_j) } | Z(\pi_0(f) - f_0) | > z(\delta, j, k) / 2 \Big].

2220:    \end{align*}

2221:    We use~\eqref{eq:deviaZnf} and the union bound over $F^{(0)}$

2222:    together with the fact that $f \in B_n(f_0, \delta_j)$ to obtain:

2223:    \begin{equation*}

2224:      P_{j,k,1} \leq |F^{(0)}| \exp\Big( \frac{-a z^2(\delta, j, k)}{4

2225:        \delta_j^2} \Big) = \exp\Big( \frac{2^{j+k}}{\delta^{\beta}} (D - a z^2 / 8 ) \Big),

2226:    \end{equation*}

2227:    where $a := (2b^2)^{-1}$. Now, in order to control $P_{j,k,2}$, we

2228:    use the so-called chaining argument, which involves increasing

2229:    approximations by the covers $F^{(i)}$, see~\eqref{eq:Fi}. Let us

2230:    consider

2231:    \begin{equation*}

2232:      E_i := (2^{1/\beta - 1/2} - 1) 2^{-i (1/\beta-1/2) }

2233:    \end{equation*}

2234:    for $i \geq 1$ ($E_i > 0$ since $\beta \in(0, 2)$). By linearity of

2235:    $Z(\cdot)$ and since $\sum_{i \geq 1} E_i = 1$, we have

2236:    \begin{align*}

2237:      P_{j,k,2} &\leq \sum_{i \geq 1} P_n\Big[ \sup_{ \substack{ f \in

2238:          B_n(f_0, \delta_j) \\ |f|_{\mathcal F} \leq 2^{k/\beta} } } |

2239:      Z(\pi_i(f) - \pi_{i-1}(f)) | > E_i z(\delta, j, k) / 2 \Big] \\

2240:      &=: \sum_{i \geq 1} P_{i, j, k, 2}.

2241:    \end{align*}

2242:    Now, since

2243:    \begin{align*}

2244:      \norm{\pi_i(f) - \pi_{i-1}(f)}_n &\leq \norm{\pi_i(f) -

2245:        \pi_{i-1}(f)}_\infty \\

2246:      &\leq \norm{\pi_i(f) - f}_\infty + \norm{\pi_{i-1}(f) - f}_\infty \\

2247:      & \leq \delta_{i,j} + \delta_{i-1,j} = \delta_{i,j} (1 +

2248:      2^{1/\beta}),

2249:    \end{align*}

2250:    and since the number of pairs $\{ \pi_i(f), \pi_{i-1}(f) \}$ is at

2251:    most

2252:    \begin{equation*}

2253:      |F^{(i)}| \times |F^{(i-1)}| \leq \exp \Big( \frac{3 D 2^{i + j +

2254:          k}}{2 \delta^{\beta}} \Big),

2255:    \end{equation*}

2256:    we obtain using again~\eqref{eq:deviaZnf}:

2257:    \begin{align*}

2258:      P_{i, j, k, 2} &\leq |F^{(i)}| \times |F^{(i-1)}| \times

2259:      \exp\Big( \frac{-a E_i^2 z^2(\delta, j, k)}{4 \delta_{i,j}^2 (1 +

2260:        2^{1/\beta})^2} \Big) \\

2261:      &= \exp\Big( \frac{2^{i+j+k}}{\delta^{\beta}} \big( 3 D / 2 - C_1

2262:      z^2 \big) \Big)

2263:    \end{align*}

2264:    where $C_1 = C_1(\beta, a) := a(2^{1/\beta -

2265:      1/2} - 1)^2 / (8 (1 + 2^{1/\beta})^2) > 0$. Then, if we choose $z_1

2266:    := (3 D / C_1)^{1/2}$, we have for any $z \geq z_1$ and $D_1 := C_1 /

2267:    2$:

2268:    \begin{align*}

2269:      \sum_{j, k \geq 0} P_{j,k} &\leq \sum_{j,k \geq 0} \Big(

2270:      P_{j,k,1} + \sum_{i \geq 1} P_{i,j,k,2} \Big) \\

2271:      &\leq \sum_{j,k \geq 0} \Big( \exp( -D_1 2^{j+k} z^2

2272:      \delta^{-\beta} ) + \sum_{i \geq 1} \exp( -D_1 2^{i+j+k} z^2

2273:      \delta^{-\beta} ) \Big)

2274:    \end{align*}

2275:    and the Theorem follows.

2276: \end{proof}

2277:

2278:

2279: \begin{proof}[Proof of Theorem~\ref{thm:least_sq}]

2280:   For short, we shall write $\bar f$ instead of $\bar f_\lambda$, and

2281:   $\pen(f)$ instead of $\pen_\lambda(f)$. In view

2282:   of~\eqref{eq:pena_least_sq}, we have

2283:   \begin{equation}

2284:     \label{eq:f_bar_prop}

2285:     \norm{Y - \bar f}_n^2 + \pen(\bar f) \leq \norm{Y - f}_n^2 +

2286:     \pen(f) \quad \forall f \in \mathcal F,

2287:   \end{equation}

2288:   which is equivalent to

2289:   \begin{equation*}

2290:     \norm{\bar f - f}_n^2 + \pen(\bar f) \leq 2 \prodsca{Y -

2291:       f}{\bar f - f}_n + \pen(f) \quad \forall f \in \mathcal F,

2292:   \end{equation*}

2293:   where $\prodsca{f}{g}_n = n^{-1} \sum_{i=1}^n f(X_i) g(X_i)$. This

2294:   entails, since $f_0 \in \mathcal F$, that

2295:   \begin{equation}

2296:     \label{eq:trick1}

2297:     \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq

2298:     \frac{2}{\sqrt{n}} Z(\bar f - f_0) + \pen(f_0)

2299:   \end{equation}

2300:   where $Z(\cdot)$ is the empirical process given

2301:   by~\eqref{eq:Z_n_def}. Recall that $B_n(f_0, \delta)$ stands for the

2302:   ball centered at $f_0$ with radius $\delta$ for the norm

2303:   $\norm{\cdot}_n$. Let us introduce the event

2304:   \begin{equation}

2305:     \label{eq:event_Z}

2306:     \mathcal Z(z, \delta) := \Big\{ \sup_{f \in \mathcal F \cap

2307:       B_n(f_0, \delta)} \frac{ Z(f - f_0) }{\norm{f - f_0}_n^{1 -

2308:         \beta/2} (1 + |f|_{\mathcal F})^{\beta/2} } \leq z \Big\}.

2309:   \end{equation}

2310:   In view of Theorem~\ref{thm:devia1}, see

2311:   Section~\ref{sec:process_Z0}, we can find constants $z_1 > 0$ and

2312:   $D_1 > 0$ such that\textup:

2313:   \begin{align*}

2314:     P_n\big[ \mathcal Z(z, \delta)^\complement \big] \leq \exp( - D_1

2315:     z^2 \delta^{-\beta} ),

2316:   \end{align*}

2317:   for any $\delta > 0$ and $z \geq z_1$. When $2 n^{-1/2} Z(\bar f -

2318:   f_0) \leq \pen(f_0)$, we have $\norm{\bar f - f_0}_n^2 \leq 2

2319:   \pen(f_0)$. When $2 n^{-1/2} Z(\bar f - f_0) \geq \pen(f_0)$, we

2320:   have, for any $z>0$, in view of~\eqref{eq:trick1}, whenever $\bar f \in B_n(f_0,

2321:   \delta)$ for some $\delta > 0$, that on $\mathcal Z(z, \delta)$,

2322:   \begin{equation*}

2323:     \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq \frac{4 z}{\sqrt{n}}

2324:     \norm{\bar f - f_0}_n^{1 - \beta/2} (1 + |\bar f|_{\mathcal

2325:       F})^{\beta/2}.

2326:   \end{equation*}

2327:   If $|\bar f|_{\mathcal F} \leq 1$, this entails

2328:   \begin{equation*}

2329:     \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq ( a^{-2}(2^\beta 4

2330:     z)^{4 / (2 + \beta)} + 1) h^2.

2331:   \end{equation*}

2332:   Otherwise, we have

2333:   \begin{equation*}

2334:     \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq \frac{2^{\beta/2} 4

2335:       z}{\sqrt{n}} \norm{\bar f - f_0}_n^{1 - \beta/2} |\bar

2336:     f|_{\mathcal F}^{\beta/2},

2337:   \end{equation*}

2338:   and we use the following lemma.

2339:   \begin{lemma}

2340:     \label{lem:logtrick}

2341:     Let $r, I, h, \varepsilon$ be positive numbers, $\beta \in (0, 2)$

2342:     and $\alpha > 2\beta / (\beta + 2)$. Then, if

2343:     \begin{equation}

2344:       \label{eq:logtrick}

2345:       r^2 + h^2 I^\alpha \leq \varepsilon\, r^{1 - \beta / 2} I^{\beta / 2},

2346:     \end{equation}

2347:     we have

2348:     \begin{equation*}

2349:       r \leq ( \varepsilon^{\alpha} h^{-\beta} )^{2 / (2\alpha + \alpha \beta

2350:         -  2 \beta)}, \quad I \leq (\varepsilon^2

2351:       h^{-(\beta + 2)})^{2 / (2 \alpha + \alpha \beta - 2\beta)}

2352:     \end{equation*}

2353:     and consequently

2354:     \begin{equation*}

2355:       r^2 + h^2 I^\alpha \leq 2 (\varepsilon^\alpha

2356:       h^{-\beta})^{4/(2\alpha + \alpha \beta - 2\beta)}.

2357:     \end{equation*}

2358:   \end{lemma}

2359:   The proof of this Lemma is given in Section~\ref{sec:lemmas_proofs}

2360:   below. It entails, since $h = a n^{-1 / (2 + \beta)}$ and $\alpha >

2361:   2\beta / (\beta+2)$, that

2362:   \begin{equation*}

2363:     \norm{\bar f - f_0}_n^2 + h^2 |\bar f|_{\mathcal F}^{\alpha} \leq

2364:     2 ((2^{\beta/2} 4 z)^{\alpha} a^{-\beta})^{4 / (2\alpha + \alpha

2365:       \beta - 2\beta)} n^{-2 / (\beta+2)}.

2366:   \end{equation*}

2367:    Thus,

2368:   when $\bar f \in B_n(f_0, \delta)$, we have on $\mathcal Z(z, \delta)$:

2369:   \begin{equation*}

2370:     \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq p(z)^2 h^2

2371:   \end{equation*}

2372:   where

2373:   \begin{equation*}

2374:     p(z)^2 := C_1 (1 + z^{4 / (2 + \beta)} + z^{4\alpha / (2\alpha + \alpha

2375:       \beta - 2\beta)})

2376:   \end{equation*}

2377:   and $C_1$ is a constant depending on $\alpha, \beta$ and $a$.

2378:   % It can be readily seen that this inequality entails

2379: %   \begin{align}

2380: %     \label{eq:thm1trick}

2381: %     \nonumber \norm{\bar f - f_0}_n &\leq \Big(\frac{z 4

2382: %       2^{\beta/2}}{m^{\alpha} h^{2 \beta}} \Big)^{1/(2\alpha + \alpha

2383: %       \beta - 2\beta)} \\ &\leq (z 4 2^{\beta/2} a^{-\beta/\alpha}

2384: %     )^{2 \alpha/(2\alpha + \alpha \beta - 2\beta)} m^{-1 / (2 + \beta)},

2385: %   \end{align}

2386: %   where we used~\eqref{eq:bandwidth}, and

2387: %   \begin{equation}

2388: %     \label{eq:delta1}

2389: %     \norm{\bar f - f_0}_n \leq C (1 + |f_0|^{d /(2s+d)}) p(z) m^{-s /

2390: %       (2s + d)} =: p(z) \delta_1,

2391: %   \end{equation}

2392: %   on $\mathcal Z(z, \delta)$, where $p(z) := (z^{2\alpha s / (\alpha

2393: %     (2s + d) - 2d)} \vee z^{d / (2s+d)})$ and $C := 2^{(4s+d)/(2s+d)}

2394: %   \vee (4 2^{d/(2s)} a^{-d / (2\alpha s )})^{2\alpha s / (2\alpha s +

2395: %     \alpha d - 2 d)}$.

2396:   Let us assume for now that $\norm{\bar f - f_0}_n \leq \delta$ for

2397:   some $\delta > 0$, and let us introduce

2398:   \begin{equation*}

2399:     \mathcal Z_1(z, \delta) := \mathcal Z(z, \delta) \cap \mathcal

2400:     Z(z_1, p(z) h),

2401:   \end{equation*}

2402:   where $z_1$ is a constant coming from Theorem~\ref{thm:devia1}. On

2403:   $\mathcal Z_1(z, \delta)$, we have

2404:   \begin{equation}

2405:     \label{eq:on_Z1}

2406:     \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq p(z_1)^2 h^2.

2407:   \end{equation}

2408:   Indeed, we have $\bar{f}\in B_n(f_0,\delta)$; thus, on $\mathcal Z(z, \delta)$, $\norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq p(z)^2 h^2$, and so

2409:   $\norm{\bar f - f_0}_n^2\leq p(z)^2 h^2$, that is, $\bar f \in B_n(f_0, p(z) h)$. Thus, applying the same argument on the event $\mathcal

2410:     Z(z_1, p(z) h)$, we have~\eqref{eq:on_Z1}. Moreover, Theorem~\ref{thm:devia1} yields

2411:   \begin{equation}

2412:     \label{eq:deviaB1}

2413:     P_n \big[ \mathcal Z_1( z, \delta)^\complement \big] \leq \exp(

2414:     -D_1 z^2 \delta^{-\beta}) + \exp( -D_1 z_1^2 (p(z) h)^{-\beta}).

2415:   \end{equation}

2416:   Now, in view of~\eqref{eq:f_bar_prop} and since $f_0 \in \mathcal

2417:   F$, we have the following rough upper bound:

2418:   \begin{align}

2419:     \nonumber \norm{\bar f - f_0}_n^2 + \pen(\bar f) &\leq 2

2420:     (\norm{\bar f - Y}^2_n + \pen(\bar f) ) + 2 \norm{f_0 - Y}_n^2

2421:     \\ \nonumber &\leq 2 ( \norm{f_0 - Y}_n^2 + \pen(f_0)) + 2

2422:     \norm{f_0 - Y}_n^2 \\

2423:     \label{eq:rough1}

2424:     &\leq 4 \sigma^2 \norm{\varepsilon}_n^2 + 2 \pen(f_0),

2425:   \end{align}

2426:   which entails

2427:   \begin{equation*}

2428:     E_n\big[ \big( \norm{\bar f - f_0}_n^2 + \pen(\bar f) \big)^2

2429:     \big] \leq \sigma^4 C(\varepsilon)^2 + 8 h^4 |f_0|_{\mathcal F}^{2\alpha}

2430:   \end{equation*}

2431:   where $C(\varepsilon)^2 = 32( E[\varepsilon^4] / n + 2

2432:   (E[\varepsilon^2])^2)$. Putting all this together, we obtain, by a

2433:   decomposition of $E_n[\norm{\bar f - f_0}_n^2 + \pen(\bar f)]$ over

2434:   the union of the sets $\{ \norm{\bar f - f_0}_n\leq \delta \} \cap

2435:   \mathcal Z_1(z, \delta)$, $\mathcal Z_1(z, \delta)^\complement$ and

2436:   $\{\norm{\bar f - f_0}_n > \delta \}$ that

2437:   \begin{align*}

2438:     E_n[ \norm{\bar f - &f_0}_n^2 + \pen(\bar f)] \leq p(z_1)^2 h^2 \\

2439:     &+ (\sigma^2 C(\varepsilon) + 2\sqrt{2} h^2

2440:     |f_0|_{\mathcal F}^\alpha)\big(

2441:     P_n[ \mathcal Z_1(z, \delta)^\complement]^{1/2}+P_n[ \norm{\bar f - f_0}_n > \delta]^{1/2}\big).

2442:   \end{align*}

2443:   In view of~\eqref{eq:rough1}, if $\delta > 2 \pen(f_0)\vee1$ then we have

2444:   $\{ \norm{\bar f - f_0}_n^2 > \delta^2 \} \subset \{

2445:   \norm{\varepsilon}_n^2 > (\delta^2 - \delta) / (4 \sigma^2)\}$.

2446:   Thus, using the subgaussianity assumption~\eqref{eq:subgaussian}, we

2447:   have $P[ \norm{\bar f - f_0}_n > \delta ]^{1/2} \leq \exp( - (\delta^2

2448:   - \delta)^2 / (8 \sigma^2)) \leq ( \exp(-C_2 (\log n)^4)) =

2449:   o(h^2)$ if one chooses $\delta = \log n$. Now,

2450:   using~\eqref{eq:deviaB1} with this choice of $\delta$ and $z = (\log

2451:   n)^{1 + \beta/2}$ we have also $P_n[ \mathcal Z_1(z,

2452:   \delta)^\complement]^{1/2} \leq \exp( -C_3 (\log n)^2) =

2453:   o(h^2)$. This concludes the proof of the first upper bound of

2454:   Theorem~\ref{thm:least_sq}.

2455:

2456:   To prove the upper bound for the integrated norm $\norm{\cdot}$

2457:   instead of the empirical norm $\norm{\cdot}_n$, we decompose

2458:   $\norm{\bar f - f_0}^2 = A_1 + A_2$ where

2459:   \begin{equation*}

2460:     A_1 := \norm{\bar f - f_0}^2 - 8 ( \norm{\bar f - f_0}_n^2

2461:     + \pen(\bar f)) \text{ and } A_2 := 8 ( \norm{\bar f - f_0}_n^2 +

2462:     \pen(\bar f)).

2463:   \end{equation*}

2464:   The first part of Theorem~\ref{thm:least_sq} provides

2465:   \begin{equation*}

2466:     E^n[A_2] \leq C_1 ( 1 + |f_0|_{\mathcal F}^\alpha) n^{-2 / (2 +

2467:       \beta)}.

2468:   \end{equation*}

2469:   Recall that we assumed that $\norm{\bar f - f_0}_\infty \leq Q$

2470:   a.s. for the second part of the Theorem. To handle $A_1$, we use the

2471:   following Lemma.

2472:   \begin{lemma}

2473:     \label{lem:devia2}

2474:     Let $(\mathcal F, |\cdot|_{\mathcal F})$ and $h$ satisfy the same

2475:     assumptions as in Theorem~\ref{thm:least_sq}. Define $\mathcal F_Q

2476:     := \{ f \in \mathcal F : \norm{f - f_0}_\infty \leq Q \}$. We can

2477:     find constants $z_0, D_0 > 0$ such that for any $z \geq

2478:     z_0$\textup:

2479:     \begin{align*}

2480:       P_X^n \big[ \exists f \in \mathcal F_Q : \norm{f - f_0}^2 &- 8

2481:       (\norm{f - f_0}_n^2 + \pen(f)) \geq 10 z h^2 \big] \\

2482:       &\leq \exp \big( - D_0 n h^2 z \big),

2483:     \end{align*}

2484:     where $z_0$ and $D_0$ are constants depending on $a, \alpha,

2485:     \beta$ and $Q$.

2486:   \end{lemma}

2487:   The proof of Lemma~\ref{lem:devia2} is given in

2488:   Section~\ref{sec:lemmas_proofs}. Using together

2489:   Lemma~\ref{lem:devia2} and the fact that $A_1 \leq Q^2$ a.s., we

2490:   have by a decomposition over the union of $\{ A_1 \geq 10 z_0 h^2

2491:   \}$ and $\{ A_1 < 10 z_0 h^2 \}$:

2492:   \begin{equation*}

2493:     E^n [A_1] \leq 10 z_0 h^2 + o(h^2).

2494:   \end{equation*}

2495:   This concludes the proof of Theorem~\ref{thm:least_sq}.

2496: \end{proof}

2497: %%%% END OF PROOF %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

2498:

2499:

2500: \begin{figure}[htbp]

2501: %% \includegraphics[width=12cm]{designs.pdf}

2502:   \begin{tikzpicture}[scale=2]

2503:     \draw[thick] (0,0) node[anchor=south] {$f_0$} circle (1); %

2504:     \draw (0,0) -- (1,0) node[anchor=west] {$f_1$}; %

2505:     \draw (0,0) -- (40:1cm) node[anchor=south west]{$f_{M-1}$}; %

2506:     \draw (0,0) -- (150:1cm) node[anchor=south east] {$f_3$}; %

2507:     \draw (0,0) -- (190:1cm) node[anchor=east]{$f_{2}$}; %

2508:     \draw (0,0) -- (290:1cm) ; %

2509:     \draw[<->, very thick] (290:1cm) -- (290:0.6cm) node[anchor=east]

2510:     {$f_M$} node[pos=0.5, right] {$h$}; %

2511:     \draw[mark=x] (50:1cm) ;

2512:   \end{tikzpicture}

2513:   \caption{Example of a setup in which ERM performs badly. The set

2514:     $F(\Lambda) = \{f_1, \ldots, f_M \}$ is the dictionary from which

2515:     we want to mimic the best element and $f_0$ is the regression

2516:     function.}

2517:   \label{fig:badsetup}

2518: \end{figure}

2519:

2520: \begin{proof}[Proof of Theorem \ref{TheoWeaknessERMRegression}]

2521:   We consider a random variable $X$ uniformly distributed on $[0,1]$

2522:   and its dyadic representation:

2523:   \begin{equation}

2524:     \label{EquaDyadicRegression}

2525:     X = \sum_{k = 1}^{+\infty} X^{(k)} 2^{-k},

2526:   \end{equation}

2527:   where $(X^{(k)} : k \geq 1)$ is a sequence of i.i.d. random

2528:   variables following a Bernoulli $\cB(1/2,1)$ with parameter $1/2$.

2529:   The random variable $X$ is the design of the regression model worked

2530:   out here. For the regression function we take

2531:   \begin{equation}

2532:     \label{RegressionFunctionF0}

2533:     f_0(x) =

2534:     \begin{cases}

2535:       \; 2h &\text{ if } x^{(M)} = 1 \\

2536:       \; h & \text{ if } x^{(M)} = 0,

2537:     \end{cases}

2538:   \end{equation}

2539:   where $x$ has the dyadic decomposition $x=\sum_{k \geq 1}

2540:   x^{(k)}2^{-k}$ where $x^{(k)} \in \{ 0, 1 \}$ and

2541:   \begin{equation*}

2542:     h=\frac{C}{4}\sqrt{\frac{\log M}{n}}.

2543:   \end{equation*}

2544:   We consider the dictionary of functions $F_M = \{f_1, \ldots, f_M\}$

2545:   \begin{equation}

2546:     \label{FunctionBasisRegression}

2547:     f_j(x) = 2x^{(j)}-1, \quad \forall j\in\{1,\ldots,M\},

2548:   \end{equation}

2549:   where again $(x^{(j)} : j \geq 1)$ is the dyadic decomposition of $x

2550:   \in [0,1]$. The dictionary $F_M$ is chosen so that we have, for any

2551:   $j \in \{ 1, \ldots ,M-1 \}$

2552:   \begin{equation*}

2553:     \| f_j - f_0 \|_{L^2([0,1])}^2 = \frac{5 h^2}{2} + 1 \;\text{ and }\;

2554:     \|f_M - f_0 \|_{L^2([0,1])}^2 = \frac{5h^2}{2} - h + 1.

2555:   \end{equation*}

2556:   Thus, we have

2557:   \begin{equation*}

2558:     \min_{j=1,\ldots,M} \|f_j - f_0 \|_{L^2([0,1])}^2 = \|f_M - f_0

2559:     \|_{L^2([0,1])}^2 = \frac{5h^2}{2} -h + 1.

2560:   \end{equation*}

2561:   This geometrical setup for $F(\Lambda)$, which is an unfavourable

2562:   setup for the ERM, is represented in Figure~\ref{fig:badsetup}. For

2563:   \begin{equation*}

2564:     \hat{f}_n := \tilde{f}_n^{\rm PERM} \in \argmin_{f \in F_M}

2565:     \big(R_n(f) + \pen(f) \big),

2566:   \end{equation*}

2567:   where we take $R_n(f) = \frac{1}{n} \sum_{i=1}^n (Y_i-f(X_i))^2 =\|

2568:   Y - f \|^2_n$, we have

2569:   \begin{equation}

2570:     \label{InegGaussian}

2571:     E \|\hat{f}_n - f_0 \|_{L^2([0,1])}^2 =

2572:     \min_{j=1,\ldots,M} \|f_j - f_0 \|_{L^2([0,1])}^2 + h

2573:     P[\hat{f}_n\neq f_M].

2574:   \end{equation}

2575:   Now, we upper bound $P[ \hat{f}_n= f_M]$. If we define

2576:   \begin{equation*}

2577:     N_j := \frac{1}{\sqrt{n}} \sum_{i=1}^n\zeta_i^{(j)}

2578:     \varepsilon_i \text{ and } \zeta_i^{(j)} := 2X_i^{(j)}-1,

2579:   \end{equation*}

2580:   we have by the definition of $h$ and since $\zeta_i^{(j)} \in \{ -1,

2581:   1\}$:

2582:   \begin{align*}

2583:     \frac{\sqrt{n}}{2 \sigma} (\norm{Y - f_M}_n^2 &- \norm{Y -

2584:       f_j}_n^2) \\

2585:     & = N_j - N_M + \frac{h}{2 \sigma \sqrt{n}} \sum_{i=1}^n

2586:     (\zeta_i^{(j)} \zeta_i^{(M)} + 3(\zeta_i^{(j)} - \zeta_i^{(M)}) -

2587:     1) \\

2588:     &\geq N_j - N_M - \frac{4C}{\sigma} \sqrt{\log M}.

2589:   \end{align*}

2590:   This entails, for $\bar N_{M-1} := \max_{1 \leq j \leq M-1} N_j$,

2591:   that

2592:   \begin{align*}

2593:     P[ \hat{f}_n= f_M] &= P \Big[ \bigcap_{j=1}^{M-1} \Big\{ \norm{Y -

2594:       f_M}_n^2 - \norm{Y - f_j}_n^2 \leq \pen(f_j) - \pen(f_M) \Big\}

2595:     \Big] \\

2596:     &\leq P\Big[ N_M \geq \bar N_{M-1} - \frac{6C}{\sigma} \sqrt{\log

2597:       M} \Big].

2598:   \end{align*}

2599:   % \begin{eqnarray*}

2600:   %   \lefteqn{\mathbb{P}[\hat{f}_n= f_M]= \mathbb{P}[\forall

2601:   %     j=1,\ldots,M-1, A_n(f_M)+

2602:   %     {\rm{pen}}(f_M)\leq A_n(f_j)+{\rm{pen}}(f_j)]}\\

2603:   %   &=&\mathbb{P}[\forall j=1,\ldots,M-1,

2604:   %   \frac{1}{\sqrt{n}}\sum_{i=1}^n (Y_i-f_M(X_i))^2 \leq

2605:   %   \frac{1}{\sqrt{n}}\sum_{i=1}^n

2606:   %   (Y_i-f_j(X_i))^2\\

2607:   %   &&+\sqrt{n}({\rm{pen}}(f_j)-{\rm{pen}}(f_M))]\\

2608:   %   &\leq& \mathbb{P}[\forall j=1,\ldots,M-1, N_M\geq

2609:   %   N_j\\&&+\frac{1}{\sigma\sqrt{n}}\sum_{i=1}^n

2610:   %   \frac{h}{2}(\zeta_i^{(M)}\zeta_i^{(j)}-1)

2611:   %   +\frac{3h}{2}(\zeta_i^{(j)}-1)-\frac{C}{\sigma}\sqrt{\log M}],

2612:   % \end{eqnarray*}

2613:   % where for any

2614:   % $j=1,\ldots,M$,

2615:   It is easy to check that $N_1, \ldots, N_M$ are $M$ standard

2616:   Gaussian random variables that are uncorrelated (but dependent). We

2617:   denote by $\boldsymbol{\zeta}$ the family of Rademacher variables

2618:   $(\zeta_i^{(j)} : i=1,\ldots,n ; j=1,\ldots,M)$. We have for any

2619:   $6C/\sigma <\gamma< (2\sqrt{2}c^*)^{-1}$ ($c^*$ is the ``Sudakov

2620:   constant'', see Theorem~\ref{TheoSudakov}),

2621:   \begin{align}

2622:     \label{EquaSudakov}

2623:     P[\hat{f}_n = f_M] &\leq E \Big[ P\Big( N_M \geq \bar N_{M-1} -

2624:     \frac{6C}{\sigma}\sqrt{\log M} \Big| \boldsymbol{\zeta} \Big)

2625:     \Big] \nonumber \\

2626:     &\leq P \big[ N_M \geq - \gamma \sqrt{\log M}

2627:     +  E(\bar N_{M-1} | \boldsymbol{\zeta} ) \big] \\

2628:     &+ E \Big[ P\Big\{ E( \bar N_{M-1} | \boldsymbol{\zeta} ) - \bar

2629:     N_{M-1} \geq (\gamma - \frac{6C}{\sigma}) \sqrt{\log M} \Big|

2630:     \boldsymbol{\zeta} \Big\} \Big]. \nonumber

2631:   \end{align}

2632:   Conditionally to $\boldsymbol{\zeta}$, the vector

2633:   $(N_1,\ldots,N_{M-1})$ is a linear transform of the Gaussian vector

2634:   $(\varepsilon_1, \ldots, \varepsilon_n)$. Hence, conditionally to

2635:   $\boldsymbol{\zeta}$, $(N_1,\ldots,N_{M-1})$ is a gaussian

2636:   vector. Thus, we can use a standard deviation result for the

2637:   supremum of Gaussian random vectors (see for

2638:   instance~\cite{massart03}, Chapter~3.2.4), which leads to the

2639:   following inequality for the second term of the RHS

2640:   in~\eqref{EquaSudakov}:

2641:   \begin{align*}

2642:     % \label{EquaSecondTerm}

2643:     P \Big\{ E( \bar N_{M-1} | \boldsymbol{\zeta} ) - \bar N_{M-1}

2644:     \geq (\gamma &- \frac{6C}{\sigma}) \sqrt{\log M} \Big|

2645:     \boldsymbol{\zeta}

2646:     \Big\} \\

2647:     &\leq \exp(-(3C/\sigma-\gamma/2)^2\log M).

2648:     % \mathbb{P}[\mathbb{E} [ \max_{j=1,\ldots,M-1}N_j |

2649:     % \boldsymbol{\zeta}]&-\max_{j=1,\ldots,M-1}N_j\geq

2650:     % (\gamma-2C/\sigma)\sqrt{\log

2651:     %   M}|\boldsymbol{\zeta}]\nonumber\\

2652:     % &\leq \exp(-(C/\sigma-\gamma/2)^2\log M).

2653:   \end{align*}

2654:   Remark that we used $E[ N_j^2 | \boldsymbol{\zeta}] = 1$ for any $j

2655:   = 1, \ldots, M-1$. For the first term in the RHS

2656:   of~\eqref{EquaSudakov}, we have

2657:   \begin{align}

2658:     \label{EquaIerTermSudakov}

2659:     P &\Big [N_M \geq - \gamma \sqrt{\log M}

2660:     + E( \bar N_{M-1} | \boldsymbol{\zeta} ) \Big] \nonumber\\

2661:     &\leq P \Big[N_M \geq - 2 \gamma \sqrt{\log M}

2662:     + E(\bar N_{M-1}) \Big] \\

2663:     &+P \Big[ - \gamma\sqrt{\log M} + E(\bar N_{M-1}) \geq E(\bar

2664:     N_{M-1} | \boldsymbol{\zeta}) \Big]. \nonumber

2665:   \end{align}

2666:   Next, we use Sudakov's Theorem (cf. Theorem \ref{TheoSudakov} in

2667:   Appendix~\ref{sec:appendix_proba}) to lower bound $E( \bar

2668:   N_{M-1})$. Since $(N_1,\ldots,N_{M-1})$ is, conditionally to

2669:   $\boldsymbol{\zeta}$, a Gaussian vector and since for any $1 \leq j

2670:   \neq k \leq M$ we have

2671:   \begin{equation*}

2672:     E[(N_k-N_j)^2 | \boldsymbol{\zeta}] = \frac{1}{n}

2673:     \sum_{i=1}^n (\zeta_i^{(k)} - \zeta_i^{(j)})^2

2674:   \end{equation*}

2675:   then, according to Sudakov's minoration

2676:   (cf. Theorem~\ref{TheoSudakov} in the Appendix), there exists an

2677:   absolute constant $c^* > 0$ such that

2678:   \begin{equation*}

2679:     %\label{EquaSudakPrimal}

2680:     c^* E[\bar N_{M-1} | \boldsymbol{\zeta}] \geq

2681:     \min_{1 \leq j \neq k \leq M-1} \Big(\frac{1}{n}\sum_{i=1}^n

2682:     (\zeta_i^{(k)} - \zeta_i^{(j)})^2\Big)^{1/2} \sqrt{\log M}.

2683:   \end{equation*}

2684:   Thus, we have

2685:   \begin{align*}

2686:     \label{EquaSudak3}

2687:     c^* E[\bar N_{M-1}] &\geq E\Big[ \min_{j \neq k} \Big(\frac{1}{n}

2688:     \sum_{i=1}^n (\zeta_i^{(k)} - \zeta_i^{(j)})^2

2689:     \Big)^{1/2} \Big] \sqrt{\log M} \\

2690:     &\geq \sqrt{2} \Big(1 - E\Big[ \max_{j\neq k} \frac{1}{n}

2691:     \sum_{i=1}^n \zeta_i^{(k)} \zeta_i^{(j)} \Big] \Big) \sqrt{\log M},

2692:   \end{align*}

2693:   where we used the fact that $\sqrt{x} \geq x/\sqrt{2}, \forall x \in

2694:   [0,2]$.

2695:   % \begin{equation}

2696:   %   \label{equaSudak2}

2697:   %   E\Big[\min_{k \neq j \in \{1, \ldots, M-1\} } \Big( \frac{1}{n}

2698:   %   \sum_{i=1}^n (\zeta_i^{(k)} - \zeta_i^{(j)})^2 \Big)^{1/2} \Big]

2699:   %   \geq \sqrt{2} \Big(1 - E\Big[ \max_{j\neq k} \frac{1}{n} \sum_{i=1}^n

2700:   %   \zeta_i^{(k)} \zeta_i^{(j)} \Big] \Big)

2701:   % \end{equation}

2702:   Besides, using Hoeffding's inequality we have $E[\exp(s

2703:   \xi^{(j,k)})] \leq \exp(s^2/(2n))$ for any $s > 0$, where

2704:   $\xi^{(j,k)} := n^{-1} \sum_{i=1}^n \zeta_i^{(k)} \zeta_i^{(j)}$.

2705:   Then, using a maximal inequality (cf.  Theorem~\ref{TheoMaxConcIneq}

2706:   in Appendix~\ref{sec:appendix_proba}) and since $n^{-1}

2707:   \log[(M-1)(M-2)] \leq 1/4$, we have

2708:   \begin{equation}

2709:     \label{EquaSudakFinal}

2710:     E\Big[\max_{j\neq k} \frac{1}{n} \sum_{i=1}^n

2711:     \zeta_i^{(k)} \zeta_i^{(j)} \Big] \leq

2712:     \Big(\frac{1}{n} \log[(M-1)(M-2)] \Big)^{1/2} \leq

2713:     \frac{1}{2}.

2714:   \end{equation}

2715:   This entails

2716:   \begin{equation*}

2717:     c^* E[ \bar N_{M-1} ] \geq \Big(\frac{\log M}{2} \Big)^{1/2}.

2718:   \end{equation*}

2719:   Thus, using this inequality in the first RHS

2720:   of~\eqref{EquaIerTermSudakov} and the usual inequality on the tail

2721:   of a Gaussian random variable ($N_M$ is standard Gaussian), we

2722:   obtain:

2723:   \begin{align}

2724:     \label{EquaFirstTerm}

2725:     P\Big[N_M \geq &-2\gamma \sqrt{\log M} + E(\bar N_{M-1}) \Big]

2726:     \leq P\Big[ N_M \geq ((c^*\sqrt{2})^{-1}-2\gamma)

2727:     \sqrt{\log M}\Big]\nonumber\\

2728:     &\leq \mathbb{P}\Big[N_M \geq ((c^*\sqrt{2})^{-1}-2\gamma)

2729:     \sqrt{\log

2730:       M}\Big]\\

2731:     &\leq \exp\Big(-((c^*\sqrt{2})^{-1}-2\gamma)^2(\log

2732:     M)/2\Big).\nonumber

2733:   \end{align}

2734:   Remark that we used $2\sqrt{2}c^* \gamma < 1$. For the second term

2735:   in (\ref{EquaIerTermSudakov}), we apply the concentration inequality

2736:   of Theorem \ref{TheoEinmahlMasson} to the non-negative random

2737:   variable $E[\bar N_{M-1}|\boldsymbol{\zeta}]$. We first have to

2738:   control the second moment of this variable. We know that,

2739:   conditionally to $\boldsymbol{\zeta}$,

2740:   $N_j|\boldsymbol{\zeta}\sim\cN(0,1)$ thus,

2741:   $N_j|\boldsymbol{\zeta}\in L_{\psi_2}$ (for more details on Orlicz

2742:   norm, we refer the reader to~\cite{vdVW:96}). Thus,

2743:   \begin{equation*}

2744:     \norm{\max_{1\leq j\leq M-1} N_j|\boldsymbol{\zeta}}_{\psi_2}\leq K

2745:     \psi_2^{-1}(M)\max_{1\leq j\leq M-1}\norm{N_j|\boldsymbol{\zeta}}_{\psi_2}

2746:   \end{equation*}

2747:   (cf. Lemma 2.2.2 in \cite{vdVW:96}). Since

2748:   $\norm{N_j|\boldsymbol{\zeta}}_{\psi_2}^2=1$, we have $\norm{\max_{1\leq j\leq M-1}

2749:     N_j|\boldsymbol{\zeta}}_{\psi_2}\leq K \sqrt{\log M}$. In particular, we have

2750:   $E\big[\max_{1\leq j\leq M-1} N_j^2|\boldsymbol{\zeta}\big]\leq

2751:   K\log M$ and so $E\big(E[\bar

2752:   N_{M-1}|\boldsymbol{\zeta}]\big)^2\leq K\log M$. Theorem

2753:   \ref{TheoEinmahlMasson} provides

2754:   \begin{equation}

2755:     \label{SecondTermEquaSuda}

2756:     P\Big[ -\gamma\sqrt{\log

2757:       M}+E[\bar N_{M-1}]\geq E[\bar N_{M-1}|\boldsymbol{\zeta}]\Big]\leq

2758:     \exp(-\gamma^2/c_0),

2759:   \end{equation}

2760:   where $c_0$ is an absolute constant.

2761:

2762: Finally, combining (\ref{EquaSudakov}), (\ref{EquaFirstTerm}),

2763: (\ref{EquaIerTermSudakov}), (\ref{SecondTermEquaSuda}) in the initial

2764: inequality (\ref{EquaSudakov}), we obtain

2765: \begin{align*}

2766: P[\hat{f}_n= f_M] &\leq \exp(-(3C/\sigma-\gamma/2)^2\log M)\\

2767: &+

2768: \exp\Big(-((c^*\sqrt{2})^{-1}-2\gamma)^2(\log M)/2\Big)+

2769: \exp(-\gamma^2/c_0).

2770: \end{align*}

2771: Take $\gamma=(12\sqrt{2}c^*)^{-1}$. It is easy to find an integer $M_0(\sigma)$ depending only on $\sigma$ such that for any $M\geq M_0$, we have $P[\hat{f}_n= f_M]\leq c_1<1$, where $c_1$ is an absolute constant.

2772: We complete the proof by using this last result in

2773: (\ref{InegGaussian}).

2774: \end{proof}

2775: %%%% END OF PROOF %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%AGGREGATION

2776:

2777:

2778: \begin{proof}[Proof of Theorem~\ref{thm:oracle}]

2779:   We recall that we have a dictionary (set of functions) $F(\Lambda)$

2780:   of cardinality $M$ such that $\norm{f_\lambda - f_0}_\infty \leq Q$

2781:   for all $\lambda \in \Lambda$. Let us define the risk

2782:   \begin{equation*}

2783:     R(f) := E[(Y - f(X))^2]

2784:   \end{equation*}

2785:   and the linearized risk over $F(\Lambda)$, given by

2786:   \begin{equation*}

2787:     \mathsf R(\theta) := \sum_{\lambda \in \Lambda} \theta_\lambda

2788:     R(f_\lambda)

2789:   \end{equation*}

2790:   for $\theta \in \Theta$, where we recall that

2791:   \begin{equation*}

2792:     \Theta := \{ \theta \in \mathbf R^{|\Lambda|} ; \theta_\lambda

2793:     \geq 0,\; \sum_{\lambda \in \Lambda} \theta_\lambda = 1 \}.

2794:   \end{equation*}

2795:   We denote by $R_{n}(f)$ the empirical risk of $f$ over the sample

2796:   $D_{n}$, which is given by

2797:   \begin{equation*}

2798:     R_{n}(f) := \frac{1}{n} \sum_{i=1}^n (Y_i - f(X_i))^2,

2799:   \end{equation*}

2800:   and we define similarly the linearized empirical risk

2801:   \begin{equation*}

2802:     \mathsf R_{n}(\theta) := \sum_{\lambda \in \Lambda}

2803:     \theta_\lambda R_{n}(f_\lambda).

2804:   \end{equation*}

2805:   The excess risk of a function $f$ is given by $R(f) - R(f_0) =

2806:   \norm{f - f_0}^2$. By convexity of the risk, the aggregate $\hat

2807:   {\mathsf f}= \sum_{\lambda \in \Lambda} \hat \theta_\lambda

2808:   f_\lambda$ defined in (\ref{eq:aggregate}), satisfies, for any $a >

2809:   0$,

2810:   \begin{align*}

2811:     R(\hat {\mathsf f}) - R(f_0) &\leq \mathsf R(\hat \theta) - R(f_0) \\

2812:     &\leq (1 + a) (\mathsf R_{n}(\hat \theta) - R_{n}(f_0)) \\

2813:     &+ \mathsf R(\hat \theta) - R(f_0) - (1 + a) (\mathsf R_{n}(\hat

2814:     \theta) - R_{n}(f_0)),

2815:   \end{align*}

2816:   where it is easy to see that the Gibbs weights $\hat \theta = (\hat

2817:   \theta_\lambda)_{\lambda \in \Lambda} = (\hat

2818:   \theta(f_\lambda))_{\lambda \in \Lambda}$ are the unique solution to

2819:   the minimization problem

2820:   \begin{equation*}

2821:     \min_{\theta \in \Theta} \Big\{ \mathsf R_{n}(\theta) +

2822:     \frac{T}{ n} \sum_{\lambda \in \Lambda} \theta_\lambda \log

2823:     \theta_\lambda \Big\},

2824:   \end{equation*}

2825:   where $T$ is the temperature parameter, see~\eqref{eq:weights}, and

2826:   where we use the convention $0 \log 0 = 0$. Let $\hat \lambda$ be

2827:   such that $f_{\hat \lambda}$ is the ERM in $F(\Lambda)$, namely

2828:   \begin{equation*}

2829:     R_{n}(f_{\hat \lambda}) := \min_{\lambda \in \Lambda}

2830:     R_{n}(f_\lambda).

2831:   \end{equation*}

2832:   Since

2833:   \begin{equation*}

2834:     \sum_{\lambda \in \Lambda} \hat \theta_\lambda \log \Big( \frac{\hat

2835:       \theta_\lambda}{1 / |\Lambda|} \Big) = K(\hat \theta | u) \geq 0

2836:   \end{equation*}

2837:   where $K(\hat \theta | u)$ denotes the Kullback-Leibler divergence

2838:   between the weights $\hat \theta$ and the uniform weights $u := (1 /

2839:   |\Lambda|)_{\lambda \in \Lambda}$, we have

2840:   \begin{align*}

2841:     \mathsf R_{n}(\hat \theta) &\leq \mathsf R_{n}(\hat \theta) +

2842:     \frac{T}{ n} K(\hat \theta | u) \\

2843:     &= \mathsf R_{n}(\hat \theta) + \frac{T}{n} \sum_{\lambda \in \Lambda} \hat

2844:     \theta_\lambda \log \hat \theta_\lambda + \frac{T\log |\Lambda|}{

2845:       n} \\

2846:     &\leq \mathsf R_{n}(e_{\hat \lambda}) + \frac{T\log |\Lambda|}{

2847:       n} = R_{n}(f_{\hat \lambda}) + \frac{T\log |\Lambda|}{n},

2848:   \end{align*}

2849:   where $e_\lambda \in \Theta$ is the vector with $1$ for the

2850:   $\lambda$-th coordinate and $0$ elsewhere. This gives

2851:   \begin{align*}

2852:     R(\hat {\mathsf f}) - R(f_0) &\leq (1 + a) \min_{\lambda \in \Lambda}

2853:     (R_{n}(f_\lambda) - R_{n}(f_0))+ (1 + a)

2854:     \frac{T\log |\Lambda|}{ n} \\

2855:     &+ \mathsf R(\hat \theta) - R(f_0) - (1 + a) (\mathsf R_{n}(\hat

2856:     \theta) - R_{n}(f_0)),

2857:   \end{align*}

2858:   and consequently

2859:   \begin{align*}

2860:     E \norm{\hat {\mathsf f} - f_0}^2 &\leq (1 + a) \min_{\lambda \in

2861:       \Lambda} \norm{f_\lambda - f_0}^2 + (1 + a) \frac{T\log

2862:       |\Lambda|}{n} \\

2863:     &+ E[ \mathsf R(\hat \theta) - R(f_0) - (1 + a) (\mathsf

2864:     R_{n}(\hat \theta) - R_{n}(f_0)) ].

2865:   \end{align*}

2866:   Since $\mathsf R(\cdot)$ and $\mathsf R_{n}$ are linear on

2867:   $\Theta$, we have

2868:   \begin{align*}

2869:     \mathsf R(\hat \theta) - R(f_0) &- (1 + a) (\mathsf R_{n}(\hat

2870:     \theta) - R_{n}(f_0)) \\

2871:     &\leq \max_{f \in F(\Lambda)} ( R(f) - R(f_0) - (1 + a)

2872:     (R_{n}(f) - R_{n}(f_0)) ).

2873:   \end{align*}

2874:   Thus, we have

2875:   \begin{equation}\label{eq:Main0}

2876:     E \norm{\hat {\mathsf f} - f_0}^2 \leq (1 + a)

2877:     \min_{\lambda \in \Lambda} \norm{f_\lambda - f_0}^2 + (1 + a)

2878:     \frac{T \log |\Lambda|}{n} + \mathcal R_n,

2879:   \end{equation}

2880:   where $\mathcal R_n := E [ \max_{f \in F(\Lambda)} \{ R(f) - R(f_0)

2881:   - (1 + a) (R_{n}(f) - R_{n}(f_0)) \} ] $. Now, we upper bound

2882:   $\mathcal R_n$. Introduce the random variables

2883:   \begin{align*}

2884:     \tilde{Z}_i(f) &:= (f(X_i) - f_0(X_i))^2 + 2 \sigma \varepsilon_i

2885:     I( |\varepsilon_i| \leq K) (f_0(X_i) - f(X_i)), \\

2886:     \bar Z_i(f) &:= 2 \sigma \varepsilon_i I(|\varepsilon_i| > K)

2887:     (f_0(X_i) - f(X_i)),

2888:   \end{align*}

2889:   and the two following processes indexed by $f \in F(\Lambda)$:

2890:   \begin{equation*}

2891:     \tilde{\zeta}(f) := \frac{1}{n}\sum_{i=1}^n \Big(

2892:     E[\tilde{Z}_i(f)] - (1+a) \tilde{Z}_i(f) \Big) \text{ and }

2893:     \bar{\zeta}(f) := \frac{1+a}{n} \sum_{i=1}^n\bar{Z}_i(f).

2894:   \end{equation*}

2895:   We use the symmetry of $\varepsilon$ to get

2896:   \begin{equation*}

2897:     \mathcal R_n \leq E \Big[ \max_{f \in F(\Lambda)}

2898:     \tilde{\zeta}(f) \Big] + E \Big[ \max_{f \in F(\Lambda)}

2899:     \bar{\zeta}(f) \Big].

2900:   \end{equation*}

2901:   First, we upper bound $E[ \max_{f \in F(\Lambda)}

2902:   \tilde{\zeta}(f)]$. The random variable $\tilde{\zeta}(f)$ is

2903:   bounded and satisfies the following Bernstein's type condition

2904:   (see~\cite{BM:06}): $\forall f \in F(\Lambda), E [

2905:   \tilde{\zeta}(f)^2] \leq (Q^2 + 4 \sigma^2) E[\tilde{\zeta}(f)]$. We

2906:   apply the union bound and the Bernstein's inequality

2907:   (cf. \cite{vdVW:96}) to get, for any $\delta>0$,

2908:   \begin{align*}

2909:     P \Big[\max_{f\in F(\Lambda)} \tilde{\zeta}(f) \geq \delta \Big]

2910:     &\leq \sum_{f\in F(\Lambda)} P\Big[ \frac{1}{n}\sum_{i=1}^n

2911:     E[\tilde{Z}_i(f)] - \tilde{Z}_i(f) \geq

2912:     \frac{\delta + a E[\tilde{Z}_i(f)] }{1+a} \Big] \\

2913:     &\leq M \exp(-C n \delta),

2914:   \end{align*}

2915:   where $C := a [8 (Q^2 + \sigma^2) (1 + a)^2 + (4Q / 3)(1 + a)(Q +

2916:   2K)]^{-1}$. Hence, a direct computation gives

2917:   \begin{equation}

2918:     \label{eq:I1}

2919:     E\Big[ \max_{f\in F(\Lambda)} \tilde{\zeta}(f) \Big] \leq

2920:     \frac{4 \log M}{C n}.

2921:   \end{equation}

2922:   Now, we upper bound $E [\max_{f\in F(\Lambda)}\bar{\zeta}(f) ]$. We

2923:   have

2924:   \begin{align}

2925:     \label{eq:I2}

2926:     \nonumber E \Big[ \max_{f\in F(\Lambda)} \bar{\zeta}(f) \Big]

2927:     &\leq 4 Q (1 + a) E \big[ |\varepsilon| I(|\varepsilon| > K) \big] \\

2928:     &\leq 4 Q (1 + a) \sigma P (|\varepsilon|>K)^{1/2} \\

2929:     &\leq 4Q(1+a) \sigma \exp(-K^2 / (2 b_\varepsilon^2)).

2930:   \end{align}

2931:   Finally, combining equations \eqref{eq:Main0},~\eqref{eq:I1}

2932:   and~\eqref{eq:I2} with $K = b_\varepsilon \sqrt{2 \log n}$,

2933:   concludes the proof of Theorem~\ref{thm:oracle}.

2934: \end{proof}

2935:

2936: %%%% END OF PROOF %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%AGGREGATION

2937: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

2938:

2939:

2940:

2941: % \begin{proof}[Proof of Theorem~\ref{thm:adaptive_anisotropic}]

2942:

2943: %   Let $f_0 \in B_{p, q}^{\bs s}$ for $\bs s \in \bs S$. Consider

2944: %   $\bs s_n \in \bs S_n$ such that $\bs s_n \leq \bs s \leq \bs s_n +

2945: %   u (\log n)^{-1}$ coordinatewise, where $u = (1, \ldots, 1)$. In

2946: %   view of embedding~\eqref{eq:anisotropic_embedding}, we have $B_{p,

2947: %     q}^{\bs s} \subset B_{p, q}^{\bs s_n}$ and if $r_n(\bs s) =

2948: %   n^{-g(\bs s)}$ where

2949: %   \begin{equation*}

2950: %     g(\bs s) = g(s_1, \ldots, s_d) = \Big(2 + \sum_{i=1}^d

2951: %     \frac{d}{s_i} \Big)^{-1},

2952: %   \end{equation*}

2953: %   it is easy to see that

2954: %   \begin{equation*}

2955: %     r_n(\bs s) \leq r_n(\bs s_n) \leq \exp(d^2) r_n(\bs s).

2956: %   \end{equation*}

2957: %   The proof is then a direct consequence of the oracle inequality from

2958: %   Theorem~\ref{thm:oracle} and the upper bound for PERM from

2959: %   Theorem~\ref{thm:least_sq}. \texttt{rajouter quelques details...}

2960: % \end{proof}

2961:

2962:

2963: \section{Proofs of the lemmas}

2964: \label{sec:lemmas_proofs}

2965:

2966:

2967: \begin{proof}[Proof of Lemma~\ref{lem:logtrick}]

2968:   Since $\beta \in (0, 2)$ we have $\alpha > 2 \beta / (\beta + 2) >

2969:   \beta/2$. Thus, inequality~\eqref{eq:logtrick} gives

2970:   \begin{align*}

2971:     \log(r^2 + h^2 I^\alpha) &\leq \log(\varepsilon) + (1 -

2972:     \frac{\beta}{2}) \log(r) - (1 - \frac{\beta}{2\alpha}) \log(r^2)

2973:     \\

2974:     & - \frac{\beta}{\alpha} \log(h) + (1 - \frac{\beta}{2\alpha})

2975:     \log(r^2) + \frac{\beta}{2\alpha} \log(h^2 I^\alpha) \\

2976:     &\leq \log(\varepsilon) + (\frac{\beta}{\alpha} - 1 -

2977:     \frac{\beta}{2}) \log(r) - \frac{\beta}{\alpha} \log(h) + \log(

2978:     r^2 + h^2 I^\alpha)

2979:   \end{align*}

2980:   and consequently

2981:   \begin{equation*}

2982:     r^{1 + \beta / 2 - \beta/\alpha} \leq \varepsilon h^{-\beta/\alpha}

2983:   \end{equation*}

2984:   which entails $r \leq ( \varepsilon^{\alpha} h^{-\beta} )^{2 / (2\alpha

2985:     + \alpha \beta - 2 \beta)}$. Now, using this inequality together

2986:   with $h^2 I^\alpha \leq \varepsilon\, r^{1 - \beta / 2} I^{\beta / 2}$

2987:   provides the upper bound for $I$. The last inequality easily follows.

2988: \end{proof}

2989:

2990:

2991: \begin{proof}[Proof of Lemma~\ref{lem:devia2}]

2992:   [The proof consists of a \emph{peeling} of $\mathcal F$ into

2993:   subspaces with complexity controlled by Assumption~$(C_\beta)$ and

2994:   the use of Bernstein's inequality.] Let us denote for short

2995:   $\mathcal F$ instead of $\mathcal F_Q$. Since $\bar f \in \mathcal

2996:   F$, we have

2997:   \begin{align*}

2998:     P \big[ \norm{\bar f &- f_0}^2 - 8 (\norm{\bar f - f_0}_n^2 +

2999:     \pen(\bar f)) \geq 10 z h^2 \big] \\

3000:     &\leq P \big[ \exists f \in \mathcal F : \norm{f - f_0}^2 - 8

3001:     ( \norm{f - f_0}_n^2 + \pen(f) ) \geq 10 z h^2 \big] \\

3002:     &\leq P[A_1] + \sum_{k \geq 2} P[A_k],

3003:   \end{align*}

3004:   where

3005:   \begin{align*}

3006:     A_1 := \big\{ \exists f &\in \mathcal F,\;\pen(f) \leq 2^{\alpha /

3007:       \beta} h^2 : \\

3008:     &\norm{f - f_0}^2 - 8 (\norm{f - f_0}_n^2 + \pen(f) ) \geq 10 z

3009:     h^2 \big\}

3010:   \end{align*}

3011:   and for $k \geq 2$,

3012:   \begin{align*}

3013:     A_k := \big\{ \exists f \in \mathcal F,\; &2^{\alpha (k-1) /

3014:       \beta} h^2 < \pen(f) \leq 2^{\alpha k / \beta} h^2 : \\

3015:     &\norm{f - f_0}^2 - 8 (\norm{f - f_0}_n^2 + \pen(f) ) \geq 10 z

3016:     h^2 \big\}.

3017:   \end{align*}

3018:   Hence, since $z \geq z_0 \geq 1$ and $\alpha / \beta > 2 / (\beta +

3019:   2) > 1/2$ (because $\beta < 2$), we have $P[A_k] \leq P_k$ for any $k

3020:   \geq 1$, where

3021:   \begin{align*}

3022:     P_k := P \big[ \exists f \in \mathcal F,\; &\pen(f) \leq 2^{\alpha

3023:       k / \beta} h^2 : \\

3024:     &\norm{f - f_0}^2 - 8 \norm{f - f_0}_n^2 \geq 2 z h^2 + 4

3025:     2^{\alpha k / \beta} h^2 \big].

3026:   \end{align*}

3027:   Now, let $F(\delta, k)$ be a minimal $\delta$-covering for the norm

3028:   $\norm{\cdot}_\infty$ of the set

3029:   \begin{equation*}

3030:     \{ f \in \mathcal F : \pen(f) \leq 2^{\alpha k / \beta} h^2 \} =

3031:     \{ f \in \mathcal F : |f|_{\mathcal F} \leq 2^{k /\beta} \},

3032:   \end{equation*}

3033:   where we recall that $\pen(f) = h^2 |f|_{\mathcal

3034:     F}^\alpha$. Assumption~$(C_\beta)$ entails

3035:   \begin{equation}

3036:     \label{eq:covering1}

3037:     | F(\delta, k) | \leq \exp ( D 2^{k} \delta^{-\beta} ).

3038:   \end{equation}

3039:   Since for any $f_1, f_2 \in \mathcal F$ such that $\norm{f_1 -

3040:     f_2}_\infty \leq \delta$, we have

3041:   \begin{equation*}

3042:     \norm{f_1 - f_0}^2 \leq 2\norm{f_2 - f_0}^2 + 2 \delta^2 \quad

3043:     \text{ and } \quad 2

3044:     \norm{f_1 - f_0}_n^2 \geq 2\norm{f_2 - f_0}_n^2 - 2 \delta^2,

3045:   \end{equation*}

3046:   we obtain

3047:   \begin{align*}

3048:     P_k &\leq P \big[ \exists f \in F(\delta, k) : 2 \norm{f - f_0}^2

3049:     - 4 \norm{f - f_0}_n^2 + 6 \delta^2 \geq 2 z h^2 + 4 2^{\alpha k /

3050:       \beta} h^2 \big] \\

3051:     &\leq \sum_{f\in F(\delta, k)} P \big[ \norm{f - f_0}^2 - \norm{f -

3052:       f_0}_n^2 \geq t_k(z) \big],

3053:   \end{align*}

3054:   where $t_k(z) := z h^2 / 2 + 2^{\alpha k / \beta} h^2 - 3 \delta^2 /

3055:   2 + \norm{f - f_0}^2 / 2$. Let $f \in F(\delta, k)$ be fixed. We

3056:   introduce the random variables $U_i := (f(X_i) - f_0(X_i))^2$, so

3057:   that $\norm{f - f_0}_n^2 = \sum_{i=1}^n U_i / n$ and $E[U_1] =

3058:   \norm{f - f_0}^2$. Note that the $U_i$ are independent, such that $0

3059:   \leq U_i \leq Q^2$, and $\var [U_1] \leq E [U_1^2] \leq Q^2 E [U_1]

3060:   \leq Q^2 \norm{f - f_0}^2$. Hence, if $t_k(z) \geq \norm{f - f_0}^2

3061:   / 2$, Bernstein's inequality entails

3062:   \begin{align*}

3063:     P \big[ \norm{f - f_0}^2 &- \norm{f - f_0}_n^2 \geq t_k(z) \big]

3064:     = P \Big[ \sum_{i=1}^n (U_i - E [U_1]) \geq n t_k(z) \Big] \\

3065:     &\leq \exp \Big( \frac{-n t_k(z)^2}{2( Q^2 \norm{f -

3066:         f_0}^2 + Q^2 t_k(z) / 3)} \Big) \\

3067:     &\leq \exp \Big( \frac{-3 n ( z h^2 + 2^{\alpha k / \beta +1} h^2

3068:       - 3 \delta^2 )}{28 Q^2} \Big).

3069:   \end{align*}

3070:   By taking $\delta := (2^{\alpha k / \beta} h^2 / 3)^{1/2}$, we have

3071:   $t_k(z) \geq \norm{f - f_0}^2 / 2$ and \eqref{eq:covering1} becomes

3072:   \begin{equation*}

3073:     | F(\delta, k) | \leq \exp \Big( D_1 n h^2 2^{k(1 - \alpha / 2)}

3074:     \Big),

3075:   \end{equation*}

3076:   where we used~\eqref{eq:bandwidth} and took $D_1 := D 3^{\beta / 2}

3077:   / a^{\beta + 2}$. Hence, for $D_2 := 3 / (28 Q^2)$, we have

3078:   \begin{equation*}

3079:     P_k \leq \exp\Big( D_1 n h^2 2^{k (1 - \alpha / 2)} - D_2 n h^2 (z +

3080:     2^{\alpha k / \beta}) \Big).

3081:   \end{equation*}

3082:   Now, we choose

3083:   \begin{equation*}

3084:     K := \Big[ \frac{\log (\min(D_2 / D_1, 1) / 2)}{(1 - \alpha / 2 - \alpha

3085:       / \beta) \log 2} \Big] + 1,

3086:   \end{equation*}

3087:   where $[x]$ is the integer part of $x$, and where we recall that

3088:   $\alpha > 2 \beta / (\beta + 2)$, so that $1 - \alpha / 2 - \alpha /

3089:   \beta < 0$. The conclusion of the proof follows easily by the

3090:   decomposition $\sum_{k \geq 1} P_k = \sum_{1 \leq k < K} P_k +

3091:   \sum_{k \geq K} P_k$, if $z \geq z_1$ for the choice $z_1 := 2 (

3092:   2^{K \alpha / \beta} - D_1 2^{K(1 - \alpha / 2)} / D_2)$.

3093: \end{proof}

3094:

3095: % \begin{align*}

3096: %   \exp( -D_Q n h^2 2^k) \exp\big( -n h^2 ( D_Q z - D 6^{1/(2s)}

3097: %   \alpha^{-(2+1/s)}) \big).

3098: % \end{align*}

3099: % \begin{equation*}

3100: %   P \big[ \norm{f^* - f}^2 - \norm{f^* - f}_n^2 \geq t_k \big] \leq

3101: % \end{equation*}

3102: % thus

3103: % \begin{equation*}

3104: %   P_k \leq \exp\big( D 6^{1/(2s)} h^{-1/s} -D_Q n h^2 ( z + 2^k )

3105: %   \big),

3106: % \end{equation*}

3107: % for any $k \geq 1$. But since $h \geq \alpha n^{-s / (2s + 1)}$, we

3108: % have

3109: % \begin{equation*}

3110: %   P_k \leq

3111: % \end{equation*}

3112: % This gives

3113: % \begin{equation*}

3114: %   \sum_{k \geq 1} P_k \leq \exp\big( -n h^2 (D_Q (z+1) - D 6^{1/(2s)}

3115: %   \alpha^{-(2+1/s)} ) \big),

3116: % \end{equation*}

3117: % which entails Lemma~\ref{lem:devia2} for $z_0$ given by

3118: % \begin{equation*}

3119: %   z_0 := \max\Big(0, \frac{D 6^{1/(2s)} \alpha^{-(2+1/s)} + 1}{D_Q} -

3120: %   1\Big). \qedhere

3121: % \end{equation*}

3122: % Let $K' \in \mathbb N$ be such that $D \alpha^{-1} (3 /

3123: % \alpha)^{1/(2s)} + 1 \leq D_Q 2^k$, and take $K := \max(4,

3124: % K')$. This choice entails

3125: % \begin{equation*}

3126: %   \sum_{k \geq K} P_k \leq \exp( -D_Q n h z) \sum_{k \geq K} \exp( -k

3127: %   n h ) \leq 2^{-1} \exp( -n h (D_Q z + K) ).

3128: % \end{equation*}

3129: % Now, for $k < K$, we have for any $z \geq z_0$, where

3130: % \begin{equation*}

3131: %   z_0  := \max(0, D_Q^{-1} (2^{K/2} D \alpha^{-1} (3 / \alpha)^{1 /

3132: %     (2s) } + 1) - 2),

3133: % \end{equation*}

3134: % that

3135: % \begin{equation*}

3136: %   \sum_{1 \leq k \leq K} P_k \leq K \exp(-n h),

3137: % \end{equation*}

3138: % hence which concludes the proof of Lemma~\ref{lem:devia2}.  \hfill

3139: % $\square$

3140:

3141:

3142: % \subsection*{}

3143:

3144:

3145:

3146:

3147:

3148: % \begin{lemma}

3149: %   \label{lem:spline_bounded}

3150: %   Let $P_X$ be such that $|\supp P_X| > s$ \textup(the support

3151: %   contains at least $s+1$ points.\textup) Let $f \in W_s$ be such that

3152: %   $\norm{f - f_0}_m \leq \delta$ for some $\delta > 0$ and some

3153: %   function $f_0$.\textup) Then, we can find positive constants $C_0,

3154: %   C, D$ such that

3155: %   \begin{equation*}

3156: %     P\big[ \norm{f}_\infty > C_0( \delta + \norm{f_0}_m + J(f)) \big]

3157: %     \geq C \exp( -D n ).

3158: %   \end{equation*}

3159: % \end{lemma}

3160:

3161:

3162: % \begin{proof}[Proof of Lemma~\ref{lem:spline_bounded}]

3163: %   Since $f \in W_s$, we can write using the Sobolev-embedding theorem

3164: %   that

3165: %   \begin{equation*}

3166: %     f = f_1 + f_2

3167: %   \end{equation*}

3168: %   where $f_1 = \sum_{|\alpha| < s} b_\alpha x^{\alpha}$ and $f_2$ is

3169: %   such that $\norm{f_2}_\infty \leq J(f_2) = J(f)$. Moreover, we have

3170: %   \begin{equation*}

3171: %     \norm{f_1}_\infty \leq \norm{b}_\infty \leq C(s) (b^{\top} b )^{1/2},

3172: %   \end{equation*}

3173: %   where $b = (b_\alpha)_{|\alpha| < s}$. For $p = (p_1, \ldots, p_d)$

3174: %   and $q = (q_1, \ldots, q_d)$ such that $|p| < s$ and $|q| < s$, let

3175: %   us introduce the matrices $A_m$ and $A$ with entries

3176: %   \begin{equation*}

3177: %     (A_m)_{p,q} = \int x^{p+q} P_X^m(dx), \quad (A)_{p,q} = \int

3178: %     x^{p+q} P_X(dx).

3179: %   \end{equation*}

3180: %   The matrix $A$ is positive definite. Indeed, otherwise, we can find

3181: %   a vector $b$ such that

3182: %   \begin{equation*}

3183: %     0 = b^{\top} A b = E \Big[ \Big( \sum_{|\alpha| < s} b_\alpha

3184: %     X^{\alpha} \Big)^2 \Big],

3185: %   \end{equation*}

3186: %   which entails that the polynomial $\sum_{|\alpha| < s} b_\alpha

3187: %   x^{\alpha}$ is zero for almost every $x \in \supp P_X$, which is not

3188: %   possible since we assumed that $|\supp(P_X)| > s$. Then, let us

3189: %   denote by $\lambda(A) > 0$ the smallest eigenvalue of $A$. On the

3190: %   event $\{ \norm{A_m - A}_\infty \leq \lambda(A)/2 \}$, we have

3191: %   \begin{equation*}

3192: %     b^{\top} b \leq \lambda(A)^{-1} (b^{\top} A_m  b + b^{\top} b

3193: %     \lambda(A) / 2 ),

3194: %   \end{equation*}

3195: %   which entails

3196: %   \begin{equation*}

3197: %     b^{\top} b \leq 2 \lambda(A)^{-1} b^{\top} A_m b = 2

3198: %     \lambda(A)^{-1} \norm{f_1}_m^2.

3199: %   \end{equation*}

3200: %   Now, since $\norm{f - f_0}_m \leq \delta$, we have

3201: %   \begin{equation*}

3202: %     \norm{f_1}_m \leq \norm{f}_m + \norm{f_2}_m \leq \delta +

3203: %     \norm{f_0}_m + J(f),

3204: %   \end{equation*}

3205: %   and putting all this together, this gives that on $\{ \norm{A_m -

3206: %     A}_\infty \leq \lambda(A)/2 \}$:

3207: %   \begin{equation*}

3208: %     \norm{f}_\infty \leq C_0 (\delta + \norm{f_0}_m + J(\bar f)),

3209: %   \end{equation*}

3210: %   where $C_0 := (2 C(s) \lambda(A)^{-1} )^{1/2}$. By Hoeffding's

3211: %   inequality, we have

3212: %   \begin{equation*}

3213: %     P[ \norm{A_m - A}_\infty > \lambda(A)/2 ] \leq C(s)^2 \exp( -D n)

3214: %   \end{equation*}

3215: %   with $D := \lambda(A)^2 / (8 M_X^2)$, where $M_X$ is the radius of

3216: %   the support of $P_X$. This concludes the proof of the Lemma.

3217: % \end{proof}

3218:

3219:

3220: \appendix

3221:

3222: \section{Function spaces}

3223: \label{sec:appendix_approximation}

3224:

3225: In this section we give precise definitions of the spaces of functions

3226: considered in the paper, and give useful related results. The

3227: definitions and results presented here can be found

3228: in~\cite{triebel06}, in particular in Chapter~5 which is about

3229: anisotropic spaces, anisotropic multiresolutions, and entropy numbers

3230: of the embeddings of such spaces (see Section~5.3.3) that we use in

3231: particular to derive condition $(C_\beta)$, for the anisotropic Besov

3232: space, see Section~\ref{sec:pena_least_squares}.

3233:

3234: % If $\bs k

3235: % = (k_1, \ldots, k_d)$ with $k_i \geq 0$ we define the \emph{iterated

3236: %   difference} by

3237: % \begin{equation*}

3238: %   \Delta_h^{\bs k} f(x) = \Delta_{h_1 e_1}^{k_1} \circ \cdots \circ

3239: %   \Delta_{h_d e_d}^{k_d} f(x)

3240: % \end{equation*}

3241:

3242: \subsection{Anisotropic Besov space}

3243:

3244: Let $\{ e_1, \ldots, e_d \}$ be the canonical basis of $\mathbb R^d$

3245: and $\bs s = (s_1, \ldots, s_d)$ with $s_i > 0$ be a vector of

3246: directional smoothness, where $s_i$ corresponds to the smoothness in

3247: direction $e_i$. Let us fix $1 \leq p, q \leq \infty$. If $f$ is a

3248: function in $\mathbb R^d$, we define $\Delta_h^k f$ as the

3249: \emph{difference} of order $k \geq 1$ and step $h \in \mathbb R^d$,

3250: given by $\Delta_h^1 f(x) = f(x + h) - f(x)$ and $\Delta_h^k f(x) =

3251: \Delta_h^1(\Delta_h^{k-1}f)(x)$ for any $x \in \mathbb R^d$. We say

3252: that $f \in L^p(\mathbb R^d)$ belongs to the anisotropic Besov space

3253: $B_{p, q}^{\bs s}(\mathbb R^d)$ if the semi-norm

3254: \begin{equation*}

3255:   |f|_{B_{p, q}^{\bs s}(\mathbb R^d)} := \sum_{i=1}^d \Big(

3256:   \int_0^1 (t^{-s_i} \norm{\Delta_{t e_i}^{k_i} f}_{p})^q

3257:   \frac{dt}{t} \Big)^{1/q}

3258: \end{equation*}

3259: is finite (with the usual modifications when $p = \infty$ or $q =

3260: \infty$). We know that the norms

3261: \begin{equation*}

3262:   \norm{f}_{B_{p, q}^{\bs s}} := \norm{f}_p + |f|_{B_{p, q}^{\bs s}}

3263: \end{equation*}

3264: are equivalent for any choice of $k_i > s_i$. An equivalent definition

3265: of the seminorm can be given using the directional differences and the

3266: anisotropic distance, see Theorem~5.8 in~\cite{triebel06}.

3267: % To make the presentation simple, we first define on $\mathbb R^d$

3268: % and then on some domain $\Omega \subset \mathbb R^d$.

3269: Following Section~5.3.3 in~\cite{triebel06}, we can define the

3270: anisotropic Besov space on an arbitrary domain $\Omega \subset \mathbb

3271: R^d$ (think of $\Omega$ as the support of the design $X$) in the

3272: following way. We define $B_{p, q}^{\bs s}(\Omega)$ as the set of all

3273: $f \in L^p(\Omega)$ such that there is $g \in B_{p, q}^{\bs s}(\mathbb

3274: R^d)$ with restriction $g | \Omega$ to $\Omega$ equal to $f$ in

3275: $L^p(\Omega)$. Moreover,

3276: \begin{equation*}

3277:   \norm{f}_{B_{p, q}^{\bs s}(\Omega)} = \inf_{g : g|\Omega = f}

3278:   \norm{g}_{B_{p, q}^{\bs s}(\mathbb R^d)},

3279: \end{equation*}

3280: where the infimum is taken over all $g \in B_{p, q}^{\bs s}(\mathbb

3281: R^d)$ such that $g | \Omega = f$. In an equivalent way, the space

3282: $B_{p, q}^{\bs s}(\Omega)$ can be defined using intrinsic

3283: characterisations by differences, see Section~4.1.4

3284: in~\cite{triebel06}, where the idea is, roughly, to restrict the

3285: increments $h$ in the differences $\Delta_h^k$ so that the support of

3286: $\Delta_h^k f$ is included in $\Omega$.

3287:

3288: In what follows, we shall remove from the notations the dependence on

3289: $\Omega$, since it does not affect the definitions and results

3290: below. Moreover, for what we need in this paper, we shall simply take

3291: $\Omega$ as the support of the design $X$. Several explicit particular

3292: cases for the space $B_{p, q}^{\bs s}$ are of interest. If $\bs s =

3293: (s, \ldots, s)$ for some $s > 0$, then $B_{p, q}^{\bs s}$ is the

3294: standard isotropic Besov space. When $p = q = 2$ and $\bs s = (s_1,

3295: \ldots, s_d)$ has integer coordinates, $B_{2, 2}^{\bs s}$ is the

3296: anisotropic Sobolev space

3297: \begin{equation*}

3298:   B_{2, 2}^{\bs s} = W_2^{\bs s} = \Big\{ f \in L^2 : \sum_{i=1}^d

3299:   \Big\| \frac{\partial^{s_i} f}{\partial x_i^{s_i}} \Big\|_2 < \infty

3300:   \Big\}.

3301: \end{equation*}

3302: If $\bs s$ has non-integer coordinates, then $B_{2, 2}^{\bs s}$ is the

3303: anisotropic Bessel-potential space

3304: \begin{equation*}

3305:   H^{\bs s} = \Big\{ f \in L^2 : \sum_{i=1}^d \Big\| (1 +

3306:   |\xi_i|^2)^{s_i/2} \hat f(\xi) \Big\|_2 < \infty \Big\}.

3307: \end{equation*}

3308:

3309:

3310: The results described in the next section are direct consequences of

3311: the transference method, see Section~5.3 in~\cite{triebel06}. Roughly,

3312: the idea is to transfer problems for anisotropic spaces via sequence

3313: space (one can think of sequence of wavelet coefficients for instance)

3314: to isotropic spaces. This technique allows one to prove the statements

3315: below. Note that another technique of proof based on replicant coding

3316: can be used, see~\cite{kerk_picard_replicant_03}. This is commented

3317: below.

3318:

3319: \subsection{Embeddings and entropy numbers}

3320:

3321: % Let $\bs \sigma = (\sigma_1, \ldots, \sigma_d)$ be a fixed vector with

3322: % $\sigma_i > 0$ and harmonic mean equal to $1$, that is $\sum_{i=1}^d 1

3323: % / \sigma_i = d$. If $s > 0$, we denote for short $s \bs \sigma = (s

3324: % \sigma_1, \ldots, s \sigma_d)$.

3325:

3326: % Using together Theorems~5.28 and~1.97 in \cite{triebel06}, we have the

3327: % following statements. If $0 < s_1 < s_0$, we have

3328: % \begin{equation}

3329: %   \label{eq:embedding1}

3330: %   B_{p, q}^{s_0 \bs \sigma} \subset B_{p, q}^{s_1 \bs \sigma}.

3331: % \end{equation}

3332:

3333: Let us first mention the following obvious embedding, which is useful

3334: for the proof of adaptive upper bound (see

3335: Section~\ref{sec:derive_adaptive}). If $0 < \bs s_1 \leq \bs s_0$

3336: coordinatewise, that is $0 < s_{1, i} \leq s_{0, i}$ for any $i \in \{

3337: 1, \ldots, d \}$, we have

3338: \begin{equation}

3339:   \label{eq:anisotropic_embedding}

3340:   B_{p, q}^{\bs s_0} \subset B_{p, q}^{\bs s_1}.

3341: \end{equation}

3342: This simply follows from the fact that $B_{p, q}^{\bs s} =

3343: \cap_{i=1}^d B_{p, q, i}^{s_i}$, where $B_{p, q, i}^{s_i}$ is the

3344: corresponding Besov space in the $i$-th direction of coordinates, with

3345: norm $L^p$ extended to the other $d-1$ directions (see Remark~5.7 in

3346: \cite{triebel06}) together with the standard embedding for the

3347: isotropic Besov space.

3348:

3349: % \subsection{Entropy numbers}

3350:

3351: As we mentioned above, Assumption~$(C_\beta)$ (see

3352: Section~\ref{sec:pena_least_squares}) is satisfied for nearly all

3353: smoothness spaces considered in nonparametric literature. In

3354: particular, if $\mathcal F = B_{p,q}^{\bs s}$ is the anisotropic Besov

3355: space defined above, $(C_\beta)$ is satisfied: it is a consequence of

3356: a more general Theorem (see Theorem~5.30 in \cite{triebel06})

3357: concerning the entropy numbers of embeddings (see Definition~1.87 in

3358: \cite{triebel06}). Here, we only give a simplified version of this

3359: Theorem, which is sufficient to derive $(C_\beta)$. Indeed, if one

3360: takes $\bs s_0 = \bs s$, $p_0 = p$, $q_0 = q$ and $\bs s_1 = 0$, $p_1

3361: = \infty$, $q_1 = \infty$ in Theorem~5.30 from \cite{triebel06}, we

3362: obtain the following

3363: \begin{theorem}

3364:   \label{thm:anisotropic_entropy}

3365:   Let $1 \leq p, q \leq \infty$ and $\bs s = (s_1, \ldots, s_d)$ where

3366:   $s_i > 0$\textup, and let $\bs {\bar s}$ be the harmonic mean of

3367:   $\bs s$ \textup(see~\eqref{eq:harmonic_mean}\textup). Whenever $\bs

3368:   {\bar s} > d / p$\textup, we have

3369:   \begin{equation*}

3370:     B_{p, q}^{\bs s} \subset C(\Omega),

3371:   \end{equation*}

3372:   where $C(\Omega)$ is the set of continuous functions on

3373:   $\Omega$\textup, and for any $\delta > 0$\textup, the sup-norm

3374:   entropy of the unit ball of the anisotropic Besov space\textup,

3375:   namely the set

3376:   \begin{equation*}

3377:     U_{p, q}^{\bs s} := \{ f \in B_{p, q}^{\bs s} :

3378:     |f|_{B_{p,q}^{\bs s}} \leq 1 \}

3379:   \end{equation*}

3380:   satisfies

3381:   \begin{equation}

3382:     H_\infty(\delta, U_{p, q}^{\bs s}) \leq D \delta^{-d / \bs {\bar s}},

3383:   \end{equation}

3384:   where $D > 0$ is a constant independent of $\delta$.

3385: \end{theorem}

3386:

3387: For the isotropic Sobolev space, Theorem~\ref{thm:anisotropic_entropy}

3388: was obtained in the key paper~\cite{birman_solomjak67} (see

3389: Theorem~5.2 therein), and for the isotropic Besov space, it can be

3390: found, among others, in~\cite{birge_massart00}

3391: and~\cite{kerk_picard_replicant_03}.

3392:

3393: \begin{remark}

3394:   A more constructive computation of the entropy of anisotropic Besov

3395:   spaces can be done using the replicant coding approach, which is

3396:   done for Besov bodies in~\cite{kerk_picard_replicant_03}. Using this

3397:   approach together with an anisotropic multiresolution analysis based

3398:   on compactly supported wavelets or atoms, see Section~5.2

3399:   in~\cite{triebel06}, we can obtain a direct computation of the

3400:   entropy. The idea is to do a quantization of the wavelet

3401:   coefficients, and then to code them using a replication of their

3402:   binary representation, and to use 01 as a separator (so that the

3403:   coding is injective). A lower bound for the entropy can be obtained

3404:   as an elegant consequence of Hoeffding's deviation inequality for

3405:   sums of i.i.d. variables and a combinatorial lemma.

3406: \end{remark}

3407:

3408: % \texttt{faudra rajouter les jackson et bernstein estimates pour la

3409: %   borne inf sur besov anisotropes}

3410:

3411:

3412: % \begin{theorem}[Birg\'e and Massart (2000), Corollary~1]

3413: %   \label{thm:birge_massart}

3414:

3415: % \end{theorem}

3416:

3417: % \begin{remark}

3418: %   When $p=2$ and $s \in \mathbb N_0$, we recover the result

3419: %   from~\cite{birman_solomjak67}, namely

3420: %   \begin{equation*}

3421: %     N\big( \delta, W_s(R), \norm{\cdot}_{L^q} \big) \leq

3422: %     \exp\Big( D \Big( \frac{L}{\delta} \Big)^{d/s} \Big),

3423: %   \end{equation*}

3424: %   where $W_s(R) := \{ f \in W_s : J_s(f) \leq R \}$,

3425: %   see~\eqref{eq:usual_roughness}.

3426: % \end{remark}

3427: % The result from~\cite{birman_solomjak67} was previously used

3428: % in~\cite{mammen_vandegeer97}, for estimation in partial linear

3429: % models. % In~\cite{birman_solomjak67}, it is stated in a more general

3430: % % setting, for any $L^q$-norm with $1 \leq q \leq +\infty$.

3431: % The fact that this result holds for the $L^q$-norm, $q = \infty$

3432: % included, is important here. Indeed, a cover for $L^\infty$-norm is

3433: % also a cover for both the $L^2(P_X)$ and $L^2(P_X^m)$ norms (simply

3434: % write that $\norm{f}_{L^2(P_X)} \leq \norm{f}_\infty$ and

3435: % $\norm{f}_{L^2(P_X^m)} \leq \norm{f}_\infty$.)

3436:

3437: % \texttt{Besov sur un domaine plutot ??}

3438:

3439: % \subsection{Multiscale setting}

3440:

3441: % Let $M$ be a dilatation matrix in $\mathcal M_d(\mathbb Z)$, namely a

3442: % matrix with integer entries and eigenvalues outside the unit disk. Let

3443: % $\varphi \in H^s$. We say that $\varphi$ is a \emph{$M$-scaling

3444: %   function} if it is compactly supported, if $\int_{\mathbb R^d}

3445: % \varphi(x) dx = 1$ and if

3446: % \begin{itemize}

3447: % \item there is a finite sequence of complex numbers $(h_k)_{k \in

3448: %     \mathbb Z^d}$ such that

3449: %   \begin{equation*}

3450: %     \varphi(x) = |\det M|^{1/2} \sum_{k \in \mathbb Z^d} h_k

3451: %     \varphi(M x - k) ;

3452: %   \end{equation*}

3453: % \item $\{ \varphi(\cdot - k) \}_{k \in \mathbb Z^d}$ is a Riesz basis

3454: %   for the space it spans.

3455: % \end{itemize}

3456: % Two $M$-scaling functions $\varphi$ and $\tilde \varphi$ are

3457: % \emph{biorthogonal $M$-scaling functions} if the systems $\{

3458: % \varphi(\cdot - k) \}_{k \in \mathbb Z^d}$ and $\{ \tilde

3459: % \varphi(\cdot - k) \}_{k \in \mathbb Z^d}$ are orthogonal.

3460:

3461: % The construction of \emph{compactly supported} $M$-scaling functions

3462: % for an arbitrary dilatation matrix is a very difficult subject of

3463: % current research. Indeed, even in one-dimension, when $M = m$ is not

3464: % integer, there is no scaling functions with compact support, see

3465: % [1].

3466:

3467: % If $M = \diag(m_1, \ldots, m_d)$ where the $m_i \geq 2$ are integers,

3468: % we can construct biorthogonal $M$-scaling functions using tensor

3469: % products of one dimensional $m_i$-scaling functions $\varphi_i, \tilde

3470: % \varphi_i \in H^s(\mathbb R)$ for an arbitrary large smoothness

3471: % $s$. The construction of biorthogonal compactly supported

3472: % one-dimensional $m$-scaling functions for any integer $m \geq 2$ can

3473: % be found in ???? Then, can we simply consider

3474: % \begin{equation*}

3475: %   \varphi(x) = \prod_{i=1}^d \varphi_i(x_i) \text{ and } \tilde

3476: %   \varphi(x) = \prod_{i=1}^d \tilde \varphi_i(x_i)

3477: % \end{equation*}

3478: % to obtain compactly supported biorthogonal $M$-scaling functions. Let

3479: % us consider the matrix

3480: % \begin{equation}

3481: %   \label{eq:particular_dilatation_matrix}

3482: %   M = \diag(\lambda^{1 / \sigma_1}, \ldots,

3483: %   \lambda^{1 / \sigma_d}).

3484: % \end{equation}

3485: % The following Lemma can be found in

3486: % \begin{lemma}[see Lemma 3.2 in  ????]

3487: %   Let $\bs \sigma = (\sigma_1, \ldots, \sigma_d) > 0$. The following

3488: %   conditions are equivalent:

3489: %   \begin{itemize}

3490: %   \item There is a number $\lambda > 1$ such that $\lambda^{1 /

3491: %       \sigma_i} \in \mathbb Z_+$ for $1 \leq i \leq d$

3492: %   \item There is a number $\mu > 0$ such that $(1 / \sigma_1, \ldots,

3493: %     1 / \sigma_d) \in \mu \log \mathbb Z_+^d$.

3494: %   \end{itemize}

3495: % \end{lemma}

3496: % Thus, we know that when

3497: % \begin{equation}

3498: %   \label{eq:anisotropic_restriction}

3499: %   \Big( \frac{1}{\sigma_1}, \ldots, \frac{1}{\sigma_d} \Big) \in \mu

3500: %   \log \mathbb Z_+^d

3501: % \end{equation}

3502: % for some $\mu > 0$, we can find compactly supported biorthogonal

3503: % $M$-scaling functions. A multiresolution analysis of $L^\pi$ for $1

3504: % \leq \pi \leq \infty$ based on such scaling functions can be easily

3505: % construted, in the same way as in the dyadic case where $m_i =

3506: % 2$). This is explained in details in ????. We define $\varphi_{j,

3507: %   k}(x) := |\det M|^{j / \pi} \varphi(M^j x - k)$ where $j \in \mathbb

3508: % Z$ is the resolution level and $k \in \mathbb Z^d$ is the localization

3509: % parameter. When $M$ is given

3510: % by~\eqref{eq:particular_dilatation_matrix} we can write

3511: % \begin{equation*}

3512: %   \varphi_{j, k}(x) := \lambda^{j d / p} \varphi(M^j x - k).

3513: % \end{equation*}

3514: % These dilated and translated scaling functions are normalized in

3515: % $L^\pi$ (if $\pi = \infty$, take $\pi = 1$ in the above definition and

3516: % take a scaling function \texttt{c'est faux !!!} divide $\varphi_{j,

3517: %   k}$ by $\norm{\varphi}_\infty$, so that $\norm{\varphi_{j,

3518: %     k}}_\infty = 1$). If we define for $j \in \mathbb Z$

3519: % \begin{equation*}

3520: %   V_j = \overline{ \Span\{ \varphi_{j, k} : k \in \mathbb Z^d \} }

3521: % \end{equation*}

3522: % which is the closure of the $\Span$ of the $\varphi_{j, k}$ in

3523: % $L^\pi$, then $(V_j)_{j \in \mathbb Z_d}$ is a multiresolution

3524: % analysis of $L^\pi$ (again, if $\pi = \infty$ then $L^\infty$ is

3525: % replaced by $C(\mathbb R^d)$). We can define in the same way dilated

3526: % and translated scaling functions $\tilde \varphi_{j, k}$, and

3527: % construct as a consequence multiresolution analysis of $L^\pi$.

3528:

3529: % A remark of first importance in what follows is then the following: if

3530: % $x$ is fixed, then $\varphi_{e, j, k}$ \texttt{mettre ca apres la MRA}

3531: % $K_j$ of cardinaly $|K_j| \approx \lambda^{j d}$ (recall that by

3532: % construction $\lambda$ is an integer).

3533:

3534: % $\beta_{e, j, k} := \langle f, \tilde \psi_{e, j, k} \rangle$

3535:

3536: % $E = \{ 1, \ldots, m \}$

3537:

3538: % \texttt{apres la MRA:}

3539: % For any $f \in L^\pi$

3540: % \begin{equation*}

3541: %   \Big\| \sum_{e \in E, k \in K_j} \beta_{e, k} \psi_{e, j, k}

3542: %   \Big\|_{L^\pi} \approx \Big( \sum_{e \in E, k \in K_j} |\beta_{e, j,

3543: %     k}|^\pi \Big)^{1/\pi}

3544: % \end{equation*}

3545: % with the usual modification whenever $\pi = \infty$.

3546:

3547: % or equivalently,

3548:

3549:

3550: % where the above sums are convergent in $L^\pi$

3551:

3552: % \texttt{mettre estimees de jackson et bernstein}

3553:

3554:

3555:

3556: % This is the reason why the entropy of anisotropic Besov space

3557: % will given only be able to use the caracterization of anisotropic

3558: % Besov spaces by wavelet coefficients for

3559:

3560:

3561:

3562: % This is the reason why the

3563:

3564: % It is well-known that wavelets are a powerful tool for the

3565: % characterazition of Besov spaces, by means of sums weighted sums of

3566: % wavelet coefficient. Besov isotropic classes can be defined in this

3567: % way, using basis with

3568:

3569: % The use of compaclty supported wavelets is of first importance in

3570: % statistics for instance, and the fact that the number of

3571:

3572: % A key tool for the

3573:

3574: % A powerful way of described isotropic Besov spaces is Wavelet. Indeed,

3575: % it is well known that

3576:

3577: % If $\beta_{j, k} = \prodsca{f}{\tilde \psi_{e, j, k}}$

3578:

3579: % \begin{equation*}

3580: %   \frac{1}{C} \Big( \sum_{e, k} |\beta_{e, j, k} |^p \Big)^{1/p} \leq \Big

3581: %   \| \sum_{e=1}^{m-1} \sum_{k \in \mathbb Z^d}  \beta_{e, j, k}

3582: %   \psi_{e, j, k} \Big\|_p \leq C \Big( \sum_{e, k} |\beta_{e, j, k}

3583: %   |^p \Big)^{1/p}

3584: % \end{equation*}

3585:

3586:

3587:

3588: \section{Some probabilistic tools}

3589: \label{sec:appendix_proba}

3590:

3591:

3592: For the first Theorem we refer to \cite{EM:96}. The two following

3593: Theorems can be found, for instance, in

3594: \cite{massart03,vdVW:96,ledoux_talagrand91}.

3595:

3596:

3597: \begin{theorem}[Einmahl and Mason (1996)]

3598:   \label{TheoEinmahlMasson}

3599:   Let $Z_1,\ldots,Z_n$ be $n$ independent non-negative random

3600:   variables such that $E[Z_i^2]\leq \sigma^2,\forall i=1, \ldots, n$.

3601:   Then, we have, for any $\delta > 0$,

3602:   \begin{equation*}

3603:     P \Big[\sum_{i=1}^n (Z_i - E[Z_i]) \leq -n \delta \Big]

3604:     \leq \exp\Big(-\frac{n \delta^2}{2\sigma^2} \Big).

3605:   \end{equation*}

3606: \end{theorem}

3607:

3608:

3609: \begin{theorem}[Sudakov]

3610:   \label{TheoSudakov}

3611:   There exists an absolute constant $c^*>0$ such that for any integer

3612:   $M$, any centered gaussian vector $X = (X_1,\ldots,X_M)$ in

3613:   $\mathbb{R}^M$, we have,

3614:   \begin{equation*}

3615:     c^* E[\max_{1\leq j\leq M}X_j] \geq \varepsilon \sqrt{\log M},

3616:   \end{equation*}

3617:   where $\varepsilon := \min \Big\{ \sqrt{E[(X_i-X_j)^2]} : i \neq j

3618:   \in \{1, \ldots, M\} \Big\}$.

3619: \end{theorem}

3620:

3621: \begin{theorem}[Maximal inequality]

3622:   \label{TheoMaxConcIneq}

3623:   Let $Y_1, \ldots, Y_M$ be $M$ random variables satisfying

3624:   $E[\exp(sY_j)] \leq \exp((s^2\sigma^2)/2)$ for any $j \in \{1, \ldots, M\}$ and

3625:   any $s>0$. Then, we have

3626:   \begin{equation*}

3627:     E[ \max_{1 \leq j \leq M} Y_j] \leq \sigma \sqrt{2 \log M}.

3628:   \end{equation*}

3629: \end{theorem}

3630:

3631: % \begin{theorem}[Berry-Ess{\'e}en]\label{TheoBerry}

3632: % Suppose that $(X_i)_{i\in\mathbb{N}}$ is a sequence of i.i.d. random

3633: % variables with mean $\mu$ and variance $\sigma^2>0$. Then, for all

3634: % $n$,

3635: % $$\sup_{t\in\mathbb{R}}\left\vert\mathbb{P}\Big(\frac{\sum_{i=1}^n X_i-n\mu}

3636: % {\sigma \sqrt{n}}\leq t \Big)-\Phi(t)\right\vert\leq

3637: % \frac{33}{4}\frac{\mathbb{E}|X_1-\mu|^3}{\sigma^3\sqrt{n}}.$$

3638: % \end{theorem}

3639:

3640: \par

3641:

3642:

3643: % \bibliographystyle{ims}

3644:

3645: \begin{thebibliography}{48}

3646: \expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi

3647: \expandafter\ifx\csname url\endcsname\relax

3648:   \def\url#1{\texttt{#1}}\fi

3649: \expandafter\ifx\csname urlprefix\endcsname\relax\def\urlprefix{URL }\fi

3650: \providecommand{\eprint}[2][]{\url{#2}}

3651:

3652: \bibitem[{Amato et~al.(2006)Amato, Antoniadis and

3653:   Pensky}]{amato_antoniadis_pensky06}

3654: \textsc{Amato, U.}, \textsc{Antoniadis, A.} and \textsc{Pensky, M.} (2006).

3655: \newblock Wavelet kernel penalized estimation for non-equispaced design

3656:   regression.

3657: \newblock \textit{Stat. Comput.}, \textbf{16} 37--55.

3658:

3659: \bibitem[{Aronszajn(1950)}]{aronszajn50}

3660: \textsc{Aronszajn, N.} (1950).

3661: \newblock Theory of reproducing kernels.

3662: \newblock \textit{Trans. Amer. Math. Soc.}, \textbf{68} 337--404.

3663:

3664: \bibitem[{Bartlett and Mendelson(2006)}]{BM:06}

3665: \textsc{Bartlett, P.~L.} and \textsc{Mendelson, S.} (2006).

3666: \newblock Empirical minimization.

3667: \newblock \textit{Probab. Theory Related Fields}, \textbf{135} 311--334.

3668:

3669: \bibitem[{Birg\'e and Massart(1993)}]{birge_massart93}

3670: \textsc{Birg\'e, L.} and \textsc{Massart, P.} (1993).

3671: \newblock {Rates of convergence for minimum contrast estimators.}

3672: \newblock \textit{Probab. Theory Relat. Fields}, \textbf{97} 113--150.

3673:

3674: \bibitem[{Birg{\'e} and Massart(2000)}]{birge_massart00}

3675: \textsc{Birg{\'e}, L.} and \textsc{Massart, P.} (2000).

3676: \newblock An adaptive compression algorithm in {B}esov spaces.

3677: \newblock \textit{Constr. Approx.}, \textbf{16} 1--36.

3678:

3679: \bibitem[{Birman and Solomjak(1967)}]{birman_solomjak67}

3680: \textsc{Birman, M.~{\v{S}}.} and \textsc{Solomjak, M.~Z.} (1967).

3681: \newblock Piecewise polynomial approximations of functions of classes

3682:   {$W_p^{\alpha}$}.

3683: \newblock \textit{Mat. Sb. (N.S.)}, \textbf{73 (115)} 331--355.

3684:

3685: \bibitem[{Bitouz{\'e} et~al.(1999)Bitouz{\'e}, Laurent and Massart}]{BLM99}

3686: \textsc{Bitouz{\'e}, D.}, \textsc{Laurent, B.} and \textsc{Massart, P.} (1999).

3687: \newblock A {D}voretzky-{K}iefer-{W}olfowitz type inequality for the

3688:   {K}aplan-{M}eier estimator.

3689: \newblock \textit{Ann. Inst. H. Poincar\'e Probab. Statist.}, \textbf{35}

3690:   735--763.

3691:

3692: \bibitem[{Carl and Stephani(1990)}]{CS:98}

3693: \textsc{Carl, B.} and \textsc{Stephani, I.} (1990).

3694: \newblock \textit{Entropy, compactness and the approximation of operators},

3695:   vol.~98 of \textit{Cambridge Tracts in Mathematics}.

3696: \newblock Cambridge University Press, Cambridge.

3697:

3698: \bibitem[{Catoni(2001)}]{catbook:01}

3699: \textsc{Catoni, O.} (2001).

3700: \newblock \textit{Statistical Learning Theory and Stochastic Optimization}.

3701: \newblock Ecole d'{\'e}t{\'e} de Probabilit{\'e}s de Saint-Flour 2001, Lecture

3702:   Notes in Mathematics, Springer, N.Y.

3703:

3704: \bibitem[{Cucker and Smale(2002)}]{cucker_smale02}

3705: \textsc{Cucker, F.} and \textsc{Smale, S.} (2002).

3706: \newblock On the mathematical foundations of learning.

3707: \newblock \textit{Bull. Amer. Math. Soc. (N.S.)}, \textbf{39} 1--49

3708:   (electronic).

3709:

3710: \bibitem[{Devroye et~al.(1996)Devroye, Gy{\"o}rfi and Lugosi}]{DGL:96}

3711: \textsc{Devroye, L.}, \textsc{Gy{\"o}rfi, L.} and \textsc{Lugosi, G.} (1996).

3712: \newblock \textit{A probabilistic theory of pattern recognition}, vol.~31 of

3713:   \textit{Applications of Mathematics (New York)}.

3714: \newblock Springer-Verlag, New York.

3715:

3716: \bibitem[{Einmahl and Mason(1996)}]{EM:96}

3717: \textsc{Einmahl, U.} and \textsc{Mason, D.~M.} (1996).

3718: \newblock Some universal results on the behavior of increments of partial sums.

3719: \newblock \textit{Ann. Probab.}, \textbf{24} 1388--1407.

3720:

3721: \bibitem[{Ga\"iffas and Lecu\'e(2007)}]{gaiffas_lecue07}

3722: \textsc{Ga\"iffas, S.} and \textsc{Lecu\'e, G.} (2007).

3723: \newblock Optimal rates and adaptation in the single-index model using

3724:   aggregation.

3725: \newblock \textit{Electronic Journal of Statistics}, \textbf{1} 538--573.

3726:

3727: \bibitem[{Green and Silverman(1994)}]{green_silverman94}

3728: \textsc{Green, P.~J.} and \textsc{Silverman, B.~W.} (1994).

3729: \newblock \textit{Nonparametric regression and generalized linear models},

3730:   vol.~58 of \textit{Monographs on Statistics and Applied Probability}.

3731: \newblock Chapman \& Hall, London.

3732: \newblock A roughness penalty approach.

3733:

3734: \bibitem[{Gy{\"o}rfi et~al.(2002)Gy{\"o}rfi, Kohler, Krzy{\.z}ak and

3735:   Walk}]{kohler02}

3736: \textsc{Gy{\"o}rfi, L.}, \textsc{Kohler, M.}, \textsc{Krzy{\.z}ak, A.} and

3737:   \textsc{Walk, H.} (2002).

3738: \newblock \textit{A distribution-free theory of nonparametric regression}.

3739: \newblock Springer Series in Statistics, Springer-Verlag, New York.

3740:

3741: \bibitem[{Hamers and Kohler(2004)}]{hamers_kohler04}

3742: \textsc{Hamers, M.} and \textsc{Kohler, M.} (2004).

3743: \newblock How well can a regression function be estimated if the distribution

3744:   of the (random) design is concentrated on a finite set?

3745: \newblock \textit{J. Statist. Plann. Inference}, \textbf{123} 377--394.

3746:

3747: \bibitem[{Haussler(1992)}]{H:92}

3748: \textsc{Haussler, D.} (1992).

3749: \newblock Decision-theoretic generalizations of the {PAC} model for neural net

3750:   and other learning applications.

3751: \newblock \textit{Inform. and Comput.}, \textbf{100} 78--150.

3752:

3753: \bibitem[{Hochmuth(2002)}]{hochmuth02}

3754: \textsc{Hochmuth, R.} (2002).

3755: \newblock Wavelet characterizations for anisotropic {B}esov spaces.

3756: \newblock \textit{Appl. Comput. Harmon. Anal.}, \textbf{12} 179--208.

3757:

3758: \bibitem[{Hoffmann and Lepski(2002)}]{hoffmann_lepski02}

3759: \textsc{Hoffmann, M.} and \textsc{Lepski, O.~V.} (2002).

3760: \newblock Random rates in anisotropic regression.

3761: \newblock \textit{The Annals of Statistics}, \textbf{30} 325--396.

3762:

3763: \bibitem[{Juditsky et~al.(2005{\natexlab{a}})Juditsky, Rigollet and

3764:   Tsybakov}]{juditsky_etal05}

3765: \textsc{Juditsky, A.}, \textsc{Rigollet, P.} and \textsc{Tsybakov, A.}

3766:   (2005{\natexlab{a}}).

3767: \newblock Learning by mirror averaging.

3768: \newblock \urlprefix\url{http://arxiv.org/abs/math/0511468}.

3769:

3770: \bibitem[{Juditsky et~al.(2005{\natexlab{b}})Juditsky, Nazin, Tsybakov and

3771:   Vayatis}]{juditsky_nazin05}

3772: \textsc{Juditsky, A.~B.}, \textsc{Nazin, A.~V.}, \textsc{Tsybakov, A.~B.} and

3773:   \textsc{Vayatis, N.} (2005{\natexlab{b}}).

3774: \newblock Recursive aggregation of estimators by the mirror descent method with

3775:   averaging.

3776: \newblock \textit{Problemy Peredachi Informatsii}, \textbf{41} 78--96.

3777:

3778: \bibitem[{Juditsky et~al.(2006)Juditsky, Rigollet and Tsybakov}]{jrt:06}

3779: \textsc{Juditsky, A.~B.}, \textsc{Rigollet, P.} and \textsc{Tsybakov, A.~B.}

3780:   (2006).

3781: \newblock Learning by mirror averaging.

3782: \newblock To appear in \textit{Ann. Statist.} Available at

3783:   \url{http://www.imstat.org/aos/future_papers.html}.

3784:

3785: \bibitem[{Kearns et~al.(1994)Kearns, Schapire, Sellie and

3786:   Hellerstein}]{KSSH:94}

3787: \textsc{Kearns, M.~J.}, \textsc{Schapire, R.~E.}, \textsc{Sellie, L.~M.} and

3788:   \textsc{Hellerstein, L.} (1994).

3789: \newblock Toward efficient agnostic learning.

3790: \newblock In \textit{Machine Learning}. ACM Press, 341--352.

3791:

3792: \bibitem[{Kerkyacharian et~al.(2001)Kerkyacharian, Lepski and

3793:   Picard}]{kerk_lepski_picard01}

3794: \textsc{Kerkyacharian, G.}, \textsc{Lepski, O.} and \textsc{Picard, D.} (2001).

3795: \newblock Nonlinear estimation in anisotropic multi-index denoising.

3796: \newblock \textit{Probab. Theory Related Fields}, \textbf{121} 137--170.

3797:

3798: \bibitem[{Kerkyacharian et~al.(2007)Kerkyacharian, Lepski and

3799:   Picard}]{kerk_lepski_picard07}

3800: \textsc{Kerkyacharian, G.}, \textsc{Lepski, O.} and \textsc{Picard, D.} (2007).

3801: \newblock Nonlinear estimation in anisotropic multiindex denoising. {S}parse

3802:   case.

3803: \newblock \textit{Teor. Veroyatn. Primen.}, \textbf{52} 150--171.

3804:

3805: \bibitem[{Kerkyacharian and Picard(2003)}]{kerk_picard_replicant_03}

3806: \textsc{Kerkyacharian, G.} and \textsc{Picard, D.} (2003).

3807: \newblock Replicant compression coding in {B}esov spaces.

3808: \newblock \textit{ESAIM Probab. Stat.}, \textbf{7} 239--250 (electronic).

3809:

3810: \bibitem[{Kerkyacharian and Picard(2007)}]{kerk_picard07}

3811: \textsc{Kerkyacharian, G.} and \textsc{Picard, D.} (2007).

3812: \newblock Thresholding in learning theory.

3813: \newblock \textit{Constr. Approx.}, \textbf{26} 173--203.

3814:

3815: \bibitem[{Kimeldorf and Wahba(1971)}]{kimeldorf_wahba71}

3816: \textsc{Kimeldorf, G.} and \textsc{Wahba, G.} (1971).

3817: \newblock Some results on {T}chebycheffian spline functions.

3818: \newblock \textit{J. Math. Anal. Appl.}, \textbf{33} 82--95.

3819:

3820: \bibitem[{Kohler(2000)}]{kohler00}

3821: \textsc{Kohler, M.} (2000).

3822: \newblock Inequalities for uniform deviations of averages from expectations

3823:   with applications to nonparametric regression.

3824: \newblock \textit{J. Statist. Plann. Inference}, \textbf{89} 1--23.

3825:

3826: \bibitem[{Kohler and Krzy{\.z}ak(2001{\natexlab{a}})}]{kohler_krzyzak01a}

3827: \textsc{Kohler, M.} and \textsc{Krzy{\.z}ak, A.} (2001{\natexlab{a}}).

3828: \newblock Nonparametric regression estimation using penalized least squares.

3829: \newblock \textit{IEEE Trans. Inform. Theory}, \textbf{47} 3054--3058.

3830:

3831: \bibitem[{Kohler and Krzy{\.z}ak(2001{\natexlab{b}})}]{kohler_krzyzak01b}

3832: \textsc{Kohler, M.} and \textsc{Krzy{\.z}ak, A.} (2001{\natexlab{b}}).

3833: \newblock Nonparametric regression estimation using penalized least squares.

3834: \newblock \textit{IEEE Trans. Inform. Theory}, \textbf{47} 3054--3058.

3835:

3836: \bibitem[{Lecu{\'e}(2006)}]{LecJMLR:06}

3837: \textsc{Lecu{\'e}, G.} (2006).

3838: \newblock Lower bounds and aggregation in density estimation.

3839: \newblock \textit{J. Mach. Learn. Res.}, \textbf{7} 971--981.

3840:

3841: \bibitem[{Lecu{\'e}(2007)}]{LecAoS:07}

3842: \textsc{Lecu{\'e}, G.} (2007).

3843: \newblock Simultaneous adaptation to the margin and to complexity in

3844:   classification.

3845: \newblock \textit{Ann. Statist.}, \textbf{35} 1698--1721.

3846:

3847: \bibitem[{Ledoux and Talagrand(1991)}]{ledoux_talagrand91}

3848: \textsc{Ledoux, M.} and \textsc{Talagrand, M.} (1991).

3849: \newblock \textit{Probability in {B}anach spaces}, vol.~23 of

3850:   \textit{Ergebnisse der Mathematik und ihrer Grenzgebiete (3) [Results in

3851:   Mathematics and Related Areas (3)]}.

3852: \newblock Springer-Verlag, Berlin.

3853: \newblock Isoperimetry and processes.

3854:

3855: \bibitem[{Leung and Barron(2006)}]{leung_barron06}

3856: \textsc{Leung, G.} and \textsc{Barron, A.~R.} (2006).

3857: \newblock Information theory and mixing least-squares regressions.

3858: \newblock \textit{IEEE Trans. Inform. Theory}, \textbf{52} 3396--3410.

3859:

3860: \bibitem[{Massart(2007)}]{massart03}

3861: \textsc{Massart, P.} (2007).

3862: \newblock \textit{Concentration inequalities and model selection}, vol.~1896 of

3863:   \textit{Lecture Notes in Mathematics}.

3864: \newblock Springer, Berlin.

3865: \newblock Lectures from the 33rd Summer School on Probability Theory held in

3866:   Saint-Flour, July 6--23, 2003, With a foreword by Jean Picard.

3867:

3868: \bibitem[{Neumann(2000)}]{neumann00}

3869: \textsc{Neumann, M.~H.} (2000).

3870: \newblock Multivariate wavelet thresholding in anisotropic function spaces.

3871: \newblock \textit{Statist. Sinica}, \textbf{10} 399--431.

3872:

3873: \bibitem[{Steinwart and Scovel(2007)}]{SS:07}

3874: \textsc{Steinwart, I.} and \textsc{Scovel, C.} (2007).

3875: \newblock Fast rates for support vector machines using {G}aussian kernels.

3876: \newblock \textit{Ann. Statist.}, \textbf{35} 575--607.

3877:

3878: \bibitem[{Triebel(2006)}]{triebel06}

3879: \textsc{Triebel, H.} (2006).

3880: \newblock \textit{Theory of function spaces. {III}}, vol.~100 of

3881:   \textit{Monographs in Mathematics}.

3882: \newblock Birkh\"auser Verlag, Basel.

3883:

3884: \bibitem[{Tsybakov(2003{\natexlab{a}})}]{tsybakov03}

3885: \textsc{Tsybakov, A.} (2003{\natexlab{a}}).

3886: \newblock \textit{Introduction \`a l'estimation non-param\'etrique}.

3887: \newblock Springer.

3888:

3889: \bibitem[{Tsybakov(2003{\natexlab{b}})}]{tsy:03}

3890: \textsc{Tsybakov, A.~B.} (2003{\natexlab{b}}).

3891: \newblock Optimal rates of aggregation.

3892: \newblock \textit{Computational Learning Theory and Kernel Machines.

3893:   B.~Sch{\"o}lkopf and M.~Warmuth, eds. Lecture Notes in Artificial

3894:   Intelligence}, \textbf{2777} 303--313.

3895: \newblock Springer, Heidelberg.

3896:

3897: \bibitem[{van~de Geer(1990)}]{vandegeer90}

3898: \textsc{van~de Geer, S.} (1990).

3899: \newblock Estimating a regression function.

3900: \newblock \textit{Ann. Statist.}, \textbf{18} 907--924.

3901:

3902: \bibitem[{van~de Geer(2007)}]{vdg07}

3903: \textsc{van~de Geer, S.} (2007).

3904: \newblock Oracle inequalities and regularization.

3905: \newblock In \textit{Lectures on empirical processes}. EMS Ser. Lect. Math.,

3906:   Eur. Math. Soc., Z\"urich, 191--252.

3907:

3908: \bibitem[{van~de Geer(2000)}]{van_de_geer00}

3909: \textsc{van~de Geer, S.~A.} (2000).

3910: \newblock \textit{Applications of empirical process theory}, vol.~6 of

3911:   \textit{Cambridge Series in Statistical and Probabilistic Mathematics}.

3912: \newblock Cambridge University Press, Cambridge.

3913:

3914: \bibitem[{van~der Vaart and Wellner(1996)}]{vdVW:96}

3915: \textsc{van~der Vaart, A.~W.} and \textsc{Wellner, J.~A.} (1996).

3916: \newblock \textit{Weak convergence and empirical processes}.

3917: \newblock Springer Series in Statistics, Springer-Verlag, New York.

3918: \newblock With applications to statistics.

3919:

3920: \bibitem[{Wahba(1990)}]{wahba90}

3921: \textsc{Wahba, G.} (1990).

3922: \newblock \textit{Spline models for observational data}, vol.~59 of

3923:   \textit{CBMS-NSF Regional Conference Series in Applied Mathematics}.

3924: \newblock Society for Industrial and Applied Mathematics (SIAM), Philadelphia,

3925:   PA.

3926:

3927: \bibitem[{Yang(2000)}]{yang:00}

3928: \textsc{Yang, Y.} (2000).

3929: \newblock Mixing strategies for density estimation.

3930: \newblock \textit{Ann. Statist.}, \textbf{28} 75--87.

3931:

3932: \bibitem[{Yang(2004)}]{yang04}

3933: \textsc{Yang, Y.} (2004).

3934: \newblock Aggregating regression procedures to improve performance.

3935: \newblock \textit{Bernoulli}, \textbf{10} 25--47.

3936:

3937: \end{thebibliography}

3938:

3939:

3940: % \bibliography{biblio}

3941:

3942:

3943: \end{document}