0810.5288/GL2.tex
1: 
2: 
3: \documentclass[aos, reqno, preprint]{imsart}%
4: \RequirePackage{amsthm, amsmath, natbib, amsfonts, amssymb}%
5: \RequirePackage[OT1]{fontenc}%
6: \usepackage{graphicx, color}%
7: \usepackage{tikz}%
8: \usepackage{natbib}%
9: 
10: 
11: \numberwithin{equation}{section}%
12: \theoremstyle{plain}%
13: % \newtheorem{theorem}{Theorem}[section]
14: 
15: \definecolor{darkblue}{rgb}{0.0,0.0,0.7}
16: 
17: \RequirePackage[%
18: colorlinks = true,%
19: linkcolor = darkblue,%
20: citecolor = darkblue,%
21: urlcolor = darkblue, %
22: ]{hyperref}%
23: 
24: 
25: \hypersetup{%
26:   pdfauthor = {St\'ephane Ga\"iffas, Guillaume Lecu\'e},%
27:   pdftitle = {Aggregation of penalized empirical risk minimizers in
28:     regression},%
29:   pdfcreator = {pdflatex},%
30:   pdfproducer = {pdflatex}}
31: 
32: \startlocaldefs
33: 
34: \def \egal {\stackrel{{\rm def}}{=}}
35: 
36: \newcommand \cA{{\cal A}}
37: \newcommand \cB{{\cal B}}
38: \newcommand \cC{{\cal C}}
39: \newcommand \cD{{\cal D}}
40: \newcommand \cE{{\cal E}}
41: \newcommand \cF{{\cal F}}
42: \newcommand \cG{{\cal G}}
43: \newcommand \cH{{\cal H}}
44: \newcommand \cI{{\cal I}}
45: \newcommand \cL{{\cal L}}
46: \newcommand \cM{{\cal M}}
47: \newcommand \cN{{\cal N}}
48: \newcommand \cO{{\cal O}}
49: \newcommand \cP{{\cal P}}
50: \newcommand \cR{{\cal R}}
51: \newcommand \cQ{{\cal Q}}
52: \newcommand \cS{{\cal S}}
53: \newcommand \cU{{\cal U}}
54: \newcommand \cX{{\cal X}}
55: \newcommand \cY{{\cal Y}}
56: \newcommand \cZ{{\cal Z}}
57: \newcommand{\smin}{s_{\min}}%
58: \newcommand{\smax}{s_{\max}}
59: 
60: \newcommand \R{{\mathbb  R}}
61: \newcommand \E{{\mathbb  E}}
62: \newcommand \V{{\mathbb  V}}
63: 
64: \newcommand{\T}{^{\top}}%
65: \newcommand{\var}{\text{Var}}%
66: \newcommand{\prodsca}[2]{\langle #1,#2 \rangle}%
67: \newcommand{\norm}[1]{\|#1\|}%
68: \newcommand{\ind}[1]{\mathbf 1_{#1}}%
69: \newcommand{\mb}{\mathbf}
70: \newcommand{\sumin}{\sum_{i=1}^n}
71: \newcommand{\sumim}{\sum_{i=1}^m}
72: \newcommand{\bs}{\boldsymbol}
73: 
74: \newcommand{\grad}{\triangledown}
75: 
76: \DeclareMathOperator*{\supp}{Supp}
77: 
78: \DeclareMathOperator{\limInf}{liminf}
79: \DeclareMathOperator{\limSup}{limsup}
80: 
81: \DeclareMathOperator*{\argmin}{argmin}
82: \DeclareMathOperator*{\argmax}{argmax}
83: \DeclareMathOperator{\pen}{pen}
84: 
85: \DeclareMathOperator{\diag}{diag}
86: \DeclareMathOperator{\Span}{span}
87: 
88: 
89: \newcommand{\1}{{\rm 1}\kern-0.24em{\rm I}}
90: \newcommand{\hfn}{{\hat{f}_n}}
91: \renewcommand{\hat}{\widehat}
92: 
93: % \newtheorem{theo}{Theorem}%
94: \newtheorem{theorem}{Theorem}%
95: \newtheorem{corollary}{Corollary}%
96: \newtheorem{lemma}{Lemma}%
97: \newtheorem{proposition}{Proposition}%
98: % \newtheorem*{assumption}{Assumption}%
99: \theoremstyle{remark}%
100: \newtheorem*{remark}{Remark}%
101: \newtheorem{definition}{Definition}%
102: \newtheorem*{assumption}{Assumption}%
103: \newtheorem{example}{Example}%
104: 
105: 
106: \endlocaldefs
107: 
108: 
109: % \linespread{1.4}
110: 
111: 
112: \begin{document}
113: 
114: \begin{frontmatter}
115: 
116:   \title{Aggregation of penalized empirical risk minimizers in
117:     regression}%
118:   \runtitle{Aggregation of penalized empirical risk minimizers}
119: 
120:   \begin{aug}
121:     \author{\fnms{St\'ephane} \snm{Ga\"iffas}
122:       \ead[label=e1]{stephane.gaiffas@upmc.fr}} and
123:     \author{\fnms{ Guillaume} \snm{Lecu\'e}
124:       \ead[label=e2]{lecue@latp.univ-mrs.fr}}
125: 
126:     \runauthor{S. Ga\"iffas and G. Lecu\'e} \affiliation{Universit\'e
127:       Paris~6 and CNRS, LATP Marseille}
128: 
129:     \address{Universit\'e Paris 6  \\
130:       Laboratoire de Statistique Th\'eorique et Appliqu\'ee \\
131:       175 rue du Chevaleret \\
132:       75013 Paris \\
133:       \printead{e1}}
134: 
135:     \address{ Laboratoire d'analyse, topologie et probabilit\'es\\
136: 	   Centre de Math\'ematiques et Informatique\\
137: 	  Technop\^ole de Ch\^ateau-Gombert\\
138: 	  39 rue F. Joliot Curie\\
139: 	  13453 Marseille Cedex 13\\
140: 	  France\\
141:       \printead{e2}}
142:   \end{aug}
143: 
144:   \begin{abstract}
145:     We give a general result concerning the rates of convergence of
146:     penalized empirical risk minimizers (PERM) in the regression
147:     model. Then, we consider the problem of agnostic learning of the
148:     regression, and give in this context an oracle inequality and a
149:     lower bound for PERM over a finite class. These results hold for a
150:     general multivariate random design, the only assumption being the
151:     compactness of the support of its law (allowing discrete
152:     distributions for instance). Then, using these results, we
153:     construct adaptive estimators. We consider as examples adaptive
154:     estimation over anisotropic Besov spaces or reproducing kernel
155:     Hilbert spaces. Finally, we provide empirical evidence that
156:     aggregation leads to more stable estimators than more standard
157:     cross-validation or generalized cross-validation methods for the
158:     selection of the smoothing parameter, when the number of
159:     observations is small.
160:     % estimators which are Our aggregation
161:     % approach is motivated by a lower bound for PERM procedures over
162:     % a finite set of weak estimators, which proves that PERM
163:     % procedures are suboptimal compared to some exponential weighted
164:     % averaged schemes.
165:     % We propose an adaptive estimator of the multivariate regression
166:     % function $f_0$ from i.i.d. observations. Without assumption on
167:     % the law $P_X$ of the covariates, besides almost sure
168:     % boundedness, we prove that the standard rate $n^{-s / (2s + 1)}$
169:     % can be achieved by an adaptive estimator, where $n$ denotes the
170:     % sample size and $s$ the smoothness of $f_0$ measured in some
171:     % sense, including Besov smoothness. The assumption on the noise
172:     % is fairly general.
173:   \end{abstract}
174: 
175: \begin{keyword}[class=AMS]
176:   \kwd[Primary ]{62G08}
177:   \kwd[; secondary ]{62H12}
178: \end{keyword}
179: 
180: \begin{keyword}
181:   \kwd{Nonparametric regression, agnostic learning, aggregation,
182:     adaptive estimation, random design, anisotropic Besov space,
183:     reproducing kernel Hilbert spaces}
184: \end{keyword}
185: 
186: \end{frontmatter}
187: 
188: 
189: \section{Introduction}
190: \label{sec:introduction}
191: 
192: \subsection{Motivations}
193: 
194: In this paper, we explore some statistical properties of penalized
195: empirical risk minimization (PERM) and aggregation procedures in the
196: regression model. From these properties, we will be able to obtain
197: results concerning adaptive estimation for several problems. Given a
198: data set $D_n$, we consider two problems. Let us define the norm
199: $\norm{g}^2 := \int g(x)^2 P_X(dx)$ where $P_X$ is the law of the
200: covariates and let $E[\cdot]$ be the expectation w.r.t. the joint law
201: of $D_n$. The first problem is the problem of estimation of the
202: regression function $f_0$. Namely, we aim at constructing some
203: procedure $\bar{f}_n$ satisfying
204: \begin{equation}
205:   \label{eq:RateOfConvergence}
206:   E \|\bar{f}_n - f_0 \|^2 \leq \psi(n)
207: \end{equation}
208: where $\psi(n)$, called the {\it rate of convergence}, is a quantity
209: that we want to be small as $n$ increases. To get this kind of inequality,
210: it is well known that one has to assume that $f_0$ belongs to a set
211: with a small complexity (cf., for instance, the ``no free lunch
212: theorem'' in \cite{DGL:96}). This is what we do in
213: Section~\ref{sec:pena_least_squares} below, where an assumption on the
214: complexity is considered, see Assumption ($C_\beta$) on the metric
215: entropy.
216: 
217: However, this kind of ``a priori'' assumption may not be fulfilled. That is why
218: the second problem, called {\it agnostic learning}, has been introduced
219: (cf. \cite{H:92,KSSH:94} and references therein). For this problem, one is given a set $F$ of
220: functions. Without any assumption on $f_0$, we want to construct (from
221: the data) a procedure $\tilde{f}$ which has a risk as close as
222: possible to the smallest risk over $F$. Namely, we want to obtain {\it
223:   oracle inequalities}, that is inequalities of the form
224: \begin{equation*}
225:   E \| \tilde{f} - f_0 \|^2 \leq C \min_{f\in F} \|f - f_0 \|^2 +
226:   \phi(n,F),
227: \end{equation*}
228: where $C \geq 1$ and $\phi(n,F)$ is called the {\it residue}, which is
229: the quantity that we want to be small as $n$ increases.  When $F$ is
230: of finite cardinality $M$, the agnostic problem is called {\it
231:   aggregation problem} and the residue $\phi(n,F) = \phi(n,M)$ is
232: called {\it rate of aggregation}. The main difference between the
233: problems of estimation and aggregation is that we don't need any
234: assumption on $f_0$ for the second problem. Nevertheless, aggregation
235: methods have been widely used to construct adaptive procedures for the
236: estimation problem. That is the reason why we study aggregation
237: procedures in Section~\ref{sec:ERM_finite} below. We will use these
238: procedures in Section~\ref{sec:examples} to construct adaptive
239: estimators in several particular cases, such as adaptive estimation in
240: reproducing kernel Hilbert spaces (RKHS) or adaptive estimation over
241: anisotropic Besov spaces.
242: 
243: In Section~\ref{sec:ERM_finite}, we also prove that the ``natural''
244: aggregation procedure, namely empirical risk minimization (ERM) (or
245: its penalized version), fails to achieve the optimal rate of
246: aggregation in this setup. This result motivates the use of an
247: aggregation procedure instead of the most common ERM. Moreover, we
248: provide empirical evidence in Section~\ref{sec:simulations} that
249: aggregation (with jackknife) is more stable than the classical
250: cross-validation or generalized cross-validation procedures when the
251: number of observations and the signal-to-noise ratio are small.
252: 
253: The approach proposed in this paper makes it possible to give rates of
254: convergence for adaptive estimators over very general function sets,
255: such as anisotropic Besov spaces, with a very mild assumption on the
256: law of the covariates: all the results are stated under the sole
257: assumption that the law of the covariates has a compact support.
258: 
259: 
260: % We propose an adaptive estimator of the multivariate regression
261: % function $f_0$ from i.i.d. observations. This procedure has strong
262: % adaptation properties: it is adaptive for a very large range of
263: % smoothness classes, including Besov spaces, in the sense that it
264: % achieves the optimal convergence rate without assumption on the
265: % design (or covariates) distribution, besides almost sure
266: % boundedness. % Moreover, this estimator can reduce the dimension of the
267: % % problem, when a single-index assumption is satisfied.
268: % Adaptation is realized via aggregation of several so-called \emph{weak}
269: % estimators, that have in common this strong \emph{design adaptation}
270: % property. The explanatory variable $Y$ is not assumed to be bounded
271: % (we consider subgaussian noise), thus the setting considered here is
272: % more general than the so-called ``distribution free non-parametric
273: % estimation'', see for instance~\cite{kohler02}, which contains a very
274: % exhaustive and detailed presentation of methods that handle the
275: % situation where the knowledge about $P_X$ is very poor.
276: 
277: % Adaptation is achieved via \emph{aggregation} (or aggregation) of penalized
278: % least squares estimators over general spaces. From a theoretical point
279: % of view, we use probability techniques coming from empirical process
280: % theory such as covering numbers, peeling and chaining. These
281: % techniques are technical recipes that allows to counterpart the
282: % massiveness of the smoothness classes considered in nonparametric
283: % statistics. On this topic, we refer to~\cite{kohler02},
284: % \cite{vandegeer88, van_de_geer00}, which contains tools and ideas of
285: % importance here (concerning penalized least squares). From a more
286: % practical point of view, the adaptive estimator presented here allows
287: % to mix estimators that are known to provide good results for certain
288: % types of curves. A simple example proposed here is the aggregation of
289: % smoothing splines (least squares with Sobolev penalty). Instead of
290: % selecting the smoothing parameter via GCV (generalized cross
291: % validation), which is of common use in practice, we suggest to apply a
292: % aggregation algorithm to estimators computed with different smoothing
293: % parameters. This allows to consider splines with different orders
294: % simultaneously, while cubic splines are often considered alone in
295: % applications. We show (see Section~\ref{sec:simulations}) that this
296: % provides a more stable procedure than GCV, and that it gives better
297: % results. Moreover, we provide here theoretical results for this
298: % adaptive method, while theoretical knowledge about GCV (concerning
299: % adaptive rates of convergence) is poor. Furthermore, we can mix
300: % smoothing splines with other estimators, like wavelet soft
301: % thresholding for instance (least squares with a particular Besov
302: % penalty). Such an estimator gives good results whathever the curve is:
303: % either a smooth curve, coming from econometric data for instance, or a
304: % signal with bumps or rapid oscillations. When the covariates are
305: % multivariate, we can even further mix purely nonparametric estimators
306: % (with curse of dimensionality) with semiparametric estimators that
307: % process the data using the single-index assumption. The resulting
308: % adaptive estimator provides good results, whether or not the data is
309: % well explained by a single-index model, and it is rate-optimal in both
310: % cases.
311: 
312: % The main drawback of our aggregation strategy is that it has a higher
313: % computational cost than a single estimation technique with data-driven
314: % selection of smoothing parameters. But, the counterpart is that when
315: % we aggregate estimators, we do not need to test if some model is
316: % better than another. For instance, we do not test if a single-index
317: % model explains well the data, we just mix all the estimators (purely
318: % nonparametric and single-index) using our aggregation rule, and come
319: % up with an estimator that does a job which is close to the best among
320: % them, whatever the model is (it must be emphasized at this point that
321: % actually, the performance of the aggregate is much better than the
322: % best among them, this is discudded in Section~\ref{sec:simulations}
323: % below).
324: 
325: % This general formulation of penalized least squares estimation
326: % includes several standard ones, for instance penalized splines (when
327: % $\mathcal F$ is a Sobolev class) or Besov-penalty least squares
328: % estimators, that are commonly considered in signal or image-processing
329: % papers, see for instance ????. As a consequence, the general upper
330: % bound stated in Theorem~\ref{thm:least_sq} provides directly the same
331: % general upper bound for such estimators, provided that the class
332: % $\mathcal F$ satisfies some complexity bound,
333: % see~\eqref{eq:covering_assumption}.
334: 
335: % \texttt{balance entre le temps de calcul, mais pas de test a
336: %   faire....}
337: 
338: % This idea was previously developped in the pioneering works of
339: % \texttt{citer Zhang a fond ici....} and concerning aggregation, we
340: % refer to see also the works by ?????
341: 
342: % distribution free assumption non-parametric estimation This upper
343: % bound is stated without any assumption on the law of the covariates,
344: % besides boundedness.  . In particular, we do not need to assume that
345: % the law of the covariate have a density with respect to the Lebesgue
346: % measure. this upper bound is valid when the corovatiates are
347: % discrete, or satisfies an upper bound We prove that this estimator
348: % converges with the optimal rate of convergence general This
349: % estimator is based on multivariate penalized least squares
350: % estimates, and By We construct an adaptive estimator of the
351: % regression, We propose a new algorithm for the estimation of both
352: % the index and the link function in the single index model. Un beau
353: % abstract
354: 
355: 
356: \subsection{The model}
357: \label{sec:model}
358: 
359: Let $(X, Y), (X_1, Y_1), \ldots, (X_n, Y_n)$, be independent and
360: identically distributed variables in $\mathbb R^d \times \mathbb
361: R$. We consider the regression model
362: \begin{equation}
363:   \label{eq:model}
364:   Y = f_0(X) + \sigma \varepsilon,
365: \end{equation}
366: where $f_0 : \mathbb R^d \rightarrow \mathbb R$ and $\varepsilon$ is
367: called noise. To simplify, we assume that the noise level $\sigma$ is
368: known. We denote by $P$ the probability distribution of $(X,Y)$ and by
369: $P_X$ the margin distribution in $X$ or \emph{design}, or
370: \emph{covariates} distribution. We denote by $P^n$ the joint
371: distribution of the sample
372: \begin{equation*}
373:   D_n := [ (X_i, Y_i) \;;\; 1 \leq i \leq n],
374: \end{equation*}
375: and by $P_n = P^n[\cdot | X^n]$, where $X^n := (X_1, \ldots, X_n)$, the
376: joint distribution of the sample $D_n$ conditional on the design
377: $X^n$. The expectation w.r.t. $P_n$ is denoted by
378: $E_n$. The noise $\varepsilon$ is symmetrical and subgaussian
379: conditionally on $X$. Namely, we assume that there is $b_\varepsilon >
380: 0$ such that
381: \begin{equation}
382:   \label{eq:subgaussian}
383:   (G1)(b_\varepsilon): \quad E[\exp(t\varepsilon) | X] \leq
384:   \exp(b_\varepsilon^2t^2/2) \quad \forall t > 0
385: \end{equation}
386: which is equivalent (up to an appropriate choice for the constant
387: $b_\varepsilon$) to
388: \begin{equation*}
389:   \nonumber(G2)(b_\varepsilon) : P[\varepsilon > t | X] \leq
390:   \exp(-t^2/(2b_\varepsilon^2)) \quad \forall t > 0.
391: \end{equation*}
392: Assumption~\eqref{eq:subgaussian} is standard in nonparametric
393: regression, it includes the models of bounded and Gaussian
394: regression. An important fact, that will be used in the proofs, is
395: that for $\varepsilon_1,\ldots,\varepsilon_n$ independent and such
396: that $\varepsilon_i$ satisfies $(G1)(b_i)$ for any $i=1,\ldots,n$, the
397: random variable $\sum_{i=1}^n a_i \varepsilon_i$ satisfies $(G1)\big((\sum
398: a_i^2b_i^2)^{1/2}\big)$ for any $a_1,\ldots,a_n \in \R$ and thus the
399: concentration property $(G2)\big(\sqrt{2}(\sum a_i^2b_i^2)^{1/2}\big)$. Other
400: equivalent definitions of subgaussianity are, when $\varepsilon$ is
401: symmetrical, to assume that $E[ \exp(\varepsilon^2/b_\varepsilon^2) |
402: X ] \leq 2$ for some $b_\varepsilon > 0$, or $(E[ |\varepsilon|^p |
403: X])^{1/p} \leq b_\varepsilon \sqrt{p}$ for any $p \geq 1$.
404: 
405: Concerning the design, we only assume that $X$ has a compact support,
406: and without loss of generality we can take its support equal to $[0,
407: 1]^d$. In particular, we do not need $P_X$ to be absolutely continuous with
408: respect to the Lebesgue measure. Note that the problem of adaptive
409: estimation with such a general multivariate design is not common in
410: the literature. In the so-called ``distribution free nonparametric
411: estimation'' framework, when we want to obtain convergence rates and
412: not only the consistency of the estimators, it is, as far as we know,
413: always assumed that $|Y| \leq L$ a.s. for some constant $L > 0$, see
414: for instance~\cite{kohler02}, \cite{kohler_krzyzak01a},
415: \cite{kohler_krzyzak01b}, \cite{kohler00} and~\cite{kerk_picard07},
416: which is a setting less general than the one considered here.
417: 
418: \begin{remark}
419:   The results presented here can be extended to subexponential noise,
420:   that is when $E[ \exp(|\varepsilon| / b_\varepsilon) | X] \leq 2$
421:   for some $b_\varepsilon > 0$, but it involves complications
422:   (chaining with an adaptive truncation argument in the proof of
423:   Theorem~\ref{thm:devia1} below, see for instance~\cite{BLM99}
424:   or~\cite{van_de_geer00}, among others) that we prefer to skip
425:   here. % It can also be seen that extra smoothness in the noise, that
426:   % is $E_n[ \exp(b |\varepsilon|^p) ] \leq 1$ with $p \geq 2$ does
427:   % not actually improve the results presented here (the rates of
428:   % convergence remains the same), but this problem is beyond the
429:   % scope of this paper.
430: \end{remark}
431: 
432: % \begin{remark}
433: %   To avoid complications, we assume that the noise level
434: %   $\sigma(\cdot)$ is known, and such that $\sigma_0 < \sigma(X) \leq
435: %   \sigma_1$ a.s. for some $0 < \sigma_0 < \sigma_1$. If not, one can
436: %   replace penalized least squares by weighted penalized least squares
437: %   to handle heteroscedastic noise and one can do a slight modification
438: %   in the weights in the aggregation algorithm, see ??????
439: % \end{remark}
440: 
441: %% \begin{definition}
442: %%   \label{def:orlicz}
443: %%   A \emph{Young} function is a convex, increasing function $\psi$ on
444: %%   $\mathbb R^+ \rightarrow \mathbb R^+$ such that $\psi(0) = 0$ and
445: %%   $\lim_{x \rightarrow +\infty}\psi(x) = +\infty$. We define the
446: %%   \emph{Orlicz seminorm} $\norm{\varepsilon}_\psi$ of a random variable
447: %%   $\varepsilon$ by
448: %%   \begin{equation*}
449: %%     \norm{\varepsilon}_\psi := \inf \{ c > 0 : E[ \psi(|\varepsilon| / c) ]
450: %%     \leq 1 \},
451: %%   \end{equation*}
452: %%   with usual convention $\norm{\varepsilon}_\psi = +\infty$ when the
453: %%   infimum is taken over an empty set. We define also
454: %%   \begin{equation*}
455: %%     \norm{\varepsilon}_{n, \psi} := \inf \{ c > 0 : E_n[ \psi(|\varepsilon| / c) ]
456: %%     \leq 1 \text{ a.s. }\}.
457: %%   \end{equation*}
458: %% \end{definition}                %
459: 
460: % \begin{assumption}[Model assumption]
461: %   Throughout the paper, we assume that $E_n[\varepsilon] = 0$, and that
462: %   for some $p, B > 0$
463: %   \begin{equation*}
464: %     \norm{\varepsilon}_{n, \psi_p} \leq B
465: %   \end{equation*}
466: %  almost surely, where $\psi_p(x) := \exp(|x|^p) - 1$.
467: % \end{assumption}
468: 
469: % This assumption on the model is very general. First, it includes
470: % most of the standard assumptions on the noise that are considered in
471: % nonparametric regression literature. For instance, when $p=2$, this
472: % noise assumption means that the noise is subgaussian conditionally
473: % on the design. It includes also noises which are, conditionally on
474: % the design, gaussian ($p=2$), double exponential ($p=1$) or bounded
475: % almost surely ($p=\infty$, bounded regression). Note that the
476: % statisticien does not need to know the parameter $p$.
477: 
478: % If $\psi(x) = \exp(x^2) - 1$, then $\norm{\varepsilon}_\psi < +\infty$
479: % if and only if $\varepsilon$ is subgaussian, namely such that $E[ \exp(
480: % b \varepsilon^2) ] \leq B$ for some $b, B > 0$. In what follows, we
481: % assume that the noise $\varepsilon$ satisfies $\norm{\varepsilon}_{\psi_p}
482: % < +\infty$ for some $p > 0$, where $\psi(x) = |x|^p$. This
483: % assumption includes many standard noises, such as gaussian,
484: % subgaussian, or double exponential noise, among many others.
485: 
486: % Moreover, we assume that the $\varepsilon_i$ are independent of $X^n :=
487: % (X_1, \ldots, X_n)$ for $1 \leq i \leq n$.
488: 
489: % \section{Construction of the procedure}
490: % \label{sec:construction}
491: 
492: % \begin{figure}[htbp]
493: %   \centering
494: %   \label{fig:split}
495: %   \begin{tikzpicture}
496: %     \begin{scope}[shape=rectangle,rounded corners,%
497: %       minimum size=0.8cm,fill=white]%
498: %       \tikzstyle{every node}=[draw,fill]%
499: %       \node (D_n) at (0,0) {whole sample $D_n$};%
500: %       \node (D_m) at (1.5, 1.5) {training sample $D_m$};%
501: %       \node (D_l) at (1.5, -1.5) {learning sample $D_{(m)}$};%
502: %       \node (weak) at (6, 1.5) {weak estimators $\{ \bar f_\lambda ;
503: %         \lambda \in \Lambda \}$};%
504: %       \node (weights) at (6, -1.5) {weights $\{ \hat \theta_\lambda
505: %         ; \lambda \in \Lambda \}$};%
506: %       \node (aggregate) at (9, 0) {aggregated estimator $\hat
507: %         {\mathsf f}$};%
508: %     \end{scope}
509: %     \draw[] (D_n) -- (D_m);%
510: %     \draw[->,very thick] (D_m) -- (weak);%
511: %     % -- (weak) -- (aggregate);%
512: %     \draw[] (D_n) -- (D_l);%
513: %     \draw[->,very thick] (D_l) -- (weights);%
514: %     \draw[->,very thick] (weak) -- (aggregate);%
515: %     \draw[->,very thick] (weights) -- (aggregate);%
516: %     % \draw[] (D_n) -- (q_1) -- (q_2) -| (q_E);%
517: % %     \draw[->,shorten >=2pt] (D_n) .. controls +(75:1.4cm) and
518: % %     +(105:1.4cm) .. node[above] {$x$} (D_n);
519: %   \end{tikzpicture}
520: %   \caption{Splitting the sample}
521: % \end{figure}
522: 
523: \section{PERM over a large function set}
524: \label{sec:pena_least_squares}
525: 
526: We consider the following problem of estimation: we fix a function
527: space $\mathcal F$ and we want to recover $f_0$ based on the sample
528: $D_n$ using the knowledge that $f_0 \in \mathcal F$. The set $\mathcal
529: F$ is endowed with a seminorm $|\cdot|_{\mathcal F}$. To fix the
530: ideas, when $d=1$, one can think for instance of the Sobolev space
531: $\mathcal F = W_2^s$ of functions such that $|f|_{\mathcal F}^2 = \int
532: f^{(s)}(t)^2 dt < +\infty$, where $s$ is a natural integer and
533: $f^{(s)}$ is the $s$-th derivative of $f$. In this case, the estimator
534: described below is the so-called \emph{smoothing spline estimator},
535: see for instance \cite{wahba90}. Several other examples are given in
536: Section~\ref{sec:examples} below.
537: 
538: \subsection{Definition of the PERM}
539: 
540: The idea of penalized empirical risk minimization is to strike a
541: balance between the goodness-of-fit of the estimator to the data and
542: its smoothness. The quantity $|f|_{\mathcal F}$ measures the
543: smoothness (or ``roughness'') of $f \in \mathcal F$ and the balance is
544: quantified by a parameter $h > 0$.
545: \begin{definition}[PERM]
546:   \label{def:perm}
547:   Let $\lambda = (h, \mathcal F)$ be fixed. We say that $\bar
548:   f_\lambda$ is a penalized empirical risk minimizer if it minimizes
549:   \begin{equation}
550:     \label{eq:pena_least_sq}
551:     R_n(f)  + \pen_\lambda(f)
552:   \end{equation}
553:   over $\mathcal F$, where $\pen_\lambda(f) := h^2 |f|_{\mathcal
554:     F}^\alpha$ for some $\alpha > 0$ and where
555:   \begin{equation*}
556:     R_n(f) := \norm{Y - f}_n^2 = \frac{1}{n} \sum_{i=1}^n (Y_i -
557:     f(X_i))^2
558:   \end{equation*}
559:   is the empirical risk of $f$ over the sample $D_n$.
560: \end{definition}
561: 
562: The parameter $\alpha$ is a tuning parameter, which can be chosen
563: depending on the seminorm $|\cdot|_{\mathcal F}$, see the examples in
564: Section~\ref{sec:examples}. For simplicity, we shall always assume
565: that a PERM $\bar f_\lambda$ exists, since we can always find $\tilde
566: f_\lambda$ such that $R_n(\tilde f_\lambda) + \pen_{\lambda}(\tilde
567: f_\lambda) \leq \inf_{f \in \mathcal F} \{ R_n(f) + \pen_{\lambda}(f)
568: \} + 1 / n$ which satisfies the same upper bound from
569: Theorem~\ref{thm:least_sq} (see below) as a hypothetical $\bar
570: f_\lambda$. Moreover, a minimizer is not necessarily unique, but
571: this is not a problem for the theoretical results proposed below. PERM
572: has been studied in a tremendous number of papers; we only refer to
573: \cite{van_de_geer00, vdg07}, \cite{massart03} and \cite{kohler02},
574: which are the closest to the material proposed in this Section.
575: 
576: In Theorem~\ref{thm:least_sq} below we propose a general upper bound
577: for PERM over a space $\mathcal F$ that satisfies the complexity
578: Assumption $(C_\beta)$ below. The proof of this upper bound involves a
579: result concerning the supremum of the empirical process $Z(f) :=
580: \sigma n^{-1/2} \sum_{i=1}^n f(X_i) \varepsilon_i$ over $f \in
581: \mathcal F$ which is given in Theorem~\ref{thm:devia1} below.
582: 
583: % \subsection{Main definitions}
584: 
585: \subsection{Some definitions and useful tools}
586: 
587: Let $(E, \norm{\cdot})$ be a normed space. For $z \in E$, we denote by
588: $B(z, \delta)$ the ball centered at $z$ with radius $\delta$. We say
589: that $\{ z_1, \ldots, z_p \}$ is a $\delta$-cover of some set $A
590: \subset E$ if
591: \begin{equation*}
592:   A \subset \bigcup_{1 \leq i \leq p} B(z_i, \delta).
593: \end{equation*}
594: The \emph{$\delta$-covering number} $N(\delta, A, \norm{\cdot})$ is
595: the minimal size of a $\delta$-cover of~$A$ and
596: \begin{equation*}
597:   H(\delta, A, \norm{\cdot}) := \log N(\delta, A, \norm{\cdot})
598: \end{equation*}
599: is the \emph{$\delta$-entropy} of $A$. The main assumption in this
600: section concerns the complexity of the space $\mathcal F$, which is
601: quantified by a bound on the entropy of its unit ball $B_{\mathcal F}
602: := \{ f \in \mathcal F : |f|_{\mathcal F} \leq 1 \}$. We denote for
603: short $H_\infty(\delta, A) = H(\delta, A, \norm{\cdot}_\infty)$ where
604: $\norm{f}_\infty := \sup_{x \in [0,1]^d} |f(x)|$. We denote by $C([0,
605: 1]^d)$ the set of continuous functions on $[0, 1]^d$.
606: \begin{assumption}[$C_\beta$]
607:   We assume that $\mathcal F \subset C([0, 1]^d)$ and that there is a
608:   number $\beta \in (0, 2)$ such that for any $\delta > 0$, we have
609:   \begin{equation}
610:     % \label{eq:covering_assumption}
611:     H_\infty\big( \delta, B_{\mathcal F} \big)
612:     \leq D \delta^{-\beta}
613:     % \exp\Big( D \Big( \frac{R}{\delta} \Big)^{d/s} \Big),
614:   \end{equation}
615:   where $D > 0$ is independent of $\delta$.
616: \end{assumption}
617: This assumption entails that, for any radius $R > 0$, we have
618: \begin{equation*}
619:   H_\infty\big( \delta, B_{\mathcal F}(R) \big) \leq D
620:   \Big(\frac{R}{\delta}\Big)^{\beta}
621: \end{equation*}
622: where $B_{\mathcal F}(R) := \{ f \in \mathcal F : |f|_{\mathcal F}
623: \leq R \}$.
624: % that thatsince this assumption entails that for any ball Define the
625: % ball $\mathcal F(R) := \{ f \in \mathcal F : |f|_{\mathcal F} \leq R
626: % \}$.  The main assumption in this section concerns the complexity of
627: % the space $\mathcal F$, which is quantified by a bound on the
628: % entropy of its balls $\mathcal F(R)$. We denote for short
629: % $H_\infty(\delta, A) = H(\delta, A, \norm{\cdot}_\infty)$ where
630: % $\norm{f}_\infty := \sup_{x \in [0,1]^d} |f(x)|$. We denote by
631: % $C([0, 1]^d)$ the set of continuous functions on $[0, 1]^d$.
632: % \begin{assumption}[$C_\beta$]
633: %   We assume that $\mathcal F \subset C([0, 1]^d)$ and that there is a
634: %   number $\beta \in (0, 2)$ such that for any positive $\delta$ and
635: %   $R$, we have
636: %   \begin{equation}
637: %     % \label{eq:covering_assumption}
638: %     H_\infty\big( \delta, \mathcal F(R) \big)
639: %     \leq D \Big(\frac{R}{\delta}\Big)^{\beta}
640: %     % \exp\Big( D \Big( \frac{R}{\delta} \Big)^{d/s} \Big),
641: %   \end{equation}
642: %   where $D > 0$ is independent of $\delta$ and $R$.
643: % \end{assumption}
644: % \begin{remark}
645: Assumption~$(C_\beta)$ is satisfied by nearly all smoothness spaces
646: considered in the nonparametric literature (at least when the smoothness
647: of the space is large enough compared to the dimension, see
648: below). The most general space that we consider in this paper and
649: which satisfies~$(C_\beta)$ is the anisotropic Besov space $B_{p,
650:   q}^{\bs s}$, where $\bs s = (s_1, \ldots, s_d)$ is a vector of
651: positive numbers. This space is precisely defined in
652: Appendix~\ref{sec:appendix_approximation}. Each $s_i$ corresponds to
653: the smoothness in the direction $e_i$, where $\{ e_1, \ldots, e_d \}$
654: is the canonical basis of $\mathbb R^d$. The computation of the
655: entropy of $B_{p, q}^{\bs s}$ can be found in~\cite{triebel06}, we
656: give more details in Appendix~\ref{sec:appendix_approximation}. If
657: $\bs {\bar s}$ is the harmonic mean of $\bs s$, namely
658:   \begin{equation}
659:     \label{eq:harmonic_mean}
660:     \frac{1}{\bs {\bar s}} := \frac{1}{d} \sum_{i=1}^d
661:     \frac{1}{s_i},
662:   \end{equation}
663:   then $B_{p, q}^{\bs s}$ satisfies~$(C_\beta)$ with $\beta = d / \bs
664:   {\bar s}$, given that $\bs {\bar s} > d / p$, which is the usual
665:   condition to have the embedding $B_{p, q}^{\bs s} \subset C([0,
666:   1]^d)$.
667: %\end{remark}
668: 
669: \begin{remark}
670:   Under the restriction $\beta \in (0, 2)$, Dudley's entropy
671:   integral satisfies
672:   \begin{equation*}
673:     \int_0^{ {\rm diam}( B_{\mathcal F}, \|\cdot\|_\infty)}
674:     \sqrt{H_\infty(\delta, B_{\mathcal F})} d\delta<\infty,
675:   \end{equation*}
676:   where $\text{diam}(B_{\cF},\|\cdot\|_\infty)$ is the
677:   $L_\infty$-diameter of $B_{\mathcal F}$.  This is a standard
678:   assumption coming from empirical process theory. It is related to
679:   the so-called chaining argument, that we use in the proof of
680:   Theorem~\ref{thm:devia1}. However, in order to consider a larger
681:   space of functions $\mathcal F$, we could think of function spaces
682:   with a complexity $\beta \geq 2$. In this case, using a slightly
683:   different chaining argument (cf. \cite{vdVW:96}), the quantity
684:   appearing in the upper bound of some subgaussian process is of the
685:   type $\int_{c/\sqrt{n}}^{\text{diam}(B_{ \cF},\|\cdot\|_\infty)}
686:   \sqrt{H_\infty(\delta, B_{\mathcal F})} d\delta$ which converges
687:   whatever $\beta$ is. However, such considerations are beyond the
688:   scope of the paper and are to be considered in a future work.
689: \end{remark}
690: 
691: 
692: % if $\bs s / s \in \mu \mathbb \log Z_+^d$ for some $\mu > 0$
693: 
694: %  $d / s $we denote by $s$ We give a precise overview of such results
695: % in Appendix~\ref{sec:appendix}.
696: 
697: %  this condition is
698: 
699: % If
700: % $\mathcal F = B_{p,\infty}^s([0,1]^d)$, where
701: % $B_{p,\infty}^s([0,1]^d)$ is the Besov space with smoothness $s$
702: % (see~\cite{devore_lorentz93} for precise definitions and properties of
703: % Besov spaces), then condition~\eqref{eq:covering_assumption} holds,
704: % see~\cite{birge_massart00}. This result is precisely recalled in
705: % Theorem~\ref{thm:birge_massart}, see in Appendix.
706: 
707: % if $|\mathcal|$In certain cases, an appropriate choice of $\alpha$
708: % allows to simplify minimization of \eqref{eq:pena_least_sq}, see the
709: % examples given below. This definition includes several standard
710: % estimators: smoothing splines (take $\mathcal F$ as a Sobolev space)
711: % and when $\mathcal F$ is a Besov space, $\bar f_\lambda$ is related
712: % to other popular denoising techniques. This is explained in details
713: % later in the Section.
714: 
715: \subsection{About the supremum of the process $Z(\cdot)$}
716: \label{sec:process_Z0}
717: 
718: The beginning of the proof of Theorem~\ref{thm:least_sq} is, as usual
719: with the proof of upper bounds for $M$-estimators, based on an
720: inequality that links the empirical norm of estimation and the
721: empirical process of the model. This idea goes back to key
722: papers~\cite{vandegeer90} and \cite{birge_massart93}, see
723: also~\cite{van_de_geer00, vdg07} and \cite{massart03} for a detailed
724: presentation. In regression, it reads, if $\bar f$ is a PERM and if
725: $f_0 \in \mathcal F$:
726: \begin{align*}
727:   \norm{\bar f - f_0}_n^2 + \pen(\bar f) &\leq \frac{2}{\sqrt{n}}
728:   Z_n(\bar f - f_0) + \pen(f_0),
729: %   &\leq \sup_{f \in \mathcal F} \frac{2}{\sqrt{n}} Z_n(f - f_0) +
730: %   \pen(f_0),
731: \end{align*}
732: where
733: \begin{equation}
734:   \label{eq:Z_n_def}
735:   Z_n(f) := \frac{\sigma}{\sqrt{n}} \sum_{i=1}^n f(X_i) \varepsilon_i.
736: \end{equation}
737: This inequality explains why the next Theorem~\ref{thm:devia1} is the
738: main ingredient of the proof of Theorem~\ref{thm:least_sq}
739: below. Then, an important remark is that~\eqref{eq:subgaussian}
740: entails
741: \begin{equation}
742:   \label{eq:deviaZnf}
743:   P_n[Z_n(f) > z] \leq \exp\Big( \frac{-z^2}{2 b^2 \norm{f}_n^2}
744:   \Big)
745: \end{equation}
746: for any fixed $f$, $z > 0$ and $n \geq 1$, where $\norm{f}_n^2 :=
747: n^{-1} \sum_{i=1}^n f(X_i)^2$ and where we take for short $b := \sigma
748: b_\varepsilon$. This deviation inequality is at the core of the proof
749: of Theorem~\ref{thm:devia1} below. Let us introduce the
750: \emph{empirical ball} $B_n(f_0, \delta) := \{ f : \norm{f - f_0}_n
751: \leq \delta \}$ and let us recall that $P_n := P^n[\cdot | X^n]$ is
752: the joint law of the sample $D_n$ conditionally on the design $X^n =
753: (X_1, \ldots, X_n)$.
754: 
755: \begin{theorem}
756:   \label{thm:devia1}
757:   Let $Z_n(\cdot)$ be the empirical process~\eqref{eq:Z_n_def} and
758:   assume that $(\mathcal F, |\cdot|_{\mathcal F})$ satisfies
759:   $(C_\beta)$. Then\textup, if $f_0 \in \mathcal F$\textup, we can
760:   find constants $z_1 > 0$ and $D_1 > 0$ such that\textup:
761:   \begin{align}
762:     \label{eq:deviaZ_n}
763:     P_n \Big[ \sup_{f \in \mathcal F \cap B_n(f_0, \delta)} \frac{
764:       Z_n(f - f_0) }{\norm{f - f_0}_n^{1 - \beta / 2} (1 +
765:       |f|_{\mathcal F})^{\beta / 2} } > z \Big] \leq \exp( - D_1 z^2
766:     \delta^{-\beta} )
767:   \end{align}
768:   for any $\delta > 0$ and $z \geq z_1$ \textup(we recall that $\beta
769:   \in (0, 2)$\textup).
770: \end{theorem}
771: 
772: The proof of this theorem is given in
773: Section~\ref{sec:proof_main_results}; it uses techniques from
774: empirical process theory such as peeling and chaining. It is a uniform
775: version of~\eqref{eq:deviaZnf}, localized around $f_0$ (for the
776: empirical norm). In this theorem, we use the ``weighting trick'' that
777: was introduced in~\cite{vandegeer90, van_de_geer00}: we divide
778: $Z_n(\cdot)$ by $\norm{f - f_0}_n$ and $|f|_{\mathcal F}$ in order to
779: counterbalance, respectively, the variance of $Z_n(\cdot)$ and the
780: massiveness of the class $\mathcal F$. This renormalization of the
781: empirical process is also at the core of the proof of
782: Theorem~\ref{thm:least_sq}.
783: 
784: % \begin{remark}
785: %   There is no measurability problem in the inequality stated in
786: %   Theorem~\ref{thm:devia1} since the supremum holds over $\mathcal F$,
787: %   which is assumed to be included in the separable space $C([0,
788: %   1]^d)$.
789: % \end{remark}
790: 
791: % is close to results given in~\cite{van_de_geer00}, where a general
792: % presentation of the use of empirical process techniques for
793: % nonparametric estimation is proposed.  See also~\cite{kohler02} for
794: % the situation where $|Y| \leq L$ almost surely for some constant $L
795: % > 0$ and~\cite{massart03} for a detailed presentation of the use of
796: % concentration inequalities in nonparametric statistics.
797: 
798: % Thus, the proof relies on the study of the process $Z(\cdot)$. In
799: % Theorem~\ref{thm:devia1} below (see Section~\ref{sec:process_Z0}) we
800: % give a deviation inequality for the supremum of this process over a
801: % general space satisfying the complexity
802: % bound~\eqref{eq:covering_assumption}. This kind of result was
803: % previously used by~\cite{vandegeer90}, among many others, in order
804: % to derive upper bounds for least squares and penalized least squares
805: % estimators. See also~\cite{van_de_geer00}
806: 
807: 
808: 
809: \subsection{Upper bound for the PERM}
810: 
811: Theorem~\ref{thm:least_sq} below provides an upper bound for the mean
812: integrated squared error (MISE) of the PERM, both for integration
813: w.r.t. the empirical norm $\norm{f}_n^2 = n^{-1} \sum_{i=1}^n
814: f(X_i)^2$ and the norm $\norm{f}^2 := \int f(x)^2 P_X(dx)$.
815: 
816: \begin{theorem}
817:   \label{thm:least_sq}
818:   Let $\mathcal F$ be a space of functions satisfying $(C_\beta)$.
819:   % endowed with a seminorm $|\cdot|_{\mathcal F}$ which satisfies the
820:   % covering preperty~\eqref{eq:covering_assumption} for some $s > d /
821:   % 2$.
822:   Let $\lambda = (h, \mathcal F)$ and $\bar f_{\lambda}$ be a PERM
823:   given by~\eqref{eq:pena_least_sq}, where $h$ satisfies
824:   \begin{equation}
825:     \label{eq:bandwidth}
826:     h = a n^{-1 / (2 + \beta)}
827:   \end{equation}
828:   for some constant $a > 0$ and where $\alpha > 2\beta / (\beta +
829:   2)$. If $f_0 \in \mathcal F$, we have\textup:
830:   \begin{equation*}
831:     E_n \norm{\bar f_{\lambda} - f_0}_n^2 \leq C_1(1 + |f_0|_{\mathcal
832:       F}^\alpha) n^{-2 / (2 + \beta)}
833:   \end{equation*}
834:   for $n$ large enough, where $C_1$ is a fixed constant depending on
835:   $a$, $\beta$, $\alpha$ and $b$. If we assume further that
836:   $\norm{\bar f_\lambda - f_0}_\infty \leq Q$ a.s. for some constant
837:   $Q > 0$, we have
838:   \begin{equation*}
839:     E^n \norm{\bar f_\lambda - f_0}^2 \leq C_2 (1 + |f_0|_{\mathcal
840:       F}^\alpha ) n^{-2 / (2 + \beta)}
841:   \end{equation*}
842:   for $n$ large enough, where $C_2$ is a fixed constant depending on
843:   $C_1$ and $Q$.
844: \end{theorem}
845: 
846: % \begin{remark}
847: %   Theorem~\ref{thm:least_sq} improves previous results
848: %   by~\cite{kohler02}, see in particular Chapter~21, in several
849: %   ways. The class $\mathcal F$ here is very general, while it is a
850: %   Sobolev class in~\cite{kohler02}. We do not need to assume that $|Y|
851: %   \leq L$, and the rate in Theorem~\ref{thm:least_sq} corresponds to
852: %   the minimax optimal rate (for a Sobolev class for instance), since
853: %   there is not extra $\log n$ terms.
854: % \end{remark}
855: 
856: 
857: \begin{remark}
858:   Theorem~\ref{thm:least_sq} holds if we truncate $\bar f_\lambda$ by
859:   some constant $Q$ such that $\norm{f_0}_\infty \leq Q$. Such a
860:   truncation cannot be avoided in such a general regression
861:   setting. Indeed, the PERM is, without truncation, in general not
862:   consistent; see the example from Problem~20.4, p.~430
863:   in~\cite{kohler02}.
864: \end{remark}
865: 
866: \begin{remark}
867:   Theorem~\ref{thm:least_sq} holds for any design law $P_X$, even for
868:   the degenerate case where $P_X = \delta_x$ for some fixed point $x
869:   \in [0,1]^d$, where $\delta$ is the Dirac probability measure. Of
870:   course, in this case, the rate $n^{-2 / (2 + \beta)}$ becomes
871:   suboptimal, since the estimation problem with such a $P_X$ is no
872:   longer ``truly nonparametric''. Indeed, for a discrete $P_X$ with
873:   finite support, it is proved in~\cite{hamers_kohler04} that the
874:   optimal rate is the parametric rate $1/n$ using a local averaging
875:   estimator.
876: \end{remark}
877: 
878: % Several consequences of Theorem~\ref{thm:least_sq} are given in
879: % Section~\ref{sec:examples}, such as the convergence rates of the
880: % PERM in the anisotropic Besov space $B_{p, q}^{\bs s}$, the
881: % convergence rates for PERM in reproductive kernel Hilbert spaces,
882: % and several smoothing spline type estimators, such as the so-called
883: % thin plate spline, or an estimator that we call anisotropic spline
884: % smoother, which was, as far as we know, not previously considered in
885: % literature.
886: 
887: \subsection{About the smoothing parameter $h$}
888: \label{sec:about_h}
889: 
890: It is well-known that in practice, the choice of the parameter $h$ is
891: of first importance. From the theoretical point of view, in order to
892: make $\bar f_\lambda$ rate-optimal, $h$ must be of the same order as a
893: quantity involving the complexity of $\mathcal F$: see
894: condition~\eqref{eq:bandwidth} on the bandwidth and the
895: Assumption~$(C_\beta)$. This problem is commonplace in nonparametric
896: statistics. Indeed, the role of the penalty
897: in~\eqref{eq:pena_least_sq} is to make the balance with the
898: massiveness of the space $\mathcal F$. Without this penalty, or if $h$
899: is too small, $\bar f_{\lambda}$ roughly interpolates the data, which
900: is not suitable when the aim is denoising (this phenomenon is called
901: \emph{overfitting}).
902: 
903: Of course, the complexity parameter $\beta$ is unknown to the
904: statistician, and even worse, it does not necessarily make sense in
905: practice. So, several procedures are proposed to select $h$ based on
906: the data. The most popular are the leave-one-out cross validation (CV)
907: and the simpler generalized cross validation (GCV), which is often
908: used with smoothing spline estimators because of its computational
909: simplicity, see~\cite{wahba90} among others. Such methods are known to
910: provide good results in most cases. However, there are, as far as we
911: know, no convergence rate results for estimators based on CV or GCV
912: selection of smoothing parameters. In Section~\ref{sec:examples}
913: below, we propose an alternative approach. Indeed, instead of
914: selecting one particular $h$, we mix several estimators computed for
915: different $h$ in some grid using an aggregation algorithm. This
916: aggregation algorithm is described in Section~\ref{sec:ERM_finite}. We
917: show that this approach allows one to construct adaptive estimators with
918: optimal rates of convergence in several particular cases, see
919: Section~\ref{sec:examples}. Moreover, we show empirically in
920: Section~\ref{sec:simulations} that the aggregation approach is more
921: stable than CV or GCV when the number of observations is small.
922: 
923: 
924: 
925:   % \begin{remark}
926:   %   An inspection of the proof of Theorem~\ref{thm:least_sq} shows
927:   %   that the term $o(h^2)$ is going to zero as $h$ goes to $0$
928:   %   faster than any power function of $m$.  When $h$ is of order
929:   %   $m^{-s/(2s + 1)}$, which is the best choice theoretically, we
930:   %   have
931: %   \begin{equation*}
932: %     \sup_{f \in \mathcal F(R)} E \norm{\bar f - f}_{L^2(P_X^m)}^2 \leq
933: %     (C_1 + 2 R^2)  m^{-2s / (2s + d)}
934: %   \end{equation*}
935: %   which is the standard minimax convergence rate over classes with
936: %   smoothness $s$, at least when $P_X$ has a density with respect to
937: %   the Lebesgue measure which is continuous and bounded away from
938: %   $0$.  Such smoothness classes include Sobolev balls (for $s >
939: %   d/2$) and Besov balls ...
940: % \end{remark}
941: 
942: % \begin{remark}
943: %   In the proof of Theorem~\ref{thm:least_sq}, we do not use the
944: %   explicit form of the estimator $\bar f_{\mathcal F}$: we only need
945: %   the minimization property~\eqref{eq:pena_least_sq}. This entails
946: %   that the scheme of proof is quite generic, and could be used for
947: %   other estimators as well (namely, $M$-estimators.) This scheme of
948: %   proof was previously used in the key paper~\cite{vandegeer90}, see
949: %   also~\cite{van_de_geer00}. It relies on a deviation inequality for
950: %   the supremum of a particular empirical process over a smoothness
951: %   class $\mathcal F$, which is stated in Section~\ref{sec:process_Z0}
952: %   below.
953: % \end{remark}
954: 
955: % We first prove that the ``natural'' aggregation procedure, namely
956: % empirical risk minimization (or its penalized version), fails to
957: % achieve the optimal rate of aggregation in this setup. This
958: % motivates the choice
959: 
960: % In this section, we explore some statistical properties of penalized
961: % empirical risk minimization over a finite set of functions.
962: 
963: % In general, given is a data set $D_n$, we can consider two
964: % problems. The first one is the problem of estimation treated in the
965: % previous sections.  Namely, we aim at constructing some procedure
966: % $\bar{f}$ satysfying
967: % \begin{equation}
968: %   \label{eq:RateOfConvergence}
969: %   E \|\bar{f}-f_0 \|^2 \leq \psi(n)
970: % \end{equation}
971: % where $\psi(n)$, called the {\it rate of convergence}, is a quantity
972: % we wish very small as $n$ increases. To get this kind of inequality,
973: % we have to assume $f_0$ to belong to a set with a small complexity (at
974: % least compact). That is the reason why we introduced Assumption
975: % ($C_\beta$) in Section~\ref{sec:pena_least_squares}. Actually, this
976: % kind of ``a priori'' may not be fulfilled. That is why the second
977: % problem, called {\it agnostic learning} has been introduced. For this
978: % problem, one is given a set $F$ of functions. Without any assumption
979: % on $f_0$, we want to construct (from the data) a procedure $\tilde{f}$
980: % which has a risk as close as possible to the smallest risk over
981: % $F$. Namely, we want to obtain {\it oracle inequalities}, that is
982: % inequalities of the form
983: % \begin{equation*}
984: %   E \| \tilde{f} - f_0 \|^2 \leq C \min_{f\in F} \|f - f_0 \|^2 +
985: %   \phi(n,F),
986: % \end{equation*}
987: % where $C \geq 1$ and $\phi(n,F)$ is called the {\it residue}, which
988: % is the quantity that we want to be small as $n$ increases.  When $F$
989: % is of finite cardinality $M$, the agnostic problem is called {\it
990: %   aggregation problem} and the residue $\phi(n,F) = \phi(n,M)$ is
991: % called {\it rate of aggregation}. The main difference between the
992: % problems of estimation and aggregation is that we don't need any
993: % assumption on $f_0$ for the second problem. Nevertheless,
994: % aggregation method have been widely used to construct adaptive
995: % procedures for the estimation problem.  That is the reason why we
996: % study aggregation procedures in this section. We will use these
997: % procedures to construct estimation procedures which will be adaptive
998: % to the complexity parameter $\beta$ introduced in Assumption
999: % ($C_\beta$).
1000: 
1001: 
1002: \section{PERM and aggregation over a finite set of functions}
1003: \label{sec:ERM_finite}
1004: 
1005: Let us fix a set $F(\Lambda) := \{ f_\lambda : \lambda \in \Lambda \}$
1006: of arbitrary functions, and denote by $M = |\Lambda|$ its
1007: cardinality. % We will choose specific sets $F(\Lambda)$ in
1008: % Section~\ref{sec:examples}, but in this section it remains generic.
1009: 
1010: \subsection{Suboptimality of PERM over a finite set}
1011: 
1012: In this section, we prove that minimizing the empirical risk
1013: $R_n(\cdot)$ (or a penalized version) on $F(\Lambda)$ is a suboptimal
1014: aggregation procedure in the sense of~\cite{tsy:03}. According to
1015: \cite{tsy:03}, the optimal rate of aggregation in the Gaussian
1016: regression model is $(\log M) /n$. This means that it is the minimum
1017: price one has to pay in order to mimic the best function among a class
1018: of $M$ functions with $n$ observations. This rate is achieved by the
1019: aggregate with cumulative exponential weights, see~\cite{catbook:01}
1020: and~\cite{jrt:06}.
1021: % temperature parameter $T\geq 2 \max_{f\in F(\Lambda)} \| f_0 -
1022: % f\|_\infty^2 + 2\sigma^2$
1023: In Theorem~\ref{TheoWeaknessERMRegression} below, we prove that the
1024: usual PERM procedure cannot achieve this rate and thus, that it is
1025: suboptimal compared to the aggregation methods with exponential
1026: weights. The lower bounds for aggregation methods appearing in the
1027: literature (see~\cite{tsy:03, jrt:06, LecJMLR:06}) are usually based
1028: on minimax theory arguments. The one considered here is based on
1029: geometric considerations, and involves an explicit example that makes
1030: the PERM fail. For that, we consider the Gaussian regression model
1031: with uniform design.
1032: \begin{assumption}[G]
1033:   Assume that $\varepsilon$ is standard Gaussian and that $X$ is
1034:   univariate and uniformly distributed on $[0, 1]$.
1035: \end{assumption}
1036: % where the design is uniformly distributed on $[0,1]$. That is the
1037: % model \eqref{eq:model} where $X$ has a uniform distribution on
1038: % $[0,1]$ (we consider here the case $d=1$) where the noise
1039: % $\varepsilon$ is a standard normal Gaussian variable.
1040: \begin{theorem}
1041:   \label{TheoWeaknessERMRegression}
1042:   Let $M \geq 2$ be an integer and assume that \textup{(G)} holds. % In
1043:   % the gaussian regression model with a design uniformly distributed
1044:   % on $[0,1]$,
1045:   We can find a regression function $f_0$ and a family $F(\Lambda)$ of
1046:   cardinality $M$ such that, if one considers a penalization
1047:   satisfying $|\pen(f)| \leq C \sqrt{(\log M)/n}, \forall f \in
1048:   F(\Lambda)$ with $0\leq C <\sigma (24\sqrt{2}c^*)^{-1}$ \textup($c^*$ is
1049:   an absolute constant from the Sudakov minorization, see
1050:   Theorem~\ref{TheoSudakov} in
1051:   Appendix~\ref{sec:appendix_proba}\textup), the PERM procedure
1052:   defined by
1053:   \begin{equation*}
1054:     \tilde{f}_n \in \argmin_{f \in F(\Lambda)}( R_n(f) + \pen(f))
1055:   \end{equation*}
1056:   satisfies
1057:   \begin{equation*}
1058:     E^n \| \tilde{f}_n - f_0 \|^2 \geq \min_{f \in
1059:       F(\Lambda)} \| f - f_0 \|^2 + C_3 \sqrt{\frac{\log
1060:         M}{n}}
1061:   \end{equation*}
1062:   for any integer $n \geq 1$ and $M\geq M_0(\sigma)$ such that $n^{-1}
1063:   \log[(M-1)(M-2)] \leq 1/4, where $C_3$ is an absolute constant.
1064: \end{theorem}
1065: This result tells us that, in some particular cases, the PERM cannot
1066: mimic the best element in a class of cardinality $M$ faster than
1067: $((\log M)/n)^{1/2}$. This rate is very far from the optimal one
1068: $(\log M)/n$.
1069: 
1070: Let $F(\Lambda)$ be the set that we consider in the proof of
1071: Theorem~\ref{TheoWeaknessERMRegression} (see
1072: Section~\ref{sec:proof_main_results} below), and take $\pen(f) = 0$.
1073: Using Monte-Carlo (we do $5000$ loops), we compute the excess risk $E
1074: \| \tilde{f}_n - f_0 \|^2 - \min_{f \in F(\Lambda)} \| f - f_0 \|^2$
1075: of the ERM. In Figure~\ref{fig:subERM} below, we compare the excess
1076: risk and the bound $((\log M) / n)^{1/2}$ for several values of $M$
1077: and $n$. It turns out that, for this set $F(\Lambda)$, the lower bound
1078: $((\log M) / n)^{1/2}$ is indeed accurate for the excess
1079: risk. Actually, by using the classical symmetrization argument and
1080: Dudley's entropy integral, it is easy to obtain an upper bound for the
1081: excess risk of the ERM of the order of $((\log M) / n)^{1/2}$ for any
1082: class $F(\Lambda)$ of cardinality $M$.
1083: 
1084: \begin{figure}[htbp]
1085:   \centering
1086:   \includegraphics[width=4.3cm]{excess1.pdf}%
1087:   \includegraphics[width=4.3cm]{excess2.pdf}%
1088:   \includegraphics[width=4.3cm]{excess3.pdf}%
1089:   \caption{The excess risk of the ERM compared to $((\log M) /
1090:     n)^{1/2}$ for several values of $M$ and $n$
1091:     \textup($x$-axis\textup)}
1092:   \label{fig:subERM}
1093: \end{figure}
1094: 
1095: \subsection{Aggregation}
1096: \label{sec:aggregation}
1097: 
1098: % Let $F(\Lambda) = \{ f_\lambda : \lambda \in \Lambda \}$ be a finite
1099: % class of functions. In what follows, $ f_\lambda $ will be one of
1100: % the non-adaptive PERM defined in the previous section and
1101: % constructed with only a part of the data wich is assumed to be fixed
1102: % in this section.
1103: For each $ f_\lambda \in F(\Lambda)$, we compute a weight $\theta(
1104: f_\lambda) \in [0,1]$ such that $\sum_{\lambda \in \Lambda} \theta(
1105: f_{\lambda}) = 1$. These weights give a level of significance to each
1106: $ f_\lambda \in F(\Lambda)$.  The aggregated estimator is then the
1107: convex combination
1108: \begin{equation}
1109:   \label{eq:aggregate}
1110:   \hat {\mathsf f} := \sum_{\lambda \in \Lambda} \theta(f_\lambda)
1111:   f_\lambda,
1112: \end{equation}
1113: where the weight of $f \in F(\Lambda)$ is given by
1114: \begin{equation}
1115:   \label{eq:weights}
1116:   \theta(f) := \frac{\exp\big( - n R_{n}(f) / T
1117:     \big)}{\sum_{\lambda \in \Lambda} \exp\big(-n R_{n}(
1118:     f_\lambda)/T \big) },
1119: \end{equation}
1120: where $T > 0$ is the so-called \emph{temperature} parameter and where
1121: $R_n(f)$ is the empirical risk of $f$. This aggregation algorithm
1122: (with ``Gibbs'' or ``exponential'' weights) can also be found for
1123: instance in~\cite{catbook:01, leung_barron06, juditsky_etal05,
1124:   juditsky_nazin05, yang:00, yang04, LecAoS:07}. See
1125: also~\cite{gaiffas_lecue07} for adaptation by aggregation in a
1126: semiparametric model.
1127: 
1128: The next theorem is an oracle inequality for the aggregation
1129: method~\eqref{eq:weights}. It will be useful to derive the adaptive
1130: upper bounds stated in Section~\ref{sec:examples} below.
1131: \begin{theorem}
1132:   \label{thm:oracle}
1133:   % We assume that the noise $\varepsilon$ is symmetric.
1134:   Assume that for any $f \in F(\Lambda)$, we have $\norm{f -
1135:     f_0}_\infty \leq Q$ for some $Q > 0$. For any $a > 0$, the
1136:   aggregation method~\eqref{eq:weights} satisfies
1137:   \begin{equation*}
1138:     E^n \norm{\hat {\mathsf f} - f_0}^2 \leq (1+ a) \min_{f \in
1139:       F(\Lambda)} \norm{f - f_0}^2 + (C + T) \frac{(\log
1140:       n)^{1/2} \log M}{n},
1141:   \end{equation*}
1142:   where $C$ is a constant depending on $a, Q$ and $\sigma$.
1143: \end{theorem}
1144: When $T$ is too large, the weights~\eqref{eq:weights} are close to the
1145: uniform law over the set of weak estimators, and of course, the
1146: resulting aggregate is inaccurate. When $T$ is too small, one weight
1147: is close to $1$, and the others close to $0$: in this situation, the
1148: aggregate does nearly the same job as the ERM procedure. This is not
1149: suitable since Theorem~\ref{TheoWeaknessERMRegression} told us that
1150: ERM is suboptimal. Hence, $T$ realizes a tradeoff between the ERM and the
1151: uniform weights procedure.
1152: % It is a $T$ is somehow a regularization parameter of this tradeoff.
1153: % the estimator obtained by empirical risk minimization (ERM). This
1154: % behavior can be also explained by
1155: % equation~\eqref{eq:oracle_minimization} in the proof of
1156: % Theorem~\ref{thm:oracle}. Indeed, the exponential
1157: % weights~\eqref{eq:weights} A counterpart of the oracle inequality is
1158: % Theorem~\ref{TheoWeaknessERMRegression}, where we show that any
1159: % penalized empirical risk minimization algorithm is suboptimal
1160: % compared to the cumulative version of the aggregation algorithm
1161: % (\ref{eq:aggregate}) . This result tell us that $T$ shall not be too
1162: % large, since when $T$ is large, the aggregation algorithm
1163: % (\ref{eq:aggregate}) is close to the empirical risk minimization,
1164: % which is suboptimal (see Theorem~\ref{TheoWeaknessERMRegression}).
1165: It can be simply chosen by minimization of the empirical risk. We know
1166: empirically that it provides good results, see~\cite{gaiffas_lecue07}.
1167: Namely, we select the temperature
1168: \begin{equation}
1169:   \label{Tslection}
1170:   \hat T := \argmin_{T \in \mathcal T} \sum_{i=1}^n \big( Y_i - \hat
1171:   {\mathsf f}^{(T)} (X_i) \big)^2,
1172: \end{equation}
1173: where $\hat {\mathsf f}^{(T)}$ is the aggregated
1174: estimator~\eqref{eq:aggregate} with temperature $T$ and where
1175: $\mathcal T$ is some set of temperatures. This is what we do in the
1176: empirical study conducted in Section~\ref{sec:simulations}.
1177: 
1178: % The ERM already gives good results, but if $T$ is chosen carefully,
1179: % we expect to obtain an estimator which outperforms the ERM.
1180: 
1181: % This fact is confirmed by the numerical study conducted in
1182: % Section~\ref{sec:numerical}, where the choice of $T$ is done using a
1183: % simple leave-one-out cross-validation algorithm over the whole
1184: % sample for aggregates obtained with several $T$.
1185: 
1186: % We can understand the aggregation algorithm in the following way:
1187: % first, we compute the least squares of each weak estimators. This is
1188: % the most natural way of assessing the level of significance of some
1189: % estimator among the other ones. Then, we put a Gibbs law over the
1190: % set of weak estimators. The mass of each estimator relies on its
1191: % least squares (over the learning sample). Finally, the aggregate is
1192: % simply the mean expected estimator according to this law.  In
1193: % Section~\ref{sec:aggregation}, we propose an oracle inequality for
1194: % the aggregation algorithm (see Theorem~\ref{thm:oracle}), which is
1195: % the key result in the proof of the adaptive upper bound stated in
1196: % Theorem ?. The choice of the temperature parameter $T$ is discussed
1197: % in Section~\ref{sec:aggregation}.
1198: 
1199: 
1200: % Second, any penalized selection algorithm (ie, an algorithm that
1201: % selects a particular $\bar f_\lambda$ among $F(\Lambda)$ via a
1202: % penalized least squares minimization criterion) is suboptimal
1203: % compared to an aggregation procedure.
1204: 
1205: 
1206: % Another strategy for an adaptive choice of the smoothing parameter
1207: % $h$ in penalized least squares is complexity reguralization, which
1208: % was initiated by Vapnik, see~\cite{vapnik98}, and~\cite{kohler02},
1209: % among others. In \cite{kohler02}, the complexity regularization
1210: % approach is adopted to construct an adaptive estimator of the
1211: % regression. In this book, rates of convergence for the least squares
1212: % and penalized least squares estimators are given in the so-called
1213: % ``distribution free'' framework, where it is assumed that $|Y| \leq
1214: % L$ almost surely for a known positive constant $L$, and where there
1215: % is no assumption on $P_X$. In literature, the assumption $|Y| \leq
1216: % L$ is mandatory in order to derive rates of convergence in this
1217: % general setting for $P_X$. Note that this is also the standard
1218: % setting in learning theory. In Chapter~21 from~\cite{kohler02}, an
1219: % upper bound is obtained for the penalized least squares estimator,
1220: % in the case where $X$ is univariate and $\mathcal F$ is a Sobolev
1221: % space (smoothing splines). Herein, the convergence rate is shown to
1222: % be of order $(\log n)^2 n^{-2s/(2s + 1)}$ which is, up to the $(\log
1223: % n)^2$ term, optimal in this context. Thus, the results stated in
1224: % Section ???? improves upon complexity regularization in several
1225: % ways: the results are adaptive, holds in the multivariate case,
1226: % unbounded response $Y$ are taken into account, other spaces than the
1227: % Sobolev space can be considered and the rates are optimal (without
1228: % an extra logarithm).
1229: 
1230: % \subsection{Oracle inequality}
1231: 
1232: 
1233: 
1234: 
1235: % \begin{remark} %[Why don't we use the standard aggregation algorithm?]
1236: %   The standard aggregation algorithm (with exponantial weights) in the
1237: %   regression model is somewhat different from the one considered
1238: %   here. Usually, the weights are a Gibbs law over the set of
1239: %   estimators, with potential equals to the least squares over the
1240: %   learning sample. Here, we considered a potential equals to the
1241: %   penalized least squares. This weighting scheme is somewhat tuned to
1242: %   the situation where the weak estimators (or \emph{weak learners})
1243: %   are penalized least squares. The reason is the following: actually,
1244: %   the aggregation estimator is a reguralized version of the empirical risk
1245: %   minimizer estimator (ERM). It does a better job than the ERM when
1246: %   the temperature parameter (which can be understood as a
1247: %   reguralization parameter) is not too large.
1248: 
1249: %  is This allows to
1250: %   construct an adaptive estimator that does a better than more popular
1251: %   techniques for selecting the smoothness parameter $h$, such as the
1252: %   GCV technique, which provides satisfactory results is most cases.
1253: % \end{remark}
1254: 
1255: 
1256: % We recall that $m < n$ is the training sample size, which is a
1257: % fraction of $n$ \texttt{ATTENTION !} We recall that $D_m$, $D_{(m)}$
1258: % and $D_n$ stand for the training, the learning, and the whole sample
1259: % (respectively). We denote, repsectively, by $P^m$, $P^{(m)}$ and by
1260: % $P^n$ the corresponding empirical measures, and by $P_X^m$,
1261: % $P_X^{(m)}$ and $P_X^n$ the empirical measures for $X$. Moreover, for
1262: % short, we shall denote $\norm{f}^2 := \int f^2 d P_X$ and
1263: % $\norm{f}_n^2 = \int f^2 dP_X^n$, and we consider
1264: % $\prodsca{\cdot}{\cdot}$ and $\prodsca{\cdot}{\cdot}_n$ the associated
1265: % inner products. We define in the same way $\norm{f}_m$ and
1266: % $\norm{f}_{(m)}$. % In this section, we shall denote by $f_0$ the true
1267: % % regression function.
1268: 
1269: % We denote $\bar f$ and $J(f)$ instead of $\bar f_{\lambda}$ and
1270: % $J_s(f)$. We recall that $\pen(f) = h^2 J(f)^2$, where $J(f)^2 = 1 +
1271: % \norm{f}_\infty^2 + \tilde J(f)$. We denote by $|A|$ the cardinal of a
1272: % finite set $A$. We denote $\varepsilon = (\varepsilon_1, \ldots,
1273: % \varepsilon_n)$, and by convention $\norm{\varepsilon}_n^2 = \sum_{1 \leq i
1274: %   \leq n} \varepsilon_i^2 / n$, with the same definition for $Y = (Y_1,
1275: % \ldots, Y_n)$.
1276: 
1277: 
1278: % The resulting estimator is \emph{adaptive}, as showed below in the
1279: % Section, and as shown numerically in Section~\ref{sec:simulations}.
1280: 
1281: 
1282: % shall \emph{adapt} both to the complexity of $\mathcal F$ where
1283: % $f_0$ belongs to, which is measured by some smoothness paramerer
1284: % $s$, see~\eqref{eq:covering_assumption}, and to the smoothness
1285: % parameter $h$. \texttt{pas terrible la derniere phrase, et mal dit}
1286: 
1287: \section{Examples of adaptive results}
1288: \label{sec:examples}
1289: 
1290: 
1291: %\section{Examples of PERM over large function sets}
1292: 
1293: In this section, we construct adaptive estimators for several
1294: regression problems using the tools from
Sections~\ref{sec:pena_least_squares} and~\ref{sec:ERM_finite}. This
1296: involves, as usual with algorithms coming from statistical learning
1297: theory, a split of the sample into two parts (an exception can be
1298: found in~\cite{leung_barron06}). The main steps of the construction of
1299: adaptive estimators given in this section are:
1300: \begin{enumerate}
1301: \item split, at random, the whole sample $D_n$ into a \emph{training
1302:     sample}
1303: \begin{equation*}
1304:   D_m := [(X_i, Y_i) : 1 \leq i \leq m],
1305: \end{equation*}
1306: where $m < n$, and a \emph{learning sample}
1307: \begin{equation*}
1308:   D_{(m)} := [(X_i, Y_i) : m + 1 \leq i \leq n];
1309: \end{equation*}
1310: \item choose a set $\Lambda$ of parameters and compute, using the
1311:   training sample $D_m$, the corresponding class $F(\Lambda) = \{ \bar
1312:   f_\lambda : \lambda \in \Lambda \}$ of PERM (see
1313:   Definition~\ref{def:perm} in
1314:   Section~\ref{sec:pena_least_squares}). Each $\Lambda$ depends on the
1315:   considered problem of adaptive estimation, see below;
1316: \item using the learning sample $D_{(m)}$, compute the aggregation
1317:   weights and the aggregated estimator $\hat {\mathsf f}_n$,
1318:   respectively given by Equations~\eqref{eq:weights}
1319:   and~\eqref{eq:aggregate}.
1320: \end{enumerate}
1321: 
1322: Then, using Theorem~\ref{thm:least_sq} (see
Section~\ref{sec:pena_least_squares}) and Theorem~\ref{thm:oracle}
1324: (see Section~\ref{sec:ERM_finite}), we will derive adaptive upper
1325: bounds for estimators $\hat {\mathsf f}_n$ constructed in this
1326: way. Throughout the section, we shall assume the following.
1327: 
1328: \begin{assumption}[Split size]
  Let $\ell$ be the learning sample size, so that $\ell + m = n$. We shall
1330:   assume from now on, to simplify the presentation, that $\ell$ is a
1331:   fraction of $n$, typically $n/2$ or $n/4$.
1332: \end{assumption}
1333: 
1334: \subsection{About the split, jackknife}
1335: \label{sec:jackknife}
1336: 
1337: % \begin{remark}[Jackknife]
1338: The behavior of the aggregate $\hat {\mathsf f}_n$ can depend strongly
1339: on the split selected in Step~1, in particular when the number of
1340: observations is small. Hence, a good strategy is to jackknife: repeat,
1341: say, $J$ times Steps 1--3 to obtain aggregates $\{ \hat {\mathsf
1342:   f}_n^{(1)}, \ldots, \hat {\mathsf f}_n^{(J)} \}$, and compute the
1343: mean:
1344: \begin{equation*}
1345:   \hat {\mathsf f}_n := \frac{1}{J} \sum_{j=1}^J \hat {\mathsf
1346:     f}_n^{(j)}.
1347: \end{equation*}
1348: This jackknifed estimator provides better results than a single
1349: aggregate, see Section~\ref{sec:simulations} for an empirical study,
1350: where we show also that it gives more stable estimators than the ones
involving cross-validation or generalized cross-validation. By
1352: convexity of $f \mapsto \norm{f - f_0}^2$, the jackknifed estimator
1353: satisfies the same upper bounds as a single aggregate: each of the
1354: adaptive upper bounds stated below also holds when we use the
1355: jackknife.
1356: 
1357: For the set of weak estimators considered in this paper, the split of
1358: the data is not a theoretical artefact. Indeed, if one skips Step~1
1359: (compute $F(\Lambda)$ and $\hat {\mathsf f}_n$ using the whole sample
1360: $D_n$), then $\hat {\mathsf f}_n$ has a very poor performance. An
1361: empirical illustration of this phenomenon is given in
1362: Figure~\ref{fig:split_effect}. Herein, we show the aggregation
weights~\eqref{eq:weights} when the data is split and when it is
not split. We consider a univariate design and cubic smoothing
splines. Namely, we compute the set $F(\Lambda)$ of PERM
(see~\eqref{eq:pena_least_sq}) with $\mathcal F = \{ f \in L^2([0, 1])
: \int f^{(2)}(t)^2 dt < +\infty \}$ and penalty $\pen(f) = h^2 \int
f^{(2)}(t)^2 dt$, where $f^{(2)}$ stands for the second derivative of
1369: $f$. We do that for several smoothing parameters $h$ in a grid $H$, so
1370: that $\Lambda := \{ (h, \mathcal F) : h \in H \}$. We used the
1371: \texttt{smooth.spline} routine in the \texttt{R} software to compute
1372: $F(\Lambda)$.
1373: \begin{figure}[htbp]
1374:   \centering
1375:   \includegraphics[width=6cm]{weightssplit.pdf}%
1376:   \includegraphics[width=6cm]{weightsnosplit.pdf}%
1377:   \caption{Aggregation weights with split \textup(left\textup) and
1378:     without split \textup(right\textup) and smoothing parameter
1379:     obtained by cross-validation \textup(vertical line\textup)}
1380:   \label{fig:split_effect}
1381: \end{figure}
1382: In Figure~\ref{fig:split_effect}, the x-axis is related to the value
1383: of $h$: it is the value of the parameter \texttt{spar} from the
1384: \texttt{smooth.spline} routine. The vertical line is the value of
1385: \texttt{spar} selected by cross-validation. The conclusion from
Figure~\ref{fig:split_effect} is that, when the data is not split,
1387: an overfitting phenomenon occurs: the aggregation algorithm does not
1388: work, since it does not concentrate around a value of
1389: \texttt{spar}. Of course, the resulting aggregated estimator has a
1390: very poor performance.
1391: 
1392: 
1393: % \subsection{Weak estimators\textup: penalized least squares}
1394: % Using the training sample, we compute a family
1395: % \begin{equation*}
1396: %   F(\Lambda) := \{ \bar f_\lambda : \lambda \in \Lambda \}
1397: % \end{equation*}
1398: % of \emph{weak} estimators of the regression $f_0$. Each of these
1399: % estimators depend on a parameter $\lambda$ which makes them work
1400: % based on the data ``as if'' $f_0$ had some prescribed
1401: % properties. The parameter $\lambda$ writes $\lambda = (h, \mathcal
1402: % F)$, where $h > 0$ is a smoothing parameter, and where $\mathcal F$
1403: % is a smoothness space of function endowed with a seminorm
1404: % $|\cdot|_{\mathcal F}$.  The estimator $\bar f_\lambda$ is not
1405: % adaptive, since it depends on the choice of the tuning parameters
1406: % $h$ and $\mathcal F$ (we recall that we write $\lambda = (h,
1407: % \mathcal F)$ for short).  An obvious
1408: 
1409: % \begin{remark} (ne pas enlever cette remarque du tex
1410: %   The following criticism about data splitting is obvious: the weak
1411: %   estimators only use the training sample, which is smaller (typically
1412: %   two times smaller) than the whole sample, so each of them is less
1413: %   accurate than an estimator using the whole sample. This remark holds
1414: %   true when the learning sample is used to select one of them. If we
1415: %   do not select one of them, but mixes all of them according to the
1416: %   aggregation algorithm~(\ref{eq:aggregate}) for instance, then this
1417: %   is no more the case. We give an empirical evidence of this fact in
1418: %   Section~\ref{sec:simulations}, where we compare the CV (cross
1419: %   validation) and GCV (generalized cross validation) methods with our
1420: %   aggregation approach for the selection of the parameter $h$ in cubic
1421: %   spline estimation.
1422: % \end{remark}
1423: 
1424: 
1425: \subsection{How to derive the adaptive upper bounds}
1426: \label{sec:derive_adaptive}
1427: 
In every example considered below, the scheme to derive adaptive
upper bounds is as follows. Say that $(\mathcal F_\beta : \beta \in
B)$ is a set of embedded function classes ($\mathcal F_\beta \subset
\mathcal F_{\beta'}$ if $\beta < \beta'$) where each $\mathcal
F_\beta$ satisfies Assumption~$(C_\beta)$. Let $B_n$ be an appropriate
1433: discretization of $B$. Let $\hat {\mathsf f}_n$ be the aggregated
1434: estimator obtained using Steps~1--3 (see the beginning of the
1435: section), with parameter $\Lambda = \Lambda_n = \{ (n^{-2 / (2 +
1436:   \beta)}, \mathcal F_\beta) : \beta \in B_n \}$ and let $M_n$ be the
1437: cardinality of $F(\Lambda_n)$. Let $E^{m}$ and $E^{(m)}$ be the
expectations with respect to, respectively, the joint laws of $D_m$ and
1439: $D_{(m)}$, so that, by independence, we have $E^n[\cdot] =
1440: E^m[E^{(m)}[\cdot]]$. Let $f_0 \in \mathcal F_{\beta_0}$ for some
1441: $\beta_0 \in B$. Using Theorem~\ref{thm:oracle}, we have
1442: \begin{align*}
1443:   E^{(m)} \norm{\hat {\mathsf f}_n - f_0}^2 &\leq C \min_{f \in
1444:     F(\Lambda_n)} \norm{f - f_0}^2 + \frac{C (\log
1445:     n)^{1/2} \log M_n}{n} \\
1446:   & \leq C \norm{\bar f_{\lambda_n} - f_0}^2 + \frac{C (\log n)^{1/2}
1447:     \log M_n}{n},
1448: \end{align*}
1449: where $\lambda_n = (n^{-2 / (2 + \beta_n)}, \mathcal F_{\beta_n})$,
1450: with $\beta_n \in B_n$ chosen such that $\mathcal F_{\beta_0} \subset
1451: \mathcal F_{\beta_n}$ and $n^{-2 / (2 + \beta_n)} \leq C_1 n^{-2 / (2
  + \beta_0)}$. Then, taking the expectation $E^{m}$ and using
1453: Theorem~\ref{thm:least_sq}, we have, if $M_n$ is no more than a power
1454: of $n$:
1455: \begin{align*}
1456:   E^n \norm{\hat {\mathsf f}_n - f_0}^2 &\leq C E^m \norm{\bar
1457:     f_{\lambda_n} - f_0}^2 + o(n^{-2 / (2 + \beta_0)}) \\
1458:   & \leq C_2 n^{-2 / (2 + \beta_n)} + o(n^{-2 / (2 + \beta_0)}) \leq
1459:   C_3 n^{-2 / (2 + \beta_0)}.
1460: \end{align*}
This proves that, if $f_0 \in \mathcal F_{\beta_0}$ for some $\beta_0
1462: \in B$, we have $E^n \norm{\hat {\mathsf f}_n - f_0}^2 \leq C_3 n^{-2
1463:   / (2 + \beta_0)}$, thus $\hat {\mathsf f}_n$ is indeed adaptive over
1464: $(\mathcal F_\beta : \beta \in B)$.
1465: 
1466: 
1467: \subsection{Sobolev spaces, spline estimators}
1468: \label{sec:sobolev_spaces}
1469: 
1470: When $\mathcal F$ is a Sobolev space, the
1471: PERM~\eqref{eq:pena_least_sq} with $\alpha = 2$ is a very popular
1472: smoothing technique: see, among others, \cite{wahba90} and
1473: \cite{green_silverman94}. The most simple example is when $d=1$ and
1474: \begin{equation*}
1475:   \mathcal F = W_2^s([0, 1]) := \Big\{ f \in L^2([0, 1]) :
1476:   |f|_{W_2^s}^2 := \int_0^1 f^{(s)}(t)^2 dt < \infty \Big\},
1477: \end{equation*}
1478: where $s$ is some natural integer and $f^{(s)}$ stands for the $s$-th
1479: derivative of $f$. In this case, the PERM is called a \emph{smoothing
1480:   spline}, since in this situation the unique minimizer
1481: of~\eqref{eq:pena_least_sq} is a spline, see for
1482: instance~\cite{wahba90} or~\cite{kohler02}. When $s = 2$ (cubic
1483: splines), the routine \texttt{smooth.spline} from the \texttt{R}
software (and in other software as well) neatly computes the
1485: solution to~\eqref{eq:pena_least_sq} using the B-spline basis, and
1486: chooses the parameter $h$ via generalized cross-validation (GCV). % Our
1487: % aggregation approach is an alternative to the selection of $h$ via
1488: % GCV, which is more stable when $n$ is small, see
1489: % Section~\ref{sec:simulations}.
1490: 
1491: The $d$-dimensional case is easily understood with the definition of
1492: $W_2^s([0, 1]^d)$ as the space of functions $f \in L^2([0, 1]^d)$ with
1493: all derivatives of total order $s$ in $L^2([0,1]^d)$. Namely,
1494: \begin{equation*}
1495:   W_2^s([0, 1]^d) := \Big\{ f \in L^2([0, 1]^d) :
1496:   |f|_{W_2^s([0, 1]^d)}^2 < \infty \Big\},
1497: \end{equation*}
1498: where
1499: \begin{equation}
1500:   \label{eq:usual_roughness}
1501:   |f|_{W_2^s([0, 1]^d)}^2 := \sum_{\mathbf k \in \mathbb N_0^d :
1502:     |\mathbf k| = s} \frac{s
1503:     !}{\mathbf k !} \int_{[0,1]^d} ( D_{\mathbf k} f(x) )^2 dx,
1504: \end{equation}
1505: where for $\mathbf k = (k_1, \ldots, k_d)$ we use the notations
1506: $\mathbf k ! := \prod_{i=1}^d k_i !$ and $|\mathbf k| := \sum_{i=1}^d
k_i$ and where $D_{\mathbf k}$ is the differential operator
$\partial^s / (\partial x_1^{k_1} \cdots \partial x_d^{k_d})$. When $d > 1$,
1509: the PERM for the choice $\mathcal F = W_2^s([0, 1]^d)$ is called a
1510: \emph{thin plate spline}, see again for instance~\cite{wahba90}
1511: or~\cite{kohler02}, where the practical computation of such PERM is
explained in detail. The usual assumption $s > d / 2$ gives the
embedding $W_2^s([0, 1]^d) \subset C([0, 1]^d)$ and ensures that
Assumption~$(C_\beta)$ holds, see~\cite{birman_solomjak67}. The
1515: situation where $s$ is not an integer is a particular case of what we
1516: do in Section~\ref{sec:anisotropic_besov} below. The case where
1517: $\mathcal F$ is a Sobolev space is actually a particular case of both
1518: the next sections. Indeed, it is well known (see~\cite{wahba90} for
instance) that a Sobolev space is a Reproducing Kernel Hilbert Space
1520: (RKHS) for an appropriate kernel choice, and that it is also a Besov
1521: space $B_{2, 2}^s$.
1522: 
1523: % \texttt{verifier le lien besov et sobolev multidim... dire que
1524: %   sobolev est un cas particuler du rkhs, et que c'est le bon point
1525: %   de vue pour le calcul des thin plates, citer le mec qui fait ca a
1526: %   la fin dans le bouquin.... }
1527: 
1528: % Using the B-Spline basis (see~\cite{devore_lorentz93} for a precise
1529: % definition), the minimization~\eqref{eq:pena_least_sq} can be
1530: % written as a ridge regression problem, with a solution that can be
1531: % computed directly via the resolution of the corresponding linear
1532: % system.
1533: 
1534: 
\subsection{Reproducing Kernel Hilbert Spaces}
1536: \label{sec:RKHS}
1537: 
1538: 
Reproducing Kernel Hilbert Spaces (cf.~\cite{aronszajn50}), RKHS for
short, provide a unified context for regularization in a wide variety
of statistical models. Computational properties of estimators obtained
by minimization of a functional over an RKHS make these function spaces
1543: very useful for statisticians. In this short section, we briefly
1544: recall some definitions and computational properties of RKHS.
1545: 
1546: Let $\cX$ be an abstract space (in this paper, we take
$\cX=[0,1]^d$). We say that $K:\cX\times\cX\to\mathbb{R}$ is a
\emph{reproducing kernel}, RK for short, if for any integer $p$ and any
points $x_1,\ldots,x_p$ in $\cX$, the matrix $(K(x_i,x_j))_{1\leq
  i,j\leq p}$ is symmetric positive definite. Let $K$ be a RK. The
Hilbert space associated with $K$, called \emph{Reproducing Kernel
  Hilbert Space} and denoted by $\cH_K$, is the completion of the
space of all the finite linear combinations $\sum_j a_j K(x_j,\cdot)$
1554: endowed with the inner product $\prodsca{\sum_j a_j
1555:   K(x_j,\cdot)}{\sum_k b_k K(y_k,\cdot)}_{K}=\sum_{j,k}a_j b_k
1556: K(x_j,y_k)$. We denote by $|\cdot|_K$ the associated norm on $\cH_K$.
1557: 
The representer theorem (see~\cite{kimeldorf_wahba71} for results on
optimization in RKHS) is at the heart of the minimization of functionals
over RKHS. The solution of the minimization problem
1561: \begin{equation}
1562:   \label{eq:RKHS_estimator}
1563:   \bar{f} \in \argmin_{f \in \cH_K} \{ R_n(f) + h^2|f|_{\cH_K}^2 \}
1564: \end{equation}
1565: is the linear combination
1566: \begin{equation*}
1567:   \bar{f} (\cdot) = \sum_{i=1}^n \alpha_i K(X_i,\cdot),\mbox{ where }
1568:   \boldsymbol {\alpha} = (\alpha_i)_{1 \leq i \leq n} = (\mathbf K_X +
1569:   n h^2 \mathbf I_n)^{-1} \mathbf Y,
1570: \end{equation*}
1571: where $\mathbf K_X$ is the Gram matrix $(K(X_i,X_j))_{1\leq i,j\leq
1572:   n}$, where $\mathbf Y = (Y_1, \ldots, Y_n)$ and where $\mathbf I_n$
is the identity matrix in $\mathbb R^n$. There are many different ways
1574: to simplify the computation of the coefficients $\boldsymbol{\alpha}$,
1575: see for instance~\cite{amato_antoniadis_pensky06}.
1576: 
1577: In order to derive convergence rates for the estimator defined
1578: in~\eqref{eq:RKHS_estimator} from Theorem~\ref{thm:least_sq}, we use
1579: some results about covering numbers of RKHS obtained
1580: in~\cite{cucker_smale02} (other results on the entropy of RKHS can be
found in~\cite{SS:07,CS:98}). Let us now assume that $P_X$ is a Borel
measure. If $K$ is a \emph{Mercer kernel} (that is, a continuous
1583: reproducing kernel), the RKHS associated with $K$ is the set
1584: \begin{equation*}
1585:   \label{eq:Mercer_kernel}
1586:   \cH_K=\Big\{f\in L_2(P_X): f=\sum_{j=1}^\infty a_j \psi_j \mbox{
    s.t. } \sum_{j=1}^\infty \lambda_j^{-1} a_j^2 < \infty\Big\},
1588: \end{equation*}
1589: where $(\lambda_j)_{j\geq1}$ is the sequence of decreasing eigenvalues
1590: of the operator
1591: \begin{equation*}
1592:   L_K:\left\{\begin{array}{ccc}
1593:       L^2(P_X) & \longrightarrow & L^2(P_X)\\
1594:       f        & \longmapsto     & \int_\cX K(\cdot,y)f(y)dP_X(y)
1595:     \end{array} \right.
1596: \end{equation*}
and $(\psi_j)_{j\geq1}$ the sequence of corresponding
1598: eigenvectors. According to Proposition~9 and Theorem~D in
1599: \cite{cucker_smale02}, if for any $k\geq1$ the $k$-th eigenvalue of
1600: $L_K$ is such that
1601: \begin{equation}
1602:   \label{eq:rkhs_eigenvalue}
1603:   \lambda_k \leq C k^{-l}
1604: \end{equation}
1605: for some $C > 0$ and $l > 1/2$ then the entropy of $B_K(R) := \{f \in
1606: \cH_K : |f|_K \leq R\}$ satisfies for any $\delta > 0$:
1607: % the ball of radius $R$ of the RKHS $\cH_K$, denoted by
1608: \begin{equation*}
1609:   H_\infty(\delta, B_K(R)) \leq \Big(\frac{2 R C_l}{\delta}
1610:   \Big)^{1/l},
1611: \end{equation*}
1612: where $C_l$ is slightly greater than $6Cl^l$. In this case,
1613: Theorem~\ref{thm:least_sq} and the arguments from
Section~\ref{sec:derive_adaptive} give the following result.
1615: 
1616: \begin{corollary}[Adaptive upper bound for RKHS]
1617:   \label{cor:rkhs}
1618:   Let $\bar f$ be defined by~\eqref{eq:RKHS_estimator} with a
1619:   reproducing kernel $K$ such that the eigenvalues of the operator
1620:   $L_K$ satisfy~\eqref{eq:rkhs_eigenvalue}. Then, if $h = a n^{-l /
1621:     (2l + 1)}$ and $\norm{\bar f - f_0}_\infty \leq Q$, we have
1622:   \begin{equation*}
1623:     E^n \norm{\bar f - f_0}_{L^2(P_X)}^2 \leq C_2 (1 +
1624:     |f_0|^2_{\mathcal H_K}) n^{-2l / (2l + 1)}
1625:   \end{equation*}
1626:   when $n$ is large enough.
1627: 
1628:   Now, let $L = [l_{\min}, l_{\max}]$ where $l_{\min} > 1/2$ and
1629:   $(\mathcal H_l : l \in L)$ be a family of nested RKHS. Assume that
1630:   the kernel of each $\mathcal H_l$
1631:   satisfies~\eqref{eq:rkhs_eigenvalue}. Let $\hat {\mathsf f}_n$ be
  the aggregated estimator defined by Steps~1--3 with $\Lambda_n = \{
1633:   \lambda = (n^{-l / (2l + 1)}, \mathcal H_l) : l \in L_n \}$ and $L_n
1634:   := \{ l_{\min}, l_{\min} + (\log n)^{-1}, \ldots, l_{\max} \}$. We
1635:   have, if $f_0 \in \mathcal H_l$ for some $l \in L$,
1636:   \begin{equation*}
1637:     E^n \norm{\hat {\mathsf f}_n - f_0}_{L^2(P_X)}^2 \leq C_2 (1 +
1638:     |f_0|^2_{\mathcal H_l}) n^{-2l / (2l + 1)}
1639:   \end{equation*}
1640:   when $n$ is large enough.
1641: \end{corollary}
1642: 
1643: 
1644: 
1645: \subsection{Anisotropic Besov spaces}
1646: \label{sec:anisotropic_besov}
1647: 
1648: 
1649: In nonparametric estimation literature, Besov spaces are of particular
1650: interest since they include functions with \emph{inhomogeneous
1651:   smoothness}, for instance functions with rapid oscillations or
1652: bumps. Roughly, these spaces are used in statistics when we want to
1653: prove theoretically that some adaptive estimator is able to recover
the details of a function. When one considers a multivariate
regression, the question of anisotropic smoothness naturally arises.
Anisotropy means that the smoothness of $f_0$ differs from one
coordinate to another. As far as we know, adaptive estimation of a multivariate
1658: curve with anisotropic smoothness was previously considered only in
1659: Gaussian white noise or density models, see~\cite{hoffmann_lepski02},
1660: \cite{kerk_lepski_picard01}, \cite{kerk_lepski_picard07},
\cite{neumann00}.  There are no results concerning the adaptive
1662: estimation of the regression with anisotropic smoothness on a general
1663: random design.
1664: 
In this section, we construct, using Steps~1--3, an adaptive estimator
1666: over anisotropic Besov spaces $B_{p, q}^{\bs s}$, where $\bs s = (s_1,
1667: \ldots, s_d)$ is the vector of smoothnesses. If $\{ e_1, \ldots, e_d
1668: \}$ is the canonical basis of $\mathbb R^d$, each $s_i$ is the
1669: smoothness in the direction $e_i$. A precise definition of $B_{p,
1670:   q}^{\bs s}$ is given in
Appendix~\ref{sec:appendix_approximation}. Let $\bar {\bs s}$ be the harmonic
mean of $\bs s$, see~\eqref{eq:harmonic_mean}. Let us introduce two
1673: vectors $\bs s^{\min}$ and $\bs s^{\max}$ in $\mathbb R_+^d$ with
1674: positive coordinates and harmonic means $\bar {\bs s}^{\min}$ and
1675: $\bar {\bs s}^{\max}$ respectively. Assume that $\bs s^{\min} \leq
1676: {\bs s}^{\max}$, which means that $s_i^{\min} \leq s_i^{\max}$ for any
1677: $i \in \{ 1, \ldots, d \}$ and assume that $\bar {\bs s}^{\min} > d /
1678: \min(p, 2)$. In view of Theorem~\ref{thm:anisotropic_entropy} and the
1679: embedding~\eqref{eq:anisotropic_embedding} (see
1680: Appendix~\ref{sec:appendix_approximation}), we know that Assumption
1681: $(C_\beta)$ holds for every $B_{p, \infty}^{\bs s}$ such that $\bs s
1682: \geq \bs s^{\min}$ with $\beta = d / \bar {\bs s}$ (and every $B_{p,
1683:   q}^{\bs s}$, since $B_{p, q}^{\bs s} \subset B_{p, \infty}^{\bs
1684:   s}$), where $\bar {\bs s}$ is the harmonic mean of $\bs s$. Consider
1685: the ``cube of smoothness''
1686: \begin{equation}
1687:   \label{eq:smoothness_cube}
1688:   \bs S := \prod_{i=1}^d [s_i^{\min}, s_i^{\max}],
1689: \end{equation}
1690: and consider the uniform discretization of this cube with step $(\log
1691: n)^{-1}$:
1692: \begin{equation}
1693:   \label{eq:discr_smoothness_cube}
1694:   \bs S_n := \prod_{i=1}^d  \big\{ s_i^{\min}
1695:   + k (\log n)^{-1} :1\leq k \leq [ (s_i^{\max} - s_i^{\min}) \log n ]
1696:   \big\},
1697: \end{equation}
1698: and the set of parameters
1699: \begin{equation*}
1700:   \Lambda(\bs S) := \{ \lambda = (n^{- \bar {\bs s} / (2 \bar {\bs s}
1701:     + d)}, B_{p, q}^{\bs s}) : \bs s \in \bs S_n \}.
1702: \end{equation*}
Now, we compute, following Steps~1--3, the aggregated estimator $\hat
1704: {\mathsf f}_n^{\bs S}$ with set of parameters $\Lambda(\bs S)$ (see
1705: the beginning of the section). Following the arguments from
1706: Section~\ref{sec:derive_adaptive}, we can prove in the following
1707: Corollary~\ref{cor:anisotropic_besov_rate} that $\hat {\mathsf
1708:   f}_n^{\bs S}$ is adaptive over the whole range of anisotropic Besov
1709: spaces $\{ B_{p, q}^{\bs s} : \bs s \in \bs S \}$.
1710: 
1711: % the want to construct an estimator which is adaptive
1712: % over the whole range of anisotropic Besov spaces $\{ B_{p, q}^{\bs s}
1713: % : \bs s \in \bs S \}$. This is done in two steps:
1714: % \begin{enumerate}
1715: % \item First, using the training sample, compute the family of PERM
1716: %   (see Definition~\ref{def:perm})
1717: % \begin{equation*}
1718: %   F(\bs S) := \{ \bar f_\lambda : \lambda \in \Lambda(\bs S) \}
1719: % \end{equation*}
1720: % where
1721: 
1722: % where $s$ is the harmonic mean of $\bs s$. In
1723: % Definition~\ref{def:perm}, we can take $\alpha = p$, see Remark ???
1724: % above (\texttt{remarque sur les sequence spaces...}.
1725: % \item Then, consider $F(\bs S)$ as a family of weak estimators, and
1726: %   apply the aggregation algorithm on it. Namely, we compute the
1727: %   aggregate
1728: %   \begin{equation*}
1729: %     \hat {\mathsf f}_n^{\bs S} := \sum_{\lambda \in \Lambda(\bs S)}
1730: %     \theta(\bar f_\lambda ) \bar f_\lambda,
1731: %   \end{equation*}
1732: %   where the weights $\theta(\bar f)$ are given by~\eqref{eq:weights}.
1733: % \end{enumerate}
1734: 
1735: 
1736: % The adaptive upper bound stated in
1737: % Corollary~\ref{cor:anisotropic_besov_rate} follows from the arguments
1738: % from Section~\ref{sec:derive_adaptive}.
1739: 
1740: 
1741: % An immediate consequence of Theorem~\ref{thm:least_sq} is the
1742: % following convergence rate of the PERM in the anisotropic Besov space
1743: % $B_{p, \infty}^{\bs s}$ (see Section~\ref{sec:appendix_approximation}
1744: % for a definition) where we recall that
1745: 
1746: 
1747: \begin{corollary}
1748:   \label{cor:anisotropic_besov_rate}
1749:   Assume that $\norm{\bar f - f_0}_\infty \leq Q$ for every $\bar f
  \in F(\bs S)$. If $f_0 \in B_{p, q}^{\bs s}$ for some $\bs s \in \bs S$,
1751:   then
1752:   \begin{equation*}
1753:     E^n \norm{\hat {\mathsf f}_n^{\bs S} - f_0}_{L^2(P_X)}^2 \leq C
1754:     n^{-2 \bar {\bs s} / (2 \bar {\bs s} + d)}
1755:   \end{equation*}
1756:   when $n$ is large enough, where $C$ is a constant depending on $\bs
1757:   S, d$ and $Q$.
1758:   % Let $\bar f_\lambda$ be the same as in Theorem~\ref{thm:least_sq}
1759:   % with $\mathcal F = B_{p, \infty}^{\bs s}$ and $h = a n^{-s / (2s +
1760:   %   d)}$ where $s$ is the harmonic mean of $\bs s$. Assume that $s >
1761:   % d / p$ and that $\norm{\bar f_\lambda - f_0}_\infty \leq Q$ and
1762:   % $\norm{\alpha_0}_\infty \leq Q$ for some constant $Q > 0$. Then,
1763:   % uniformly over the ball $B_{p,\infty}^{\bs s}(R) = \{ f :
1764:   % |f|_{B_{p,\infty}^{\bs s}} \leq R \}$, we have\textup:
1765:   % \begin{equation*}
1766:   %   \sup_{f_0 \in B_{p, \infty}^{\bs s}(R)} E \norm{\bar f_\lambda -
1767:   %     f_0}^2 \leq C_3 (1 + R^2) n^{-2s / (2s + d)}
1768:   % \end{equation*}
1769:   % when $n$ is large enough.
1770: \end{corollary}
1771: 
1772: % Note that the same result holds for any $B_{p, q}^{\bs s}$ with $q >
1773: % 0$ because of the embedding $B_{p, q}^{\bs s} \subset B_{p,
1774: %   \infty}^{\bs s}$.
1775: In Corollary~\ref{cor:anisotropic_besov_rate} we recover the
1776: ``expected'' minimax rate $n^{-2 \bar {\bs s} / (2 \bar {\bs s} + d)}$
1777: of estimation of a $d$-dimensional curve in a Besov space. Note that
1778: there is no regular or sparse zone here, since the error of estimation
1779: is measured with $L^2(P_X)$ norm. A minimax lower bound over $B_{p,
1780:   q}^{\bs s}$ can be easily obtained using standard arguments, such as
1781: the ones from~\cite{tsybakov03}, together with Bernstein estimates
1782: over $B_{p, q}^{\bs s}$ that can be found in~\cite{hochmuth02}. Note
1783: that the only assumption required on the design law in this corollary
1784: is the compactness of its support.
1785: 
1786: 
1787: % This theorem proves that $\hat {\mathsf f}_n^{\bs S}$ is adaptive over
1788: % the whole range of anisotropic Besov spaces $\{ B_{p, q}^{\bs s} : \bs
1789: % s \in \bs S \}$. Its proof, which can be found in
1790: % Section~\ref{sec:proof_main_results}, is an easy consequence of
1791: % Theorems~\ref{thm:least_sq} and~\ref{thm:oracle}, together with the
1792: % embedding and entropy properties of these spaces, which are given in
1793: % Appendix~\ref{sec:appendix_approximation}.
1794: 
1795: % \texttt{rajouter en remarque estimation pour differents p, et le fait
1796: %   qu'on peut tronquer par Q, on peut aussi mettre $Q_1$ et $Q_2$, pas
1797: %   forcement le meme Q...}
1798: 
1799: 
1800: 
1801: 
1802: 
1803: 
1804: 
1805: 
1806: 
1807: 
1808: 
1809: % % \subsection{About the practical computation of PERM estimators}
1810: 
1811: % Although the practical computation of the PERM in the RKHS case is
1812: % very easy, see Section~\ref{sec:RKHS} (it is one of the reason that
1813: % makes it so popular), the computation of the other PERM proposed in
1814: % this section is less clear. For the Sobolev PERM (smoothing spline
1815: % type estimators) in the isotropic case, this is well
1816: % understood. Indeed, the computation of the thin-plate spline is a
1817: % particular case of RKHS, so its computation ..... see ???? Wahba ou
1818: % Gyorfi kohler ???  \texttt{rajouter des trucs ici}
1819: 
1820: % In the Besov case, in particular the anisotropic case, the compu
1821: 
1822: % \begin{equation*}
1823: %   \mathbf f_n = \mathbf A \boldsymbol \theta
1824: % \end{equation*}
1825: 
1826: % \begin{equation*}
1827: %   |f|_{B_{p,q}^s}^q := \sum_{j \geq 0} \Big( 2^{j(s + d/2 - d/p)} \Big(
1828: %   \sum_{k \in K_j } \sum_{e \in E} |\beta_{e, j, \mathbf k}|^p
1829: %   \Big)^{1/p} \Big)^q
1830: % \end{equation*}
1831: % where $E := \{ 0, 1 \}^d - \{ (0, \ldots, 0) \}$ and
1832: 
1833: % \begin{equation*}
1834: %   |f|_{B_{2, 2}^{s}}^2 := \sum_{j \geq 0} 2^{2js} \sum_{\mathbf k
1835: %     \in K_j} \sum_{e \in E} |\beta_{e,j,k}|^2
1836: % \end{equation*}
1837: 
1838: 
1839: % \begin{example}[Lasso and Elastic estimators]
1840: %   When the complexity parameter $s$ of the class $\cF$ of functions
1841: %   within the regression function belongs to is such that $s>d/2$,
1842: %   Theorem~\ref{thm:least_sq} provides convergence rate for the
1843: %   penalized least square estimator with the semi-norm of $f$ for
1844: %   penalty term (not only for the square of the semi-norm of $f$). We
1845: %   are going to apply this result to obtain convergence rates for the
1846: %   Lasso and Elastic estimators.
1847: 
1848: %   Take $M\geq2$ and $f_1,\ldots,f_M$ some functions from $[0,1]^d$ to $\mathbb{R}$. Consider the span $\cF$, in $L^2([0,1]^d)$, of these functions. That is
1849: %   \begin{equation*}
1850: %     \cF={\rm Span}(f_1,\ldots,f_M).
1851: %   \end{equation*}For identifiability reason, we will assume the following algebra  assumption:
1852: %   \begin{assumption}
1853: %     The dimension of the linear subspace $\cF\subset L^2([0,1]^d)$ is $M$.
1854: %   \end{assumption}
1855: %   Any element $f\in\cF$ is then associated with a unique vector  $\theta\in \mathbb{R}^M$ such that $f=f_\theta :=\sum_{j=1}^M\theta_j f_j$. We are going to endowed the space $\cF$ with the norm
1856: %   \begin{equation}\label{eq:Elastic_Penality}
1857: %     |f_\theta|_\cF=\omega\|\theta\|_1+(1-\omega)\|\theta\|_2,
1858: %   \end{equation}where $\omega\in[0,1]$ and $\|\theta\|_p=\big(\sum_{j=1}^M|\theta_j|^p \big)^{1/p}, \forall p\geq1$.
1859: %   The penalized least squares estimator with the penalty term given by~\eqref{eq:Elastic_Penality} is called the {\it elastic estimator }. When $\omega=1$, the elastic estimator is the {\it Lasso estimator}.
1860: 
1861: %   Within this framework, the set $\{f_1,\ldots,f_M\}$ is usually called the {\it dictionary}. When $M=d$, $f_j(x)=x_j$ (for any $x=(x_1,\ldots,x_d)\in [0,1]^d$ and $j\in\{1,\ldots,d\}$) and $f_0$ is assumed to belonging to $\cF$, model \eqref{eq:model} is the classical gaussian linear regression model
1862: %   \begin{equation}
1863: %     \label{eq:Model_Linear_Gaussian}
1864: %     \mathbf{Y}=\mathbf{X}\theta_0+\sigma(X)\boldsymbol{\varepsilon},
1865: %   \end{equation}where $\mathbf{Y}=(Y_1,\ldots,Y_n)^t$, $\mathbf{X}$ is the matrix $n\times d$ with lines $X_i^t,i=1,\ldots,n$, $\theta_0\in\mathbb{R}^d$ is such that $f_0=f_{\theta_0}$ and $\boldsymbol{\varepsilon}$ is the vector of noise $(\varepsilon_1,\ldots,\varepsilon_M)^t$. Lasso and Elastic estimators are usually studied in this framework.
1866: 
1867: %   We are going to study elastic estimators for a general dictionary. We are not going to deal with the problem of {\it Sign consistency} of the Lasso estimator but only with the convergence rate of this estimator and of the more general elastic estimator. For that, we assume the classical geometric assumption on the dictionary:
1868: %   \begin{assumption}\label{As:Isometry_Gram_Matrix}
1869: %     Let $\Gamma=(\prodsca{f_i}{f_j})_{1\leq i,j\leq M}$ be the Gram matrix of the dictionary $\{f_1,\ldots,f_M\}$ for the inner product $\prodsca{f}{g}=\int_{[0,1]^d}fgdP_X$. We assume that, there exists an absolute constant $c>0$ such that for any vector $\theta\in \mathbb{R}^d$, we have \begin{equation*}\theta^t \Gamma \theta\geq c \|\theta\|_2^2.\end{equation*}
1870: %   \end{assumption}
1871: 
1872: %   We don't need to split the sample thus we take $m=n$ observations to construct the estimators. We take $\bar{\theta}\in\mathbb{R}^M$ such that
1873: %   \begin{equation}
1874: %     f_{\bar{\theta}}\in \argmin_{f_\theta\in \cF} \big[\frac{1}{n}\sum_{i=1}^n(Y_i-f_\theta(X_i))^2+h^2 |f_\theta|_\cF\big]
1875: %   \end{equation}where the norm $|\cdot|_\cF$ is defined in equation~\eqref{eq:Elastic_Penality}. Assumption~\ref{As:Isometry_Gram_Matrix} yields $c\|\bar{\theta}-\theta_0\|_2^2\leq \|f_{\bar{\theta}}-f_0\|_2^2$. To obtain rates of convergence using Theorem~\ref{thm:least_sq}, we have to control the entropy of $L_\infty$-balls of the model $\cF$. It is easy to see that \begin{equation*}
1876: %     H(\delta,\cF(R),\|\cdot\|_\infty)\leq M \log\big(\frac{2MR}{\delta} \big), \mbox{ where } M=\max_{1\leq j \leq M}\|f_j\|_\infty.
1877: %   \end{equation*} \texttt{Il faut regarder pour quels $R/\delta$ le plus petit on applique cette inegalité}. If we have $M$ such that $M \log\big(\frac{2MR}{\delta} \big)\leq D (R/\delta)^{d/s}$ then, applying Proposition~\ref{prop:least_sq} \texttt{si on pouvait se passer de tronquer les estimateurs dans la Proposition 1 ce serait bien ici. Voir Einmahl et Masson?}, the elastic estimator $\bar{\theta}$ with $h\geq a n^{s/(2s+d)}$ satisfies
1878: %   \begin{equation*}
1879: %   \mathbb{E}\|\bar{\theta}-\theta_0\|_2^2\leq C(\theta_0)h^2,
1880: % \end{equation*}where $C(\theta_0)\leq
1881: % C_1/c+2(\omega\|\theta\|_1+(1-\omega)\|\theta\|_2)/c$.
1882: % \end{example}
1883: 
1884: % Usually, the ``roughness'' of a function $f \in W_s$ is measured by ,
1885: % consisting of a subsample of size $m < n$ of the whole sample $D_n$
1886: % (for more details about splitting the sample, see below.)
1887: 
1888: 
1889: 
1890: 
1891: 
1892: % The next corollary is an approximation type result.  \texttt{resultat
1893: %   d'approximation ici}. Let $\tilde f_{(s, h)}$ be given by
1894: % \begin{equation*}
1895: %   \tilde f_{(s, h)} := \argmin_{\tilde f \in W_s} \big\{ \norm{f -
1896: %     \tilde f}_n^2 + \pen(\tilde f) \big\},
1897: % \end{equation*}
1898: % where $\pen(f)$ is given by~\eqref{eq:pen}. A consequence of
1899: % Theorem~\ref{thm:least_sq} is as follows.
1900: 
1901: % \begin{corollary}
1902: %   \label{cor:spline_approx}
1903: %   Under the same assumptions as in Theorem~\ref{thm:least_sq}, we have
1904: %   \begin{equation*}
1905: %     E_T \norm{\tilde f - f}_{L^2(P_T)}^2 \leq C h^2,
1906: %   \end{equation*}
1907: %   where $E_T$ is the joint law of of $(T_1, \ldots, T_n)$.
1908: % \end{corollary}
1909: 
1910: 
1911: 
1912: % Then, we use again
1913: % Lemma~\ref{thm:devia1}: we consider this time the event $\mathcal
1914: % B_{f_0}( z_1, \gamma_m)$, where $z_1 > 0$ is a fixed constant given by
1915: % Lemma~\ref{thm:devia1}. We have this time
1916: % \begin{equation}
1917: %   \label{eq:deviaB2}
1918: %   P\big[ \mathcal B_{f_0}( z_1, \gamma_m)^\complement \big] \leq \exp(
1919: %   -D_3 (\log m)^{-d(1 + d / s) / s} h^{-d/s} ),
1920: % \end{equation}
1921: % where $D_3 := D_1 z_1^2 (4\alpha)^{-d / (2s)}$ and in view
1922: % of~\eqref{eq:thm1trick}, we have
1923: % \begin{equation*}
1924: %   \norm{\bar f - f_0}_m^2 + \pen(\bar f) \leq 16 z_1^2 \alpha h^2
1925: % \end{equation*}
1926: % on $\mathcal B_{f_0}( z_1, \gamma_m)$. Then, if
1927: % \begin{equation*}
1928: %   \mathcal B := \mathcal B_{f_0}( (\log m)^{1 + d / (2s)}, 2 \sigma_1
1929: %   t_m ) \cap \mathcal B_{f_0}(z_1, \gamma_m),
1930: % \end{equation*}
1931: % we have $P[ \mathcal B^\complement ] = o(h^2)$. Putting all this
1932: % together, we obtain:
1933: % \begin{align*}
1934: %   E \norm{\bar f - f_0}^2 &\leq \frac{(Q+Q_n)^2}{m} + 10 z_0 h^2 +
1935: %   o(h^2) \\
1936: %   &+ E[ A_2 \ind{\norm{\varepsilon}_m \leq t_m} (\ind{e_m \leq b_m} +
1937: %   \ind{b_m \leq e_m} (\ind{\mathcal B} + \ind{\mathcal
1938: %     B^\complement} ) ) ] \\
1939: %   &\leq (10 z_0 + 16 z_1^2 \alpha + 1 + \norm{f_0}_\infty^2 +
1940: %   \tilde J(f_0)^2 ) h^2 + o(h^2).
1941: %   % &\leq \frac{(Q+Q_n)^2}{m} + 10 z_0 h^2 + o(h^2) + \pend(f_0) + 16
1942: %   % z_1^2 \alpha h^2 + \\
1943: %   % &+
1944: %   % &= E[ A_1 ] +  \\
1945: %   % &\leq ( 10 z_0 + 16 z_1^2 \alpha + 16 J_s(f)^2 ) h^2 + \Delta_n \\
1946: %   % &= \big( C + 16 (\norm{f}_\infty^2 + \smallint f^{(s)}(t)^2 dt)
1947: %   % \big) h^2 + \Delta_n
1948: % \end{align*}
1949: % % where $C := 10 z_0 + 16 (z_1^2 \alpha + 1)$ and where $\Delta_n :=
1950: % % P[ A_1 \geq 10 z_0 h^2 ] + P[\mathcal B^{\complement}]$ is such that
1951: % % $n^\beta \Delta_n$ goes to $0$ for any $\beta > 0$, in view
1952: % % of~\eqref{eq:deviaA1}, \eqref{eq:deviaB1} and~\eqref{eq:deviaB2}.
1953: % This concludes the proof of Theorem~\ref{thm:least_sq}.
1954: % \end{proof}
1955: 
1956: 
1957: % \subsection*{Proof of Corollary~\ref{cor:spline_approx}}
1958: 
1959: 
1960: % As in the proof of Theorem~\ref{thm:least_sq}, we have
1961: % \begin{equation*}
1962: %   E \norm{\bar f - f}^2 \leq 10 z_0 h^2 + 2 Q^2 P[A_1 \geq 10 z_0 h^2] +
1963: %   E[ A_2 ],
1964: % \end{equation*}
1965: % where in view of Lemma~\ref{lem:devia2} we have $ P[ A_1 \geq 10 z_0
1966: % h^2] \leq \exp( -n h^2)$, and by the definition of $\tilde f_{(s,
1967: %   h)}$, we have
1968: % \begin{equation*}
1969: %   \norm{f - \tilde f}_n^2 + \pen(\tilde f) \leq \norm{f - f^*}_n^2 +
1970: %   \pen(f^*) \quad \forall f^* \in W_s,
1971: % \end{equation*}
1972: % which gives $\norm{f - \tilde f}_n^2 + \pen(\tilde f) \leq \pen(f)$ if
1973: % $f^* = f \in W_s$. This concludes the proof of the corollary. \hfill
1974: % $\square$
1975: 
1976: 
1977: % But since the Cauchy-Schwarz inequality gives
1978: % \begin{equation*}
1979: %   0 \leq 2 \norm{Y - f}_n \norm{\bar f - f}_n + \pen(f) - \pen(\bar f)
1980: %   \leq 2 \sigma \norm{\bar f - f}_n + \pen(f) - \pen(\bar f),
1981: % \end{equation*}
1982: % we have that necessarily,
1983: % \begin{equation}
1984: %   \label{eq:trick_pen}
1985: %   \pen(\bar f) \leq 2 \sigma \norm{\bar f - f}_n + \pen(f).
1986: % \end{equation}
1987: % This gives
1988: % \begin{equation*}
1989: %   B \leq 4 (  )
1990: % \end{equation*}
1991: 
1992: % We can rewrite it in the following way:
1993: % \begin{align*}
1994: %   \sqrt{n} \norm{\bar f - f}_n^{2k + 1 / (2k)} &\leq \frac{2
1995: %     \sqrt{n} \prodsca{Y - f}{\bar f - f}_n }{ \norm{\bar f - f}_n^{1
1996: %       - 1/(2k)} } + \frac{\sqrt{n} (\pen(f) - \pen(\bar f)) }{
1997: %     \norm{\bar f -
1998: %       f}_n^{1 - 1/(2k)} } \\
1999: %   &=: e_n + b_n.
2000: % \end{align*}
2001: 
2002: 
2003: %\section{Adaptation}
2004: %\label{sec:adaptation}
2005: 
2006: 
2007: 
2008: 
2009: % \subsection{Adaptative estimation over anisotropic Besov spaces}
2010: 
2011: \section{Empirical study}
2012: \label{sec:simulations}
2013: 
2014: In this Section, we compare empirically our aggregation procedure with
2015: the popular cross-validation (CV) and generalized cross-validation
2016: (GCV) procedures for the selection of the smoothing parameter $h$ (see
2017: Section~\ref{sec:about_h}) in smoothing splines (we use the
2018: \texttt{smooth.spline} routine from the \texttt{R} software, see
\url{http://www.r-project.org/}). Concerning CV, GCV and smoothing
2020: splines, we refer to~\cite{wahba90}
2021: and~\cite{green_silverman94}. Those routines provide satisfactory
2022: results in most cases, in particular for the examples of regression
functions considered here. However, we show that when the sample size
$n$ is small (less than 50), and when the noise level is high (we take
a root-signal-to-noise ratio equal to $2$), our aggregation
approach is more stable, see Figure~\ref{fig:mises} below. Here, we
consider two examples of regression functions, given, for $x \in [-1,
1]$, by:
2029: \begin{itemize}
2030: \item \texttt{hardsine}$(x) = 2 \sin(1 + x) \sin( 2 \pi x^2 + 1)$
2031: \item \texttt{oscsine}$(x) = (x+1) \sin(4 \pi x^2 )$.
2032: \end{itemize}
We simply take $X$ uniformly distributed on $[-1, 1]$ and Gaussian
noise with variance $\sigma^2$ chosen so that the root-signal-to-noise
ratio is $2$. In Figure~\ref{fig:examples} we show typical simulated
data in this setting, where $n = 30$.
2037: \begin{figure}[htbp]
2038:   \centering
2039:   \includegraphics[width=6cm]{data1.pdf}%
2040:   \includegraphics[width=6cm]{data2.pdf}%
2041: %  \includegraphics[width=4.3cm]{n30r2agg.pdf}%
2042:   \caption{Examples of simulated data, for
    $f_0$\texttt{=\textup{hardsine}} \textup(left\textup) and
2044:     $f_0$\texttt{=\textup{oscsine}} \textup(right\textup)}
2045:   \label{fig:examples}
2046: \end{figure}
2047: 
In Figure~\ref{fig:mises}, we show the MISEs $E\norm{\hat f_n -
  f_0}_n^2$ computed by Monte Carlo using $1000$ simulations of the
2050: model. The tuning of the estimators in both examples is the following:
2051: for GCV, we simply use the \texttt{smooth.spline} routine with default
2052: selection of $h$ by GCV. For CV, we use the same routine, with the
2053: option \texttt{cv=TRUE} so that CV is used instead. For aggregation,
we use Steps~1--3 (see Section~\ref{sec:examples}). Step~1 is done with
$m=3n/4$ and $\ell = n/4$. For Step~2, we use the
\texttt{smooth.spline} routine to compute a set of weak estimators,
using the option \texttt{spar=x}, where \texttt{x} lies in the set $\{
0, 0.01, 0.02, \ldots, 1 \}$. The parameter \texttt{spar} is related to
2059: the value of the smoothing parameter $h$. For Step~3, we compute the
2060: weights with temperature given by~\eqref{Tslection} (over the training
2061: sample) and the set $\mathcal T = \{ 10, 20, \ldots, 100 \}$. Then, we
repeat Steps~1--3 $J=100$ times and compute the jackknifed estimator,
2063: see Section~\ref{sec:jackknife}. This gives our aggregated estimator.
2064: 
In Figure~\ref{fig:mises}, we plot the MISEs (the mean of the $1000$
MISEs obtained for each simulation) for sample sizes $n \in \{ 20, 30,
50, 100 \}$ and in Figure~\ref{fig:sd} we plot the corresponding
standard deviations. The conclusion is that for small $n$, aggregation
provides a more accurate and stable estimation than GCV or
CV. When $n$ is $100$ or larger, the aggregation procedure has
roughly the same accuracy as GCV or CV.
2072: 
2073: \begin{figure}[htbp]
2074:   \centering
2075:   \includegraphics[width=6cm]{mises1.pdf}%
2076:   \includegraphics[width=6cm]{mises2.pdf}%
  \caption{MISE for $f_0$\textup{=\texttt{hardsine}}
2078:     \textup(left\textup) and $f_0$\textup{=\texttt{oscsine}}
2079:     \textup(right\textup)}
2080:   \label{fig:mises}
2081: \end{figure}
2082: 
2083: \begin{figure}[htbp]
2084:   \centering
2085:   \includegraphics[width=6cm]{sd1.pdf}%
2086:   \includegraphics[width=6cm]{sd2.pdf}%
  \caption{Standard deviation of the MISE for
    $f_0$\textup{=\texttt{hardsine}} \textup(left\textup) and
2089:     $f_0$\textup{=\texttt{oscsine}} \textup(right\textup)}
2090:   \label{fig:sd}
2091: \end{figure}
2092: 
2093: 
2094: 
2095: % \begin{table*}[htbp]
2096: %   \caption{Estimated MISE \textup(using 1000
2097: %     replications\textup) and standard deviations \textup(between
2098: %     brackets\textup) for $f = \texttt{\textup{hardsine}}$}
2099: %   \begin{tabular}{lccc}
2100: %     \hline
2101: %     $n$ & GCV &  CV &  AGG \\ \hline
2102: %     $20$ & 0.224 (0.132)  & 0.233  (0.172) & \textbf{0.188}
2103: %     (\textbf{0.089}) \\
2104: %     $30$ &  0.177 (0.124)  & 0.153 (0.103) & \textbf{0.146}
2105: %     (\textbf{0.064}) \\
2106: %     $50$ & $6.56 \times 10^{-2}$ ($9.86 \times 10^{-2}$) & $\mathbf{4.75
2107: %       \times 10^{-2}}$ ($5.53 \times 10^{-2}$) & $4.84 \times 10^{-2}$
2108: %     ($\mathbf{5.29 \times 10^{-2}}$) \\
2109: %     $100$ & $2.82 \times 10^{-3}$ ($1.24 \times 10^{-2}$) &
2110: %     $\mathbf{2.60 \times 10^{-3}}$ ($\mathbf{1.07 \times 10^{-2}}$) &
2111: %     $2.89 \times 10^{-3}$ ($1.17 \times 10^{-2}$) \\
2112: %     \hline
2113: %     \hline
2114: %   \end{tabular}
2115: %   \label{tab:mises1}
2116: % \end{table*}
2117: 
2118: 
2119: % \begin{table*}[htbp]
2120: %   \caption{Estimated MISE \textup(using 1000
2121: %     replications\textup) and standard deviations \textup(between
2122: %     brackets\textup) for $f = \texttt{\textup{oscsine}}$}
2123: %   \begin{tabular}{lccc}
2124: %     \hline
2125: %     $n$ & GCV &  CV &  AGG \\ \hline
2126: %     $20$ & 0.235 (0.195) & 0.167 (0.094) & 0.123 (0.09) \\
2127: %     $30$ &  &  & \\
2128: 
2129: %     0.07323741  ( 0.04325123 )
2130: 
2131: %     $50$ & $6.56 \times 10^{-2}$ ($9.86 \times 10^{-2}$) & $\mathbf{4.75
2132: %       \times 10^{-2}}$ ($5.53 \times 10^{-2}$) & $4.84 \times 10^{-2}$
2133: %     ($\mathbf{5.29 \times 10^{-2}}$) \\
2134: %     $100$ & $2.82 \times 10^{-3}$ ($1.24 \times 10^{-2}$) &
2135: %     $\mathbf{2.60 \times 10^{-3}}$ ($\mathbf{1.07 \times 10^{-2}}$) &
2136: %     $2.89 \times 10^{-3}$ ($1.17 \times 10^{-2}$) \\
2137: %     \hline
2138: %     \hline
2139: %   \end{tabular}
2140: %   \label{tab:mises2}
2141: % \end{table*}
2142: 
2143: 
2144: 
2145: \section{Proofs of the main results}
2146: \label{sec:proof_main_results}
2147: 
2148: We recall that $P_n$ stands for the joint law of the training sample
2149: $D_n$ conditional on $X^n := (X_1, \ldots, X_n)$, that is $P_n :=
2150: P^n[\cdot | X^n]$.
2151: 
2152: % Note that if $\{ z_1, \ldots, z_p \}$ is a $\delta$-cover of $A
2153: % \subset E$, where $(E, \norm{\cdot})$ is some normed space, we can
2154: % find a $2\delta$-cover of $A$ with same size $p$ which is included
2155: % in $A$. Thus, we shall always assume without loss of generality that
2156: % a $\delta$-cover is included in the space it covers.
2157: 
2158: \begin{proof}[Proof of Theorem~\ref{thm:devia1}]
2159:   First, we use the \emph{peeling} argument: we decompose $B_n(f_0,
2160:   \delta)$ into the union of the sets $S_j$ for $j \geq 0$, where for
2161:   $\delta_j := \delta 2^{-j/\beta}$
2162:   \begin{equation*}
2163:     S_j := B_n(f_0, \delta_j ) - B_n(f_0, \delta_{j+1}),
2164:   \end{equation*}
2165:   and decompose $\mathcal F$ into the union of the sets
2166:   \begin{equation*}
2167:     B_\cF(2^{k/\beta}) - B_\cF(2^{(k-1)/\beta}) = \{ f \in \mathcal F
2168:     : 2^{(k-1) / \beta} < |f|_{\mathcal F} \leq 2^{k / \beta} \},
2169:  \end{equation*}
 for $k \geq 1$, where $B_{\mathcal F}(2^{k/\beta}) = \{ f \in
 \mathcal F : |f|_{\mathcal F} \leq 2^{k/\beta}\}$. This gives that the
 left-hand side of~\eqref{eq:deviaZ_n} is smaller than
2173:    \begin{align*}
2174:      \sum_{j \geq 0} & P_n\Big[ \sup_{ \substack{f \in S_j \text{
2175:            s.t. } \\ |f|_{\mathcal F} \leq 1} } \frac{ Z(f - f_0)
2176:      }{\norm{f - f_0}_n^{1 - \beta/2} (1 + |f|_{\mathcal F})^{\beta/2}
2177:      } > z \Big] \\
2178:      &+ \sum_{j \geq 0} \sum_{k \geq 1} P_n \Big[ \sup_{ f
2179:          \in S_j\cap B_{\mathcal F}(2^{k/\beta})} \frac{ Z(f - f_0) }{\norm{f -
2180:          f_0}_n^{1 - \beta/2} (1 + |f|_{\mathcal F})^{\beta/2} } > z
2181:      \Big],
2182:    \end{align*}
2183:    which is smaller than
2184:    \begin{equation*}
2185:      \sum_{j,k \geq 0}P_n \Big[ \sup_{f \in
2186:          B_n(f_0, \delta_j)\cap B_\cF(2^{k/\beta}) }
2187:      Z(f - f_0) > z(\delta, j, k) \Big] =: \sum_{j,k \geq 0}  P_{j, k},
2188:    \end{equation*}
2189:    where $z(\delta, j, k) := z \delta_j^{1 - \beta/2}
2190:    2^{k/2-1/2}$. Let us consider, for any $\delta > 0$, a minimal
2191:    $\delta$-covering $F(\delta, k)$ of the set $B_{\mathcal
2192:      F}(2^{k/\beta})$ for the
2193:    $\norm{\cdot}_\infty$-norm. Assumption~$(C_\beta)$ implies
2194:    \begin{equation*}
2195:      | F(\delta, k) | \leq \exp\big( D (2^{k/\beta} / \delta)^{\beta} \big)
2196:      = \exp( D 2^k \delta^{-\beta} ).
2197:    \end{equation*}
2198:    Moreover, without loss of generality, we can assume that $F(\delta,
2199:    k) \subset B_{\mathcal F}(2^{k/\beta})$. For any $i \in \mathbb N$
2200:    and $j, k$ fixed, we introduce
2201:    \begin{equation}
2202:      \label{eq:Fi}
2203:      F^{(i)} := F(\delta_{i,j}, k) \text{ where } \delta_{i,j} :=
2204:      \delta_j 2^{-i/\beta} = \delta 2^{-(i+j)/\beta},
2205:    \end{equation}
2206:    and, for any $f\in B_\cF(2^{k/\beta})$ we denote by $\pi_i(f)$ an
2207:    element of $F^{(i)}$ such that $\norm{\pi_i(f) - f}_\infty \leq
2208:    \delta_{i,j}$. We have
2209:    \begin{align*}
2210:      P_{j,k} &\leq P_n\Big[ \sup_{ f \in B_n(f_0, \delta_j)\cap
2211:        B_\cF(2^{k/\beta})} | Z(\pi_0(f) - f_0) | > z(\delta, j, k) / 2
2212:      \Big] \\ & + P_n \Big[ \sup_{ f \in B_n(f_0, \delta_j) \cap
2213:        B_\cF(2^{k/\beta})} | Z(f - \pi_0(f))| > z(\delta, j, k) / 2
2214:      \Big] \\ &=: P_{j,k,1} + P_{j,k,2}.
2215:    \end{align*}
2216:    First, we consider $P_{j,k,1}$:
2217:    \begin{align*}
2218:      P_{j,k,1} \leq P_n \Big[ \sup_{f \in F^{(0)} \cap B_n(f_0,
2219:        \delta_j) } | Z(\pi_0(f) - f_0) | > z(\delta, j, k) / 2 \Big].
2220:    \end{align*}
2221:    We use~\eqref{eq:deviaZnf} and the union bound over $F^{(0)}$
2222:    together with the fact that $f \in B_n(f_0, \delta_j)$ to obtain:
2223:    \begin{equation*}
2224:      P_{j,k,1} \leq |F^{(0)}| \exp\Big( \frac{-a z^2(\delta, j, k)}{4
2225:        \delta_j^2} \Big) = \exp\Big( \frac{2^{j+k}}{\delta^{\beta}} (D - a z^2 / 8 ) \Big),
2226:    \end{equation*}
2227:    where $a := (2b^2)^{-1}$. Now, in order to control $P_{j,k,2}$, we
2228:    use the so-called chaining argument, which involves increasing
2229:    approximations by the covers $F^{(i)}$, see~\eqref{eq:Fi}. Let us
2230:    consider
2231:    \begin{equation*}
2232:      E_i := (2^{1/\beta - 1/2} - 1) 2^{-i (1/\beta-1/2) }
2233:    \end{equation*}
2234:    for $i \geq 1$ ($E_i > 0$ since $\beta \in(0, 2)$). By linearity of
   $Z(\cdot)$ and since $\sum_{i \geq 1} E_i = 1$, we have
2236:    \begin{align*}
2237:      P_{j,k,2} &\leq \sum_{i \geq 1} P_n\Big[ \sup_{ \substack{ f \in
2238:          B_n(f_0, \delta_j) \\ |f|_{\mathcal F} \leq 2^{k/\beta} } } |
2239:      Z(\pi_i(f) - \pi_{i-1}(f)) | > E_i z(\delta, j, k) / 2 \Big] \\
2240:      &=: \sum_{i \geq 1} P_{i, j, k, 2}.
2241:    \end{align*}
2242:    Now, since
2243:    \begin{align*}
2244:      \norm{\pi_i(f) - \pi_{i-1}(f)}_n &\leq \norm{\pi_i(f) -
2245:        \pi_{i-1}(f)}_\infty \\
2246:      &\leq \norm{\pi_i(f) - f}_\infty + \norm{\pi_{i-1}(f) - f}_\infty \\
2247:      & \leq \delta_{i,j} + \delta_{i-1,j} = \delta_{i,j} (1 +
2248:      2^{1/\beta}),
2249:    \end{align*}
2250:    and since the number of pairs $\{ \pi_i(f), \pi_{i-1}(f) \}$ is at
2251:    most
2252:    \begin{equation*}
2253:      |F^{(i)}| \times |F^{(i-1)}| \leq \exp \Big( \frac{3 D 2^{i + j +
2254:          k}}{2 \delta^{\beta}} \Big),
2255:    \end{equation*}
2256:    we obtain using again~\eqref{eq:deviaZnf}:
2257:    \begin{align*}
2258:      P_{i, j, k, 2} &\leq |F^{(i)}| \times |F^{(i-1)}| \times
2259:      \exp\Big( \frac{-a E_i^2 z^2(\delta, j, k)}{4 \delta_{i,j}^2 (1 +
2260:        2^{1/\beta})^2} \Big) \\
2261:      &= \exp\Big( \frac{2^{i+j+k}}{\delta^{\beta}} \big( 3 D / 2 - C_1
2262:      z^2 \big) \Big)
2263:    \end{align*}
   where $C_1 = C_1(\beta, a) := a(2^{1/\beta -
     1/2} - 1)^2 / (8 (1 + 2^{1/\beta})^2) > 0$. Then, if we choose $z_1
   := (3 D / C_1)^{1/2}$, we have for any $z \geq z_1$ and $D_1 := C_1 /
   2$:
2268:    \begin{align*}
2269:      \sum_{j, k \geq 0} P_{j,k} &\leq \sum_{j,k \geq 0} \Big(
2270:      P_{j,k,1} + \sum_{i \geq 1} P_{i,j,k,2} \Big) \\
2271:      &\leq \sum_{j,k \geq 0} \Big( \exp( -D_1 2^{j+k} z^2
2272:      \delta^{-\beta} ) + \sum_{i \geq 1} \exp( -D_1 2^{i+j+k} z^2
2273:      \delta^{-\beta} ) \Big)
2274:    \end{align*}
2275:    and the Theorem follows.
2276: \end{proof}
2277: 
2278: 
2279: \begin{proof}[Proof of Theorem~\ref{thm:least_sq}]
2280:   For short, we shall write $\bar f$ instead of $\bar f_\lambda$, and
2281:   $\pen(f)$ instead of $\pen_\lambda(f)$. In view
2282:   of~\eqref{eq:pena_least_sq}, we have
2283:   \begin{equation}
2284:     \label{eq:f_bar_prop}
2285:     \norm{Y - \bar f}_n^2 + \pen(\bar f) \leq \norm{Y - f}_n^2 +
2286:     \pen(f) \quad \forall f \in \mathcal F,
2287:   \end{equation}
2288:   which is equivalent to
2289:   \begin{equation*}
2290:     \norm{\bar f - f}_n^2 + \pen(\bar f) \leq 2 \prodsca{Y -
2291:       f}{\bar f - f}_n + \pen(f) \quad \forall f \in \mathcal F,
2292:   \end{equation*}
2293:   where $\prodsca{f}{g}_n = n^{-1} \sum_{i=1}^n f(X_i) g(X_i)$. This
2294:   entails, since $f_0 \in \mathcal F$, that
2295:   \begin{equation}
2296:     \label{eq:trick1}
2297:     \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq
2298:     \frac{2}{\sqrt{n}} Z(\bar f - f_0) + \pen(f_0)
2299:   \end{equation}
2300:   where $Z(\cdot)$ is the empirical process given
2301:   by~\eqref{eq:Z_n_def}. Recall that $B_n(f_0, \delta)$ stands for the
2302:   ball centered at $f_0$ with radius $\delta$ for the norm
2303:   $\norm{\cdot}_n$. Let us introduce the event
2304:   \begin{equation}
2305:     \label{eq:event_Z}
2306:     \mathcal Z(z, \delta) := \Big\{ \sup_{f \in \mathcal F \cap
2307:       B_n(f_0, \delta)} \frac{ Z(f - f_0) }{\norm{f - f_0}_n^{1 -
2308:         \beta/2} (1 + |f|_{\mathcal F})^{\beta/2} } \leq z \Big\}.
2309:   \end{equation}
2310:   In view of Theorem~\ref{thm:devia1}, see
2311:   Section~\ref{sec:process_Z0}, we can find constants $z_1 > 0$ and
2312:   $D_1 > 0$ such that\textup:
2313:   \begin{align*}
2314:     P_n\big[ \mathcal Z(z, \delta)^\complement \big] \leq \exp( - D_1
2315:     z^2 \delta^{-\beta} ),
2316:   \end{align*}
2317:   for any $\delta > 0$ and $z \geq z_1$. When $2 n^{-1/2} Z(\bar f -
2318:   f_0) \leq \pen(f_0)$, we have $\norm{\bar f - f_0}_n^2 \leq 2
2319:   \pen(f_0)$. When $2 n^{-1/2} Z(\bar f - f_0) \geq \pen(f_0)$, we
2320:   have, for any $z>0$, in view of~\eqref{eq:trick1}, whenever $\bar f \in B_n(f_0,
2321:   \delta)$ for some $\delta > 0$, that on $\mathcal Z(z, \delta)$,
2322:   \begin{equation*}
2323:     \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq \frac{4 z}{\sqrt{n}}
2324:     \norm{\bar f - f_0}_n^{1 - \beta/2} (1 + |\bar f|_{\mathcal
2325:       F})^{\beta/2}.
2326:   \end{equation*}
2327:   If $|\bar f|_{\mathcal F} \leq 1$, this entails
2328:   \begin{equation*}
2329:     \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq ( a^{-2}(2^\beta 4
2330:     z)^{4 / (2 + \beta)} + 1) h^2.
2331:   \end{equation*}
2332:   Otherwise, we have
2333:   \begin{equation*}
2334:     \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq \frac{2^{\beta/2} 4
2335:       z}{\sqrt{n}} \norm{\bar f - f_0}_n^{1 - \beta/2} |\bar
2336:     f|_{\mathcal F}^{\beta/2},
2337:   \end{equation*}
2338:   and we use the following lemma.
2339:   \begin{lemma}
2340:     \label{lem:logtrick}
2341:     Let $r, I, h, \varepsilon$ be positive numbers, $\beta \in (0, 2)$
2342:     and $\alpha > 2\beta / (\beta + 2)$. Then, if
2343:     \begin{equation}
2344:       \label{eq:logtrick}
2345:       r^2 + h^2 I^\alpha \leq \varepsilon\, r^{1 - \beta / 2} I^{\beta / 2},
2346:     \end{equation}
2347:     we have
2348:     \begin{equation*}
2349:       r \leq ( \varepsilon^{\alpha} h^{-\beta} )^{2 / (2\alpha + \alpha \beta
2350:         -  2 \beta)}, \quad I \leq (\varepsilon^2
2351:       h^{-(\beta + 2)})^{2 / (2 \alpha + \alpha \beta - 2\beta)}
2352:     \end{equation*}
2353:     and consequently
2354:     \begin{equation*}
2355:       r^2 + h^2 I^\alpha \leq 2 (\varepsilon^\alpha
2356:       h^{-\beta})^{4/(2\alpha + \alpha \beta - 2\beta)}.
2357:     \end{equation*}
2358:   \end{lemma}
2359:   The proof of this Lemma is given in Section~\ref{sec:lemmas_proofs}
2360:   below. It entails, since $h = a n^{-1 / (2 + \beta)}$ and $\alpha >
2361:   2\beta / (\beta+2)$, that
2362:   \begin{equation*}
2363:     \norm{\bar f - f_0}_n^2 + h^2 |\bar f|_{\mathcal F}^{\alpha} \leq
2364:     2 ((2^{\beta/2} 4 z)^{\alpha} a^{-\beta})^{4 / (2\alpha + \alpha
2365:       \beta - 2\beta)} n^{-2 / (\beta+2)}.
2366:   \end{equation*}
2367:    Thus,
2368:   when $\bar f \in B_n(f_0, \delta)$, we have on $\mathcal Z(z, \delta)$:
2369:   \begin{equation*}
2370:     \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq p(z)^2 h^2
2371:   \end{equation*}
2372:   where
2373:   \begin{equation*}
2374:     p(z)^2 := C_1 (1 + z^{4 / (2 + \beta)} + z^{4\alpha / (2\alpha + \alpha
2375:       \beta - 2\beta)})
2376:   \end{equation*}
2377:   and $C_1$ is a constant depending on $\alpha, \beta$ and $a$.
2378:   % It can be readily seen that this inequality entails
2379: %   \begin{align}
2380: %     \label{eq:thm1trick}
2381: %     \nonumber \norm{\bar f - f_0}_n &\leq \Big(\frac{z 4
2382: %       2^{\beta/2}}{m^{\alpha} h^{2 \beta}} \Big)^{1/(2\alpha + \alpha
2383: %       \beta - 2\beta)} \\ &\leq (z 4 2^{\beta/2} a^{-\beta/\alpha}
2384: %     )^{2 \alpha/(2\alpha + \alpha \beta - 2\beta)} m^{-1 / (2 + \beta)},
2385: %   \end{align}
2386: %   where we used~\eqref{eq:bandwidth}, and
2387: %   \begin{equation}
2388: %     \label{eq:delta1}
2389: %     \norm{\bar f - f_0}_n \leq C (1 + |f_0|^{d /(2s+d)}) p(z) m^{-s /
2390: %       (2s + d)} =: p(z) \delta_1,
2391: %   \end{equation}
2392: %   on $\mathcal Z(z, \delta)$, where $p(z) := (z^{2\alpha s / (\alpha
2393: %     (2s + d) - 2d)} \vee z^{d / (2s+d)})$ and $C := 2^{(4s+d)/(2s+d)}
2394: %   \vee (4 2^{d/(2s)} a^{-d / (2\alpha s )})^{2\alpha s / (2\alpha s +
2395: %     \alpha d - 2 d)}$.
2396:   Let us assume for now that $\norm{\bar f - f_0}_n \leq \delta$ for
2397:   some $\delta > 0$, and let us introduce
2398:   \begin{equation*}
2399:     \mathcal Z_1(z, \delta) := \mathcal Z(z, \delta) \cap \mathcal
2400:     Z(z_1, p(z) h),
2401:   \end{equation*}
2402:   where $z_1$ is a constant coming from Theorem~\ref{thm:devia1}. On
2403:   $\mathcal Z_1(z, \delta)$, we have
2404:   \begin{equation}
2405:     \label{eq:on_Z1}
2406:     \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq p(z_1)^2 h^2.
2407:   \end{equation}
  Indeed, we have $\bar{f}\in B_n(f_0,\delta)$; thus, on $\mathcal Z(z, \delta)$, $\norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq p(z)^2 h^2$ and so
  $\norm{\bar f - f_0}_n^2\leq p(z)^2 h^2$, that is, $\bar f \in B_n(f_0, p(z) h)$. Thus, on the event $\mathcal
    Z(z_1, p(z) h)$, we have~\eqref{eq:on_Z1}. Moreover, Theorem~\ref{thm:devia1} yields
2411:   \begin{equation}
2412:     \label{eq:deviaB1}
2413:     P_n \big[ \mathcal Z_1( z, \delta)^\complement \big] \leq \exp(
2414:     -D_1 z^2 \delta^{-\beta}) + \exp( -D_1 z_1^2 (p(z) h)^{-\beta}).
2415:   \end{equation}
2416:   Now, in view of~\eqref{eq:f_bar_prop} and since $f_0 \in \mathcal
  F$, we have the following rough upper bound:
2418:   \begin{align}
2419:     \nonumber \norm{\bar f - f_0}_n^2 + \pen(\bar f) &\leq 2
2420:     (\norm{\bar f - Y}^2_n + \pen(\bar f) ) + 2 \norm{f_0 - Y}_n^2
2421:     \\ \nonumber &\leq 2 ( \norm{f_0 - Y}_n^2 + \pen(f_0)) + 2
2422:     \norm{f_0 - Y}_n^2 \\
2423:     \label{eq:rough1}
2424:     &\leq 4 \sigma^2 \norm{\varepsilon}_n^2 + 2 \pen(f_0),
2425:   \end{align}
2426:   which entails
2427:   \begin{equation*}
2428:     E_n\big[ \big( \norm{\bar f - f_0}_n^2 + \pen(\bar f) \big)^2
2429:     \big] \leq \sigma^4 C(\varepsilon)^2 + 8 h^4 |f_0|_{\mathcal F}^{2\alpha}
2430:   \end{equation*}
2431:   where $C(\varepsilon)^2 = 32( E[\varepsilon^4] / n + 2
2432:   (E[\varepsilon^2])^2)$. Putting all this together, we obtain, by a
2433:   decomposition of $E_n[\norm{\bar f - f_0}_n^2 + \pen(\bar f)]$ over
2434:   the union of the sets $\{ \norm{\bar f - f_0}_n\leq \delta \} \cap
2435:   \mathcal Z_1(z, \delta)$, $\mathcal Z_1(z, \delta)^\complement$ and
2436:   $\{\norm{\bar f - f_0}_n > \delta \}$ that
2437:   \begin{align*}
2438:     E_n[ \norm{\bar f - &f_0}_n^2 + \pen(\bar f)] \leq p(z_1)^2 h^2 \\
2439:     &+ (\sigma^2 C(\varepsilon) + 2\sqrt{2} h^2
2440:     |f_0|_{\mathcal F}^\alpha)\big(
2441:     P_n[ \mathcal Z_1(z, \delta)^\complement]^{1/2}+P_n[ \norm{\bar f - f_0}_n > \delta]^{1/2}\big).
2442:   \end{align*}
2443:   In view of~\eqref{eq:rough1}, if $\delta > 2 \pen(f_0)\vee1$ then we have
2444:   $\{ \norm{\bar f - f_0}_n^2 > \delta^2 \} \subset \{
2445:   \norm{\varepsilon}_n^2 > (\delta^2 - \delta) / (4 \sigma^2)\}$.
2446:   Thus, using the subgaussianity assumption~\eqref{eq:subgaussian}, we
2447:   have $P[ \norm{\bar f - f_0}_n > \delta ]^{1/2} \leq \exp( - (\delta^2
2448:   - \delta)^2 / (8 \sigma^2)) \leq ( \exp(-C_2 (\log n)^4)) =
2449:   o(h^2)$ if one chooses $\delta = \log n$. Now,
2450:   using~\eqref{eq:deviaB1} with this choice of $\delta$ and $z = (\log
2451:   n)^{1 + \beta/2}$ we have also $P_n[ \mathcal Z_1(z,
2452:   \delta)^\complement]^{1/2} \leq \exp( -C_3 (\log n)^2) =
2453:   o(h^2)$. This concludes the proof of the first upper bound of
2454:   Theorem~\ref{thm:least_sq}.
2455: 
2456:   To prove the upper bound for the integrated norm $\norm{\cdot}$
2457:   instead of the empirical norm $\norm{\cdot}_n$, we decompose
2458:   $\norm{\bar f - f_0}^2 = A_1 + A_2$ where
2459:   \begin{equation*}
2460:     A_1 := \norm{\bar f - f_0}^2 - 8 ( \norm{\bar f - f_0}_n^2
2461:     + \pen(\bar f)) \text{ and } A_2 := 8 ( \norm{\bar f - f_0}_n^2 +
2462:     \pen(\bar f)).
2463:   \end{equation*}
2464:   The first part of Theorem~\ref{thm:least_sq} provides
2465:   \begin{equation*}
2466:     E^n[A_2] \leq C_1 ( 1 + |f_0|_{\mathcal F}^\alpha) n^{-2 / (2 +
2467:       \beta)}.
2468:   \end{equation*}
2469:   Recall that we assumed that $\norm{\bar f - f_0}_\infty \leq Q$
2470:   a.s. for the second part of the Theorem. To handle $A_1$, we use the
2471:   following Lemma.
2472:   \begin{lemma}
2473:     \label{lem:devia2}
2474:     Let $(\mathcal F, |\cdot|_{\mathcal F})$ and $h$ satisfy the same
2475:     assumptions as in Theorem~\ref{thm:least_sq}. Define $\mathcal F_Q
2476:     := \{ f \in \mathcal F : \norm{f - f_0}_\infty \leq Q \}$. We can
2477:     find constants $z_0, D_0 > 0$ such that for any $z \geq
2478:     z_0$\textup:
2479:     \begin{align*}
2480:       P_X^n \big[ \exists f \in \mathcal F_Q : \norm{f - f_0}^2 &- 8
2481:       (\norm{f - f_0}_n^2 + \pen(f)) \geq 10 z h^2 \big] \\
2482:       &\leq \exp \big( - D_0 n h^2 z \big),
2483:     \end{align*}
2484:     where $z_0$ and $D_0$ are constants depending on $a, \alpha,
2485:     \beta$ and $Q$.
2486:   \end{lemma}
2487:   The proof of Lemma~\ref{lem:devia2} is given in
2488:   Section~\ref{sec:lemmas_proofs}. Using together
2489:   Lemma~\ref{lem:devia2} and the fact that $A_1 \leq Q^2$ a.s., we
2490:   have by a decomposition over the union of $\{ A_1 \geq 10 z_0 h^2
2491:   \}$ and $\{ A_1 < 10 z_0 h^2 \}$:
2492:   \begin{equation*}
2493:     E^n [A_1] \leq 10 z_0 h^2 + o(h^2).
2494:   \end{equation*}
2495:   This concludes the proof of Theorem~\ref{thm:least_sq}.
2496: \end{proof}
2497: %%%% END OF PROOF %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2498: 
2499: 
2500: \begin{figure}[htbp]
2501: %% \includegraphics[width=12cm]{designs.pdf}
2502:   \begin{tikzpicture}[scale=2]
2503:     \draw[thick] (0,0) node[anchor=south] {$f_0$} circle (1); %
2504:     \draw (0,0) -- (1,0) node[anchor=west] {$f_1$}; %
2505:     \draw (0,0) -- (40:1cm) node[anchor=south west]{$f_{M-1}$}; %
2506:     \draw (0,0) -- (150:1cm) node[anchor=south east] {$f_3$}; %
2507:     \draw (0,0) -- (190:1cm) node[anchor=east]{$f_{2}$}; %
2508:     \draw (0,0) -- (290:1cm) ; %
2509:     \draw[<->, very thick] (290:1cm) -- (290:0.6cm) node[anchor=east]
2510:     {$f_M$} node[pos=0.5, right] {$h$}; %
2511:     \draw[mark=x] (50:1cm) ;
2512:   \end{tikzpicture}
2513:   \caption{Example of a setup in which ERM performs badly. The set
2514:     $F(\Lambda) = \{f_1, \ldots, f_M \}$ is the dictionary from which
2515:     we want to mimic the best element and $f_0$ is the regression
2516:     function.}
2517:   \label{fig:badsetup}
2518: \end{figure}
2519: 
2520: \begin{proof}[Proof of Theorem \ref{TheoWeaknessERMRegression}]
2521:   We consider a random variable $X$ uniformly distributed on $[0,1]$
2522:   and its dyadic representation:
2523:   \begin{equation}
2524:     \label{EquaDyadicRegression}
2525:     X = \sum_{k = 1}^{+\infty} X^{(k)} 2^{-k},
2526:   \end{equation}
2527:   where $(X^{(k)} : k \geq 1)$ is a sequence of i.i.d. random
2528:   variables following a Bernoulli $\cB(1/2,1)$ with parameter $1/2$.
2529:   The random variable $X$ is the design of the regression model worked
2530:   out here. For the regression function we take
2531:   \begin{equation}
2532:     \label{EquaRegressionFunction}
2533:     f_0(x) =
2534:     \begin{cases}
2535:       \; 2h &\text{ if } x^{(M)} = 1 \\
2536:       \; h & \text{ if } x^{(M)} = 0,
2537:     \end{cases}
2538:   \end{equation}
2539:   where $x$ has the dyadic decomposition $x=\sum_{k \geq 1}
2540:   x^{(k)}2^{-k}$ where $x^{(k)} \in \{ 0, 1 \}$ and
2541:   \begin{equation*}
2542:     h=\frac{C}{4}\sqrt{\frac{\log M}{n}}.
2543:   \end{equation*}
2544:   We consider the dictionary of functions $F_M = \{f_1, \ldots, f_M\}$ defined by
2545:   \begin{equation}
2546:     \label{FunctionBasisRegression}
2547:     f_j(x) = 2x^{(j)}-1, \quad \forall j\in\{1,\ldots,M\},
2548:   \end{equation}
2549:   where again $(x^{(j)} : j \geq 1)$ is the dyadic decomposition of $x
2550:   \in [0,1]$. The dictionary $F_M$ is chosen so that we have, for any
2551:   $j \in \{ 1, \ldots ,M-1 \}$
2552:   \begin{equation*}
2553:     \| f_j - f_0 \|_{L^2([0,1])}^2 = \frac{5 h^2}{2} + 1 \;\text{ and }\;
2554:     \|f_M - f_0 \|_{L^2([0,1])}^2 = \frac{5h^2}{2} - h + 1.
2555:   \end{equation*}
2556:   Thus, we have
2557:   \begin{equation*}
2558:     \min_{j=1,\ldots,M} \|f_j - f_0 \|_{L^2([0,1])}^2 = \|f_M - f_0
2559:     \|_{L^2([0,1])}^2 = \frac{5h^2}{2} -h + 1.
2560:   \end{equation*}
2561:   This geometrical setup for $F(\Lambda)$, which is an unfavourable
2562:   setup for the ERM, is represented in Figure~\ref{fig:badsetup}. For
2563:   \begin{equation*}
2564:     \hat{f}_n := \tilde{f}_n^{\rm PERM} \in \argmin_{f \in F_M}
2565:     \big(R_n(f) + \pen(f) \big),
2566:   \end{equation*}
2567:   where we take $R_n(f) = \frac{1}{n} \sum_{i=1}^n (Y_i-f(X_i))^2 =\|
2568:   Y - f \|^2_n$, we have
2569:   \begin{equation}
2570:     \label{InegGaussian}
2571:     E \|\hat{f}_n - f_0 \|_{L^2([0,1])}^2 =
2572:     \min_{j=1,\ldots,M} \|f_j - f_0 \|_{L^2([0,1])}^2 + h
2573:     P[\hat{f}_n\neq f_M].
2574:   \end{equation}
2575:   Now, we upper bound $P[ \hat{f}_n= f_M]$. If we define
2576:   \begin{equation*}
2577:     N_j := \frac{1}{\sqrt{n}} \sum_{i=1}^n\zeta_i^{(j)}
2578:     \varepsilon_i \text{ and } \zeta_i^{(j)} := 2X_i^{(j)}-1,
2579:   \end{equation*}
2580:   we have by the definition of $h$ and since $\zeta_i^{(j)} \in \{ -1,
2581:   1\}$:
2582:   \begin{align*}
2583:     \frac{\sqrt{n}}{2 \sigma} (\norm{Y - f_M}_n^2 &- \norm{Y -
2584:       f_j}_n^2) \\
2585:     & = N_j - N_M + \frac{h}{2 \sigma \sqrt{n}} \sum_{i=1}^n
2586:     (\zeta_i^{(j)} \zeta_i^{(M)} + 3(\zeta_i^{(j)} - \zeta_i^{(M)}) -
2587:     1) \\
2588:     &\geq N_j - N_M - \frac{4C}{\sigma} \sqrt{\log M}.
2589:   \end{align*}
2590:   This entails, for $\bar N_{M-1} := \max_{1 \leq j \leq M-1} N_j$,
2591:   that
2592:   \begin{align*}
2593:     P[ \hat{f}_n= f_M] &= P \Big[ \bigcap_{j=1}^{M-1} \Big\{ \norm{Y -
2594:       f_M}_n^2 - \norm{Y - f_j}_n^2 \leq \pen(f_j) - \pen(f_M) \Big\}
2595:     \Big] \\
2596:     &\leq P\Big[ N_M \geq \bar N_{M-1} - \frac{6C}{\sigma} \sqrt{\log
2597:       M} \Big].
2598:   \end{align*}
2599:   % \begin{eqnarray*}
2600:   %   \lefteqn{\mathbb{P}[\hat{f}_n= f_M]= \mathbb{P}[\forall
2601:   %     j=1,\ldots,M-1, A_n(f_M)+
2602:   %     {\rm{pen}}(f_M)\leq A_n(f_j)+{\rm{pen}}(f_j)]}\\
2603:   %   &=&\mathbb{P}[\forall j=1,\ldots,M-1,
2604:   %   \frac{1}{\sqrt{n}}\sum_{i=1}^n (Y_i-f_M(X_i))^2 \leq
2605:   %   \frac{1}{\sqrt{n}}\sum_{i=1}^n
2606:   %   (Y_i-f_j(X_i))^2\\
2607:   %   &&+\sqrt{n}({\rm{pen}}(f_j)-{\rm{pen}}(f_M))]\\
2608:   %   &\leq& \mathbb{P}[\forall j=1,\ldots,M-1, N_M\geq
2609:   %   N_j\\&&+\frac{1}{\sigma\sqrt{n}}\sum_{i=1}^n
2610:   %   \frac{h}{2}(\zeta_i^{(M)}\zeta_i^{(j)}-1)
2611:   %   +\frac{3h}{2}(\zeta_i^{(j)}-1)-\frac{C}{\sigma}\sqrt{\log M}],
2612:   % \end{eqnarray*}
2613:   % where for any
2614:   % $j=1,\ldots,M$,
2615:   It is easy to check that $N_1, \ldots, N_M$ are $M$ standard
2616:   Gaussian random variables that are uncorrelated (but dependent). We
2617:   denote by $\boldsymbol{\zeta}$ the family of Rademacher variables
2618:   $(\zeta_i^{(j)} : i=1,\ldots,n ; j=1,\ldots,M)$. We have for any
2619:   $6C/\sigma <\gamma< (2\sqrt{2}c^*)^{-1}$ ($c^*$ is the ``Sudakov
2620:   constant'', see Theorem~\ref{TheoSudakov}),
2621:   \begin{align}
2622:     \label{EquaSudakov}
2623:     P[\hat{f}_n = f_M] &\leq E \Big[ P\Big( N_M \geq \bar N_{M-1} -
2624:     \frac{6C}{\sigma}\sqrt{\log M} \Big| \boldsymbol{\zeta} \Big)
2625:     \Big] \nonumber \\
2626:     &\leq P \big[ N_M \geq - \gamma \sqrt{\log M}
2627:     +  E(\bar N_{M-1} | \boldsymbol{\zeta} ) \big] \\
2628:     &+ E \Big[ P\Big\{ E( \bar N_{M-1} | \boldsymbol{\zeta} ) - \bar
2629:     N_{M-1} \geq (\gamma - \frac{6C}{\sigma}) \sqrt{\log M} \Big|
2630:     \boldsymbol{\zeta} \Big\} \Big]. \nonumber
2631:   \end{align}
2632:   Conditionally on $\boldsymbol{\zeta}$, the vector
2633:   $(N_1,\ldots,N_{M-1})$ is a linear transform of the Gaussian vector
2634:   $(\varepsilon_1, \ldots, \varepsilon_n)$. Hence, conditionally on
2635:   $\boldsymbol{\zeta}$, $(N_1,\ldots,N_{M-1})$ is a Gaussian
2636:   vector. Thus, we can use a standard deviation result for the
2637:   supremum of Gaussian random vectors (see for
2638:   instance~\cite{massart03}, Chapter~3.2.4), which leads to the
2639:   following inequality for the second term of the RHS
2640:   in~\eqref{EquaSudakov}:
2641:   \begin{align*}
2642:     % \label{EquaSecondTerm}
2643:     P \Big\{ E( \bar N_{M-1} | \boldsymbol{\zeta} ) - \bar N_{M-1}
2644:     \geq (\gamma &- \frac{6C}{\sigma}) \sqrt{\log M} \Big|
2645:     \boldsymbol{\zeta}
2646:     \Big\} \\
2647:     &\leq \exp(-(3C/\sigma-\gamma/2)^2\log M).
2648:     % \mathbb{P}[\mathbb{E} [ \max_{j=1,\ldots,M-1}N_j |
2649:     % \boldsymbol{\zeta}]&-\max_{j=1,\ldots,M-1}N_j\geq
2650:     % (\gamma-2C/\sigma)\sqrt{\log
2651:     %   M}|\boldsymbol{\zeta}]\nonumber\\
2652:     % &\leq \exp(-(C/\sigma-\gamma/2)^2\log M).
2653:   \end{align*}
2654:   Remark that we used $E[ N_j^2 | \boldsymbol{\zeta}] = 1$ for any $j
2655:   = 1, \ldots, M-1$. For the first term in the RHS
2656:   of~\eqref{EquaSudakov}, we have
2657:   \begin{align}
2658:     \label{EquaIerTermSudakov}
2659:     P &\Big [N_M \geq - \gamma \sqrt{\log M}
2660:     + E( \bar N_{M-1} | \boldsymbol{\zeta} ) \Big] \nonumber\\
2661:     &\leq P \Big[N_M \geq - 2 \gamma \sqrt{\log M}
2662:     + E(\bar N_{M-1}) \Big] \\
2663:     &+P \Big[ - \gamma\sqrt{\log M} + E(\bar N_{M-1}) \geq E(\bar
2664:     N_{M-1} | \boldsymbol{\zeta}) \Big]. \nonumber
2665:   \end{align}
2666:   Next, we use Sudakov's Theorem (cf. Theorem \ref{TheoSudakov} in
2667:   Appendix~\ref{sec:appendix_proba}) to lower bound $E( \bar
2668:   N_{M-1})$. Since $(N_1,\ldots,N_{M-1})$ is, conditionally to
2669:   $\boldsymbol{\zeta}$, a Gaussian vector and since for any $1 \leq j
2670:   \neq k \leq M$ we have
2671:   \begin{equation*}
2672:     E[(N_k-N_j)^2 | \boldsymbol{\zeta}] = \frac{1}{n}
2673:     \sum_{i=1}^n (\zeta_i^{(k)} - \zeta_i^{(j)})^2
2674:   \end{equation*}
2675:   then, according to Sudakov's minoration
2676:   (cf. Theorem~\ref{TheoSudakov} in the Appendix), there exists an
2677:   absolute constant $c^* > 0$ such that
2678:   \begin{equation*}
2679:     %\label{EquaSudakPrimal}
2680:     c^* E[\bar N_{M-1} | \boldsymbol{\zeta}] \geq
2681:     \min_{1 \leq j \neq k \leq M-1} \Big(\frac{1}{n}\sum_{i=1}^n
2682:     (\zeta_i^{(k)} - \zeta_i^{(j)})^2\Big)^{1/2} \sqrt{\log M}.
2683:   \end{equation*}
2684:   Thus, we have
2685:   \begin{align*}
2686:     \label{EquaSudak3}
2687:     c^* E[\bar N_{M-1}] &\geq E\Big[ \min_{j \neq k} \Big(\frac{1}{n}
2688:     \sum_{i=1}^n (\zeta_i^{(k)} - \zeta_i^{(j)})^2
2689:     \Big)^{1/2} \Big] \sqrt{\log M} \\
2690:     &\geq \sqrt{2} \Big(1 - E\Big[ \max_{j\neq k} \frac{1}{n}
2691:     \sum_{i=1}^n \zeta_i^{(k)} \zeta_i^{(j)} \Big] \Big) \sqrt{\log M},
2692:   \end{align*}
2693:   where we used the fact that $\sqrt{x} \geq x/\sqrt{2}, \forall x \in
2694:   [0,2]$.
2695:   % \begin{equation}
2696:   %   \label{equaSudak2}
2697:   %   E\Big[\min_{k \neq j \in \{1, \ldots, M-1\} } \Big( \frac{1}{n}
2698:   %   \sum_{i=1}^n (\zeta_i^{(k)} - \zeta_i^{(j)})^2 \Big)^{1/2} \Big]
2699:   %   \geq \sqrt{2} \Big(1 - E\Big[ \max_{j\neq k} \frac{1}{n} \sum_{i=1}^n
2700:   %   \zeta_i^{(k)} \zeta_i^{(j)} \Big] \Big)
2701:   % \end{equation}
2702:   Besides, using Hoeffding's inequality we have $E[\exp(s
2703:   \xi^{(j,k)})] \leq \exp(s^2/(2n))$ for any $s > 0$, where
2704:   $\xi^{(j,k)} := n^{-1} \sum_{i=1}^n \zeta_i^{(k)} \zeta_i^{(j)}$.
2705:   Then, using a maximal inequality (cf.  Theorem~\ref{TheoMaxConcIneq}
2706:   in Appendix~\ref{sec:appendix_proba}) and since $n^{-1}
2707:   \log[(M-1)(M-2)] \leq 1/4$, we have
2708:   \begin{equation}
2709:     \label{EquaSudakFinal}
2710:     E\Big[\max_{j\neq k} \frac{1}{n} \sum_{i=1}^n
2711:     \zeta_i^{(k)} \zeta_i^{(j)} \Big] \leq
2712:     \Big(\frac{1}{n} \log[(M-1)(M-2)] \Big)^{1/2} \leq
2713:     \frac{1}{2}.
2714:   \end{equation}
2715:   This entails
2716:   \begin{equation*}
2717:     c^* E[ \bar N_{M-1} ] \geq \Big(\frac{\log M}{2} \Big)^{1/2}.
2718:   \end{equation*}
2719:   Thus, using this inequality in the first term of the RHS
2720:   of~\eqref{EquaIerTermSudakov} and the usual inequality on the tail
2721:   of a Gaussian random variable ($N_M$ is standard Gaussian), we
2722:   obtain:
2723:   \begin{align}
2724:     \label{EquaFirstTerm}
2725:     P\Big[N_M \geq &-2\gamma \sqrt{\log M}
2726:     + E(\bar N_{M-1}) \Big]
2727:     \nonumber\\
2728:     &\leq P\Big[N_M \geq
2729:     ((c^*\sqrt{2})^{-1}-2\gamma)
2730:     \sqrt{\log M}\Big]\\
2731:     &\leq \exp\Big(-((c^*\sqrt{2})^{-1}-2\gamma)^2(\log
2732:     M)/2\Big).\nonumber
2733:   \end{align}
2734:   Remark that we used $2\sqrt{2}c^* \gamma < 1$. For the second term
2735:   in (\ref{EquaIerTermSudakov}), we apply the concentration inequality
2736:   of Theorem \ref{TheoEinmahlMasson} to the non-negative random
2737:   variable $E[\bar N_{M-1}|\boldsymbol{\zeta}]$. We first have to
2738:   control the second moment of this variable. We know that,
2739:   conditionally to $\boldsymbol{\zeta}$,
2740:   $N_j|\boldsymbol{\zeta}\sim\cN(0,1)$ thus,
2741:   $N_j|\boldsymbol{\zeta}\in L_{\psi_2}$ (for more details on Orlicz
2742:   norm, we refer the reader to~\cite{vdVW:96}). Thus,
2743:   \begin{equation*}
2744:     \norm{\max_{1\leq j\leq M-1} N_j|\boldsymbol{\zeta}}_{\psi_2}\leq K
2745:     \psi_2^{-1}(M)\max_{1\leq j\leq M-1}\norm{N_j|\boldsymbol{\zeta}}_{\psi_2}
2746:   \end{equation*}
2747:   (cf. Lemma 2.2.2 in \cite{vdVW:96}). Since
2748:   $\norm{N_j|\boldsymbol{\zeta}}_{\psi_2}^2=1$, we have $\norm{\max_{1\leq j\leq M-1}
2749:     N_j|\boldsymbol{\zeta}}_{\psi_2}\leq K \sqrt{\log M}$. In particular, we have
2750:   $E\big[\max_{1\leq j\leq M-1} N_j^2|\boldsymbol{\zeta}\big]\leq
2751:   K\log M$ and so $E\big(E[\bar
2752:   N_{M-1}|\boldsymbol{\zeta}]\big)^2\leq K\log M$. Theorem
2753:   \ref{TheoEinmahlMasson} provides
2754:   \begin{equation}
2755:     \label{SecondTermEquaSuda}
2756:     P\Big[ -\gamma\sqrt{\log
2757:       M}+E[\bar N_{M-1}]\geq E[\bar N_{M-1}|\boldsymbol{\zeta}]\Big]\leq
2758:     \exp(-\gamma^2/c_0),
2759:   \end{equation}
2760:   where $c_0$ is an absolute constant.
2761: 
2762: Finally, combining (\ref{EquaIerTermSudakov}), (\ref{EquaFirstTerm}),
2763: (\ref{SecondTermEquaSuda}) and the bound on the second term of the RHS
2764: of the initial inequality (\ref{EquaSudakov}), we obtain
2765: \begin{align*}
2766: P[\hat{f}_n= f_M] &\leq \exp(-(3C/\sigma-\gamma/2)^2\log M)\\
2767: &+
2768: \exp\Big(-((c^*\sqrt{2})^{-1}-2\gamma)^2(\log M)/2\Big)+
2769: \exp(-\gamma^2/c_0).
2770: \end{align*}
2771: Take $\gamma=(12\sqrt{2}c^*)^{-1}$. It is easy to find an integer $M_0(\sigma)$ depending only on $\sigma$ such that for any $M\geq M_0$, we have $P[\hat{f}_n= f_M]\leq c_1<1$, where $c_1$ is an absolute constant.
2772: We complete the proof by using this last result in
2773: (\ref{InegGaussian}).
2774: \end{proof}
2775: %%%% END OF PROOF %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%AGGREGATION
2776: 
2777: 
2778: \begin{proof}[Proof of Theorem~\ref{thm:oracle}]
2779:   We recall that we have a dictionary (set of functions) $F(\Lambda)$
2780:   of cardinality $M$ such that $\norm{f_\lambda - f_0}_\infty \leq Q$
2781:   for all $\lambda \in \Lambda$. Let us define the risk
2782:   \begin{equation*}
2783:     R(f) := E[(Y - f(X))^2]
2784:   \end{equation*}
2785:   and the linearized risk over $F(\Lambda)$, given by
2786:   \begin{equation*}
2787:     \mathsf R(\theta) := \sum_{\lambda \in \Lambda} \theta_\lambda
2788:     R(f_\lambda)
2789:   \end{equation*}
2790:   for $\theta \in \Theta$, where we recall that
2791:   \begin{equation*}
2792:     \Theta := \{ \theta \in \mathbf R^{|\Lambda|} ; \theta_\lambda
2793:     \geq 0,\; \sum_{\lambda \in \Lambda} \theta_\lambda = 1 \}.
2794:   \end{equation*}
2795:   We denote by $R_{n}(f)$ the empirical risk of $f$ over the sample
2796:   $D_{n}$, which is given by
2797:   \begin{equation*}
2798:     R_{n}(f) := \frac{1}{n} \sum_{i=1}^n (Y_i - f(X_i))^2,
2799:   \end{equation*}
2800:   and we define similarly the linearized empirical risk
2801:   \begin{equation*}
2802:     \mathsf R_{n}(\theta) := \sum_{\lambda \in \Lambda}
2803:     \theta_\lambda R_{n}(f_\lambda).
2804:   \end{equation*}
2805:   The excess risk of a function $f$ is given by $R(f) - R(f_0) =
2806:   \norm{f - f_0}^2$. By convexity of the risk, the aggregate $\hat
2807:   {\mathsf f}= \sum_{\lambda \in \Lambda} \hat \theta_\lambda
2808:   f_\lambda$ defined in (\ref{eq:aggregate}), satisfies, for any $a >
2809:   0$,
2810:   \begin{align*}
2811:     R(\hat {\mathsf f}) - R(f_0) &\leq \mathsf R(\hat \theta) - R(f_0) \\
2812:     &\leq (1 + a) (\mathsf R_{n}(\hat \theta) - R_{n}(f_0)) \\
2813:     &+ \mathsf R(\hat \theta) - R(f_0) - (1 + a) (\mathsf R_{n}(\hat
2814:     \theta) - R_{n}(f_0)),
2815:   \end{align*}
2816:   where it is easy to see that the Gibbs weights $\hat \theta = (\hat
2817:   \theta_\lambda)_{\lambda \in \Lambda} = (\hat
2818:   \theta(f_\lambda))_{\lambda \in \Lambda}$ are the unique solution to
2819:   the minimization problem
2820:   \begin{equation*}
2821:     \min_{\theta \in \Theta} \Big\{ \mathsf R_{n}(\theta) +
2822:     \frac{T}{ n} \sum_{\lambda \in \Lambda} \theta_\lambda \log
2823:     \theta_\lambda \Big\},
2824:   \end{equation*}
2825:   where $T$ is the temperature parameter, see~\eqref{eq:weights}, and
2826:   where we use the convention $0 \log 0 = 0$. Let $\hat \lambda$ be
2827:   such that $f_{\hat \lambda}$ is the ERM in $F(\Lambda)$, namely
2828:   \begin{equation*}
2829:     R_{n}(f_{\hat \lambda}) := \min_{\lambda \in \Lambda}
2830:     R_{n}(f_\lambda).
2831:   \end{equation*}
2832:   Since
2833:   \begin{equation*}
2834:     \sum_{\lambda \in \Lambda} \hat \theta_\lambda \log \Big( \frac{\hat
2835:       \theta_\lambda}{1 / |\Lambda|} \Big) = K(\hat \theta | u) \geq 0
2836:   \end{equation*}
2837:   where $K(\hat \theta | u)$ denotes the Kullback-Leibler divergence
2838:   between the weights $\hat \theta$ and the uniform weights $u := (1 /
2839:   |\Lambda|)_{\lambda \in \Lambda}$, we have
2840:   \begin{align*}
2841:     \mathsf R_{n}(\hat \theta) &\leq \mathsf R_{n}(\hat \theta) +
2842:     \frac{T}{ n} K(\hat \theta | u) \\
2843:     &= \mathsf R_{n}(\hat \theta) + \frac{T}{n} \sum_{\lambda \in
2844:       \Lambda} \hat \theta_\lambda \log \hat \theta_\lambda +
2845:     \frac{T\log |\Lambda|}{n} \\
2846:     &\leq \mathsf R_{n}(e_{\hat \lambda}) + \frac{T\log |\Lambda|}{
2847:       n} = R_{n}(f_{\hat \lambda}) + \frac{T\log |\Lambda|}{n},
2848:   \end{align*}
2849:   where $e_\lambda \in \Theta$ is the vector with $1$ for the
2850:   $\lambda$-th coordinate and $0$ elsewhere. This gives
2851:   \begin{align*}
2852:     R(\hat {\mathsf f}) - R(f_0) &\leq (1 + a) \min_{\lambda \in \Lambda}
2853:     (R_{n}(f_\lambda) - R_{n}(f_0))+ (1 + a)
2854:     \frac{T\log |\Lambda|}{ n} \\
2855:     &+ \mathsf R(\hat \theta) - R(f_0) - (1 + a) (\mathsf R_{n}(\hat
2856:     \theta) - R_{n}(f_0)),
2857:   \end{align*}
2858:   and consequently
2859:   \begin{align*}
2860:     E \norm{\hat {\mathsf f} - f_0}^2 &\leq (1 + a) \min_{\lambda \in
2861:       \Lambda} \norm{f_\lambda - f_0}^2 + (1 + a) \frac{T\log
2862:       |\Lambda|}{n} \\
2863:     &+ E[ \mathsf R(\hat \theta) - R(f_0) - (1 + a) (\mathsf
2864:     R_{n}(\hat \theta) - R_{n}(f_0)) ].
2865:   \end{align*}
2866:   Since $\mathsf R(\cdot)$ and $\mathsf R_{n}$ are linear on
2867:   $\Theta$, we have
2868:   \begin{align*}
2869:     \mathsf R(\hat \theta) - R(f_0) &- (1 + a) (\mathsf R_{n}(\hat
2870:     \theta) - R_{n}(f_0)) \\
2871:     &\leq \max_{f \in F(\Lambda)} ( R(f) - R(f_0) - (1 + a)
2872:     (R_{n}(f) - R_{n}(f_0)) ).
2873:   \end{align*}
2874:   Thus, we have
2875:   \begin{equation}\label{eq:Main0}
2876:     E \norm{\hat {\mathsf f} - f_0}^2 \leq (1 + a)
2877:     \min_{\lambda \in \Lambda} \norm{f_\lambda - f_0}^2 + (1 + a)
2878:     \frac{T \log |\Lambda|}{n} + \mathcal R_n,
2879:   \end{equation}
2880:   where $\mathcal R_n := E [ \max_{f \in F(\Lambda)} \{ R(f) - R(f_0)
2881:   - (1 + a) (R_{n}(f) - R_{n}(f_0)) \} ] $. Now, we upper bound
2882:   $\mathcal R_n$. Introduce the random variables
2883:   \begin{align*}
2884:     \tilde{Z}_i(f) &:= (f(X_i) - f_0(X_i))^2 + 2 \sigma \varepsilon_i
2885:     I( |\varepsilon_i| \leq K) (f_0(X_i) - f(X_i)), \\
2886:     \bar Z_i(f) &:= 2 \sigma \varepsilon_i I(|\varepsilon_i| > K)
2887:     (f_0(X_i) - f(X_i)),
2888:   \end{align*}
2889:   and the two following processes indexed by $f \in F(\Lambda)$:
2890:   \begin{equation*}
2891:     \tilde{\zeta}(f) := \frac{1}{n}\sum_{i=1}^n \Big(
2892:     E[\tilde{Z}_i(f)] - (1+a) \tilde{Z}_i(f) \Big) \text{ and }
2893:     \bar{\zeta}(f) := \frac{1+a}{n} \sum_{i=1}^n\bar{Z}_i(f).
2894:   \end{equation*}
2895:   We use the symmetry of $\varepsilon$ to get
2896:   \begin{equation*}
2897:     \mathcal R_n \leq E \Big[ \max_{f \in F(\Lambda)}
2898:     \tilde{\zeta}(f) \Big] + E \Big[ \max_{f \in F(\Lambda)}
2899:     \bar{\zeta}(f) \Big].
2900:   \end{equation*}
2901:   First, we upper bound $E[ \max_{f \in F(\Lambda)}
2902:   \tilde{\zeta}(f)]$. The random variables $\tilde{Z}_i(f)$ are
2903:   bounded and satisfy the following Bernstein-type condition
2904:   (see~\cite{BM:06}): $\forall f \in F(\Lambda), E [
2905:   \tilde{Z}_i(f)^2] \leq (Q^2 + 4 \sigma^2) E[\tilde{Z}_i(f)]$. We
2906:   apply the union bound and Bernstein's inequality
2907:   (cf.~\cite{vdVW:96}) to get, for any $\delta>0$,
2908:   \begin{align*}
2909:     P \Big[\max_{f\in F(\Lambda)} \tilde{\zeta}(f) \geq \delta \Big]
2910:     &\leq \sum_{f\in F(\Lambda)} P\Big[ \frac{1}{n}\sum_{i=1}^n
2911:     E[\tilde{Z}_i(f)] - \tilde{Z}_i(f) \geq
2912:     \frac{\delta + a E[\tilde{Z}_i(f)] }{1+a} \Big] \\
2913:     &\leq M \exp(-C n \delta),
2914:   \end{align*}
2915:   where $C := a [8 (Q^2 + \sigma^2) (1 + a)^2 + (4Q / 3)(1 + a)(Q +
2916:   2K)]^{-1}$. Hence, a direct computation gives
2917:   \begin{equation}
2918:     \label{eq:I1}
2919:     E\Big[ \max_{f\in F(\Lambda)} \tilde{\zeta}(f) \Big] \leq
2920:     \frac{4 \log M}{C n}.
2921:   \end{equation}
2922:   Now, we upper bound $E [\max_{f\in F(\Lambda)}\bar{\zeta}(f) ]$. We
2923:   have
2924:   \begin{align}
2925:     \label{eq:I2}
2926:     \nonumber E \Big[ \max_{f\in F(\Lambda)} \bar{\zeta}(f) \Big]
2927:     &\leq 4 Q (1 + a) E \big[ |\varepsilon| I(|\varepsilon| > K) \big] \\
2928:     &\leq 4 Q (1 + a) \sigma P (|\varepsilon|>K)^{1/2} \\
2929:     &\leq 4Q(1+a) \sigma \exp(-K^2 / (2 b_\varepsilon^2)).
2930:   \end{align}
2931:   Finally, combining equations~\eqref{eq:Main0}, \eqref{eq:I1}
2932:   and~\eqref{eq:I2} with $K = b_\varepsilon \sqrt{2 \log n}$,
2933:   concludes the proof of Theorem~\ref{thm:oracle}.
2934: \end{proof}
2935: 
2936: %%%% END OF PROOF %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%AGGREGATION
2937: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2938: 
2939: 
2940: 
2941: % \begin{proof}[Proof of Theorem~\ref{thm:adaptive_anisotropic}]
2942: 
2943: %   Let $f_0 \in B_{p, q}^{\bs s}$ for $\bs s \in \bs S$. Consider
2944: %   $\bs s_n \in \bs S_n$ such that $\bs s_n \leq \bs s \leq \bs s_n +
2945: %   u (\log n)^{-1}$ coordinatewise, where $u = (1, \ldots, 1)$. In
2946: %   view of embedding~\eqref{eq:anisotropic_embedding}, we have $B_{p,
2947: %     q}^{\bs s} \subset B_{p, q}^{\bs s_n}$ and if $r_n(\bs s) =
2948: %   n^{-g(\bs s)}$ where
2949: %   \begin{equation*}
2950: %     g(\bs s) = g(s_1, \ldots, s_d) = \Big(2 + \sum_{i=1}^d
2951: %     \frac{d}{s_i} \Big)^{-1},
2952: %   \end{equation*}
2953: %   it is easy to see that
2954: %   \begin{equation*}
2955: %     r_n(\bs s) \leq r_n(\bs s_n) \leq \exp(d^2) r_n(\bs s).
2956: %   \end{equation*}
2957: %   The proof is then a direct consequence of the oracle inequality from
2958: %   Theorem~\ref{thm:oracle} and the upper bound for PERM from
2959: %   Theorem~\ref{thm:least_sq}. \texttt{rajouter quelques details...}
2960: % \end{proof}
2961: 
2962: 
2963: \section{Proofs of the lemmas}
2964: \label{sec:lemmas_proofs}
2965: 
2966: 
2967: \begin{proof}[Proof of Lemma~\ref{lem:logtrick}]
2968:   Since $\beta \in (0, 2)$ we have $\alpha > 2 \beta / (\beta + 2) >
2969:   \beta/2$. Thus, inequality~\eqref{eq:logtrick} gives
2970:   \begin{align*}
2971:     \log(r^2 + h^2 I^\alpha) &\leq \log(\varepsilon) + (1 -
2972:     \frac{\beta}{2}) \log(r) - (1 - \frac{\beta}{2\alpha}) \log(r^2)
2973:     \\
2974:     & - \frac{\beta}{\alpha} \log(h) + (1 - \frac{\beta}{2\alpha})
2975:     \log(r^2) + \frac{\beta}{2\alpha} \log(h^2 I^\alpha) \\
2976:     &\leq \log(\varepsilon) + (\frac{\beta}{\alpha} - 1 -
2977:     \frac{\beta}{2}) \log(r) - \frac{\beta}{\alpha} \log(h) + \log(
2978:     r^2 + h^2 I^\alpha)
2979:   \end{align*}
2980:   and consequently
2981:   \begin{equation*}
2982:     r^{1 + \beta / 2 - \beta/\alpha} \leq \varepsilon h^{-\beta/\alpha}
2983:   \end{equation*}
2984:   which entails $r \leq ( \varepsilon^{\alpha} h^{-\beta} )^{2 / (2\alpha
2985:     + \alpha \beta - 2 \beta)}$. Now, using this inequality together
2986:   with $h^2 I^\alpha \leq \varepsilon\, r^{1 - \beta / 2} I^{\beta / 2}$
2987:   provides the upper bound for $I$. The last inequality easily follows.
2988: \end{proof}
2989: 
2990: 
2991: \begin{proof}[Proof of Lemma~\ref{lem:devia2}]
2992:   [The proof consists of a \emph{peeling} of $\mathcal F$ into
2993:   subspaces with complexity controlled by Assumption~$(C_\beta)$ and
2994:   the use of Bernstein's inequality.] Let us denote for short
2995:   $\mathcal F$ instead of $\mathcal F_Q$. Since $\bar f \in \mathcal
2996:   F$, we have
2997:   \begin{align*}
2998:     P \big[ \norm{\bar f &- f_0}^2 - 8 (\norm{\bar f - f_0}_n^2 +
2999:     \pen(\bar f)) \geq 10 z h^2 \big] \\
3000:     &\leq P \big[ \exists f \in \mathcal F : \norm{f - f_0}^2 - 8
3001:     ( \norm{f - f_0}_n^2 + \pen(f) ) \geq 10 z h^2 \big] \\
3002:     &\leq P[A_1] + \sum_{k \geq 2} P[A_k],
3003:   \end{align*}
3004:   where
3005:   \begin{align*}
3006:     A_1 := \big\{ \exists f &\in \mathcal F,\;\pen(f) \leq 2^{\alpha /
3007:       \beta} h^2 : \\
3008:     &\norm{f - f_0}^2 - 8 (\norm{f - f_0}_n^2 + \pen(f) ) \geq 10 z
3009:     h^2 \big\}
3010:   \end{align*}
3011:   and for $k \geq 2$,
3012:   \begin{align*}
3013:     A_k := \big\{ \exists f \in \mathcal F,\; &2^{\alpha (k-1) /
3014:       \beta} h^2 < \pen(f) \leq 2^{\alpha k / \beta} h^2 : \\
3015:     &\norm{f - f_0}^2 - 8 (\norm{f - f_0}_n^2 + \pen(f) ) \geq 10 z
3016:     h^2 \big\}.
3017:   \end{align*}
3018:   Hence, since $z \geq z_0 \geq 1$ and $\alpha / \beta > 2 / (\beta +
3019:   2) > 1/2$ (recall that $\beta < 2$), we have $P[A_k] \leq P_k$ for
3020:   any $k \geq 1$, where
3021:   \begin{align*}
3022:     P_k := P \big[ \exists f \in \mathcal F,\; &\pen(f) \leq 2^{\alpha
3023:       k / \beta} h^2 : \\
3024:     &\norm{f - f_0}^2 - 8 \norm{f - f_0}_n^2 \geq 2 z h^2 + 4 \cdot
3025:     2^{\alpha k / \beta} h^2 \big].
3026:   \end{align*}
3027:   Now, let $F(\delta, k)$ be a minimal $\delta$-covering for the norm
3028:   $\norm{\cdot}_\infty$ of the set
3029:   \begin{equation*}
3030:     \{ f \in \mathcal F : \pen(f) \leq 2^{\alpha k / \beta} h^2 \} =
3031:     \{ f \in \mathcal F : |f|_{\mathcal F} \leq 2^{k /\beta} \},
3032:   \end{equation*}
3033:   where we recall that $\pen(f) = h^2 |f|_{\mathcal
3034:     F}^\alpha$. Assumption~$(C_\beta)$ entails
3035:   \begin{equation}
3036:     \label{eq:covering1}
3037:     | F(\delta, k) | \leq \exp ( D 2^{k} \delta^{-\beta} ).
3038:   \end{equation}
3039:   Since for any $f_1, f_2 \in \mathcal F$ such that $\norm{f_1 -
3040:     f_2}_\infty \leq \delta$, we have
3041:   \begin{equation*}
3042:     \norm{f_1 - f_0}^2 \leq 2\norm{f_2 - f_0}^2 + 2 \delta^2 \quad
3043:     \text{ and } \quad 2
3044:     \norm{f_1 - f_0}_n^2 \geq 2\norm{f_2 - f_0}_n^2 - 2 \delta^2,
3045:   \end{equation*}
3046:   we obtain
3047:   \begin{align*}
3048:     P_k &\leq P \big[ \exists f \in F(\delta, k) : 2 \norm{f - f_0}^2
3049:     - 4 \norm{f - f_0}_n^2 + 6 \delta^2 \geq 2 z h^2 + 4 \cdot 2^{\alpha k /
3050:       \beta} h^2 \big] \\
3051:     &\leq \sum_{f\in F(\delta, k)} P \big[ \norm{f - f_0}^2 - \norm{f -
3052:       f_0}_n^2 \geq t_k(z) \big],
3053:   \end{align*}
3054:   where $t_k(z) := z h^2 / 2 + 2^{\alpha k / \beta} h^2 - 3 \delta^2 /
3055:   2 + \norm{f - f_0}^2 / 2$. Let $f \in F(\delta, k)$ be fixed. We
3056:   introduce the random variables $U_i := (f(X_i) - f_0(X_i))^2$, so
3057:   that $\norm{f - f_0}_n^2 = \sum_{i=1}^n U_i / n$ and $E[U_1] =
3058:   \norm{f - f_0}^2$. Note that the $U_i$ are independent, such that $0
3059:   \leq U_i \leq Q^2$, and $\var [U_1] \leq E [U_1^2] \leq Q^2 E [U_1]
3060:   \leq Q^2 \norm{f - f_0}^2$. Hence, if $t_k(z) \geq \norm{f - f_0}^2
3061:   / 2$, Bernstein's inequality entails
3062:   \begin{align*}
3063:     P \big[ \norm{f - f_0}^2 &- \norm{f - f_0}_n^2 \geq t_k(z) \big]
3064:     = P \Big[ \sum_{i=1}^n (U_i - E [U_1]) \geq n t_k(z) \Big] \\
3065:     &\leq \exp \Big( \frac{-n t_k(z)^2}{2( Q^2 \norm{f -
3066:         f_0}^2 + Q^2 t_k(z) / 3)} \Big) \\
3067:     &\leq \exp \Big( \frac{-3 n ( z h^2 + 2^{\alpha k / \beta +1} h^2
3068:       - 3 \delta^2 )}{28 Q^2} \Big).
3069:   \end{align*}
3070:   By taking $\delta := (2^{\alpha k / \beta} h^2 / 3)^{1/2}$, we have
3071:   $t_k(z) \geq \norm{f - f_0}^2 / 2$ and \eqref{eq:covering1} becomes
3072:   \begin{equation*}
3073:     | F(\delta, k) | \leq \exp \Big( D_1 n h^2 2^{k(1 - \alpha / 2)}
3074:     \Big),
3075:   \end{equation*}
3076:   where we used~\eqref{eq:bandwidth} and took $D_1 := D 3^{\beta / 2}
3077:   / a^{\beta + 2}$. Hence, for $D_2 := 3 / (28 Q^2)$, we have
3078:   \begin{equation*}
3079:     P_k \leq \exp\Big( D_1 n h^2 2^{k (1 - \alpha / 2)} - D_2 n h^2 (z +
3080:     2^{\alpha k / \beta}) \Big).
3081:   \end{equation*}
3082:   Now, we choose
3083:   \begin{equation*}
3084:     K := \Big[ \frac{\log (\min(D_2 / D_1, 1) / 2)}{(1 - \alpha / 2 - \alpha
3085:       / \beta) \log 2} \Big] + 1,
3086:   \end{equation*}
3087:   where $[x]$ is the integer part of $x$, and where we recall that
3088:   $\alpha > 2 \beta / (\beta + 2)$, so that $1 - \alpha / 2 - \alpha /
3089:   \beta < 0$. The conclusion of the proof follows easily by the
3090:   decomposition $\sum_{k \geq 1} P_k = \sum_{1 \leq k < K} P_k +
3091:   \sum_{k \geq K} P_k$, if $z \geq z_1$ for the choice $z_1 := 2 (
3092:   2^{K \alpha / \beta} - D_1 2^{K(1 - \alpha / 2)} / D_2)$.
3093: \end{proof}
3094: 
3095: % \begin{align*}
3096: %   \exp( -D_Q n h^2 2^k) \exp\big( -n h^2 ( D_Q z - D 6^{1/(2s)}
3097: %   \alpha^{-(2+1/s)}) \big).
3098: % \end{align*}
3099: % \begin{equation*}
3100: %   P \big[ \norm{f^* - f}^2 - \norm{f^* - f}_n^2 \geq t_k \big] \leq
3101: % \end{equation*}
3102: % thus
3103: % \begin{equation*}
3104: %   P_k \leq \exp\big( D 6^{1/(2s)} h^{-1/s} -D_Q n h^2 ( z + 2^k )
3105: %   \big),
3106: % \end{equation*}
3107: % for any $k \geq 1$. But since $h \geq \alpha n^{-s / (2s + 1)}$, we
3108: % have
3109: % \begin{equation*}
3110: %   P_k \leq
3111: % \end{equation*}
3112: % This gives
3113: % \begin{equation*}
3114: %   \sum_{k \geq 1} P_k \leq \exp\big( -n h^2 (D_Q (z+1) - D 6^{1/(2s)}
3115: %   \alpha^{-(2+1/s)} ) \big),
3116: % \end{equation*}
3117: % which entails Lemma~\ref{lem:devia2} for $z_0$ given by
3118: % \begin{equation*}
3119: %   z_0 := \max\Big(0, \frac{D 6^{1/(2s)} \alpha^{-(2+1/s)} + 1}{D_Q} -
3120: %   1\Big). \qedhere
3121: % \end{equation*}
3122: % Let $K' \in \mathbb N$ be such that $D \alpha^{-1} (3 /
3123: % \alpha)^{1/(2s)} + 1 \leq D_Q 2^k$, and take $K := \max(4,
3124: % K')$. This choice entails
3125: % \begin{equation*}
3126: %   \sum_{k \geq K} P_k \leq \exp( -D_Q n h z) \sum_{k \geq K} \exp( -k
3127: %   n h ) \leq 2^{-1} \exp( -n h (D_Q z + K) ).
3128: % \end{equation*}
3129: % Now, for $k < K$, we have for any $z \geq z_0$, where
3130: % \begin{equation*}
3131: %   z_0  := \max(0, D_Q^{-1} (2^{K/2} D \alpha^{-1} (3 / \alpha)^{1 /
3132: %     (2s) } + 1) - 2),
3133: % \end{equation*}
3134: % that
3135: % \begin{equation*}
3136: %   \sum_{1 \leq k \leq K} P_k \leq K \exp(-n h),
3137: % \end{equation*}
3138: % hence which concludes the proof of Lemma~\ref{lem:devia2}.  \hfill
3139: % $\square$
3140: 
3141: 
3142: % \subsection*{}
3143: 
3144: 
3145: 
3146: 
3147: 
3148: % \begin{lemma}
3149: %   \label{lem:spline_bounded}
3150: %   Let $P_X$ be such that $|\supp P_X| > s$ \textup(the support
3151: %   contains at least $s+1$ points.\textup) Let $f \in W_s$ be such that
3152: %   $\norm{f - f_0}_m \leq \delta$ for some $\delta > 0$ and some
3153: %   function $f_0$.\textup) Then, we can find positive constants $C_0,
3154: %   C, D$ such that
3155: %   \begin{equation*}
3156: %     P\big[ \norm{f}_\infty > C_0( \delta + \norm{f_0}_m + J(f)) \big]
3157: %     \geq C \exp( -D n ).
3158: %   \end{equation*}
3159: % \end{lemma}
3160: 
3161: 
3162: % \begin{proof}[Proof of Lemma~\ref{lem:spline_bounded}]
3163: %   Since $f \in W_s$, we can write using the Sobolev-embedding theorem
3164: %   that
3165: %   \begin{equation*}
3166: %     f = f_1 + f_2
3167: %   \end{equation*}
3168: %   where $f_1 = \sum_{|\alpha| < s} b_\alpha x^{\alpha}$ and $f_2$ is
3169: %   such that $\norm{f_2}_\infty \leq J(f_2) = J(f)$. Moreover, we have
3170: %   \begin{equation*}
3171: %     \norm{f_1}_\infty \leq \norm{b}_\infty \leq C(s) (b^{\top} b )^{1/2},
3172: %   \end{equation*}
3173: %   where $b = (b_\alpha)_{|\alpha| < s}$. For $p = (p_1, \ldots, p_d)$
3174: %   and $q = (q_1, \ldots, q_d)$ such that $|p| < s$ and $|q| < s$, let
3175: %   us introduce the matrices $A_m$ and $A$ with entries
3176: %   \begin{equation*}
3177: %     (A_m)_{p,q} = \int x^{p+q} P_X^m(dx), \quad (A)_{p,q} = \int
3178: %     x^{p+q} P_X(dx).
3179: %   \end{equation*}
3180: %   The matrix $A$ is positive definite. Indeed, otherwise, we can find
3181: %   a vector $b$ such that
3182: %   \begin{equation*}
3183: %     0 = b^{\top} A b = E \Big[ \Big( \sum_{|\alpha| < s} b_\alpha
3184: %     X^{\alpha} \Big)^2 \Big],
3185: %   \end{equation*}
3186: %   which entails that the polynomial $\sum_{|\alpha| < s} b_\alpha
3187: %   x^{\alpha}$ is zero for almost every $x \in \supp P_X$, which is not
3188: %   possible since we assumed that $|\supp(P_X)| > s$. Then, let us
3189: %   denote by $\lambda(A) > 0$ the smallest eigenvalue of $A$. On the
3190: %   event $\{ \norm{A_m - A}_\infty \leq \lambda(A)/2 \}$, we have
3191: %   \begin{equation*}
3192: %     b^{\top} b \leq \lambda(A)^{-1} (b^{\top} A_m  b + b^{\top} b
3193: %     \lambda(A) / 2 ),
3194: %   \end{equation*}
3195: %   which entails
3196: %   \begin{equation*}
3197: %     b^{\top} b \leq 2 \lambda(A)^{-1} b^{\top} A_m b = 2
3198: %     \lambda(A)^{-1} \norm{f_1}_m^2.
3199: %   \end{equation*}
3200: %   Now, since $\norm{f - f_0}_m \leq \delta$, we have
3201: %   \begin{equation*}
3202: %     \norm{f_1}_m \leq \norm{f}_m + \norm{f_2}_m \leq \delta +
3203: %     \norm{f_0}_m + J(f),
3204: %   \end{equation*}
3205: %   and putting all this together, this gives that on $\{ \norm{A_m -
3206: %     A}_\infty \leq \lambda(A)/2 \}$:
3207: %   \begin{equation*}
3208: %     \norm{f}_\infty \leq C_0 (\delta + \norm{f_0}_m + J(\bar f)),
3209: %   \end{equation*}
3210: %   where $C_0 := (2 C(s) \lambda(A)^{-1} )^{1/2}$. By Hoeffding's
3211: %   inequality, we have
3212: %   \begin{equation*}
3213: %     P[ \norm{A_m - A}_\infty > \lambda(A)/2 ] \leq C(s)^2 \exp( -D n)
3214: %   \end{equation*}
3215: %   with $D := \lambda(A)^2 / (8 M_X^2)$, where $M_X$ is the radius of
3216: %   the support of $P_X$. This concludes the proof of the Lemma.
3217: % \end{proof}
3218: 
3219: 
3220: \appendix
3221: 
3222: \section{Function spaces}
3223: \label{sec:appendix_approximation}
3224: 
3225: In this section we give precise definitions of the spaces of functions
3226: considered in the paper, and give useful related results. The
3227: definitions and results presented here can be found
3228: in~\cite{triebel06}, in particular in Chapter~5 which is about
3229: anisotropic spaces, anisotropic multiresolutions, and entropy numbers
3230: of the embeddings of such spaces (see Section~5.3.3) that we use in
3231: particular to derive condition $(C_\beta)$, for the anisotropic Besov
3232: space, see Section~\ref{sec:pena_least_squares}.
3233: 
3234: % If $\bs k
3235: % = (k_1, \ldots, k_d)$ with $k_i \geq 0$ we define the \emph{iterated
3236: %   difference} by
3237: % \begin{equation*}
3238: %   \Delta_h^{\bs k} f(x) = \Delta_{h_1 e_1}^{k_1} \circ \cdots \circ
3239: %   \Delta_{h_d e_d}^{k_d} f(x)
3240: % \end{equation*}
3241: 
3242: \subsection{Anisotropic Besov space}
3243: 
3244: Let $\{ e_1, \ldots, e_d \}$ be the canonical basis of $\mathbb R^d$
3245: and $\bs s = (s_1, \ldots, s_d)$ with $s_i > 0$ be a vector of
3246: directional smoothness, where $s_i$ corresponds to the smoothness in
3247: direction $e_i$. Let us fix $1 \leq p, q \leq \infty$. If $f$ is a
3248: function on $\mathbb R^d$, we define $\Delta_h^k f$ as the
3249: \emph{difference} of order $k \geq 1$ and step $h \in \mathbb R^d$,
3250: given by $\Delta_h^1 f(x) = f(x + h) - f(x)$ and $\Delta_h^k f(x) =
3251: \Delta_h^1(\Delta_h^{k-1}f)(x)$ for any $x \in \mathbb R^d$. We say
3252: that $f \in L^p(\mathbb R^d)$ belongs to the anisotropic Besov space
3253: $B_{p, q}^{\bs s}(\mathbb R^d)$ if the semi-norm
3254: \begin{equation*}
3255:   |f|_{B_{p, q}^{\bs s}(\mathbb R^d)} := \sum_{i=1}^d \Big(
3256:   \int_0^1 (t^{-s_i} \norm{\Delta_{t e_i}^{k_i} f}_{p})^q
3257:   \frac{dt}{t} \Big)^{1/q}
3258: \end{equation*}
3259: is finite (with the usual modifications when $p = \infty$ or $q =
3260: \infty$). We know that the norms
3261: \begin{equation*}
3262:   \norm{f}_{B_{p, q}^{\bs s}} := \norm{f}_p + |f|_{B_{p, q}^{\bs s}}
3263: \end{equation*}
3264: are equivalent for any choice of $k_i > s_i$. An equivalent definition
3265: of the seminorm can be given using the directional differences and the
3266: anisotropic distance, see Theorem~5.8 in~\cite{triebel06}.
3267: % To make the presentation simple, we first define on $\mathbb R^d$
3268: % and then on some domain $\Omega \subset \mathbb R^d$.
3269: Following Section~5.3.3 in~\cite{triebel06}, we can define the
3270: anisotropic Besov space on an arbitrary domain $\Omega \subset \mathbb
3271: R^d$ (think of $\Omega$ as the support of the design $X$) in the
3272: following way. We define $B_{p, q}^{\bs s}(\Omega)$ as the set of all
3273: $f \in L^p(\Omega)$ such that there is $g \in B_{p, q}^{\bs s}(\mathbb
3274: R^d)$ with restriction $g | \Omega$ to $\Omega$ equal to $f$ in
3275: $L^p(\Omega)$. Moreover,
3276: \begin{equation*}
3277:   \norm{f}_{B_{p, q}^{\bs s}(\Omega)} = \inf_{g : g|\Omega = f}
3278:   \norm{g}_{B_{p, q}^{\bs s}(\mathbb R^d)},
3279: \end{equation*}
3280: where the infimum is taken over all $g \in B_{p, q}^{\bs s}(\mathbb
3281: R^d)$ such that $g | \Omega = f$. In an equivalent way, the space
3282: $B_{p, q}^{\bs s}(\Omega)$ can be defined using intrinsic
3283: characterisations by differences, see Section~4.1.4
3284: in~\cite{triebel06}, where the idea is, roughly, to restrict the
3285: increments $h$ in the differences $\Delta_h^k$ so that the support of
3286: $\Delta_h^k f$ is included in $\Omega$.
3287: 
3288: In what follows, we shall remove from the notations the dependence on
3289: $\Omega$, since it does not affect the definitions and results
3290: below. Moreover, for what we need in this paper, we shall simply take
3291: $\Omega$ as the support of the design $X$. Several explicit particular
3292: cases for the space $B_{p, q}^{\bs s}$ are of interest. If $\bs s =
3293: (s, \ldots, s)$ for some $s > 0$, then $B_{p, q}^{\bs s}$ is the
3294: standard isotropic Besov space. When $p = q = 2$ and $\bs s = (s_1,
3295: \ldots, s_d)$ has integer coordinates, $B_{2, 2}^{\bs s}$ is the
3296: anisotropic Sobolev space
3297: \begin{equation*}
3298:   B_{2, 2}^{\bs s} = W_2^{\bs s} = \Big\{ f \in L^2 : \sum_{i=1}^d
3299:   \Big\| \frac{\partial^{s_i} f}{\partial x_i^{s_i}} \Big\|_2 < \infty
3300:   \Big\}.
3301: \end{equation*}
3302: If $\bs s$ has non-integer coordinates, then $B_{2, 2}^{\bs s}$ is the
3303: anisotropic Bessel-potential space
3304: \begin{equation*}
3305:   H^{\bs s} = \Big\{ f \in L^2 : \sum_{i=1}^d \Big\| (1 +
3306:   |\xi_i|^2)^{s_i/2} \hat f(\xi) \Big\|_2 < \infty \Big\}.
3307: \end{equation*}
3308: 
3309: 
3310: The results described in the next section are direct consequences of
3311: the transference method, see Section~5.3 in~\cite{triebel06}. Roughly,
3312: the idea is to transfer problems for anisotropic spaces via sequence
3313: spaces (one can think of sequences of wavelet coefficients, for instance)
3314: to isotropic spaces. This technique allows one to prove the statements
3315: below. Note that another technique of proof based on replicant coding
3316: can be used, see~\cite{kerk_picard_replicant_03}. This is commented
3317: below.
3318: 
3319: \subsection{Embeddings and entropy numbers}
3320: 
3321: % Let $\bs \sigma = (\sigma_1, \ldots, \sigma_d)$ be a fixed vector with
3322: % $\sigma_i > 0$ and harmonic mean equal to $1$, that is $\sum_{i=1}^d 1
3323: % / \sigma_i = d$. If $s > 0$, we denote for short $s \bs \sigma = (s
3324: % \sigma_1, \ldots, s \sigma_d)$.
3325: 
3326: % Using together Theorems~5.28 and~1.97 in \cite{triebel06}, we have the
3327: % following statements. If $0 < s_1 < s_0$, we have
3328: % \begin{equation}
3329: %   \label{eq:embedding1}
3330: %   B_{p, q}^{s_0 \bs \sigma} \subset B_{p, q}^{s_1 \bs \sigma}.
3331: % \end{equation}
3332: 
3333: Let us first mention the following obvious embedding, which is useful
3334: for the proof of the adaptive upper bound (see
3335: Section~\ref{sec:derive_adaptive}). If $0 < \bs s_1 \leq \bs s_0$
3336: coordinatewise, that is $0 < s_{1, i} \leq s_{0, i}$ for any $i \in \{
3337: 1, \ldots, d \}$, we have
3338: \begin{equation}
3339:   \label{eq:anisotropic_embedding}
3340:   B_{p, q}^{\bs s_0} \subset B_{p, q}^{\bs s_1}.
3341: \end{equation}
3342: This simply follows from the fact that $B_{p, q}^{\bs s} =
3343: \cap_{i=1}^d B_{p, q, i}^{s_i}$, where $B_{p, q, i}^{s_i}$ is the
3344: corresponding Besov space in the $i$-th direction of coordinates, with
3345: norm $L^p$ extended to the other $d-1$ directions (see Remark~5.7 in
3346: \cite{triebel06}) together with the standard embedding for the
3347: isotropic Besov space.
3348: 
3349: % \subsection{Entropy numbers}
3350: 
3351: As we mentioned above, Assumption~$(C_\beta)$ (see
3352: Section~\ref{sec:pena_least_squares}) is satisfied for nearly all
3353: smoothness spaces considered in the nonparametric literature. In
3354: particular, if $\mathcal F = B_{p,q}^{\bs s}$ is the anisotropic Besov
3355: space defined above, $(C_\beta)$ is satisfied: it is a consequence of
3356: a more general Theorem (see Theorem~5.30 in \cite{triebel06})
3357: concerning the entropy numbers of embeddings (see Definition~1.87 in
3358: \cite{triebel06}). Here, we only give a simplified version of this
3359: Theorem, which is sufficient to derive $(C_\beta)$. Indeed, if one
3360: takes $\bs s_0 = \bs s$, $p_0 = p$, $q_0 = q$ and $\bs s_1 = 0$, $p_1
3361: = \infty$, $q_1 = \infty$ in Theorem~5.30 from \cite{triebel06}, we
3362: obtain the following
3363: \begin{theorem}
3364:   \label{thm:anisotropic_entropy}
3365:   Let $1 \leq p, q \leq \infty$ and $\bs s = (s_1, \ldots, s_d)$ where
3366:   $s_i > 0$\textup, and let $\bs {\bar s}$ be the harmonic mean of
3367:   $\bs s$ \textup(see~\eqref{eq:harmonic_mean}\textup). Whenever $\bs
3368:   {\bar s} > d / p$\textup, we have
3369:   \begin{equation*}
3370:     B_{p, q}^{\bs s} \subset C(\Omega),
3371:   \end{equation*}
3372:   where $C(\Omega)$ is the set of continuous functions on
3373:   $\Omega$\textup, and for any $\delta > 0$\textup, the sup-norm
3374:   entropy of the unit ball of the anisotropic Besov space\textup,
3375:   namely the set
3376:   \begin{equation*}
3377:     U_{p, q}^{\bs s} := \{ f \in B_{p, q}^{\bs s} :
3378:     |f|_{B_{p,q}^{\bs s}} \leq 1 \}
3379:   \end{equation*}
3380:   satisfies
3381:   \begin{equation}
3382:     H_\infty(\delta, U_{p, q}^{\bs s}) \leq D \delta^{-d / \bs {\bar s}},
3383:   \end{equation}
3384:   where $D > 0$ is a constant independent of $\delta$.
3385: \end{theorem}
3386: 
3387: For the isotropic Sobolev space, Theorem~\ref{thm:anisotropic_entropy}
3388: was obtained in the key paper~\cite{birman_solomjak67} (see
3389: Theorem~5.2 therein), and for the isotropic Besov space, it can be
3390: found, among others, in~\cite{birge_massart00}
3391: and~\cite{kerk_picard_replicant_03}.
3392: 
3393: \begin{remark}
3394:   A more constructive computation of the entropy of anisotropic Besov
3395:   spaces can be done using the replicant coding approach, which is
3396:   done for Besov bodies in~\cite{kerk_picard_replicant_03}. Using this
3397:   approach together with an anisotropic multiresolution analysis based
3398:   on compactly supported wavelets or atoms, see Section~5.2
3399:   in~\cite{triebel06}, we can obtain a direct computation of the
3400:   entropy. The idea is to do a quantization of the wavelet
3401:   coefficients, and then to code them using a replication of their
3402:   binary representation, and to use 01 as a separator (so that the
3403:   coding is injective). A lower bound for the entropy can be obtained
3404:   as an elegant consequence of Hoeffding's deviation inequality for
3405:   sums of i.i.d. variables and a combinatorial lemma.
3406: \end{remark}
3407: 
3408: % \texttt{faudra rajouter les jackson et bernstein estimates pour la
3409: %   borne inf sur besov anisotropes}
3410: 
3411: 
3412: % \begin{theorem}[Birg\'e and Massart (2000), Corollary~1]
3413: %   \label{thm:birge_massart}
3414: 
3415: % \end{theorem}
3416: 
3417: % \begin{remark}
3418: %   When $p=2$ and $s \in \mathbb N_0$, we recover the result
3419: %   from~\cite{birman_solomjak67}, namely
3420: %   \begin{equation*}
3421: %     N\big( \delta, W_s(R), \norm{\cdot}_{L^q} \big) \leq
3422: %     \exp\Big( D \Big( \frac{L}{\delta} \Big)^{d/s} \Big),
3423: %   \end{equation*}
3424: %   where $W_s(R) := \{ f \in W_s : J_s(f) \leq R \}$,
3425: %   see~\eqref{eq:usual_roughness}.
3426: % \end{remark}
3427: % The result from~\cite{birman_solomjak67} was previously used
3428: % in~\cite{mammen_vandegeer97}, for estimation in partial linear
3429: % models. % In~\cite{birman_solomjak67}, it is stated in a more general
3430: % % setting, for any $L^q$-norm with $1 \leq q \leq +\infty$.
3431: % The fact that this result holds for the $L^q$-norm, $q = \infty$
3432: % included, is important here. Indeed, a cover for $L^\infty$-norm is
3433: % also a cover for both the $L^2(P_X)$ and $L^2(P_X^m)$ norms (simply
3434: % write that $\norm{f}_{L^2(P_X)} \leq \norm{f}_\infty$ and
3435: % $\norm{f}_{L^2(P_X^m)} \leq \norm{f}_\infty$.)
3436: 
3437: % \texttt{Besov sur un domaine plutot ??}
3438: 
3439: % \subsection{Multiscale setting}
3440: 
3441: % Let $M$ be a dilatation matrix in $\mathcal M_d(\mathbb Z)$, namely a
3442: % matrix with integer entries and eigenvalues outside the unit disk. Let
3443: % $\varphi \in H^s$. We say that $\varphi$ is a \emph{$M$-scaling
3444: %   function} if it is compactly supported, if $\int_{\mathbb R^d}
3445: % \varphi(x) dx = 1$ and if
3446: % \begin{itemize}
3447: % \item there is a finite sequence of complex numbers $(h_k)_{k \in
3448: %     \mathbb Z^d}$ such that
3449: %   \begin{equation*}
3450: %     \varphi(x) = |\det M|^{1/2} \sum_{k \in \mathbb Z^d} h_k
3451: %     \varphi(M x - k) ;
3452: %   \end{equation*}
3453: % \item $\{ \varphi(\cdot - k) \}_{k \in \mathbb Z^d}$ is a Riesz basis
3454: %   for the space it spans.
3455: % \end{itemize}
3456: % Two $M$-scaling functions $\varphi$ and $\tilde \varphi$ are
3457: % \emph{biorthogonal $M$-scaling functions} if the systems $\{
3458: % \varphi(\cdot - k) \}_{k \in \mathbb Z^d}$ and $\{ \tilde
3459: % \varphi(\cdot - k) \}_{k \in \mathbb Z^d}$ are orthogonal.
3460: 
3461: % The construction of \emph{compactly supported} $M$-scaling functions
3462: % for an arbitrary dilatation matrix is a very difficult subject of
3463: % current research. Indeed, even in one-dimension, when $M = m$ is not
3464: % integer, there is no scaling functions with compact support, see
3465: % [1].
3466: 
3467: % If $M = \diag(m_1, \ldots, m_d)$ where the $m_i \geq 2$ are integers,
3468: % we can construct biorthogonal $M$-scaling functions using tensor
3469: % products of one dimensional $m_i$-scaling functions $\varphi_i, \tilde
3470: % \varphi_i \in H^s(\mathbb R)$ for an arbitrary large smoothness
3471: % $s$. The construction of biorthogonal compactly supported
3472: % one-dimensional $m$-scaling functions for any integer $m \geq 2$ can
3473: % be found in ???? Then, can we simply consider
3474: % \begin{equation*}
3475: %   \varphi(x) = \prod_{i=1}^d \varphi_i(x_i) \text{ and } \tilde
3476: %   \varphi(x) = \prod_{i=1}^d \tilde \varphi_i(x_i)
3477: % \end{equation*}
3478: % to obtain compactly supported biorthogonal $M$-scaling functions. Let
3479: % us consider the matrix
3480: % \begin{equation}
3481: %   \label{eq:particular_dilatation_matrix}
3482: %   M = \diag(\lambda^{1 / \sigma_1}, \ldots,
3483: %   \lambda^{1 / \sigma_d}).
3484: % \end{equation}
3485: % The following Lemma can be found in
3486: % \begin{lemma}[see Lemma 3.2 in  ????]
3487: %   Let $\bs \sigma = (\sigma_1, \ldots, \sigma_d) > 0$. The following
3488: %   conditions are equivalent:
3489: %   \begin{itemize}
3490: %   \item There is a number $\lambda > 1$ such that $\lambda^{1 /
3491: %       \sigma_i} \in \mathbb Z_+$ for $1 \leq i \leq d$
3492: %   \item There is a number $\mu > 0$ such that $(1 / \sigma_1, \ldots,
3493: %     1 / \sigma_d) \in \mu \log \mathbb Z_+^d$.
3494: %   \end{itemize}
3495: % \end{lemma}
3496: % Thus, we know that when
3497: % \begin{equation}
3498: %   \label{eq:anisotropic_restriction}
3499: %   \Big( \frac{1}{\sigma_1}, \ldots, \frac{1}{\sigma_d} \Big) \in \mu
3500: %   \log \mathbb Z_+^d
3501: % \end{equation}
3502: % for some $\mu > 0$, we can find compactly supported biorthogonal
3503: % $M$-scaling functions. A multiresolution analysis of $L^\pi$ for $1
3504: % \leq \pi \leq \infty$ based on such scaling functions can be easily
3505: % construted, in the same way as in the dyadic case where $m_i =
3506: % 2$). This is explained in details in ????. We define $\varphi_{j,
3507: %   k}(x) := |\det M|^{j / \pi} \varphi(M^j x - k)$ where $j \in \mathbb
3508: % Z$ is the resolution level and $k \in \mathbb Z^d$ is the localization
3509: % parameter. When $M$ is given
3510: % by~\eqref{eq:particular_dilatation_matrix} we can write
3511: % \begin{equation*}
3512: %   \varphi_{j, k}(x) := \lambda^{j d / p} \varphi(M^j x - k).
3513: % \end{equation*}
3514: % These dilated and translated scaling functions are normalized in
3515: % $L^\pi$ (if $\pi = \infty$, take $\pi = 1$ in the above definition and
3516: % take a scaling function \texttt{c'est faux !!!} divide $\varphi_{j,
3517: %   k}$ by $\norm{\varphi}_\infty$, so that $\norm{\varphi_{j,
3518: %     k}}_\infty = 1$). If we define for $j \in \mathbb Z$
3519: % \begin{equation*}
3520: %   V_j = \overline{ \Span\{ \varphi_{j, k} : k \in \mathbb Z^d \} }
3521: % \end{equation*}
3522: % which is the closure of the $\Span$ of the $\varphi_{j, k}$ in
3523: % $L^\pi$, then $(V_j)_{j \in \mathbb Z_d}$ is a multiresolution
3524: % analysis of $L^\pi$ (again, if $\pi = \infty$ then $L^\infty$ is
3525: % replaced by $C(\mathbb R^d)$). We can define in the same way dilated
3526: % and translated scaling functions $\tilde \varphi_{j, k}$, and
3527: % construct as a consequence multiresolution analysis of $L^\pi$.
3528: 
3529: % A remark of first importance in what follows is then the following: if
3530: % $x$ is fixed, then $\varphi_{e, j, k}$ \texttt{mettre ca apres la MRA}
3531: % $K_j$ of cardinaly $|K_j| \approx \lambda^{j d}$ (recall that by
3532: % construction $\lambda$ is an integer).
3533: 
3534: % $\beta_{e, j, k} := \langle f, \tilde \psi_{e, j, k} \rangle$
3535: 
3536: % $E = \{ 1, \ldots, m \}$
3537: 
3538: % \texttt{apres la MRA:}
3539: % For any $f \in L^\pi$
3540: % \begin{equation*}
3541: %   \Big\| \sum_{e \in E, k \in K_j} \beta_{e, k} \psi_{e, j, k}
3542: %   \Big\|_{L^\pi} \approx \Big( \sum_{e \in E, k \in K_j} |\beta_{e, j,
3543: %     k}|^\pi \Big)^{1/\pi}
3544: % \end{equation*}
3545: % with the usual modification whenever $\pi = \infty$.
3546: 
3547: % or equivalently,
3548: 
3549: 
3550: % where the above sums are convergent in $L^\pi$
3551: 
3552: % \texttt{mettre estimees de jackson et bernstein}
3553: 
3554: 
3555: 
3556: % This is the reason why the entropy of anisotropic Besov space
3557: % will given only be able to use the caracterization of anisotropic
3558: % Besov spaces by wavelet coefficients for
3559: 
3560: 
3561: 
3562: % This is the reason why the
3563: 
3564: % It is well-known that wavelets are a powerful tool for the
3565: % characterazition of Besov spaces, by means of sums weighted sums of
3566: % wavelet coefficient. Besov isotropic classes can be defined in this
3567: % way, using basis with
3568: 
3569: % The use of compaclty supported wavelets is of first importance in
3570: % statistics for instance, and the fact that the number of
3571: 
3572: % A key tool for the
3573: 
3574: % A powerful way of described isotropic Besov spaces is Wavelet. Indeed,
3575: % it is well known that
3576: 
3577: % If $\beta_{j, k} = \prodsca{f}{\tilde \psi_{e, j, k}}$
3578: 
3579: % \begin{equation*}
3580: %   \frac{1}{C} \Big( \sum_{e, k} |\beta_{e, j, k} |^p \Big)^{1/p} \leq \Big
3581: %   \| \sum_{e=1}^{m-1} \sum_{k \in \mathbb Z^d}  \beta_{e, j, k}
3582: %   \psi_{e, j, k} \Big\|_p \leq C \Big( \sum_{e, k} |\beta_{e, j, k}
3583: %   |^p \Big)^{1/p}
3584: % \end{equation*}
3585: 
3586: 
3587: 
3588: \section{Some probabilistic tools}
3589: \label{sec:appendix_proba}
3590: 
3591: 
3592: For the first Theorem we refer to \cite{EM:96}. The two following
3593: Theorems can be found, for instance, in
3594: \cite{massart03,vdVW:96,ledoux_talagrand91}.
3595: 
3596: 
3597: \begin{theorem}[Einmahl and Mason (1996)]
3598:   \label{TheoEinmahlMasson}
3599:   Let $Z_1,\ldots,Z_n$ be $n$ independent non-negative random
3600:   variables such that $E[Z_i^2]\leq \sigma^2,\forall i=1, \ldots, n$.
3601:   Then, we have, for any $\delta > 0$,
3602:   \begin{equation*}
3603:     P \Big[\sum_{i=1}^n (Z_i - E[Z_i]) \leq -n \delta \Big]
3604:     \leq \exp\Big(-\frac{n \delta^2}{2\sigma^2} \Big).
3605:   \end{equation*}
3606: \end{theorem}
3607: 
3608: 
3609: \begin{theorem}[Sudakov]
3610:   \label{TheoSudakov}
3611:   There exists an absolute constant $c^*>0$ such that for any integer
3612:   $M$, any centered Gaussian vector $X = (X_1,\ldots,X_M)$ in
3613:   $\mathbb{R}^M$, we have,
3614:   \begin{equation*}
3615:     c^* E[\max_{1\leq j\leq M}X_j] \geq \varepsilon \sqrt{\log M},
3616:   \end{equation*}
3617:   where $\varepsilon := \min \Big\{ \sqrt{E[(X_i-X_j)^2]} : i \neq j
3618:   \in \{1, \ldots, M\} \Big\}$.
3619: \end{theorem}
3620: 
3621: \begin{theorem}[Maximal inequality]
3622:   \label{TheoMaxConcIneq}
3623:   Let $Y_1, \ldots, Y_M$ be $M$ random variables satisfying
3624:   $E[\exp(sY_j)] \leq \exp((s^2\sigma^2)/2)$ for any $j \in \{1, \ldots, M\}$ and
3625:   any $s>0$. Then, we have
3626:   \begin{equation*}
3627:     E[ \max_{1 \leq j \leq M} Y_j] \leq \sigma \sqrt{\log M}.
3628:   \end{equation*}
3629: \end{theorem}
3630: 
3631: % \begin{theorem}[Berry-Ess{\'e}en]\label{TheoBerry}
3632: % Suppose that $(X_i)_{i\in\mathbb{N}}$ is a sequence of i.i.d. random
3633: % variables with mean $\mu$ and variance $\sigma^2>0$. Then, for all
3634: % $n$,
3635: % $$\sup_{t\in\mathbb{R}}\left\vert\mathbb{P}\Big(\frac{\sum_{i=1}^n X_i-n\mu}
3636: % {\sigma \sqrt{n}}\leq t \Big)-\Phi(t)\right\vert\leq
3637: % \frac{33}{4}\frac{\mathbb{E}|X_1-\mu|^3}{\sigma^3\sqrt{n}}.$$
3638: % \end{theorem}
3639: 
3640: \par
3641: 
3642: 
3643: % \bibliographystyle{ims}
3644: 
3645: \begin{thebibliography}{48}
3646: \expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi
3647: \expandafter\ifx\csname url\endcsname\relax
3648:   \def\url#1{\texttt{#1}}\fi
3649: \expandafter\ifx\csname urlprefix\endcsname\relax\def\urlprefix{URL }\fi
3650: \providecommand{\eprint}[2][]{\url{#2}}
3651: 
3652: \bibitem[{Amato et~al.(2006)Amato, Antoniadis and
3653:   Pensky}]{amato_antoniadis_pensky06}
3654: \textsc{Amato, U.}, \textsc{Antoniadis, A.} and \textsc{Pensky, M.} (2006).
3655: \newblock Wavelet kernel penalized estimation for non-equispaced design
3656:   regression.
3657: \newblock \textit{Stat. Comput.}, \textbf{16} 37--55.
3658: 
3659: \bibitem[{Aronszajn(1950)}]{aronszajn50}
3660: \textsc{Aronszajn, N.} (1950).
3661: \newblock Theory of reproducing kernels.
3662: \newblock \textit{Trans. Amer. Math. Soc.}, \textbf{68} 337--404.
3663: 
3664: \bibitem[{Bartlett and Mendelson(2006)}]{BM:06}
3665: \textsc{Bartlett, P.~L.} and \textsc{Mendelson, S.} (2006).
3666: \newblock Empirical minimization.
3667: \newblock \textit{Probab. Theory Related Fields}, \textbf{135} 311--334.
3668: 
3669: \bibitem[{Birg\'e and Massart(1993)}]{birge_massart93}
3670: \textsc{Birg\'e, L.} and \textsc{Massart, P.} (1993).
3671: \newblock {Rates of convergence for minimum contrast estimators.}
3672: \newblock \textit{Probab. Theory Relat. Fields}, \textbf{97} 113--150.
3673: 
3674: \bibitem[{Birg{\'e} and Massart(2000)}]{birge_massart00}
3675: \textsc{Birg{\'e}, L.} and \textsc{Massart, P.} (2000).
3676: \newblock An adaptive compression algorithm in {B}esov spaces.
3677: \newblock \textit{Constr. Approx.}, \textbf{16} 1--36.
3678: 
3679: \bibitem[{Birman and Solomjak(1967)}]{birman_solomjak67}
3680: \textsc{Birman, M.~{\v{S}}.} and \textsc{Solomjak, M.~Z.} (1967).
3681: \newblock Piecewise polynomial approximations of functions of classes
3682:   {$W_p^{\alpha}$}.
3683: \newblock \textit{Mat. Sb. (N.S.)}, \textbf{73 (115)} 331--355.
3684: 
3685: \bibitem[{Bitouz{\'e} et~al.(1999)Bitouz{\'e}, Laurent and Massart}]{BLM99}
3686: \textsc{Bitouz{\'e}, D.}, \textsc{Laurent, B.} and \textsc{Massart, P.} (1999).
3687: \newblock A {D}voretzky-{K}iefer-{W}olfowitz type inequality for the
3688:   {K}aplan-{M}eier estimator.
3689: \newblock \textit{Ann. Inst. H. Poincar\'e Probab. Statist.}, \textbf{35}
3690:   735--763.
3691: 
3692: \bibitem[{Carl and Stephani(1990)}]{CS:98}
3693: \textsc{Carl, B.} and \textsc{Stephani, I.} (1990).
3694: \newblock \textit{Entropy, compactness and the approximation of operators},
3695:   vol.~98 of \textit{Cambridge Tracts in Mathematics}.
3696: \newblock Cambridge University Press, Cambridge.
3697: 
3698: \bibitem[{Catoni(2001)}]{catbook:01}
3699: \textsc{Catoni, O.} (2001).
3700: \newblock \textit{Statistical Learning Theory and Stochastic Optimization}.
3701: \newblock Ecole d'{\'e}t{\'e} de Probabilit{\'e}s de Saint-Flour 2001, Lecture
3702:   Notes in Mathematics, Springer, N.Y.
3703: 
3704: \bibitem[{Cucker and Smale(2002)}]{cucker_smale02}
3705: \textsc{Cucker, F.} and \textsc{Smale, S.} (2002).
3706: \newblock On the mathematical foundations of learning.
3707: \newblock \textit{Bull. Amer. Math. Soc. (N.S.)}, \textbf{39} 1--49
3708:   (electronic).
3709: 
3710: \bibitem[{Devroye et~al.(1996)Devroye, Gy{\"o}rfi and Lugosi}]{DGL:96}
3711: \textsc{Devroye, L.}, \textsc{Gy{\"o}rfi, L.} and \textsc{Lugosi, G.} (1996).
3712: \newblock \textit{A probabilistic theory of pattern recognition}, vol.~31 of
3713:   \textit{Applications of Mathematics (New York)}.
3714: \newblock Springer-Verlag, New York.
3715: 
3716: \bibitem[{Einmahl and Mason(1996)}]{EM:96}
3717: \textsc{Einmahl, U.} and \textsc{Mason, D.~M.} (1996).
3718: \newblock Some universal results on the behavior of increments of partial sums.
3719: \newblock \textit{Ann. Probab.}, \textbf{24} 1388--1407.
3720: 
3721: \bibitem[{Ga\"iffas and Lecu\'e(2007)}]{gaiffas_lecue07}
3722: \textsc{Ga\"iffas, S.} and \textsc{Lecu\'e, G.} (2007).
3723: \newblock Optimal rates and adaptation in the single-index model using
3724:   aggregation.
3725: \newblock \textit{Electronic Journal of Statistics}, \textbf{1} 538--573.
3726: 
3727: \bibitem[{Green and Silverman(1994)}]{green_silverman94}
3728: \textsc{Green, P.~J.} and \textsc{Silverman, B.~W.} (1994).
3729: \newblock \textit{Nonparametric regression and generalized linear models},
3730:   vol.~58 of \textit{Monographs on Statistics and Applied Probability}.
3731: \newblock Chapman \& Hall, London.
3732: \newblock A roughness penalty approach.
3733: 
3734: \bibitem[{Gy{\"o}rfi et~al.(2002)Gy{\"o}rfi, Kohler, Krzy{\.z}ak and
3735:   Walk}]{kohler02}
3736: \textsc{Gy{\"o}rfi, L.}, \textsc{Kohler, M.}, \textsc{Krzy{\.z}ak, A.} and
3737:   \textsc{Walk, H.} (2002).
3738: \newblock \textit{A distribution-free theory of nonparametric regression}.
3739: \newblock Springer Series in Statistics, Springer-Verlag, New York.
3740: 
3741: \bibitem[{Hamers and Kohler(2004)}]{hamers_kohler04}
3742: \textsc{Hamers, M.} and \textsc{Kohler, M.} (2004).
3743: \newblock How well can a regression function be estimated if the distribution
3744:   of the (random) design is concentrated on a finite set?
3745: \newblock \textit{J. Statist. Plann. Inference}, \textbf{123} 377--394.
3746: 
3747: \bibitem[{Haussler(1992)}]{H:92}
3748: \textsc{Haussler, D.} (1992).
3749: \newblock Decision-theoretic generalizations of the {PAC} model for neural net
3750:   and other learning applications.
3751: \newblock \textit{Inform. and Comput.}, \textbf{100} 78--150.
3752: 
3753: \bibitem[{Hochmuth(2002)}]{hochmuth02}
3754: \textsc{Hochmuth, R.} (2002).
3755: \newblock Wavelet characterizations for anisotropic {B}esov spaces.
3756: \newblock \textit{Appl. Comput. Harmon. Anal.}, \textbf{12} 179--208.
3757: 
3758: \bibitem[{Hoffmann and Lepski(2002)}]{hoffmann_lepski02}
3759: \textsc{Hoffmann, M.} and \textsc{Lepski, O.~V.} (2002).
3760: \newblock Random rates in anisotropic regression.
3761: \newblock \textit{The Annals of Statistics}, \textbf{30} 325--396.
3762: 
3763: \bibitem[{Juditsky et~al.(2005{\natexlab{a}})Juditsky, Rigollet and
3764:   Tsybakov}]{juditsky_etal05}
3765: \textsc{Juditsky, A.}, \textsc{Rigollet, P.} and \textsc{Tsybakov, A.}
3766:   (2005{\natexlab{a}}).
3767: \newblock Learning by mirror averaging.
3768: \newblock \urlprefix\url{http://arxiv.org/abs/math/0511468}.
3769: 
3770: \bibitem[{Juditsky et~al.(2005{\natexlab{b}})Juditsky, Nazin, Tsybakov and
3771:   Vayatis}]{juditsky_nazin05}
3772: \textsc{Juditsky, A.~B.}, \textsc{Nazin, A.~V.}, \textsc{Tsybakov, A.~B.} and
3773:   \textsc{Vayatis, N.} (2005{\natexlab{b}}).
3774: \newblock Recursive aggregation of estimators by the mirror descent method with
3775:   averaging.
3776: \newblock \textit{Problemy Peredachi Informatsii}, \textbf{41} 78--96.
3777: 
3778: \bibitem[{Juditsky et~al.(2006)Juditsky, Rigollet and Tsybakov}]{jrt:06}
3779: \textsc{Juditsky, A.~B.}, \textsc{Rigollet, P.} and \textsc{Tsybakov, A.~B.}
3780:   (2006).
3781: \newblock Learning by mirror averaging.
3782: \newblock To appear in the {\it Ann. Statist.}. Available at
3783:   http://www.imstat.org/aos/future\_papers.html.
3784: 
3785: \bibitem[{Kearns et~al.(1994)Kearns, Schapire, Sellie and
3786:   Hellerstein}]{KSSH:94}
3787: \textsc{Kearns, M.~J.}, \textsc{Schapire, R.~E.}, \textsc{Sellie, L.~M.} and
3788:   \textsc{Hellerstein, L.} (1994).
3789: \newblock Toward efficient agnostic learning.
3790: \newblock In \textit{Machine Learning}. ACM Press, 341--352.
3791: 
3792: \bibitem[{Kerkyacharian et~al.(2001)Kerkyacharian, Lepski and
3793:   Picard}]{kerk_lepski_picard01}
3794: \textsc{Kerkyacharian, G.}, \textsc{Lepski, O.} and \textsc{Picard, D.} (2001).
3795: \newblock Nonlinear estimation in anisotropic multi-index denoising.
3796: \newblock \textit{Probab. Theory Related Fields}, \textbf{121} 137--170.
3797: 
3798: \bibitem[{Kerkyacharian et~al.(2007)Kerkyacharian, Lepski and
3799:   Picard}]{kerk_lepski_picard07}
3800: \textsc{Kerkyacharian, G.}, \textsc{Lepski, O.} and \textsc{Picard, D.} (2007).
3801: \newblock Nonlinear estimation in anisotropic multiindex denoising. {S}parse
3802:   case.
3803: \newblock \textit{Teor. Veroyatn. Primen.}, \textbf{52} 150--171.
3804: 
3805: \bibitem[{Kerkyacharian and Picard(2003)}]{kerk_picard_replicant_03}
3806: \textsc{Kerkyacharian, G.} and \textsc{Picard, D.} (2003).
3807: \newblock Replicant compression coding in {B}esov spaces.
3808: \newblock \textit{ESAIM Probab. Stat.}, \textbf{7} 239--250 (electronic).
3809: 
3810: \bibitem[{Kerkyacharian and Picard(2007)}]{kerk_picard07}
3811: \textsc{Kerkyacharian, G.} and \textsc{Picard, D.} (2007).
3812: \newblock Thresholding in learning theory.
3813: \newblock \textit{Constr. Approx.}, \textbf{26} 173--203.
3814: 
3815: \bibitem[{Kimeldorf and Wahba(1971)}]{kimeldorf_wahba71}
3816: \textsc{Kimeldorf, G.} and \textsc{Wahba, G.} (1971).
3817: \newblock Some results on {T}chebycheffian spline functions.
3818: \newblock \textit{J. Math. Anal. Appl.}, \textbf{33} 82--95.
3819: 
3820: \bibitem[{Kohler(2000)}]{kohler00}
3821: \textsc{Kohler, M.} (2000).
3822: \newblock Inequalities for uniform deviations of averages from expectations
3823:   with applications to nonparametric regression.
3824: \newblock \textit{J. Statist. Plann. Inference}, \textbf{89} 1--23.
3825: 
3826: \bibitem[{Kohler and Krzy{\.z}ak(2001{\natexlab{a}})}]{kohler_krzyzak01a}
3827: \textsc{Kohler, M.} and \textsc{Krzy{\.z}ak, A.} (2001{\natexlab{a}}).
3828: \newblock Nonparametric regression estimation using penalized least squares.
3829: \newblock \textit{IEEE Trans. Inform. Theory}, \textbf{47} 3054--3058.
3830: 
3831: \bibitem[{Kohler and Krzy{\.z}ak(2001{\natexlab{b}})}]{kohler_krzyzak01b}
3832: \textsc{Kohler, M.} and \textsc{Krzy{\.z}ak, A.} (2001{\natexlab{b}}).
3833: \newblock Nonparametric regression estimation using penalized least squares.
3834: \newblock \textit{IEEE Trans. Inform. Theory}, \textbf{47} 3054--3058.
3835: 
3836: \bibitem[{Lecu{\'e}(2006)}]{LecJMLR:06}
3837: \textsc{Lecu{\'e}, G.} (2006).
3838: \newblock Lower bounds and aggregation in density estimation.
3839: \newblock \textit{J. Mach. Learn. Res.}, \textbf{7} 971--981.
3840: 
3841: \bibitem[{Lecu{\'e}(2007)}]{LecAoS:07}
3842: \textsc{Lecu{\'e}, G.} (2007).
3843: \newblock Simultaneous adaptation to the margin and to complexity in
3844:   classification.
3845: \newblock \textit{Ann. Statist.}, \textbf{35} 1698--1721.
3846: 
3847: \bibitem[{Ledoux and Talagrand(1991)}]{ledoux_talagrand91}
3848: \textsc{Ledoux, M.} and \textsc{Talagrand, M.} (1991).
3849: \newblock \textit{Probability in {B}anach spaces}, vol.~23 of
3850:   \textit{Ergebnisse der Mathematik und ihrer Grenzgebiete (3) [Results in
3851:   Mathematics and Related Areas (3)]}.
3852: \newblock Springer-Verlag, Berlin.
3853: \newblock Isoperimetry and processes.
3854: 
3855: \bibitem[{Leung and Barron(2006)}]{leung_barron06}
3856: \textsc{Leung, G.} and \textsc{Barron, A.~R.} (2006).
3857: \newblock Information theory and mixing least-squares regressions.
3858: \newblock \textit{IEEE Trans. Inform. Theory}, \textbf{52} 3396--3410.
3859: 
3860: \bibitem[{Massart(2007)}]{massart03}
3861: \textsc{Massart, P.} (2007).
3862: \newblock \textit{Concentration inequalities and model selection}, vol.~1896 of
3863:   \textit{Lecture Notes in Mathematics}.
3864: \newblock Springer, Berlin.
3865: \newblock Lectures from the 33rd Summer School on Probability Theory held in
3866:   Saint-Flour, July 6--23, 2003, With a foreword by Jean Picard.
3867: 
3868: \bibitem[{Neumann(2000)}]{neumann00}
3869: \textsc{Neumann, M.~H.} (2000).
3870: \newblock Multivariate wavelet thresholding in anisotropic function spaces.
3871: \newblock \textit{Statist. Sinica}, \textbf{10} 399--431.
3872: 
3873: \bibitem[{Steinwart and Scovel(2007)}]{SS:07}
3874: \textsc{Steinwart, I.} and \textsc{Scovel, C.} (2007).
3875: \newblock Fast rates for support vector machines using {G}aussian kernels.
3876: \newblock \textit{Ann. Statist.}, \textbf{35} 575--607.
3877: 
3878: \bibitem[{Triebel(2006)}]{triebel06}
3879: \textsc{Triebel, H.} (2006).
3880: \newblock \textit{Theory of function spaces. {III}}, vol.~100 of
3881:   \textit{Monographs in Mathematics}.
3882: \newblock Birkh\"auser Verlag, Basel.
3883: 
3884: \bibitem[{Tsybakov(2003{\natexlab{a}})}]{tsybakov03}
3885: \textsc{Tsybakov, A.} (2003{\natexlab{a}}).
3886: \newblock \textit{Introduction \`a l'estimation non-param\'etrique}.
3887: \newblock Springer.
3888: 
3889: \bibitem[{Tsybakov(2003{\natexlab{b}})}]{tsy:03}
3890: \textsc{Tsybakov, A.~B.} (2003{\natexlab{b}}).
3891: \newblock Optimal rates of aggregation.
3892: \newblock \textit{Computational Learning Theory and Kernel Machines.
3893:   B.~Sch{\"o}lkopf and M.~Warmuth, eds. Lecture Notes in Artificial
3894:   Intelligence}, \textbf{2777} 303--313.
3895: \newblock Springer, Heidelberg.
3896: 
3897: \bibitem[{van~de Geer(1990)}]{vandegeer90}
3898: \textsc{van~de Geer, S.} (1990).
3899: \newblock Estimating a regression function.
3900: \newblock \textit{Ann. Statist.}, \textbf{18} 907--924.
3901: 
3902: \bibitem[{van~de Geer(2007)}]{vdg07}
3903: \textsc{van~de Geer, S.} (2007).
3904: \newblock Oracle inequalities and regularization.
3905: \newblock In \textit{Lectures on empirical processes}. EMS Ser. Lect. Math.,
3906:   Eur. Math. Soc., Z\"urich, 191--252.
3907: 
3908: \bibitem[{van~de Geer(2000)}]{van_de_geer00}
3909: \textsc{van~de Geer, S.~A.} (2000).
3910: \newblock \textit{Applications of empirical process theory}, vol.~6 of
3911:   \textit{Cambridge Series in Statistical and Probabilistic Mathematics}.
3912: \newblock Cambridge University Press, Cambridge.
3913: 
3914: \bibitem[{van~der Vaart and Wellner(1996)}]{vdVW:96}
3915: \textsc{van~der Vaart, A.~W.} and \textsc{Wellner, J.~A.} (1996).
3916: \newblock \textit{Weak convergence and empirical processes}.
3917: \newblock Springer Series in Statistics, Springer-Verlag, New York.
3918: \newblock With applications to statistics.
3919: 
3920: \bibitem[{Wahba(1990)}]{wahba90}
3921: \textsc{Wahba, G.} (1990).
3922: \newblock \textit{Spline models for observational data}, vol.~59 of
3923:   \textit{CBMS-NSF Regional Conference Series in Applied Mathematics}.
3924: \newblock Society for Industrial and Applied Mathematics (SIAM), Philadelphia,
3925:   PA.
3926: 
3927: \bibitem[{Yang(2000)}]{yang:00}
3928: \textsc{Yang, Y.} (2000).
3929: \newblock Mixing strategies for density estimation.
3930: \newblock \textit{Ann. Statist.}, \textbf{28} 75--87.
3931: 
3932: \bibitem[{Yang(2004)}]{yang04}
3933: \textsc{Yang, Y.} (2004).
3934: \newblock Aggregating regression procedures to improve performance.
3935: \newblock \textit{Bernoulli}, \textbf{10} 25--47.
3936: 
3937: \end{thebibliography}
3938: 
3939: 
3940: % \bibliography{biblio}
3941: 
3942: 
3943: \end{document}