1:
2:
3: \documentclass[aos, reqno, preprint]{imsart}% imsart class with the aos, reqno and preprint options
4: \RequirePackage{amsthm, amsmath, natbib, amsfonts, amssymb}% theorem, math and citation packages
5: \RequirePackage[OT1]{fontenc}%
6: \usepackage{graphicx, color}%
7: \usepackage{tikz}%
8: % natbib is already loaded by the \RequirePackage line above; the duplicate \usepackage{natbib} was dropped.
9:
10:
11: \numberwithin{equation}{section}% equation numbers are reset in, and prefixed by, each section
12: \theoremstyle{plain}% amsthm style applied to \newtheorem declarations until the next \theoremstyle
13: % \newtheorem{theorem}{Theorem}[section]
14:
15: \definecolor{darkblue}{rgb}{0.0,0.0,0.7}% colour used for all hyperref links below
16:
17: \RequirePackage[%
18: colorlinks = true,%
19: linkcolor = darkblue,%
20: citecolor = darkblue,%
21: urlcolor = darkblue, %
22: ]{hyperref}%
23:
24:
25: \hypersetup{% PDF metadata
26: pdfauthor = {St\'ephane Ga\"iffas, Guillaume Lecu\'e},%
27: pdftitle = {Aggregation of penalized empirical risk minimizers in
28: regression},% must match the \title of the front matter
29: pdfcreator = {pdflatex},%
30: pdfproducer = {pdflatex}}
31:
32: \startlocaldefs
33:
34: \newcommand*{\egal}{\stackrel{\mathrm{def}}{=}}% "equal by definition": an equals sign with "def" stacked on top
35: % Calligraphic shorthands: \cX typesets \mathcal{X}. Math mode only. The outer braces keep x_\cX valid.
36: \newcommand*{\cA}{{\mathcal{A}}}
37: \newcommand*{\cB}{{\mathcal{B}}}
38: \newcommand*{\cC}{{\mathcal{C}}}
39: \newcommand*{\cD}{{\mathcal{D}}}
40: \newcommand*{\cE}{{\mathcal{E}}}
41: \newcommand*{\cF}{{\mathcal{F}}}
42: \newcommand*{\cG}{{\mathcal{G}}}
43: \newcommand*{\cH}{{\mathcal{H}}}
44: \newcommand*{\cI}{{\mathcal{I}}}
45: \newcommand*{\cL}{{\mathcal{L}}}
46: \newcommand*{\cM}{{\mathcal{M}}}
47: \newcommand*{\cN}{{\mathcal{N}}}
48: \newcommand*{\cO}{{\mathcal{O}}}
49: \newcommand*{\cP}{{\mathcal{P}}}
50: \newcommand*{\cQ}{{\mathcal{Q}}}
51: \newcommand*{\cR}{{\mathcal{R}}}
52: \newcommand*{\cS}{{\mathcal{S}}}
53: \newcommand*{\cU}{{\mathcal{U}}}
54: \newcommand*{\cX}{{\mathcal{X}}}
55: \newcommand*{\cY}{{\mathcal{Y}}}
56: \newcommand*{\cZ}{{\mathcal{Z}}}
57: \newcommand*{\smin}{s_{\min}}% s with subscript "min"
58: \newcommand*{\smax}{s_{\max}}% s with subscript "max"
59:
60: \newcommand \R{{\mathbb R}}% blackboard-bold R
61: \newcommand \E{{\mathbb E}}% blackboard-bold E
62: \newcommand \V{{\mathbb V}}% blackboard-bold V
63:
64: \newcommand{\T}{^{\top}}% transpose superscript
65: \newcommand{\var}{\operatorname{Var}}% upright operator; \text{Var} would become italic inside italic theorem bodies
66: \newcommand{\prodsca}[2]{\langle #1,#2 \rangle}% angle-bracket pairing of the two arguments
67: \newcommand{\norm}[1]{\|#1\|}% argument between double vertical bars
68: \newcommand{\ind}[1]{\mathbf 1_{#1}}% bold 1 with the argument as subscript
69: \newcommand{\mb}{\mathbf}% short alias
70: \newcommand{\sumin}{\sum_{i=1}^n}% sum over i = 1..n
71: \newcommand{\sumim}{\sum_{i=1}^m}% sum over i = 1..m
72: \newcommand{\bs}{\boldsymbol}% short alias
73:
74: \newcommand{\grad}{\triangledown}% NOTE(review): if this denotes a gradient, \nabla is the usual symbol -- confirm before changing
75:
76: \DeclareMathOperator*{\supp}{Supp} % starred form: sub/superscripts are set as limits in display style
77:
78: \DeclareMathOperator{\limInf}{liminf} % unstarred: subscripts stay beside the operator name
79: \DeclareMathOperator{\limSup}{limsup} % unstarred: subscripts stay beside the operator name
80:
81: \DeclareMathOperator*{\argmin}{argmin} % starred form: subscript is set underneath in display style
82: \DeclareMathOperator*{\argmax}{argmax} % starred form: subscript is set underneath in display style
83: \DeclareMathOperator{\pen}{pen} % used in the text as \pen_\lambda(f), the penalty term
84:
85: \DeclareMathOperator{\diag}{diag} % upright operator name "diag"
86: \DeclareMathOperator{\Span}{span} % upright operator name "span"
87:
88:
89: \newcommand{\1}{{\rm 1}\kern-0.24em{\rm I}} % a roman "1" overlapped with a roman "I" through a negative kern
90: \newcommand{\hfn}{{\hat{f}_n}} % expanded at use time, so it picks up the \hat redefinition on the next line
91: \renewcommand{\hat}{\widehat} % from here on, every \hat in the document is typeset as \widehat
92:
93: % \newtheorem{theo}{Theorem}%
94: \newtheorem{theorem}{Theorem}% numbered, with its own counter
95: \newtheorem{corollary}{Corollary}% numbered, with its own counter
96: \newtheorem{lemma}{Lemma}% numbered, with its own counter
97: \newtheorem{proposition}{Proposition}% numbered, with its own counter
98: % \newtheorem*{assumption}{Assumption}%
99: \theoremstyle{remark}% the environments declared below use amsthm's "remark" style
100: \newtheorem*{remark}{Remark}% starred form: unnumbered
101: \newtheorem{definition}{Definition}% numbered, with its own counter
102: \newtheorem*{assumption}{Assumption}% starred form: unnumbered
103: \newtheorem{example}{Example}% numbered, with its own counter
104:
105:
106: \endlocaldefs
107:
108:
109: % \linespread{1.4}
110:
111:
112: \begin{document}
113:
114: \begin{frontmatter}
115:
116: \title{Aggregation of penalized empirical risk minimizers in
117: regression}%
118: \runtitle{Aggregation of penalized empirical risk minimizers}
119:
120: \begin{aug}
121: \author{\fnms{St\'ephane} \snm{Ga\"iffas}
122: \ead[label=e1]{stephane.gaiffas@upmc.fr}} and
123: \author{\fnms{ Guillaume} \snm{Lecu\'e}
124: \ead[label=e2]{lecue@latp.univ-mrs.fr}}
125:
126: \runauthor{S. Ga\"iffas and G. Lecu\'e} \affiliation{Universit\'e
127: Paris~6 and CNRS, LATP Marseille}
128:
129: \address{Universit\'e Paris 6 \\
130: Laboratoire de Statistique Th\'eorique et Appliqu\'ee \\
131: 175 rue du Chevaleret \\
132: 75013 Paris \\
133: \printead{e1}}
134:
135: \address{Laboratoire d'Analyse, Topologie et Probabilit\'es\\
136: Centre de Math\'ematiques et Informatique\\
137: Technop\^ole de Ch\^ateau-Gombert\\
138: 39 rue F. Joliot Curie\\
139: 13453 Marseille Cedex 13\\
140: France\\
141: \printead{e2}}
142: \end{aug}
143:
144: \begin{abstract}
145: We give a general result concerning the rates of convergence of
146: penalized empirical risk minimizers (PERM) in the regression
147: model. Then, we consider the problem of agnostic learning of the
148: regression, and give in this context an oracle inequality and a
149: lower bound for PERM over a finite class. These results hold for a
150: general multivariate random design, the only assumption being the
151: compactness of the support of its law (allowing discrete
152: distributions for instance). Then, using these results, we
153: construct adaptive estimators. We consider as examples adaptive
154: estimation over anisotropic Besov spaces or reproducing kernel
155: Hilbert spaces. Finally, we provide empirical evidence that
156: aggregation leads to more stable estimators than the more standard
157: cross-validation or generalized cross-validation methods for the
158: selection of the smoothing parameter, when the number of
159: observations is small.
160: % estimators which are Our aggregation
161: % approach is motivated by a lower bound for PERM procedures over
162: % a finite set of weak estimators, which proves that PERM
163: % procedures are suboptimal compared to some exponential weighted
164: % averaged schemes.
165: % We propose an adaptive estimator of the multivariate regression
166: % function $f_0$ from i.i.d. observations. Without assumption on
167: % the law $P_X$ of the covariates, besides almost sure
168: % boundedness, we prove that the standard rate $n^{-s / (2s + 1)}$
169: % can be achieved by an adaptive estimator, where $n$ denotes the
170: % sample size and $s$ the smoothness of $f_0$ measured in some
171: % sense, including Besov smoothness. The assumption on the noise
172: % is fairly general.
173: \end{abstract}
174:
175: \begin{keyword}[class=AMS]
176: \kwd[Primary ]{62G08}
177: \kwd[; secondary ]{62H12}
178: \end{keyword}
179:
180: \begin{keyword}
181: \kwd{Nonparametric regression, agnostic learning, aggregation,
182: adaptive estimation, random design, anisotropic Besov space,
183: reproducing kernel Hilbert spaces}
184: \end{keyword}
185:
186: \end{frontmatter}
187:
188:
189: \section{Introduction}
190: \label{sec:introduction}
191:
192: \subsection{Motivations}
193:
194: In this paper, we explore some statistical properties of penalized
195: empirical risk minimization (PERM) and aggregation procedures in the
196: regression model. From these properties, we will be able to obtain
197: results concerning adaptive estimation for several problems. Given a
198: data set $D_n$, we consider two problems. Let us define the norm
199: $\norm{g}^2 := \int g(x)^2 P_X(dx)$ where $P_X$ is the law of the
200: covariates and let $E[\cdot]$ be the expectation w.r.t. the joint law
201: of $D_n$. The first problem is the problem of estimation of the
202: regression function $f_0$. Namely, we aim at constructing some
203: procedure $\bar{f}_n$ satisfying
204: \begin{equation}
205: \label{eq:RateOfConvergence}
206: E \|\bar{f}_n - f_0 \|^2 \leq \psi(n)
207: \end{equation}
208: where $\psi(n)$, called the \emph{rate of convergence}, is a quantity
209: we wish to be very small as $n$ increases. To get this kind of inequality,
210: it is well known that one has to assume that $f_0$ belongs to a set
211: with a small complexity (cf., for instance, the ``No Free Lunch
212: theorem'' in~\cite{DGL:96}). This is what we do in
213: Section~\ref{sec:pena_least_squares} below, where an assumption on the
214: complexity is considered, see Assumption ($C_\beta$) on the metric
215: entropy.
216:
217: However, this kind of a~priori assumption may not be fulfilled. That is why
218: the second problem, called {\it agnostic learning}, has been introduced
219: (cf.~\cite{H:92,KSSH:94} and references therein). For this problem, one is given a set $F$ of
220: functions. Without any assumption on $f_0$, we want to construct (from
221: the data) a procedure $\tilde{f}$ which has a risk as close as
222: possible to the smallest risk over $F$. Namely, we want to obtain {\it
223: oracle inequalities}, that is inequalities of the form
224: \begin{equation*}
225: E \| \tilde{f} - f_0 \|^2 \leq C \min_{f\in F} \|f - f_0 \|^2 +
226: \phi(n,F),
227: \end{equation*}
228: where $C \geq 1$ and $\phi(n,F)$ is called the {\it residue}, which is
229: the quantity that we want to be small as $n$ increases. When $F$ is
230: of finite cardinality $M$, the agnostic problem is called {\it
231: aggregation problem} and the residue $\phi(n,F) = \phi(n,M)$ is
232: called {\it rate of aggregation}. The main difference between the
233: problems of estimation and aggregation is that we don't need any
234: assumption on $f_0$ for the second problem. Nevertheless, aggregation
235: methods have been widely used to construct adaptive procedures for the
236: estimation problem. That is the reason why we study aggregation
237: procedures in Section~\ref{sec:ERM_finite} below. We will use these
238: procedures in Section~\ref{sec:examples} to construct adaptive
239: estimators in several particular cases, such as adaptive estimation in
240: reproducing kernel Hilbert spaces (RKHS) or adaptive estimation over
241: anisotropic Besov spaces.
242:
243: In Section~\ref{sec:ERM_finite}, we also prove that the ``natural''
244: aggregation procedure, namely empirical risk minimization (ERM) (or
245: its penalized version), fails to achieve the optimal rate of
246: aggregation in this setup. This result motivates the use of an
247: aggregation procedure instead of the more common ERM. Moreover, we
248: provide empirical evidence in Section~\ref{sec:simulations} that
249: aggregation (with jackknife) is more stable than the classical
250: cross-validation or generalized cross-validation procedures when the
251: number of observations and the signal-to-noise ratio are small.
252:
253: The approach proposed in this paper allows us to give rates of
254: convergence for adaptive estimators over very general function sets,
255: such as the anisotropic Besov space, with very mild assumptions on the
256: law of the covariates: all the results are stated with the sole
257: assumption that the law of the covariates has a compact support.
258:
259:
260: % We propose an adaptive estimator of the multivariate regression
261: % function $f_0$ from i.i.d. observations. This procedure has strong
262: % adaptation properties: it is adaptive for a very large range of
263: % smoothness classes, including Besov spaces, in the sense that it
264: % achieves the optimal convergence rate without assumption on the
265: % design (or covariates) distribution, besides almost sure
266: % boundedness. % Moreover, this estimator can reduce the dimension of the
267: % % problem, when a single-index assumption is satisfied.
268: % Adaptation is realized via aggregation of several so-called \emph{weak}
269: % estimators, that have in common this strong \emph{design adaptation}
270: % property. The explanatory variable $Y$ is not assumed to be bounded
271: % (we consider subgaussian noise), thus the setting considered here is
272: % more general than the so-called ``distribution free non-parametric
273: % estimation'', see for instance~\cite{kohler02}, which contains a very
274: % exhaustive and detailed presentation of methods that handle the
275: % situation where the knowledge about $P_X$ is very poor.
276:
277: % Adaptation is achieved via \emph{aggregation} (or aggregation) of penalized
278: % least squares estimators over general spaces. From a theoretical point
279: % of view, we use probability techniques coming from empirical process
280: % theory such as covering numbers, peeling and chaining. These
281: % techniques are technical recipes that allows to counterpart the
282: % massiveness of the smoothness classes considered in nonparametric
283: % statistics. On this topic, we refer to~\cite{kohler02},
284: % \cite{vandegeer88, van_de_geer00}, which contains tools and ideas of
285: % importance here (concerning penalized least squares). From a more
286: % practical point of view, the adaptive estimator presented here allows
287: % to mix estimators that are known to provide good results for certain
288: % types of curves. A simple example proposed here is the aggregation of
289: % smoothing splines (least squares with Sobolev penalty). Instead of
290: % selecting the smoothing parameter via GCV (generalized cross
291: % validation), which is of common use in practice, we suggest to apply a
292: % aggregation algorithm to estimators computed with different smoothing
293: % parameters. This allows to consider splines with different orders
294: % simultaneously, while cubic splines are often considered alone in
295: % applications. We show (see Section~\ref{sec:simulations}) that this
296: % provides a more stable procedure than GCV, and that it gives better
297: % results. Moreover, we provide here theoretical results for this
298: % adaptive method, while theoretical knowledge about GCV (concerning
299: % adaptive rates of convergence) is poor. Furthermore, we can mix
300: % smoothing splines with other estimators, like wavelet soft
301: % thresholding for instance (least squares with a particular Besov
302: % penalty). Such an estimator gives good results whathever the curve is:
303: % either a smooth curve, coming from econometric data for instance, or a
304: % signal with bumps or rapid oscillations. When the covariates are
305: % multivariate, we can even further mix purely nonparametric estimators
306: % (with curse of dimensionality) with semiparametric estimators that
307: % process the data using the single-index assumption. The resulting
308: % adaptive estimator provides good results, whether or not the data is
309: % well explained by a single-index model, and it is rate-optimal in both
310: % cases.
311:
312: % The main drawback of our aggregation strategy is that it has a higher
313: % computational cost than a single estimation technique with data-driven
314: % selection of smoothing parameters. But, the counterpart is that when
315: % we aggregate estimators, we do not need to test if some model is
316: % better than another. For instance, we do not test if a single-index
317: % model explains well the data, we just mix all the estimators (purely
318: % nonparametric and single-index) using our aggregation rule, and come
319: % up with an estimator that does a job which is close to the best among
320: % them, whatever the model is (it must be emphasized at this point that
321: % actually, the performance of the aggregate is much better than the
322: % best among them, this is discudded in Section~\ref{sec:simulations}
323: % below).
324:
325: % This general formulation of penalized least squares estimation
326: % includes several standard ones, for instance penalized splines (when
327: % $\mathcal F$ is a Sobolev class) or Besov-penalty least squares
328: % estimators, that are commonly considered in signal or image-processing
329: % papers, see for instance ????. As a consequence, the general upper
330: % bound stated in Theorem~\ref{thm:least_sq} provides directly the same
331: % general upper bound for such estimators, provided that the class
332: % $\mathcal F$ satisfies some complexity bound,
333: % see~\eqref{eq:covering_assumption}.
334:
335: % \texttt{balance entre le temps de calcul, mais pas de test a
336: % faire....}
337:
338: % This idea was previously developped in the pioneering works of
339: % \texttt{citer Zhang a fond ici....} and concerning aggregation, we
340: % refer to see also the works by ?????
341:
342: % distribution free assumption non-parametric estimation This upper
343: % bound is stated without any assumption on the law of the covariates,
344: % besides boundedness. . In particular, we do not need to assume that
345: % the law of the covariate have a density with respect to the Lebesgue
346: % measure. this upper bound is valid when the corovatiates are
347: % discrete, or satisfies an upper bound We prove that this estimator
348: % converges with the optimal rate of convergence general This
349: % estimator is based on multivariate penalized least squares
350: % estimates, and By We construct an adaptive estimator of the
351: % regression, We propose a new algorithm for the estimation of both
352: % the index and the link function in the single index model. Un beau
353: % abstract
354:
355:
356: \subsection{The model}
357: \label{sec:model}
358:
359: Let $(X, Y), (X_1, Y_1), \ldots, (X_n, Y_n)$, be independent and
360: identically distributed variables in $\mathbb R^d \times \mathbb
361: R$. We consider the regression model
362: \begin{equation}
363: \label{eq:model}
364: Y = f_0(X) + \sigma \varepsilon,
365: \end{equation}
366: where $f_0 : \mathbb R^d \rightarrow \mathbb R$ and $\varepsilon$ is
367: called noise. To simplify, we assume that the noise level $\sigma$ is
368: known. We denote by $P$ the probability distribution of $(X,Y)$ and by
369: $P_X$ the marginal distribution of $X$, also called the \emph{design} or
370: \emph{covariates} distribution. We denote by $P^n$ the joint
371: distribution of the sample
372: \begin{equation*}
373: D_n := [ (X_i, Y_i) \;;\; 1 \leq i \leq n],
374: \end{equation*}
375: and by $P_n = P^n[\cdot | X^n]$ the joint distribution of the sample
376: $D_n$ conditional on the design $X^n := (X_1, \ldots, X_n)$.
377: The expectation w.r.t.\ $P_n$ is denoted by
378: $E_n$. The noise $\varepsilon$ is symmetric and subgaussian
379: conditionally on $X$. Namely, we assume that there is $b_\varepsilon >
380: 0$ such that
381: \begin{equation}
382: \label{eq:subgaussian}
383: (G1)(b_\varepsilon): \quad E[\exp(t\varepsilon) | X] \leq
384: \exp(b_\varepsilon^2t^2/2) \quad \forall t > 0
385: \end{equation}
386: which is equivalent (up to an appropriate choice for the constant
387: $b_\varepsilon$) to
388: \begin{equation*}
389: \nonumber(G2)(b_\varepsilon) : P[\varepsilon > t | X] \leq
390: \exp(-t^2/(2b_\varepsilon^2)) \quad \forall t > 0.
391: \end{equation*}
392: Assumption~\eqref{eq:subgaussian} is standard in nonparametric
393: regression; it includes the models of bounded and Gaussian
394: regression. An important fact, which will be used in the proofs, is
395: that for $\varepsilon_1,\ldots,\varepsilon_n$ independent and such
396: that $\varepsilon_i$ satisfies $(G1)(b_i)$ for any $i=1,\ldots,n$, the
397: random variable $\sum_{i=1}^n a_i \varepsilon_i$ satisfies $(G1)\big((\sum
398: a_i^2b_i^2)^{1/2}\big)$ for any $a_1,\ldots,a_n \in \R$ and thus the
399: concentration property $(G2)\big(\sqrt{2}(\sum a_i^2b_i^2)^{1/2}\big)$. Other
400: equivalent definitions of subgaussianity are, when $\varepsilon$ is
401: symmetric, to assume that $E[ \exp(\varepsilon^2/b_\varepsilon^2) |
402: X ] \leq 2$ for some $b_\varepsilon > 0$, or $(E[ |\varepsilon|^p |
403: X])^{1/p} \leq b_\varepsilon \sqrt{p}$ for any $p \geq 1$.
404:
405: Concerning the design, we only assume that $X$ has a compact support,
406: and without loss of generality we can take its support equal to $[0,
407: 1]^d$. In particular, we do not need $P_X$ to be absolutely continuous with
408: respect to the Lebesgue measure. Note that the problem of adaptive
409: estimation with such a general multivariate design is not common in
410: the literature. In the so-called ``distribution free nonparametric
411: estimation'' framework, when we want to obtain convergence rates and
412: not only the consistency of the estimators, it is, as far as we know,
413: always assumed that $|Y| \leq L$ a.s. for some constant $L > 0$, see
414: for instance~\cite{kohler02}, \cite{kohler_krzyzak01a},
415: \cite{kohler_krzyzak01b}, \cite{kohler00} and~\cite{kerk_picard07},
416: which is a setting less general than the one considered here.
417:
418: \begin{remark}
419: The results presented here can be extended to subexponential noise,
420: that is when $E[ \exp(|\varepsilon| / b_\varepsilon) | X] \leq 2$
421: for some $b_\varepsilon > 0$, but it involves complications
422: (chaining with an adaptive truncation argument in the proof of
423: Theorem~\ref{thm:devia1} below, see for instance~\cite{BLM99}
424: or~\cite{van_de_geer00}, among others) that we prefer to skip
425: here. % It can also be seen that extra smoothness in the noise, that
426: % is $E_n[ \exp(b |\varepsilon|^p) ] \leq 1$ with $p \geq 2$ does
427: % not actually improve the results presented here (the rates of
428: % convergence remains the same), but this problem is beyond the
429: % scope of this paper.
430: \end{remark}
431:
432: % \begin{remark}
433: % To avoid complications, we assume that the noise level
434: % $\sigma(\cdot)$ is known, and such that $\sigma_0 < \sigma(X) \leq
435: % \sigma_1$ a.s. for some $0 < \sigma_0 < \sigma_1$. If not, one can
436: % replace penalized least squares by weighted penalized least squares
437: % to handle heteroscedastic noise and one can do a slight modification
438: % in the weights in the aggregation algorithm, see ??????
439: % \end{remark}
440:
441: %% \begin{definition}
442: %% \label{def:orlicz}
443: %% A \emph{Young} function is a convex, increasing function $\psi$ on
444: %% $\mathbb R^+ \rightarrow \mathbb R^+$ such that $\psi(0) = 0$ and
445: %% $\lim_{x \rightarrow +\infty}\psi(x) = +\infty$. We define the
446: %% \emph{Orlicz seminorm} $\norm{\varepsilon}_\psi$ of a random variable
447: %% $\varepsilon$ by
448: %% \begin{equation*}
449: %% \norm{\varepsilon}_\psi := \inf \{ c > 0 : E[ \psi(|\varepsilon| / c) ]
450: %% \leq 1 \},
451: %% \end{equation*}
452: %% with usual convention $\norm{\varepsilon}_\psi = +\infty$ when the
453: %% infimum is taken over an empty set. We define also
454: %% \begin{equation*}
455: %% \norm{\varepsilon}_{n, \psi} := \inf \{ c > 0 : E_n[ \psi(|\varepsilon| / c) ]
456: %% \leq 1 \text{ a.s. }\}.
457: %% \end{equation*}
458: %% \end{definition} %
459:
460: % \begin{assumption}[Model assumption]
461: % Throughout the paper, we assume that $E_n[\varepsilon] = 0$, and that
462: % for some $p, B > 0$
463: % \begin{equation*}
464: % \norm{\varepsilon}_{n, \psi_p} \leq B
465: % \end{equation*}
466: % almost surely, where $\psi_p(x) := \exp(|x|^p) - 1$.
467: % \end{assumption}
468:
469: % This assumption on the model is very general. First, it includes
470: % most of the standard assumptions on the noise that are considered in
471: % nonparametric regression literature. For instance, when $p=2$, this
472: % noise assumption means that the noise is subgaussian conditionally
473: % on the design. It includes also noises which are, conditionally on
474: % the design, gaussian ($p=2$), double exponential ($p=1$) or bounded
475: % almost surely ($p=\infty$, bounded regression). Note that the
476: % statisticien does not need to know the parameter $p$.
477:
478: % If $\psi(x) = \exp(x^2) - 1$, then $\norm{\varepsilon}_\psi < +\infty$
479: % if and only if $\varepsilon$ is subgaussian, namely such that $E[ \exp(
480: % b \varepsilon^2) ] \leq B$ for some $b, B > 0$. In what follows, we
481: % assume that the noise $\varepsilon$ satisfies $\norm{\varepsilon}_{\psi_p}
482: % < +\infty$ for some $p > 0$, where $\psi(x) = |x|^p$. This
483: % assumption includes many standard noises, such as gaussian,
484: % subgaussian, or double exponential noise, among many others.
485:
486: % Moreover, we assume that the $\varepsilon_i$ are independent of $X^n :=
487: % (X_1, \ldots, X_n)$ for $1 \leq i \leq n$.
488:
489: % \section{Construction of the procedure}
490: % \label{sec:construction}
491:
492: % \begin{figure}[htbp]
493: % \centering
494: % \label{fig:split}
495: % \begin{tikzpicture}
496: % \begin{scope}[shape=rectangle,rounded corners,%
497: % minimum size=0.8cm,fill=white]%
498: % \tikzstyle{every node}=[draw,fill]%
499: % \node (D_n) at (0,0) {whole sample $D_n$};%
500: % \node (D_m) at (1.5, 1.5) {training sample $D_m$};%
501: % \node (D_l) at (1.5, -1.5) {learning sample $D_{(m)}$};%
502: % \node (weak) at (6, 1.5) {weak estimators $\{ \bar f_\lambda ;
503: % \lambda \in \Lambda \}$};%
504: % \node (weights) at (6, -1.5) {weights $\{ \hat \theta_\lambda
505: % ; \lambda \in \Lambda \}$};%
506: % \node (aggregate) at (9, 0) {aggregated estimator $\hat
507: % {\mathsf f}$};%
508: % \end{scope}
509: % \draw[] (D_n) -- (D_m);%
510: % \draw[->,very thick] (D_m) -- (weak);%
511: % % -- (weak) -- (aggregate);%
512: % \draw[] (D_n) -- (D_l);%
513: % \draw[->,very thick] (D_l) -- (weights);%
514: % \draw[->,very thick] (weak) -- (aggregate);%
515: % \draw[->,very thick] (weights) -- (aggregate);%
516: % % \draw[] (D_n) -- (q_1) -- (q_2) -| (q_E);%
517: % % \draw[->,shorten >=2pt] (D_n) .. controls +(75:1.4cm) and
518: % % +(105:1.4cm) .. node[above] {$x$} (D_n);
519: % \end{tikzpicture}
520: % \caption{Splitting the sample}
521: % \end{figure}
522:
523: \section{PERM over a large function set}
524: \label{sec:pena_least_squares}
525:
526: We consider the following problem of estimation: we fix a function
527: space $\mathcal F$ and we want to recover $f_0$ based on the sample
528: $D_n$ using the knowledge that $f_0 \in \mathcal F$. The set $\mathcal
529: F$ is endowed with a seminorm $|\cdot|_{\mathcal F}$. To fix the
530: ideas, when $d=1$, one can think for instance of the Sobolev space
531: $\mathcal F = W_2^s$ of functions such that $|f|_{\mathcal F}^2 = \int
532: f^{(s)}(t)^2 dt < +\infty$, where $s$ is a natural integer and
533: $f^{(s)}$ is the $s$-th derivative of $f$. In this case, the estimator
534: described below is the so-called \emph{smoothing spline estimator},
535: see for instance \cite{wahba90}. Several other examples are given in
536: Section~\ref{sec:examples} below.
537:
538: \subsection{Definition of the PERM}
539:
540: The idea of penalized empirical risk minimization is to strike a
541: balance between the goodness-of-fit of the estimator to the data and
542: its smoothness. The quantity $|f|_{\mathcal F}$ measures the
543: smoothness (or ``roughness'') of $f \in \mathcal F$ and the balance is
544: quantified by a parameter $h > 0$.
545: \begin{definition}[PERM]
546: \label{def:perm}
547: Let $\lambda = (h, \mathcal F)$ be fixed. We say that $\bar
548: f_\lambda$ is a penalized empirical risk minimizer if it minimizes
549: \begin{equation}
550: \label{eq:pena_least_sq}
551: R_n(f) + \pen_\lambda(f)
552: \end{equation}
553: over $\mathcal F$, where $\pen_\lambda(f) := h^2 |f|_{\mathcal
554: F}^\alpha$ for some $\alpha > 0$ and where
555: \begin{equation*}
556: R_n(f) := \norm{Y - f}_n^2 = \frac{1}{n} \sum_{i=1}^n (Y_i -
557: f(X_i))^2
558: \end{equation*}
559: is the empirical risk of $f$ over the sample $D_n$.
560: \end{definition}
561:
562: The parameter $\alpha$ is a tuning parameter, which can be chosen
563: depending on the seminorm $|\cdot|_{\mathcal F}$, see the examples in
564: Section~\ref{sec:examples}. For simplicity, we shall always assume
565: that a PERM $\bar f_\lambda$ exists, since we can always find $\tilde
566: f_\lambda$ such that $R_n(\tilde f_\lambda) + \pen_{\lambda}(\tilde
567: f_\lambda) \leq \inf_{f \in \mathcal F} \{ R_n(f) + \pen_{\lambda}(f)
568: \} + 1 / n$ which satisfies the same upper bound from
569: Theorem~\ref{thm:least_sq} (see below) as a hypothetical $\bar
570: f_\lambda$. Moreover, a minimizer is not necessarily unique, but
571: this is not a problem for the theoretical results proposed below. PERM
572: has been studied in a tremendous number of papers; we only refer to
573: \cite{van_de_geer00, vdg07}, \cite{massart03} and \cite{kohler02},
574: which are the closest to the material proposed in this section.
575:
576: In Theorem~\ref{thm:least_sq} below we propose a general upper bound
577: for PERM over a space $\mathcal F$ that satisfies the complexity
578: Assumption $(C_\beta)$ below. The proof of this upper bound involves a
579: result concerning the supremum of the empirical process $Z(f) :=
580: \sigma n^{-1/2} \sum_{i=1}^n f(X_i) \varepsilon_i$ over $f \in
581: \mathcal F$ which is given in Theorem~\ref{thm:devia1} below.
582:
583: % \subsection{Main definitions}
584:
585: \subsection{Some definitions and useful tools}
586:
587: Let $(E, \norm{\cdot})$ be a normed space. For $z \in E$, we denote by
588: $B(z, \delta)$ the ball centered at $z$ with radius $\delta$. We say
589: that $\{ z_1, \ldots, z_p \}$ is a $\delta$-cover of some set $A
590: \subset E$ if
591: \begin{equation*}
592: A \subset \bigcup_{1 \leq i \leq p} B(z_i, \delta).
593: \end{equation*}
594: The \emph{$\delta$-covering number} $N(\delta, A, \norm{\cdot})$ is
595: the minimal size of a $\delta$-cover of~$A$ and
596: \begin{equation*}
597: H(\delta, A, \norm{\cdot}) := \log N(\delta, A, \norm{\cdot})
598: \end{equation*}
599: is the \emph{$\delta$-entropy} of $A$. The main assumption in this
600: section concerns the complexity of the space $\mathcal F$, which is
601: quantified by a bound on the entropy of its unit ball $B_{\mathcal F}
602: := \{ f \in \mathcal F : |f|_{\mathcal F} \leq 1 \}$. We denote for
603: short $H_\infty(\delta, A) = H(\delta, A, \norm{\cdot}_\infty)$ where
604: $\norm{f}_\infty := \sup_{x \in [0,1]^d} |f(x)|$. We denote by $C([0,
605: 1]^d)$ the set of continuous functions on $[0, 1]^d$.
606: \begin{assumption}[$C_\beta$]
607: We assume that $\mathcal F \subset C([0, 1]^d)$ and that there is a
608: number $\beta \in (0, 2)$ such that for any $\delta > 0$, we have
609: \begin{equation}
610: % \label{eq:covering_assumption}
611: H_\infty\big( \delta, B_{\mathcal F} \big)
612: \leq D \delta^{-\beta}
613: % \exp\Big( D \Big( \frac{R}{\delta} \Big)^{d/s} \Big),
614: \end{equation}
615: where $D > 0$ is independent of $\delta$.
616: \end{assumption}
617: This assumption entails that, for any radius $R > 0$, we have
618: \begin{equation*}
619: H_\infty\big( \delta, B_{\mathcal F}(R) \big) \leq D
620: \Big(\frac{R}{\delta}\Big)^{\beta}
621: \end{equation*}
622: where $B_{\mathcal F}(R) := \{ f \in \mathcal F : |f|_{\mathcal F}
623: \leq R \}$.
624: % that thatsince this assumption entails that for any ball Define the
625: % ball $\mathcal F(R) := \{ f \in \mathcal F : |f|_{\mathcal F} \leq R
626: % \}$. The main assumption in this section concerns the complexity of
627: % the space $\mathcal F$, which is quantified by a bound on the
628: % entropy of its balls $\mathcal F(R)$. We denote for short
629: % $H_\infty(\delta, A) = H(\delta, A, \norm{\cdot}_\infty)$ where
630: % $\norm{f}_\infty := \sup_{x \in [0,1]^d} |f(x)|$. We denote by
631: % $C([0, 1]^d)$ the set of continuous functions on $[0, 1]^d$.
632: % \begin{assumption}[$C_\beta$]
633: % We assume that $\mathcal F \subset C([0, 1]^d)$ and that there is a
634: % number $\beta \in (0, 2)$ such that for any positive $\delta$ and
635: % $R$, we have
636: % \begin{equation}
637: % % \label{eq:covering_assumption}
638: % H_\infty\big( \delta, \mathcal F(R) \big)
639: % \leq D \Big(\frac{R}{\delta}\Big)^{\beta}
640: % % \exp\Big( D \Big( \frac{R}{\delta} \Big)^{d/s} \Big),
641: % \end{equation}
642: % where $D > 0$ is independent of $\delta$ and $R$.
643: % \end{assumption}
644: % \begin{remark}
645: Assumption~$(C_\beta)$ is satisfied by nearly all smoothness spaces
646: considered in the nonparametric literature (at least when the smoothness
647: of the space is large enough compared to the dimension, see
648: below). The most general space that we consider in this paper and
649: which satisfies~$(C_\beta)$ is the anisotropic Besov space $B_{p,
650: q}^{\bs s}$, where $\bs s = (s_1, \ldots, s_d)$ is a vector of
651: positive numbers. This space is precisely defined in
652: Appendix~\ref{sec:appendix_approximation}. Each $s_i$ corresponds to
653: the smoothness in the direction $e_i$, where $\{ e_1, \ldots, e_d \}$
654: is the canonical basis of $\mathbb R^d$. The computation of the
655: entropy of $B_{p, q}^{\bs s}$ can be found in~\cite{triebel06}, we
656: give more details in Appendix~\ref{sec:appendix_approximation}. If
657: $\bs {\bar s}$ is the harmonic mean of $\bs s$, namely
658: \begin{equation}
659: \label{eq:harmonic_mean}
660: \frac{1}{\bs {\bar s}} := \frac{1}{d} \sum_{i=1}^d
661: \frac{1}{s_i},
662: \end{equation}
663: then $B_{p, q}^{\bs s}$ satisfies~$(C_\beta)$ with $\beta = d / \bs
664: {\bar s}$, given that $\bs {\bar s} > d / p$, which is the usual
665: condition to have the embedding $B_{p, q}^{\bs s} \subset C([0,
666: 1]^d)$.
667: %\end{remark}
668:
669: \begin{remark}
670: Under the restriction $\beta \in (0, 2)$, Dudley's entropy
671: integral satisfies
672: \begin{equation*}
673: \int_0^{ {\rm diam}( B_{\mathcal F}, \|\cdot\|_\infty)}
674: \sqrt{H_\infty(\delta, B_{\mathcal F})} d\delta<\infty,
675: \end{equation*}
676: where $\text{diam}(B_{\cF},\|\cdot\|_\infty)$ is the
677: $L_\infty$-diameter of $B_{\mathcal F}$. This is a standard
678: assumption coming from empirical process theory. It is related to
679: the so-called chaining argument, that we use in the proof of
680: Theorem~\ref{thm:devia1}. However, in order to consider a larger
681: space of functions $\mathcal F$, we could think of function spaces
682: with a complexity $\beta \geq 2$. In this case, using a slightly
683: different chaining argument (cf. \cite{vdVW:96}), the quantity
684: appearing in the upper bound of some subgaussian process is of the
685: type $\int_{c/\sqrt{n}}^{\text{diam}(B_{ \cF},\|\cdot\|_\infty)}
686: \sqrt{H_\infty(\delta, B_{\mathcal F})} d\delta$ which converges
687: whatever $\beta$ is. However, such considerations are beyond the
688: scope of the paper and are to be considered in a future work.
689: \end{remark}
690:
691:
692: % if $\bs s / s \in \mu \mathbb \log Z_+^d$ for some $\mu > 0$
693:
694: % $d / s $we denote by $s$ We give a precise overview of such results
695: % in Appendix~\ref{sec:appendix}.
696:
697: % this condition is
698:
699: % If
700: % $\mathcal F = B_{p,\infty}^s([0,1]^d)$, where
701: % $B_{p,\infty}^s([0,1]^d)$ is the Besov space with smoothness $s$
702: % (see~\cite{devore_lorentz93} for precise definitions and properties of
703: % Besov spaces), then condition~\eqref{eq:covering_assumption} holds,
704: % see~\cite{birge_massart00}. This result is precisely recalled in
705: % Theorem~\ref{thm:birge_massart}, see in Appendix.
706:
707: % if $|\mathcal|$In certain cases, an appropriate choice of $\alpha$
708: % allows to simplify minimization of \eqref{eq:pena_least_sq}, see the
709: % examples given below. This definition includes several standard
710: % estimators: smoothing splines (take $\mathcal F$ as a Sobolev space)
711: % and when $\mathcal F$ is a Besov space, $\bar f_\lambda$ is related
712: % to other popular denoising techniques. This is explained in details
713: % later in the Section.
714:
715: \subsection{About the supremum of the process $Z(\cdot)$}
716: \label{sec:process_Z0}
717:
718: The beginning of the proof of Theorem~\ref{thm:least_sq} is, as usual
719: with the proof of upper bounds for $M$-estimators, based on an
720: inequality that links the empirical norm of estimation and the
721: empirical process of the model. This idea goes back to key
722: papers~\cite{vandegeer90} and \cite{birge_massart93}, see
723: also~\cite{van_de_geer00, vdg07} and \cite{massart03} for a detailed
724: presentation. In regression, it reads, if $\bar f$ is a PERM and if
725: $f_0 \in \mathcal F$:
726: \begin{align*}
727: \norm{\bar f - f_0}_n^2 + \pen(\bar f) &\leq \frac{2}{\sqrt{n}}
728: Z_n(\bar f - f_0) + \pen(f_0),
729: % &\leq \sup_{f \in \mathcal F} \frac{2}{\sqrt{n}} Z_n(f - f_0) +
730: % \pen(f_0),
731: \end{align*}
732: where
733: \begin{equation}
734: \label{eq:Z_n_def}
735: Z_n(f) := \frac{\sigma}{\sqrt{n}} \sum_{i=1}^n f(X_i) \varepsilon_i.
736: \end{equation}
737: This inequality explains why the next Theorem~\ref{thm:devia1} is the
738: main ingredient of the proof of Theorem~\ref{thm:least_sq}
739: below. Then, an important remark is that~\eqref{eq:subgaussian}
740: entails
741: \begin{equation}
742: \label{eq:deviaZnf}
743: P_n[Z_n(f) > z] \leq \exp\Big( \frac{-z^2}{2 b^2 \norm{f}_n^2}
744: \Big)
745: \end{equation}
746: for any fixed $f$, $z > 0$ and $n \geq 1$, where $\norm{f}_n^2 :=
747: n^{-1} \sum_{i=1}^n f(X_i)^2$ and where we take for short $b := \sigma
748: b_\varepsilon$. This deviation inequality is at the core of the proof
749: of Theorem~\ref{thm:devia1} below. Let us introduce the
750: \emph{empirical ball} $B_n(f_0, \delta) := \{ f : \norm{f - f_0}_n
751: \leq \delta \}$ and let us recall that $P_n := P^n[\cdot | X^n]$ is
752: the joint law of the sample $D_n$ conditionally to the design $X^n =
753: (X_1, \ldots, X_n)$.
754:
755: \begin{theorem}
756: \label{thm:devia1}
757: Let $Z_n(\cdot)$ be the empirical process~\eqref{eq:Z_n_def} and
758: assume that $(\mathcal F, |\cdot|_{\mathcal F})$ satisfies
759: $(C_\beta)$. Then\textup, if $f_0 \in \mathcal F$\textup, we can
760: find constants $z_1 > 0$ and $D_1 > 0$ such that\textup:
761: \begin{align}
762: \label{eq:deviaZ_n}
763: P_n \Big[ \sup_{f \in \mathcal F \cap B_n(f_0, \delta)} \frac{
764: Z_n(f - f_0) }{\norm{f - f_0}_n^{1 - \beta / 2} (1 +
765: |f|_{\mathcal F})^{\beta / 2} } > z \Big] \leq \exp( - D_1 z^2
766: \delta^{-\beta} )
767: \end{align}
768: for any $\delta > 0$ and $z \geq z_1$ \textup(we recall that $\beta
769: \in (0, 2)$\textup).
770: \end{theorem}
771:
772: The proof of this Theorem is given in
773: Section~\ref{sec:proof_main_results}; it uses techniques from
774: empirical process theory such as peeling and chaining. It is a uniform
775: version of~\eqref{eq:deviaZnf}, localized around $f_0$ (for the
776: empirical norm). In this theorem, we use the ``weighting trick'' that
777: was introduced in~\cite{vandegeer90, van_de_geer00}: we divide
778: $Z_n(\cdot)$ by $\norm{f - f_0}_n$ and $|f|_{\mathcal F}$ in order to
779: counterbalance, respectively, the variance of $Z_n(\cdot)$ and the
780: massiveness of the class $\mathcal F$. This renormalization of the
781: empirical process is also at the core of the proof of
782: Theorem~\ref{thm:least_sq}.
783:
784: % \begin{remark}
785: % There is no measurability problem in the inequality stated in
786: % Theorem~\ref{thm:devia1} since the supremum holds over $\mathcal F$,
787: % which is assumed to be included in the separable space $C([0,
788: % 1]^d)$.
789: % \end{remark}
790:
791: % is close to results given in~\cite{van_de_geer00}, where a general
792: % presentation of the use of empirical process techniques for
793: % nonparametric estimation is proposed. See also~\cite{kohler02} for
794: % the situation where $|Y| \leq L$ almost surely for some constant $L
795: % > 0$ and~\cite{massart03} for a detailed presentation of the use of
796: % concentration inequalities in nonparametric statistics.
797:
798: % Thus, the proof relies on the study of the process $Z(\cdot)$. In
799: % Theorem~\ref{thm:devia1} below (see Section~\ref{sec:process_Z0}) we
800: % give a deviation inequality for the supremum of this process over a
801: % general space satisfying the complexity
802: % bound~\eqref{eq:covering_assumption}. This kind of result was
803: % previously used by~\cite{vandegeer90}, among many others, in order
804: % to derive upper bounds for least squares and penalized least squares
805: % estimators. See also~\cite{van_de_geer00}
806:
807:
808:
809: \subsection{Upper bound for the PERM}
810:
811: Theorem~\ref{thm:least_sq} below provides an upper bound for the mean
812: integrated squared error (MISE) of the PERM, both for integration
813: w.r.t. the empirical norm $\norm{f}_n^2 = n^{-1} \sum_{i=1}^n
814: f(X_i)^2$ and the norm $\norm{f}^2 := \int f(x)^2 P_X(dx)$.
815:
816: \begin{theorem}
817: \label{thm:least_sq}
818: Let $\mathcal F$ be a space of functions satisfying $(C_\beta)$.
819: % endowed with a seminorm $|\cdot|_{\mathcal F}$ which satisfies the
820: % covering preperty~\eqref{eq:covering_assumption} for some $s > d /
821: % 2$.
822: Let $\lambda = (h, \mathcal F)$ and $\bar f_{\lambda}$ be a PERM
823: given by~\eqref{eq:pena_least_sq}, where $h$ satisfies
824: \begin{equation}
825: \label{eq:bandwidth}
826: h = a n^{-1 / (2 + \beta)}
827: \end{equation}
828: for some constant $a > 0$ and where $\alpha > 2\beta / (\beta +
829: 2)$. If $f_0 \in \mathcal F$, we have\textup:
830: \begin{equation*}
831: E_n \norm{\bar f_{\lambda} - f_0}_n^2 \leq C_1(1 + |f_0|_{\mathcal
832: F}^\alpha) n^{-2 / (2 + \beta)}
833: \end{equation*}
834: for $n$ large enough, where $C_1$ is a fixed constant depending on
835: $a$, $\beta$, $\alpha$ and $b$. If we assume further that
836: $\norm{\bar f_\lambda - f_0}_\infty \leq Q$ a.s. for some constant
837: $Q > 0$, we have
838: \begin{equation*}
839: E^n \norm{\bar f_\lambda - f_0}^2 \leq C_2 (1 + |f_0|_{\mathcal
840: F}^\alpha ) n^{-2 / (2 + \beta)}
841: \end{equation*}
842: for $n$ large enough, where $C_2$ is a fixed constant depending on
843: $C_1$ and $Q$.
844: \end{theorem}
845:
846: % \begin{remark}
847: % Theorem~\ref{thm:least_sq} improves previous results
848: % by~\cite{kohler02}, see in particular Chapter~21, in several
849: % ways. The class $\mathcal F$ here is very general, while it is a
850: % Sobolev class in~\cite{kohler02}. We do not need to assume that $|Y|
851: % \leq L$, and the rate in Theorem~\ref{thm:least_sq} corresponds to
852: % the minimax optimal rate (for a Sobolev class for instance), since
853: % there is not extra $\log n$ terms.
854: % \end{remark}
855:
856:
857: \begin{remark}
858: Theorem~\ref{thm:least_sq} holds if we truncate $\bar f_\lambda$ by
859: some constant $Q$ such that $\norm{f_0}_\infty \leq Q$. Such a
860: truncation cannot be avoided in such a general regression
861: setting. Indeed, the PERM is, without truncation, in general
862: inconsistent; see the example from Problem~20.4, p.~430
863: in~\cite{kohler02}.
864: \end{remark}
865:
866: \begin{remark}
867: Theorem~\ref{thm:least_sq} holds for any design law $P_X$, even for
868: the degenerate case where $P_X = \delta_x$ for some fixed point $x
869: \in [0,1]^d$, where $\delta$ is the Dirac probability measure. Of
870: course, in this case, the rate $n^{-2 / (2 + \beta)}$ becomes
871: suboptimal, since the estimation problem with such a $P_X$ is no
872: longer ``truly nonparametric''. Indeed, for a discrete $P_X$ with
873: finite support, it is proved in~\cite{hamers_kohler04} that the
874: optimal rate is the parametric rate $1/n$ using a local averaging
875: estimator.
876: \end{remark}
877:
878: % Several consequences of Theorem~\ref{thm:least_sq} are given in
879: % Section~\ref{sec:examples}, such as the convergence rates of the
880: % PERM in the anisotropic Besov space $B_{p, q}^{\bs s}$, the
881: % convergence rates for PERM in reproductive kernel Hilbert spaces,
882: % and several smoothing spline type estimators, such as the so-called
883: % thin plate spline, or an estimator that we call anisotropic spline
884: % smoother, which was, as far as we know, not previously considered in
885: % literature.
886:
887: \subsection{About the smoothing parameter $h$}
888: \label{sec:about_h}
889:
890: It is well-known that in practice, the choice of the parameter $h$ is
891: of first importance. From the theoretical point of view, in order to
892: make $\bar f_\lambda$ rate-optimal, $h$ must be of the order of a
893: quantity involving the complexity of $\mathcal F$: see
894: condition~\eqref{eq:bandwidth} on the bandwidth and the
895: Assumption~$(C_\beta)$. This problem is commonplace in nonparametric
896: statistics. Indeed, the role of the penalty
897: in~\eqref{eq:pena_least_sq} is to counterbalance the
898: massiveness of the space $\mathcal F$. Without this penalty, or if $h$
899: is too small, $\bar f_{\lambda}$ roughly interpolates the data, which
900: is not suitable when the aim is denoising (this phenomenon is called
901: \emph{overfitting}).
902:
903: Of course, the complexity parameter $\beta$ is unknown to the
904: statistician, and even worse, it does not necessarily make sense in
905: practice. So, several procedures are proposed to select $h$ based on
906: the data. The most popular are the leave-one-out cross validation (CV)
907: and the simpler generalized cross validation (GCV), which is often
908: used with smoothing spline estimators because of its computational
909: simplicity, see~\cite{wahba90} among others. Such methods are known to
910: provide good results in most cases. However, there are, as far as we
911: know, no convergence rate results for estimators based on CV or GCV
912: selection of smoothing parameters. In Section~\ref{sec:examples}
913: below, we propose an alternative approach. Indeed, instead of
914: selecting one particular $h$, we mix several estimators computed for
915: different $h$ in some grid using an aggregation algorithm. This
916: aggregation algorithm is described in Section~\ref{sec:ERM_finite}. We
917: show that this approach allows us to construct adaptive estimators with
918: optimal rates of convergence in several particular cases, see
919: Section~\ref{sec:examples}. Moreover, we show empirically in
920: Section~\ref{sec:simulations} that the aggregation approach is more
921: stable than CV or GCV when the number of observations is small.
922:
923:
924:
925: % \begin{remark}
926: % An inspection of the proof of Theorem~\ref{thm:least_sq} shows
927: % that the term $o(h^2)$ is going to zero as $h$ goes to $0$
928: % faster than any power function of $m$. When $h$ is of order
929: % $m^{-s/(2s + 1)}$, which is the best choice theoretically, we
930: % have
931: % \begin{equation*}
932: % \sup_{f \in \mathcal F(R)} E \norm{\bar f - f}_{L^2(P_X^m)}^2 \leq
933: % (C_1 + 2 R^2) m^{-2s / (2s + d)}
934: % \end{equation*}
935: % which is the standard minimax convergence rate over classes with
936: % smoothness $s$, at least when $P_X$ has a density with respect to
937: % the Lebesgue measure which is continuous and bounded away from
938: % $0$. Such smoothness classes include Sobolev balls (for $s >
939: % d/2$) and Besov balls ...
940: % \end{remark}
941:
942: % \begin{remark}
943: % In the proof of Theorem~\ref{thm:least_sq}, we do not use the
944: % explicit form of the estimator $\bar f_{\mathcal F}$: we only need
945: % the minimization property~\eqref{eq:pena_least_sq}. This entails
946: % that the scheme of proof is quite generic, and could be used for
947: % other estimators as well (namely, $M$-estimators.) This scheme of
948: % proof was previously used in the key paper~\cite{vandegeer90}, see
949: % also~\cite{van_de_geer00}. It relies on a deviation inequality for
950: % the supremum of a particular empirical process over a smoothness
951: % class $\mathcal F$, which is stated in Section~\ref{sec:process_Z0}
952: % below.
953: % \end{remark}
954:
955: % We first prove that the ``natural'' aggregation procedure, namely
956: % empirical risk minimization (or its penalized version), fails to
957: % achieve the optimal rate of aggregation in this setup. This
958: % motivates the choice
959:
960: % In this section, we explore some statistical properties of penalized
961: % empirical risk minimization over a finite set of functions.
962:
963: % In general, given is a data set $D_n$, we can consider two
964: % problems. The first one is the problem of estimation treated in the
965: % previous sections. Namely, we aim at constructing some procedure
966: % $\bar{f}$ satysfying
967: % \begin{equation}
968: % \label{eq:RateOfConvergence}
969: % E \|\bar{f}-f_0 \|^2 \leq \psi(n)
970: % \end{equation}
971: % where $\psi(n)$, called the {\it rate of convergence}, is a quantity
972: % we wish very small as $n$ increases. To get this kind of inequality,
973: % we have to assume $f_0$ to belong to a set with a small complexity (at
974: % least compact). That is the reason why we introduced Assumption
975: % ($C_\beta$) in Section~\ref{sec:pena_least_squares}. Actually, this
976: % kind of ``a priori'' may not be fulfilled. That is why the second
977: % problem, called {\it agnostic learning} has been introduced. For this
978: % problem, one is given a set $F$ of functions. Without any assumption
979: % on $f_0$, we want to construct (from the data) a procedure $\tilde{f}$
980: % which has a risk as close as possible to the smallest risk over
981: % $F$. Namely, we want to obtain {\it oracle inequalities}, that is
982: % inequalities of the form
983: % \begin{equation*}
984: % E \| \tilde{f} - f_0 \|^2 \leq C \min_{f\in F} \|f - f_0 \|^2 +
985: % \phi(n,F),
986: % \end{equation*}
987: % where $C \geq 1$ and $\phi(n,F)$ is called the {\it residue}, which
988: % is the quantity that we want to be small as $n$ increases. When $F$
989: % is of finite cardinality $M$, the agnostic problem is called {\it
990: % aggregation problem} and the residue $\phi(n,F) = \phi(n,M)$ is
991: % called {\it rate of aggregation}. The main difference between the
992: % problems of estimation and aggregation is that we don't need any
993: % assumption on $f_0$ for the second problem. Nevertheless,
994: % aggregation method have been widely used to construct adaptive
995: % procedures for the estimation problem. That is the reason why we
996: % study aggregation procedures in this section. We will use these
997: % procedures to construct estimation procedures which will be adaptive
998: % to the complexity parameter $\beta$ introduced in Assumption
999: % ($C_\beta$).
1000:
1001:
1002: \section{PERM and aggregation over a finite set of functions}
1003: \label{sec:ERM_finite}
1004:
1005: Let us fix a set $F(\Lambda) := \{ f_\lambda : \lambda \in \Lambda \}$
1006: of arbitrary functions, and denote by $M = |\Lambda|$ its
1007: cardinality. % We will choose specific sets $F(\Lambda)$ in
1008: % Section~\ref{sec:examples}, but in this section it remains generic.
1009:
1010: \subsection{Suboptimality of PERM over a finite set}
1011:
1012: In this section, we prove that minimizing the empirical risk
1013: $R_n(\cdot)$ (or a penalized version) on $F(\Lambda)$ is a suboptimal
1014: aggregation procedure in the sense of~\cite{tsy:03}. According to
1015: \cite{tsy:03}, the optimal rate of aggregation in the Gaussian
1016: regression model is $(\log M) /n$. This means that it is the minimum
1017: price one has to pay in order to mimic the best function among a class
1018: of $M$ functions with $n$ observations. This rate is achieved by the
1019: aggregate with cumulative exponential weights, see~\cite{catbook:01}
1020: and~\cite{jrt:06}.
1021: % temperature parameter $T\geq 2 \max_{f\in F(\Lambda)} \| f_0 -
1022: % f\|_\infty^2 + 2\sigma^2$
1023: In Theorem~\ref{TheoWeaknessERMRegression} below, we prove that the
1024: usual PERM procedure cannot achieve this rate and thus, that it is
1025: suboptimal compared to the aggregation methods with exponential
1026: weights. The lower bounds for aggregation methods appearing in the
1027: literature (see~\cite{tsy:03, jrt:06, LecJMLR:06}) are usually based
1028: on minimax theory arguments. The one considered here is based on
1029: geometric considerations, and involves an explicit example that makes
1030: the PERM fail. For that, we consider the Gaussian regression model
1031: with uniform design.
1032: \begin{assumption}[G]
1033: Assume that $\varepsilon$ is standard Gaussian and that $X$ is
1034: univariate and uniformly distributed on $[0, 1]$.
1035: \end{assumption}
1036: % where the design is uniformly distributed on $[0,1]$. That is the
1037: % model \eqref{eq:model} where $X$ has a uniform distribution on
1038: % $[0,1]$ (we consider here the case $d=1$) where the noise
1039: % $\varepsilon$ is a standard normal Gaussian variable.
1040: \begin{theorem}
1041: \label{TheoWeaknessERMRegression}
1042: Let $M \geq 2$ be an integer and assume that \textup{(G)} holds. % In
1043: % the gaussian regression model with a design uniformly distributed
1044: % on $[0,1]$,
1045: We can find a regression function $f_0$ and a family $F(\Lambda)$ of
1046: cardinality $M$ such that, if one considers a penalization
1047: satisfying $|\pen(f)| \leq C \sqrt{(\log M)/n}, \forall f \in
1048: F(\Lambda)$ with $0\leq C <\sigma (24\sqrt{2}c^*)^{-1}$ \textup($c^*$ is
1049: an absolute constant from the Sudakov minorization, see
1050: Theorem~\ref{TheoSudakov} in
1051: Appendix~\ref{sec:appendix_proba}\textup), the PERM procedure
1052: defined by
1053: \begin{equation*}
1054: \tilde{f}_n \in \argmin_{f \in F(\Lambda)}( R_n(f) + \pen(f))
1055: \end{equation*}
1056: satisfies
1057: \begin{equation*}
1058: E^n \| \tilde{f}_n - f_0 \|^2 \geq \min_{f \in
1059: F(\Lambda)} \| f - f_0 \|^2 + C_3 \sqrt{\frac{\log
1060: M}{n}}
1061: \end{equation*}
1062: for any integer $n \geq 1$ and $M\geq M_0(\sigma)$ such that $n^{-1}
1063: \log[(M-1)(M-2)] \leq 1/4$ where $C_3$ is an absolute constant.
1064: \end{theorem}
1065: This result shows that, in some particular cases, the PERM cannot
1066: mimic the best element in a class of cardinality $M$ faster than
1067: $((\log M)/n)^{1/2}$. This rate is very far from the optimal one
1068: $(\log M)/n$.
1069:
1070: Let $F(\Lambda)$ be the set that we consider in the proof of
1071: Theorem~\ref{TheoWeaknessERMRegression} (see
1072: Section~\ref{sec:proof_main_results} below), and take $\pen(f) = 0$.
1073: Using Monte-Carlo (we do $5000$ loops), we compute the excess risk $E
1074: \| \tilde{f}_n - f_0 \|^2 - \min_{f \in F(\Lambda)} \| f - f_0 \|^2$
1075: of the ERM. In Figure~\ref{fig:subERM} below, we compare the excess
1076: risk and the bound $((\log M) / n)^{1/2}$ for several values of $M$
1077: and $n$. It turns out that, for this set $F(\Lambda)$, the lower bound
1078: $((\log M) / n)^{1/2}$ is indeed accurate for the excess
1079: risk. Actually, by using the classical symmetrization argument and
1080: Dudley's entropy integral, it is easy to obtain an upper bound for the
1081: excess risk of the ERM of the order of $((\log M) / n)^{1/2}$ for any
1082: class $F(\Lambda)$ of cardinality $M$.
1083:
1084: \begin{figure}[htbp]
1085: \centering
1086: \includegraphics[width=4.3cm]{excess1.pdf}%
1087: \includegraphics[width=4.3cm]{excess2.pdf}%
1088: \includegraphics[width=4.3cm]{excess3.pdf}%
1089: \caption{The excess risk of the ERM compared to $((\log M) /
1090: n)^{1/2}$ for several values of $M$ and $n$
1091: \textup($x$-axis\textup)}
1092: \label{fig:subERM}
1093: \end{figure}
1094:
1095: \subsection{Aggregation}
1096: \label{sec:aggregation}
1097:
1098: % Let $F(\Lambda) = \{ f_\lambda : \lambda \in \Lambda \}$ be a finite
1099: % class of functions. In what follows, $ f_\lambda $ will be one of
1100: % the non-adaptive PERM defined in the previous section and
1101: % constructed with only a part of the data wich is assumed to be fixed
1102: % in this section.
1103: For each $ f_\lambda \in F(\Lambda)$, we compute a weight $\theta(
1104: f_\lambda) \in [0,1]$ such that $\sum_{\lambda \in \Lambda} \theta(
1105: f_{\lambda}) = 1$. These weights give a level of significance to each
1106: $ f_\lambda \in F(\Lambda)$. The aggregated estimator is then the
1107: convex combination
1108: \begin{equation}
1109: \label{eq:aggregate}
1110: \hat {\mathsf f} := \sum_{\lambda \in \Lambda} \theta(f_\lambda)
1111: f_\lambda,
1112: \end{equation}
1113: where the weight of $f \in F(\Lambda)$ is given by
1114: \begin{equation}
1115: \label{eq:weights}
1116: \theta(f) := \frac{\exp\big( - n R_{n}(f) / T
1117: \big)}{\sum_{\lambda \in \Lambda} \exp\big(-n R_{n}(
1118: f_\lambda)/T \big) },
1119: \end{equation}
1120: where $T > 0$ is the so-called \emph{temperature} parameter and where
1121: $R_n(f)$ is the empirical risk of $f$. This aggregation algorithm
1122: (with ``Gibbs'' or ``exponential'' weights) can also be found for
1123: instance in~\cite{catbook:01, leung_barron06, juditsky_etal05,
1124: juditsky_nazin05, yang:00, yang04, LecAoS:07}. See
1125: also~\cite{gaiffas_lecue07} for adaptation by aggregation in a
1126: semiparametric model.
1127:
1128: The next theorem is an oracle inequality for the aggregation
1129: method~\eqref{eq:weights}. It will be useful to derive the adaptive
1130: upper bounds stated in Section~\ref{sec:examples} below.
1131: \begin{theorem}
1132: \label{thm:oracle}
1133: % We assume that the noise $\varepsilon$ is symmetric.
1134: Assume that for any $f \in F(\Lambda)$, we have $\norm{f -
1135: f_0}_\infty \leq Q$ for some $Q > 0$. For any $a > 0$, the
1136: aggregation method~\eqref{eq:weights} satisfies
1137: \begin{equation*}
1138: E^n \norm{\hat {\mathsf f} - f_0}^2 \leq (1+ a) \min_{f \in
1139: F(\Lambda)} \norm{f - f_0}^2 + (C + T) \frac{(\log
1140: n)^{1/2} \log M}{n},
1141: \end{equation*}
1142: where $C$ is a constant depending on $a, Q$ and $\sigma$.
1143: \end{theorem}
1144: When $T$ is too large, the weights~\eqref{eq:weights} are close to the
1145: uniform law over the set of weak estimators, and of course, the
1146: resulting aggregate is inaccurate. When $T$ is too small, one weight
1147: is close to $1$, and the others close to $0$: in this situation, the
1148: aggregate does nearly the same job as the ERM procedure. This is not
1149: suitable since Theorem~\ref{TheoWeaknessERMRegression} told us that
1150: ERM is suboptimal. Hence, $T$ realizes a tradeoff between the ERM and the
1151: uniform weights procedure.
1152: % It is a $T$ is somehow a regularization parameter of this tradeoff.
1153: % the estimator obtained by empirical risk minimization (ERM). This
1154: % behavior can be also explained by
1155: % equation~\eqref{eq:oracle_minimization} in the proof of
1156: % Theorem~\ref{thm:oracle}. Indeed, the exponential
1157: % weights~\eqref{eq:weights} A counterpart of the oracle inequality is
1158: % Theorem~\ref{TheoWeaknessERMRegression}, where we show that any
1159: % penalized empirical risk minimization algorithm is suboptimal
1160: % compared to the cumulative version of the aggregation algorithm
1161: % (\ref{eq:aggregate}) . This result tell us that $T$ shall not be too
1162: % large, since when $T$ is large, the aggregation algorithm
1163: % (\ref{eq:aggregate}) is close to the empirical risk minimization,
1164: % which is suboptimal (see Theorem~\ref{TheoWeaknessERMRegression}).
1165: It can be simply chosen by minimization of the empirical risk. We know
1166: empirically that it provides good results, see~\cite{gaiffas_lecue07}.
1167: Namely, we select the temperature
1168: \begin{equation}
1169: \label{Tslection}
1170: \hat T := \argmin_{T \in \mathcal T} \sum_{i=1}^n \big( Y_i - \hat
1171: {\mathsf f}^{(T)} (X_i) \big)^2,
1172: \end{equation}
1173: where $\hat {\mathsf f}^{(T)}$ is the aggregated
1174: estimator~\eqref{eq:aggregate} with temperature $T$ and where
1175: $\mathcal T$ is some set of temperatures. This is what we do in the
1176: empirical study conducted in Section~\ref{sec:simulations}.
1177:
1178: % The ERM already gives good results, but if $T$ is chosen carefully,
1179: % we expect to obtain an estimator which outperforms the ERM.
1180:
1181: % This fact is confirmed by the numerical study conducted in
1182: % Section~\ref{sec:numerical}, where the choice of $T$ is done using a
1183: % simple leave-one-out cross-validation algorithm over the whole
1184: % sample for aggregates obtained with several $T$.
1185:
1186: % We can understand the aggregation algorithm in the following way:
1187: % first, we compute the least squares of each weak estimators. This is
1188: % the most natural way of assessing the level of significance of some
1189: % estimator among the other ones. Then, we put a Gibbs law over the
1190: % set of weak estimators. The mass of each estimator relies on its
1191: % least squares (over the learning sample). Finally, the aggregate is
1192: % simply the mean expected estimator according to this law. In
1193: % Section~\ref{sec:aggregation}, we propose an oracle inequality for
1194: % the aggregation algorithm (see Theorem~\ref{thm:oracle}), which is
1195: % the key result in the proof of the adaptive upper bound stated in
1196: % Theorem ?. The choice of the temperature parameter $T$ is discussed
1197: % in Section~\ref{sec:aggregation}.
1198:
1199:
1200: % Second, any penalized selection algorithm (ie, an algorithm that
1201: % selects a particular $\bar f_\lambda$ among $F(\Lambda)$ via a
1202: % penalized least squares minimization criterion) is suboptimal
1203: % compared to an aggregation procedure.
1204:
1205:
1206: % Another strategy for an adaptive choice of the smoothing parameter
1207: % $h$ in penalized least squares is complexity reguralization, which
1208: % was initiated by Vapnik, see~\cite{vapnik98}, and~\cite{kohler02},
1209: % among others. In \cite{kohler02}, the complexity regularization
1210: % approach is adopted to construct an adaptive estimator of the
1211: % regression. In this book, rates of convergence for the least squares
1212: % and penalized least squares estimators are given in the so-called
1213: % ``distribution free'' framework, where it is assumed that $|Y| \leq
1214: % L$ almost surely for a known positive constant $L$, and where there
1215: % is no assumption on $P_X$. In literature, the assumption $|Y| \leq
1216: % L$ is mandatory in order to derive rates of convergence in this
1217: % general setting for $P_X$. Note that this is also the standard
1218: % setting in learning theory. In Chapter~21 from~\cite{kohler02}, an
1219: % upper bound is obtained for the penalized least squares estimator,
1220: % in the case where $X$ is univariate and $\mathcal F$ is a Sobolev
1221: % space (smoothing splines). Herein, the convergence rate is shown to
1222: % be of order $(\log n)^2 n^{-2s/(2s + 1)}$ which is, up to the $(\log
1223: % n)^2$ term, optimal in this context. Thus, the results stated in
1224: % Section ???? improves upon complexity regularization in several
1225: % ways: the results are adaptive, holds in the multivariate case,
1226: % unbounded response $Y$ are taken into account, other spaces than the
1227: % Sobolev space can be considered and the rates are optimal (without
1228: % an extra logarithm).
1229:
1230: % \subsection{Oracle inequality}
1231:
1232:
1233:
1234:
1235: % \begin{remark} %[Why don't we use the standard aggregation algorithm?]
1236: % The standard aggregation algorithm (with exponantial weights) in the
1237: % regression model is somewhat different from the one considered
1238: % here. Usually, the weights are a Gibbs law over the set of
1239: % estimators, with potential equals to the least squares over the
1240: % learning sample. Here, we considered a potential equals to the
1241: % penalized least squares. This weighting scheme is somewhat tuned to
1242: % the situation where the weak estimators (or \emph{weak learners})
1243: % are penalized least squares. The reason is the following: actually,
1244: % the aggregation estimator is a reguralized version of the empirical risk
1245: % minimizer estimator (ERM). It does a better job than the ERM when
1246: % the temperature parameter (which can be understood as a
1247: % reguralization parameter) is not too large.
1248:
1249: % is This allows to
1250: % construct an adaptive estimator that does a better than more popular
1251: % techniques for selecting the smoothness parameter $h$, such as the
1252: % GCV technique, which provides satisfactory results is most cases.
1253: % \end{remark}
1254:
1255:
1256: % We recall that $m < n$ is the training sample size, which is a
1257: % fraction of $n$ \texttt{ATTENTION !} We recall that $D_m$, $D_{(m)}$
1258: % and $D_n$ stand for the training, the learning, and the whole sample
1259: % (respectively). We denote, repsectively, by $P^m$, $P^{(m)}$ and by
1260: % $P^n$ the corresponding empirical measures, and by $P_X^m$,
1261: % $P_X^{(m)}$ and $P_X^n$ the empirical measures for $X$. Moreover, for
1262: % short, we shall denote $\norm{f}^2 := \int f^2 d P_X$ and
1263: % $\norm{f}_n^2 = \int f^2 dP_X^n$, and we consider
1264: % $\prodsca{\cdot}{\cdot}$ and $\prodsca{\cdot}{\cdot}_n$ the associated
1265: % inner products. We define in the same way $\norm{f}_m$ and
1266: % $\norm{f}_{(m)}$. % In this section, we shall denote by $f_0$ the true
1267: % % regression function.
1268:
1269: % We denote $\bar f$ and $J(f)$ instead of $\bar f_{\lambda}$ and
1270: % $J_s(f)$. We recall that $\pen(f) = h^2 J(f)^2$, where $J(f)^2 = 1 +
1271: % \norm{f}_\infty^2 + \tilde J(f)$. We denote by $|A|$ the cardinal of a
1272: % finite set $A$. We denote $\varepsilon = (\varepsilon_1, \ldots,
1273: % \varepsilon_n)$, and by convention $\norm{\varepsilon}_n^2 = \sum_{1 \leq i
1274: % \leq n} \varepsilon_i^2 / n$, with the same definition for $Y = (Y_1,
1275: % \ldots, Y_n)$.
1276:
1277:
1278: % The resulting estimator is \emph{adaptive}, as showed below in the
1279: % Section, and as shown numerically in Section~\ref{sec:simulations}.
1280:
1281:
1282: % shall \emph{adapt} both to the complexity of $\mathcal F$ where
1283: % $f_0$ belongs to, which is measured by some smoothness paramerer
1284: % $s$, see~\eqref{eq:covering_assumption}, and to the smoothness
1285: % parameter $h$. \texttt{pas terrible la derniere phrase, et mal dit}
1286:
1287: \section{Examples of adaptive results}
1288: \label{sec:examples}
1289:
1290:
1291: %\section{Examples of PERM over large function sets}
1292:
1293: In this section, we construct adaptive estimators for several
1294: regression problems using the tools from
1295: Sections~\ref{sec:pena_least_squares} and~\ref{sec:ERM_finite}. This
1296: involves, as usual with algorithms coming from statistical learning
1297: theory, a split of the sample into two parts (an exception can be
1298: found in~\cite{leung_barron06}). The main steps of the construction of
1299: adaptive estimators given in this section are:
1300: \begin{enumerate}
1301: \item split, at random, the whole sample $D_n$ into a \emph{training
1302: sample}
1303: \begin{equation*}
1304: D_m := [(X_i, Y_i) : 1 \leq i \leq m],
1305: \end{equation*}
1306: where $m < n$, and a \emph{learning sample}
1307: \begin{equation*}
1308: D_{(m)} := [(X_i, Y_i) : m + 1 \leq i \leq n];
1309: \end{equation*}
1310: \item choose a set $\Lambda$ of parameters and compute, using the
1311: training sample $D_m$, the corresponding class $F(\Lambda) = \{ \bar
1312: f_\lambda : \lambda \in \Lambda \}$ of PERM (see
1313: Definition~\ref{def:perm} in
1314: Section~\ref{sec:pena_least_squares}). Each $\Lambda$ depends on the
1315: considered problem of adaptive estimation, see below;
1316: \item using the learning sample $D_{(m)}$, compute the aggregation
1317: weights and the aggregated estimator $\hat {\mathsf f}_n$,
1318: respectively given by Equations~\eqref{eq:weights}
1319: and~\eqref{eq:aggregate}.
1320: \end{enumerate}
1321:
1322: Then, using Theorem~\ref{thm:least_sq} (see
1323: Section~\ref{sec:pena_least_squares}) and Theorem~\ref{thm:oracle}
1324: (see Section~\ref{sec:ERM_finite}), we will derive adaptive upper
1325: bounds for estimators $\hat {\mathsf f}_n$ constructed in this
1326: way. Throughout the section, we shall assume the following.
1327:
1328: \begin{assumption}[Split size]
1329: Let $\ell$ be the learning sample size, so that $\ell + m = n$. We shall
1330: assume from now on, to simplify the presentation, that $\ell$ is a
1331: fraction of $n$, typically $n/2$ or $n/4$.
1332: \end{assumption}
1333:
1334: \subsection{About the split, jackknife}
1335: \label{sec:jackknife}
1336:
1337: % \begin{remark}[Jackknife]
1338: The behavior of the aggregate $\hat {\mathsf f}_n$ can depend strongly
1339: on the split selected in Step~1, in particular when the number of
1340: observations is small. Hence, a good strategy is to jackknife: repeat,
1341: say, $J$ times Steps 1--3 to obtain aggregates $\{ \hat {\mathsf
1342: f}_n^{(1)}, \ldots, \hat {\mathsf f}_n^{(J)} \}$, and compute the
1343: mean:
1344: \begin{equation*}
1345: \hat {\mathsf f}_n := \frac{1}{J} \sum_{j=1}^J \hat {\mathsf
1346: f}_n^{(j)}.
1347: \end{equation*}
1348: This jackknifed estimator provides better results than a single
1349: aggregate, see Section~\ref{sec:simulations} for an empirical study,
1350: where we show also that it gives more stable estimators than the ones
1351: involving cross-validation or generalized cross-validation. By
1352: convexity of $f \mapsto \norm{f - f_0}^2$, the jackknifed estimator
1353: satisfies the same upper bounds as a single aggregate: each of the
1354: adaptive upper bounds stated below also holds when we use the
1355: jackknife.
1356:
1357: For the set of weak estimators considered in this paper, the split of
1358: the data is not a theoretical artefact. Indeed, if one skips Step~1
1359: (compute $F(\Lambda)$ and $\hat {\mathsf f}_n$ using the whole sample
1360: $D_n$), then $\hat {\mathsf f}_n$ has a very poor performance. An
1361: empirical illustration of this phenomenon is given in
1362: Figure~\ref{fig:split_effect}. Herein, we show the aggregation
1363: weights~\eqref{eq:weights} when the data is split and when it is
1364: not split. We consider a univariate design and cubic smoothing
1365: splines. Namely, we compute the set $F(\Lambda)$ of PERM
1366: (see~\eqref{eq:pena_least_sq}) with $\mathcal F = \{ f \in L^2([0, 1])
1367: : \int f^{(2)}(t)^2 dt < +\infty \}$ and penalty $\pen(f) = h^2 \int
1368: f^{(2)}(t)^2 dt$, where $f^{(2)}$ stands for the second derivative of
1369: $f$. We do that for several smoothing parameters $h$ in a grid $H$, so
1370: that $\Lambda := \{ (h, \mathcal F) : h \in H \}$. We used the
1371: \texttt{smooth.spline} routine in the \texttt{R} software to compute
1372: $F(\Lambda)$.
1373: \begin{figure}[htbp]
1374: \centering
1375: \includegraphics[width=6cm]{weightssplit.pdf}%
1376: \includegraphics[width=6cm]{weightsnosplit.pdf}%
1377: \caption{Aggregation weights with split \textup(left\textup) and
1378: without split \textup(right\textup) and smoothing parameter
1379: obtained by cross-validation \textup(vertical line\textup)}
1380: \label{fig:split_effect}
1381: \end{figure}
1382: In Figure~\ref{fig:split_effect}, the x-axis is related to the value
1383: of $h$: it is the value of the parameter \texttt{spar} from the
1384: \texttt{smooth.spline} routine. The vertical line is the value of
1385: \texttt{spar} selected by cross-validation. The conclusion from
1386: Figure~\ref{fig:split_effect} is that, when the data is not split,
1387: an overfitting phenomenon occurs: the aggregation algorithm does not
1388: work, since it does not concentrate around a value of
1389: \texttt{spar}. Of course, the resulting aggregated estimator has a
1390: very poor performance.
1391:
1392:
1393: % \subsection{Weak estimators\textup: penalized least squares}
1394: % Using the training sample, we compute a family
1395: % \begin{equation*}
1396: % F(\Lambda) := \{ \bar f_\lambda : \lambda \in \Lambda \}
1397: % \end{equation*}
1398: % of \emph{weak} estimators of the regression $f_0$. Each of these
1399: % estimators depend on a parameter $\lambda$ which makes them work
1400: % based on the data ``as if'' $f_0$ had some prescribed
1401: % properties. The parameter $\lambda$ writes $\lambda = (h, \mathcal
1402: % F)$, where $h > 0$ is a smoothing parameter, and where $\mathcal F$
1403: % is a smoothness space of function endowed with a seminorm
1404: % $|\cdot|_{\mathcal F}$. The estimator $\bar f_\lambda$ is not
1405: % adaptive, since it depends on the choice of the tuning parameters
1406: % $h$ and $\mathcal F$ (we recall that we write $\lambda = (h,
1407: % \mathcal F)$ for short). An obvious
1408:
1409: % \begin{remark} (ne pas enlever cette remarque du tex
1410: % The following criticism about data splitting is obvious: the weak
1411: % estimators only use the training sample, which is smaller (typically
1412: % two times smaller) than the whole sample, so each of them is less
1413: % accurate than an estimator using the whole sample. This remark holds
1414: % true when the learning sample is used to select one of them. If we
1415: % do not select one of them, but mixes all of them according to the
1416: % aggregation algorithm~(\ref{eq:aggregate}) for instance, then this
1417: % is no more the case. We give an empirical evidence of this fact in
1418: % Section~\ref{sec:simulations}, where we compare the CV (cross
1419: % validation) and GCV (generalized cross validation) methods with our
1420: % aggregation approach for the selection of the parameter $h$ in cubic
1421: % spline estimation.
1422: % \end{remark}
1423:
1424:
1425: \subsection{How to derive the adaptive upper bounds}
1426: \label{sec:derive_adaptive}
1427:
1428: In every example considered below, the scheme to derive adaptive
1429: upper bounds is as follows. Say that $(\mathcal F_\beta : \beta \in
1430: B)$ is a set of embedded function classes ($\mathcal F_\beta \subset
1431: \mathcal F_{\beta'}$ if $\beta < \beta'$) where each $\mathcal
1432: F_\beta$ satisfies Assumption~$(C_\beta)$. Let $B_n$ be an appropriate
1433: discretization of $B$. Let $\hat {\mathsf f}_n$ be the aggregated
1434: estimator obtained using Steps~1--3 (see the beginning of the
1435: section), with parameter $\Lambda = \Lambda_n = \{ (n^{-2 / (2 +
1436: \beta)}, \mathcal F_\beta) : \beta \in B_n \}$ and let $M_n$ be the
1437: cardinality of $F(\Lambda_n)$. Let $E^{m}$ and $E^{(m)}$ be the
1438: expectations with respect to, respectively, the joint laws of $D_m$ and
1439: $D_{(m)}$, so that, by independence, we have $E^n[\cdot] =
1440: E^m[E^{(m)}[\cdot]]$. Let $f_0 \in \mathcal F_{\beta_0}$ for some
1441: $\beta_0 \in B$. Using Theorem~\ref{thm:oracle}, we have
1442: \begin{align*}
1443: E^{(m)} \norm{\hat {\mathsf f}_n - f_0}^2 &\leq C \min_{f \in
1444: F(\Lambda_n)} \norm{f - f_0}^2 + \frac{C (\log
1445: n)^{1/2} \log M_n}{n} \\
1446: & \leq C \norm{\bar f_{\lambda_n} - f_0}^2 + \frac{C (\log n)^{1/2}
1447: \log M_n}{n},
1448: \end{align*}
1449: where $\lambda_n = (n^{-2 / (2 + \beta_n)}, \mathcal F_{\beta_n})$,
1450: with $\beta_n \in B_n$ chosen such that $\mathcal F_{\beta_0} \subset
1451: \mathcal F_{\beta_n}$ and $n^{-2 / (2 + \beta_n)} \leq C_1 n^{-2 / (2
1452: + \beta_0)}$. Then, integrating w.r.t.\ $E^{m}$ and using
1453: Theorem~\ref{thm:least_sq}, we have, if $M_n$ is no more than a power
1454: of $n$:
1455: \begin{align*}
1456: E^n \norm{\hat {\mathsf f}_n - f_0}^2 &\leq C E^m \norm{\bar
1457: f_{\lambda_n} - f_0}^2 + o(n^{-2 / (2 + \beta_0)}) \\
1458: & \leq C_2 n^{-2 / (2 + \beta_n)} + o(n^{-2 / (2 + \beta_0)}) \leq
1459: C_3 n^{-2 / (2 + \beta_0)}.
1460: \end{align*}
1461: This proves that, if $f_0 \in \mathcal F_{\beta_0}$ for some $\beta_0
1462: \in B$, we have $E^n \norm{\hat {\mathsf f}_n - f_0}^2 \leq C_3 n^{-2
1463: / (2 + \beta_0)}$, thus $\hat {\mathsf f}_n$ is indeed adaptive over
1464: $(\mathcal F_\beta : \beta \in B)$.
1465:
1466:
1467: \subsection{Sobolev spaces, spline estimators}
1468: \label{sec:sobolev_spaces}
1469:
1470: When $\mathcal F$ is a Sobolev space, the
1471: PERM~\eqref{eq:pena_least_sq} with $\alpha = 2$ is a very popular
1472: smoothing technique: see, among others, \cite{wahba90} and
1473: \cite{green_silverman94}. The simplest example is when $d=1$ and
1474: \begin{equation*}
1475: \mathcal F = W_2^s([0, 1]) := \Big\{ f \in L^2([0, 1]) :
1476: |f|_{W_2^s}^2 := \int_0^1 f^{(s)}(t)^2 dt < \infty \Big\},
1477: \end{equation*}
1478: where $s$ is some natural integer and $f^{(s)}$ stands for the $s$-th
1479: derivative of $f$. In this case, the PERM is called a \emph{smoothing
1480: spline}, since in this situation the unique minimizer
1481: of~\eqref{eq:pena_least_sq} is a spline, see for
1482: instance~\cite{wahba90} or~\cite{kohler02}. When $s = 2$ (cubic
1483: splines), the routine \texttt{smooth.spline} from the \texttt{R}
1484: software (and other software as well) neatly computes the
1485: solution to~\eqref{eq:pena_least_sq} using the B-spline basis, and
1486: chooses the parameter $h$ via generalized cross-validation (GCV). % Our
1487: % aggregation approach is an alternative to the selection of $h$ via
1488: % GCV, which is more stable when $n$ is small, see
1489: % Section~\ref{sec:simulations}.
1490:
1491: The $d$-dimensional case is easily understood with the definition of
1492: $W_2^s([0, 1]^d)$ as the space of functions $f \in L^2([0, 1]^d)$ with
1493: all derivatives of total order $s$ in $L^2([0,1]^d)$. Namely,
1494: \begin{equation*}
1495: W_2^s([0, 1]^d) := \Big\{ f \in L^2([0, 1]^d) :
1496: |f|_{W_2^s([0, 1]^d)}^2 < \infty \Big\},
1497: \end{equation*}
1498: where
1499: \begin{equation}
1500: \label{eq:usual_roughness}
1501: |f|_{W_2^s([0, 1]^d)}^2 := \sum_{\mathbf k \in \mathbb N_0^d :
1502: |\mathbf k| = s} \frac{s
1503: !}{\mathbf k !} \int_{[0,1]^d} ( D_{\mathbf k} f(x) )^2 dx,
1504: \end{equation}
1505: where for $\mathbf k = (k_1, \ldots, k_d)$ we use the notations
1506: $\mathbf k ! := \prod_{i=1}^d k_i !$ and $|\mathbf k| := \sum_{i=1}^d
1507: k_i$ and where $D_{\mathbf k}$ is the differential operator
1508: $\partial^s / (\partial x_1^{k_1} \cdots \partial x_d^{k_d})$. When $d > 1$,
1509: the PERM for the choice $\mathcal F = W_2^s([0, 1]^d)$ is called a
1510: \emph{thin plate spline}, see again for instance~\cite{wahba90}
1511: or~\cite{kohler02}, where the practical computation of such PERM is
1512: explained in detail. The usual assumption $s > d / 2$ gives the
1513: embedding $W_2^s([0, 1]^d) \subset C([0, 1]^d)$ and ensures that
1514: Assumption~$(C_\beta)$ holds, see~\cite{birman_solomjak67}. The
1515: situation where $s$ is not an integer is a particular case of what we
1516: do in Section~\ref{sec:anisotropic_besov} below. The case where
1517: $\mathcal F$ is a Sobolev space is actually a particular case of both
1518: the next sections. Indeed, it is well known (see~\cite{wahba90} for
1519: instance) that a Sobolev space is a Reproducing Kernel Hilbert Space
1520: (RKHS) for an appropriate kernel choice, and that it is also a Besov
1521: space $B_{2, 2}^s$.
1522:
1523: % \texttt{verifier le lien besov et sobolev multidim... dire que
1524: % sobolev est un cas particuler du rkhs, et que c'est le bon point
1525: % de vue pour le calcul des thin plates, citer le mec qui fait ca a
1526: % la fin dans le bouquin.... }
1527:
1528: % Using the B-Spline basis (see~\cite{devore_lorentz93} for a precise
1529: % definition), the minimization~\eqref{eq:pena_least_sq} can be
1530: % written as a ridge regression problem, with a solution that can be
1531: % computed directly via the resolution of the corresponding linear
1532: % system.
1533:
1534:
1535: \subsection{Reproducing Kernel Hilbert Spaces}
1536: \label{sec:RKHS}
1537:
1538:
1539: Reproducing Kernel Hilbert Spaces (cf.~\cite{aronszajn50}), RKHS for
1540: short, provide a unified context for regularization in a wide variety
1541: of statistical models. Computational properties of estimators obtained
1542: by minimization of a functional over an RKHS make these function spaces
1543: very useful for statisticians. In this short section, we briefly
1544: recall some definitions and computational properties of RKHS.
1545:
1546: Let $\cX$ be an abstract space (in this paper, we take
1547: $\cX=[0,1]^d$). We say that $K:\cX\times\cX\to\mathbb{R}$ is a
1548: {\it reproducing kernel}, RK for short, if for any integer $p$ and any
1549: points $x_1,\ldots,x_p$ in $\cX$, the matrix $(K(x_i,x_j))_{1\leq
1550: i,j\leq p}$ is symmetric positive definite. Let $K$ be a RK. The
1551: Hilbert space associated with $K$, called {\it Reproducing Kernel
1552: Hilbert Space} and denoted by $\cH_K$, is the completion of the
1553: space of all the finite linear combinations $\sum_j a_j K(x_j,\cdot)$
1554: endowed with the inner product $\prodsca{\sum_j a_j
1555: K(x_j,\cdot)}{\sum_k b_k K(y_k,\cdot)}_{K}=\sum_{j,k}a_j b_k
1556: K(x_j,y_k)$. We denote by $|\cdot|_K$ the associated norm on $\cH_K$.
1557:
1558: The representer theorem (see~\cite{kimeldorf_wahba71} for results on
1559: optimization in RKHS) is at the heart of the minimization of functionals
1560: over RKHS. The solution of the minimization problem
1561: \begin{equation}
1562: \label{eq:RKHS_estimator}
1563: \bar{f} \in \argmin_{f \in \cH_K} \{ R_n(f) + h^2|f|_{\cH_K}^2 \}
1564: \end{equation}
1565: is the linear combination
1566: \begin{equation*}
1567: \bar{f} (\cdot) = \sum_{i=1}^n \alpha_i K(X_i,\cdot),\mbox{ where }
1568: \boldsymbol {\alpha} = (\alpha_i)_{1 \leq i \leq n} = (\mathbf K_X +
1569: n h^2 \mathbf I_n)^{-1} \mathbf Y,
1570: \end{equation*}
1571: where $\mathbf K_X$ is the Gram matrix $(K(X_i,X_j))_{1\leq i,j\leq
1572: n}$, where $\mathbf Y = (Y_1, \ldots, Y_n)$ and where $\mathbf I_n$
1573: is the identity matrix in $\mathbb R^n$. There are many different ways
1574: to simplify the computation of the coefficients $\boldsymbol{\alpha}$,
1575: see for instance~\cite{amato_antoniadis_pensky06}.
1576:
1577: In order to derive convergence rates for the estimator defined
1578: in~\eqref{eq:RKHS_estimator} from Theorem~\ref{thm:least_sq}, we use
1579: some results about covering numbers of RKHS obtained
1580: in~\cite{cucker_smale02} (other results on the entropy of RKHS can be
1581: found in~\cite{SS:07,CS:98}). Let us now assume that $P_X$ is a Borel
1582: measure. If $K$ is a {\it Mercer kernel} (this is a continuous
1583: reproducing kernel), the RKHS associated with $K$ is the set
1584: \begin{equation*}
1585: \label{eq:Mercer_kernel}
1586: \cH_K=\Big\{f\in L^2(P_X): f=\sum_{j=1}^\infty a_j \psi_j \mbox{
1587: s.t. } \sum_{j=1}^\infty \lambda_j^{-1} a_j^2 < \infty\Big\},
1588: \end{equation*}
1589: where $(\lambda_j)_{j\geq1}$ is the sequence of decreasing eigenvalues
1590: of the operator
1591: \begin{equation*}
1592: L_K:\left\{\begin{array}{ccc}
1593: L^2(P_X) & \longrightarrow & L^2(P_X)\\
1594: f & \longmapsto & \int_\cX K(\cdot,y)f(y)dP_X(y)
1595: \end{array} \right.
1596: \end{equation*}
1597: and $(\psi_j)_{j\geq1}$ the sequence of corresponding
1598: eigenvectors. According to Proposition~9 and Theorem~D in
1599: \cite{cucker_smale02}, if for any $k\geq1$ the $k$-th eigenvalue of
1600: $L_K$ is such that
1601: \begin{equation}
1602: \label{eq:rkhs_eigenvalue}
1603: \lambda_k \leq C k^{-l}
1604: \end{equation}
1605: for some $C > 0$ and $l > 1/2$ then the entropy of $B_K(R) := \{f \in
1606: \cH_K : |f|_K \leq R\}$ satisfies for any $\delta > 0$:
1607: % the ball of radius $R$ of the RKHS $\cH_K$, denoted by
1608: \begin{equation*}
1609: H_\infty(\delta, B_K(R)) \leq \Big(\frac{2 R C_l}{\delta}
1610: \Big)^{1/l},
1611: \end{equation*}
1612: where $C_l$ is slightly greater than $6Cl^l$. In this case,
1613: Theorem~\ref{thm:least_sq} and the arguments from
1614: Section~\ref{sec:derive_adaptive} give the following result.
1615:
1616: \begin{corollary}[Adaptive upper bound for RKHS]
1617: \label{cor:rkhs}
1618: Let $\bar f$ be defined by~\eqref{eq:RKHS_estimator} with a
1619: reproducing kernel $K$ such that the eigenvalues of the operator
1620: $L_K$ satisfy~\eqref{eq:rkhs_eigenvalue}. Then, if $h = a n^{-l /
1621: (2l + 1)}$ and $\norm{\bar f - f_0}_\infty \leq Q$, we have
1622: \begin{equation*}
1623: E^n \norm{\bar f - f_0}_{L^2(P_X)}^2 \leq C_2 (1 +
1624: |f_0|^2_{\mathcal H_K}) n^{-2l / (2l + 1)}
1625: \end{equation*}
1626: when $n$ is large enough.
1627:
1628: Now, let $L = [l_{\min}, l_{\max}]$ where $l_{\min} > 1/2$ and
1629: $(\mathcal H_l : l \in L)$ be a family of nested RKHS. Assume that
1630: the kernel of each $\mathcal H_l$
1631: satisfies~\eqref{eq:rkhs_eigenvalue}. Let $\hat {\mathsf f}_n$ be
1632: the aggregated estimator defined by Steps~1--3 with $\Lambda_n = \{
1633: \lambda = (n^{-l / (2l + 1)}, \mathcal H_l) : l \in L_n \}$ and $L_n
1634: := \{ l_{\min}, l_{\min} + (\log n)^{-1}, \ldots, l_{\max} \}$. We
1635: have, if $f_0 \in \mathcal H_l$ for some $l \in L$,
1636: \begin{equation*}
1637: E^n \norm{\hat {\mathsf f}_n - f_0}_{L^2(P_X)}^2 \leq C_2 (1 +
1638: |f_0|^2_{\mathcal H_l}) n^{-2l / (2l + 1)}
1639: \end{equation*}
1640: when $n$ is large enough.
1641: \end{corollary}
1642:
1643:
1644:
1645: \subsection{Anisotropic Besov spaces}
1646: \label{sec:anisotropic_besov}
1647:
1648:
1649: In nonparametric estimation literature, Besov spaces are of particular
1650: interest since they include functions with \emph{inhomogeneous
1651: smoothness}, for instance functions with rapid oscillations or
1652: bumps. Roughly, these spaces are used in statistics when we want to
1653: prove theoretically that some adaptive estimator is able to recover
1654: the details of a function. When one considers a multivariate
1655: regression, the question of anisotropic smoothness naturally arises.
1656: Anisotropy means that the smoothness of $f_0$ differs from one
1657: coordinate to another. As far as we know, adaptive estimation of a multivariate
1658: curve with anisotropic smoothness was previously considered only in
1659: Gaussian white noise or density models, see~\cite{hoffmann_lepski02},
1660: \cite{kerk_lepski_picard01}, \cite{kerk_lepski_picard07},
1661: \cite{neumann00}. There are no results concerning the adaptive
1662: estimation of the regression with anisotropic smoothness on a general
1663: random design.
1664:
1665: In this section, we construct, using Steps~1--3, an adaptive estimator
1666: over anisotropic Besov spaces $B_{p, q}^{\bs s}$, where $\bs s = (s_1,
1667: \ldots, s_d)$ is the vector of smoothnesses. If $\{ e_1, \ldots, e_d
1668: \}$ is the canonical basis of $\mathbb R^d$, each $s_i$ is the
1669: smoothness in the direction $e_i$. A precise definition of $B_{p,
1670: q}^{\bs s}$ is given in
1671: Appendix~\ref{sec:appendix_approximation}. Let $\bar {\bs s}$ be the harmonic
1672: mean of $\bs s$, see~\eqref{eq:harmonic_mean}. Let us introduce two
1673: vectors $\bs s^{\min}$ and $\bs s^{\max}$ in $\mathbb R_+^d$ with
1674: positive coordinates and harmonic means $\bar {\bs s}^{\min}$ and
1675: $\bar {\bs s}^{\max}$ respectively. Assume that $\bs s^{\min} \leq
1676: {\bs s}^{\max}$, which means that $s_i^{\min} \leq s_i^{\max}$ for any
1677: $i \in \{ 1, \ldots, d \}$ and assume that $\bar {\bs s}^{\min} > d /
1678: \min(p, 2)$. In view of Theorem~\ref{thm:anisotropic_entropy} and the
1679: embedding~\eqref{eq:anisotropic_embedding} (see
1680: Appendix~\ref{sec:appendix_approximation}), we know that Assumption
1681: $(C_\beta)$ holds for every $B_{p, \infty}^{\bs s}$ such that $\bs s
1682: \geq \bs s^{\min}$ with $\beta = d / \bar {\bs s}$ (and every $B_{p,
1683: q}^{\bs s}$, since $B_{p, q}^{\bs s} \subset B_{p, \infty}^{\bs
1684: s}$), where $\bar {\bs s}$ is the harmonic mean of $\bs s$. Consider
1685: the ``cube of smoothness''
1686: \begin{equation}
1687: \label{eq:smoothness_cube}
1688: \bs S := \prod_{i=1}^d [s_i^{\min}, s_i^{\max}],
1689: \end{equation}
1690: and consider the uniform discretization of this cube with step $(\log
1691: n)^{-1}$:
1692: \begin{equation}
1693: \label{eq:discr_smoothness_cube}
1694: \bs S_n := \prod_{i=1}^d \big\{ s_i^{\min}
1695: + k (\log n)^{-1} :1\leq k \leq [ (s_i^{\max} - s_i^{\min}) \log n ]
1696: \big\},
1697: \end{equation}
1698: and the set of parameters
1699: \begin{equation*}
1700: \Lambda(\bs S) := \{ \lambda = (n^{- \bar {\bs s} / (2 \bar {\bs s}
1701: + d)}, B_{p, q}^{\bs s}) : \bs s \in \bs S_n \}.
1702: \end{equation*}
1703: Now, we compute, following Steps~1--3, the aggregated estimator $\hat
1704: {\mathsf f}_n^{\bs S}$ with set of parameters $\Lambda(\bs S)$ (see
1705: the beginning of the section). Following the arguments from
1706: Section~\ref{sec:derive_adaptive}, we can prove in the following
1707: Corollary~\ref{cor:anisotropic_besov_rate} that $\hat {\mathsf
1708: f}_n^{\bs S}$ is adaptive over the whole range of anisotropic Besov
1709: spaces $\{ B_{p, q}^{\bs s} : \bs s \in \bs S \}$.
1710:
1711: % the want to construct an estimator which is adaptive
1712: % over the whole range of anisotropic Besov spaces $\{ B_{p, q}^{\bs s}
1713: % : \bs s \in \bs S \}$. This is done in two steps:
1714: % \begin{enumerate}
1715: % \item First, using the training sample, compute the family of PERM
1716: % (see Definition~\ref{def:perm})
1717: % \begin{equation*}
1718: % F(\bs S) := \{ \bar f_\lambda : \lambda \in \Lambda(\bs S) \}
1719: % \end{equation*}
1720: % where
1721:
1722: % where $s$ is the harmonic mean of $\bs s$. In
1723: % Definition~\ref{def:perm}, we can take $\alpha = p$, see Remark ???
1724: % above (\texttt{remarque sur les sequence spaces...}.
1725: % \item Then, consider $F(\bs S)$ as a family of weak estimators, and
1726: % apply the aggregation algorithm on it. Namely, we compute the
1727: % aggregate
1728: % \begin{equation*}
1729: % \hat {\mathsf f}_n^{\bs S} := \sum_{\lambda \in \Lambda(\bs S)}
1730: % \theta(\bar f_\lambda ) \bar f_\lambda,
1731: % \end{equation*}
1732: % where the weights $\theta(\bar f)$ are given by~\eqref{eq:weights}.
1733: % \end{enumerate}
1734:
1735:
1736: % The adaptive upper bound stated in
1737: % Corollary~\ref{cor:anisotropic_besov_rate} follows from the arguments
1738: % from Section~\ref{sec:derive_adaptive}.
1739:
1740:
1741: % An immediate consequence of Theorem~\ref{thm:least_sq} is the
1742: % following convergence rate of the PERM in the anisotropic Besov space
1743: % $B_{p, \infty}^{\bs s}$ (see Section~\ref{sec:appendix_approximation}
1744: % for a definition) where we recall that
1745:
1746:
1747: \begin{corollary}
1748: \label{cor:anisotropic_besov_rate}
1749: Assume that $\norm{\bar f - f_0}_\infty \leq Q$ for every $\bar f
1750: \in F(\bs S)$. If $f_0 \in B_{p, q}^{\bs s}$ for some $\bs s \in \bs S$,
1751: then
1752: \begin{equation*}
1753: E^n \norm{\hat {\mathsf f}_n^{\bs S} - f_0}_{L^2(P_X)}^2 \leq C
1754: n^{-2 \bar {\bs s} / (2 \bar {\bs s} + d)}
1755: \end{equation*}
1756: when $n$ is large enough, where $C$ is a constant depending on $\bs
1757: S, d$ and $Q$.
1758: % Let $\bar f_\lambda$ be the same as in Theorem~\ref{thm:least_sq}
1759: % with $\mathcal F = B_{p, \infty}^{\bs s}$ and $h = a n^{-s / (2s +
1760: % d)}$ where $s$ is the harmonic mean of $\bs s$. Assume that $s >
1761: % d / p$ and that $\norm{\bar f_\lambda - f_0}_\infty \leq Q$ and
1762: % $\norm{\alpha_0}_\infty \leq Q$ for some constant $Q > 0$. Then,
1763: % uniformly over the ball $B_{p,\infty}^{\bs s}(R) = \{ f :
1764: % |f|_{B_{p,\infty}^{\bs s}} \leq R \}$, we have\textup:
1765: % \begin{equation*}
1766: % \sup_{f_0 \in B_{p, \infty}^{\bs s}(R)} E \norm{\bar f_\lambda -
1767: % f_0}^2 \leq C_3 (1 + R^2) n^{-2s / (2s + d)}
1768: % \end{equation*}
1769: % when $n$ is large enough.
1770: \end{corollary}
1771:
1772: % Note that the same result holds for any $B_{p, q}^{\bs s}$ with $q >
1773: % 0$ because of the embedding $B_{p, q}^{\bs s} \subset B_{p,
1774: % \infty}^{\bs s}$.
1775: In Corollary~\ref{cor:anisotropic_besov_rate} we recover the
1776: ``expected'' minimax rate $n^{-2 \bar {\bs s} / (2 \bar {\bs s} + d)}$
1777: of estimation of a $d$-dimensional curve in a Besov space. Note that
1778: there is no regular or sparse zone here, since the error of estimation
1779: is measured with the $L^2(P_X)$ norm. A minimax lower bound over $B_{p,
1780: q}^{\bs s}$ can be easily obtained using standard arguments, such as
1781: the ones from~\cite{tsybakov03}, together with Bernstein estimates
1782: over $B_{p, q}^{\bs s}$ that can be found in~\cite{hochmuth02}. Note
1783: that the only assumption required on the design law in this corollary
1784: is the compactness of its support.
1785:
1786:
1787: % This theorem proves that $\hat {\mathsf f}_n^{\bs S}$ is adaptive over
1788: % the whole range of anisotropic Besov spaces $\{ B_{p, q}^{\bs s} : \bs
1789: % s \in \bs S \}$. Its proof, which can be found in
1790: % Section~\ref{sec:proof_main_results}, is an easy consequence of
1791: % Theorems~\ref{thm:least_sq} and~\ref{thm:oracle}, together with the
1792: % embedding and entropy properties of these spaces, which are given in
1793: % Appendix~\ref{sec:appendix_approximation}.
1794:
1795: % \texttt{rajouter en remarque estimation pour differents p, et le fait
1796: % qu'on peut tronquer par Q, on peut aussi mettre $Q_1$ et $Q_2$, pas
1797: % forcement le meme Q...}
1798:
1799:
1800:
1801:
1802:
1803:
1804:
1805:
1806:
1807:
1808:
1809: % % \subsection{About the practical computation of PERM estimators}
1810:
1811: % Although the practical computation of the PERM in the RKHS case is
1812: % very easy, see Section~\ref{sec:RKHS} (it is one of the reason that
1813: % makes it so popular), the computation of the other PERM proposed in
1814: % this section is less clear. For the Sobolev PERM (smoothing spline
1815: % type estimators) in the isotropic case, this is well
1816: % understood. Indeed, the computation of the thin-plate spline is a
1817: % particular case of RKHS, so its computation ..... see ???? Wahba ou
1818: % Gyorfi kohler ??? \texttt{rajouter des trucs ici}
1819:
1820: % In the Besov case, in particular the anisotropic case, the compu
1821:
1822: % \begin{equation*}
1823: % \mathbf f_n = \mathbf A \boldsymbol \theta
1824: % \end{equation*}
1825:
1826: % \begin{equation*}
1827: % |f|_{B_{p,q}^s}^q := \sum_{j \geq 0} \Big( 2^{j(s + d/2 - d/p)} \Big(
1828: % \sum_{k \in K_j } \sum_{e \in E} |\beta_{e, j, \mathbf k}|^p
1829: % \Big)^{1/p} \Big)^q
1830: % \end{equation*}
1831: % where $E := \{ 0, 1 \}^d - \{ (0, \ldots, 0) \}$ and
1832:
1833: % \begin{equation*}
1834: % |f|_{B_{2, 2}^{s}}^2 := \sum_{j \geq 0} 2^{2js} \sum_{\mathbf k
1835: % \in K_j} \sum_{e \in E} |\beta_{e,j,k}|^2
1836: % \end{equation*}
1837:
1838:
1839: % \begin{example}[Lasso and Elastic estimators]
1840: % When the complexity parameter $s$ of the class $\cF$ of functions
1841: % within the regression function belongs to is such that $s>d/2$,
1842: % Theorem~\ref{thm:least_sq} provides convergence rate for the
1843: % penalized least square estimator with the semi-norm of $f$ for
1844: % penalty term (not only for the square of the semi-norm of $f$). We
1845: % are going to apply this result to obtain convergence rates for the
1846: % Lasso and Elastic estimators.
1847:
1848: % Take $M\geq2$ and $f_1,\ldots,f_M$ some functions from $[0,1]^d$ to $\mathbb{R}$. Consider the span $\cF$, in $L^2([0,1]^d)$, of these functions. That is
1849: % \begin{equation*}
1850: % \cF={\rm Span}(f_1,\ldots,f_M).
1851: % \end{equation*}For identifiability reason, we will assume the following algebra assumption:
1852: % \begin{assumption}
1853: % The dimension of the linear subspace $\cF\subset L^2([0,1]^d)$ is $M$.
1854: % \end{assumption}
1855: % Any element $f\in\cF$ is then associated with a unique vector $\theta\in \mathbb{R}^M$ such that $f=f_\theta :=\sum_{j=1}^M\theta_j f_j$. We are going to endowed the space $\cF$ with the norm
1856: % \begin{equation}\label{eq:Elastic_Penality}
1857: % |f_\theta|_\cF=\omega\|\theta\|_1+(1-\omega)\|\theta\|_2,
1858: % \end{equation}where $\omega\in[0,1]$ and $\|\theta\|_p=\big(\sum_{j=1}^M|\theta_j|^p \big)^{1/p}, \forall p\geq1$.
1859: % The penalized least squares estimator with the penalty term given by~\eqref{eq:Elastic_Penality} is called the {\it elastic estimator }. When $\omega=1$, the elastic estimator is the {\it Lasso estimator}.
1860:
1861: % Within this framework, the set $\{f_1,\ldots,f_M\}$ is usually called the {\it dictionary}. When $M=d$, $f_j(x)=x_j$ (for any $x=(x_1,\ldots,x_d)\in [0,1]^d$ and $j\in\{1,\ldots,d\}$) and $f_0$ is assumed to belonging to $\cF$, model \eqref{eq:model} is the classical gaussian linear regression model
1862: % \begin{equation}
1863: % \label{eq:Model_Linear_Gaussian}
1864: % \mathbf{Y}=\mathbf{X}\theta_0+\sigma(X)\boldsymbol{\varepsilon},
1865: % \end{equation}where $\mathbf{Y}=(Y_1,\ldots,Y_n)^t$, $\mathbf{X}$ is the matrix $n\times d$ with lines $X_i^t,i=1,\ldots,n$, $\theta_0\in\mathbb{R}^d$ is such that $f_0=f_{\theta_0}$ and $\boldsymbol{\varepsilon}$ is the vector of noise $(\varepsilon_1,\ldots,\varepsilon_M)^t$. Lasso and Elastic estimators are usually studied in this framework.
1866:
1867: % We are going to study elastic estimators for a general dictionary. We are not going to deal with the problem of {\it Sign consistency} of the Lasso estimator but only with the convergence rate of this estimator and of the more general elastic estimator. For that, we assume the classical geometric assumption on the dictionary:
1868: % \begin{assumption}\label{As:Isometry_Gram_Matrix}
1869: % Let $\Gamma=(\prodsca{f_i}{f_j})_{1\leq i,j\leq M}$ be the Gram matrix of the dictionary $\{f_1,\ldots,f_M\}$ for the inner product $\prodsca{f}{g}=\int_{[0,1]^d}fgdP_X$. We assume that, there exists an absolute constant $c>0$ such that for any vector $\theta\in \mathbb{R}^d$, we have \begin{equation*}\theta^t \Gamma \theta\geq c \|\theta\|_2^2.\end{equation*}
1870: % \end{assumption}
1871:
1872: % We don't need to split the sample thus we take $m=n$ observations to construct the estimators. We take $\bar{\theta}\in\mathbb{R}^M$ such that
1873: % \begin{equation}
1874: % f_{\bar{\theta}}\in \argmin_{f_\theta\in \cF} \big[\frac{1}{n}\sum_{i=1}^n(Y_i-f_\theta(X_i))^2+h^2 |f_\theta|_\cF\big]
1875: % \end{equation}where the norm $|\cdot|_\cF$ is defined in equation~\eqref{eq:Elastic_Penality}. Assumption~\ref{As:Isometry_Gram_Matrix} yields $c\|\bar{\theta}-\theta_0\|_2^2\leq \|f_{\bar{\theta}}-f_0\|_2^2$. To obtain rates of convergence using Theorem~\ref{thm:least_sq}, we have to control the entropy of $L_\infty$-balls of the model $\cF$. It is easy to see that \begin{equation*}
1876: % H(\delta,\cF(R),\|\cdot\|_\infty)\leq M \log\big(\frac{2MR}{\delta} \big), \mbox{ where } M=\max_{1\leq j \leq M}\|f_j\|_\infty.
1877: % \end{equation*} \texttt{Il faut regarder pour quels $R/\delta$ le plus petit on applique cette inegalité}. If we have $M$ such that $M \log\big(\frac{2MR}{\delta} \big)\leq D (R/\delta)^{d/s}$ then, applying Proposition~\ref{prop:least_sq} \texttt{si on pouvait se passer de tronquer les estimateurs dans la Proposition 1 ce serait bien ici. Voir Einmahl et Masson?}, the elastic estimator $\bar{\theta}$ with $h\geq a n^{s/(2s+d)}$ satisfies
1878: % \begin{equation*}
1879: % \mathbb{E}\|\bar{\theta}-\theta_0\|_2^2\leq C(\theta_0)h^2,
1880: % \end{equation*}where $C(\theta_0)\leq
1881: % C_1/c+2(\omega\|\theta\|_1+(1-\omega)\|\theta\|_2)/c$.
1882: % \end{example}
1883:
1884: % Usually, the ``roughness'' of a function $f \in W_s$ is measured by ,
1885: % consisting of a subsample of size $m < n$ of the whole sample $D_n$
1886: % (for more details about splitting the sample, see below.)
1887:
1888:
1889:
1890:
1891:
1892: % The next corollary is an approximation type result. \texttt{resultat
1893: % d'approximation ici}. Let $\tilde f_{(s, h)}$ be given by
1894: % \begin{equation*}
1895: % \tilde f_{(s, h)} := \argmin_{\tilde f \in W_s} \big\{ \norm{f -
1896: % \tilde f}_n^2 + \pen(\tilde f) \big\},
1897: % \end{equation*}
1898: % where $\pen(f)$ is given by~\eqref{eq:pen}. A consequence of
1899: % Theorem~\ref{thm:least_sq} is as follows.
1900:
1901: % \begin{corollary}
1902: % \label{cor:spline_approx}
1903: % Under the same assumptions as in Theorem~\ref{thm:least_sq}, we have
1904: % \begin{equation*}
1905: % E_T \norm{\tilde f - f}_{L^2(P_T)}^2 \leq C h^2,
1906: % \end{equation*}
1907: % where $E_T$ is the joint law of of $(T_1, \ldots, T_n)$.
1908: % \end{corollary}
1909:
1910:
1911:
1912: % Then, we use again
1913: % Lemma~\ref{thm:devia1}: we consider this time the event $\mathcal
1914: % B_{f_0}( z_1, \gamma_m)$, where $z_1 > 0$ is a fixed constant given by
1915: % Lemma~\ref{thm:devia1}. We have this time
1916: % \begin{equation}
1917: % \label{eq:deviaB2}
1918: % P\big[ \mathcal B_{f_0}( z_1, \gamma_m)^\complement \big] \leq \exp(
1919: % -D_3 (\log m)^{-d(1 + d / s) / s} h^{-d/s} ),
1920: % \end{equation}
1921: % where $D_3 := D_1 z_1^2 (4\alpha)^{-d / (2s)}$ and in view
1922: % of~\eqref{eq:thm1trick}, we have
1923: % \begin{equation*}
1924: % \norm{\bar f - f_0}_m^2 + \pen(\bar f) \leq 16 z_1^2 \alpha h^2
1925: % \end{equation*}
1926: % on $\mathcal B_{f_0}( z_1, \gamma_m)$. Then, if
1927: % \begin{equation*}
1928: % \mathcal B := \mathcal B_{f_0}( (\log m)^{1 + d / (2s)}, 2 \sigma_1
1929: % t_m ) \cap \mathcal B_{f_0}(z_1, \gamma_m),
1930: % \end{equation*}
1931: % we have $P[ \mathcal B^\complement ] = o(h^2)$. Putting all this
1932: % together, we obtain:
1933: % \begin{align*}
1934: % E \norm{\bar f - f_0}^2 &\leq \frac{(Q+Q_n)^2}{m} + 10 z_0 h^2 +
1935: % o(h^2) \\
1936: % &+ E[ A_2 \ind{\norm{\varepsilon}_m \leq t_m} (\ind{e_m \leq b_m} +
1937: % \ind{b_m \leq e_m} (\ind{\mathcal B} + \ind{\mathcal
1938: % B^\complement} ) ) ] \\
1939: % &\leq (10 z_0 + 16 z_1^2 \alpha + 1 + \norm{f_0}_\infty^2 +
1940: % \tilde J(f_0)^2 ) h^2 + o(h^2).
1941: % % &\leq \frac{(Q+Q_n)^2}{m} + 10 z_0 h^2 + o(h^2) + \pend(f_0) + 16
1942: % % z_1^2 \alpha h^2 + \\
1943: % % &+
1944: % % &= E[ A_1 ] + \\
1945: % % &\leq ( 10 z_0 + 16 z_1^2 \alpha + 16 J_s(f)^2 ) h^2 + \Delta_n \\
1946: % % &= \big( C + 16 (\norm{f}_\infty^2 + \smallint f^{(s)}(t)^2 dt)
1947: % % \big) h^2 + \Delta_n
1948: % \end{align*}
1949: % % where $C := 10 z_0 + 16 (z_1^2 \alpha + 1)$ and where $\Delta_n :=
1950: % % P[ A_1 \geq 10 z_0 h^2 ] + P[\mathcal B^{\complement}]$ is such that
1951: % % $n^\beta \Delta_n$ goes to $0$ for any $\beta > 0$, in view
1952: % % of~\eqref{eq:deviaA1}, \eqref{eq:deviaB1} and~\eqref{eq:deviaB2}.
1953: % This concludes the proof of Theorem~\ref{thm:least_sq}.
1954: % \end{proof}
1955:
1956:
1957: % \subsection*{Proof of Corollary~\ref{cor:spline_approx}}
1958:
1959:
1960: % As in the proof of Theorem~\ref{thm:least_sq}, we have
1961: % \begin{equation*}
1962: % E \norm{\bar f - f}^2 \leq 10 z_0 h^2 + 2 Q^2 P[A_1 \geq 10 z_0 h^2] +
1963: % E[ A_2 ],
1964: % \end{equation*}
1965: % where in view of Lemma~\ref{lem:devia2} we have $ P[ A_1 \geq 10 z_0
1966: % h^2] \leq \exp( -n h^2)$, and by the definition of $\tilde f_{(s,
1967: % h)}$, we have
1968: % \begin{equation*}
1969: % \norm{f - \tilde f}_n^2 + \pen(\tilde f) \leq \norm{f - f^*}_n^2 +
1970: % \pen(f^*) \quad \forall f^* \in W_s,
1971: % \end{equation*}
1972: % which gives $\norm{f - \tilde f}_n^2 + \pen(\tilde f) \leq \pen(f)$ if
1973: % $f^* = f \in W_s$. This concludes the proof of the corollary. \hfill
1974: % $\square$
1975:
1976:
1977: % But since the Cauchy-Schwarz inequality gives
1978: % \begin{equation*}
1979: % 0 \leq 2 \norm{Y - f}_n \norm{\bar f - f}_n + \pen(f) - \pen(\bar f)
1980: % \leq 2 \sigma \norm{\bar f - f}_n + \pen(f) - \pen(\bar f),
1981: % \end{equation*}
1982: % we have that necessarily,
1983: % \begin{equation}
1984: % \label{eq:trick_pen}
1985: % \pen(\bar f) \leq 2 \sigma \norm{\bar f - f}_n + \pen(f).
1986: % \end{equation}
1987: % This gives
1988: % \begin{equation*}
1989: % B \leq 4 ( )
1990: % \end{equation*}
1991:
1992: % We can rewrite it in the following way:
1993: % \begin{align*}
1994: % \sqrt{n} \norm{\bar f - f}_n^{2k + 1 / (2k)} &\leq \frac{2
1995: % \sqrt{n} \prodsca{Y - f}{\bar f - f}_n }{ \norm{\bar f - f}_n^{1
1996: % - 1/(2k)} } + \frac{\sqrt{n} (\pen(f) - \pen(\bar f)) }{
1997: % \norm{\bar f -
1998: % f}_n^{1 - 1/(2k)} } \\
1999: % &=: e_n + b_n.
2000: % \end{align*}
2001:
2002:
2003: %\section{Adaptation}
2004: %\label{sec:adaptation}
2005:
2006:
2007:
2008:
2009: % \subsection{Adaptative estimation over anisotropic Besov spaces}
2010:
2011: \section{Empirical study}
2012: \label{sec:simulations}
2013:
2014: In this Section, we compare empirically our aggregation procedure with
2015: the popular cross-validation (CV) and generalized cross-validation
2016: (GCV) procedures for the selection of the smoothing parameter $h$ (see
2017: Section~\ref{sec:about_h}) in smoothing splines (we use the
2018: \texttt{smooth.spline} routine from the \texttt{R} software, see
2019: \texttt{http://www.r-project.org/}). Concerning CV, GCV and smoothing
2020: splines, we refer to~\cite{wahba90}
2021: and~\cite{green_silverman94}. Those routines provide satisfactory
2022: results in most cases, in particular for the examples of regression
2023: functions considered here. However, we show that when the sample size
$n$ is small (less than 50), and when the noise level is high (we take
a root-signal-to-noise ratio equal to $2$), then our aggregation
approach is more stable, see Figure~\ref{fig:mises} below. Here, we
consider two examples of regression functions, given, for $x \in [-1,
2028: 1]$, by:
2029: \begin{itemize}
2030: \item \texttt{hardsine}$(x) = 2 \sin(1 + x) \sin( 2 \pi x^2 + 1)$
2031: \item \texttt{oscsine}$(x) = (x+1) \sin(4 \pi x^2 )$.
2032: \end{itemize}
We simply take $X$ uniformly distributed on $[-1, 1]$ and Gaussian
noise with variance $\sigma^2$ chosen so that the root-signal-to-noise
ratio is $2$. In Figure~\ref{fig:examples} we show typical simulations
in this setting, where $n = 30$.
2037: \begin{figure}[htbp]
2038: \centering
2039: \includegraphics[width=6cm]{data1.pdf}%
2040: \includegraphics[width=6cm]{data2.pdf}%
2041: % \includegraphics[width=4.3cm]{n30r2agg.pdf}%
2042: \caption{Examples of simulated data, for
  $f_0$\texttt{=\textup{hardsine}} \textup(left\textup) and
2044: $f_0$\texttt{=\textup{oscsine}} \textup(right\textup)}
2045: \label{fig:examples}
2046: \end{figure}
2047:
In Figure~\ref{fig:mises}, we show the MISEs $E\norm{\hat f_n -
2049: f_0}_n^2$ computed by Monte Carlo using $1000$ simulations of the
2050: model. The tuning of the estimators in both examples is the following:
2051: for GCV, we simply use the \texttt{smooth.spline} routine with default
2052: selection of $h$ by GCV. For CV, we use the same routine, with the
2053: option \texttt{cv=TRUE} so that CV is used instead. For aggregation,
2054: we use Steps~1-3 (see Section~\ref{sec:examples}). Step~1 is done with
2055: $m=3n/4$ and $\ell = n/4$. For Step~2, we use the
2056: \texttt{smooth.spline} routine to compute a set of weak estimators,
using the option \texttt{spar=x}, where \texttt{x} lies in the set $\{
0, 0.01, 0.02, \ldots, 1 \}$. The parameter \texttt{spar} is related to
2059: the value of the smoothing parameter $h$. For Step~3, we compute the
2060: weights with temperature given by~\eqref{Tslection} (over the training
2061: sample) and the set $\mathcal T = \{ 10, 20, \ldots, 100 \}$. Then, we
2062: repeat steps~1-3 $J=100$ times and compute the jackknifed estimator,
2063: see Section~\ref{sec:jackknife}. This gives our aggregated estimator.
2064:
In Figure~\ref{fig:mises}, we plot the MISEs (the mean of the $1000$
MISEs obtained for each simulation) for sample sizes $n \in \{ 20, 30,
50, 100 \}$ and in Figure~\ref{fig:sd} we plot the corresponding
standard deviations. The conclusion is that for small $n$, aggregation
provides a more accurate and stable estimation than GCV or
CV. When $n$ is $100$ or larger, then the aggregation procedure has
nearly the same accuracy as GCV or CV.
2072:
2073: \begin{figure}[htbp]
2074: \centering
2075: \includegraphics[width=6cm]{mises1.pdf}%
2076: \includegraphics[width=6cm]{mises2.pdf}%
  \caption{MISE for $f_0$\textup{=\texttt{hardsine}}
2078: \textup(left\textup) and $f_0$\textup{=\texttt{oscsine}}
2079: \textup(right\textup)}
2080: \label{fig:mises}
2081: \end{figure}
2082:
2083: \begin{figure}[htbp]
2084: \centering
2085: \includegraphics[width=6cm]{sd1.pdf}%
2086: \includegraphics[width=6cm]{sd2.pdf}%
  \caption{Standard deviation of the MISE for
    $f_0$\textup{=\texttt{hardsine}} \textup(left\textup) and
2089: $f_0$\textup{=\texttt{oscsine}} \textup(right\textup)}
2090: \label{fig:sd}
2091: \end{figure}
2092:
2093:
2094:
2095: % \begin{table*}[htbp]
2096: % \caption{Estimated MISE \textup(using 1000
2097: % replications\textup) and standard deviations \textup(between
2098: % brackets\textup) for $f = \texttt{\textup{hardsine}}$}
2099: % \begin{tabular}{lccc}
2100: % \hline
2101: % $n$ & GCV & CV & AGG \\ \hline
2102: % $20$ & 0.224 (0.132) & 0.233 (0.172) & \textbf{0.188}
2103: % (\textbf{0.089}) \\
2104: % $30$ & 0.177 (0.124) & 0.153 (0.103) & \textbf{0.146}
2105: % (\textbf{0.064}) \\
2106: % $50$ & $6.56 \times 10^{-2}$ ($9.86 \times 10^{-2}$) & $\mathbf{4.75
2107: % \times 10^{-2}}$ ($5.53 \times 10^{-2}$) & $4.84 \times 10^{-2}$
2108: % ($\mathbf{5.29 \times 10^{-2}}$) \\
2109: % $100$ & $2.82 \times 10^{-3}$ ($1.24 \times 10^{-2}$) &
2110: % $\mathbf{2.60 \times 10^{-3}}$ ($\mathbf{1.07 \times 10^{-2}}$) &
2111: % $2.89 \times 10^{-3}$ ($1.17 \times 10^{-2}$) \\
2112: % \hline
2113: % \hline
2114: % \end{tabular}
2115: % \label{tab:mises1}
2116: % \end{table*}
2117:
2118:
2119: % \begin{table*}[htbp]
2120: % \caption{Estimated MISE \textup(using 1000
2121: % replications\textup) and standard deviations \textup(between
2122: % brackets\textup) for $f = \texttt{\textup{oscsine}}$}
2123: % \begin{tabular}{lccc}
2124: % \hline
2125: % $n$ & GCV & CV & AGG \\ \hline
2126: % $20$ & 0.235 (0.195) & 0.167 (0.094) & 0.123 (0.09) \\
2127: % $30$ & & & \\
2128:
2129: % 0.07323741 ( 0.04325123 )
2130:
2131: % $50$ & $6.56 \times 10^{-2}$ ($9.86 \times 10^{-2}$) & $\mathbf{4.75
2132: % \times 10^{-2}}$ ($5.53 \times 10^{-2}$) & $4.84 \times 10^{-2}$
2133: % ($\mathbf{5.29 \times 10^{-2}}$) \\
2134: % $100$ & $2.82 \times 10^{-3}$ ($1.24 \times 10^{-2}$) &
2135: % $\mathbf{2.60 \times 10^{-3}}$ ($\mathbf{1.07 \times 10^{-2}}$) &
2136: % $2.89 \times 10^{-3}$ ($1.17 \times 10^{-2}$) \\
2137: % \hline
2138: % \hline
2139: % \end{tabular}
2140: % \label{tab:mises2}
2141: % \end{table*}
2142:
2143:
2144:
2145: \section{Proofs of the main results}
2146: \label{sec:proof_main_results}
2147:
2148: We recall that $P_n$ stands for the joint law of the training sample
2149: $D_n$ conditional on $X^n := (X_1, \ldots, X_n)$, that is $P_n :=
2150: P^n[\cdot | X^n]$.
2151:
2152: % Note that if $\{ z_1, \ldots, z_p \}$ is a $\delta$-cover of $A
2153: % \subset E$, where $(E, \norm{\cdot})$ is some normed space, we can
2154: % find a $2\delta$-cover of $A$ with same size $p$ which is included
2155: % in $A$. Thus, we shall always assume without loss of generality that
2156: % a $\delta$-cover is included in the space it covers.
2157:
2158: \begin{proof}[Proof of Theorem~\ref{thm:devia1}]
2159: First, we use the \emph{peeling} argument: we decompose $B_n(f_0,
2160: \delta)$ into the union of the sets $S_j$ for $j \geq 0$, where for
2161: $\delta_j := \delta 2^{-j/\beta}$
2162: \begin{equation*}
2163: S_j := B_n(f_0, \delta_j ) - B_n(f_0, \delta_{j+1}),
2164: \end{equation*}
2165: and decompose $\mathcal F$ into the union of the sets
2166: \begin{equation*}
2167: B_\cF(2^{k/\beta}) - B_\cF(2^{(k-1)/\beta}) = \{ f \in \mathcal F
2168: : 2^{(k-1) / \beta} < |f|_{\mathcal F} \leq 2^{k / \beta} \},
2169: \end{equation*}
2170: for $k \geq 1$, where $B_{\mathcal F}(2^{k/\beta}) = \{ f \in
  \mathcal F : |f|_{\mathcal F} \leq 2^{k/\beta}\}$. This gives that the
2172: left hand side of~\eqref{eq:deviaZ_n} is smaller than
2173: \begin{align*}
2174: \sum_{j \geq 0} & P_n\Big[ \sup_{ \substack{f \in S_j \text{
2175: s.t. } \\ |f|_{\mathcal F} \leq 1} } \frac{ Z(f - f_0)
2176: }{\norm{f - f_0}_n^{1 - \beta/2} (1 + |f|_{\mathcal F})^{\beta/2}
2177: } > z \Big] \\
2178: &+ \sum_{j \geq 0} \sum_{k \geq 1} P_n \Big[ \sup_{ f
2179: \in S_j\cap B_{\mathcal F}(2^{k/\beta})} \frac{ Z(f - f_0) }{\norm{f -
2180: f_0}_n^{1 - \beta/2} (1 + |f|_{\mathcal F})^{\beta/2} } > z
2181: \Big],
2182: \end{align*}
2183: which is smaller than
2184: \begin{equation*}
2185: \sum_{j,k \geq 0}P_n \Big[ \sup_{f \in
2186: B_n(f_0, \delta_j)\cap B_\cF(2^{k/\beta}) }
2187: Z(f - f_0) > z(\delta, j, k) \Big] =: \sum_{j,k \geq 0} P_{j, k},
2188: \end{equation*}
2189: where $z(\delta, j, k) := z \delta_j^{1 - \beta/2}
2190: 2^{k/2-1/2}$. Let us consider, for any $\delta > 0$, a minimal
2191: $\delta$-covering $F(\delta, k)$ of the set $B_{\mathcal
2192: F}(2^{k/\beta})$ for the
2193: $\norm{\cdot}_\infty$-norm. Assumption~$(C_\beta)$ implies
2194: \begin{equation*}
2195: | F(\delta, k) | \leq \exp\big( D (2^{k/\beta} / \delta)^{\beta} \big)
2196: = \exp( D 2^k \delta^{-\beta} ).
2197: \end{equation*}
2198: Moreover, without loss of generality, we can assume that $F(\delta,
2199: k) \subset B_{\mathcal F}(2^{k/\beta})$. For any $i \in \mathbb N$
2200: and $j, k$ fixed, we introduce
2201: \begin{equation}
2202: \label{eq:Fi}
2203: F^{(i)} := F(\delta_{i,j}, k) \text{ where } \delta_{i,j} :=
2204: \delta_j 2^{-i/\beta} = \delta 2^{-(i+j)/\beta},
2205: \end{equation}
2206: and, for any $f\in B_\cF(2^{k/\beta})$ we denote by $\pi_i(f)$ an
2207: element of $F^{(i)}$ such that $\norm{\pi_i(f) - f}_\infty \leq
2208: \delta_{i,j}$. We have
2209: \begin{align*}
2210: P_{j,k} &\leq P_n\Big[ \sup_{ f \in B_n(f_0, \delta_j)\cap
2211: B_\cF(2^{k/\beta})} | Z(\pi_0(f) - f_0) | > z(\delta, j, k) / 2
2212: \Big] \\ & + P_n \Big[ \sup_{ f \in B_n(f_0, \delta_j) \cap
2213: B_\cF(2^{k/\beta})} | Z(f - \pi_0(f))| > z(\delta, j, k) / 2
2214: \Big] \\ &=: P_{j,k,1} + P_{j,k,2}.
2215: \end{align*}
2216: First, we consider $P_{j,k,1}$:
2217: \begin{align*}
2218: P_{j,k,1} \leq P_n \Big[ \sup_{f \in F^{(0)} \cap B_n(f_0,
2219: \delta_j) } | Z(\pi_0(f) - f_0) | > z(\delta, j, k) / 2 \Big].
2220: \end{align*}
2221: We use~\eqref{eq:deviaZnf} and the union bound over $F^{(0)}$
2222: together with the fact that $f \in B_n(f_0, \delta_j)$ to obtain:
2223: \begin{equation*}
2224: P_{j,k,1} \leq |F^{(0)}| \exp\Big( \frac{-a z^2(\delta, j, k)}{4
2225: \delta_j^2} \Big) = \exp\Big( \frac{2^{j+k}}{\delta^{\beta}} (D - a z^2 / 8 ) \Big),
2226: \end{equation*}
2227: where $a := (2b^2)^{-1}$. Now, in order to control $P_{j,k,2}$, we
2228: use the so-called chaining argument, which involves increasing
2229: approximations by the covers $F^{(i)}$, see~\eqref{eq:Fi}. Let us
2230: consider
2231: \begin{equation*}
2232: E_i := (2^{1/\beta - 1/2} - 1) 2^{-i (1/\beta-1/2) }
2233: \end{equation*}
2234: for $i \geq 1$ ($E_i > 0$ since $\beta \in(0, 2)$). By linearity of
  $Z(\cdot)$ and since $\sum_{i \geq 1} E_i = 1$, we have
2236: \begin{align*}
2237: P_{j,k,2} &\leq \sum_{i \geq 1} P_n\Big[ \sup_{ \substack{ f \in
2238: B_n(f_0, \delta_j) \\ |f|_{\mathcal F} \leq 2^{k/\beta} } } |
2239: Z(\pi_i(f) - \pi_{i-1}(f)) | > E_i z(\delta, j, k) / 2 \Big] \\
2240: &=: \sum_{i \geq 1} P_{i, j, k, 2}.
2241: \end{align*}
2242: Now, since
2243: \begin{align*}
2244: \norm{\pi_i(f) - \pi_{i-1}(f)}_n &\leq \norm{\pi_i(f) -
2245: \pi_{i-1}(f)}_\infty \\
2246: &\leq \norm{\pi_i(f) - f}_\infty + \norm{\pi_{i-1}(f) - f}_\infty \\
2247: & \leq \delta_{i,j} + \delta_{i-1,j} = \delta_{i,j} (1 +
2248: 2^{1/\beta}),
2249: \end{align*}
2250: and since the number of pairs $\{ \pi_i(f), \pi_{i-1}(f) \}$ is at
2251: most
2252: \begin{equation*}
2253: |F^{(i)}| \times |F^{(i-1)}| \leq \exp \Big( \frac{3 D 2^{i + j +
2254: k}}{2 \delta^{\beta}} \Big),
2255: \end{equation*}
2256: we obtain using again~\eqref{eq:deviaZnf}:
2257: \begin{align*}
2258: P_{i, j, k, 2} &\leq |F^{(i)}| \times |F^{(i-1)}| \times
2259: \exp\Big( \frac{-a E_i^2 z^2(\delta, j, k)}{4 \delta_{i,j}^2 (1 +
2260: 2^{1/\beta})^2} \Big) \\
2261: &= \exp\Big( \frac{2^{i+j+k}}{\delta^{\beta}} \big( 3 D / 2 - C_1
2262: z^2 \big) \Big)
2263: \end{align*}
  where $C_1 = C_1(s, d, a) := a(2^{1/\beta -
    1/2} - 1)^2 / (8 (1 + 2^{1/\beta})^2) > 0$. Then, if we choose $z_1
2266: := (3 / C_1)^{1/2}$, we have for any $z \geq z_1$ and $D_1 := C_1 /
2267: 2$:
2268: \begin{align*}
2269: \sum_{j, k \geq 0} P_{j,k} &\leq \sum_{j,k \geq 0} \Big(
2270: P_{j,k,1} + \sum_{i \geq 1} P_{i,j,k,2} \Big) \\
2271: &\leq \sum_{j,k \geq 0} \Big( \exp( -D_1 2^{j+k} z^2
2272: \delta^{-\beta} ) + \sum_{i \geq 1} \exp( -D_1 2^{i+j+k} z^2
2273: \delta^{-\beta} ) \Big)
2274: \end{align*}
2275: and the Theorem follows.
2276: \end{proof}
2277:
2278:
2279: \begin{proof}[Proof of Theorem~\ref{thm:least_sq}]
2280: For short, we shall write $\bar f$ instead of $\bar f_\lambda$, and
2281: $\pen(f)$ instead of $\pen_\lambda(f)$. In view
2282: of~\eqref{eq:pena_least_sq}, we have
2283: \begin{equation}
2284: \label{eq:f_bar_prop}
2285: \norm{Y - \bar f}_n^2 + \pen(\bar f) \leq \norm{Y - f}_n^2 +
2286: \pen(f) \quad \forall f \in \mathcal F,
2287: \end{equation}
2288: which is equivalent to
2289: \begin{equation*}
2290: \norm{\bar f - f}_n^2 + \pen(\bar f) \leq 2 \prodsca{Y -
2291: f}{\bar f - f}_n + \pen(f) \quad \forall f \in \mathcal F,
2292: \end{equation*}
2293: where $\prodsca{f}{g}_n = n^{-1} \sum_{i=1}^n f(X_i) g(X_i)$. This
2294: entails, since $f_0 \in \mathcal F$, that
2295: \begin{equation}
2296: \label{eq:trick1}
2297: \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq
2298: \frac{2}{\sqrt{n}} Z(\bar f - f_0) + \pen(f_0)
2299: \end{equation}
2300: where $Z(\cdot)$ is the empirical process given
2301: by~\eqref{eq:Z_n_def}. Recall that $B_n(f_0, \delta)$ stands for the
2302: ball centered at $f_0$ with radius $\delta$ for the norm
2303: $\norm{\cdot}_n$. Let us introduce the event
2304: \begin{equation}
2305: \label{eq:event_Z}
2306: \mathcal Z(z, \delta) := \Big\{ \sup_{f \in \mathcal F \cap
2307: B_n(f_0, \delta)} \frac{ Z(f - f_0) }{\norm{f - f_0}_n^{1 -
2308: \beta/2} (1 + |f|_{\mathcal F})^{\beta/2} } \leq z \Big\}.
2309: \end{equation}
2310: In view of Theorem~\ref{thm:devia1}, see
2311: Section~\ref{sec:process_Z0}, we can find constants $z_1 > 0$ and
2312: $D_1 > 0$ such that\textup:
2313: \begin{align*}
2314: P_n\big[ \mathcal Z(z, \delta)^\complement \big] \leq \exp( - D_1
2315: z^2 \delta^{-\beta} ),
2316: \end{align*}
2317: for any $\delta > 0$ and $z \geq z_1$. When $2 n^{-1/2} Z(\bar f -
2318: f_0) \leq \pen(f_0)$, we have $\norm{\bar f - f_0}_n^2 \leq 2
2319: \pen(f_0)$. When $2 n^{-1/2} Z(\bar f - f_0) \geq \pen(f_0)$, we
2320: have, for any $z>0$, in view of~\eqref{eq:trick1}, whenever $\bar f \in B_n(f_0,
2321: \delta)$ for some $\delta > 0$, that on $\mathcal Z(z, \delta)$,
2322: \begin{equation*}
2323: \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq \frac{4 z}{\sqrt{n}}
2324: \norm{\bar f - f_0}_n^{1 - \beta/2} (1 + |\bar f|_{\mathcal
2325: F})^{\beta/2}.
2326: \end{equation*}
2327: If $|\bar f|_{\mathcal F} \leq 1$, this entails
2328: \begin{equation*}
2329: \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq ( a^{-2}(2^\beta 4
2330: z)^{4 / (2 + \beta)} + 1) h^2.
2331: \end{equation*}
2332: Otherwise, we have
2333: \begin{equation*}
2334: \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq \frac{2^{\beta/2} 4
2335: z}{\sqrt{n}} \norm{\bar f - f_0}_n^{1 - \beta/2} |\bar
2336: f|_{\mathcal F}^{\beta/2},
2337: \end{equation*}
2338: and we use the following lemma.
2339: \begin{lemma}
2340: \label{lem:logtrick}
2341: Let $r, I, h, \varepsilon$ be positive numbers, $\beta \in (0, 2)$
2342: and $\alpha > 2\beta / (\beta + 2)$. Then, if
2343: \begin{equation}
2344: \label{eq:logtrick}
2345: r^2 + h^2 I^\alpha \leq \varepsilon\, r^{1 - \beta / 2} I^{\beta / 2},
2346: \end{equation}
2347: we have
2348: \begin{equation*}
2349: r \leq ( \varepsilon^{\alpha} h^{-\beta} )^{2 / (2\alpha + \alpha \beta
2350: - 2 \beta)}, \quad I \leq (\varepsilon^2
2351: h^{-(\beta + 2)})^{2 / (2 \alpha + \alpha \beta - 2\beta)}
2352: \end{equation*}
2353: and consequently
2354: \begin{equation*}
2355: r^2 + h^2 I^\alpha \leq 2 (\varepsilon^\alpha
2356: h^{-\beta})^{4/(2\alpha + \alpha \beta - 2\beta)}.
2357: \end{equation*}
2358: \end{lemma}
2359: The proof of this Lemma is given in Section~\ref{sec:lemmas_proofs}
2360: below. It entails, since $h = a n^{-1 / (2 + \beta)}$ and $\alpha >
2361: 2\beta / (\beta+2)$, that
2362: \begin{equation*}
2363: \norm{\bar f - f_0}_n^2 + h^2 |\bar f|_{\mathcal F}^{\alpha} \leq
2364: 2 ((2^{\beta/2} 4 z)^{\alpha} a^{-\beta})^{4 / (2\alpha + \alpha
2365: \beta - 2\beta)} n^{-2 / (\beta+2)}.
2366: \end{equation*}
2367: Thus,
2368: when $\bar f \in B_n(f_0, \delta)$, we have on $\mathcal Z(z, \delta)$:
2369: \begin{equation*}
2370: \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq p(z)^2 h^2
2371: \end{equation*}
2372: where
2373: \begin{equation*}
2374: p(z)^2 := C_1 (1 + z^{4 / (2 + \beta)} + z^{4\alpha / (2\alpha + \alpha
2375: \beta - 2\beta)})
2376: \end{equation*}
2377: and $C_1$ is a constant depending on $\alpha, \beta$ and $a$.
2378: % It can be readily seen that this inequality entails
2379: % \begin{align}
2380: % \label{eq:thm1trick}
2381: % \nonumber \norm{\bar f - f_0}_n &\leq \Big(\frac{z 4
2382: % 2^{\beta/2}}{m^{\alpha} h^{2 \beta}} \Big)^{1/(2\alpha + \alpha
2383: % \beta - 2\beta)} \\ &\leq (z 4 2^{\beta/2} a^{-\beta/\alpha}
2384: % )^{2 \alpha/(2\alpha + \alpha \beta - 2\beta)} m^{-1 / (2 + \beta)},
2385: % \end{align}
2386: % where we used~\eqref{eq:bandwidth}, and
2387: % \begin{equation}
2388: % \label{eq:delta1}
2389: % \norm{\bar f - f_0}_n \leq C (1 + |f_0|^{d /(2s+d)}) p(z) m^{-s /
2390: % (2s + d)} =: p(z) \delta_1,
2391: % \end{equation}
2392: % on $\mathcal Z(z, \delta)$, where $p(z) := (z^{2\alpha s / (\alpha
2393: % (2s + d) - 2d)} \vee z^{d / (2s+d)})$ and $C := 2^{(4s+d)/(2s+d)}
2394: % \vee (4 2^{d/(2s)} a^{-d / (2\alpha s )})^{2\alpha s / (2\alpha s +
2395: % \alpha d - 2 d)}$.
2396: Let us assume for now that $\norm{\bar f - f_0}_n \leq \delta$ for
2397: some $\delta > 0$, and let us introduce
2398: \begin{equation*}
2399: \mathcal Z_1(z, \delta) := \mathcal Z(z, \delta) \cap \mathcal
2400: Z(z_1, p(z) h),
2401: \end{equation*}
2402: where $z_1$ is a constant coming from Theorem~\ref{thm:devia1}. On
2403: $\mathcal Z_1(z, \delta)$, we have
2404: \begin{equation}
2405: \label{eq:on_Z1}
2406: \norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq p(z_1)^2 h^2.
2407: \end{equation}
  Indeed, we have $\bar{f}\in B_n(f_0,\delta)$ thus, on $\mathcal Z(z, \delta)$, $\norm{\bar f - f_0}_n^2 + \pen(\bar f) \leq p(z)^2 h^2$ and so
  $\norm{\bar f - f_0}_n^2\leq p(z)^2 h^2$. Thus, on the event $\mathcal
  Z(z_1, p(z) h)$, we have~\eqref{eq:on_Z1}. Moreover, Theorem~\ref{thm:devia1} yields
2411: \begin{equation}
2412: \label{eq:deviaB1}
2413: P_n \big[ \mathcal Z_1( z, \delta)^\complement \big] \leq \exp(
2414: -D_1 z^2 \delta^{-\beta}) + \exp( -D_1 z_1^2 (p(z) h)^{-\beta}).
2415: \end{equation}
2416: Now, in view of~\eqref{eq:f_bar_prop} and since $f_0 \in \mathcal
  F$, we have the following rough upper bound:
2418: \begin{align}
2419: \nonumber \norm{\bar f - f_0}_n^2 + \pen(\bar f) &\leq 2
2420: (\norm{\bar f - Y}^2_n + \pen(\bar f) ) + 2 \norm{f_0 - Y}_n^2
2421: \\ \nonumber &\leq 2 ( \norm{f_0 - Y}_n^2 + \pen(f_0)) + 2
2422: \norm{f_0 - Y}_n^2 \\
2423: \label{eq:rough1}
2424: &\leq 4 \sigma^2 \norm{\varepsilon}_n^2 + 2 \pen(f_0),
2425: \end{align}
2426: which entails
2427: \begin{equation*}
2428: E_n\big[ \big( \norm{\bar f - f_0}_n^2 + \pen(\bar f) \big)^2
2429: \big] \leq \sigma^4 C(\varepsilon)^2 + 8 h^4 |f_0|_{\mathcal F}^{2\alpha}
2430: \end{equation*}
2431: where $C(\varepsilon)^2 = 32( E[\varepsilon^4] / n + 2
2432: (E[\varepsilon^2])^2)$. Putting all this together, we obtain, by a
2433: decomposition of $E_n[\norm{\bar f - f_0}_n^2 + \pen(\bar f)]$ over
2434: the union of the sets $\{ \norm{\bar f - f_0}_n\leq \delta \} \cap
2435: \mathcal Z_1(z, \delta)$, $\mathcal Z_1(z, \delta)^\complement$ and
2436: $\{\norm{\bar f - f_0}_n > \delta \}$ that
2437: \begin{align*}
2438: E_n[ \norm{\bar f - &f_0}_n^2 + \pen(\bar f)] \leq p(z_1)^2 h^2 \\
2439: &+ (\sigma^2 C(\varepsilon) + 2\sqrt{2} h^2
2440: |f_0|_{\mathcal F}^\alpha)\big(
2441: P_n[ \mathcal Z_1(z, \delta)^\complement]^{1/2}+P_n[ \norm{\bar f - f_0}_n > \delta]^{1/2}\big).
2442: \end{align*}
2443: In view of~\eqref{eq:rough1}, if $\delta > 2 \pen(f_0)\vee1$ then we have
2444: $\{ \norm{\bar f - f_0}_n^2 > \delta^2 \} \subset \{
2445: \norm{\varepsilon}_n^2 > (\delta^2 - \delta) / (4 \sigma^2)\}$.
2446: Thus, using the subgaussianity assumption~\eqref{eq:subgaussian}, we
2447: have $P[ \norm{\bar f - f_0}_n > \delta ]^{1/2} \leq \exp( - (\delta^2
2448: - \delta)^2 / (8 \sigma^2)) \leq ( \exp(-C_2 (\log n)^4)) =
2449: o(h^2)$ if one chooses $\delta = \log n$. Now,
2450: using~\eqref{eq:deviaB1} with this choice of $\delta$ and $z = (\log
2451: n)^{1 + \beta/2}$ we have also $P_n[ \mathcal Z_1(z,
2452: \delta)^\complement]^{1/2} \leq \exp( -C_3 (\log n)^2) =
2453: o(h^2)$. This concludes the proof of the first upper bound of
2454: Theorem~\ref{thm:least_sq}.
2455:
2456: To prove the upper bound for the integrated norm $\norm{\cdot}$
2457: instead of the empirical norm $\norm{\cdot}_n$, we decompose
2458: $\norm{\bar f - f_0}^2 = A_1 + A_2$ where
2459: \begin{equation*}
2460: A_1 := \norm{\bar f - f_0}^2 - 8 ( \norm{\bar f - f_0}_n^2
2461: + \pen(\bar f)) \text{ and } A_2 := 8 ( \norm{\bar f - f_0}_n^2 +
2462: \pen(\bar f)).
2463: \end{equation*}
2464: The first part of Theorem~\ref{thm:least_sq} provides
2465: \begin{equation*}
2466: E^n[A_2] \leq C_1 ( 1 + |f_0|_{\mathcal F}^\alpha) n^{-2 / (2 +
2467: \beta)}.
2468: \end{equation*}
2469: Recall that we assumed that $\norm{\bar f - f_0}_\infty \leq Q$
2470: a.s. for the second part of the Theorem. To handle $A_1$, we use the
2471: following Lemma.
2472: \begin{lemma}
2473: \label{lem:devia2}
2474: Let $(\mathcal F, |\cdot|_{\mathcal F})$ and $h$ satisfy the same
2475: assumptions as in Theorem~\ref{thm:least_sq}. Define $\mathcal F_Q
2476: := \{ f \in \mathcal F : \norm{f - f_0}_\infty \leq Q \}$. We can
2477: find constants $z_0, D_0 > 0$ such that for any $z \geq
2478: z_0$\textup:
2479: \begin{align*}
2480: P_X^n \big[ \exists f \in \mathcal F_Q : \norm{f - f_0}^2 &- 8
2481: (\norm{f - f_0}_n^2 + \pen(f)) \geq 10 z h^2 \big] \\
2482: &\leq \exp \big( - D_0 n h^2 z \big),
2483: \end{align*}
2484: where $z_0$ and $D_0$ are constants depending on $a, \alpha,
2485: \beta$ and $Q$.
2486: \end{lemma}
2487: The proof of Lemma~\ref{lem:devia2} is given in
2488: Section~\ref{sec:lemmas_proofs}. Using together
2489: Lemma~\ref{lem:devia2} and the fact that $A_1 \leq Q^2$ a.s., we
2490: have by a decomposition over the union of $\{ A_1 \geq 10 z_0 h^2
2491: \}$ and $\{ A_1 < 10 z_0 h^2 \}$:
2492: \begin{equation*}
2493: E^n [A_1] \leq 10 z_0 h^2 + o(h^2).
2494: \end{equation*}
2495: This concludes the proof of Theorem~\ref{thm:least_sq}.
2496: \end{proof}
2497: %%%% END OF PROOF %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2498:
2499:
2500: \begin{figure}[htbp]
2501: %% \includegraphics[width=12cm]{designs.pdf}
2502: \begin{tikzpicture}[scale=2]
2503: \draw[thick] (0,0) node[anchor=south] {$f_0$} circle (1); %
2504: \draw (0,0) -- (1,0) node[anchor=west] {$f_1$}; %
2505: \draw (0,0) -- (40:1cm) node[anchor=south west]{$f_{M-1}$}; %
2506: \draw (0,0) -- (150:1cm) node[anchor=south east] {$f_3$}; %
2507: \draw (0,0) -- (190:1cm) node[anchor=east]{$f_{2}$}; %
2508: \draw (0,0) -- (290:1cm) ; %
2509: \draw[<->, very thick] (290:1cm) -- (290:0.6cm) node[anchor=east]
2510: {$f_M$} node[pos=0.5, right] {$h$}; %
2511: \draw[mark=x] (50:1cm) ;
2512: \end{tikzpicture}
2513: \caption{Example of a setup in which ERM performs badly. The set
2514: $F(\Lambda) = \{f_1, \ldots, f_M \}$ is the dictionary from which
2515: we want to mimic the best element and $f_0$ is the regression
2516: function.}
2517: \label{fig:badsetup}
2518: \end{figure}
2519:
2520: \begin{proof}[Proof of Theorem \ref{TheoWeaknessERMRegression}]
2521: We consider a random variable $X$ uniformly distributed on $[0,1]$
2522: and its dyadic representation:
2523: \begin{equation}
2524: \label{EquaDyadicRegression}
2525: X = \sum_{k = 1}^{+\infty} X^{(k)} 2^{-k},
2526: \end{equation}
  where $(X^{(k)} : k \geq 1)$ is a sequence of i.i.d.\ random
  variables following a Bernoulli distribution $\cB(1/2,1)$ with parameter $1/2$.
2529: The random variable $X$ is the design of the regression model worked
2530: out here. For the regression function we take
2531: \begin{equation}
    \label{RegressionFunctionDyadic}
2533: f_0(x) =
2534: \begin{cases}
2535: \; 2h &\text{ if } x^{(M)} = 1 \\
2536: \; h & \text{ if } x^{(M)} = 0,
2537: \end{cases}
2538: \end{equation}
2539: where $x$ has the dyadic decomposition $x=\sum_{k \geq 1}
2540: x^{(k)}2^{-k}$ where $x^{(k)} \in \{ 0, 1 \}$ and
2541: \begin{equation*}
2542: h=\frac{C}{4}\sqrt{\frac{\log M}{n}}.
2543: \end{equation*}
2544: We consider the dictionary of functions $F_M = \{f_1, \ldots, f_M\}$
2545: \begin{equation}
2546: \label{FunctionBasisRegression}
2547: f_j(x) = 2x^{(j)}-1, \quad \forall j\in\{1,\ldots,M\},
2548: \end{equation}
2549: where again $(x^{(j)} : j \geq 1)$ is the dyadic decomposition of $x
2550: \in [0,1]$. The dictionary $F_M$ is chosen so that we have, for any
2551: $j \in \{ 1, \ldots ,M-1 \}$
2552: \begin{equation*}
2553: \| f_j - f_0 \|_{L^2([0,1])}^2 = \frac{5 h^2}{2} + 1 \;\text{ and }\;
2554: \|f_M - f_0 \|_{L^2([0,1])}^2 = \frac{5h^2}{2} - h + 1.
2555: \end{equation*}
2556: Thus, we have
2557: \begin{equation*}
2558: \min_{j=1,\ldots,M} \|f_j - f_0 \|_{L^2([0,1])}^2 = \|f_M - f_0
2559: \|_{L^2([0,1])}^2 = \frac{5h^2}{2} -h + 1.
2560: \end{equation*}
2561: This geometrical setup for $F(\Lambda)$, which is an unfavourable
2562: setup for the ERM, is represented in Figure~\ref{fig:badsetup}. For
2563: \begin{equation*}
2564: \hat{f}_n := \tilde{f}_n^{\rm PERM} \in \argmin_{f \in F_M}
2565: \big(R_n(f) + \pen(f) \big),
2566: \end{equation*}
2567: where we take $R_n(f) = \frac{1}{n} \sum_{i=1}^n (Y_i-f(X_i))^2 =\|
2568: Y - f \|^2_n$, we have
2569: \begin{equation}
2570: \label{InegGaussian}
2571: E \|\hat{f}_n - f_0 \|_{L^2([0,1])}^2 =
2572: \min_{j=1,\ldots,M} \|f_j - f_0 \|_{L^2([0,1])}^2 + h
2573: P[\hat{f}_n\neq f_M].
2574: \end{equation}
2575: Now, we upper bound $P[ \hat{f}_n= f_M]$. If we define
2576: \begin{equation*}
2577: N_j := \frac{1}{\sqrt{n}} \sum_{i=1}^n\zeta_i^{(j)}
2578: \varepsilon_i \text{ and } \zeta_i^{(j)} := 2X_i^{(j)}-1,
2579: \end{equation*}
2580: we have by the definition of $h$ and since $\zeta_i^{(j)} \in \{ -1,
2581: 1\}$:
2582: \begin{align*}
2583: \frac{\sqrt{n}}{2 \sigma} (\norm{Y - f_M}_n^2 &- \norm{Y -
2584: f_j}_n^2) \\
2585: & = N_j - N_M + \frac{h}{2 \sigma \sqrt{n}} \sum_{i=1}^n
2586: (\zeta_i^{(j)} \zeta_i^{(M)} + 3(\zeta_i^{(j)} - \zeta_i^{(M)}) -
2587: 1) \\
2588: &\geq N_j - N_M - \frac{4C}{\sigma} \sqrt{\log M}.
2589: \end{align*}
2590: This entails, for $\bar N_{M-1} := \max_{1 \leq j \leq M-1} N_j$,
2591: that
2592: \begin{align*}
2593: P[ \hat{f}_n= f_M] &= P \Big[ \bigcap_{j=1}^{M-1} \Big\{ \norm{Y -
2594: f_M}_n^2 - \norm{Y - f_j}_n^2 \leq \pen(f_j) - \pen(f_M) \Big\}
2595: \Big] \\
2596: &\leq P\Big[ N_M \geq \bar N_{M-1} - \frac{6C}{\sigma} \sqrt{\log
2597: M} \Big].
2598: \end{align*}
2599: % \begin{eqnarray*}
2600: % \lefteqn{\mathbb{P}[\hat{f}_n= f_M]= \mathbb{P}[\forall
2601: % j=1,\ldots,M-1, A_n(f_M)+
2602: % {\rm{pen}}(f_M)\leq A_n(f_j)+{\rm{pen}}(f_j)]}\\
2603: % &=&\mathbb{P}[\forall j=1,\ldots,M-1,
2604: % \frac{1}{\sqrt{n}}\sum_{i=1}^n (Y_i-f_M(X_i))^2 \leq
2605: % \frac{1}{\sqrt{n}}\sum_{i=1}^n
2606: % (Y_i-f_j(X_i))^2\\
2607: % &&+\sqrt{n}({\rm{pen}}(f_j)-{\rm{pen}}(f_M))]\\
2608: % &\leq& \mathbb{P}[\forall j=1,\ldots,M-1, N_M\geq
2609: % N_j\\&&+\frac{1}{\sigma\sqrt{n}}\sum_{i=1}^n
2610: % \frac{h}{2}(\zeta_i^{(M)}\zeta_i^{(j)}-1)
2611: % +\frac{3h}{2}(\zeta_i^{(j)}-1)-\frac{C}{\sigma}\sqrt{\log M}],
2612: % \end{eqnarray*}
2613: % where for any
2614: % $j=1,\ldots,M$,
2615: It is easy to check that $N_1, \ldots, N_M$ are $M$ standard
2616: Gaussian random variables that are uncorrelated (but dependent). We
2617: denote by $\boldsymbol{\zeta}$ the family of Rademacher variables
2618: $(\zeta_i^{(j)} : i=1,\ldots,n ; j=1,\ldots,M)$. We have for any
2619: $6C/\sigma <\gamma< (2\sqrt{2}c^*)^{-1}$ ($c^*$ is the ``Sudakov
2620: constant'', see Theorem~\ref{TheoSudakov}),
2621: \begin{align}
2622: \label{EquaSudakov}
2623: P[\hat{f}_n = f_M] &\leq E \Big[ P\Big( N_M \geq \bar N_{M-1} -
2624: \frac{6C}{\sigma}\sqrt{\log M} \Big| \boldsymbol{\zeta} \Big)
2625: \Big] \nonumber \\
2626: &\leq P \big[ N_M \geq - \gamma \sqrt{\log M}
2627: + E(\bar N_{M-1} | \boldsymbol{\zeta} ) \big] \\
2628: &+ E \Big[ P\Big\{ E( \bar N_{M-1} | \boldsymbol{\zeta} ) - \bar
2629: N_{M-1} \geq (\gamma - \frac{6C}{\sigma}) \sqrt{\log M} \Big|
2630: \boldsymbol{\zeta} \Big\} \Big]. \nonumber
2631: \end{align}
2632: Conditionally to $\boldsymbol{\zeta}$, the vector
2633: $(N_1,\ldots,N_{M-1})$ is a linear transform of the Gaussian vector
2634: $(\varepsilon_1, \ldots, \varepsilon_n)$. Hence, conditionally to
2635: $\boldsymbol{\zeta}$, $(N_1,\ldots,N_{M-1})$ is a gaussian
2636: vector. Thus, we can use a standard deviation result for the
2637: supremum of Gaussian random vectors (see for
2638: instance~\cite{massart03}, Chapter~3.2.4), which leads to the
2639: following inequality for the second term of the RHS
2640: in~\eqref{EquaSudakov}:
2641: \begin{align*}
2642: % \label{EquaSecondTerm}
2643: P \Big\{ E( \bar N_{M-1} | \boldsymbol{\zeta} ) - \bar N_{M-1}
2644: \geq (\gamma &- \frac{6C}{\sigma}) \sqrt{\log M} \Big|
2645: \boldsymbol{\zeta}
2646: \Big\} \\
2647: &\leq \exp(-(3C/\sigma-\gamma/2)^2\log M).
2648: % \mathbb{P}[\mathbb{E} [ \max_{j=1,\ldots,M-1}N_j |
2649: % \boldsymbol{\zeta}]&-\max_{j=1,\ldots,M-1}N_j\geq
2650: % (\gamma-2C/\sigma)\sqrt{\log
2651: % M}|\boldsymbol{\zeta}]\nonumber\\
2652: % &\leq \exp(-(C/\sigma-\gamma/2)^2\log M).
2653: \end{align*}
2654: Remark that we used $E[ N_j^2 | \boldsymbol{\zeta}] = 1$ for any $j
2655: = 1, \ldots, M-1$. For the first term in the RHS
2656: of~\eqref{EquaSudakov}, we have
2657: \begin{align}
2658: \label{EquaIerTermSudakov}
2659: P &\Big [N_M \geq - \gamma \sqrt{\log M}
2660: + E( \bar N_{M-1} | \boldsymbol{\zeta} ) \Big] \nonumber\\
2661: &\leq P \Big[N_M \geq - 2 \gamma \sqrt{\log M}
2662: + E(\bar N_{M-1}) \Big] \\
2663: &+P \Big[ - \gamma\sqrt{\log M} + E(\bar N_{M-1}) \geq E(\bar
2664: N_{M-1} | \boldsymbol{\zeta}) \Big]. \nonumber
2665: \end{align}
2666: Next, we use Sudakov's Theorem (cf. Theorem \ref{TheoSudakov} in
2667: Appendix~\ref{sec:appendix_proba}) to lower bound $E( \bar
2668: N_{M-1})$. Since $(N_1,\ldots,N_{M-1})$ is, conditionally to
2669: $\boldsymbol{\zeta}$, a Gaussian vector and since for any $1 \leq j
2670: \neq k \leq M$ we have
2671: \begin{equation*}
2672: E[(N_k-N_j)^2 | \boldsymbol{\zeta}] = \frac{1}{n}
2673: \sum_{i=1}^n (\zeta_i^{(k)} - \zeta_i^{(j)})^2
2674: \end{equation*}
2675: then, according to Sudakov's minoration
2676: (cf. Theorem~\ref{TheoSudakov} in the Appendix), there exists an
2677: absolute constant $c^* > 0$ such that
2678: \begin{equation*}
2679: %\label{EquaSudakPrimal}
2680: c^* E[\bar N_{M-1} | \boldsymbol{\zeta}] \geq
2681: \min_{1 \leq j \neq k \leq M-1} \Big(\frac{1}{n}\sum_{i=1}^n
2682: (\zeta_i^{(k)} - \zeta_i^{(j)})^2\Big)^{1/2} \sqrt{\log M}.
2683: \end{equation*}
2684: Thus, we have
2685: \begin{align*}
2686: % \label{EquaSudak3}
2687: c^* E[\bar N_{M-1}] &\geq E\Big[ \min_{j \neq k} \Big(\frac{1}{n}
2688: \sum_{i=1}^n (\zeta_i^{(k)} - \zeta_i^{(j)})^2
2689: \Big)^{1/2} \Big] \sqrt{\log M} \\
2690: &\geq \sqrt{2} \Big(1 - E\Big[ \max_{j\neq k} \frac{1}{n}
2691: \sum_{i=1}^n \zeta_i^{(k)} \zeta_i^{(j)} \Big] \Big) \sqrt{\log M},
2692: \end{align*}
2693: where we used the fact that $\sqrt{x} \geq x/\sqrt{2}, \forall x \in
2694: [0,2]$.
2695: % \begin{equation}
2696: % \label{equaSudak2}
2697: % E\Big[\min_{k \neq j \in \{1, \ldots, M-1\} } \Big( \frac{1}{n}
2698: % \sum_{i=1}^n (\zeta_i^{(k)} - \zeta_i^{(j)})^2 \Big)^{1/2} \Big]
2699: % \geq \sqrt{2} \Big(1 - E\Big[ \max_{j\neq k} \frac{1}{n} \sum_{i=1}^n
2700: % \zeta_i^{(k)} \zeta_i^{(j)} \Big] \Big)
2701: % \end{equation}
2702: Besides, using Hoeffding's inequality we have $E[\exp(s
2703: \xi^{(j,k)})] \leq \exp(s^2/(2n))$ for any $s > 0$, where
2704: $\xi^{(j,k)} := n^{-1} \sum_{i=1}^n \zeta_i^{(k)} \zeta_i^{(j)}$.
2705: Then, using a maximal inequality (cf. Theorem~\ref{TheoMaxConcIneq}
2706: in Appendix~\ref{sec:appendix_proba}) and since $n^{-1}
2707: \log[(M-1)(M-2)] \leq 1/4$, we have
2708: \begin{equation}
2709: \label{EquaSudakFinal}
2710: E\Big[\max_{j\neq k} \frac{1}{n} \sum_{i=1}^n
2711: \zeta_i^{(k)} \zeta_i^{(j)} \Big] \leq
2712: \Big(\frac{1}{n} \log[(M-1)(M-2)] \Big)^{1/2} \leq
2713: \frac{1}{2}.
2714: \end{equation}
2715: This entails
2716: \begin{equation*}
2717: c^* E[ \bar N_{M-1} ] \geq \Big(\frac{\log M}{2} \Big)^{1/2}.
2718: \end{equation*}
2719: Thus, using this inequality in the first term of the RHS
2720: of~\eqref{EquaIerTermSudakov} and the usual inequality on the tail
2721: of a Gaussian random variable ($N_M$ is standard Gaussian), we
2722: obtain:
2723: \begin{align}
2724: \label{EquaFirstTerm}
2725: P\Big[N_M \geq &-2\gamma \sqrt{\log M} + E(\bar N_{M-1}) \Big]
2726: \leq P\Big[ N_M \geq ((c^*\sqrt{2})^{-1}-2\gamma)
2727: \sqrt{\log M}\Big]\nonumber\\
2728: &\leq P\Big[N_M \geq ((c^*\sqrt{2})^{-1}-2\gamma)
2729: \sqrt{\log
2730: M}\Big]\\
2731: &\leq \exp\Big(-((c^*\sqrt{2})^{-1}-2\gamma)^2(\log
2732: M)/2\Big).\nonumber
2733: \end{align}
2734: Remark that we used $2\sqrt{2}c^* \gamma < 1$. For the second term
2735: in (\ref{EquaIerTermSudakov}), we apply the concentration inequality
2736: of Theorem \ref{TheoEinmahlMasson} to the non-negative random
2737: variable $E[\bar N_{M-1}|\boldsymbol{\zeta}]$. We first have to
2738: control the second moment of this variable. We know that,
2739: conditionally to $\boldsymbol{\zeta}$,
2740: $N_j|\boldsymbol{\zeta}\sim\cN(0,1)$ thus,
2741: $N_j|\boldsymbol{\zeta}\in L_{\psi_2}$ (for more details on Orlicz
2742: norms, we refer the reader to~\cite{vdVW:96}). Thus,
2743: \begin{equation*}
2744: \norm{\max_{1\leq j\leq M-1} N_j|\boldsymbol{\zeta}}_{\psi_2}\leq K
2745: \psi_2^{-1}(M)\max_{1\leq j\leq M-1}\norm{N_j|\boldsymbol{\zeta}}_{\psi_2}
2746: \end{equation*}
2747: (cf. Lemma 2.2.2 in \cite{vdVW:96}). Since
2748: $\norm{N_j|\boldsymbol{\zeta}}_{\psi_2}^2=1$, we have $\norm{\max_{1\leq j\leq M-1}
2749: N_j|\boldsymbol{\zeta}}_{\psi_2}\leq K \sqrt{\log M}$. In particular, we have
2750: $E\big[\max_{1\leq j\leq M-1} N_j^2|\boldsymbol{\zeta}\big]\leq
2751: K\log M$ and so $E\big(E[\bar
2752: N_{M-1}|\boldsymbol{\zeta}]\big)^2\leq K\log M$. Theorem
2753: \ref{TheoEinmahlMasson} provides
2754: \begin{equation}
2755: \label{SecondTermEquaSuda}
2756: P\Big[ -\gamma\sqrt{\log
2757: M}+E[\bar N_{M-1}]\geq E[\bar N_{M-1}|\boldsymbol{\zeta}]\Big]\leq
2758: \exp(-\gamma^2/c_0),
2759: \end{equation}
2760: where $c_0$ is an absolute constant.
2761:
2762: Finally, combining the bound on the second term of the RHS
2763: of~\eqref{EquaSudakov} with~\eqref{EquaIerTermSudakov},
2764: \eqref{EquaFirstTerm} and~\eqref{SecondTermEquaSuda}, we obtain
2765: \begin{align*}
2766: P[\hat{f}_n= f_M] &\leq \exp(-(3C/\sigma-\gamma/2)^2\log M)\\
2767: &+
2768: \exp\Big(-((c^*\sqrt{2})^{-1}-2\gamma)^2(\log M)/2\Big)+
2769: \exp(-\gamma^2/c_0).
2770: \end{align*}
2771: Take $\gamma=(12\sqrt{2}c^*)^{-1}$. It is easy to find an integer $M_0(\sigma)$ depending only on $\sigma$ such that for any $M\geq M_0$, we have $P[\hat{f}_n= f_M]\leq c_1<1$, where $c_1$ is an absolute constant.
2772: We complete the proof by using this last result in
2773: (\ref{InegGaussian}).
2774: \end{proof}
2775: %%%% END OF PROOF %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%AGGREGATION
2776:
2777:
2778: \begin{proof}[Proof of Theorem~\ref{thm:oracle}]
2779: We recall that we have a dictionary (set of functions) $F(\Lambda)$
2780: of cardinality $M$ such that $\norm{f_\lambda - f_0}_\infty \leq Q$
2781: for all $\lambda \in \Lambda$. Let us define the risk
2782: \begin{equation*}
2783: R(f) := E[(Y - f(X))^2]
2784: \end{equation*}
2785: and the linearized risk over $F(\Lambda)$, given by
2786: \begin{equation*}
2787: \mathsf R(\theta) := \sum_{\lambda \in \Lambda} \theta_\lambda
2788: R(f_\lambda)
2789: \end{equation*}
2790: for $\theta \in \Theta$, where we recall that
2791: \begin{equation*}
2792: \Theta := \{ \theta \in \mathbf R^{|\Lambda|} ; \theta_\lambda
2793: \geq 0,\; \sum_{\lambda \in \Lambda} \theta_\lambda = 1 \}.
2794: \end{equation*}
2795: We denote by $R_{n}(f)$ the empirical risk of $f$ over the sample
2796: $D_{n}$, which is given by
2797: \begin{equation*}
2798: R_{n}(f) := \frac{1}{n} \sum_{i=1}^n (Y_i - f(X_i))^2,
2799: \end{equation*}
2800: and we define similarly the linearized empirical risk
2801: \begin{equation*}
2802: \mathsf R_{n}(\theta) := \sum_{\lambda \in \Lambda}
2803: \theta_\lambda R_{n}(f_\lambda).
2804: \end{equation*}
2805: The excess risk of a function $f$ is given by $R(f) - R(f_0) =
2806: \norm{f - f_0}^2$. By convexity of the risk, the aggregate $\hat
2807: {\mathsf f}= \sum_{\lambda \in \Lambda} \hat \theta_\lambda
2808: f_\lambda$ defined in (\ref{eq:aggregate}), satisfies, for any $a >
2809: 0$,
2810: \begin{align*}
2811: R(\hat {\mathsf f}) - R(f_0) &\leq \mathsf R(\hat \theta) - R(f_0) \\
2812: &\leq (1 + a) (\mathsf R_{n}(\hat \theta) - R_{n}(f_0)) \\
2813: &+ \mathsf R(\hat \theta) - R(f_0) - (1 + a) (\mathsf R_{n}(\hat
2814: \theta) - R_{n}(f_0)),
2815: \end{align*}
2816: where it is easy to see that the Gibbs weights $\hat \theta = (\hat
2817: \theta_\lambda)_{\lambda \in \Lambda} = (\hat
2818: \theta(f_\lambda))_{\lambda \in \Lambda}$ are the unique solution to
2819: the minimization problem
2820: \begin{equation*}
2821: \min_{\theta \in \Theta} \Big\{ \mathsf R_{n}(\theta) +
2822: \frac{T}{ n} \sum_{\lambda \in \Lambda} \theta_\lambda \log
2823: \theta_\lambda \Big\},
2824: \end{equation*}
2825: where $T$ is the temperature parameter, see~\eqref{eq:weights}, and
2826: where we use the convention $0 \log 0 = 0$. Let $\hat \lambda$ be
2827: such that $f_{\hat \lambda}$ is the ERM in $F(\Lambda)$, namely
2828: \begin{equation*}
2829: R_{n}(f_{\hat \lambda}) := \min_{\lambda \in \Lambda}
2830: R_{n}(f_\lambda).
2831: \end{equation*}
2832: Since
2833: \begin{equation*}
2834: \sum_{\lambda \in \Lambda} \hat \theta_\lambda \log \Big( \frac{\hat
2835: \theta_\lambda}{1 / |\Lambda|} \Big) = K(\hat \theta | u) \geq 0
2836: \end{equation*}
2837: where $K(\hat \theta | u)$ denotes the Kullback-Leibler divergence
2838: between the weights $\hat \theta$ and the uniform weights $u := (1 /
2839: |\Lambda|)_{\lambda \in \Lambda}$, we have
2840: \begin{align*}
2841: \mathsf R_{n}(\hat \theta) &\leq \mathsf R_{n}(\hat \theta) +
2842: \frac{T}{ n} K(\hat \theta | u) \\
2843: &= \mathsf R_{n}(\hat \theta) + \frac{T}{n} \sum_{\lambda \in \Lambda} \hat
2844: \theta_\lambda \log \hat \theta_\lambda + \frac{T\log |\Lambda|}{
2845: n} \\
2846: &\leq \mathsf R_{n}(e_{\hat \lambda}) + \frac{T\log |\Lambda|}{
2847: n} = R_{n}(f_{\hat \lambda}) + \frac{T\log |\Lambda|}{n},
2848: \end{align*}
2849: where $e_\lambda \in \Theta$ is the vector with $1$ for the
2850: $\lambda$-th coordinate and $0$ elsewhere. This gives
2851: \begin{align*}
2852: R(\hat {\mathsf f}) - R(f_0) &\leq (1 + a) \min_{\lambda \in \Lambda}
2853: (R_{n}(f_\lambda) - R_{n}(f_0))+ (1 + a)
2854: \frac{T\log |\Lambda|}{ n} \\
2855: &+ \mathsf R(\hat \theta) - R(f_0) - (1 + a) (\mathsf R_{n}(\hat
2856: \theta) - R_{n}(f_0)),
2857: \end{align*}
2858: and consequently
2859: \begin{align*}
2860: E \norm{\hat {\mathsf f} - f_0}^2 &\leq (1 + a) \min_{\lambda \in
2861: \Lambda} \norm{f_\lambda - f_0}^2 + (1 + a) \frac{T\log
2862: |\Lambda|}{n} \\
2863: &+ E[ \mathsf R(\hat \theta) - R(f_0) - (1 + a) (\mathsf
2864: R_{n}(\hat \theta) - R_{n}(f_0)) ].
2865: \end{align*}
2866: Since $\mathsf R(\cdot)$ and $\mathsf R_{n}$ are linear on
2867: $\Theta$, we have
2868: \begin{align*}
2869: \mathsf R(\hat \theta) - R(f_0) &- (1 + a) (\mathsf R_{n}(\hat
2870: \theta) - R_{n}(f_0)) \\
2871: &\leq \max_{f \in F(\Lambda)} ( R(f) - R(f_0) - (1 + a)
2872: (R_{n}(f) - R_{n}(f_0)) ).
2873: \end{align*}
2874: Thus, we have
2875: \begin{equation}\label{eq:Main0}
2876: E \norm{\hat {\mathsf f} - f_0}^2 \leq (1 + a)
2877: \min_{\lambda \in \Lambda} \norm{f_\lambda - f_0}^2 + (1 + a)
2878: \frac{T \log |\Lambda|}{n} + \mathcal R_n,
2879: \end{equation}
2880: where $\mathcal R_n := E [ \max_{f \in F(\Lambda)} \{ R(f) - R(f_0)
2881: - (1 + a) (R_{n}(f) - R_{n}(f_0)) \} ] $. Now, we upper bound
2882: $\mathcal R_n$. Introduce the random variables
2883: \begin{align*}
2884: \tilde{Z}_i(f) &:= (f(X_i) - f_0(X_i))^2 + 2 \sigma \varepsilon_i
2885: I( |\varepsilon_i| \leq K) (f_0(X_i) - f(X_i)), \\
2886: \bar Z_i(f) &:= 2 \sigma \varepsilon_i I(|\varepsilon_i| > K)
2887: (f_0(X_i) - f(X_i)),
2888: \end{align*}
2889: and the two following processes indexed by $f \in F(\Lambda)$:
2890: \begin{equation*}
2891: \tilde{\zeta}(f) := \frac{1}{n}\sum_{i=1}^n \Big(
2892: E[\tilde{Z}_i(f)] - (1+a) \tilde{Z}_i(f) \Big) \text{ and }
2893: \bar{\zeta}(f) := \frac{1+a}{n} \sum_{i=1}^n\bar{Z}_i(f).
2894: \end{equation*}
2895: We use the symmetry of $\varepsilon$ to get
2896: \begin{equation*}
2897: \mathcal R_n \leq E \Big[ \max_{f \in F(\Lambda)}
2898: \tilde{\zeta}(f) \Big] + E \Big[ \max_{f \in F(\Lambda)}
2899: \bar{\zeta}(f) \Big].
2900: \end{equation*}
2901: First, we upper bound $E[ \max_{f \in F(\Lambda)}
2902: \tilde{\zeta}(f)]$. The random variables $\tilde{Z}_i(f)$ are
2903: bounded and satisfy the following Bernstein-type condition
2904: (see~\cite{BM:06}): $\forall f \in F(\Lambda), E [
2905: \tilde{Z}_i(f)^2] \leq (Q^2 + 4 \sigma^2) E[\tilde{Z}_i(f)]$. We
2906: apply the union bound and Bernstein's inequality
2907: (cf.~\cite{vdVW:96}) to get, for any $\delta>0$,
2908: \begin{align*}
2909: P \Big[\max_{f\in F(\Lambda)} \tilde{\zeta}(f) \geq \delta \Big]
2910: &\leq \sum_{f\in F(\Lambda)} P\Big[ \frac{1}{n}\sum_{i=1}^n
2911: E[\tilde{Z}_i(f)] - \tilde{Z}_i(f) \geq
2912: \frac{\delta + a E[\tilde{Z}_i(f)] }{1+a} \Big] \\
2913: &\leq M \exp(-C n \delta),
2914: \end{align*}
2915: where $C := a [8 (Q^2 + \sigma^2) (1 + a)^2 + (4Q / 3)(1 + a)(Q +
2916: 2K)]^{-1}$. Hence, a direct computation gives
2917: \begin{equation}
2918: \label{eq:I1}
2919: E\Big[ \max_{f\in F(\Lambda)} \tilde{\zeta}(f) \Big] \leq
2920: \frac{4 \log M}{C n}.
2921: \end{equation}
2922: Now, we upper bound $E [\max_{f\in F(\Lambda)}\bar{\zeta}(f) ]$. We
2923: have
2924: \begin{align}
2925: \label{eq:I2}
2926: \nonumber E \Big[ \max_{f\in F(\Lambda)} \bar{\zeta}(f) \Big]
2927: &\leq 4 Q (1 + a) E \big[ |\varepsilon| I(|\varepsilon| > K) \big] \\
2928: &\leq 4 Q (1 + a) \sigma P (|\varepsilon|>K)^{1/2} \\
2929: &\leq 4Q(1+a) \sigma \exp(-K^2 / (2 b_\varepsilon^2)).
2930: \end{align}
2931: Finally, combining equations~\eqref{eq:Main0},~\eqref{eq:I1}
2932: and~\eqref{eq:I2} with $K = b_\varepsilon \sqrt{2 \log n}$
2933: concludes the proof of Theorem~\ref{thm:oracle}.
2934: \end{proof}
2935:
2936: %%%% END OF PROOF %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%AGGREGATION
2937: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2938:
2939:
2940:
2941: % \begin{proof}[Proof of Theorem~\ref{thm:adaptive_anisotropic}]
2942:
2943: % Let $f_0 \in B_{p, q}^{\bs s}$ for $\bs s \in \bs S$. Consider
2944: % $\bs s_n \in \bs S_n$ such that $\bs s_n \leq \bs s \leq \bs s_n +
2945: % u (\log n)^{-1}$ coordinatewise, where $u = (1, \ldots, 1)$. In
2946: % view of embedding~\eqref{eq:anisotropic_embedding}, we have $B_{p,
2947: % q}^{\bs s} \subset B_{p, q}^{\bs s_n}$ and if $r_n(\bs s) =
2948: % n^{-g(\bs s)}$ where
2949: % \begin{equation*}
2950: % g(\bs s) = g(s_1, \ldots, s_d) = \Big(2 + \sum_{i=1}^d
2951: % \frac{d}{s_i} \Big)^{-1},
2952: % \end{equation*}
2953: % it is easy to see that
2954: % \begin{equation*}
2955: % r_n(\bs s) \leq r_n(\bs s_n) \leq \exp(d^2) r_n(\bs s).
2956: % \end{equation*}
2957: % The proof is then a direct consequence of the oracle inequality from
2958: % Theorem~\ref{thm:oracle} and the upper bound for PERM from
2959: % Theorem~\ref{thm:least_sq}. \texttt{rajouter quelques details...}
2960: % \end{proof}
2961:
2962:
2963: \section{Proofs of the lemmas}
2964: \label{sec:lemmas_proofs}
2965:
2966:
2967: \begin{proof}[Proof of Lemma~\ref{lem:logtrick}]
2968: Since $\beta \in (0, 2)$ we have $\alpha > 2 \beta / (\beta + 2) >
2969: \beta/2$. Thus, inequality~\eqref{eq:logtrick} gives
2970: \begin{align*}
2971: \log(r^2 + h^2 I^\alpha) &\leq \log(\varepsilon) + (1 -
2972: \frac{\beta}{2}) \log(r) - (1 - \frac{\beta}{2\alpha}) \log(r^2)
2973: \\
2974: & - \frac{\beta}{\alpha} \log(h) + (1 - \frac{\beta}{2\alpha})
2975: \log(r^2) + \frac{\beta}{2\alpha} \log(h^2 I^\alpha) \\
2976: &\leq \log(\varepsilon) + (\frac{\beta}{\alpha} - 1 -
2977: \frac{\beta}{2}) \log(r) - \frac{\beta}{\alpha} \log(h) + \log(
2978: r^2 + h^2 I^\alpha)
2979: \end{align*}
2980: and consequently
2981: \begin{equation*}
2982: r^{1 + \beta / 2 - \beta/\alpha} \leq \varepsilon h^{-\beta/\alpha}
2983: \end{equation*}
2984: which entails $r \leq ( \varepsilon^{\alpha} h^{-\beta} )^{2 / (2\alpha
2985: + \alpha \beta - 2 \beta)}$. Now, using this inequality together
2986: with $h^2 I^\alpha \leq \varepsilon\, r^{1 - \beta / 2} I^{\beta / 2}$
2987: provides the upper bound for $I$. The last inequality easily follows.
2988: \end{proof}
2989:
2990:
2991: \begin{proof}[Proof of Lemma~\ref{lem:devia2}]
2992: [The proof consists of a \emph{peeling} of $\mathcal F$ into
2993: subspaces with complexity controlled by Assumption~$(C_\beta)$ and
2994: the use of Bernstein's inequality.] Let us denote for short
2995: $\mathcal F$ instead of $\mathcal F_Q$. Since $\bar f \in \mathcal
2996: F$, we have
2997: \begin{align*}
2998: P \big[ \norm{\bar f &- f_0}^2 - 8 (\norm{\bar f - f_0}_n^2 +
2999: \pen(\bar f)) \geq 10 z h^2 \big] \\
3000: &\leq P \big[ \exists f \in \mathcal F : \norm{f - f_0}^2 - 8
3001: ( \norm{f - f_0}_n^2 + \pen(f) ) \geq 10 z h^2 \big] \\
3002: &\leq P[A_1] + \sum_{k \geq 2} P[A_k],
3003: \end{align*}
3004: where
3005: \begin{align*}
3006: A_1 := \big\{ \exists f &\in \mathcal F,\;\pen(f) \leq 2^{\alpha /
3007: \beta} h^2 : \\
3008: &\norm{f - f_0}^2 - 8 (\norm{f - f_0}_n^2 + \pen(f) ) \geq 10 z
3009: h^2 \big\}
3010: \end{align*}
3011: and for $k \geq 2$,
3012: \begin{align*}
3013: A_k := \big\{ \exists f \in \mathcal F,\; &2^{\alpha (k-1) /
3014: \beta} h^2 < \pen(f) \leq 2^{\alpha k / \beta} h^2 : \\
3015: &\norm{f - f_0}^2 - 8 (\norm{f - f_0}_n^2 + \pen(f) ) \geq 10 z
3016: h^2 \big\}.
3017: \end{align*}
3018: Hence, since $z \geq z_0 \geq 1$ and $\alpha / \beta > 2 / (\beta +
3019: 2) > 1/2$ (because $\beta < 2$), we have $P[A_k] \leq P_k$ for any $k
3020: \geq 1$, where
3021: \begin{align*}
3022: P_k := P \big[ \exists f \in \mathcal F,\; &\pen(f) \leq 2^{\alpha
3023: k / \beta} h^2 : \\
3024: &\norm{f - f_0}^2 - 8 \norm{f - f_0}_n^2 \geq 2 z h^2 + 4 \cdot
3025: 2^{\alpha k / \beta} h^2 \big].
3026: \end{align*}
3027: Now, let $F(\delta, k)$ be a minimal $\delta$-covering for the norm
3028: $\norm{\cdot}_\infty$ of the set
3029: \begin{equation*}
3030: \{ f \in \mathcal F : \pen(f) \leq 2^{\alpha k / \beta} h^2 \} =
3031: \{ f \in \mathcal F : |f|_{\mathcal F} \leq 2^{k /\beta} \},
3032: \end{equation*}
3033: where we recall that $\pen(f) = h^2 |f|_{\mathcal
3034: F}^\alpha$. Assumption~$(C_\beta)$ entails
3035: \begin{equation}
3036: \label{eq:covering1}
3037: | F(\delta, k) | \leq \exp ( D 2^{k} \delta^{-\beta} ).
3038: \end{equation}
3039: Since for any $f_1, f_2 \in \mathcal F$ such that $\norm{f_1 -
3040: f_2}_\infty \leq \delta$, we have
3041: \begin{equation*}
3042: \norm{f_1 - f_0}^2 \leq 2\norm{f_2 - f_0}^2 + 2 \delta^2 \quad
3043: \text{ and } \quad 2
3044: \norm{f_1 - f_0}_n^2 \geq 2\norm{f_2 - f_0}_n^2 - 2 \delta^2,
3045: \end{equation*}
3046: we obtain
3047: \begin{align*}
3048: P_k &\leq P \big[ \exists f \in F(\delta, k) : 2 \norm{f - f_0}^2
3049: - 4 \norm{f - f_0}_n^2 + 6 \delta^2 \geq 2 z h^2 + 4 \cdot 2^{\alpha k /
3050: \beta} h^2 \big] \\
3051: &\leq \sum_{f\in F(\delta, k)} P \big[ \norm{f - f_0}^2 - \norm{f -
3052: f_0}_n^2 \geq t_k(z) \big],
3053: \end{align*}
3054: where $t_k(z) := z h^2 / 2 + 2^{\alpha k / \beta} h^2 - 3 \delta^2 /
3055: 2 + \norm{f - f_0}^2 / 2$. Let $f \in F(\delta, k)$ be fixed. We
3056: introduce the random variables $U_i := (f(X_i) - f_0(X_i))^2$, so
3057: that $\norm{f - f_0}_n^2 = \sum_{i=1}^n U_i / n$ and $E[U_1] =
3058: \norm{f - f_0}^2$. Note that the $U_i$ are independent, such that $0
3059: \leq U_i \leq Q^2$, and $\var [U_1] \leq E [U_1^2] \leq Q^2 E [U_1]
3060: \leq Q^2 \norm{f - f_0}^2$. Hence, if $t_k(z) \geq \norm{f - f_0}^2
3061: / 2$, Bernstein's inequality entails
3062: \begin{align*}
3063: P \big[ \norm{f - f_0}^2 &- \norm{f - f_0}_n^2 \geq t_k(z) \big]
3064: = P \Big[ \sum_{i=1}^n (U_i - E [U_1]) \geq n t_k(z) \Big] \\
3065: &\leq \exp \Big( \frac{-n t_k(z)^2}{2( Q^2 \norm{f -
3066: f_0}^2 + Q^2 t_k(z) / 3)} \Big) \\
3067: &\leq \exp \Big( \frac{-3 n ( z h^2 + 2^{\alpha k / \beta +1} h^2
3068: - 3 \delta^2 )}{28 Q^2} \Big).
3069: \end{align*}
3070: By taking $\delta := (2^{\alpha k / \beta} h^2 / 3)^{1/2}$, we have
3071: $t_k(z) \geq \norm{f - f_0}^2 / 2$ and \eqref{eq:covering1} becomes
3072: \begin{equation*}
3073: | F(\delta, k) | \leq \exp \Big( D_1 n h^2 2^{k(1 - \alpha / 2)}
3074: \Big),
3075: \end{equation*}
3076: where we used~\eqref{eq:bandwidth} and took $D_1 := D 3^{\beta / 2}
3077: / a^{\beta + 2}$. Hence, for $D_2 := 3 / (28 Q^2)$, we have
3078: \begin{equation*}
3079: P_k \leq \exp\Big( D_1 n h^2 2^{k (1 - \alpha / 2)} - D_2 n h^2 (z +
3080: 2^{\alpha k / \beta}) \Big).
3081: \end{equation*}
3082: Now, we choose
3083: \begin{equation*}
3084: K := \Big[ \frac{\log (\min(D_2 / D_1, 1) / 2)}{(1 - \alpha / 2 - \alpha
3085: / \beta) \log 2} \Big] + 1,
3086: \end{equation*}
3087: where $[x]$ is the integer part of $x$, and where we recall that
3088: $\alpha > 2 \beta / (\beta + 2)$, so that $1 - \alpha / 2 - \alpha /
3089: \beta < 0$. The conclusion of the proof follows easily by the
3090: decomposition $\sum_{k \geq 1} P_k = \sum_{1 \leq k < K} P_k +
3091: \sum_{k \geq K} P_k$, if $z \geq z_1$ for the choice $z_1 := 2 (
3092: 2^{K \alpha / \beta} - D_1 2^{K(1 - \alpha / 2)} / D_2)$.
3093: \end{proof}
3094:
3095: % \begin{align*}
3096: % \exp( -D_Q n h^2 2^k) \exp\big( -n h^2 ( D_Q z - D 6^{1/(2s)}
3097: % \alpha^{-(2+1/s)}) \big).
3098: % \end{align*}
3099: % \begin{equation*}
3100: % P \big[ \norm{f^* - f}^2 - \norm{f^* - f}_n^2 \geq t_k \big] \leq
3101: % \end{equation*}
3102: % thus
3103: % \begin{equation*}
3104: % P_k \leq \exp\big( D 6^{1/(2s)} h^{-1/s} -D_Q n h^2 ( z + 2^k )
3105: % \big),
3106: % \end{equation*}
3107: % for any $k \geq 1$. But since $h \geq \alpha n^{-s / (2s + 1)}$, we
3108: % have
3109: % \begin{equation*}
3110: % P_k \leq
3111: % \end{equation*}
3112: % This gives
3113: % \begin{equation*}
3114: % \sum_{k \geq 1} P_k \leq \exp\big( -n h^2 (D_Q (z+1) - D 6^{1/(2s)}
3115: % \alpha^{-(2+1/s)} ) \big),
3116: % \end{equation*}
3117: % which entails Lemma~\ref{lem:devia2} for $z_0$ given by
3118: % \begin{equation*}
3119: % z_0 := \max\Big(0, \frac{D 6^{1/(2s)} \alpha^{-(2+1/s)} + 1}{D_Q} -
3120: % 1\Big). \qedhere
3121: % \end{equation*}
3122: % Let $K' \in \mathbb N$ be such that $D \alpha^{-1} (3 /
3123: % \alpha)^{1/(2s)} + 1 \leq D_Q 2^k$, and take $K := \max(4,
3124: % K')$. This choice entails
3125: % \begin{equation*}
3126: % \sum_{k \geq K} P_k \leq \exp( -D_Q n h z) \sum_{k \geq K} \exp( -k
3127: % n h ) \leq 2^{-1} \exp( -n h (D_Q z + K) ).
3128: % \end{equation*}
3129: % Now, for $k < K$, we have for any $z \geq z_0$, where
3130: % \begin{equation*}
3131: % z_0 := \max(0, D_Q^{-1} (2^{K/2} D \alpha^{-1} (3 / \alpha)^{1 /
3132: % (2s) } + 1) - 2),
3133: % \end{equation*}
3134: % that
3135: % \begin{equation*}
3136: % \sum_{1 \leq k \leq K} P_k \leq K \exp(-n h),
3137: % \end{equation*}
3138: % hence which concludes the proof of Lemma~\ref{lem:devia2}. \hfill
3139: % $\square$
3140:
3141:
3142: % \subsection*{}
3143:
3144:
3145:
3146:
3147:
3148: % \begin{lemma}
3149: % \label{lem:spline_bounded}
3150: % Let $P_X$ be such that $|\supp P_X| > s$ \textup(the support
3151: % contains at least $s+1$ points.\textup) Let $f \in W_s$ be such that
3152: % $\norm{f - f_0}_m \leq \delta$ for some $\delta > 0$ and some
3153: % function $f_0$.\textup) Then, we can find positive constants $C_0,
3154: % C, D$ such that
3155: % \begin{equation*}
3156: % P\big[ \norm{f}_\infty > C_0( \delta + \norm{f_0}_m + J(f)) \big]
3157: % \geq C \exp( -D n ).
3158: % \end{equation*}
3159: % \end{lemma}
3160:
3161:
3162: % \begin{proof}[Proof of Lemma~\ref{lem:spline_bounded}]
3163: % Since $f \in W_s$, we can write using the Sobolev-embedding theorem
3164: % that
3165: % \begin{equation*}
3166: % f = f_1 + f_2
3167: % \end{equation*}
3168: % where $f_1 = \sum_{|\alpha| < s} b_\alpha x^{\alpha}$ and $f_2$ is
3169: % such that $\norm{f_2}_\infty \leq J(f_2) = J(f)$. Moreover, we have
3170: % \begin{equation*}
3171: % \norm{f_1}_\infty \leq \norm{b}_\infty \leq C(s) (b^{\top} b )^{1/2},
3172: % \end{equation*}
3173: % where $b = (b_\alpha)_{|\alpha| < s}$. For $p = (p_1, \ldots, p_d)$
3174: % and $q = (q_1, \ldots, q_d)$ such that $|p| < s$ and $|q| < s$, let
3175: % us introduce the matrices $A_m$ and $A$ with entries
3176: % \begin{equation*}
3177: % (A_m)_{p,q} = \int x^{p+q} P_X^m(dx), \quad (A)_{p,q} = \int
3178: % x^{p+q} P_X(dx).
3179: % \end{equation*}
3180: % The matrix $A$ is positive definite. Indeed, otherwise, we can find
3181: % a vector $b$ such that
3182: % \begin{equation*}
3183: % 0 = b^{\top} A b = E \Big[ \Big( \sum_{|\alpha| < s} b_\alpha
3184: % X^{\alpha} \Big)^2 \Big],
3185: % \end{equation*}
3186: % which entails that the polynomial $\sum_{|\alpha| < s} b_\alpha
3187: % x^{\alpha}$ is zero for almost every $x \in \supp P_X$, which is not
3188: % possible since we assumed that $|\supp(P_X)| > s$. Then, let us
3189: % denote by $\lambda(A) > 0$ the smallest eigenvalue of $A$. On the
3190: % event $\{ \norm{A_m - A}_\infty \leq \lambda(A)/2 \}$, we have
3191: % \begin{equation*}
3192: % b^{\top} b \leq \lambda(A)^{-1} (b^{\top} A_m b + b^{\top} b
3193: % \lambda(A) / 2 ),
3194: % \end{equation*}
3195: % which entails
3196: % \begin{equation*}
3197: % b^{\top} b \leq 2 \lambda(A)^{-1} b^{\top} A_m b = 2
3198: % \lambda(A)^{-1} \norm{f_1}_m^2.
3199: % \end{equation*}
3200: % Now, since $\norm{f - f_0}_m \leq \delta$, we have
3201: % \begin{equation*}
3202: % \norm{f_1}_m \leq \norm{f}_m + \norm{f_2}_m \leq \delta +
3203: % \norm{f_0}_m + J(f),
3204: % \end{equation*}
3205: % and putting all this together, this gives that on $\{ \norm{A_m -
3206: % A}_\infty \leq \lambda(A)/2 \}$:
3207: % \begin{equation*}
3208: % \norm{f}_\infty \leq C_0 (\delta + \norm{f_0}_m + J(\bar f)),
3209: % \end{equation*}
3210: % where $C_0 := (2 C(s) \lambda(A)^{-1} )^{1/2}$. By Hoeffding's
3211: % inequality, we have
3212: % \begin{equation*}
3213: % P[ \norm{A_m - A}_\infty > \lambda(A)/2 ] \leq C(s)^2 \exp( -D n)
3214: % \end{equation*}
3215: % with $D := \lambda(A)^2 / (8 M_X^2)$, where $M_X$ is the radius of
3216: % the support of $P_X$. This concludes the proof of the Lemma.
3217: % \end{proof}
3218:
3219:
3220: \appendix
3221:
3222: \section{Function spaces}
3223: \label{sec:appendix_approximation}
3224:
3225: In this section we give precise definitions of the spaces of functions
3226: considered in the paper, and give useful related results. The
3227: definitions and results presented here can be found
3228: in~\cite{triebel06}, in particular in Chapter~5 which is about
3229: anisotropic spaces, anisotropic multiresolutions, and entropy numbers
3230: of the embeddings of such spaces (see Section~5.3.3) that we use in
3231: particular to derive condition $(C_\beta)$, for the anisotropic Besov
3232: space, see Section~\ref{sec:pena_least_squares}.
3233:
3234: % If $\bs k
3235: % = (k_1, \ldots, k_d)$ with $k_i \geq 0$ we define the \emph{iterated
3236: % difference} by
3237: % \begin{equation*}
3238: % \Delta_h^{\bs k} f(x) = \Delta_{h_1 e_1}^{k_1} \circ \cdots \circ
3239: % \Delta_{h_d e_d}^{k_d} f(x)
3240: % \end{equation*}
3241:
3242: \subsection{Anisotropic Besov space}
3243:
3244: Let $\{ e_1, \ldots, e_d \}$ be the canonical basis of $\mathbb R^d$
3245: and $\bs s = (s_1, \ldots, s_d)$ with $s_i > 0$ be a vector of
3246: directional smoothness, where $s_i$ corresponds to the smoothness in
3247: direction $e_i$. Let us fix $1 \leq p, q \leq \infty$. If $f$ is a
3248: function on $\mathbb R^d$, we define $\Delta_h^k f$ as the
3249: \emph{difference} of order $k \geq 1$ and step $h \in \mathbb R^d$,
3250: given by $\Delta_h^1 f(x) = f(x + h) - f(x)$ and $\Delta_h^k f(x) =
3251: \Delta_h^1(\Delta_h^{k-1}f)(x)$ for any $x \in \mathbb R^d$. We say
3252: that $f \in L^p(\mathbb R^d)$ belongs to the anisotropic Besov space
3253: $B_{p, q}^{\bs s}(\mathbb R^d)$ if the semi-norm
3254: \begin{equation*}
3255: |f|_{B_{p, q}^{\bs s}(\mathbb R^d)} := \sum_{i=1}^d \Big(
3256: \int_0^1 (t^{-s_i} \norm{\Delta_{t e_i}^{k_i} f}_{p})^q
3257: \frac{dt}{t} \Big)^{1/q}
3258: \end{equation*}
3259: is finite (with the usual modifications when $p = \infty$ or $q =
3260: \infty$). We know that the norms
3261: \begin{equation*}
3262: \norm{f}_{B_{p, q}^{\bs s}} := \norm{f}_p + |f|_{B_{p, q}^{\bs s}}
3263: \end{equation*}
3264: are equivalent for any choice of $k_i > s_i$. An equivalent definition
3265: of the seminorm can be given using the directional differences and the
3266: anisotropic distance, see Theorem~5.8 in~\cite{triebel06}.
3267: % To make the presentation simple, we first define on $\mathbb R^d$
3268: % and then on some domain $\Omega \subset \mathbb R^d$.
3269: Following Section~5.3.3 in~\cite{triebel06}, we can define the
3270: anisotropic Besov space on an arbitrary domain $\Omega \subset \mathbb
3271: R^d$ (think of $\Omega$ as the support of the design $X$) in the
3272: following way. We define $B_{p, q}^{\bs s}(\Omega)$ as the set of all
3273: $f \in L^p(\Omega)$ such that there is $g \in B_{p, q}^{\bs s}(\mathbb
3274: R^d)$ with restriction $g | \Omega$ to $\Omega$ equal to $f$ in
3275: $L^p(\Omega)$. Moreover,
3276: \begin{equation*}
3277: \norm{f}_{B_{p, q}^{\bs s}(\Omega)} = \inf_{g : g|\Omega = f}
3278: \norm{g}_{B_{p, q}^{\bs s}(\mathbb R^d)},
3279: \end{equation*}
3280: where the infimum is taken over all $g \in B_{p, q}^{\bs s}(\mathbb
3281: R^d)$ such that $g | \Omega = f$. In an equivalent way, the space
3282: $B_{p, q}^{\bs s}(\Omega)$ can be defined using intrinsic
3283: characterisations by differences, see Section~4.1.4
3284: in~\cite{triebel06}, where the idea is, roughly, to restrict the
3285: increments $h$ in the differences $\Delta_h^k$ so that the support of
3286: $\Delta_h^k f$ is included in $\Omega$.
3287:
3288: In what follows, we shall remove from the notations the dependence on
3289: $\Omega$, since it does not affect the definitions and results
3290: below. Moreover, for what we need in this paper, we shall simply take
3291: $\Omega$ as the support of the design $X$. Several explicit particular
3292: cases for the space $B_{p, q}^{\bs s}$ are of interest. If $\bs s =
3293: (s, \ldots, s)$ for some $s > 0$, then $B_{p, q}^{\bs s}$ is the
3294: standard isotropic Besov space. When $p = q = 2$ and $\bs s = (s_1,
3295: \ldots, s_d)$ has integer coordinates, $B_{2, 2}^{\bs s}$ is the
3296: anisotropic Sobolev space
3297: \begin{equation*}
3298: B_{2, 2}^{\bs s} = W_2^{\bs s} = \Big\{ f \in L^2 : \sum_{i=1}^d
3299: \Big\| \frac{\partial^{s_i} f}{\partial x_i^{s_i}} \Big\|_2 < \infty
3300: \Big\}.
3301: \end{equation*}
3302: If $\bs s$ has non-integer coordinates, then $B_{2, 2}^{\bs s}$ is the
3303: anisotropic Bessel-potential space
3304: \begin{equation*}
3305: H^{\bs s} = \Big\{ f \in L^2 : \sum_{i=1}^d \Big\| (1 +
3306: |\xi_i|^2)^{s_i/2} \hat f(\xi) \Big\|_2 < \infty \Big\}.
3307: \end{equation*}
3308:
3309:
3310: The results described in the next section are direct consequences of
3311: the transference method, see Section~5.3 in~\cite{triebel06}. Roughly,
3312: the idea is to transfer problems for anisotropic spaces via sequence
3313: spaces (one can think of sequences of wavelet coefficients, for instance)
3314: to isotropic spaces. This technique allows one to prove the statements
3315: below. Note that another technique of proof based on replicant coding
3316: can be used, see~\cite{kerk_picard_replicant_03}. This is commented
3317: below.
3318:
3319: \subsection{Embeddings and entropy numbers}
3320:
3321: % Let $\bs \sigma = (\sigma_1, \ldots, \sigma_d)$ be a fixed vector with
3322: % $\sigma_i > 0$ and harmonic mean equal to $1$, that is $\sum_{i=1}^d 1
3323: % / \sigma_i = d$. If $s > 0$, we denote for short $s \bs \sigma = (s
3324: % \sigma_1, \ldots, s \sigma_d)$.
3325:
3326: % Using together Theorems~5.28 and~1.97 in \cite{triebel06}, we have the
3327: % following statements. If $0 < s_1 < s_0$, we have
3328: % \begin{equation}
3329: % \label{eq:embedding1}
3330: % B_{p, q}^{s_0 \bs \sigma} \subset B_{p, q}^{s_1 \bs \sigma}.
3331: % \end{equation}
3332:
3333: Let us first mention the following obvious embedding, which is useful
3334: for the proof of the adaptive upper bound (see
3335: Section~\ref{sec:derive_adaptive}). If $0 < \bs s_1 \leq \bs s_0$
3336: coordinatewise, that is $0 < s_{1, i} \leq s_{0, i}$ for any $i \in \{
3337: 1, \ldots, d \}$, we have
3338: \begin{equation}
3339: \label{eq:anisotropic_embedding}
3340: B_{p, q}^{\bs s_0} \subset B_{p, q}^{\bs s_1}.
3341: \end{equation}
3342: This simply follows from the fact that $B_{p, q}^{\bs s} =
3343: \bigcap_{i=1}^d B_{p, q, i}^{s_i}$, where $B_{p, q, i}^{s_i}$ is the
3344: corresponding Besov space in the $i$-th direction of coordinates, with
3345: the $L^p$ norm extended to the other $d-1$ directions (see Remark~5.7 in
3346: \cite{triebel06}) together with the standard embedding for the
3347: isotropic Besov space.
3348:
3349: % \subsection{Entropy numbers}
3350:
3351: As we mentioned above, Assumption~$(C_\beta)$ (see
3352: Section~\ref{sec:pena_least_squares}) is satisfied for nearly all
3353: smoothness spaces considered in the nonparametric literature. In
3354: particular, if $\mathcal F = B_{p,q}^{\bs s}$ is the anisotropic Besov
3355: space defined above, $(C_\beta)$ is satisfied: it is a consequence of
3356: a more general Theorem (see Theorem~5.30 in \cite{triebel06})
3357: concerning the entropy numbers of embeddings (see Definition~1.87 in
3358: \cite{triebel06}). Here, we only give a simplified version of this
3359: Theorem, which is sufficient to derive $(C_\beta)$. Indeed, if one
3360: takes $\bs s_0 = \bs s$, $p_0 = p$, $q_0 = q$ and $\bs s_1 = 0$, $p_1
3361: = \infty$, $q_1 = \infty$ in Theorem~5.30 from \cite{triebel06}, we
3362: obtain the following
3363: \begin{theorem}
3364: \label{thm:anisotropic_entropy}
3365: Let $1 \leq p, q \leq \infty$ and $\bs s = (s_1, \ldots, s_d)$ where
3366: $s_i > 0$\textup, and let $\bs {\bar s}$ be the harmonic mean of
3367: $\bs s$ \textup(see~\eqref{eq:harmonic_mean}\textup). Whenever $\bs
3368: {\bar s} > d / p$\textup, we have
3369: \begin{equation*}
3370: B_{p, q}^{\bs s} \subset C(\Omega),
3371: \end{equation*}
3372: where $C(\Omega)$ is the set of continuous functions on
3373: $\Omega$\textup, and for any $\delta > 0$\textup, the sup-norm
3374: entropy of the unit ball of the anisotropic Besov space\textup,
3375: namely the set
3376: \begin{equation*}
3377: U_{p, q}^{\bs s} := \{ f \in B_{p, q}^{\bs s} :
3378: \norm{f}_{B_{p,q}^{\bs s}} \leq 1 \}
3379: \end{equation*}
3380: satisfies
3381: \begin{equation}
3382: H_\infty(\delta, U_{p, q}^{\bs s}) \leq D \delta^{- d / \bs {\bar s}},
3383: \end{equation}
3384: where $D > 0$ is a constant independent of $\delta$.
3385: \end{theorem}
3386:
3387: For the isotropic Sobolev space, Theorem~\ref{thm:anisotropic_entropy}
3388: was obtained in the key paper~\cite{birman_solomjak67} (see
3389: Theorem~5.2 therein), and for the isotropic Besov space, it can be
3390: found, among others, in~\cite{birge_massart00}
3391: and~\cite{kerk_picard_replicant_03}.
3392:
3393: \begin{remark}
3394: A more constructive computation of the entropy of anisotropic Besov
3395: spaces can be done using the replicant coding approach, which is
3396: done for Besov bodies in~\cite{kerk_picard_replicant_03}. Using this
3397: approach together with an anisotropic multiresolution analysis based
3398: on compactly supported wavelets or atoms, see Section~5.2
3399: in~\cite{triebel06}, we can obtain a direct computation of the
3400: entropy. The idea is to do a quantization of the wavelet
3401: coefficients, and then to code them using a replication of their
3402: binary representation, and to use 01 as a separator (so that the
3403: coding is injective). A lower bound for the entropy can be obtained
3404: as an elegant consequence of Hoeffding's deviation inequality for
3405: sums of i.i.d.\ variables and a combinatorial lemma.
3406: \end{remark}
3407:
3408: % \texttt{faudra rajouter les jackson et bernstein estimates pour la
3409: % borne inf sur besov anisotropes}
3410:
3411:
3412: % \begin{theorem}[Birg\'e and Massart (2000), Corollary~1]
3413: % \label{thm:birge_massart}
3414:
3415: % \end{theorem}
3416:
3417: % \begin{remark}
3418: % When $p=2$ and $s \in \mathbb N_0$, we recover the result
3419: % from~\cite{birman_solomjak67}, namely
3420: % \begin{equation*}
3421: % N\big( \delta, W_s(R), \norm{\cdot}_{L^q} \big) \leq
3422: % \exp\Big( D \Big( \frac{L}{\delta} \Big)^{d/s} \Big),
3423: % \end{equation*}
3424: % where $W_s(R) := \{ f \in W_s : J_s(f) \leq R \}$,
3425: % see~\eqref{eq:usual_roughness}.
3426: % \end{remark}
3427: % The result from~\cite{birman_solomjak67} was previously used
3428: % in~\cite{mammen_vandegeer97}, for estimation in partial linear
3429: % models. % In~\cite{birman_solomjak67}, it is stated in a more general
3430: % % setting, for any $L^q$-norm with $1 \leq q \leq +\infty$.
3431: % The fact that this result holds for the $L^q$-norm, $q = \infty$
3432: % included, is important here. Indeed, a cover for $L^\infty$-norm is
3433: % also a cover for both the $L^2(P_X)$ and $L^2(P_X^m)$ norms (simply
3434: % write that $\norm{f}_{L^2(P_X)} \leq \norm{f}_\infty$ and
3435: % $\norm{f}_{L^2(P_X^m)} \leq \norm{f}_\infty$.)
3436:
3437: % \texttt{Besov sur un domaine plutot ??}
3438:
3439: % \subsection{Multiscale setting}
3440:
3441: % Let $M$ be a dilatation matrix in $\mathcal M_d(\mathbb Z)$, namely a
3442: % matrix with integer entries and eigenvalues outside the unit disk. Let
3443: % $\varphi \in H^s$. We say that $\varphi$ is a \emph{$M$-scaling
3444: % function} if it is compactly supported, if $\int_{\mathbb R^d}
3445: % \varphi(x) dx = 1$ and if
3446: % \begin{itemize}
3447: % \item there is a finite sequence of complex numbers $(h_k)_{k \in
3448: % \mathbb Z^d}$ such that
3449: % \begin{equation*}
3450: % \varphi(x) = |\det M|^{1/2} \sum_{k \in \mathbb Z^d} h_k
3451: % \varphi(M x - k) ;
3452: % \end{equation*}
3453: % \item $\{ \varphi(\cdot - k) \}_{k \in \mathbb Z^d}$ is a Riesz basis
3454: % for the space it spans.
3455: % \end{itemize}
3456: % Two $M$-scaling functions $\varphi$ and $\tilde \varphi$ are
3457: % \emph{biorthogonal $M$-scaling functions} if the systems $\{
3458: % \varphi(\cdot - k) \}_{k \in \mathbb Z^d}$ and $\{ \tilde
3459: % \varphi(\cdot - k) \}_{k \in \mathbb Z^d}$ are orthogonal.
3460:
3461: % The construction of \emph{compactly supported} $M$-scaling functions
3462: % for an arbitrary dilatation matrix is a very difficult subject of
3463: % current research. Indeed, even in one-dimension, when $M = m$ is not
3464: % integer, there is no scaling functions with compact support, see
3465: % [1].
3466:
3467: % If $M = \diag(m_1, \ldots, m_d)$ where the $m_i \geq 2$ are integers,
3468: % we can construct biorthogonal $M$-scaling functions using tensor
3469: % products of one dimensional $m_i$-scaling functions $\varphi_i, \tilde
3470: % \varphi_i \in H^s(\mathbb R)$ for an arbitrary large smoothness
3471: % $s$. The construction of biorthogonal compactly supported
3472: % one-dimensional $m$-scaling functions for any integer $m \geq 2$ can
3473: % be found in ???? Then, can we simply consider
3474: % \begin{equation*}
3475: % \varphi(x) = \prod_{i=1}^d \varphi_i(x_i) \text{ and } \tilde
3476: % \varphi(x) = \prod_{i=1}^d \tilde \varphi_i(x_i)
3477: % \end{equation*}
3478: % to obtain compactly supported biorthogonal $M$-scaling functions. Let
3479: % us consider the matrix
3480: % \begin{equation}
3481: % \label{eq:particular_dilatation_matrix}
3482: % M = \diag(\lambda^{1 / \sigma_1}, \ldots,
3483: % \lambda^{1 / \sigma_d}).
3484: % \end{equation}
3485: % The following Lemma can be found in
3486: % \begin{lemma}[see Lemma 3.2 in ????]
3487: % Let $\bs \sigma = (\sigma_1, \ldots, \sigma_d) > 0$. The following
3488: % conditions are equivalent:
3489: % \begin{itemize}
3490: % \item There is a number $\lambda > 1$ such that $\lambda^{1 /
3491: % \sigma_i} \in \mathbb Z_+$ for $1 \leq i \leq d$
3492: % \item There is a number $\mu > 0$ such that $(1 / \sigma_1, \ldots,
3493: % 1 / \sigma_d) \in \mu \log \mathbb Z_+^d$.
3494: % \end{itemize}
3495: % \end{lemma}
3496: % Thus, we know that when
3497: % \begin{equation}
3498: % \label{eq:anisotropic_restriction}
3499: % \Big( \frac{1}{\sigma_1}, \ldots, \frac{1}{\sigma_d} \Big) \in \mu
3500: % \log \mathbb Z_+^d
3501: % \end{equation}
3502: % for some $\mu > 0$, we can find compactly supported biorthogonal
3503: % $M$-scaling functions. A multiresolution analysis of $L^\pi$ for $1
3504: % \leq \pi \leq \infty$ based on such scaling functions can be easily
3505: % construted, in the same way as in the dyadic case where $m_i =
3506: % 2$). This is explained in details in ????. We define $\varphi_{j,
3507: % k}(x) := |\det M|^{j / \pi} \varphi(M^j x - k)$ where $j \in \mathbb
3508: % Z$ is the resolution level and $k \in \mathbb Z^d$ is the localization
3509: % parameter. When $M$ is given
3510: % by~\eqref{eq:particular_dilatation_matrix} we can write
3511: % \begin{equation*}
3512: % \varphi_{j, k}(x) := \lambda^{j d / p} \varphi(M^j x - k).
3513: % \end{equation*}
3514: % These dilated and translated scaling functions are normalized in
3515: % $L^\pi$ (if $\pi = \infty$, take $\pi = 1$ in the above definition and
3516: % take a scaling function \texttt{c'est faux !!!} divide $\varphi_{j,
3517: % k}$ by $\norm{\varphi}_\infty$, so that $\norm{\varphi_{j,
3518: % k}}_\infty = 1$). If we define for $j \in \mathbb Z$
3519: % \begin{equation*}
3520: % V_j = \overline{ \Span\{ \varphi_{j, k} : k \in \mathbb Z^d \} }
3521: % \end{equation*}
3522: % which is the closure of the $\Span$ of the $\varphi_{j, k}$ in
3523: % $L^\pi$, then $(V_j)_{j \in \mathbb Z_d}$ is a multiresolution
3524: % analysis of $L^\pi$ (again, if $\pi = \infty$ then $L^\infty$ is
3525: % replaced by $C(\mathbb R^d)$). We can define in the same way dilated
3526: % and translated scaling functions $\tilde \varphi_{j, k}$, and
3527: % construct as a consequence multiresolution analysis of $L^\pi$.
3528:
3529: % A remark of first importance in what follows is then the following: if
3530: % $x$ is fixed, then $\varphi_{e, j, k}$ \texttt{mettre ca apres la MRA}
3531: % $K_j$ of cardinaly $|K_j| \approx \lambda^{j d}$ (recall that by
3532: % construction $\lambda$ is an integer).
3533:
3534: % $\beta_{e, j, k} := \langle f, \tilde \psi_{e, j, k} \rangle$
3535:
3536: % $E = \{ 1, \ldots, m \}$
3537:
3538: % \texttt{apres la MRA:}
3539: % For any $f \in L^\pi$
3540: % \begin{equation*}
3541: % \Big\| \sum_{e \in E, k \in K_j} \beta_{e, k} \psi_{e, j, k}
3542: % \Big\|_{L^\pi} \approx \Big( \sum_{e \in E, k \in K_j} |\beta_{e, j,
3543: % k}|^\pi \Big)^{1/\pi}
3544: % \end{equation*}
3545: % with the usual modification whenever $\pi = \infty$.
3546:
3547: % or equivalently,
3548:
3549:
3550: % where the above sums are convergent in $L^\pi$
3551:
3552: % \texttt{mettre estimees de jackson et bernstein}
3553:
3554:
3555:
3556: % This is the reason why the entropy of anisotropic Besov space
3557: % will given only be able to use the caracterization of anisotropic
3558: % Besov spaces by wavelet coefficients for
3559:
3560:
3561:
3562: % This is the reason why the
3563:
3564: % It is well-known that wavelets are a powerful tool for the
3565: % characterazition of Besov spaces, by means of sums weighted sums of
3566: % wavelet coefficient. Besov isotropic classes can be defined in this
3567: % way, using basis with
3568:
3569: % The use of compaclty supported wavelets is of first importance in
3570: % statistics for instance, and the fact that the number of
3571:
3572: % A key tool for the
3573:
3574: % A powerful way of described isotropic Besov spaces is Wavelet. Indeed,
3575: % it is well known that
3576:
3577: % If $\beta_{j, k} = \prodsca{f}{\tilde \psi_{e, j, k}}$
3578:
3579: % \begin{equation*}
3580: % \frac{1}{C} \Big( \sum_{e, k} |\beta_{e, j, k} |^p \Big)^{1/p} \leq \Big
3581: % \| \sum_{e=1}^{m-1} \sum_{k \in \mathbb Z^d} \beta_{e, j, k}
3582: % \psi_{e, j, k} \Big\|_p \leq C \Big( \sum_{e, k} |\beta_{e, j, k}
3583: % |^p \Big)^{1/p}
3584: % \end{equation*}
3585:
3586:
3587:
3588: \section{Some probabilistic tools}
3589: \label{sec:appendix_proba}
3590:
3591:
3592: For the first Theorem we refer to \cite{EM:96}. The two following
3593: Theorems can be found, for instance, in
3594: \cite{massart03,vdVW:96,ledoux_talagrand91}.
3595:
3596:
3597: \begin{theorem}[Einmahl and Mason (1996)]
3598: \label{TheoEinmahlMasson}
3599: Let $Z_1,\ldots,Z_n$ be $n$ independent non-negative random
3600: variables such that $E[Z_i^2]\leq \sigma^2,\forall i=1, \ldots, n$.
3601: Then, we have, for any $\delta > 0$,
3602: \begin{equation*}
3603: P \Big[\sum_{i=1}^n (Z_i - E[Z_i]) \leq -n \delta \Big]
3604: \leq \exp\Big(-\frac{n \delta^2}{2\sigma^2} \Big).
3605: \end{equation*}
3606: \end{theorem}
3607:
3608:
3609: \begin{theorem}[Sudakov]
3610: \label{TheoSudakov}
3611: There exists an absolute constant $c^*>0$ such that for any integer
3612: $M$, any centered Gaussian vector $X = (X_1,\ldots,X_M)$ in
3613: $\mathbb{R}^M$, we have,
3614: \begin{equation*}
3615: c^* E[\max_{1\leq j\leq M}X_j] \geq \varepsilon \sqrt{\log M},
3616: \end{equation*}
3617: where $\varepsilon := \min \Big\{ \sqrt{E[(X_i-X_j)^2]} : i \neq j
3618: \in \{1, \ldots, M\} \Big\}$.
3619: \end{theorem}
3620:
3621: \begin{theorem}[Maximal inequality]
3622: \label{TheoMaxConcIneq}
3623: Let $Y_1, \ldots, Y_M$ be $M$ random variables satisfying
3624: $E[\exp(sY_j)] \leq \exp((s^2\sigma^2)/2)$ for any $j \in \{1, \ldots, M\}$ and
3625: any $s>0$. Then, we have
3626: \begin{equation*}
3627: E[ \max_{1 \leq j \leq M} Y_j] \leq \sigma \sqrt{2 \log M}.
3628: \end{equation*}
3629: \end{theorem}
3630:
3631: % \begin{theorem}[Berry-Ess{\'e}en]\label{TheoBerry}
3632: % Suppose that $(X_i)_{i\in\mathbb{N}}$ is a sequence of i.i.d. random
3633: % variables with mean $\mu$ and variance $\sigma^2>0$. Then, for all
3634: % $n$,
3635: % $$\sup_{t\in\mathbb{R}}\left\vert\mathbb{P}\Big(\frac{\sum_{i=1}^n X_i-n\mu}
3636: % {\sigma \sqrt{n}}\leq t \Big)-\Phi(t)\right\vert\leq
3637: % \frac{33}{4}\frac{\mathbb{E}|X_1-\mu|^3}{\sigma^3\sqrt{n}}.$$
3638: % \end{theorem}
3639:
3640: \par
3641:
3642:
3643: % \bibliographystyle{ims}
3644:
3645: \begin{thebibliography}{48}
3646: \expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi
3647: \expandafter\ifx\csname url\endcsname\relax
3648: \def\url#1{\texttt{#1}}\fi
3649: \expandafter\ifx\csname urlprefix\endcsname\relax\def\urlprefix{URL }\fi
3650: \providecommand{\eprint}[2][]{\url{#2}}
3651:
3652: \bibitem[{Amato et~al.(2006)Amato, Antoniadis and
3653: Pensky}]{amato_antoniadis_pensky06}
3654: \textsc{Amato, U.}, \textsc{Antoniadis, A.} and \textsc{Pensky, M.} (2006).
3655: \newblock Wavelet kernel penalized estimation for non-equispaced design
3656: regression.
3657: \newblock \textit{Stat. Comput.}, \textbf{16} 37--55.
3658:
3659: \bibitem[{Aronszajn(1950)}]{aronszajn50}
3660: \textsc{Aronszajn, N.} (1950).
3661: \newblock Theory of reproducing kernels.
3662: \newblock \textit{Trans. Amer. Math. Soc.}, \textbf{68} 337--404.
3663:
3664: \bibitem[{Bartlett and Mendelson(2006)}]{BM:06}
3665: \textsc{Bartlett, P.~L.} and \textsc{Mendelson, S.} (2006).
3666: \newblock Empirical minimization.
3667: \newblock \textit{Probab. Theory Related Fields}, \textbf{135} 311--334.
3668:
3669: \bibitem[{Birg\'e and Massart(1993)}]{birge_massart93}
3670: \textsc{Birg\'e, L.} and \textsc{Massart, P.} (1993).
3671: \newblock {Rates of convergence for minimum contrast estimators.}
3672: \newblock \textit{Probab. Theory Relat. Fields}, \textbf{97} 113--150.
3673:
3674: \bibitem[{Birg{\'e} and Massart(2000)}]{birge_massart00}
3675: \textsc{Birg{\'e}, L.} and \textsc{Massart, P.} (2000).
3676: \newblock An adaptive compression algorithm in {B}esov spaces.
3677: \newblock \textit{Constr. Approx.}, \textbf{16} 1--36.
3678:
3679: \bibitem[{Birman and Solomjak(1967)}]{birman_solomjak67}
3680: \textsc{Birman, M.~{\v{S}}.} and \textsc{Solomjak, M.~Z.} (1967).
3681: \newblock Piecewise polynomial approximations of functions of classes
3682: {$W_p^{\alpha}$}.
3683: \newblock \textit{Mat. Sb. (N.S.)}, \textbf{73 (115)} 331--355.
3684:
3685: \bibitem[{Bitouz{\'e} et~al.(1999)Bitouz{\'e}, Laurent and Massart}]{BLM99}
3686: \textsc{Bitouz{\'e}, D.}, \textsc{Laurent, B.} and \textsc{Massart, P.} (1999).
3687: \newblock A {D}voretzky-{K}iefer-{W}olfowitz type inequality for the
3688: {K}aplan-{M}eier estimator.
3689: \newblock \textit{Ann. Inst. H. Poincar\'e Probab. Statist.}, \textbf{35}
3690: 735--763.
3691:
3692: \bibitem[{Carl and Stephani(1990)}]{CS:98}
3693: \textsc{Carl, B.} and \textsc{Stephani, I.} (1990).
3694: \newblock \textit{Entropy, compactness and the approximation of operators},
3695: vol.~98 of \textit{Cambridge Tracts in Mathematics}.
3696: \newblock Cambridge University Press, Cambridge.
3697:
3698: \bibitem[{Catoni(2001)}]{catbook:01}
3699: \textsc{Catoni, O.} (2001).
3700: \newblock \textit{Statistical Learning Theory and Stochastic Optimization}.
3701: \newblock Ecole d'{\'e}t{\'e} de Probabilit{\'e}s de Saint-Flour 2001, Lecture
3702: Notes in Mathematics, Springer, N.Y.
3703:
3704: \bibitem[{Cucker and Smale(2002)}]{cucker_smale02}
3705: \textsc{Cucker, F.} and \textsc{Smale, S.} (2002).
3706: \newblock On the mathematical foundations of learning.
3707: \newblock \textit{Bull. Amer. Math. Soc. (N.S.)}, \textbf{39} 1--49
3708: (electronic).
3709:
3710: \bibitem[{Devroye et~al.(1996)Devroye, Gy{\"o}rfi and Lugosi}]{DGL:96}
3711: \textsc{Devroye, L.}, \textsc{Gy{\"o}rfi, L.} and \textsc{Lugosi, G.} (1996).
3712: \newblock \textit{A probabilistic theory of pattern recognition}, vol.~31 of
3713: \textit{Applications of Mathematics (New York)}.
3714: \newblock Springer-Verlag, New York.
3715:
3716: \bibitem[{Einmahl and Mason(1996)}]{EM:96}
3717: \textsc{Einmahl, U.} and \textsc{Mason, D.~M.} (1996).
3718: \newblock Some universal results on the behavior of increments of partial sums.
3719: \newblock \textit{Ann. Probab.}, \textbf{24} 1388--1407.
3720:
3721: \bibitem[{Ga\"iffas and Lecu\'e(2007)}]{gaiffas_lecue07}
3722: \textsc{Ga\"iffas, S.} and \textsc{Lecu\'e, G.} (2007).
3723: \newblock Optimal rates and adaptation in the single-index model using
3724: aggregation.
3725: \newblock \textit{Electronic Journal of Statistics}, \textbf{1} 538--573.
3726:
3727: \bibitem[{Green and Silverman(1994)}]{green_silverman94}
3728: \textsc{Green, P.~J.} and \textsc{Silverman, B.~W.} (1994).
3729: \newblock \textit{Nonparametric regression and generalized linear models},
3730: vol.~58 of \textit{Monographs on Statistics and Applied Probability}.
3731: \newblock Chapman \& Hall, London.
3732: \newblock A roughness penalty approach.
3733:
3734: \bibitem[{Gy{\"o}rfi et~al.(2002)Gy{\"o}rfi, Kohler, Krzy{\.z}ak and
3735: Walk}]{kohler02}
3736: \textsc{Gy{\"o}rfi, L.}, \textsc{Kohler, M.}, \textsc{Krzy{\.z}ak, A.} and
3737: \textsc{Walk, H.} (2002).
3738: \newblock \textit{A distribution-free theory of nonparametric regression}.
3739: \newblock Springer Series in Statistics, Springer-Verlag, New York.
3740:
3741: \bibitem[{Hamers and Kohler(2004)}]{hamers_kohler04}
3742: \textsc{Hamers, M.} and \textsc{Kohler, M.} (2004).
3743: \newblock How well can a regression function be estimated if the distribution
3744: of the (random) design is concentrated on a finite set?
3745: \newblock \textit{J. Statist. Plann. Inference}, \textbf{123} 377--394.
3746:
3747: \bibitem[{Haussler(1992)}]{H:92}
3748: \textsc{Haussler, D.} (1992).
3749: \newblock Decision-theoretic generalizations of the {PAC} model for neural net
3750: and other learning applications.
3751: \newblock \textit{Inform. and Comput.}, \textbf{100} 78--150.
3752:
3753: \bibitem[{Hochmuth(2002)}]{hochmuth02}
3754: \textsc{Hochmuth, R.} (2002).
3755: \newblock Wavelet characterizations for anisotropic {B}esov spaces.
3756: \newblock \textit{Appl. Comput. Harmon. Anal.}, \textbf{12} 179--208.
3757:
3758: \bibitem[{Hoffmann and Lepski(2002)}]{hoffmann_lepski02}
3759: \textsc{Hoffmann, M.} and \textsc{Lepski, O.~V.} (2002).
3760: \newblock Random rates in anisotropic regression.
3761: \newblock \textit{The Annals of Statistics}, \textbf{30} 325--396.
3762:
3763: \bibitem[{Juditsky et~al.(2005{\natexlab{a}})Juditsky, Rigollet and
3764: Tsybakov}]{juditsky_etal05}
3765: \textsc{Juditsky, A.}, \textsc{Rigollet, P.} and \textsc{Tsybakov, A.}
3766: (2005{\natexlab{a}}).
3767: \newblock Learning by mirror averaging.
3768: \newblock \urlprefix\url{http://arxiv.org/abs/math/0511468}.
3769:
3770: \bibitem[{Juditsky et~al.(2005{\natexlab{b}})Juditsky, Nazin, Tsybakov and
3771: Vayatis}]{juditsky_nazin05}
3772: \textsc{Juditsky, A.~B.}, \textsc{Nazin, A.~V.}, \textsc{Tsybakov, A.~B.} and
3773: \textsc{Vayatis, N.} (2005{\natexlab{b}}).
3774: \newblock Recursive aggregation of estimators by the mirror descent method with
3775: averaging.
3776: \newblock \textit{Problemy Peredachi Informatsii}, \textbf{41} 78--96.
3777:
3778: \bibitem[{Juditsky et~al.(2006)Juditsky, Rigollet and Tsybakov}]{jrt:06}
3779: \textsc{Juditsky, A.~B.}, \textsc{Rigollet, P.} and \textsc{Tsybakov, A.~B.}
3780: (2006).
3781: \newblock Learning by mirror averaging.
3782: \newblock To appear in \textit{Ann. Statist.} Available at
3783: \url{http://www.imstat.org/aos/future_papers.html}.
3784:
3785: \bibitem[{Kearns et~al.(1994)Kearns, Schapire, Sellie and
3786: Hellerstein}]{KSSH:94}
3787: \textsc{Kearns, M.~J.}, \textsc{Schapire, R.~E.}, \textsc{Sellie, L.~M.} and
3788: \textsc{Hellerstein, L.} (1994).
3789: \newblock Toward efficient agnostic learning.
3790: \newblock In \textit{Machine Learning}. ACM Press, 341--352.
3791:
3792: \bibitem[{Kerkyacharian et~al.(2001)Kerkyacharian, Lepski and
3793: Picard}]{kerk_lepski_picard01}
3794: \textsc{Kerkyacharian, G.}, \textsc{Lepski, O.} and \textsc{Picard, D.} (2001).
3795: \newblock Nonlinear estimation in anisotropic multi-index denoising.
3796: \newblock \textit{Probab. Theory Related Fields}, \textbf{121} 137--170.
3797:
3798: \bibitem[{Kerkyacharian et~al.(2007)Kerkyacharian, Lepski and
3799: Picard}]{kerk_lepski_picard07}
3800: \textsc{Kerkyacharian, G.}, \textsc{Lepski, O.} and \textsc{Picard, D.} (2007).
3801: \newblock Nonlinear estimation in anisotropic multiindex denoising. {S}parse
3802: case.
3803: \newblock \textit{Teor. Veroyatn. Primen.}, \textbf{52} 150--171.
3804:
3805: \bibitem[{Kerkyacharian and Picard(2003)}]{kerk_picard_replicant_03}
3806: \textsc{Kerkyacharian, G.} and \textsc{Picard, D.} (2003).
3807: \newblock Replicant compression coding in {B}esov spaces.
3808: \newblock \textit{ESAIM Probab. Stat.}, \textbf{7} 239--250 (electronic).
3809:
3810: \bibitem[{Kerkyacharian and Picard(2007)}]{kerk_picard07}
3811: \textsc{Kerkyacharian, G.} and \textsc{Picard, D.} (2007).
3812: \newblock Thresholding in learning theory.
3813: \newblock \textit{Constr. Approx.}, \textbf{26} 173--203.
3814:
3815: \bibitem[{Kimeldorf and Wahba(1971)}]{kimeldorf_wahba71}
3816: \textsc{Kimeldorf, G.} and \textsc{Wahba, G.} (1971).
3817: \newblock Some results on {T}chebycheffian spline functions.
3818: \newblock \textit{J. Math. Anal. Appl.}, \textbf{33} 82--95.
3819:
3820: \bibitem[{Kohler(2000)}]{kohler00}
3821: \textsc{Kohler, M.} (2000).
3822: \newblock Inequalities for uniform deviations of averages from expectations
3823: with applications to nonparametric regression.
3824: \newblock \textit{J. Statist. Plann. Inference}, \textbf{89} 1--23.
3825:
3826: \bibitem[{Kohler and Krzy{\.z}ak(2001{\natexlab{a}})}]{kohler_krzyzak01a}
3827: \textsc{Kohler, M.} and \textsc{Krzy{\.z}ak, A.} (2001{\natexlab{a}}).
3828: \newblock Nonparametric regression estimation using penalized least squares.
3829: \newblock \textit{IEEE Trans. Inform. Theory}, \textbf{47} 3054--3058.
3830:
3831: \bibitem[{Kohler and Krzy{\.z}ak(2001{\natexlab{b}})}]{kohler_krzyzak01b}
3832: \textsc{Kohler, M.} and \textsc{Krzy{\.z}ak, A.} (2001{\natexlab{b}}).
3833: \newblock Nonparametric regression estimation using penalized least squares.
3834: \newblock \textit{IEEE Trans. Inform. Theory}, \textbf{47} 3054--3058.
3835:
3836: \bibitem[{Lecu{\'e}(2006)}]{LecJMLR:06}
3837: \textsc{Lecu{\'e}, G.} (2006).
3838: \newblock Lower bounds and aggregation in density estimation.
3839: \newblock \textit{J. Mach. Learn. Res.}, \textbf{7} 971--981.
3840:
3841: \bibitem[{Lecu{\'e}(2007)}]{LecAoS:07}
3842: \textsc{Lecu{\'e}, G.} (2007).
3843: \newblock Simultaneous adaptation to the margin and to complexity in
3844: classification.
3845: \newblock \textit{Ann. Statist.}, \textbf{35} 1698--1721.
3846:
3847: \bibitem[{Ledoux and Talagrand(1991)}]{ledoux_talagrand91}
3848: \textsc{Ledoux, M.} and \textsc{Talagrand, M.} (1991).
3849: \newblock \textit{Probability in {B}anach spaces}, vol.~23 of
3850: \textit{Ergebnisse der Mathematik und ihrer Grenzgebiete (3) [Results in
3851: Mathematics and Related Areas (3)]}.
3852: \newblock Springer-Verlag, Berlin.
3853: \newblock Isoperimetry and processes.
3854:
3855: \bibitem[{Leung and Barron(2006)}]{leung_barron06}
3856: \textsc{Leung, G.} and \textsc{Barron, A.~R.} (2006).
3857: \newblock Information theory and mixing least-squares regressions.
3858: \newblock \textit{IEEE Trans. Inform. Theory}, \textbf{52} 3396--3410.
3859:
3860: \bibitem[{Massart(2007)}]{massart03}
3861: \textsc{Massart, P.} (2007).
3862: \newblock \textit{Concentration inequalities and model selection}, vol. 1896 of
3863: \textit{Lecture Notes in Mathematics}.
3864: \newblock Springer, Berlin.
3865: \newblock Lectures from the 33rd Summer School on Probability Theory held in
3866: Saint-Flour, July 6--23, 2003, With a foreword by Jean Picard.
3867:
3868: \bibitem[{Neumann(2000)}]{neumann00}
3869: \textsc{Neumann, M.~H.} (2000).
3870: \newblock Multivariate wavelet thresholding in anisotropic function spaces.
3871: \newblock \textit{Statist. Sinica}, \textbf{10} 399--431.
3872:
3873: \bibitem[{Steinwart and Scovel(2007)}]{SS:07}
3874: \textsc{Steinwart, I.} and \textsc{Scovel, C.} (2007).
3875: \newblock Fast rates for support vector machines using {G}aussian kernels.
3876: \newblock \textit{Ann. Statist.}, \textbf{35} 575--607.
3877:
3878: \bibitem[{Triebel(2006)}]{triebel06}
3879: \textsc{Triebel, H.} (2006).
3880: \newblock \textit{Theory of function spaces. {III}}, vol. 100 of
3881: \textit{Monographs in Mathematics}.
3882: \newblock Birkh\"auser Verlag, Basel.
3883:
3884: \bibitem[{Tsybakov(2003{\natexlab{a}})}]{tsybakov03}
3885: \textsc{Tsybakov, A.} (2003{\natexlab{a}}).
3886: \newblock \textit{Introduction \`a l'estimation non-param\'etrique}.
3887: \newblock Springer.
3888:
3889: \bibitem[{Tsybakov(2003{\natexlab{b}})}]{tsy:03}
3890: \textsc{Tsybakov, A.~B.} (2003{\natexlab{b}}).
3891: \newblock Optimal rates of aggregation.
3892: \newblock \textit{Computational Learning Theory and Kernel Machines.
3893: B.Sch{\"o}lkopf and M.Warmuth, eds. Lecture Notes in Artificial
3894: Intelligence}, \textbf{2777} 303--313.
3895: \newblock Springer, Heidelberg.
3896:
3897: \bibitem[{van~de Geer(1990)}]{vandegeer90}
3898: \textsc{van~de Geer, S.} (1990).
3899: \newblock Estimating a regression function.
3900: \newblock \textit{Ann. Statist.}, \textbf{18} 907--924.
3901:
3902: \bibitem[{van~de Geer(2007)}]{vdg07}
3903: \textsc{van~de Geer, S.} (2007).
3904: \newblock Oracle inequalities and regularization.
3905: \newblock In \textit{Lectures on empirical processes}. EMS Ser. Lect. Math.,
3906: Eur. Math. Soc., Z\"urich, 191--252.
3907:
3908: \bibitem[{van~de Geer(2000)}]{van_de_geer00}
3909: \textsc{van~de Geer, S.~A.} (2000).
3910: \newblock \textit{Applications of empirical process theory}, vol.~6 of
3911: \textit{Cambridge Series in Statistical and Probabilistic Mathematics}.
3912: \newblock Cambridge University Press, Cambridge.
3913:
3914: \bibitem[{van~der Vaart and Wellner(1996)}]{vdVW:96}
3915: \textsc{van~der Vaart, A.~W.} and \textsc{Wellner, J.~A.} (1996).
3916: \newblock \textit{Weak convergence and empirical processes}.
3917: \newblock Springer Series in Statistics, Springer-Verlag, New York.
3918: \newblock With applications to statistics.
3919:
3920: \bibitem[{Wahba(1990)}]{wahba90}
3921: \textsc{Wahba, G.} (1990).
3922: \newblock \textit{Spline models for observational data}, vol.~59 of
3923: \textit{CBMS-NSF Regional Conference Series in Applied Mathematics}.
3924: \newblock Society for Industrial and Applied Mathematics (SIAM), Philadelphia,
3925: PA.
3926:
3927: \bibitem[{Yang(2000)}]{yang:00}
3928: \textsc{Yang, Y.} (2000).
3929: \newblock Mixing strategies for density estimation.
3930: \newblock \textit{Ann. Statist.}, \textbf{28} 75--87.
3931:
3932: \bibitem[{Yang(2004)}]{yang04}
3933: \textsc{Yang, Y.} (2004).
3934: \newblock Aggregating regression procedures to improve performance.
3935: \newblock \textit{Bernoulli}, \textbf{10} 25--47.
3936:
3937: \end{thebibliography}
3938:
3939:
3940: % \bibliography{biblio}
3941:
3942:
3943: \end{document}