0711.2509/ms.tex
1: \documentclass[useAMS,usenatbib,usegraphicx]{mn2e}
2: \usepackage{times}
3: 
4: 
5: 
6: \title[Shrinkage estimation of the covariance matrix]
7: {Shrinkage estimation of the power spectrum covariance matrix}
8: 
9: \author[Pope \& Szapudi]{
10: Adrian C. Pope$^1$\thanks{E-mail: pope@ifa.hawaii.edu}
11: and Istv\'{a}n Szapudi$^1$\\
12: $^1$Institute for Astronomy, 2680 Woodlawn Drive, Honolulu, HI 96822}
13: 
14: 
15: 
16: \newcommand{\figtoy}{
17:   \begin{figure*}
18:     \begin{center}
19:       \includegraphics[width=\textwidth]{fig/toyMseEv3.eps}
20:     \end{center}
21:     \caption{
22:       Comparison of the Monte Carlo (MC) and shrinkage (MC+S) estimates 
23:       for the toy model covariance from Section~\ref{sub:toycov}.
24:       The plot at left shows the mean squared error (MSE) between the
25:       estimate and the known input covariance as a function of 
26:       the number of realizations, $n$.
27:       Results using only the target are also shown.
28:       The plots at right show the eigenvalue spectra for the covariance 
29:       estimators using different numbers of realizations, $n$.
30:       The known true eigenvalue spectrum and the eigenvalues of the target
31:       are also shown.
32:       All plots represent the averages of 100 simulations for each
33:       number of realizations, $n$.
34:     }
35:     \label{fig:toy}
36:   \end{figure*}
37: }
38: 
39: 
40: 
41: \newcommand{\figpk}{
42:   \begin{figure}
43:     \begin{center}
44:       \includegraphics[width=\columnwidth]{fig/pk.eps}
45:     \end{center}
46:     \caption{Plot of the  input theoretical power spectrum 
47:       $\left( P_{\rm CAMB} \right)$, 
48:       the theoretical power spectrum convolved with the survey window function
49:       $\left( P^W_{\rm CAMB} \right)$,
50:       and the averaged power spectrum measured from all of the sub-volumes 
51:       $\left( \langle P_{\rm meas} \rangle \right)$.
52:       Inset shows the spherically averaged survey window function.}
53:     \label{fig:pk}
54:   \end{figure}
55: }
56: 
57: 
58: 
59: \newcommand{\figsmc}{
60:   \begin{figure}
61:     \begin{center}
62:       \includegraphics[width=\columnwidth]{fig/s8mc.eps}
63:     \end{center}
64:     \caption{(Area normalized) distributions of maximum-likelihood value, 
65:       $\hat{\sigma}_8$, and error bar, 
66:       $\Delta$, estimates for the Reference, Monte Carlo (MC), Monte
67:       Carlo Target only (Target), and the Monte Carlo + Shrinkage (MC+S)
68:       covariance matrix estimates.}
69:     \label{fig:s8mc}
70:   \end{figure}
71: }
72: 
73: 
74: 
75: \newcommand{\figcov}{
76:   \begin{figure}
77:     \begin{center}
78:       \includegraphics[width=\columnwidth]{fig/ciitii.eps}
79:     \end{center}
80:     \caption{Plot of the diagonal elements of the reference covariance
81:       matrix estimated from all of the sub-volumes, of a linear theory
82:       model for the covariance, and of the 102 target matrices for
83:       the Monte Carlo + Shrinkage estimates.
84:       Inset shows the reference correlation matrix in a linear stretch.}
85:     \label{fig:cov}
86:   \end{figure}
87: }
88: 
89: 
90: 
91: \newcommand{\figsjk}{
92:   \begin{figure}
93:     \begin{center}
94:       \includegraphics[width=\columnwidth]{fig/s8jk.eps}
95:     \end{center}
96:     \caption{(Area normalized) distributions of maximum-likelihood value, 
97:       $\hat{\sigma}_8$, and error bar, 
98:       $\Delta$, estimates for the Reference, Jackknife (JK), Jackknife
99:       Target only (Target), and the Jackknife + Shrinkage (JK+S)
100:       covariance matrix estimates.}
101:     \label{fig:s8jk}
102:   \end{figure}
103: }
104: 
105: 
106: 
107: \newcommand{\figeigen}{
108:   \begin{figure}
109:     \begin{center}
110:       \includegraphics[width=\columnwidth]{fig/eigen.eps}
111:     \end{center}
112:     \caption{Plot of the (sorted) eigenvalue spectrum for the Reference 
113:       covariance matrix, empirically estimated from 4096 sub-volumes, and the 
114:       shrinkage of that reference covariance matrix against our diagonal 
115:       target (Reference+S).  The lower panel shows the ratio of these 
116:       eigenvalue spectra.}
117:     \label{fig:eigen}
118:   \end{figure}
119: }
120: 
121: 
122: 
123: \newcommand{\figlambda}{
124:   \begin{figure}
125:     \begin{center}
126:       \includegraphics[width=\columnwidth]{fig/lambda.eps}
127:     \end{center}
128:     \caption{Plot of estimated error bar, $\Delta$, as a function
129:       of the shrinkage intensity, $\hat{\lambda}^{\star}$, for the 
130:       shrinkage versions of the Monte Carlo (MC+S) and jackknife 
131:       (JK+S) methods.  For clarity all of the MC+S error bars are plotted
132:       as positive and all of the JK+S as negative.}
133:     \label{fig:lambda}
134:   \end{figure}
135: }
136: 
137: 
138: 
139: \newcommand{\tableresults}{
140:   \begin{table}
141:     \caption{Statistics of $\hat{\sigma}_8$ and error bar estimates.}
142:     \label{tab:s8}
143:     \begin{tabular}{lllll}
144:       \hline
145:       $\mathbfss{C}$ & 
146:       $\langle \hat{\sigma}_8 \rangle$ & 
147:       $\sigma_{\hat{\sigma}_8}$ & 
148:       $\langle \Delta \rangle$ & 
149:       $\sigma_{\Delta}$ \\
150:       \hline
151:       Reference & 0.870 & 0.041 & 0.042 & 0.002 \\
152:       Monte Carlo & 0.853 & 0.088 & 0.031 & 0.006 \\
153:       Monte Carlo Target Only & 0.870 & 0.042 & 0.014 & 0.002 \\
154:       Monte Carlo + Shrinkage & 0.872 & 0.042 & 0.027 & 0.008 \\
155:       Jackknife & 0.790 & 0.102 & 0.015 & 0.005 \\
156:       Jackknife Target Only & 0.869 & 0.044 & 0.013 & 0.003 \\
157:       Jackknife + Shrinkage & 0.850 & 0.047 & 0.021 & 0.007 \\	
158:       \hline
159:     \end{tabular}
160: 
161:     \medskip
162:     The mean and standard deviation of the estimates of the
163:     maximum-likelihood estimate, $\hat{\sigma}_8$, and the one-sigma
164:     error bar, $\Delta$, using different methods to estimate the
165:     covariance matrix.
166:     \end{table}     
167: }
168: 
169: 
170: 
171: \newcommand{\tabletoy}{
172:   \begin{table*}
173:     \begin{minipage}{\textwidth}
174:       \caption{Shrinkage estimation of the mean of a noisy vector.}
175:       \label{tab:toy}
176:       \begin{tabular}{lllll|lllllllll}
177: 	\hline
178: 	\multicolumn{5}{l}{Input}  & \multicolumn{9}{l}{Output} \\
179: 	$\psi$ &
180: 	$\sigma$ &
181: 	$t$ &
182: 	$n$ &
183: 	$p$ &
184: 	$\langle \lambda^{\star} \rangle$ &
185: 	$\overline{\hat{\lambda}^{\star}}$ &
186: 	$\Delta \hat{\lambda}^{\star}$ &
187: 	$\langle {\rm MSE}(\bmath{u}^{\star}) \rangle$ &
188: 	$\overline{{\rm MSE}(\bmath{u}^{\star})}$ &
189: 	$\langle {\rm MSE}(\bmath{u}) \rangle$ &
190: 	$\overline{{\rm MSE}(\bmath{u})}$ &
191: 	$\langle {\rm MSE}(\bmath{t}) \rangle$ &
192: 	$\overline{{\rm MSE}(\bmath{t})}$ \\
193: 	\hline
194: 	1.1 & 1.0 & 1.0 & 100 & 100 & 0.50 & 0.509 & 0.003 & 0.50 & 0.52 & 1.00 & 1.02 & 1.00 & 1.00\\
195: 	1.2 & & & & & 0.20 & 0.205 & 0.0002 & 0.80 & 0.80 & 1.00 & 0.99 & 4.00 & 4.00\\
196: 	& 0.9 & & & & 0.45 & 0.459 & 0.003 & 0.45 & 0.46 & 0.81 & 0.80 & 1.00 & 1.00\\
197: 	& & 0.9 & & & 0.20 & 0.201 & 0.0003 & 0.80 & 0.80 & 1.00 & 1.00 & 4.00 & 4.00\\
198: 	& & & 50 & & 0.67 & 0.696 & 0.008 & 0.67 & 0.71 & 2.00 & 2.00 & 1.00 & 1.00\\
199: 	& & & & 50 & 0.50 & 0.529 & 0.01 & 0.25 & 0.26 & 0.50 & 0.49 & 0.50 & 0.50\\
200: 	\hline
201:       \end{tabular}
202:       
203:       \medskip 
204:       Results of simulations to test shrinkage estimation of the mean of
205:       a noisy vector.  The first five columns are the input values for
206:       the simulations.  The first row gives the fiducial values and
207:       subsequent rows only indicate parameters that were varied.
208:       The remaining columns list analytically predicted 
209:       (indicated by $\langle \rangle$)
210:       and measured (where $\bar{a}$ and $\Delta a$ indicate the sample
211:       mean and standard deviation for $a$) 
212:       quantities from the outputs of the simulations.
213:       We used 100 simulations for each set of input parameters.
214:       See Section~\ref{sub:toymean} for an explanation of the parameters
215:       and quantities.
216: 
217:     \end{minipage}
218:   \end{table*}
219: }
220: 
221: 
222: 
223: \begin{document}
224: \maketitle
225: 
226: 
227: 
228: \begin{abstract}
229: We seek to improve estimates of the power spectrum covariance matrix
230: from a limited number of simulations by employing a novel statistical
231: technique known as shrinkage estimation.
232: The shrinkage technique optimally combines an empirical 
233: estimate of the covariance with a model 
234: (the {\it target}) to minimize the {\it total} mean squared error compared 
235: to the true underlying covariance.
236: We test this technique on N-body simulations and evaluate its performance
237: by estimating cosmological parameters.
238: Using a simple diagonal target, we show that the shrinkage estimator 
239: significantly outperforms both the empirical covariance and the target 
240: individually when using a small number of simulations.
241: We find that reducing noise in the covariance estimate is essential for 
242: properly estimating the values of cosmological parameters as well as their
243: confidence intervals.
244: We extend our method to the jackknife covariance estimator and again
245: find significant improvement, though simulations give better results.
246: Even for thousands of simulations we still find evidence that our method 
247: improves estimation of the covariance matrix.
248: Because our method is simple, requires negligible additional numerical 
249: effort, and produces superior results, we always advocate shrinkage 
250: estimation for the covariance of the power spectrum and other large-scale 
251: structure measurements when purely theoretical modeling of the
252: covariance is insufficient.
253: \end{abstract}
254: 
255: 
256: 
257: \begin{keywords}
258: methods: statistical -- large-scale structure of the Universe.
259: \end{keywords}
260: 
261: 
262: 
263: \section{Introduction}
264: \label{sec:intro}
265: 
266: Large-scale structure statistics, especially power spectra, provide precise 
267: constraints on cosmological theories.
268: Accurate measurements are now possible with large-volume surveys and 
269: advancing computational power.
270: However, the measured power spectrum is not the only required ingredient 
271: for estimating cosmological parameters; the covariance matrix also carries 
272: a great deal of information that is vital for properly estimating parameter 
273: values and their confidence intervals.
274: Observational effects such as the survey geometry, redshift-space 
275: distortions, and non-linear clustering make theoretical modeling of the 
276: covariance matrix difficult, and often simulations are used to study
277: them in detail.
278: Covariance matrices estimated from a finite number of simulations will
279: contain noise.
280: Cosmological parameter estimation requires the inverse of the covariance
281: matrix to properly weight the measurements.
282: Matrix inversion is an inherently non-linear operation that is sensitive 
283: to the noise of all the elements.
284: \citet{cs06} showed that when the off-diagonal elements of a covariance
285: matrix are excessively noisy it is better for parameter estimation to
286: use a diagonal approximation of the covariance.
287: This reduces the effects of noise, but ignores important information in the 
288: covariance.
289: 
290: Covariance matrices for large-scale structure measurements are 
291: often estimated using
292: the unbiased empirical covariance matrix, $\mathbfss{S}$ (see
293: equation~\ref{eq:cov}), a close relative of the maximum-likelihood
294: estimator, $\mathbfss{S}^{\rm (ML)} = \frac {n-1} {n} \mathbfss{S}$.
295: These estimators work well in the regime where the number of
296: repeat observations, $n$, is much greater than the number of
297: parameters measured for each observation, $p$.  However, in
298: the regimes where $n \sim p$ or $n \ll p$ the covariance matrix
299: estimates become ill-conditioned and unstable during inversion,
300: which is necessary for optimal weighting of the data.  
301: This is an indication that these estimators do not produce good 
302: approximations of the true underlying covariance matrix in these regimes.
303: \citet{efron82} provides some insight into the
304: difference between maximum-likelihood as a {\it summarizer} and
305: as an {\it estimator}.  Maximum-likelihood is an excellent summarizer
306: of data in the sense of trying to represent the important statistical
307: information about a dataset in a small set of numbers.  
308: Though maximum-likelihood is asymptotically optimal for estimation
309: in the limit of infinite data, the use of this summary of information 
310: for the purpose of making estimates with a finite set of data
311: is not always the best option.  
312: \citet{stein56} proved that one can construct estimators in high-dimensional 
313: ($d \geq 3$) inference problems that outperform maximum-likelihood 
314: estimators in the sense of minimizing the {\it total} mean squared error.
315: Maximum-likelihood produces the best estimates of individual
316: parameters, but the alternatives can often reduce the error on many
317: of the parameters while only slightly increasing the error on a few,
318: resulting in an overall improvement.
319: \citet{stein56} also showed that the maximum likelihood estimator has 
320: the best performance among estimators that transform correctly
321: under translation, implying that any estimator that outperforms
322: maximum-likelihood will necessarily involve an arbitrary choice.
323: 
324: \citet{ss05} employ a method known as {\it shrinkage estimation}
325: to construct covariance matrices for functional genomics
326: measurements in the $n \ll p$ regime.  Their technique optimally combines
327: a high-dimensional estimate that has little or no bias with a 
328: low-dimensional estimate that may be biased but has much less variance.
329: The result minimizes the total mean squared error, which is the sum
330: of bias (squared) and variance.  
331: They argue that their method can also perform some amount of regularization,
332: resulting in a covariance matrix that has a full set of
333: positive eigenvalues and is well-conditioned (i.e., the ratio
334: of the largest to smallest eigenvalue is not so large that inversion
335: becomes unstable).
336: They employ a lemma from \citet{lw03} to analytically calculate 
337: the optimal linear combination of the low and high dimensional estimates.
338: 
339: In this paper our goal is to provide a simple recipe for using shrinkage
340: estimation to improve the covariance matrix of the
341: matter power spectrum from a limited number of simulations over the
342: ubiquitous sample covariance estimator.
343: Our method aims to reduce the total noise
344: while retaining as much information about real covariance in the
345: simulations as possible.
346: Shrinkage estimation achieves this by optimally combining a theoretical
347: model with the empirical estimate.
348: We will assess the improvements our method offers by examining the
349: performance of the covariance matrices through inversion and
350: use in cosmological parameter estimation.
351: Although we focus on the matter power spectrum, the shrinkage
352: technique is relevant for many studies in large-scale structure
353: and cosmology.
354: 
355: The outline of this paper is as follows:
356: in Section~\ref{sec:shrink} we introduce shrinkage estimation and
357: describe its application to covariance matrices.
358: Section~\ref{sec:toy} applies the shrinkage technique to several toy
359: problems before moving to a more complicated example involving
360: galaxy clustering.
361: We describe our technique for measuring matter power spectra from N-body 
362: simulations in Section~\ref{sec:sims}.
363: In Section~\ref{sec:results} we construct several estimates of
364: the power spectrum covariance matrix and compare their performance
365: by estimating cosmological parameters.
366: Finally we review our results, make recommendations, and discuss future 
367: directions of this project in Section~\ref{sec:disc}.
368: 
369: 
370: 
371: \section{Shrinkage Estimation}
372: \label{sec:shrink}
373: 
374: \subsection{The Method}
375: \label{sub:shrink}
376: 
377: Much of this section summarizes the introduction to shrinkage
378: estimation given in \citet{ss05}.  
379: Suppose we are estimating a vector $\bmath{\psi}$ (of length $p$)
380: from a set of $n$ measurements using two different models.  
381: One of the models has many free parameters and
382: produces an estimate, $\bmath{u}$, with little (or no) bias, but
383: the variance may be significant due to the number of free parameters.  
384: The second model (called the {\it target}) has many fewer (or no) free 
385: parameters and produces an estimate, $\bmath{t}$, which will have smaller 
386: variance but may be biased.  
387: We construct a new estimate, $\bmath{u}^{\star}$, 
388: from a linear combination of these two models, given by
389: %
390: \begin{equation}
391: \bmath{u}^{\star} = \lambda \bmath{t} + (1-\lambda) \bmath{u}
392: \label{eq:ustar}
393: \end{equation}
394: %
395: where $\lambda \in [0,1]$ is called the {\it shrinkage intensity}.  
396: The question now becomes how to choose $\lambda$ in an optimal way.  
397: A common way to optimize an estimator is to minimize the 
398: expected mean squared error, given by the risk function
399: %
400: \begin{equation}
401: R(\lambda) = 
402: \left\langle \sum_{i=1}^{p} (u^{\star}_i - \psi_i)^2 \right\rangle
403: \label{eq:mse}
404: \end{equation}
405: %
406: where the angle brackets indicate the expectation value.
407: \citet{lw03} introduced an analytic solution for the optimal 
408: shrinkage intensity, $\lambda^{\star}$.
409: Prior to this solution shrinkage estimation was much less practical
410: because numerically complicated and expensive methods were necessary
411: to find the optimal shrinkage intensity.
412: The analytic solution is
413: %
414: \begin{equation}
415: \lambda^{\star} = 
416: \frac {\sum_{i=1}^{p} \left[ {\rm Var}(u_i) - {\rm Cov}(t_i,u_i)
417: - {\rm Bias}(u_i) \langle t_i-u_i \rangle \right]}
418: {\sum_{i=1}^{p} \langle (t_i-u_i)^2 \rangle}
419: \label{eq:lstar}
420: \end{equation}
421: %
422: where ${\rm Var}$, ${\rm Cov}$, and ${\rm Bias}$ are the true
423: variance, covariance, and bias, respectively.  For a practical
424: estimator, \citet{ss05} suggest estimating $\hat{\lambda}^{\star}$ as
425: %
426: \begin{equation}
427: \hat{\lambda}^{\star} = \frac 
428: {\sum_{i=1}^{p} \left[ \widehat{\rm Var}(u_i) - \widehat{\rm Cov}(t_i,u_i)
429: - \widehat{\rm Bias}(u_i) (t_i-u_i) \right]}
430: {\sum_{i=1}^{p} (t_i-u_i)^2}
431: \label{eq:estlstar}
432: \end{equation}
433: %
434: where $\widehat{\rm Var}$, $\widehat{\rm Cov}$, and $\widehat{\rm Bias}$
435: are the unbiased sample
436: estimates of ${\rm Var}$, ${\rm Cov}$, and ${\rm Bias}$,
437: respectively.  
438: The ${\rm Bias}$ term can be ignored if $\bmath{u}$ is an 
439: unbiased estimator.
440: 
441: 
442: 
443: \tabletoy
444: 
445: 
446: 
447: \subsection{Application to Covariance Matrices}
448: \label{sub:shrinkcov}
449: 
450: We now specialize the shrinkage estimation technique to covariance matrices.  
451: Suppose we have $n$ sets of data and we measure a data vector 
452: $\bmath{x}$ of length $p$ for each of them.
453: Let $x^{(k)}_i$ be the $k^{\rm th}$ (of $n$ total) observation
454: of the $i^{\rm th}$ (of $p$ total) element of the data vector.  
455: The estimated empirical mean is given by 
456: $\overline{x}_i = \frac {1} {n} \sum_{k=1}^{n} x^{(k)}_i$.
457: We define
458: %
459: \begin{eqnarray}
460: \label{eq:wkij}
461: W^{(k)}_{ij} & = & 
462: (x^{(k)}_i - \overline{x}_i)(x^{(k)}_j - \overline{x}_j), \\
463: \label{eq:wij}
464: \overline{W}_{ij} & = & 
465: \frac {1} {n} \sum_{k=1}^{n} W^{(k)}_{ij}
466: \end{eqnarray}
467: %
468: and write the unbiased empirically estimated covariance matrix
469: of the data, $\mathbfss{S}$, as
470: %
471: \begin{equation}
472: S_{ij} = \widehat{\rm{Cov}}(x_i,x_j) 
473: = \frac {n} {n-1} \overline{W}_{ij}.
474: \label{eq:wcov}
475: \end{equation}
476: %
477: Explicit substitution results in the usual
478: %
479: \begin{equation}
480: S_{ij} = \frac {1} {n-1} \sum_{k=1}^{n} 
481: (x^{(k)}_i - \overline{x}_i)(x^{(k)}_j - \overline{x}_j).
482: \label{eq:cov}
483: \end{equation}
484: %
485: Similarly we can estimate the covariance of the elements
486: of the covariance matrix of the data, given as
487: %
488: \begin{equation}
489: \widehat{\rm{Cov}}(S_{ij},S_{lm}) 
490: = \frac {n} {(n-1)^3} \sum_{k=1}^{n}
491: (W^{(k)}_{ij} - \overline{W}_{ij}) 
492: (W^{(k)}_{lm} - \overline{W}_{lm})
493: \label{eq:covcov}
494: \end{equation}
495: %
496: with the variance of an individual entry given by
497: $\widehat{\rm{Var}}(S_{ij}) = \widehat{\rm{Cov}}(S_{ij},S_{ij})$.
498: For shrinkage estimation we let $\mathbfss{S}$ take the role of 
499: $\bmath{u}$ and supply a target covariance matrix $\mathbfss{T}$ to take 
500: the role of $\bmath{t}$.
501: The optimal shrinkage intensity and resulting covariance 
502: matrix $\mathbfss{C}$ are given by
503: %
504: \begin{eqnarray}
505: \label{eq:lshat}
506: \hat{\lambda}^{\star} & = & \frac
507: {\sum_{i,j} \left[ \widehat{\rm Var}(S_{ij})
508: - \widehat{\rm Cov}(T_{ij},S_{ij}) \right]}
509: {\sum_{i,j} (T_{ij} - S_{ij})^2}, \\
510: \label{eq:scov}
511: \mathbfss{C} & = & \hat{\lambda}^{\star}\mathbfss{T} + 
512: (1-\hat{\lambda}^{\star}) \mathbfss{S}.
513: \end{eqnarray}
514: %
515: If the $\hat{\lambda}^{\star}$ estimate is greater than
516: one, then $\hat{\lambda}^{\star} = 1$ is enforced, implying that only
517: the target matrix is used.  If the $\hat{\lambda}^{\star}$ estimate
518: is less than zero, then $\hat{\lambda}^{\star} = 0$ is enforced,
519: implying that only the empirical covariance matrix is used.
520: The numerator of equation~\ref{eq:lshat} implies that as the variances 
521: of the elements of $\mathbfss{S}$ decrease 
522: (e.g., approaching the $n \gg p$ regime)
523: the shrinkage estimate smoothly approaches the empirical covariance.
524: The denominator in equation~\ref{eq:lshat} also ensures that if the
525: chosen target matrix is very different from the empirical covariance
526: then the estimator will tend to the empirical covariance.  
527: Thus an inappropriate choice of target should not make the final results 
528: any worse, but there will be little gain in efficiency.  
529: 
530: The $\widehat{\rm Cov}(T_{ij},S_{ij})$ term in the numerator of
531: equation~\ref{eq:lshat} accounts for the fact that $\mathbfss{S}$ and
532: $\mathbfss{T}$ are estimated from the same data.  If $\mathbfss{T}$
533: is fixed then the term is zero.  If some of the elements of $\mathbfss{T}$
534: are taken directly from $\mathbfss{S}$ then the $\widehat{\rm Cov}$
535: term exactly cancels the $\widehat{\rm Var}$ term and those elements
536: do not affect the estimate of $\hat{\lambda}^{\star}$.
537: Table~2 of \citet{ss05} shows a variety of worked examples for
538: common targets that may or may not depend on the empirically estimated
539: covariance.  In their examples they ignore moments of higher
540: order than $\widehat{\rm Var}(S_{ij})$.
541: 
542: 
543: 
544: \section{Toy Examples}
545: \label{sec:toy}
546: 
547: 
548: 
549: \figtoy
550: 
551: 
552: 
553: Before tackling large-scale structure measurements
554: we present the application of shrinkage estimation to toy models.
555: In Section~\ref{sub:toymean} we employ shrinkage to estimate
556: the mean of a noisy vector.
557: For that example we can calculate the expected shrinkage intensity
558: analytically.
559: In Section~\ref{sub:toycov} we present a toy example of covariance
560: estimation.
561: 
562: 
563: 
564: \subsection{Estimating the Mean of a Noisy Vector}
565: \label{sub:toymean}
566: 
567: In our first toy example of the shrinkage technique we will estimate
568: the mean of a vector from a set of noisy realizations using the
569: formalism from Section~\ref{sub:shrink}.
570: Given an input mean vector, $\bmath{\psi}$, the distribution for
571: an element of the noisy realization is $X_{i} = N(\psi_i,\sigma_i^2)$,
572: a normal distribution with mean $\psi_i$ and variance $\sigma_i^2$.
573: The $i^{\rm th}$ element (of $p$ total) of the $k^{\rm th}$ realization
574: (of $n$ total) is written $x_i^{(k)}$, and our high-dimensional
575: estimate of the mean is given by
576: %
577: \begin{equation}
578: \bmath{u} = \frac {1} {n} \sum_{k=1}^{n} \bmath{x}^{(k)}.
579: \label{eq:u}
580: \end{equation}
581: %
582: Given a fixed target, $\bmath{t}$, we will calculate the expected value 
583: for the optimal shrinkage intensity, $\langle \lambda^{\star} \rangle$,
584: as given by equation~\ref{eq:lstar}.  If we assume the numerator and
585: denominator are uncorrelated we can write
586: %
587: \begin{equation}
588: \langle \lambda^{\star} \rangle = 
589: \left \langle 
590: \frac {\sum_{i=1}^{p} {\rm Var}(u_i)}
591: {\sum_{i=1}^{p} \langle (t_i-u_i)^2 \rangle}
592: \right \rangle
593: \approx
594: \frac {\sum_{i=1}^{p} \left \langle {\rm Var}(u_i) \right \rangle}
595: {\sum_{i=1}^{p} \left \langle (t_i-u_i)^2 \right \rangle}.
596: \label{eq:explstar}
597: \end{equation}
598: %
599: From our definitions we know that
600: \begin{eqnarray}
601: \label{eq:expx}
602: \langle x_i^{(k)} \rangle & = & \psi_i, \\
603: \label{eq:expx2}
604: \langle x_i^{(k)} x_j^{(l)} \rangle & = &
605: \psi_i \psi_j + \delta_{ij} \delta_{kl} \sigma_i^2
606: \end{eqnarray}
607: %
608: which we can use to find the expectation values of our estimators
609: %
610: \begin{eqnarray}
611: \label{eq:expu}
612: \langle u_i \rangle & = &
613: \left \langle \frac {1} {n} \sum_{k=1}^{n} x_i^{(k)} \right \rangle
614: = \psi_i,\\
615: \label{eq:expu2}
616: \langle u_i^2 \rangle & = & \left \langle 
617: \frac {1} {n} \sum_{k=1}^{n} x_i^{(k)}
618: \frac {1} {n} \sum_{l=1}^{n} x_i^{(l)}
619: \right \rangle
620: = \psi_i^2 + \frac {1} {n} \sigma_i^2.
621: \end{eqnarray}
622: %
623: Fixing the values for $\psi_i$, $\sigma_i$, and $t_i$ to be
624: $\psi$, $\sigma$, and $t$, respectively, we predict the shrinkage
625: intensity to be
626: %
627: \begin{equation}
628: \langle \lambda^{\star} \rangle \approx \frac {\sigma^2}
629: {n(t-\psi)^2 + \sigma^2} .
630: \label{eq:explstarfinal}
631: \end{equation}
632: %
633: This formula displays the behavior we expect.  As $n$ becomes large,
634: $\langle \lambda^{\star} \rangle$ 
635: approaches zero and we will rely mostly on the empirical
636: estimate of the mean.  As $t$ approaches $\psi$, 
637: $\langle \lambda^{\star} \rangle$ 
638: approaches unity as we have chosen the perfect target.
639: We can also calculate the expected mean squared errors for the
640: high-dimensional, low-dimensional, and shrinkage estimators 
641: and find they are given by
642: %
643: \begin{eqnarray}
644: \label{eq:mseu}
645: \langle {\rm MSE}(\bmath{u}) \rangle & = & p \frac {\sigma^2} {n},\\
646: \label{eq:mset}
647: \langle {\rm MSE}(\bmath{t}) \rangle & = & p(\psi-t)^2,\\
648: \label{eq:mseustar}
649: \langle {\rm MSE}(\bmath{u}^{\star}) \rangle & = &
650: p \left [ \langle \lambda^{\star} \rangle^2 (\psi-t)^2 +
651: (1-\langle \lambda^{\star} \rangle)^2 \frac {\sigma^2} {n} \right ],
652: \end{eqnarray}
653: %
654: respectively.
655: 
656: We generated simulations to check the
657: validity of our analytic predictions.  Each simulation starts by
658: generating the $n$ realizations $\bmath{x}^{(k)}$.  The sample mean
659: $\bmath{u}$ is calculated for the set of realizations and used with the
660: target $\bmath{t}$ to generate the shrinkage estimate $\bmath{u}^{\star}$.
661: The shrinkage intensity, $\hat{\lambda}^{\star}$,
662: for the set of realizations is calculated using
663: equation~\ref{eq:estlstar} and jackknife resampling of equation~\ref{eq:u}
664: is used to estimate the $\widehat{\rm Var}(u_i)$ term.
665: We generated 100 simulations for each set of input parameters we
666: tested and the results are shown in Table~\ref{tab:toy}.
667: The results of our simulations match the analytical predictions
668: reasonably well and also show that the shrinkage estimator outperforms
669: both the empirical estimate and the target in a mean squared error sense.
670: 
671: 
672: 
673: \subsection{Estimating the Covariance}
674: \label{sub:toycov}
675: 
676: Our second toy model applies the formalism of Section~\ref{sub:shrinkcov}
677: for shrinkage estimation of a covariance matrix.
678: In this case the analytical predictions would be prohibitively
679: tedious, so we present only the results of simulations.
680: Each simulation starts by generating the realizations,
681: $\bmath{x}^{(k)}$, where the distribution of each element is
682: $X_i^{(k)} = N(0,\sigma^2)$.
683: From each set of realizations we construct two estimates of the
684: covariance matrix.
685: The Monte Carlo (MC) method uses the standard empirical covariance estimate 
686: as given by equation~\ref{eq:cov}.
687: We also compute a shrinkage version of the Monte Carlo estimator (MC+S) from
688: equation~\ref{eq:scov} using the identity matrix as a target.
689: The two estimators are compared by inspecting the eigenvalue spectra
690: and by computing the mean squared error between the
691: estimate and the known true covariance matrix,
692: which is $\sigma^2\mathbfss{I}$ where $\mathbfss{I}$ is the identity matrix.
693: The mean squared error between two matrices $\mathbfss{A}$ and $\mathbfss{B}$
694: is given by the squared Frobenius norm of the difference,
695: %
696: \begin{equation}
697: ||\mathbfss{A}-\mathbfss{B}||^2_{F}
698: = \sum_{i,j} |A_{ij}-B_{ij}|^2.
699: \label{eq:frobenius}
700: \end{equation}
701: 
702: For our simulations we fixed $p=18$ and $\sigma=1.1$ and compared
703: the performance of the covariance estimators as a function of the
704: number of realizations, $n$.  We ran 100 simulations for each value
705: of $n$ and averaged the results.
706: For $n = 40, 400, 4000$ the shrinkage intensities were
707: $\hat{\lambda}^{\star} = 0.92 \pm 0.08$, $0.61 \pm 0.06$, $0.136 \pm 0.006$.
708: Figure~\ref{fig:toy} shows the results for the two estimators as well
709: as the results when only using the target.
710: The shrinkage estimator gives a mean squared error that is equal to or
711: less than both the empirical covariance and the target by itself for all $n$.
712: The eigenvalue spectrum for the shrinkage estimator is also closer to the
713: correct spectrum than for the empirical covariance at fixed $n$.
714: This indicates that using the shrinkage estimator is in some sense
715: equivalent to using more simulations.
716: 
717: 
718: 
719: \section{Simulations and Measurements}
720: \label{sec:sims}
721: 
722: We divide the Hubble Volume \citep{hv} $\Lambda$CDM simulation into 
723: 4096 sub-volumes, each with a sidelength of $187.5~{\rm Mpc/h}$.  
724: We measure the power spectrum in each sub-volume using a simple code 
725: that implements the same basic algorithm as presented in \citet{espice}
726: and employs a Fast Fourier Transform (FFT).
727: The FFT was performed on a $64^3$ grid and the power
728: spectrum was measured in 18 logarithmically-spaced bins from 
729: $k=0.0367$ to $k=0.920~{\rm h/Mpc}$.  
730: For plotting and model comparison purposes we calculate the average $k$ 
731: from the actual modes in each bin for the bin centers.  
732: The power spectrum calculated on a pixelized grid is the product of
733: the true power spectrum with (the square of) the Fourier transform of
734: the pixel window function.
735: We correct for this effect, so our measured power spectrum can be written as
736: $P_{\rm meas}(\bmath{k}) = P(\bmath{k})/|\tilde{W}_p(\bmath{k})|^2$,
737: where $P(\bmath{k})$ is the power spectrum calculated on the grid and
738: the Fourier transform of the pixel window function, $\tilde{W}_p$, is given by
739: %
740: \begin{eqnarray}
741: \label{eq:f}
742: \tilde{f}(k) & = & 
743: \frac {{\rm sin}(\pi k / (2 \pi/L))} {\pi k / (2 \pi/L)},\\
744: \label{eq:w}
745: |\tilde{W}_p(\bmath{k})|^2 & = & \tilde{f}^2(k_x)
746: \tilde{f}^2(k_y) \tilde{f}^2(k_z),
747: \end{eqnarray}
748: %
749: and $L$ is the length of the side of a pixel.  
750: Our pixels are $2.93~{\rm Mpc/h}$ on each side.
751: 
752: We used {\sevensize CAMB} \citep{camb} to calculate a model transfer function 
753: and matter power spectrum using cosmological parameter values to match 
754: the Hubble Volume \citep{hv} $\Lambda$CDM simulation: 
755: $\Omega_m = 0.3$, 
756: $\Omega_{\Lambda} = 0.7$,
757: $h = {\rm H}_0/100~{\rm km~s^{-1} Mpc^{-1}} = 0.7$,
758: $\Omega_bh^2 = 0.0196$,
759: and $n_s = 1$.
760: We normalized the resulting power spectrum, $P_{\rm CAMB}(k)$, to have a 
761: (linear) $\sigma_8 = 0.9$ and then applied the non-linear 
762: {\sevensize HALOFIT} \citep{halofit} correction.
763: Thus the normalization matches linear theory for large scales
764: (low $k$), but the shape includes non-linear clustering corrections
765: at smaller scales (higher $k$).
766: 
767: The measured power spectrum is the convolution of the true power
768: spectrum with the squared Fourier transform of the survey window function.
769: When testing a model for the power spectrum we must perform this
770: convolution before comparing it to our measurement.  The
771: convolved power spectrum, $P^W$, is given by
772: %
773: \begin{equation}
774: P^W(\bmath{k}) = \int d\bmath{q}~P(\bmath{k}-\bmath{q})
775: |\tilde{W}_s(\bmath{q})|^2
776: \end{equation}
777: %
778: where $P$ is the input (theoretical) power spectrum.
779: Because the survey is the same shape as our pixels we can write
780: the (normalized) survey window function in terms of the pixel window 
781: function as $|\tilde{W}_s|^2 = (L/2\pi)^6|\tilde{W}_p|^2$ 
782: where $L$ is now the survey sidelength.
783: To simplify the convolution we used the
784: spherically averaged transform of the window function (calculated
785: via Monte Carlo) and reduced the convolution of the spherically
786: symmetric power spectrum to a two-dimensional integral, given by
787: %
788: \begin{equation}
789: P^W(k) \approx 2\pi \int_0^{\infty}dq~q^2 |\tilde{W}_s(q)|^2
790: \int_{-1}^{1}dx~P(\sqrt{k^2+q^2-2kqx}).
791: \end{equation}
792: %
793: In practice the integral over $dx$ was done with Romberg integration
794: and the integration over $dq$ was performed with the extended
795: trapezoidal rule at the $q$ values where we had calculated the
796: spherically averaged $|\tilde{W}_s(q)|^2$.  
797: Power spectrum values inside the integral were evaluated using cubic 
798: spline interpolation in $({\rm log}~k, {\rm log}~P)$ and power law 
799: extrapolation at low and high $k$.  
800: Fig.~\ref{fig:pk} shows the input theoretical power spectrum 
801: $\left( P_{\rm CAMB} \right)$, 
802: the convolved model power spectrum $\left( P^W_{\rm CAMB} \right)$, 
803: and the average of the measured power spectra from all of the sub-volumes
804: $\left( \langle P_{\rm meas} \rangle \right)$.
805: The inset shows the spherically averaged survey window
806: function.
807: 
808: 
809: 
810: \figpk
811: 
812: 
813: 
814: We wish to evaluate our covariance estimates in the context of
815: cosmological parameter fitting.  Given the relatively small volume of 
816: each of our sub-volumes we decided to fit for only the power spectrum
817: normalization in each sub-volume, parameterized by $\sigma_8$ (linear).
818: We write the log-likelihood (for a fixed covariance matrix) as
819: %
820: \begin{equation}
821: \bmath{d} (\sigma_8) = P_{\rm meas}(k) - \left ( \frac {\sigma_8} 
822: {0.9} \right )^2 P^W_{\rm CAMB}(k,\sigma_8=0.9 ),
823: \label{eq:lnld}
824: \end{equation}
825: %
826: \begin{equation}
827: {\rm log} {\mathcal L} (\sigma_8) \propto - \frac {1} {2} 
828: \bmath{d}^T \mathbfss{C}^{-1} \bmath{d}
829: = - \frac{1}{2} \chi^2
830: \label{eq:lnl}
831: \end{equation}
832: %
833: where $\mathbfss{C}$ is the covariance matrix being tested.  
834: We use $\sigma_8$ to fit for the amplitude of the power
835: spectrum in the linear regime, and we assume that the difference in
836: shape in the non-linear regime is minimal for values of $\sigma_8$
837: that are close to our fiducial model of $\sigma_8=0.9$.
838: We numerically find the maximum likelihood value, $\hat{\sigma}_8$,
839: and define the upper and lower one sigma error bars, $\pm \Delta$, 
840: where
841: \[
842: {\rm log} {\mathcal L}(\hat{\sigma}_8 \pm \Delta) 
843: - 
844: {\rm log} {\mathcal L}(\hat{\sigma}_8) 
845: = -1/2.
846: \]
847: 
848: Our likelihood analysis assumes that the bandpower measurements of
849: the power spectrum are normally distributed.  For most of our bandpowers
850: this is a good approximation, but there are several for which an
851: offset lognormal distribution \citep{bjk00} would be more accurate.
852: This could cause some bias in our recovered value of $\hat{\sigma}_8$,
853: but it should not affect our assessment of the relative performance
854: of different covariance estimators.  The covariance matrices for
855: the normal and offset lognormal cases are related by a change
856: of variables.
857: 
858: 
859: 
860: \section{Covariance Matrix Estimates}
861: \label{sec:results}
862: 
863: In this section we compare the performance of several techniques
864: for estimating the covariance matrix of our power spectrum measurements.
865: As a baseline comparison we calculate the covariance matrix from all
866: 4096 sub-volumes using equation~\ref{eq:cov}.  For each method of
867: covariance matrix estimate we measure $\hat{\sigma}_8$ and
868: $\Delta$ using the power spectra measured from the sub-volumes
869: and show the distributions of the $\hat{\sigma}_8$ and $\Delta$ values.
870: The means and standard deviations of those quantities are presented
871: in Table~\ref{tab:s8}.
872: 
873: 
874: 
875: \tableresults
876: 
877: 
878: 
879: \subsection{Reference}
880: \label{sub:ref}
881: 
882: As a reference we estimate the covariance matrix of our power spectrum
883: measurement by applying equation~\ref{eq:cov} to our measurements from
884: all 4096 sub-volumes.  
885: There are $18 \times 19/2 = 171$ independent elements in the covariance 
886: matrix, thus we are in the regime where we
887: have many more realizations than elements to be estimated and the
888: usual covariance estimator should give reasonable results.
889: The solid black line in
890: Fig.~\ref{fig:s8mc} is a histogram of the results of estimating
891: $\hat{\sigma}_8$ using the reference covariance matrix and the
892: power spectra measured from each of the 4096 sub-volumes.  
893: The upper panel shows the distribution of $\hat{\sigma}_8$ and the
894: lower panel shows the distribution of the error bar estimates
895: (absolute value of both upper and lower).
896: The mean and standard deviation of these histograms are presented
897: in Table~\ref{tab:s8}.
898: The agreement between the width of the best-fit
899: distribution, $\sigma_{\hat{\sigma}_8}$,
900: and the mean error bar estimate, $\langle \Delta \rangle$, 
901: indicates that the covariance
902: matrix is properly estimating the likelihood distribution.
903: The width of the error bar distribution, $\sigma_\Delta$,
904: is small, indicating that the error bar estimate is usually
905: very close to the correct value.
906: The mean of the maximum-likelihood estimates, 
907: $\langle \hat{\sigma}_8 \rangle$, 
908: is $0.870$ instead of our known input value of $0.9$, 
909: but we know that our modeling of the power
910: spectrum into the non-linear regime is not perfect so this small
911: offset is not worrisome for our purposes.
912: For the remainder of this section we assume that the results using the 
913: reference covariance matrix are a good approximation to those
914: that would be obtained using the true underlying covariance matrix.
915: See Section~\ref{sec:disc} for further discussions.
916: 
917: 
918: 
919: \subsection{Monte Carlo}
920: \label{sub:mc}
921: 
922: 
923: 
924: \figsmc
925: 
926: 
927: 
928: Next we test covariance matrices estimated with 
929: equation~\ref{eq:cov} but using a small number of sub-volumes,
930: which we call the Monte Carlo method.
931: We use sets of 40 (randomly chosen, non-overlapping) 
932: sub-volumes to test the regime where we have more simulations
933: than diagonal elements of the covariance matrix (18), but fewer
934: simulations than independent elements (171).
935: From 4096 sub-volumes we can create 102 separate covariance matrix
936: estimates.
937: To obtain smooth histograms in Fig.~\ref{fig:s8mc} we test each covariance
938: matrix estimate against 40 randomly chosen $P(k)$ measurements
939: from other sub-volumes.
940: The statistics in Table~\ref{tab:s8} are calculated
941: using one randomly chosen $P(k)$ measurement per covariance matrix.
942: The statistics do not depend on how many randomly chosen $P(k)$
943: measurements are used for each covariance matrix estimate.
944: The upper panel of Fig.~\ref{fig:s8mc} shows that the distribution
945: of $\hat{\sigma}_8$ is too wide, indicating that a parameter
946: analysis using a covariance matrix estimated with this method will
947: often return a value far from the mean.
948: The lower panel of Fig.~\ref{fig:s8mc} shows that the error bar
949: is typically underestimated by $\sim 25\%$.
950: The distribution of the estimated error
951: bars is also much wider than that for the reference covariance
952: matrix estimate.  These effects are the result of using a very noisy
953: estimate of the covariance matrix.
954: 
955: 
956: 
957: \subsection{Monte Carlo + Shrinkage}
958: \label{sub:mcs}
959: 
960: 
961: 
962: \figcov
963: 
964: 
965: 
966: Our first test of the shrinkage approach is to apply shrinkage
967: estimation to the Monte Carlo method described in the previous
968: section.  First we need to choose a target covariance matrix,
969: $\mathbfss{T}$.  
970: In linear theory we expect the covariance matrix of the power spectrum
971: to be diagonal.  Off-diagonal terms arise in practice from
972: the survey window function and non-linear clustering effects.  
973: We use a diagonal target matrix to simulate a situation where we have some 
974: idea about the structure of the covariance matrix but we know our model is 
975: not exact.  Our target matrix takes the form
976: %
977: \begin{equation}
978: T_{ij} = \left\{
979: \begin{array}{ll}
980: 0 & i \neq j \\
981: 2 [P^W_{\rm CAMB} (k_i)]^2/N_i & \langle k_i \rangle \leq 0.14~{\rm h/Mpc}\\
982: S_{ii} & \langle k_i \rangle > 0.14~{\rm h/Mpc}
983: \end{array}
984: \right.
985: \label{eq:target}
986: \end{equation}
987: %
988: where we use a different method in the linear and non-linear
989: regimes.  For bins in the linear regime we use our convolved model 
990: for the power spectrum, $P^W_{\rm CAMB}(k_i)$, and the number 
991: of $k$ modes in each bin, $N_i$, to predict the covariance
992: \citep*[e.g.,][]{hrs}.  In the non-linear regime we use the diagonal
993: of the empirically estimated covariance from the 40 sub-volumes.
994: Fig.~\ref{fig:cov} shows the diagonal elements of the reference
995: covariance matrix, the linear theory model, and the 102 target
996: matrices.  Inset is the reference correlation matrix,
997: $R_{ij} = C_{ij}/\sqrt{C_{ii}~C_{jj}}$,
998: showing that the covariance matrix is strongly diagonal until
999: well into the non-linear regime.  
1000: Our results are robust to changes in the non-linear cutoff
1001: by several bins in either direction.
1002: 
1003: We calculate the optimal shrinkage intensity, $\hat{\lambda}^{\star}$, 
1004: for each of the 102 Monte Carlo estimates, $\mathbfss{S}$,
1005: according to equation~\ref{eq:lshat}.  We apply the
1006: $\widehat{\rm Cov}(T_{ij},S_{ij})$ term to the diagonal elements of 
1007: $\mathbfss{T}$ that are taken from $\mathbfss{S}$.
1008: We find values for $\hat{\lambda}^{\star}$ distributed evenly
1009: between $0.1$ and $1.0$ (see Fig.~\ref{fig:lambda}).
1010: We produce each of our 102 new estimates of the covariance matrices, 
1011: $\mathbfss{C}$, from a linear combination of $\mathbfss{S}$ and 
1012: $\mathbfss{T}$ according to equation~\ref{eq:scov}.  
1013: We perform the same tests as
1014: described in section~\ref{sub:mc} and compare the results
1015: in Fig.~\ref{fig:s8mc} and Table~\ref{tab:s8}.
1016: The most striking result is that the maximum-likelihood estimates,
1017: $\hat{\sigma}_8$, follow a very similar distribution to that for
1018: the reference matrix, indicating that the parameter values are now
1019: correctly estimated.
1020: The error bars are still underestimated, but the distribution is
1021: very similar to that for the normal Monte Carlo estimator. 
1022: 
1023: Fig.~\ref{fig:s8mc} and Table~\ref{tab:s8} also show results
1024: using only the diagonal target.  The values of $\hat{\sigma}_8$
1025: follow the correct distribution, indicating that the estimated
1026: parameter values are fine, but the error bars are much more
1027: severely underestimated.  This is expected because our target
1028: matrix is diagonal and we are using information from far enough into
1029: the non-linear regime to know that we are missing some important
1030: covariance.
1031: It is now clear that the estimated error bar distribution of the shrinkage
1032: estimator is a combination of the Monte Carlo and target distributions.
1033: The shrinkage intensity can serve as a proxy for whether the estimated
1034: error bars are likely to be similar to those for the Monte Carlo
1035: or the target.  See Section~\ref{sec:disc} for further discussions.
1036: 
1037: In summary, the shrinkage of the empirically estimated covariance
1038: against our target matrix outperforms either matrix by itself.
1039: Using just the empirically estimated covariance brings in too much
1040: noise which causes error in the estimation of $\hat{\sigma}_8$
1041: itself.  Using only the diagonal target mitigates the noise problems,
1042: but ignores important covariance.  The shrinkage estimator
1043: uses the best aspects of both, keeping the part of the covariance
1044: that is well estimated but drastically reducing the total amount
1045: of noise.
1046: 
1047: 
1048: 
1049: \subsection{Jackknife}
1050: \label{sub:jk}
1051: 
1052: 
1053: 
1054: \figsjk
1055: 
1056: 
1057: 
1058: Recently a resampling technique known as the jackknife method has been
1059: used to estimate covariance matrices for large-scale structure
1060: measurements from the data set itself. 
1061: The method works by dividing the data volume
1062: into $n$ cells of roughly the same size and recalculating the
1063: measurement $n$ times, each time with a different cell left out.
1064: The variance between the measurements can be adjusted to try and
1065: calculate the variance corresponding to the entire volume.  In practice
1066: one replaces equation~\ref{eq:wkij} with
1067: %
1068: \begin{equation}
1069: W^{(k)}_{ij} = \frac {(n-1)^2} {n}
1070: (x^{(k)}_i - \overline{x}_i)(x^{(k)}_j - \overline{x}_j)
1071: \label{eq:jkwkij}
1072: \end{equation}
1073: %
1074: and then calculates the covariance matrix with equation~\ref{eq:wcov},
1075: resulting in the usual
1076: %
1077: \begin{equation}
1078: S_{ij} = \frac {n-1} {n} \sum_{k=1}^{n} 
1079: (x^{(k)}_i - \overline{x}_i)(x^{(k)}_j - \overline{x}_j).
1080: \label{eq:covjk}
1081: \end{equation}
1082: %
1083: We divided each sub-volume into $3^3 = 27$ cells and modified our
1084: code to calculate the power spectrum with one cell removed.  Our
1085: code incorporates a volume correction, the lowest order edge
1086: correction in Fourier space.  For each of the 4096 
1087: sub-volumes we estimate $\hat{\sigma}_8$ and $\Delta$ using the power
1088: spectrum and the jackknife covariance matrix from the same
1089: sub-volume.  The results are compared to the reference case in
1090: Fig.~\ref{fig:s8jk} and listed in Table~\ref{tab:s8}.
1091: 
1092: The distribution of $\hat{\sigma}_8$ 
1093: is much wider than for the reference covariance matrix, indicating that
1094: the noise in the covariance estimate causes incorrect parameter
1095: estimation.  This is similar to the result for the Monte Carlo method.
1096: The jackknife estimates of $\hat{\sigma}_8$ also peak at a noticeably lower 
1097: value than for the reference covariance, though the two
1098: histograms are in roughly $1 \sigma$ agreement given the width of
1099: the distribution for the jackknife case.  
1100: The error bars estimated
1101: in the jackknife case are typically underestimated by a factor
1102: of almost three compared to the reference covariance matrix and nearly
1103: an order of magnitude compared to the actual width of the
1104: jackknife distribution of $\hat{\sigma}_8$.
1105: 
1106: 
1107: 
1108: \subsection{Jackknife + Shrinkage}
1109: \label{sub:jks}
1110: 
1111: Our final method of estimating the covariance matrix
1112: applies shrinkage to the jackknife estimator to see if we can achieve
1113: enhanced robustness.  
1114: We use the same method to construct a target matrix as described in
1115: section~\ref{sub:mcs}, using the diagonal of the jackknife
1116: estimated covariance matrix in the non-linear regime.
1117: We calculate the shrinkage intensity, $\hat{\lambda}^{\star}$,
1118: and covariance estimate
1119: for each of the 4096 covariance matrices as described in
1120: section~\ref{sub:shrinkcov}, but substituting
1121: equation~\ref{eq:jkwkij} for equation~\ref{eq:wkij} throughout.  
1122: We find values for $\hat{\lambda}^{\star}$ distributed evenly
1123: between $0.0$ and $1.0$ (see Fig.~\ref{fig:lambda}).
1124: We run the same tests as described in section~\ref{sub:jk} and
1125: the results are shown in Fig.~\ref{fig:s8jk} and Table~\ref{tab:s8}.  
1126: 
1127: As with the shrinkage version of the Monte Carlo estimator, the
1128: shrinkage version of the jackknife estimator shows significant
1129: improvement in the actual estimated parameter, $\hat{\sigma}_8$.
1130: However, the central value and width are not quite as good as for
1131: the reference case.  There is some improvement
1132: in the estimation of the error bar, though the error bars are still
1133: systematically underestimated by a factor of roughly two
1134: compared to the reference.
1135: 
1136: Fig.~\ref{fig:s8jk} and Table~\ref{tab:s8} also show the results
1137: of estimating $\hat{\sigma}_8$ and $\Delta$ using only the diagonal
1138: targets used in the shrinkage version of the jackknife estimator.
1139: Again, the diagonal target matrix does well for estimating
1140: $\hat{\sigma}_8$ due to the lack of noise, but it gives the worst
1141: estimates of the error bars.
1142: 
1143: In this case, the shrinkage version of the jackknife estimator
1144: did the best job of estimating the error bars, and it was only
1145: slightly worse than the diagonal approximation at recovering
1146: the distribution of $\hat{\sigma}_8$.  Again, shrinkage estimation
1147: is doing an excellent job of keeping information about covariance
1148: while reducing the total noise.
1149: 
1150: 
1151: 
1152: \section{Discussions and Conclusions}
1153: \label{sec:disc}
1154: 
1155: We have introduced shrinkage as a technique for improving estimates
1156: of the covariance matrix for power spectrum measurements.
1157: We tested our methods on dark matter simulations and showed
1158: improvement over the empirically estimated covariance matrix from
1159: a limited number of simulations or jackknife resamplings.
1160: In order to clearly assess the potential improvement from using
1161: shrinkage estimation, we chose an intentionally difficult scenario
1162: where traditional methods of estimating the covariance were unlikely
1163: to yield satisfactory results.  
1164: All of these methods would perform better if we allowed ourselves
1165: more simulations per Monte Carlo estimate or if we did not push as
1166: far into the non-linear clustering regime.  The shrinkage technique
1167: would still outperform the other methods, but perhaps the differences
1168: would be less obvious.
1169: 
1170: A good estimate of the covariance matrix of a power spectrum
1171: measurement is essential for extracting cosmological information via
1172: parameter fitting.  Including the covariance between different bins
1173: is a good step towards properly estimating the confidence intervals 
1174: on cosmological parameters.  However, the increased number of free
1175: parameters of a full covariance estimate (as opposed to a diagonal
1176: approximation) can cause the covariance estimate to be noisy if only
1177: a relatively small number of simulations are available.  This noise
1178: can adversely affect the estimate of the parameter itself.  A diagonal 
1179: approximation to the covariance can be more easily constrained with a 
1180: limited number of simulations, leading to better estimates of the 
1181: parameter values.  However, the confidence intervals can be severely
1182: underestimated if actual covariance is ignored.  Neither alternative
1183: is appealing.  If a similar measurement were performed with the
1184: two-point correlation function, the Fourier dual of the power spectrum,
1185: a full covariance matrix would be especially important as bins would be strongly
1186: correlated, even in the linear clustering regime.
1187: Realistic survey geometries will also cause additional covariance on large
1188: scales for the power spectrum.
1189: 
1190: Shrinkage estimation is an optimal way of combining a model with many 
1191: degrees of freedom and a model with few degrees of freedom
1192: to minimize the total error on the covariance estimate.  
1193: In our example the shrinkage versions of the Monte Carlo and jackknife
1194: estimators clearly outperformed their counterparts without shrinkage,
1195: with the shrinkage version of the Monte Carlo estimator producing
1196: the best results.
1197: The lemma of \citet{lw03} as employed by \citet{ss05} allows a
1198: mathematically and numerically simple way of calculating the optimal
1199: shrinkage intensity.
1200: This means that there is minimal additional work required to use a
1201: shrinkage version of a covariance estimator.
1202: Shrinkage estimation can result in a massive improvement in the limit of 
1203: a small number of simulations and will not adversely affect the 
1204: covariance estimate in the limit of a large number of simulations.
1205: For these reasons we always recommend the use of the shrinkage versions 
1206: of covariance estimators in all regimes.
1207: 
1208: We briefly investigated the effects of shrinkage estimation in the
1209: limit of a large (though not infinite) number of simulations.
1210: We applied shrinkage estimation to our reference covariance matrix
1211: estimated from all 4096 sub-volumes using the target from
1212: equation~\ref{eq:target} and found an optimal shrinkage intensity
1213: $\hat{\lambda}^{\star} = 0.0096$.  This number is the same order as
1214: the relative noise we expect in each element of the matrix,
1215: $1/\sqrt{4096} = 0.0156$.  We then calculated the eigensystems of
1216: both matrices.  The dot products of the corresponding eigenvectors
1217: always exceeded $0.996$, indicating that
1218: they are essentially identical.  The (sorted) eigenvalue spectra
1219: are shown in Fig.~\ref{fig:eigen}.  The eigenvalues are the same
1220: to within $1\%$ for the first 10 eigenmodes.  After the tenth eigenmode
1221: the eigenvalues from the reference matrix become increasingly smaller
1222: compared to the shrinkage version.  By the final eigenmode the difference
1223: is $\sim 50\%$.  The shrinkage version of the reference matrix
1224: should be a more accurate estimate of the true underlying covariance
1225: matrix.  
1226: The non-linear nature of matrix inversion can cause errors $\gg 1\%$ even 
1227: when individual elements of the covariance matrix are estimated to $\sim 1\%$.
1228: We ran our parameter estimation test using the shrinkage version of the 
1229: reference matrix and found that $\langle \hat{\sigma}_8 \rangle$ moved by 
1230: less than $0.5\%$.
1231: This is small compared to the width of the distribution, which is $\sim 5\%$.  
1232: The average minimum $\chi^2$ did improve from $52.4$ to $41.1$ with the 
1233: shrinkage version of the covariance matrix, though this is still large 
1234: for $18 - 1 = 17$ degrees of freedom.
1235: The remaining discrepancy is dominated by bias from problems with modeling
1236: the power spectrum into the non-linear regime or power loss in the
1237: simulation at smaller scales due to low resolution, not a grossly
1238: inaccurate estimate of the variances.
1239: The amplitude is mainly sensitive to smooth eigenmodes, which have
1240: large eigenvalues, so there is little change in the estimated value.
1241: Parameters that are more sensitive to the shape of the power spectrum may 
1242: be more sensitive to the lower eigenvalue modes and show more than a
1243: $1\%$ change.
1244: The impact of these differences could be estimated with a study of
1245: the information content of the power spectrum covariance 
1246: in terms of cosmological parameter confidences (e.g., \citealt{ns07}), 
1247: but this is beyond the scope of this paper.
1248: 
1249: 
1250: 
1251: \figeigen
1252: 
1253: 
1254: 
1255: We employed a very simple diagonal target matrix in this paper, 
1256: but better targets can clearly improve the efficiency of the shrinkage
1257: technique.  A much more realistic model for covariance on small scales
1258: could be constructed using the halo model.  For realistic measurements
1259: it may also be advantageous to model some of the effects of survey
1260: geometries, redshift-space distortions, and clustering bias.
1261: Targets that depend on a small number of free parameters may be very
1262: useful for some of these effects (e.g., clustering bias).
1263: Targets can also be developed for a wide range of large-scale
1264: structure measurements in addition to the power spectrum.
1265: The exploration of more sophisticated targets is beyond the scope of
1266: this paper and is left to future studies.
1267: 
1268: Ultimately we would like to develop more diagnostics of the
1269: performance of our covariance estimates.  Fig.~\ref{fig:lambda}
1270: shows the estimated error bar, $\Delta$, as a function of the
1271: shrinkage intensity, $\hat{\lambda}^{\star}$, for the shrinkage
1272: versions of the Monte Carlo and jackknife estimators.  There is
1273: clearly some correlation for the shrinkage version of the
1274: Monte Carlo estimator, so knowledge of $\hat{\lambda}^{\star}$
1275: could help one gauge how much the error bars are underestimated.
1276: The exploration of such diagnostics should proceed as better
1277: targets are developed.
1278: 
1279: 
1280: 
1281: \figlambda
1282: 
1283: 
1284: 
1285: The difficulties in estimating the power spectrum covariance
1286: matrix in the context of making precision cosmological
1287: measurements are of even greater concern for higher-order
1288: clustering measurements.  Higher-order clustering measurements
1289: have a configuration space with more degrees of freedom than the power 
1290: spectrum (or two-point correlation function).  Even a lower resolution
1291: measurement will have more bins and a much larger
1292: covariance matrix, and noise will cause larger deviations in the
1293: inverse matrix.  Theoretical modeling of the covariance matrix
1294: for an N-point correlation function generally involves correlations
1295: up to the 2N-point \citep[e.g.,][]{higher}, making the models more uncertain.
1296: The ability to optimally combine simulations and a theoretical
1297: model with a small number of free parameters will make dramatic
1298: improvements.  Shrinkage estimators could also be used for covariance 
1299: matrices of measurements outside of large-scale structure, including the 
1300: cosmic microwave background power spectrum.  
1301: Finally, we note that Section~\ref{sub:shrink} makes no specific references 
1302: to covariance matrices and that shrinkage is a general estimation technique.
1303: We are studying additional applications of shrinkage estimation for
1304: cosmological measurements.
1305: 
1306: 
1307: 
1308: \section*{Acknowledgments}
1309: The authors thank Mark Neyrinck and Gang Chen for discussions about
1310: the covariance matrix of the power spectrum and the effects of noise.
1311: The authors are grateful for support from NASA grant NNG06GE71G
1312: and NSF grant AMS04-0434413.
1313: 
1314: 
1315: 
1316: \begin{thebibliography}{}
1317: 
1318: \bibitem[\protect\citeauthoryear{Bond, Jaffe, \& Knox}{2000}]{bjk00} 
1319:   Bond J.~R., Jaffe A.~H., Knox L., 2000, 
1320:   ApJ, 533, 19
1321: 
1322: \bibitem[\protect\citeauthoryear{Chen \& Szapudi}{2006}]{cs06} 
1323:   Chen G., Szapudi I., 2006, 
1324:   ApJ, 647, L87
1325: 
1326: \bibitem[\protect\citeauthoryear{Efron}{1982}]{efron82}
1327:   Efron B., 1982,
1328:   %Annals of Statistics, 10, 340-356
1329:   Ann. Stat., 10, 340
1330: 
1331: \bibitem[\protect\citeauthoryear{Evrard et al.}{2002}]{hv} 
1332:   Evrard A.~E., et al., 2002, ApJ, 573, 7
1333: 
1334: \bibitem[\protect\citeauthoryear{Hamilton, Rimes, \& Scoccimarro}
1335:   {Hamilton et al.}{2006}]{hrs} 
1336:   Hamilton A.~J.~S., Rimes C.~D., Scoccimarro R., 
1337:   2006, MNRAS, 371, 1188
1338: 
1339: \bibitem[\protect\citeauthoryear{Ledoit \& Wolf}{2003}]{lw03}
1340:   Ledoit O., Wolf M., 2003,
1341:   %Journal of Empirical Finance, 10, 603-621
1342:   J. Empirical Finance, 10, 603
1343: 
1344: \bibitem[\protect\citeauthoryear{Lewis, Challinor, \& Lasenby}
1345:   {Lewis et al.}{2000}]{camb} Lewis A., Challinor A., Lasenby A., 
1346:   2000, ApJ, 538, 473, http://camb.info
1347: 
1348: \bibitem[\protect\citeauthoryear{Neyrinck \& Szapudi}{2007}]{ns07} 
1349:   Neyrinck M.~C., Szapudi I., 2007, 
1350:   MNRAS, 375, L51
1351: 
1352: \bibitem[\protect\citeauthoryear{Sch\"{a}fer \& Strimmer}{2005}]{ss05}
1353:   Sch\"{a}fer J., Strimmer K., 2005, 
1354:   %Statistical Applications in Genetics and Molecular Biology, 
1355:   %Vol. 4: No. 1, Article 32
1356:   Stat. App. Genet. Mol. Biol., 4, 32
1357: 
1358: \bibitem[\protect\citeauthoryear{Smith et al.}{2003}]{halofit} 
1359:   Smith R.~E., et al., 2003, MNRAS, 341, 1311
1360: 
1361: \bibitem[\protect\citeauthoryear{Stein}{1956}]{stein56}
1362:   Stein C., 1956,
1363:   Proc. Third Berkeley Symp. Math. Stat. Probab., 1, 197
1364: 
1365: \bibitem[\protect\citeauthoryear{Szapudi}{2005}]{higher}
1366:   Szapudi I., 2005, astro-ph/0505391
1367: 
1368: \bibitem[\protect\citeauthoryear{Szapudi et al.}{2005}]{espice} 
1369:   Szapudi I., Pan J., Prunet S., Budav{\'a}ri T., 2005, 
1370:   ApJ, 631, L1
1371: 
1372: \end{thebibliography}
1373: 
1374: 
1375: 
1376: \end{document}
1377: