0711:0711.2509/ms.tex

1: \documentclass[useAMS,usenatbib,usegraphicx]{mn2e}

2: \usepackage{times}

3:

4:

5:

6: \title[Shrinkage estimation of the covariance matrix]

7: {Shrinkage estimation of the power spectrum covariance matrix}

8:

9: \author[Pope \& Szapudi]{

10: Adrian C. Pope$^1$\thanks{E-mail: pope@ifa.hawaii.edu}

11: and Istv\'{a}n Szapudi$^1$\\

12: $^1$Institute for Astronomy, 2680 Woodlawn Drive, Honolulu, HI 96822}

13:

14:

15:

16: \newcommand{\figtoy}{

17:   \begin{figure*}

18:     \begin{center}

19:       \includegraphics[width=\textwidth]{fig/toyMseEv3.eps}

20:     \end{center}

21:     \caption{

22:       Comparison of the Monte Carlo (MC) and shrinkage (MC+S) estimates

23:       for the toy model covariance from Section~\ref{sub:toycov}.

24:       The plot at left shows the mean squared error (MSE) between the

25:       estimate and the known input covariance as a function of

26:       the number of realizations, $n$.

27:       Results using only the target are also shown.

28:       The plots at right show the eigenvalue spectra for the covariance

29:       estimators using different numbers of realizations, $n$.

30:       The known true eigenvalue spectrum and the eigenvalues of the target

31:       are also shown.

32:       All plots represent the averages of 100 simulations for each

33:       number of realizations, $n$.

34:     }

35:     \label{fig:toy}

36:   \end{figure*}

37: }

38:

39:

40:

41: \newcommand{\figpk}{

42:   \begin{figure}

43:     \begin{center}

44:       \includegraphics[width=\columnwidth]{fig/pk.eps}

45:     \end{center}

46:     \caption{Plot of the  input theoretical power spectrum

47:       $\left( P_{\rm CAMB} \right)$,

48:       the theoretical power spectrum convolved with the survey window function

49:       $\left( P^W_{\rm CAMB} \right)$,

50:       and the averaged power spectrum measured from all of the sub-volumes

51:       $\left( \langle P_{\rm meas} \rangle \right)$.

52:       Inset shows the spherically averaged survey window function.}

53:     \label{fig:pk}

54:   \end{figure}

55: }

56:

57:

58:

59: \newcommand{\figsmc}{

60:   \begin{figure}

61:     \begin{center}

62:       \includegraphics[width=\columnwidth]{fig/s8mc.eps}

63:     \end{center}

64:     \caption{(Area normalized) distributions of maximum-likelihood value,

65:       $\hat{\sigma}_8$, and error bar,

66:       $\Delta$, estimates for the Reference, Monte Carlo (MC), Monte

67:       Carlo Target only (Target), and the Monte Carlo + Shrinkage (MC+S)

68:       covariance matrix estimates.}

69:     \label{fig:s8mc}

70:   \end{figure}

71: }

72:

73:

74:

75: \newcommand{\figcov}{

76:   \begin{figure}

77:     \begin{center}

78:       \includegraphics[width=\columnwidth]{fig/ciitii.eps}

79:     \end{center}

80:     \caption{Plot of the diagonal elements of the reference covariance

81:       matrix estimated from all of the sub-volumes, of a linear theory

82:       model for the covariance, and of the 102 target matrices for

83:       the Monte Carlo + Shrinkage estimates.

84:       Inset shows the reference correlation matrix in a linear stretch.}

85:     \label{fig:cov}

86:   \end{figure}

87: }

88:

89:

90:

91: \newcommand{\figsjk}{

92:   \begin{figure}

93:     \begin{center}

94:       \includegraphics[width=\columnwidth]{fig/s8jk.eps}

95:     \end{center}

96:     \caption{(Area normalized) distributions of maximum-likelihood value,

97:       $\hat{\sigma}_8$, and error bar,

98:       $\Delta$, estimates for the Reference, Jackknife (JK), Jackknife

99:       Target only (Target), and the Jackknife + Shrinkage (JK+S)

100:       covariance matrix estimates.}

101:     \label{fig:s8jk}

102:   \end{figure}

103: }

104:

105:

106:

107: \newcommand{\figeigen}{

108:   \begin{figure}

109:     \begin{center}

110:       \includegraphics[width=\columnwidth]{fig/eigen.eps}

111:     \end{center}

112:     \caption{Plot of the (sorted) eigenvalue spectrum for the Reference

113:       covariance matrix, empirically estimated from 4096 sub-volumes, and the

114:       shrinkage of that reference covariance matrix against our diagonal

115:       target (Reference+S).  The lower panel shows the ratio of these

116:       eigenvalue spectra.}

117:     \label{fig:eigen}

118:   \end{figure}

119: }

120:

121:

122:

123: \newcommand{\figlambda}{

124:   \begin{figure}

125:     \begin{center}

126:       \includegraphics[width=\columnwidth]{fig/lambda.eps}

127:     \end{center}

128:     \caption{Plot of estimated error bar, $\Delta$, as a function

129:       of the shrinkage intensity, $\hat{\lambda}^{\star}$, for the

130:       shrinkage versions of the Monte Carlo (MC+S) and jackknife

131:       (JK+S) methods.  For clarity all of the MC+S error bars are plotted

132:       as positive and all of the JK+S as negative.}

133:     \label{fig:lambda}

134:   \end{figure}

135: }

136:

137:

138:

139: \newcommand{\tableresults}{

140:   \begin{table}

141:     \caption{Statistics of $\hat{\sigma}_8$ and error bar estimates.}

142:     \label{tab:s8}

143:     \begin{tabular}{lllll}

144:       \hline

145:       $\mathbfss{C}$ &

146:       $\langle \hat{\sigma}_8 \rangle$ &

147:       $\sigma_{\hat{\sigma}_8}$ &

148:       $\langle \Delta \rangle$ &

149:       $\sigma_{\Delta}$ \\

150:       \hline

151:       Reference & 0.870 & 0.041 & 0.042 & 0.002 \\

152:       Monte Carlo & 0.853 & 0.088 & 0.031 & 0.006 \\

153:       Monte Carlo Target Only & 0.870 & 0.042 & 0.014 & 0.002 \\

154:       Monte Carlo + Shrinkage & 0.872 & 0.042 & 0.027 & 0.008 \\

155:       Jackknife & 0.790 & 0.102 & 0.015 & 0.005 \\

156:       Jackknife Target Only & 0.869 & 0.044 & 0.013 & 0.003 \\

157:       Jackknife + Shrinkage & 0.850 & 0.047 & 0.021 & 0.007 \\

158:       \hline

159:     \end{tabular}

160:

161:     \medskip

162:     The mean and standard deviation of the estimates of the

163:     maximum-likelihood estimate, $\hat{\sigma}_8$, and the one-sigma

164:     error bar, $\Delta$, using different methods to estimate the

165:     covariance matrix.

166:     \end{table}

167: }

168:

169:

170:

171: \newcommand{\tabletoy}{

172:   \begin{table*}

173:     \begin{minipage}{\textwidth}

174:       \caption{Shrinkage estimation of the mean of a noisy vector.}

175:       \label{tab:toy}

176:       \begin{tabular}{lllll|lllllllll}

177: 	\hline

178: 	\multicolumn{5}{l}{Input}  & \multicolumn{9}{l}{Output} \\

179: 	$\psi$ &

180: 	$\sigma$ &

181: 	$t$ &

182: 	$n$ &

183: 	$p$ &

184: 	$\langle \lambda^{\star} \rangle$ &

185: 	$\overline{\hat{\lambda}^{\star}}$ &

186: 	$\Delta \hat{\lambda}^{\star}$ &

187: 	$\langle {\rm MSE}(\bmath{u}^{\star}) \rangle$ &

188: 	$\overline{{\rm MSE}(\bmath{u}^{\star})}$ &

189: 	$\langle {\rm MSE}(\bmath{u}) \rangle$ &

190: 	$\overline{{\rm MSE}(\bmath{u})}$ &

191: 	$\langle {\rm MSE}(\bmath{t}) \rangle$ &

192: 	$\overline{{\rm MSE}(\bmath{t})}$ \\

193: 	\hline

194: 	1.1 & 1.0 & 1.0 & 100 & 100 & 0.50 & 0.509 & 0.003 & 0.50 & 0.52 & 1.00 & 1.02 & 1.00 & 1.00\\

195: 	1.2 & & & & & 0.20 & 0.205 & 0.0002 & 0.80 & 0.80 & 1.00 & 0.99 & 4.00 & 4.00\\

196: 	& 0.9 & & & & 0.45 & 0.459 & 0.003 & 0.45 & 0.46 & 0.81 & 0.80 & 1.00 & 1.00\\

197: 	& & 0.9 & & & 0.20 & 0.201 & 0.0003 & 0.80 & 0.80 & 1.00 & 1.00 & 4.00 & 4.00\\

198: 	& & & 50 & & 0.67 & 0.696 & 0.008 & 0.67 & 0.71 & 2.00 & 2.00 & 1.00 & 1.00\\

199: 	& & & & 50 & 0.50 & 0.529 & 0.01 & 0.25 & 0.26 & 0.50 & 0.49 & 0.50 & 0.50\\

200: 	\hline

201:       \end{tabular}

202:

203:       \medskip

204:       Results of simulations to test shrinkage estimation of the mean of

205:       a noisy vector.  The first five columns are the input values for

206:       the simulations.  The first row gives the fiducial values and

207:       subsequent rows only indicate parameters that were varied.

208:       The remaining columns list analytically predicted

209:       (indicated by $\langle \rangle$)

210:       and measured (where $\bar{a}$ and $\Delta a$ indicate the sample

211:       mean and standard deviation for $a$)

212:       quantities from the outputs of the simulations.

213:       We used 100 simulations for each set of input parameters.

214:       See Section~\ref{sub:toymean} for an explanation of the parameters

215:       and quantities.

216:

217:     \end{minipage}

218:   \end{table*}

219: }

220:

221:

222:

223: \begin{document}

224: \maketitle

225:

226:

227:

228: \begin{abstract}

229: We seek to improve estimates of the power spectrum covariance matrix

230: from a limited number of simulations by employing a novel statistical

231: technique known as shrinkage estimation.

232: The shrinkage technique optimally combines an empirical

233: estimate of the covariance with a model

234: (the {\it target}) to minimize the {\it total} mean squared error compared

235: to the true underlying covariance.

236: We test this technique on N-body simulations and evaluate its performance

237: by estimating cosmological parameters.

238: Using a simple diagonal target, we show that the shrinkage estimator

239: significantly outperforms both the empirical covariance and the target

240: individually when using a small number of simulations.

241: We find that reducing noise in the covariance estimate is essential for

242: properly estimating the values of cosmological parameters as well as their

243: confidence intervals.

244: We extend our method to the jackknife covariance estimator and again

245: find significant improvement, though simulations give better results.

246: Even for thousands of simulations we still find evidence that our method

247: improves estimation of the covariance matrix.

248: Because our method is simple, requires negligible additional numerical

249: effort, and produces superior results, we always advocate shrinkage

250: estimation for the covariance of the power spectrum and other large-scale

251: structure measurements when purely theoretical modeling of the

252: covariance is insufficient.

253: \end{abstract}

254:

255:

256:

257: \begin{keywords}

258: methods: statistical -- large-scale structure of the Universe.

259: \end{keywords}

260:

261:

262:

263: \section{Introduction}

264: \label{sec:intro}

265:

266: Large-scale structure statistics, especially power spectra, provide precise

267: constraints on cosmological theories.

268: Accurate measurements are now possible with large-volume surveys and

269: advancing computational power.

270: However, the measured power spectrum is not the only required ingredient

271: for estimating cosmological parameters; the covariance matrix also carries

272: a great deal of information that is vital for properly estimating parameter

273: values and their confidence intervals.

274: Observational effects such as the survey geometry, redshift-space

275: distortions, and non-linear clustering make theoretical modeling of the

276: covariance matrix difficult, and often simulations are used to study

277: them in detail.

278: Covariance matrices estimated from a finite number of simulations will

279: contain noise.

280: Cosmological parameter estimation requires the inverse of the covariance

281: matrix to properly weight the measurements.

282: Matrix inversion is an inherently non-linear operation that is sensitive

283: to the noise of all the elements.

284: \citet{cs06} showed that when the off-diagonal elements of a covariance

285: matrix are excessively noisy it is better for parameter estimation to

286: use a diagonal approximation of the covariance.

287: This reduces the effects of noise, but ignores important information in the

288: covariance.

289:

290: Covariance matrices for large-scale structure measurements are

291: often estimated using

292: the unbiased empirical covariance matrix, $\mathbfss{S}$ (see

293: equation~\ref{eq:cov}), a close relative of the maximum-likelihood

294: estimator, $\mathbfss{S}^{(ML)} = \frac {n-1} {n} \mathbfss{S}$.

295: These estimators work well in the regime where the number of

296: repeat observations, $n$, is much greater than the number of

297: parameters measured for each observation, $p$.  However, in

298: the regimes where $n \sim p$ or $n \ll p$ the covariance matrix

299: estimates become ill-conditioned and unstable during inversion,

300: which is necessary for optimal weighting of the data.

301: This is an indication that these estimators do not produce good

302: approximations of the true underlying covariance matrix in these regimes.

303: \citet{efron82} provides some insight into the

304: difference between maximum-likelihood as a {\it summarizer} and

305: as an {\it estimator}.  Maximum-likelihood is an excellent summarizer

306: of data in the sense of trying to represent the important statistical

307: information about a dataset in a small set of numbers.

308: Though maximum-likelihood is asymptotically optimal for estimation

309: in the limit of infinite data, the use of this summary of information

310: for the purpose of making estimates with a finite set of data

311: is not always the best option.

312: \citet{stein56} proved that one can construct estimators in high-dimensional

313: ($d \geq 3$) inference problems that outperform maximum-likelihood

314: estimators in the sense of minimizing the {\it total} mean squared error.

315: Maximum-likelihood produces the best estimates of individual

316: parameters, but the alternatives can often reduce the error on many

317: of the parameters while only slightly increasing the error on a few,

318: resulting in an overall improvement.

319: \citet{stein56} also showed that the maximum-likelihood estimator has

320: the best performance among estimators that transform correctly

321: under translation, implying that any estimator that outperforms

322: maximum-likelihood will necessarily involve an arbitrary choice.

323:

324: \citet{ss05} employ a method known as {\it shrinkage estimation}

325: to construct covariance matrices for functional genomics

326: measurements in the $n \ll p$ regime.  Their technique optimally combines

327: a high-dimensional estimate that has little or no bias with a

328: low-dimensional estimate that may be biased but has much less variance.

329: The result minimizes the total mean squared error, which is the sum

330: of bias (squared) and variance.

331: They argue that their method can also perform some amount of regularization,

332: resulting in a covariance matrix that has a full set of

333: positive eigenvalues and is well-conditioned (i.e., the ratio

334: of the largest to smallest eigenvalue is not so large that inversion

335: becomes unstable).

336: They employ a lemma from \citet{lw03} to analytically calculate

337: the optimal linear combination of the low- and high-dimensional estimates.

338:

339: In this paper our goal is to provide a simple recipe for using shrinkage

340: estimation to improve the covariance matrix of the

341: matter power spectrum from a limited number of simulations over the

342: ubiquitous sample covariance estimator.

343: Our method aims to reduce the total noise

344: while retaining as much information about real covariance in the

345: simulations as possible.

346: Shrinkage estimation achieves this by optimally combining a theoretical

347: model with the empirical estimate.

348: We will assess the improvements our method offers by examining the

349: performance of the covariance matrices through inversion and

350: use in cosmological parameter estimation.

351: Although we focus on the matter power spectrum, the shrinkage

352: technique is relevant for many studies in large-scale structure

353: and cosmology.

354:

355: The outline of this paper is as follows:

356: in Section~\ref{sec:shrink} we introduce shrinkage estimation and

357: describe its application to covariance matrices.

358: Section~\ref{sec:toy} applies the shrinkage technique to several toy

359: problems before moving to a more complicated example involving

360: galaxy clustering.

361: We describe our technique for measuring matter power spectra from N-body

362: simulations in Section~\ref{sec:sims}.

363: In Section~\ref{sec:results} we construct several estimates of

364: the power spectrum covariance matrix and compare their performance

365: by estimating cosmological parameters.

366: Finally we review our results, make recommendations, and discuss future

367: directions of this project in Section~\ref{sec:disc}.

368:

369:

370:

371: \section{Shrinkage Estimation}

372: \label{sec:shrink}

373:

374: \subsection{The Method}

375: \label{sub:shrink}

376:

377: Much of this section summarizes the introduction to shrinkage

378: estimation given in \citet{ss05}.

379: Suppose we are estimating a vector $\bmath{\psi}$ (of length $p$)

380: from a set of $n$ measurements using two different models.

381: One of the models has many free parameters and

382: produces an estimate, $\bmath{u}$, with little (or no) bias, but

383: the variance may be significant due to the number of free parameters.

384: The second model (called the {\it target}) has many fewer (or no) free

385: parameters and produces an estimate, $\bmath{t}$, which will have smaller

386: variance but may be biased.

387: We construct a new estimate, $\bmath{u}^{\star}$,

388: from a linear combination of these two models, given by

389: %

390: \begin{equation}

391: \bmath{u}^{\star} = \lambda \bmath{t} + (1-\lambda) \bmath{u}

392: \label{eq:ustar}

393: \end{equation}

394: %

395: where $\lambda \in [0,1]$ is called the {\it shrinkage intensity}.

396: The question now becomes how to choose $\lambda$ in an optimal way.

397: A common way to optimize an estimator is to minimize the

398: expected mean squared error, given by the risk function

399: %

400: \begin{equation}

401: R(\lambda) =

402: \left\langle \sum_{i=1}^{p} (u^{\star}_i - \psi_i)^2 \right\rangle

403: \label{eq:mse}

404: \end{equation}

405: %

406: where the angle brackets indicate the expectation value.

407: \citet{lw03} introduced an analytic solution for the optimal

408: shrinkage intensity, $\lambda^{\star}$.

409: Prior to this solution shrinkage estimation was much less practical

410: because numerically complicated and expensive methods were necessary

411: to find the optimal shrinkage intensity.

412: The analytic solution is

413: %

414: \begin{equation}

415: \lambda^{\star} =

416: \frac {\sum_{i=1}^{p} {\rm Var}(u_i) - {\rm Cov}(t_i,u_i)

417: - {\rm Bias}(u_i) \langle t_i-u_i \rangle}

418: {\sum_{i=1}^{p} \langle (t_i-u_i)^2 \rangle}

419: \label{eq:lstar}

420: \end{equation}

421: %

422: where ${\rm Var}$, ${\rm Cov}$, and ${\rm Bias}$ are the true

423: variance, covariance, and bias, respectively.  For a practical

424: estimator, \citet{ss05} suggest estimating $\hat{\lambda}^{\star}$ as

425: %

426: \begin{equation}

427: \hat{\lambda}^{\star} = \frac

428: {\sum_{i=1}^{p} \widehat{\rm Var}(u_i) - \widehat{\rm Cov}(t_i,u_i)

429: - \widehat{\rm Bias}(u_i) (t_i-u_i)}

430: {\sum_{i=1}^{p} (t_i-u_i)^2}

431: \label{eq:estlstar}

432: \end{equation}

433: %

434: where $\widehat{\rm Var}$, $\widehat{\rm Cov}$, and $\widehat{\rm Bias}$

435: are the unbiased sample

436: estimates of ${\rm Var}$, ${\rm Cov}$, and ${\rm Bias}$,

437: respectively.

438: The ${\rm Bias}$ term can be ignored if $\bmath{u}$ is an

439: unbiased estimator.

440:

441:

442:

443: \tabletoy

444:

445:

446:

447: \subsection{Application to Covariance Matrices}

448: \label{sub:shrinkcov}

449:

450: We now specialize the shrinkage estimation technique to covariance matrices.

451: Suppose we have $n$ sets of data and we measure a data vector

452: $\bmath{x}$ of length $p$ for each of them.

453: Let $x^{(k)}_i$ be the $k^{\rm th}$ (of $n$ total) observation

454: of the $i^{\rm th}$ (of $p$ total) element of the data vector.

455: The estimated empirical mean is given by

456: $\overline{x}_i = \frac {1} {n} \sum_{k=1}^{n} x^{(k)}_i$.

457: We define

458: %

459: \begin{eqnarray}

460: \label{eq:wkij}

461: W^{(k)}_{ij} & = &

462: (x^{(k)}_i - \overline{x}_i)(x^{(k)}_j - \overline{x}_j), \\

463: \label{eq:wij}

464: \overline{W}_{ij} & = &

465: \frac {1} {n} \sum_{k=1}^{n} W^{(k)}_{ij}

466: \end{eqnarray}

467: %

468: and write the unbiased empirically estimated covariance matrix

469: of the data, $\mathbfss{S}$, as

470: %

471: \begin{equation}

472: S_{ij} = \widehat{\rm{Cov}}(x_i,x_j)

473: = \frac {n} {n-1} \overline{W}_{ij}.

474: \label{eq:wcov}

475: \end{equation}

476: %

477: Explicit substitution results in the usual

478: %

479: \begin{equation}

480: S_{ij} = \frac {1} {n-1} \sum_{k=1}^{n}

481: (x^{(k)}_i - \overline{x}_i)(x^{(k)}_j - \overline{x}_j).

482: \label{eq:cov}

483: \end{equation}

484: %

485: Similarly we can estimate the covariance of the elements

486: of the covariance matrix of the data, given as

487: %

488: \begin{equation}

489: \widehat{\rm{Cov}}(S_{ij},S_{lm})

490: = \frac {n} {(n-1)^3} \sum_{k=1}^{n}

491: (W^{(k)}_{ij} - \overline{W}_{ij})

492: (W^{(k)}_{lm} - \overline{W}_{lm})

493: \label{eq:covcov}

494: \end{equation}

495: %

496: with the variance of an individual entry given by

497: $\widehat{\rm{Var}}(S_{ij}) = \widehat{\rm{Cov}}(S_{ij},S_{ij})$.

498: For shrinkage estimation we let $\mathbfss{S}$ take the role of

499: $\bmath{u}$ and supply a target covariance matrix $\mathbfss{T}$ to take

500: the role of $\bmath{t}$.

501: The optimal shrinkage intensity and resulting covariance

502: matrix $\mathbfss{C}$ are given by

503: %

504: \begin{eqnarray}

505: \label{eq:lshat}

506: \hat{\lambda}^{\star} & = & \frac

507: {\sum_{i,j} \widehat{\rm Var}(S_{ij})

508: - \widehat{\rm Cov}(T_{ij},S_{ij})}

509: {\sum_{i,j} (T_{ij} - S_{ij})^2}, \\

510: \label{eq:scov}

511: \mathbfss{C} & = & \hat{\lambda}^{\star}\mathbfss{T} +

512: (1-\hat{\lambda}^{\star}) \mathbfss{S}.

513: \end{eqnarray}

514: %

515: If the $\hat{\lambda}^{\star}$ estimate is greater than

516: one, then $\hat{\lambda}^{\star} = 1$ is enforced, implying that only

517: the target matrix is used.  If the $\hat{\lambda}^{\star}$ estimate

518: is less than zero, then $\hat{\lambda}^{\star} = 0$ is enforced,

519: implying that only the empirical covariance matrix is used.

520: The numerator of equation~\ref{eq:lshat} implies that as the variances

521: of the elements of $\mathbfss{S}$ decrease

522: (e.g., approaching the $n \gg p$ regime)

523: the shrinkage estimate smoothly approaches the empirical covariance.

524: The denominator in equation~\ref{eq:lshat} also ensures that if the

525: chosen target matrix is very different from the empirical covariance

526: then the estimator will tend to the empirical covariance.

527: Thus an inappropriate choice of target should not make the final results

528: any worse, but there will be little gain in efficiency.

529:

530: The $\widehat{\rm Cov}(T_{ij},S_{ij})$ term in the numerator of

531: equation~\ref{eq:lshat} accounts for the fact that $\mathbfss{S}$ and

532: $\mathbfss{T}$ are estimated from the same data.  If $\mathbfss{T}$

533: is fixed then the term is zero.  If some of the elements of $\mathbfss{T}$

534: are taken directly from $\mathbfss{S}$ then the $\widehat{\rm Cov}$

535: term exactly cancels the $\widehat{\rm Var}$ term and those elements

536: do not affect the estimate of $\hat{\lambda}^{\star}$.

537: Tab. 2 of \citet{ss05} shows a variety of worked examples for

538: common targets that may or may not depend on the empirically estimated

539: covariance.  In their examples they ignore moments of higher

540: order than $\widehat{\rm Var}(S_{ij})$.

541:

542:

543:

544: \section{Toy Examples}

545: \label{sec:toy}

546:

547:

548:

549: \figtoy

550:

551:

552:

553: Before tackling large-scale structure measurements

554: we present the application of shrinkage estimation to toy models.

555: In Section~\ref{sub:toymean} we employ shrinkage to estimate

556: the mean of a noisy vector.

557: For that example we can calculate the expected shrinkage intensity

558: analytically.

559: In Section~\ref{sub:toycov} we present a toy example of covariance

560: estimation.

561:

562:

563:

564: \subsection{Estimating the Mean of a Noisy Vector}

565: \label{sub:toymean}

566:

567: In our first toy example of the shrinkage technique we will estimate

568: the mean of a vector from a set of noisy realizations using the

569: formalism from Section~\ref{sub:shrink}.

570: Given an input mean vector, $\bmath{\psi}$, the distribution for

571: an element of the noisy realization is $X_{i} = N(\psi_i,\sigma_i^2)$,

572: a normal distribution with mean $\psi_i$ and variance $\sigma_i^2$.

573: The $i^{\rm th}$ element (of $p$ total) of the $k^{\rm th}$ realization

574: (of $n$ total) is written $x_i^{(k)}$, and our high-dimensional

575: estimate of the mean is given by

576: %

577: \begin{equation}

578: \bmath{u} = \frac {1} {n} \sum_{k=1}^{n} \bmath{x}^{(k)}.

579: \label{eq:u}

580: \end{equation}

581: %

582: Given a fixed target, $\bmath{t}$, we will calculate the expected value

583: for the optimal shrinkage intensity, $\langle \lambda^{\star} \rangle$,

584: as given by equation~\ref{eq:lstar}.  If we assume the numerator and

585: denominator are uncorrelated we can write

586: %

587: \begin{equation}

588: \langle \lambda^{\star} \rangle =

589: \left \langle

590: \frac {\sum_{i=1}^{p} {\rm Var}(u_i)}

591: {\sum_{i=1}^{p} \langle (t_i-u_i)^2 \rangle}

592: \right \rangle

593: \approx

594: \frac {\sum_{i=1}^{p} \left \langle {\rm Var}(u_i) \right \rangle}

595: {\sum_{i=1}^{p} \left \langle (t_i-u_i)^2 \right \rangle}.

596: \label{eq:explstar}

597: \end{equation}

598: %

599: From our definitions we know that

600: \begin{eqnarray}

601: \label{eq:expx}

602: \langle x_i^{(k)} \rangle & = & \psi_i, \\

603: \label{eq:expx2}

604: \langle x_i^{(k)} x_j^{(l)} \rangle & = &

605: \psi_i \psi_j + \delta_{ij} \delta_{kl} \sigma_i^2

606: \end{eqnarray}

607: %

608: which we can use to find the expectation values of our estimators

609: %

610: \begin{eqnarray}

611: \label{eq:expu}

612: \langle u_i \rangle & = &

613: \left \langle \frac {1} {n} \sum_{k=1}^{n} x_i^{(k)} \right \rangle

614: = \psi_i,\\

615: \label{eq:expu2}

616: \langle u_i^2 \rangle & = & \left \langle

617: \frac {1} {n} \sum_{k=1}^{n} x_i^{(k)}

618: \frac {1} {n} \sum_{l=1}^{n} x_i^{(l)}

619: \right \rangle

620: = \psi_i^2 + \frac {1} {n} \sigma_i^2.

621: \end{eqnarray}

622: %

623: Fixing the values for $\psi_i$, $\sigma_i$, and $t_i$ to be

624: $\psi$, $\sigma$, and $t$, respectively, we predict the shrinkage

625: intensity to be

626: %

627: \begin{equation}

628: \langle \lambda^{\star} \rangle \approx \frac {\sigma^2}

629: {n(t-\psi)^2 + \sigma^2} .

630: \label{eq:explstarfinal}

631: \end{equation}

632: %

633: This formula displays the behavior we expect.  As $n$ becomes large,

634: $\langle \lambda^{\star} \rangle$

635: approaches zero and we will rely mostly on the empirical

636: estimate of the mean.  As $t$ approaches $\psi$,

637: $\langle \lambda^{\star} \rangle$

638: approaches unity as we have chosen the perfect target.

639: We can also calculate the expected mean squared errors for the

640: high-dimensional, low-dimensional, and shrinkage estimators

641: and find they are given by

642: %

643: \begin{eqnarray}

644: \label{eq:mseu}

645: \langle {\rm MSE}(\bmath{u}) \rangle & = & p \frac {\sigma^2} {n},\\

646: \label{eq:mset}

647: \langle {\rm MSE}(\bmath{t}) \rangle & = & p(\psi-t)^2,\\

648: \label{eq:mseustar}

649: \langle {\rm MSE}(\bmath{u}^{\star}) \rangle & = &

650: p \left [ \langle \lambda^{\star} \rangle^2 (\psi-t)^2 +

651: (1-\langle \lambda^{\star} \rangle)^2 \frac {\sigma^2} {n} \right ],

652: \end{eqnarray}

653: %

654: respectively.

655:

656: We generated simulations to check the

657: validity of our analytic predictions.  Each simulation starts by

658: generating the $n$ realizations $\bmath{x}^{(k)}$.  The sample mean

659: $\bmath{u}$ is calculated for the set of realizations and used with the

660: target $\bmath{t}$ to generate the shrinkage estimate $\bmath{u}^{\star}$.

661: The shrinkage intensity, $\hat{\lambda}^{\star}$,

662: for the set of realizations is calculated using

663: equation~\ref{eq:estlstar} and jackknife resampling of equation~\ref{eq:u}

664: is used to estimate the $\widehat{\rm Var}(u_i)$ term.

665: We generated 100 simulations for each set of input parameters we

666: tested and the results are shown in Table~\ref{tab:toy}.

667: The results of our simulations match the analytical predictions

668: reasonably well and also show that the shrinkage estimator outperforms

669: both the empirical estimate and the target in a mean squared error sense.

670:

671:

672:

673: \subsection{Estimating the Covariance}

674: \label{sub:toycov}

675:

676: Our second toy model applies the formalism of Section~\ref{sub:shrinkcov}

677: for shrinkage estimation of a covariance matrix.

678: In this case the analytical predictions would be prohibitively

679: tedious, so we present only the results of simulations.

680: Each simulation starts by generating the realizations,

681: $\bmath{x}^{(k)}$, where the distribution of each element is

682: $X_i^{(k)} = N(0,\sigma^2)$.

683: From each set of realizations we construct two estimates of the

684: covariance matrix.

685: The Monte Carlo (MC) method uses the standard empirical covariance estimate

686: as given by equation~\ref{eq:cov}.

687: We also compute a shrinkage version of the Monte Carlo estimator (MC+S) from

688: equation~\ref{eq:scov} using the identity matrix as a target.

689: The two estimators are compared by inspecting the eigenvalue spectra

690: and by computing the mean squared error between the

691: estimate and the known true covariance matrix,

692: which is $\sigma^2\mathbfss{I}$ where $\mathbfss{I}$ is the identity matrix.

693: The mean squared error between two matrices $\mathbfss{A}$ and $\mathbfss{B}$

694: is given by the squared Frobenius norm of the difference,

695: %

696: \begin{equation}

697: ||\mathbfss{A}-\mathbfss{B}||^2_{F}

698: = \sum_{i,j} |A_{ij}-B_{ij}|^2.

699: \label{eq:frobenius}

700: \end{equation}

701:

702: For our simulations we fixed $p=18$ and $\sigma=1.1$ and compared

703: the performance of the covariance estimators as a function of the

704: number of realizations, $n$.  We ran 100 simulations for each value

705: of $n$ and averaged the results.

706: For $n = 40, 400, 4000$ the shrinkage intensities were

707: $\hat{\lambda}^{\star} = 0.92 \pm 0.08$, $0.61 \pm 0.06$, $0.136 \pm 0.006$.

708: Figure~\ref{fig:toy} shows the results for the two estimators as well

709: as the results when only using the target.

710: The shrinkage estimator gives a mean squared error that is equal to or

711: less than both the empirical covariance and the target by itself for all $n$.

712: The eigenvalue spectrum for the shrinkage estimator is also closer to the

713: correct spectrum than for the empirical covariance at fixed $n$.

714: This indicates that using the shrinkage estimator is in some sense

715: equivalent to using more simulations.

716:

717:

718:

719: \section{Simulations and Measurements}

720: \label{sec:sims}

721:

722: We divide the Hubble Volume \citep{hv} $\Lambda$CDM simulation into

723: 4096 sub-volumes, each with a sidelength of $187.5~{\rm Mpc/h}$.

724: We measure the power spectrum in each sub-volume using a simple code

725: that implements the same basic algorithm as presented in \citet{espice}

726: and employs a Fast Fourier Transform (FFT).

727: The FFT was performed on a $64^3$ grid and the power

728: spectrum was measured in 18 logarithmically-spaced bins from

729: $k=0.0367$ to $k=0.920~{\rm h/Mpc}$.

730: For plotting and model comparison purposes we take each bin center to be

731: the average $k$ of the modes that actually fall in that bin.

732: The power spectrum calculated on a pixelized grid is the product of

733: the true power spectrum with (the square of) the Fourier transform of

734: the pixel window function.

735: Our measured power spectrum can be written as

736: $P_{\rm meas}(\bmath{k}) = P(\bmath{k})/|\tilde{W}_p(\bmath{k})|^2$

737: where the Fourier transform of the pixel window function, $\tilde{W}_p$,

738: is given by

739: %

740: \begin{eqnarray}

741: \label{eq:f}

742: \tilde{f}(k) & = &

743: \frac {{\rm sin}(\pi k / (2 \pi/L))} {\pi k / (2 \pi/L)},\\

744: \label{eq:w}

745: |\tilde{W}_p(\bmath{k})|^2 & = & \tilde{f}^2(k_x)

746: \tilde{f}^2(k_y) \tilde{f}^2(k_z),

747: \end{eqnarray}

748: %

749: and $L$ is the length of the side of a pixel.

750: Our pixels are $2.93~{\rm Mpc/h}$ on each side.

751:

752: We used {\sevensize CAMB} \citep{camb} to calculate a model transfer function

753: and matter power spectrum using cosmological parameter values to match

754: the Hubble Volume \citep{hv} $\Lambda$CDM simulation:

755: $\Omega_m = 0.3$,

756: $\Omega_{\Lambda} = 0.7$,

757: $h = {\rm H}_0/100~{\rm km~s^{-1} Mpc^{-1}} = 0.7$,

758: $\Omega_bh^2 = 0.0196$,

759: and $n_s = 1$.

760: We normalized the resulting power spectrum, $P_{\rm CAMB}(k)$, to have a

761: (linear) $\sigma_8 = 0.9$ and then applied the non-linear

762: {\sevensize HALOFIT} \citep{halofit} correction.

763: Thus the normalization matches linear theory for large scales

764: (low $k$), but the shape includes non-linear clustering corrections

765: at smaller scales (higher $k$).

766:

767: The measured power spectrum is the convolution of the true power

768: spectrum with the squared Fourier transform of the survey window function.

769: When testing a model for the power spectrum we must perform this

770: convolution before comparing it to our measurement.  The

771: convolved power spectrum, $P^W$, is given by

772: %

773: \begin{equation}

774: P^W(\bmath{k}) = \int d\bmath{q}~P(\bmath{k}-\bmath{q})

775: |\tilde{W}_s(\bmath{q})|^2

776: \end{equation}

777: %

778: where $P$ is the input (theoretical) power spectrum.

779: Because the survey is the same shape as our pixels we can write

780: the (normalized) survey window function in terms of the pixel window

781: function as $|\tilde{W}_s|^2 = (L/2\pi)^6|\tilde{W}_p|^2$

782: where $L$ is now the survey sidelength.

783: To simplify the convolution we used the

784: spherically averaged transform of the window function (calculated

785: via Monte Carlo) and reduced the convolution of the spherically

786: symmetric power spectrum to a two-dimensional integral, given by

787: %

788: \begin{equation}

789: P^W(k) \approx 2\pi \int_0^{\infty}dq~q^2 |\tilde{W}_s(q)|^2

790: \int_{-1}^{1}dx~P(\sqrt{k^2+q^2-2kqx}).

791: \end{equation}

792: %

793: In practice the integral over $dx$ was done with Romberg integration

794: and the integration over $dq$ was performed with the extended

795: trapezoidal rule at the $k$ values where we had calculated the

796: spherically averaged $|\tilde{W}_s(\bmath{k})|^2$.

797: Power spectrum values inside the integral were evaluated using cubic

798: spline interpolation in $({\rm log}~k, {\rm log}~P)$ and power law

799: extrapolation at low and high $k$.

800: Fig.~\ref{fig:pk} shows the input theoretical power spectrum

801: $\left( P_{\rm CAMB} \right)$,

802: the convolved model power spectrum $\left( P^W_{\rm CAMB} \right)$,

803: and the average of the measured power spectra from all of the sub-volumes

804: $\left( \langle P_{\rm meas} \rangle \right)$.

805: The inset shows the spherically averaged survey window

806: function.

807:

808:

809:

810: \figpk

811:

812:

813:

814: We wish to evaluate our covariance estimates in the context of

815: cosmological parameter fitting.  Given the relatively small volume of

816: each of our sub-volumes we decided to fit for only the power spectrum

817: normalization in each sub-volume, parameterized by $\sigma_8$ (linear).

818: We write the log-likelihood (for a fixed covariance matrix) as

819: %

820: \begin{equation}

821: \bmath{d} (\sigma_8) = P_{\rm meas}(k) - \left ( \frac {\sigma_8}

822: {0.9} \right )^2 P^W_{\rm CAMB}(k,\sigma_8=0.9 ),

823: \label{eq:lnld}

824: \end{equation}

825: %

826: \begin{equation}

827: {\rm log} {\mathcal L} (\sigma_8) \propto - \frac {1} {2}

828: \bmath{d}^T \mathbfss{C}^{-1} \bmath{d}

829: = - \frac{1}{2} \chi^2

830: \label{eq:lnl}

831: \end{equation}

832: %

833: where $\mathbfss{C}$ is the covariance matrix being tested.

834: We use $\sigma_8$ to fit for the amplitude of the power

835: spectrum in the linear regime, and we assume that the difference in

836: shape in the non-linear regime is minimal for values of $\sigma_8$

837: that are close to our fiducial model of $\sigma_8=0.9$.

838: We numerically find the maximum likelihood value, $\hat{\sigma}_8$,

839: and define the upper and lower one sigma error bars, $\pm \Delta$,

840: where

841: \[

842: {\rm log} {\mathcal L}(\hat{\sigma}_8 \pm \Delta)

843: -

844: {\rm log} {\mathcal L}(\hat{\sigma}_8)

845: = -1/2.

846: \]

847:

848: Our likelihood analysis assumes that the bandpower measurements of

849: the power spectrum are normally distributed.  For most of our bandpowers

850: this is a good approximation, but there are several for which an

851: offset lognormal distribution \citep{bjk00} would be more accurate.

852: This could cause some bias in our recovered value of $\hat{\sigma}_8$,

853: but it should not affect our assessment of the relative performance

854: of different covariance estimators.  The covariance matrices for

855: the normal and offset lognormal cases are related by a change

856: of variables.

857:

858:

859:

860: \section{Covariance Matrix Estimates}

861: \label{sec:results}

862:

863: In this section we compare the performance of several techniques

864: for estimating the covariance matrix of our power spectrum measurements.

865: As a baseline comparison we calculate the covariance matrix from all

866: 4096 sub-volumes using equation~\ref{eq:cov}.  For each method of

867: covariance matrix estimation we measure $\hat{\sigma}_8$ and

868: $\Delta$ using the power spectra measured from the sub-volumes

869: and show the distributions of the $\hat{\sigma}_8$ and $\Delta$ values.

870: The means and standard deviations of those quantities are presented

871: in Table~\ref{tab:s8}.

872:

873:

874:

875: \tableresults

876:

877:

878:

879: \subsection{Reference}

880: \label{sub:ref}

881:

882: As a reference we estimate the covariance matrix of our power spectrum

883: measurement by applying equation~\ref{eq:cov} to our measurements from

884: all 4096 sub-volumes.

885: There are $18 \times 19/2 = 171$ independent elements in the covariance

886: matrix, thus we are in the regime where we

887: have many more realizations than elements to be estimated and the

888: usual covariance estimator should give reasonable results.

889: The solid black line in

890: Fig.~\ref{fig:s8mc} is a histogram of the results of estimating

891: $\hat{\sigma}_8$ using the reference covariance matrix and the

892: power spectra measured from each of the 4096 sub-volumes.

893: The upper panel shows the distribution of $\hat{\sigma}_8$ and the

894: lower panel shows the distribution of the error bar estimates

895: (absolute value of both upper and lower).

896: The means and standard deviations of these histograms are presented

897: in Table~\ref{tab:s8}.

898: The agreement between the width of the best-fit

899: distribution, $\sigma_{\hat{\sigma}_8}$,

900: and the mean error bar estimate, $\langle \Delta \rangle$,

901: indicates that the covariance

902: matrix is properly estimating the likelihood distribution.

903: The width of the error bar distribution, $\sigma_\Delta$,

904: is small, indicating that the error bar estimate is usually

905: very close to the correct value.

906: The mean of the maximum-likelihood estimates,

907: $\langle \hat{\sigma}_8 \rangle$,

908: is $0.870$ instead of our known input value of $0.9$,

909: but we know that our modeling of the power

910: spectrum into the non-linear regime is not perfect so this small

911: offset is not worrisome for our purposes.

912: For the remainder of this section we assume that the results using the

913: reference covariance matrix are a good approximation to those

914: that would be obtained using the true underlying covariance matrix.

915: See Section~\ref{sec:disc} for further discussions.

916:

917:

918:

919: \subsection{Monte Carlo}

920: \label{sub:mc}

921:

922:

923:

924: \figsmc

925:

926:

927:

928: Next we test covariance matrices estimated with

929: equation~\ref{eq:cov} but using a small number of sub-volumes,

930: which we call the Monte Carlo method.

931: We use sets of 40 (randomly chosen, non-overlapping)

932: sub-volumes to test the regime where we have more simulations

933: than diagonal elements of the covariance matrix (18), but fewer

934: simulations than independent elements (171).

935: From 4096 sub-volumes we can create 102 separate covariance matrix

936: estimates.

937: To obtain smooth histograms in Fig.~\ref{fig:s8mc} we test each covariance

938: matrix estimate against 40 randomly chosen $P(k)$ measurements

939: from other sub-volumes.

940: The statistics in Table~\ref{tab:s8} are calculated

941: using one randomly chosen $P(k)$ measurement per covariance matrix.

942: The statistics do not depend on how many randomly chosen $P(k)$

943: measurements are used for each covariance matrix estimate.

944: The upper panel of Fig.~\ref{fig:s8mc} shows that the distribution

945: of $\hat{\sigma}_8$ is too wide, indicating that a parameter

946: analysis using a covariance matrix estimated with this method will

947: often return a value far from the mean.

948: The lower panel of Fig.~\ref{fig:s8mc} shows that the error bar

949: is typically underestimated by $\sim 25\%$.

950: The width of the estimated error

951: bar distribution is also much wider than for the reference covariance

952: matrix estimate.  These effects are the result of using a very noisy

953: estimate of the covariance matrix.

954:

955:

956:

957: \subsection{Monte Carlo + Shrinkage}

958: \label{sub:mcs}

959:

960:

961:

962: \figcov

963:

964:

965:

966: Our first test of the shrinkage approach is to apply shrinkage

967: estimation to the Monte Carlo method described in the previous

968: section.  First we need to choose a target covariance matrix,

969: $\mathbfss{T}$.

970: In linear theory we expect the covariance matrix of the power spectrum

971: to be diagonal.  Off-diagonal terms arise in practice from

972: the survey window function and non-linear clustering effects.

973: We use a diagonal target matrix to simulate a situation where we have some

974: idea about the structure of the covariance matrix but we know our model is

975: not exact.  Our target matrix takes the form

976: %

977: \begin{equation}

978: T_{ij} = \left\{

979: \begin{array}{ll}

980: 0 & i \neq j \\

981: 2 [P^W_{\rm CAMB} (k_i)]^2/N_i & \langle k_i \rangle \leq 0.14~{\rm h/Mpc}\\

982: S_{ii} & \langle k_i \rangle > 0.14~{\rm h/Mpc}

983: \end{array}

984: \right.

985: \label{eq:target}

986: \end{equation}

987: %

988: where we use a different method in the linear and non-linear

989: regimes.  For bins in the linear regime we use our convolved model

990: for the power spectrum, $P^W_{\rm CAMB}(k_i)$, and the number

991: of $k$ modes in each bin, $N_i$, to predict the covariance

992: \citep*[e.g.,][]{hrs}.  In the non-linear regime we use the diagonal

993: of the empirically estimated covariance from the 40 sub-volumes.

994: Fig.~\ref{fig:cov} shows the diagonal elements of the reference

995: covariance matrix, the linear theory model, and the 102 target

996: matrices.  Inset is the reference correlation matrix,

997: $R_{ij} = C_{ij}/\sqrt{C_{ii}~C_{jj}}$,

998: showing that the covariance matrix is strongly diagonal until

999: well into the non-linear regime.

1000: Our results are robust to changes in the non-linear cutoff

1001: by several bins in either direction.

1002:

1003: We calculate the optimal shrinkage intensity, $\hat{\lambda}^{\star}$,

1004: for each of the 102 Monte Carlo estimates, $\mathbfss{S}$,

1005: according to equation~\ref{eq:lshat}.  We apply the

1006: $\widehat{\rm Cov}(T_{ij},S_{ij})$ term to the diagonal elements of

1007: $\mathbfss{T}$ that are taken from $\mathbfss{S}$.

1008: We find values for $\hat{\lambda}^{\star}$ distributed evenly

1009: between $0.1$ and $1.0$ (see Fig.~\ref{fig:lambda}).

1010: We produce each of our 102 new estimates of the covariance matrices,

1011: $\mathbfss{C}$, from a linear combination of $\mathbfss{S}$ and

1012: $\mathbfss{T}$ according to equation~\ref{eq:scov}.

1013: We perform the same tests as

1014: described in section~\ref{sub:mc} and compare the results

1015: in Fig.~\ref{fig:s8mc} and Table~\ref{tab:s8}.

1016: The most striking result is that the maximum-likelihood estimates,

1017: $\hat{\sigma}_8$, follow a very similar distribution to that for

1018: the reference matrix, indicating that the parameter values are now

1019: correctly estimated.

1020: The error bars are still underestimated, but the distribution is

1021: very similar to that for the normal Monte Carlo estimator.

1022:

1023: Fig.~\ref{fig:s8mc} and Table~\ref{tab:s8} also show results

1024: using only the diagonal target.  The values of $\hat{\sigma}_8$

1025: follow the correct distribution, indicating that the estimated

1026: parameter values are fine, but the error bars are much more

1027: severely underestimated.  This is expected because our target

1028: matrix is diagonal and we are using information from far enough into

1029: the non-linear regime to know that we are missing some important

1030: covariance.

1031: It is now clear that the estimated error bar distribution of the shrinkage

1032: estimator is a combination of the Monte Carlo and target distributions.

1033: The shrinkage intensity can serve as a proxy for whether the estimated

1034: error bars are likely to be similar to those for the Monte Carlo

1035: or the target.  See Section~\ref{sec:disc} for further discussions.

1036:

1037: In summary, the shrinkage of the empirically estimated covariance

1038: against our target matrix outperforms either matrix by itself.

1039: Using just the empirically estimated covariance brings in too much

1040: noise which causes error in the estimation of $\hat{\sigma}_8$

1041: itself.  Using only the diagonal target mitigates the noise problems,

1042: but ignores important covariance.  The shrinkage estimator

1043: uses the best aspects of both, keeping the part of the covariance

1044: that is well estimated but drastically reducing the total amount

1045: of noise.

1046:

1047:

1048:

1049: \subsection{Jackknife}

1050: \label{sub:jk}

1051:

1052:

1053:

1054: \figsjk

1055:

1056:

1057:

1058: Recently a resampling technique known as the jackknife method has been

1059: used to estimate covariance matrices for large-scale structure

1060: measurements from the data set itself.

1061: The method works by dividing the data volume

1062: into $n$ cells of roughly the same size and recalculating the

1063: measurement $n$ times, each time with a different cell left out.

1064: The variance between the measurements can be adjusted to try and

1065: calculate the variance corresponding to the entire volume.  In practice

1066: one replaces equation~\ref{eq:wkij} with

1067: %

1068: \begin{equation}

1069: W^{(k)}_{ij} = \frac {(n-1)^2} {n}

1070: (x^{(k)}_i - \overline{x}_i)(x^{(k)}_j - \overline{x}_j)

1071: \label{eq:jkwkij}

1072: \end{equation}

1073: %

1074: and then calculates the covariance matrix with equation~\ref{eq:wcov},

1075: resulting in the usual

1076: %

1077: \begin{equation}

1078: S_{ij} = \frac {n-1} {n} \sum_{k=1}^{n}

1079: (x^{(k)}_i - \overline{x}_i)(x^{(k)}_j - \overline{x}_j).

1080: \label{eq:covjk}

1081: \end{equation}

1082: %

1083: We divided each sub-volume into $3^3 = 27$ cells and modified our

1084: code to calculate the power spectrum with one cell removed.  Our

1085: code incorporates a volume correction, the lowest order edge

1086: correction in Fourier space.  For each of the 4096

1087: sub-volumes we estimate $\hat{\sigma}_8$ and $\Delta$ using the power

1088: spectrum and the jackknife covariance matrix from the same

1089: sub-volume.  The results are compared to the reference case in

1090: Fig.~\ref{fig:s8jk} and listed in Table~\ref{tab:s8}.

1091:

1092: The distribution of $\hat{\sigma}_8$

1093: is much wider than for the reference covariance matrix, indicating that

1094: the noise in the covariance estimate causes incorrect parameter

1095: estimation.  This is similar to the result for the Monte Carlo method.

1096: The jackknife estimates of $\hat{\sigma}_8$ also peak at a noticeably lower

1097: value than for the reference covariance, though the two

1098: histograms are in roughly $1 \sigma$ agreement given the width of

1099: the distribution for the jackknife case.

1100: The error bars estimated

1101: in the jackknife case are typically underestimated by a factor

1102: of almost three compared to the reference covariance matrix and nearly

1103: an order of magnitude compared to the actual width of the

1104: jackknife distribution of $\hat{\sigma}_8$.

1105:

1106:

1107:

1108: \subsection{Jackknife + Shrinkage}

1109: \label{sub:jks}

1110:

1111: Our final method of estimating the covariance matrix

1112: applies shrinkage to the jackknife estimator to see if we can achieve

1113: enhanced robustness.

1114: We use the same method to construct a target matrix as described in

1115: section~\ref{sub:mcs}, using the diagonal of the jackknife

1116: estimated covariance matrix in the non-linear regime.

1117: We calculate the shrinkage intensity, $\hat{\lambda}^{\star}$,

1118: and covariance estimate

1119: for each of the 4096 covariance matrices as described in

1120: section~\ref{sub:shrinkcov}, but substituting

1121: equation~\ref{eq:jkwkij} for equation~\ref{eq:wkij} throughout.

1122: We find values for $\hat{\lambda}^{\star}$ distributed evenly

1123: between $0.0$ and $1.0$ (see Fig.~\ref{fig:lambda}).

1124: We run the same tests as described in section~\ref{sub:jk} and

1125: the results are shown in Fig.~\ref{fig:s8jk} and Table~\ref{tab:s8}.

1126:

1127: As with the shrinkage version of the Monte Carlo estimator, the

1128: shrinkage version of the jackknife estimator shows significant

1129: improvement in the actual estimated parameter, $\hat{\sigma}_8$.

1130: However, the central value and width are not quite as good as for

1131: the reference case.  There is some improvement

1132: in the estimation of the error bar, though the error bars are still

1133: systematically underestimated by a factor of roughly two

1134: compared to the reference.

1135:

1136: Fig.~\ref{fig:s8jk} and Table~\ref{tab:s8} also show the results

1137: of estimating $\hat{\sigma}_8$ and $\Delta$ using only the diagonal

1138: targets used in the shrinkage version of the jackknife estimator.

1139: Again, the diagonal target matrix does well for estimating

1140: $\hat{\sigma}_8$ due to the lack of noise, but it gives the worst

1141: estimates of the error bars.

1142:

1143: In this case, the shrinkage version of the jackknife estimator

1144: did the best job of estimating the error bars, and it was only

1145: slightly worse than the diagonal approximation at recovering

1146: the distribution of $\hat{\sigma}_8$.  Again, shrinkage estimation

1147: is doing an excellent job of keeping information about covariance

1148: while reducing the total noise.

1149:

1150:

1151:

1152: \section{Discussions and Conclusions}

1153: \label{sec:disc}

1154:

1155: We have introduced shrinkage as a technique for improving estimates

1156: of the covariance matrix for power spectrum measurements.

1157: We tested our methods on dark matter simulations and showed

1158: improvement over the empirically estimated covariance matrix from

1159: a limited number of simulations or jackknife resamplings.

1160: In order to clearly assess the potential improvement from using

1161: shrinkage estimation, we chose an intentionally difficult scenario

1162: where traditional methods of estimating the covariance were unlikely

1163: to yield satisfactory results.

1164: All of these methods would perform better if we allowed ourselves

1165: more simulations per Monte Carlo estimate or if we did not push as

1166: far into the non-linear clustering regime.  The shrinkage technique

1167: would still outperform the other methods, but perhaps the differences

1168: would be less obvious.

1169:

1170: A good estimate of the covariance matrix of a power spectrum

1171: measurement is essential for extracting cosmological information via

1172: parameter fitting.  Including the covariance between different bins

1173: is a good step towards properly estimating the confidence intervals

1174: on cosmological parameters.  However, the increased number of free

1175: parameters of a full covariance estimate (as opposed to a diagonal

1176: approximation) can cause the covariance estimate to be noisy if only

1177: a relatively small number of simulations are available.  This noise

1178: can adversely affect the estimate of the parameter itself.  A diagonal

1179: approximation to the covariance can be more easily constrained with a

1180: limited number of simulations, leading to better estimates of the

1181: parameter values.  However, the confidence intervals can be severely

1182: underestimated if actual covariance is ignored.  Neither alternative

1183: is appealing.  If a similar measurement were performed with the

1184: two-point correlation function, the Fourier dual of the power spectrum,

1185: a full covariance matrix would be especially important as bins would be strongly

1186: correlated, even in the linear clustering regime.

1187: Realistic survey geometries will also cause additional covariance on large

1188: scales for the power spectrum.

1189:

1190: Shrinkage estimation is an optimal way of combining a model with many

1191: degrees of freedom and a model with few degrees of freedom

1192: to minimize the total error on the covariance estimate.

1193: In our example the shrinkage versions of the Monte Carlo and jackknife

1194: estimators clearly outperformed their counterparts without shrinkage,

1195: with the shrinkage version of the Monte Carlo estimator producing

1196: the best results.

1197: The lemma of \citet{lw03} as employed by \citet{ss05} allows a

1198: mathematically and numerically simple way of calculating the optimal

1199: shrinkage intensity.

1200: This means that there is minimal additional work required to use a

1201: shrinkage version of a covariance estimator.

1202: Shrinkage estimation can result in a massive improvement in the limit of

1203: a small number of simulations and will not adversely affect the

1204: covariance estimate in the limit of a large number of simulations.

1205: For these reasons we always recommend the use of the shrinkage versions

1206: of covariance estimators in all regimes.

1207:

1208: We briefly investigated the effects of shrinkage estimation in the

1209: limit of a large (though not infinite) number of simulations.

1210: We applied shrinkage estimation to our reference covariance matrix

1211: estimated from all 4096 sub-volumes using the target from

1212: equation~\ref{eq:target} and found an optimal shrinkage intensity

1213: $\hat{\lambda}^{\star} = 0.0096$.  This number is the same order as

1214: the relative noise we expect in each element of the matrix,

1215: $1/\sqrt{4096} = 0.0156$.  We then calculated the eigensystems of

1216: both matrices.  The dot products of the corresponding eigenvectors

1217: always exceeded $0.996$, indicating that

1218: they are essentially identical.  The (sorted) eigenvalue spectra

1219: are shown in Fig.~\ref{fig:eigen}.  The eigenvalues are the same

1220: to within $1\%$ for the first 10 eigenmodes.  After the tenth eigenmode

1221: the eigenvalues from the reference matrix become increasingly smaller

1222: compared to the shrinkage version.  By the final eigenmode the difference

1223: is $\sim 50\%$.  The shrinkage version of the reference matrix

1224: should be a more accurate estimate of the true underlying covariance

1225: matrix.

1226: The non-linear nature of matrix inversion can cause errors $\gg 1\%$ even

1227: when individual elements of the covariance matrix are estimated to $\sim 1\%$.

1228: We ran our parameter estimation test using the shrinkage version of the

1229: reference matrix and found that $\langle \hat{\sigma}_8 \rangle$ moved by

1230: less than $0.5\%$.

1231: This is small compared to the width of the distribution, which is $\sim 5\%$.

1232: The average minimum $\chi^2$ did improve from $52.4$ to $41.1$ with the

1233: shrinkage version of the covariance matrix, though this is still large

1234: for $18 - 1 = 17$ degrees of freedom.

1235: The remaining discrepancy is dominated by bias from problems with modeling

1236: the power spectrum into the non-linear regime or power loss in the

1237: simulation at smaller scales due to low resolution, not a grossly

1238: inaccurate estimate of the variances.

1239: The amplitude is mainly sensitive to smooth eigenmodes, which have

1240: large eigenvalues, so there is little change in the estimated value.

1241: Parameters that are more sensitive to the shape of the power spectrum may

1242: be more sensitive to the lower eigenvalue modes and show more than a

1243: $1\%$ change.

1244: The impact of these differences could be estimated with a study of

1245: the information content of the power spectrum covariance

1246: in terms of cosmological parameter confidences (e.g., \citealt{ns07}),

1247: but this is beyond the scope of this paper.

1248:

1249:

1250:

1251: \figeigen

1252:

1253:

1254:

1255: We employed a very simple diagonal target matrix in this paper,

1256: but better targets can clearly improve the efficiency of the shrinkage

1257: technique.  A much more realistic model for covariance on small scales

1258: could be constructed using the halo model.  For realistic measurements

1259: it may also be advantageous to model some of the effects of survey

1260: geometries, redshift-space distortions, and clustering bias.

1261: Targets that depend on a small number of free parameters may be very

1262: useful for some of these effects (e.g., clustering bias).

1263: Targets can also be developed for a wide range of large-scale

1264: structure measurements in addition to the power spectrum.

1265: The exploration of more sophisticated targets is beyond the scope of

1266: this paper and is left to future studies.

1267:

1268: Ultimately we would like to develop more diagnostics of the

1269: performance of our covariance estimates.  Fig.~\ref{fig:lambda}

1270: shows the estimated error bar, $\Delta$, as a function of the

1271: shrinkage intensity, $\hat{\lambda}^{\star}$, for the shrinkage

1272: versions of the Monte Carlo and jackknife estimators.  There is

1273: clearly some correlation for the shrinkage version of the

1274: Monte Carlo estimator, so knowledge of $\hat{\lambda}^{\star}$

1275: could help one gauge how much the error bars are underestimated.

1276: The exploration of such diagnostics should proceed as better

1277: targets are developed.

1278:

1279:

1280:

1281: \figlambda

1282:

1283:

1284:

1285: The difficulties in estimating the power spectrum covariance

1286: matrix in the context of making precision cosmological

1287: measurements are of even greater concern for higher-order

1288: clustering measurements.  Higher-order clustering measurements

1289: have a configuration space with more degrees of freedom than the power

1290: spectrum (or two-point correlation function).  Even a lower resolution

1291: measurement will have more bins and a much larger

1292: covariance matrix, and noise will cause larger deviations in the

1293: inverse matrix.  Theoretical modeling of the covariance matrix

1294: for an $N$-point correlation function generally involves correlations

1295: up to the $2N$-point function \citep[e.g.,][]{higher}, making the models more uncertain.

1296: The ability to optimally combine simulations and a theoretical

1297: model with a small number of free parameters will make dramatic

1298: improvements.  Shrinkage estimators could also be used for covariance

1299: matrices of measurements outside of large-scale structure, including the

1300: cosmic microwave background power spectrum.

1301: Finally, we note that Section~\ref{sub:shrink} makes no specific references

1302: to covariance matrices and that shrinkage is a general estimation technique.

1303: We are studying additional applications of shrinkage estimation for

1304: cosmological measurements.

1305:

1306:

1307:

1308: \section*{Acknowledgments}

1309: The authors thank Mark Neyrinck and Gang Chen for discussions about

1310: the covariance matrix of the power spectrum and the effects of noise.

1311: The authors are grateful for support from NASA grant NNG06GE71G

1312: and NSF grant AMS04-0434413.

1313:

1314:

1315:

1316: \begin{thebibliography}{}

1317:

1318: \bibitem[\protect\citeauthoryear{Bond, Jaffe, \& Knox}{2000}]{bjk00}

1319:   Bond J.~R., Jaffe A.~H., Knox L., 2000,

1320:   ApJ, 533, 19

1321:

1322: \bibitem[\protect\citeauthoryear{Chen \& Szapudi}{2006}]{cs06}

1323:   Chen G., Szapudi I., 2006,

1324:   ApJ, 647, L87

1325:

1326: \bibitem[\protect\citeauthoryear{Efron}{1982}]{efron82}

1327:   Efron B., 1982,

1328:   %Annals of Statistics, 10, 340-356

1329:   Ann. Stat., 10, 340

1330:

1331: \bibitem[\protect\citeauthoryear{Evrard et al.}{2002}]{hv}

1332:   Evrard A.~E., et al., 2002, ApJ, 573, 7

1333:

1334: \bibitem[\protect\citeauthoryear{Hamilton, Rimes, \& Scoccimarro}

1335:   {Hamilton et al.}{2006}]{hrs}

1336:   Hamilton A.~J.~S., Rimes C.~D., Scoccimarro R.,

1337:   2006, MNRAS, 371, 1188

1338:

1339: \bibitem[\protect\citeauthoryear{Ledoit \& Wolf}{2003}]{lw03}

1340:   Ledoit O., Wolf M., 2003,

1341:   %Journal of Empirical Finance, 10, 603-621

1342:   J. Empirical Finance, 10, 603

1343:

1344: \bibitem[\protect\citeauthoryear{Lewis, Challinor, \& Lasenby}

1345:   {Lewis et al.}{2000}]{camb} Lewis A., Challinor A., Lasenby A.,

1346:   2000, ApJ, 538, 473, http://camb.info

1347:

1348: \bibitem[\protect\citeauthoryear{Neyrinck \& Szapudi}{2007}]{ns07}

1349:   Neyrinck M.~C., Szapudi I., 2007,

1350:   MNRAS, 375, L51

1351:

1352: \bibitem[\protect\citeauthoryear{Sch\"{a}fer \& Strimmer}{2005}]{ss05}

1353:   Sch\"{a}fer J., Strimmer K., 2005,

1354:   %Statistical Applications in Genetics and Molecular Biology,

1355:   %Vol. 4: No. 1, Article 32

1356:   Stat. App. Genet. Mol. Biol., 4, 32

1357:

1358: \bibitem[\protect\citeauthoryear{Smith et al.}{2003}]{halofit}

1359:   Smith R.~E., et al., 2003, MNRAS, 341, 1311

1360:

1361: \bibitem[\protect\citeauthoryear{Stein}{1956}]{stein56}

1362:   Stein C., 1956,

1363:   Proc. Third Berkeley Symp. Math. Stat. Probab., 1, 197

1364:

1365: \bibitem[\protect\citeauthoryear{Szapudi}{2005}]{higher}

1366:   Szapudi I., 2005, astro-ph/0505391

1367:

1368: \bibitem[\protect\citeauthoryear{Szapudi et al.}{2005}]{espice}

1369:   Szapudi I., Pan J., Prunet S., Budav{\'a}ri T., 2005,

1370:   ApJ, 631, L1

1371:

1372: \end{thebibliography}

1373:

1374:

1375:

1376: \end{document}

1377: