0807:0807.0624/ms.tex

1: \documentclass{emulateapj}

2: %\documentclass[12pt, preprint]{aastex}

3:

4: \usepackage{float}

5: \usepackage{amsmath}

6: \usepackage{epsfig,floatflt}

7: \usepackage{subfigure}

8:

9: \newcommand{\npix}{N_{\textrm{pix}}}

10: \newcommand{\BA}{\mathbf{A}}

11: \newcommand{\BB}{\mathbf{B}}

12: \newcommand{\BC}{\mathbf{C}}

13: \newcommand{\Ba}{\mathbf{a}}

14: \newcommand{\Bb}{\mathbf{b}}

15: \newcommand{\Bc}{\mathbf{c}}

16: \newcommand{\Bs}{\mathbf{s}}

17: \newcommand{\BS}{\mathbf{S}}

18: \newcommand{\Bn}{\mathbf{n}}

19: \newcommand{\BN}{\mathbf{N}}

20: \newcommand{\Bm}{\mathbf{m}}

21: \newcommand{\Bf}{\mathbf{f}}

22: \newcommand{\BF}{\mathbf{F}}

23: \newcommand{\Bd}{\mathbf{d}}

24: \newcommand{\BW}{\mathbf{W}}

25: \newcommand{\BP}{\mathbf{P}}

26: \newcommand{\id}{\mathbf{1}}

27: \newcommand{\Bx}{\mathbf{x}}

28: \newcommand{\By}{\mathbf{y}}

29: \newcommand{\Br}{\mathbf{r}}

30: \newcommand{\Bu}{\mathbf{u}}

31: \newcommand{\Bv}{\mathbf{v}}

32: \newcommand{\Bt}{\mathbf{t}}

33: \newcommand{\BD}{\mathbf{D}}

34: \newcommand{\BU}{\mathbf{U}}

35: \newcommand{\BM}{\mathbf{M}}

36: \newcommand{\BT}{\mathbf{T}}

37: \newcommand{\BG}{\mathbf{G}}

38: \newcommand{\BPi}{\mathbf{\Pi}}

39: \newcommand{\Cell}{C_{\ell}}

40: \newcommand{\muK}{\mu\textrm{K}}

41:

42:

43:     \newtheorem{theorem}{Theorem}[section]

44:     \newtheorem{lemma}[theorem]{Lemma}

45:     \newtheorem{proposition}[theorem]{Proposition}

46:     \newtheorem{corollary}[theorem]{Corollary}

47:

48:     \newenvironment{proof}[1][Proof]{\begin{trivlist}

49:     \item[\hskip \labelsep {\bfseries #1}]}{\end{trivlist}}

50:     \newenvironment{definition}[1][Definition]{\begin{trivlist}

51:     \item[\hskip \labelsep {\bfseries #1}]}{\end{trivlist}}

52:     \newenvironment{example}[1][Example]{\begin{trivlist}

53:     \item[\hskip \labelsep {\bfseries #1}]}{\end{trivlist}}

54:     \newenvironment{remark}[1][Remark]{\begin{trivlist}

55:     \item[\hskip \labelsep {\bfseries #1}]}{\end{trivlist}}

56:

57:     \newcommand{\qed}{\nobreak \ifvmode \relax \else

58:           \ifdim\lastskip<1.5em \hskip-\lastskip

59:           \hskip1.5em plus0em minus0.5em \fi \nobreak

60:           \vrule height0.75em width0.5em depth0.25em\fi}

61:

62:

63:

64: \begin{document}

65:

66:

67: \title{A Markov Chain Monte Carlo Algorithm for analysis of low

68:   signal-to-noise CMB data}

69:

70: \author{J. B.  Jewell\altaffilmark{1, 3, 4}, H.\ K.\

71:   Eriksen\altaffilmark{2,5,6}, B.\ D.\ Wandelt\altaffilmark{7,8},

72:  I.\ J.\ O'Dwyer\altaffilmark{3},  Greg Huey\altaffilmark{3}, and K. M.

73:   G\'{o}rski\altaffilmark{3,4,9}}

74:

75: \altaffiltext{1}{email: Jeffrey.B.Jewell@jpl.nasa.gov}

76: \altaffiltext{2}{email: h.k.k.eriksen@astro.uio.no}

77:

78: \altaffiltext{3}{Jet Propulsion Laboratory, 4800 Oak

79:   Grove Drive, Pasadena CA 91109}

80:

81: \altaffiltext{4}{California Institute of Technology, Pasadena, CA

82:   91125}

83:

84: \altaffiltext{5}{Institute of Theoretical Astrophysics, University of

85: Oslo, P.O.\ Box 1029 Blindern, N-0315 Oslo, Norway}

86:

87: \altaffiltext{6}{Centre of

88: Mathematics for Applications, University of Oslo, P.O.\ Box 1053

89: Blindern, N-0316 Oslo}

90:

91: \altaffiltext{7}{Department of Physics, University of Illinois,

92:   Urbana, IL 61801}

93:

94: \altaffiltext{8}{Astronomy Department, University of Illinois at

95:   Urbana-Champaign, IL 61801-3080}

96:

97: \altaffiltext{9}{Warsaw University Observatory, Aleje Ujazdowskie 4, 00-478 Warszawa,

98:   Poland}

99:

100:

101: \date{Received - / Accepted -}

102:

103: \begin{abstract}

104:   We present a new Monte Carlo Markov Chain algorithm for CMB analysis

105:   in the low signal-to-noise regime. This method builds on and

106:   complements the previously described CMB Gibbs sampler, and

107:   effectively solves the low signal-to-noise inefficiency problem of

108:   the direct Gibbs sampler. The new algorithm is a simple

109:   Metropolis-Hastings sampler with a general proposal rule for the

110:   power spectrum, $C_{\ell}$, followed by a particular deterministic

111:   rescaling operation of the sky signal, $\mathbf{s}$. The acceptance

112:   probability for this joint move depends on the sky map only through

113:   the difference of $\chi^2$'s between the original and proposed sky

114:   sample, which is close to unity in the low signal-to-noise regime.

115:   The algorithm is completed by alternating this move with a standard

116:   Gibbs move. Together, these two proposals constitute a

117:   computationally efficient algorithm for mapping out the full joint

118:   CMB posterior, both in the high and low signal-to-noise regimes.

119: \end{abstract}

120:

121: \keywords{cosmic microwave background --- cosmology: observations ---

122: methods: numerical}

123:

124: \maketitle

125:

126: \section{Introduction}

127:

128: Since the detection of anisotropy in the Cosmic Microwave Background

129: (CMB; Smoot et al.\ 1992), there has been an emphasis on likelihood or Bayesian methods

130: for the inference of cosmological parameters and their error bars, or

131: more generally, their confidence intervals. CMB analysis is most

132: suitably addressed in a Bayesian, as opposed to frequentist,

133: framework, simply because the observed microwave sky is interpreted as

134: a single realization of a spatial random process.

135:

136: Early measurements of the CMB were limited to signal to noise ratios

137: of order unity at relatively low angular scales, where direct

138: evaluation of the likelihood for the power spectrum or cosmological

139: parameters is possible. However, the ${\cal O}(N^{3})$ scaling of

140: computational expense with pixel number $N$ prohibits direct

141: likelihood evaluation for current and future CMB observations.

142: Motivated by the scientific potential of CMB data with increasingly

143: high spatial resolution, yet beset with systematics including partial

144: sky coverage and foregrounds, an iterative method of sampling from the

145: Bayes posterior, using a special case of Markov Chain Monte Carlo

146: (MCMC) known as Gibbs sampling, was introduced by \citet{jewell:2002,jewell:2004}.

147: The method was later independently discovered and applied to COBE data

148: \citep{wandelt:2004}, numerically extended to high-resolution on the

149: sphere \citep{eriksen:2004}, applied to analysis of the WMAP

150: \citep{bennett:2003, hinshaw:2007, page:2007} data \citep{odwyer:2004,

151:   eriksen:2007a, eriksen:2007b}, as well as generalized to include

152: inference of foreground model parameters \citep{eriksen:2008a,

153:   eriksen:2008b}.

154:

155: While Gibbs sampling provably converges to the Bayes posterior over

156: the entire range of angular scales probed by the data, the run-time

157: required to generate enough independent samples at the low

158: signal-to-noise, small angular scale regime was found to be

159: prohibitive \citep{eriksen:2004}. The reason for this is that typical

160: variations in the power spectrum from one sample to the next are

161: determined by cosmic variance alone, whereas the posterior itself is

162: given by both cosmic variance and noise.  This results in a long

163: correlation length in the sequence of spectra in the low signal to

164: noise regime, thus requiring a very long run time to generate a

165: sufficient number of independent samples.

166:

167: In this paper we generalize the original Gibbs sampling algorithm to

168: include a new type of MCMC step alternating with standard Gibbs

169: sampling, which solves this problem of slow probabilistic convergence

170: in the low signal to noise regime.  This method therefore makes

171: possible an exact Bayesian approach to CMB analysis over the entire

172: range of angular scales probed by current and future experiments.

173:

174: The paper is organized as follows. We first review the CMB Gibbs

175: sampler, and describe the associated numerical difficulties in

176: analysis at small angular scales.  We then introduce the new MCMC step

177: to the Markov chain, designed specifically to allow large variations

178: in the high-$\ell$ CMB spectrum, precisely where the signal to noise

179: is $\le 1$. We derive the required Metropolis-Hastings acceptance

180: probability and prove its correctness in Appendix \ref{app:proof}, and numerically

181: demonstrate the method in Section \ref{sec:simulations}, for both

182: temperature and polarization. Finally, we summarize and conclude in

183: Section \ref{sec:conclusions}.

184:

185:

186: \section{Review of Gibbs Sampling}

187:

188: \subsection{The Joint Posterior}

189:

190: We begin by assuming that the observed data may be modelled by a

191: signal and a noise term,

192: \begin{equation}

193: \Bd = \BA \Bs + \Bn,

194: \end{equation}

195: where $\Bd$ is a vector containing the data (at every pointing of the

196: detectors), the matrix $\BA$ involves both pointing and beam

197: convolution (and where for this paper we will assume symmetric beams

198: and neglect the details of this operation), and $\Bn$ is additive

199: noise (here in the pixel domain).  We assume both the CMB signal and

200: noise to be Gaussian random fields with vanishing mean and covariance

201: matrices $\BS$ and $\BN$, respectively. In harmonic space, where $\Bs

202: = \sum_{\ell, m} a_{\ell m} Y_{\ell m}$, the CMB temperature

203: covariance matrix is given by $\textrm{C}_{\ell m, \ell' m'} = \langle

204: a_{\ell m}^* a_{\ell' m'}\rangle = C_{\ell} \delta_{\ell \ell'}

205: \delta_{m m'}$, $\Cell$ being the angular power spectrum.  A

206: generalization to polarization merely requires the replacement of the

207: signal matrix diagonal elements with $3 \times 3$ matrices of the form

208: \begin{equation}

209: \BC_{l} = \left[ \begin{array}{ccc}

210: C_{l}^{TT} & C_{l}^{TE} & C_{l}^{TB} \\

211: C_{l}^{ET} & C_{l}^{EE} & C_{l}^{EB} \\

212: C_{l}^{BT} & C_{l}^{BE} & C_{l}^{BB} \end{array} \right]

213: \end{equation}

214: For the discussion in this section, we focus on the temperature case,

215: but note that the generalization to polarization is straightforward

216: and discussed by \citet{larson:2007}.

217:

218: Given these assumptions, our goal is to quantify what has been learned

219: about the underlying power spectrum of the CMB given the data, or how

220: well the data constrain the cosmological parameters.  One proceeds

221: then, in a Bayesian framework, by writing down the posterior given the

222: data,

223: \begin{equation}

224: P(C_{\ell} | \Bd) \propto \mathcal{L}(\Bd | C_{\ell} ) P(C_{\ell}).

225: \end{equation}

226: Here $\mathcal{L}(\Bd | C_{\ell})$ is the likelihood and $P(C_{\ell})$

227: is a prior on $C_{\ell}$.

228: %HKE: The next sentence didn't feel very natural to me, given how the next

229: %sentence is formulated. Better to skip it, I think, so it's commented

230: %out for now:

231: %It is important to remember, as will be seen

232: %in what follows, that the likelihood is something to be derived in the

233: %context of our data model and assumptions about the signal and noise

234: %processes.

235:

236: In order to derive the functional form of the likelihood, one imagines

237: randomly choosing any relevant model [here a power spectrum drawn from

238: $P(C_{\ell})$], and asks what sequence of effects needs to be modeled

239: in order to simulate the data. Here, simulation is understood as

240: conditioning on the chosen model, and leads to a joint density

241: \begin{eqnarray}

242: P(\Bd,\Bs,C_{\ell}) & = & P(\Bd,\Bs | C_{\ell}) P(C_{\ell}) \nonumber \\

243: & = & P(\Bd | \Bs) P(\Bs | C_{\ell})  P(C_{\ell})

244: \end{eqnarray}

245: where the last line follows directly from our data model through the

246: assumption of additive noise. Specifically, the factors in the above

247: are

248: \begin{eqnarray}

249: -2 \log P(\Bs | C_{\ell} ) & = & \Bs^{t} \BC^{-1} \Bs + \log |\BC| \nonumber \\

250: -2 \log P(\Bd | \Bs) & = & (\Bd-\Bs)^{t} \BN^{-1}(\Bd-\Bs) + \log |\BN|

251: \end{eqnarray}

252: which follow from the assumption that both the signal and noise are

253: independent Gaussian processes.

254:

255: The idea of a ``simulation chain'' provides a conceptually

256: clear approach to constructing a joint density, from which we

257: immediately have the Bayesian posterior

258: \begin{equation}

259: P(C_{\ell} | \Bd) = \int d\Bs \ P(C_{\ell}, \Bs | \Bd)

260: \end{equation}

261: The relevance of the above for this paper lies in relating what we

262: refer to as the {\it joint posterior}, $P(C_{\ell}, \Bs | \Bd)$, and

263: the more familiar likelihood $\mathcal{L}(\Bd | C_{\ell}) \propto

264: P(C_{\ell} | \Bd) / P(C_{\ell})$.

265:

266: Although we can analytically compute the integral of the joint

267: posterior over the signal for the Gaussian signal and noise processes

268: considered here, and therefore simply write down the functional form

269: of the likelihood, it is too expensive to evaluate it for any

270: specified $C_{l}$ given high-resolution data. Furthermore, for more

271: complicated data models (i.e. including foreground model

272: uncertainties) we will not be able to perform the integrals over the

273: additional degrees of freedom.  Both situations then instead motivate

274: sampling from the joint posterior, and thereby generating samples from

275: $P(C_{\ell} | \Bd)$ without ever evaluating $P(C_{\ell} | \Bd)$.  We

276: now discuss the original Gibbs sampling approach proposed and

277: implemented by \citet{jewell:2004}, \citet{wandelt:2004} and

278: \citet{eriksen:2004}, and then introduce a new MCMC step which

279: directly addresses the previously reported slow probabilistic

280: convergence in the low signal to noise regime \citep{eriksen:2004}.

281:

282: \subsection{The CMB Gibbs sampler}

283: \label{sec:cmb_sampling}

284:

285: As stated above, our goal is to sample from the joint posterior,

286: \begin{equation}

287:   - 2  \log P(\Bs, C_{\ell}|\Bd) =

288:   \chi^{2}(\Bd, \Bs) +

289:   \Bs^{t} \BS^{-1} \Bs + \log | \BS|

290:   - 2 \log P(C_{\ell}).

291: \label{eq:cmb_posterior}

292: \end{equation}

293: For notational convenience, we have here dropped constant factors of

294: $2\pi$, and also defined

295: \begin{equation}

296: \chi^{2}(\Bs, \Bd) = (\Bd - \Bs)^{t} \BN^{-1}(\Bd -\Bs).

297: \end{equation}

298: One approach to sample from this posterior is to use an algorithm

299: known as Gibbs sampling, where we can alternately sample from the

300: respective conditional densities,

301: \begin{align}

302: \Bs^{i+1} &\leftarrow P(\Bs | C_{\ell}^i, \Bd) \\

303: C_{\ell}^{i+1} &\leftarrow P(C_{\ell} | \Bs^{i+1}, \Bd).

304: \end{align}

305: Here $\leftarrow$ indicates sampling from the distribution on the

306: right-hand side. After some burn-in period, during which all samples

307: must be discarded, the joint samples $(\Bs^i, C_{\ell}^i)$ will be

308: drawn from the desired density. Thus, the problem is reduced to that

309: of sampling from the two \emph{conditional} densities $P(\Bs |

310: C_{\ell}, \Bd)$ and $P(C_{\ell} | \Bs, \Bd)$.

311:

312: We now describe the sampling algorithms for each of these two

313: conditional distributions, starting with $P(C_{\ell} | \Bs, \Bd)$.

314: First, note that $P(C_{\ell} | \Bs, \Bd) = P(C_{\ell} | \Bs)$ which

315: follows directly from the construction of the joint density of

316: ``everything'' above.  This is also intuitively easy to understand

317: since if we already know the CMB sky signal, the data themselves tell

318: us nothing new about the CMB power spectrum. Next, since the sky is

319: assumed to be Gaussian and isotropic, the distribution reads

320: \begin{equation}

321: P(C_{\ell} | \Bs) \propto P(C_{\ell}) \frac{e^{-\frac{1}{2}

322:     \Bs_{\ell}^{t}\BS_{\ell}^{-1}\Bs_{\ell}}}{\sqrt{|\BS_{\ell}|}} =

323: P(C_{\ell})

324: \frac{e^{-\frac{2\ell+1}{2} \frac{\sigma_{\ell}}{C_{\ell}}}}{C_{\ell}^{\frac{2\ell+1}{2}}},

325: \end{equation}

326: which, when interpreted as a function of $C_{\ell}$, is known as the

327: inverse Gamma distribution. In this expression, $\sigma_{\ell} =

328: \frac{1}{2\ell+1} \sum_{m} |a_{\ell m}|^2$ denotes the observed power spectrum

329: of $\Bs$. Fortunately, there exists a simple textbook sampling

330: algorithm for this distribution \citep[e.g.,][]{gupta:2000}, and we

331: refer the interested reader to the previous papers for details. For an

332: alternative, and more flexible, sampling algorithm, see

333: \citet{wehus:2008}.

334:

335: In order to describe the sky signal sampling step, we first define the

336: mean-field map (or Wiener filtered data) to be $\hat{\Bs} = (\BS^{-1}

337: + \BN^{-1})^{-1} \BN^{-1} \Bd$, and note that the conditional sky

338: signal density given the data and $C_{l}$ can be written as

339: \begin{align}

340: P(\Bs | C_{\ell}, \Bd) &\propto e^{-\frac{1}{2} (\Bs-\hat{\Bs})^t (\BS^{-1} + \BN^{-1}) (\Bs-\hat{\Bs})}.

341: \end{align}

342: Thus, $P(\Bs | C_{\ell}, \Bd)$ is a Gaussian distribution with mean

343: equal to $\hat{\Bs}$ and a covariance matrix equal to $(\BS^{-1} +

344: \BN^{-1})^{-1}$.

345:

346: Sampling from this Gaussian distribution is straightforward, but

347: computationally somewhat cumbersome. First, draw two random white

348: noise maps $\omega_0$ and $\omega_1$ with zero mean and unit

349: variance. Then solve the equation

350: \begin{equation}

351: \left[\BS^{-1} + \BN^{-1}\right] \Bs = \BN^{-1}\Bd + \BS^{-\frac{1}{2}} \omega_0 +

352: \BN^{-\frac{1}{2}} \omega_1

353: \label{eq:lin_sys}

354: \end{equation}

355: for $\Bs$. Since the white noise maps have zero mean, one immediately

356: sees that $\langle \Bs \rangle = \hat{\Bs}$, while a few more

357: calculations show that $\langle (\Bs-\hat{\Bs}) (\Bs-\hat{\Bs})^{t} \rangle = (\BS^{-1} +

358: \BN^{-1})^{-1}$.

359:

360: The problematic part about this sampling step is the solution of the

361: linear system in Equation \ref{eq:lin_sys}. Since this is a $\sim10^6

362: \times 10^6$ system for current CMB data sets, it cannot be solved by

363: brute force. Instead, one must use a method called Conjugate Gradients

364: (CG), which only requires multiplication of the coefficient matrix on

365: the left-hand side, not inversion. For details on these computations,

366: together with some ideas on preconditioning, see \citet{eriksen:2004}.

367:

368:

369: \subsection{Convergence issues in the low signal-to-noise regime}

370:

371: As originally applied to high-resolution CMB data, the Gibbs sampling

372: algorithm as described above has very slow convergence at the

373: high-$\ell$, low signal-to-noise part of the spectrum.  The reason for

374: the slow convergence is easy to understand in light of the above: When

375: sampling from $P(C_{\ell} | \Bs)$, the typical step size is given by

376: cosmic variance at all angular scales. In the high signal-to-noise

377: regime, cosmic variance dominates the noise variance, and we are able

378: to explore the full width of the posterior in only a few Gibbs

379: iterations. However, in the low signal-to-noise end, cosmic variance

380: is far smaller than the posterior variance, and it takes a

381: prohibitively long time to converge probabilistically.  This problem

382: of ``slow mixing'' of the Gibbs sampler is illustrated in figures

383: \ref{fig:TT_trace_plots} and \ref{fig:TT_correlation_length}.  The

384: long correlation length starting at signal-to-noise of unity leads to

385: extremely long run times in order to produce a reasonable number of

386: uncorrelated samples.

387:

388:

389: \section{A Low Signal-to-Noise MCMC Sampler}

390:

391: When sampling from the true posterior, the goal is to produce as many

392: independent samples from $P(C_{\ell}, \Bs | \Bd)$ as possible.  One might

393: intuitively guess that it should be straightforward to establish good

394: approximations to the posterior in the low signal-to-noise regime,

395: since in the limit of vanishing signal to noise we simply recover the

396: prior. This suggests that we look for a sampling scheme in which we

397: first sample a new spectrum from some approximation to the true

398: posterior independent of the current spectrum and CMB map, followed by

399: sampling the CMB map from the conditional $P(\Bs|C_{\ell}, \Bd)$. The

400: problem with such a direct scheme is that the accept probability will

401: involve a ratio of determinants which are too expensive to compute.

402:

403: We are therefore motivated to look for a sampling scheme in which we

404: can make a large variation in $C_{\ell}$ in the low signal-to-noise

405: regime, and make an associated {\it deterministic change} in the CMB

406: map, while still maintaining a reasonably high acceptance rate. The

407: motivation for a deterministic change is that it will avoid

408: introducing ratios of determinants which we cannot compute.

409:

410: \subsection{Proposal rule and acceptance probability}

411:

412: Assume that we have defined a deterministic

413: sampling scheme for $\Bs$, and that our new CMB map is given by some

414: function

415: \begin{equation}

416: \Bs_{n+1} = F(\Bs_{n}, C_{\ell}^{(n+1)}, C_{\ell}^{(n)} ).

417: \end{equation}

418: Then the condition of detailed balance for our MCMC

419: sampler requires that

420: \begin{equation}

421: F^{-1}(\Bs_{n+1}, C_{\ell}^{(n+1)}, C_{\ell}^{(n)}) = F(\Bs_{n+1}, C_{\ell}^{(n)}, C_{\ell}^{(n+1)}),

422: \end{equation}

423: or, in other words, that the inverse function is given by exchanging

424: the order of the spectra in the function $F$. One simple function which has this property is

425: \begin{equation}

426: \Bs_{n+1} = \left(\frac{C_{\ell}^{(n+1)}}{C_{\ell}^{(n)}}\right)^{\frac{1}{2}} \Bs_{n}

427: \end{equation}

428: The total proposal matrix is then

429: \begin{eqnarray}

430: w(C_{\ell}^{(n+1)}, \Bs_{n+1} | C_{\ell}^{(n)}, \Bs_{n}) & = &  w(C_{\ell}^{(n+1)} | C_{\ell}^{(n)}, \Bd)

431: \nonumber \\

432: & &  \delta \left( \Bs_{n+1} - \left(\frac{C_{\ell}^{(n+1)}}{C_{\ell}^{(n)}}\right)^{\frac{1}{2}} \Bs_{n} \right),

433: \nonumber

434: \end{eqnarray}

435: and the ``reverse'' proposal is

436: \begin{eqnarray}

437:   w(C_{\ell}^{(n)}, \Bs_{n} | C_{\ell}^{(n+1)}, \Bs_{n+1}) & = &  w(C_{\ell}^{(n)} | C_{\ell}^{(n+1)}, \Bd)

438:   \nonumber \\

439:   & &  \delta \left( \Bs_{n} - \left(\frac{C_{\ell}^{(n)}}{C_{\ell}^{(n+1)}}\right)^{\frac{1}{2}} \Bs_{n+1} \right).

440:   \nonumber

441: \end{eqnarray}

442: The condition of detailed balance including deterministic moves

443: requires the consideration of some technical points which we leave

444: to Appendix \ref{app:proof}. There

445: we show that the full Metropolis-Hastings accept probability reads

446: \begin{eqnarray}

447: A & = & \min \left[ 1,

448: \frac{e^{-\frac{1}{2} \chi^{2}(\Bs_{n+1}, \Bd)}}{e^{-\frac{1}{2} \chi^{2}(\Bs_{n}, \Bd)}}

449: \frac{w(C_{\ell}^{(n)} | C_{\ell}^{(n+1)}, \Bd)}{w(C_{\ell}^{(n+1)} | C_{\ell}^{(n)}, \Bd)}

450: \right]

451: \end{eqnarray}

452: The significance of the above is that we can make relatively large

453: changes to the power spectrum in the low signal-to-noise regime, where

454: $\BN^{-1}$ is getting small, since the $\chi^2$ is affected only very

455: mildly by changes in any low signal-to-noise mode.

456:

457: We note the interesting point (discussed more completely in Appendix

458: \ref{app:cov}) that if one changes variables in the

459: joint posterior from CMB maps, $\Bs$, to whitened maps, $\Bx =

460: \BC_{\ell}^{-\frac{1}{2}} \Bs$, and then Gibbs sample in the new

461: variables $(C_{\ell}, \Bx)$, the resulting accept probability is

462: numerically identical to the above.  However, we note

463: the distinction here to emphasize the difference between MCMC

464: algorithms implementing deterministic proposals of maps given

465: $C_{\ell}$, and those sampling in a different set of variables, as

466: there could be other deterministic proposal schemes or

467: another change of variables which lead to improvements over the approach

468: presented in this paper.

469:

470: For the numerical demonstration of the  MCMC algorithm presented

471: in this paper, we use a simple symmetric Gaussian proposal, truncated

472: at $C_{\ell}>0$ (or, for polarization, the region where the resulting

473: CMB covariance matrix is positive definite), for the power spectrum,

474: \begin{equation}

475: w(C_{\ell}^{(n+1)} | C_{\ell}^{(n)}, \Bd) \propto e^{-\frac{1}{2}

476:   \left(\frac{C_{\ell}^{n+1}-C_{\ell}^{n}}{\tau_{\ell}}\right)^2} I(C_{\ell}>0),

477: \end{equation}

478: where $\tau_{\ell}$ is a measure of the typical step size taken

479: between two samples. Note that because this proposal density is

480: symmetric, the ratio of $C_{\ell}$ proposals cancels, and the

481: acceptance probability is entirely determined by the change in

482: $\chi^{2}$.

483:

484: It should be noted that while the above MCMC step satisfies detailed

485: balance, it is not {\it irreducible}, in the sense that there is not a

486: non-vanishing probability in reaching any state from any other state

487: in a finite number of MCMC steps; the phases are unchanged in each

488: MCMC step. However, alternating these steps with a traditional Gibbs

489: sampling step gives a combined ``two-step'' MCMC algorithm which

490: indeed is irreducible, and therefore provably converges to the joint

491: posterior. Once again, the details are left to the appendix for the

492: interested reader.

493:

494: \subsection{Optimization of the MCMC sampler}

495:

496: A general advantage of the Gibbs sampler is the fact that it is free

497: of tunable efficiency parameters. The same is not true for the

498: Metropolis-Hastings MCMC algorithm; for satisfactory sampling

499: performance, it typically has to be tuned quite extensively. In this

500: section, we describe three specific features that help in this task,

501: namely 1) step size tuning, 2) slice sampling and 3) binning.

502:

503: First, we have to ensure that the step size of our Gaussian proposal

504: density roughly matches the width of the target distribution, in order

505: to maintain both a reasonable acceptance rate and high mobility. We do

506: this by performing an initial test run, producing typically a few

507: hundred $C_{\ell}$ samples, and computing the standard deviation of

508: these samples for each $\ell$. These are then adopted as the proposal

509: widths for the main run, scaled by some number less than unity,

510: typically between 0.05 and 0.5. For the initial test run, we

511: approximate the posterior width by the noise variance alone,

512: \begin{equation}

513: \tau_{\ell}^{2} = \frac{2}{2\ell+1} \left(\frac{N_{\ell}}{b_{\ell}^{2}}\right)^{2},

514: \end{equation}

515: because the MCMC sampler is used only in the low signal-to-noise

516: regime. In this expression $N_{\ell}$ is the power spectrum of the

517: instrumental noise alone, and $b_{\ell}$ is the product of the

518: Legendre transform of the beam and the HEALPix window function.

519:

520: Next, Metropolis-Hastings MCMC is inefficient in spaces with too many

521: free parameters. For this reason, we divide the power spectrum

522: coefficients, $C_{\ell}$, into subsets, each containing typically only

523: 10--20 multipoles. Then we propose changes to one subset at a time,

524: while keeping all other multipoles fixed. Finally, we loop over

525: subsets, and thus effectively implement a multipole slice Gibbs

526: sampler for the full power spectrum.

527:

528: This is computationally feasible, because a single MCMC proposal only

529: requires a single $\chi^{2}$ evaluation, which has a computational cost

530: of a single spherical harmonic transform. Since drawing a full sky map

531: from $P(\Bs|C_{\ell}, \Bd)$ in the classical Gibbs sampling step

532: requires $\mathcal{O}(10^{2})$ spherical harmonic transforms, we can

533: indeed afford to perform many MCMC proposals for each Gibbs step,

534: without dominating the total cost.

535:

536: \begin{figure}

537: \mbox{\epsfig{file=trace_plot_phase1a.eps,width=\linewidth,clip=}}

538: \caption{Comparison of $C_{\ell}$ chains produced by standard Gibbs

539:   sampling (black) and by the Gibbs+MCMC hybrid (red) for three

540:   selected multipole bins. The simulation was based on full sky

541:   coverage and uniform noise. See text for full details. }

542: \label{fig:TT_trace_plots}

543: \end{figure}

544:

545: Nevertheless, for very high-resolution analysis it is often beneficial

546: to bin several $C_{\ell}$'s together, both in order to increase the

547: signal-to-noise of the joint coefficient, and to decrease the number

548: of parameters that need to be sampled by MCMC. We implement this by

549: defining a new binned spectrum, weighted by $\ell(\ell+1)/2\pi$, as

550: follows,

551: \begin{equation}

552: C_{b} = \frac{1}{N_b}\sum_{\ell \in b} \frac{\ell(\ell+1)}{2\pi} C_{\ell}.

553: \end{equation}

554: Here $b=[\ell_{\textrm{min}}, \ell_{\textrm{max}}]$ denotes the

555: current bin, and $N_{b} = \ell_{\textrm{max}}-\ell_{\textrm{min}}+1$

556: is the number of multipoles within the bin. These new (and fewer)

557: coefficients are then sampled with the above MCMC sampler, after which

558: the original spectrum coefficients are given by

559: \begin{equation}

560: C_{\ell} = \frac{2\pi}{\ell(\ell+1)} C_{b}.

561: \end{equation}

562:

563: \begin{figure}

564: \mbox{\epsfig{file=correlation_funcs_phase1a.eps,width=\linewidth,clip=}}

565: \caption{Comparison of chain correlation functions for standard Gibbs

566:   sampling (blue) and Gibbs+MCMC (red), computed from the full-sky

567:   uniform noise temperature data set. Note that while the correlation

568:   length goes to infinity with increasing $\ell$ (or equivalently, low

569:   signal-to-noise) for standard Gibbs sampling, it is $\lesssim40$

570:   everywhere for the MCMC hybrid case. }

571: \label{fig:TT_correlation_length}

572: \end{figure}

573:

574: \section{Testing and Validation}

575: \label{sec:simulations}

576:

577: We have implemented the new sampling step described above in the

578: previously described Gibbs sampling code called ``Commander''

579: \citep{eriksen:2004,eriksen:2008a}, and in this section we demonstrate

580: its advantages compared to the old sampling algorithm. We consider two

581: different cases, namely high-$\ell$ temperature and low-$\ell$

582: polarization analysis. In the former case, we also analyse two cases,

583: without and with a sky cut. The former allows us to verify the results

584: against an analytically known answer, while the latter demonstrates

585: that the sky cut does not degrade the sampling efficiency.

586:

587: \subsection{Temperature analysis}

588:

589: The high-$\ell$ temperature simulation is designed to mimic the 5-year

590: WMAP temperature data \citep{hinshaw:2008} with one exception, namely

591: that the noise is assumed spatially uniform, in order to facilitate

592: analytic comparison. Specifically, the CMB realization was drawn from

593: the best-fit $\Lambda$CDM model derived from WMAP alone

594: \citep{komatsu:2008}, including multipoles up to

595: $\ell_{\textrm{max}}=1000$, and then smoothed with the instrumental

596: beam of the WMAP V1 differencing assembly, and pixelized at

597: HEALPix\footnote{http://healpix.jpl.nasa.gov} resolution

598: $N_{\textrm{side}}=512$. Finally, uniform noise of $\sigma_0 =

599: 40\mu\textrm{K}$ RMS was added to each pixel. This corresponds to a

600: signal-to-noise ratio of unity at $\ell \sim 550$, roughly similar to

601: the 5-year WMAP data. We analyse this simulation both with and without

602: the WMAP KQ85 sky cut \citep{gold:2008}.

603:

604: In both analyses, we adopted the Gaussian proposal density with tuned

605: variances, as described above. We also bin the power spectrum in

606: progressively wider bins, starting at $\ell = 600$, to maintain a

607: reasonable signal-to-noise per sampled power spectrum parameter. Ten

608: bins were sampled jointly per proposal, while all others were kept

609: fixed.

610:

611: In the full-sky case, we produced a total of 31,800 samples over 60

612: chains, and in the cut sky case a total of 6800 samples. The cost for

613: producing one sample in the latter, and by far most expensive, set was

614: 2.5 CPU hours, for a total of 17\,000 CPU hours. The number of MCMC

615: steps per Gibbs step was one in the former and 20 in the latter.

616: (Since the signal sampler dominates the cut sky Gibbs chain, one can

617: perform more low S/N steps without slowing down the overall code significantly.)

618: In addition to these two main sample sets, we also

619: produced two longer chains with 3500 samples each for the full-sky

620: case, both with and without the new MCMC step turned on, in order to

621: compare the Markov chain correlation lengths before and after

622: including the MCMC sampler.

623:

624: We first consider the full-sky data set, and in Figure

625: \ref{fig:TT_trace_plots} we show a segment of each of the two longer

626: chains for three selected multipole bins. The top panel shows

627: $\ell=600$, which is the first bin to be sampled by MCMC, the middle

628: panel shows $\ell=732-742$, where there is still some signal in the

629: data, and, finally, the bottom panel shows $\ell=855-1000$, which is

630: strongly noise dominated. Starting with the top panel, we see that the

631: red curve (Gibbs+MCMC) scatters significantly faster than the black

632: curve (Gibbs only), implying more efficient sampling. This trend

633: becomes even stronger with lower signal-to-noise, until the last case,

634: where the Gibbs-only chain essentially does not move at all, while the

635: MCMC sampler does probe the full range. Note, however, that even the

636: MCMC sampler has a significant correlation length in this range, and

637: this implies that there is still some room for improvement to be made

638: in defining our proposals.

639:

640: Next, these considerations are quantified in Figure

641: \ref{fig:TT_correlation_length}, where we plot the Markov chain

642: correlation as a function of distance in the chain, for six

643: bins with and without the MCMC sampler. As first reported by

644: \citet{eriksen:2004}, we see that the Gibbs-only correlation length

645: increases dramatically with decreasing signal-to-noise, rendering the

646: algorithm essentially useless in this regime. However, we also see

647: that the new MCMC step effectively resolves this issue, as the

648: correlation length (here defined by having a correlation less than

649: 0.2) now is less than $\sim40$ steps. This is a dramatic improvement,

650: and makes the algorithm useful even in this range. Nevertheless, we

651: once again point out that it is possible to make further improvements

652: by establishing better proposal densities.

653:

654: \begin{figure}

655: \mbox{\epsfig{file=gr_phase1a.eps,width=\linewidth,clip=}}

656: \caption{Gelman-Rubin statistic for the full-sky, uniform noise

657:   temperature analysis. Note the feature at $\ell=600$, which marks

658:   the transition between standard Gibbs sampling and Gibbs+MCMC.}

659: \label{fig:TT_gr}

660: \end{figure}

661:

662: In Figure \ref{fig:TT_gr} we consider the convergence properties of

663: the $\sim30$k samples set, by computing the Gelman-Rubin statistic $R$

664: \citep{gelman:1992} as a function of $\ell$. Typically, one recommends

665: that $R$ should be less than, say, 1.2 in order to claim

666: convergence. We see that this holds everywhere for this sample set,

667: and typically it is even less than 1.05. Note also the step at

668: $\ell=600$, showing clearly the beneficial effect of the MCMC

669: sampler.

670:

671:

672: \begin{figure}

673: \mbox{\epsfig{file=post_phase1a.eps,width=\linewidth,clip=}}

674: \caption{High-$\ell$ temperature marginal posteriors computed with

675:   Gibbs+MCMC from the full-sky, uniform noise temperature data set,

676:   compared to analytic results.}

677: \label{fig:TT_posteriors}

678: \end{figure}

679:

680: Next, in Figure \ref{fig:TT_posteriors} we compare the marginal

681: distributions derived from this sample set with the analytic result,

682: \begin{equation}

683: P(C_{\ell}|\mathbf{d}) \propto \prod_{\ell \in b}

684: \frac{e^{-\frac{2\ell+1}{2}

685:     \frac{\sigma_{\ell}^{\textrm{S+N}}}{b_{\ell}^2

686:         C_{\ell}+N_{\ell}}}}

687:  {(b_{\ell}^2 C_{\ell}+N_{\ell})^{\frac{2\ell+1}{2}}}.

688: \end{equation}

689: Here $b=[\ell_{\textrm{min}}, \ell_{\textrm{max}}]$ indicates a given

690: multipole bin, $b_{\ell}$ denotes the product of the instrumental beam

691: and the HEALPix pixel window, and $\sigma_{\ell}^{\textrm{S+N}}$ is

692: the power spectrum of the noisy data map. We see that the new

693: algorithm reproduces the analytic distributions very well, and this

694: verifies the overall method.

695:

696: \begin{figure}

697: \mbox{\epsfig{file=spectrum_phase1b.eps,width=\linewidth,clip=}}

698: \caption{Temperature power spectrum estimated from cut sky temperature

699: data. The panels show the same spectrum, but emphasizing different

700: multipole ranges (full-range; S/N$\sim$1 transition region; and

701: high-$\ell$, low S/N).}

702: \label{fig:TT_spectrum}

703: \end{figure}

704:

705: Finally, the cut-sky power spectrum with one-sigma confidence regions

706: is shown in three panels in Figure \ref{fig:TT_spectrum}, focusing on

707: different $\ell$-ranges, namely all $\ell$'s, the $S/N \sim 1$

708: transition region, and the low $S/N$ region. This completes the

709: high-$\ell$ temperature analysis validation.

710:

711: \subsection{Polarization analysis}

712:

713: We now consider polarization analysis, and construct a new low-$\ell$

714: simulation for this purpose. This simulation does not mimic any

715: planned experiment, but is rather designed to highlight the analysis

716: method itself. Specifically, we drew a new CMB realization from the

717: best-fit WMAP $\Lambda$CDM spectrum that includes a non-zero tensor

718: contribution, including multipoles up to $\ell_{\textrm{max}}=150$,

719: and convolved this with a $3^{\circ}$ FWHM Gaussian beam, and

720: pixelized it at $N_{\textrm{side}} = 64$. Uniform noise of

721: $5\mu\textrm{K}$ RMS was added to the temperature component, and

722: $1\mu\textrm{K}$ RMS to the polarization components. The 5-year WMAP

723: polarization sky mask was imposed on the data.

724:

725: We allowed for non-zero $C_{\ell}^{TT}$, $C_{\ell}^{TE}$,

726: $C_{\ell}^{EE}$ and $C_{\ell}^{BB}$ spectra, but fixed $C_{\ell}^{TB}

727: = C_{\ell}^{EB} = 0$. These spectra were then individually binned to

728: maintain a reasonable signal-to-noise per bin. (Details on how to

729: introduce individual binning of each power spectrum were recently

730: described by Eriksen and Wehus, 2008.) Again, a tuned Gaussian proposal

731: density was used in the MCMC step. A total of 12\,000 samples were

732: produced over 12 chains, and the CPU time per sample was 55 seconds,

733: for a total of $\sim200$ CPU hours.

734:

735: \begin{figure}

736: \mbox{\epsfig{file=pol_trace_plots.eps,width=\linewidth,clip=}}

737: \caption{$C_{\ell}$ chains generated by Gibbs+MCMC hybrid for the

738:   cut-sky polarization data set. Only the highest multipole bin for

739:   each spectrum is shown ($\ell = 108-150$ for TT, $\ell = 88-150$ for

740:   TE, $\ell=101-150$ for EE and $\ell=61-150$ for BB).}

741: \label{fig:pol_trace_plots}

742: \end{figure}

743:

744: In Figure \ref{fig:pol_trace_plots} we show one $C_{\ell}$ chain for

745: each of the four sampled spectra, for the last (and therefore most

746: difficult) bin in each case. Note that the $C_{\ell}^{EE}$ and

747: $C_{\ell}^{BB}$ spectra have essentially vanishing signal-to-noise,

748: and therefore these chains reach zero values. Clearly, we see that

749: mixing properties of these chains are satisfactory, and the

750: correlation lengths are quite short.

751:

752: \begin{figure}

753: \mbox{\epsfig{file=gr_polarization.eps,width=\linewidth,clip=}}

754: \caption{Gelman-Rubin statistic for cut-sky polarization analysis.}

755: \label{fig:gr_polarization}

756: \end{figure}

757:

758: In Figure \ref{fig:gr_polarization} we show the Gelman-Rubin

759: statistics for each of the four power spectra, and with the single

760: exception of the very last bin of $C_{\ell}^{EE}$, all $R$ values are

761: well below 1.1. Thus, all spectra have converged well everywhere.

762:

763: \begin{figure}

764: \mbox{\epsfig{file=pol_spectra.eps,width=\linewidth,clip=}}

765: \caption{Marginal $C_{\ell}$ power spectra (red curves) estimated from

766:   cut sky polarization data. Gray bands indicate 68\% confidence

767:   regions, and the black lines show the input spectrum. (Note that the

768:   marginal spectra shown here are not individually unbiased estimators

769:   because of the correlations between TT, TE and EE. Proper treatment

770:   of the full joint polarization density will be considered separately

771:   in a future publication.)}

772: \label{fig:pol_spectrum}

773: \end{figure}

774:

775: Finally, in Figure \ref{fig:pol_spectrum} we show the reconstructed

776: marginal power spectra for each polarization component, overplotted on

777: the input spectrum. The agreement is very good. Note, however, that

778: these spectra are direct marginals, and not a joint maximum likelihood

779: estimate. They are therefore not individually unbiased estimators. In

780: particular, the marginal $C_{\ell}^{EE}$ power spectrum is biased

781: slightly high because of the combination of the

782: $C_{\ell}^{TT}C_{\ell}^{EE} - (C_{\ell}^{TE})^2 > 0$ positivity

783: constraint and relatively low signal-to-noise. Consideration of the

784: joint polarization posterior, which \emph{is} an unbiased estimator,

785: is postponed to a future publication.

786:

787:

788: %\section{WMAP Simulations}

789: %\begin{itemize}

790: %\item Temperature only - we will wait for 5 year to do polarization.

791: %\item  Re-do the WMAP sims. from the Eriksen et al 2004 method

792: %paper, but with appropriate S/N of the 3 and 5 year data.

793:

794: %\end{itemize}

795:

796:

797:

798:

799: \section{Conclusions}

800: \label{sec:conclusions}

801:

802: We have presented a new MCMC algorithm for the high-$\ell$, low

803: signal-to-noise limit of the joint posterior which

804: solves the slow probabilistic convergence of the traditional

805: Gibbs sampler in this regime.  This in principle allows sampling over the

806: joint posterior $p(C_{l}, \Bs | \Bd)$ over the entire range

807: of angular scales probed by current and future CMB experiments.

808: The limiting computational burden is now entirely in the map-making

809: step of Gibbs sampling, for which the cost per Gibbs iteration

810: now scales with the expense of multiplication by the inverse

811: noise matrix $\BN^{-1}$.  Assuming pixel uncorrelated (but scan weighted)

812: noise as a good approximation at small angular scales, the cost of

813: an $\BN^{-1}$ multiplication is that of a forward and inverse spherical

814: harmonic transform, or ${\cal O}(\ell_{\textrm{max}}^{3})$.  Future work will attempt to push

815: the generalized Gibbs + MCMC sampling scheme presented here to smaller

816: angular scales, ultimately limited by the degree to which we can compute

817: harmonic transforms.

818:

819:

820:

821:

822: \begin{acknowledgements}

823:   We acknowledge use of the

824:   HEALPix\footnote{http://healpix.jpl.nasa.gov} software

825:   \citep{gorski:2005} and analysis package for deriving the results in

826:   this paper. HKE acknowledges financial support from the Research

827:   Council of Norway.

828: \end{acknowledgements}

829:

830:

831:

832: \appendix

833:

834: \section{Including Deterministic Proposals in MCMC}

835: \label{app:proof}

836: Here we review the derivation of the accept probability in Markov Chain Monte Carlo

837: when using deterministic proposals (or proposals where some of the degrees of freedom

838: are specified as deterministic functions of the past state and/or proposed

839: variations in some other degrees of freedom).  We first briefly review

840: the Metropolis-Hastings Markov Chain Monte Carlo algorithm and the proof of

841: its convergence, and then turn to the special case involving deterministic

842: proposals.  Much of the review of the MCMC algorithm here follows \citet{Sokal:1989}.

843: We also note that similar technical considerations including deterministic

844: elements in proposals are presented by \citet{Green:1995} in the context of

845: MCMC algorithms in which the dimension of the state space itself is included

846: as a random variable to be sampled over.

847:

848: The goal is the construction of a transition matrix

849: $T(C_{l}, \Bs | C_{l}', \Bs', \Bd)$ such that after initializing

850: the Markov Chain with a sample from any probability density $p_{0}(C_{l}, \Bs | \Bd)$,

851: we generate samples from a sequence of probability densities

852: \begin{equation}

853: p_{n+1}(C_{l}, \Bs | \Bd)  \equiv   \int d(C_{l}', \Bs') \

854: T(C_{l}, \Bs | C_{l}', \Bs', \Bd) \ p_{n}(C_{l}', \Bs' | \Bd)

855: \end{equation}

856: which eventually converge to an {\it equilibrium density} $\pi(C_{l}, \Bs | \Bd)$

857: \begin{equation}

858: \pi (C_{l}, \Bs | \Bd) = \lim_{n \rightarrow \infty}  p_{n}(C_{l}, \Bs | \Bd)

859: \end{equation}

860: We remind the reader

861: of the sufficient conditions to establish convergence of an MCMC algorithm:

862: {\it stationarity}, which means that the MCMC transition matrix satisfies

863: \begin{equation}

864: \pi(C_{l}, \Bs | \Bd) = \int d(C_{l}', \Bs') \ T(C_{l}, \Bs | C_{l}', \Bs', \Bd) \ \pi(C_{l}', \Bs' | \Bd)

865: \end{equation}

866: and {\it irreducibility}, which means that for any two states, there is a finite

867: number of iterations which give a non-vanishing probability to transition from one

868: state to the other.  It is well known that these two properties are sufficient

869: to establish convergence, as can be seen simply from the triangle inequality

870: \begin{eqnarray}

871: \int d(C_{l}, \Bs) \ \left| \pi(C_{l}, \Bs | \Bd) - p_{n}(C_{l}, \Bs | \Bd) \right|

872: & = & \int d(C_{l}, \Bs) \ \left| \int d(C_{l}', \Bs') \ T(C_{l}, \Bs | C_{l}', \Bs')

873: \left( \pi(C_{l}', \Bs' | \Bd) - p_{n-1}(C_{l}', \Bs' | \Bd) \right) \right| \nonumber \\

874: & \le & \int d(C_{l}, \Bs) \  \int d(C_{l}', \Bs') \ T(C_{l}, \Bs | C_{l}', \Bs')

875: \left| \pi(C_{l}', \Bs' | \Bd) - p_{n-1}(C_{l}', \Bs' | \Bd)  \right| \nonumber \\

876: & = & \int d(C_{l}', \Bs') \ \left( \int d(C_{l}, \Bs) \  T(C_{l}, \Bs | C_{l}', \Bs') \right)

877: \left| \pi(C_{l}', \Bs' | \Bd) - p_{n-1}(C_{l}', \Bs' | \Bd)  \right| \nonumber \\

878: & = & \int d(C_{l}', \Bs') \

879: \left| \pi(C_{l}', \Bs' | \Bd) - p_{n-1}(C_{l}', \Bs' | \Bd)  \right| \nonumber

880: \end{eqnarray}

881:

882: The Metropolis-Hastings Markov Chain Monte Carlo algorithm is one

883: method of constructing such a transition matrix.  We choose {\it any}

884: proposal matrix $w(C_{l}, \Bs | C_{l}', \Bs', \Bd)$ and then accept

885: the proposed move with a probability

886: \begin{equation}

887: 0 \le A(C_{l}, \Bs | C_{l}', \Bs', \Bd) \le 1

888: \end{equation}

889: while rejecting the proposed move with probability $1 - A$ leads

890: to a ``null transition'' where the next state in the Markov Chain remains the same.

891: Application of this algorithm then leads to the sequence of probability densities

892: which satisfy

893: \begin{eqnarray}

894: p_{n+1}(C_{l}, \Bs | \Bd) & = &

895: \left( 1 - \int d(C_{l}', \Bs') \ A(C_{l}', \Bs' | C_{l}

896: , \Bs, \Bd) w(C_{l}', \Bs' | C_{l}, \Bs, \Bd) \right)

897: p_{n}(C_{l}, \Bs | \Bd) \nonumber \\

898: & & +

899: \int d(C_{l}', \Bs') \ A(C_{l}, \Bs | C_{l}', \Bs', \Bd) w(C_{l}, \Bs | C_{l}', \Bs', \Bd)

900: p_{n}(C_{l}', \Bs' | \Bd)

901: \end{eqnarray}

902: where the first term is the contribution to the probability density

903: $p_{n+1}$ if we reject any proposed move, while the second term

904: is the contribution from accepting the proposed move from any possible

905: previous state.  If we demand that, for a chosen proposal matrix, the accept probability satisfies

906: \begin{equation}

907: \pi(C_{l}', \Bs' | \Bd) w(C_{l}, \Bs | C_{l}', \Bs', \Bd) A(C_{l}, \Bs | C_{l}', \Bs', \Bd)

908: = A(C_{l}', \Bs' | C_{l}, \Bs, \Bd) w(C_{l}', \Bs' | C_{l}, \Bs, \Bd) \pi(C_{l}, \Bs | \Bd)

909: \end{equation}

910: then we see that the MH MCMC algorithm satisfies stationarity, i.e. denoting

911: by $T \circ \pi$ the density resulting from one application of the transition matrix

912: to $\pi$, we have directly from detailed balance

913: \begin{eqnarray}

914: T \circ \pi

915: & = & \left( 1 - \int d(C_{l}', \Bs') \ A(C_{l}', \Bs' | C_{l}

916: , \Bs, \Bd) w(C_{l}', \Bs' | C_{l}, \Bs, \Bd) \right)

917: \pi(C_{l}, \Bs | \Bd) \nonumber \\

918: & & +

919: \int d(C_{l}', \Bs') \ A(C_{l}, \Bs | C_{l}', \Bs', \Bd) w(C_{l}, \Bs | C_{l}', \Bs', \Bd)

920: \pi(C_{l}', \Bs' | \Bd) \nonumber \\

921: & = & \left( 1 - \int d(C_{l}', \Bs') \ A(C_{l}', \Bs' | C_{l}

922: , \Bs, \Bd) w(C_{l}', \Bs' | C_{l}, \Bs, \Bd) \right)

923: \pi(C_{l}, \Bs | \Bd) \nonumber \\

924: & & + \pi (C_{l}, \Bs | \Bd)

925: \int d(C_{l}', \Bs') \ A(C_{l}', \Bs' | C_{l}, \Bs, \Bd) w(C_{l}', \Bs' | C_{l}, \Bs, \Bd)

926:  \nonumber \\

927: & = & \pi(C_{l}, \Bs | \Bd)

928: \end{eqnarray}

929:

930: We now turn to the case where our proposal is of the form

931: \begin{equation}

932: w(\Bs', C_{l}' | \Bs, C_{l}) = \delta \left[ \Bs' - F(\Bs, C_{l}', C_{l}) \right] w(C_{l}' | C_{l}, \Bd)

933: \end{equation}

934: where we randomly propose a new power spectrum, possibly in a manner conditionally

935: dependent on the current spectrum and the data, and then deterministically

936: compute a new CMB map with some function

937: \begin{equation}

938: \Bs' = F(\Bs, C_{l}', C_{l})

939: \end{equation}

940: To satisfy detailed balance with a non-vanishing accept probability

941: our function must satisfy

942: \begin{eqnarray}

943: \Bs' & = & F(\Bs, C_{l}', C_{l}) \nonumber \\

944: \Bs & = & F(\Bs', C_{l}, C_{l}')

945: \end{eqnarray}

946: or, that the inverse function is equivalent to interchanging the order

947: of the power spectrum arguments

948: \begin{equation}

949:  F(\Bs', C_{l}, C_{l}') = F^{-1}(\Bs',C_{l}', C_{l})

950: \end{equation}

951: In this paper, we have chosen one such function, given by

952: \begin{equation}

953: F(\Bs, C_{l}', C_{l}) = [\BC']^{1/2} [\BC]^{-1/2} \Bs

954: \end{equation}

955: where interchanging the spectra in the function above does in fact give

956: the inverse function itself.

957:

958: Our job now is to {\it derive} the accept probability such that we

959: satisfy stationarity (as discussed above).  For the proposal with deterministic

960: changes to some of the degrees of freedom, stationarity is satisfied if

961: \begin{eqnarray}

962: (T \circ \pi)(C_{l}, \Bs | \Bd)

963: & = & \left[ 1 - \int d(C_{l}'', \Bs'') \ A[C_{l}'', \Bs'' | C_{l}, \Bs]

964: \delta[\Bs'' - F(\Bs, C_{l}'', C_{l})] w(C_{l}'' | C_{l}, \Bd) \right] \pi(C_{l}, \Bs | \Bd) \nonumber \\

965: & & + \int d(C_{l}', \Bs') \ A[\Bs, C_{l} | \Bs', C_{l}'] \delta[\Bs - F(\Bs', C_{l}, C_{l}')]

966: w(C_{l} | C_{l}' , \Bd) \pi(C_{l}', \Bs' | \Bd) \nonumber

967: \end{eqnarray}

968: In order to determine the integral over the $\delta$-function in the

969: accept term above, we recall the identity for $\delta[G(\Bx)]$, where $G(\Ba) = 0$,

970: \begin{equation}

971: \delta[G(\Bx)] = \frac{\delta(\Bx - \Ba)}{\left| \partial G / \partial \Bx \right|_{\Ba} }

972: \end{equation}

973: In our case, we can identify

974: \begin{equation}

975: G(\Bs') = \Bs - F(\Bs', C_{l}, C_{l}')

976: \end{equation}

977: which vanishes at $\Bs' = F^{-1}(\Bs,C_{l}, C_{l}') = F(\Bs, C_{l}', C_{l})$.  We also have the Jacobian

978: \begin{equation}

979: \left| \frac{\partial G}{ \partial \Bs'} \right|_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')}

980: = \left| \frac{\partial F}{ \partial \Bs'} \right|_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')}

981: \end{equation}

982: (i.e. $G(\Bs')$ is considered a function of $\Bs'$ with the other CMB map

983: $\Bs$ considered fixed) which therefore gives

984: \begin{equation}

985: \delta[\Bs - F(\Bs', C_{l}, C_{l}')] =

986:  \delta[\Bs' - F^{-1}(\Bs, C_{l}, C_{l}')]

987: \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')}

988: \end{equation}

989: Inserting this into the condition for stationarity we have

990: \begin{eqnarray}

991: (T \circ \pi )(C_{l}, \Bs | \Bd)

992: & = & \left[ 1 - \int d(C_{l}'', \Bs'') \ A[C_{l}'', \Bs'' | C_{l}, \Bs]

993: \delta[\Bs'' - F(\Bs, C_{l}'', C_{l})] w(C_{l}'' | C_{l}, \Bd) \right] \pi(C_{l}, \Bs | \Bd) \nonumber \\

994: & & + \int d(C_{l}', \Bs') \ A[\Bs, C_{l} | \Bs', C_{l}']

995: \left( \delta[\Bs' - F^{-1}(\Bs, C_{l}, C_{l}')]

996: \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)

997: w(C_{l} | C_{l}' , \Bd) \pi(C_{l}', \Bs' | \Bd) \nonumber \\

998: & = & \left[ 1 - \int d(C_{l}'', \Bs'') \ A[C_{l}'', \Bs'' | C_{l}, \Bs]

999: \delta[\Bs'' - F(\Bs, C_{l}'', C_{l})] w(C_{l}'' | C_{l}, \Bd) \right] \pi(C_{l}, \Bs | \Bd) \nonumber \\

1000: & & + \int d(C_{l}', \Bs') \ A[\Bs, C_{l} | \Bs', C_{l}']

1001: \left( \delta[\Bs' - F(\Bs, C_{l}', C_{l})]

1002: \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)

1003: w(C_{l} | C_{l}' , \Bd) \pi(C_{l}', \Bs' | \Bd) \nonumber

1004: \end{eqnarray}

1005: where in the second line we again used the property that the inverse $F^{-1}$ is equivalent

1006: to $F$ with the spectra arguments interchanged.

1007: We see from the above that a sufficient condition for stationarity is

1008: \begin{equation}

1009: \pi(C_{l}, \Bs | \Bd) w(C_{l}' | C_{l} , \Bd)

1010: A[\Bs', C_{l}' | \Bs, C_{l}]

1011: = A[\Bs, C_{l} | \Bs', C_{l}']

1012: \left(  \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)

1013: w(C_{l} | C_{l}' , \Bd) \pi(C_{l}', \Bs' | \Bd)

1014: \end{equation}

1015: An accept probability which satisfies this condition therefore gives cancellation

1016: of the integrals over the $\delta$-functions for both the reject and accept

1017: contributions, leaving us exactly with $T \circ  \pi = \pi$.

1018: We therefore have the accept probability

1019: \begin{equation}

1020: A[\Bs', C_{l}' | \Bs, C_{l}] = \min \left[ 1,

1021: \frac{\pi(C_{l}', \Bs' | \Bd)}{\pi(C_{l}, \Bs | \Bd)}

1022: \frac{w(C_{l} | C_{l}', \Bd)}{w(C_{l}' | C_{l}, \Bd)}

1023: \left(  \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)

1024: \right]

1025: \end{equation}

1026: We give the expression above for the general case of any deterministic

1027: change in the CMB map with a function which satisfies $F(\Bs, C_{l}, C_{l}') = F^{-1}(\Bs, C_{l}', C_{l})$.

1028: We now explicitly evaluate this accept probability for the functional form chosen for this

1029: paper.

1030:

1031: Since we have $F(\Bs', C_{l}, C_{l}') = [\BC]^{1/2} [\BC']^{-1/2} \Bs'$, we have

1032: \begin{equation}

1033: \left(  \left| \frac{\partial F}{ \partial \Bs'} \right|_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)

1034: = \frac{|\BC|^{1/2}}{|\BC'|^{1/2}}

1035: \end{equation}

1036: Reminding the reader of the functional form of the joint posterior in

1037: eqn. \ref{eq:cmb_posterior}, we have the accept probability given by

1038: \begin{eqnarray}

1039: A[\Bs', C_{l}' | \Bs, C_{l}]

1040: & = &  \min \left[ 1,

1041: \frac{\pi(C_{l}', \Bs' | \Bd)}{\pi(C_{l}, \Bs | \Bd)}

1042: \frac{w(C_{l} | C_{l}', \Bd)}{w(C_{l}' | C_{l}, \Bd)}

1043: \left(  \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)

1044: \right] \nonumber \\

1045: & = &  \min \left[ 1,

1046: \frac{e^{- \chi^{2}(\Bs', \Bd)}}{e^{- \chi^{2}(\Bs, \Bd)}}

1047: \frac{e^{- \Bs' [\BC']^{-1} \Bs'}}{e^{- \Bs \BC^{-1} \Bs}}

1048: \frac{|\BC|^{1/2}}{|\BC'|^{1/2}}

1049: \frac{w(C_{l} | C_{l}', \Bd)}{w(C_{l}' | C_{l}, \Bd)}

1050: \left(  \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)

1051: \right] \nonumber \\

1052: & = &  \min \left[ 1,

1053: \frac{e^{- \chi^{2}(\Bs', \Bd)}}{e^{- \chi^{2}(\Bs, \Bd)}}

1054: \frac{e^{- \Bs' [\BC']^{-1} \Bs'}}{e^{- \Bs \BC^{-1} \Bs}}

1055: \frac{|\BC|^{1/2}}{|\BC'|^{1/2}}

1056: \frac{w(C_{l} | C_{l}', \Bd)}{w(C_{l}' | C_{l}, \Bd)}

1057: \left(  \frac{|\BC'|^{1/2}}{|\BC|^{1/2}} \right)

1058: \right] \nonumber \\

1059: & = &  \min \left[ 1,

1060: \frac{e^{- \chi^{2}(\Bs', \Bd)}}{e^{- \chi^{2}(\Bs, \Bd)}}

1061: \frac{w(C_{l} | C_{l}', \Bd)}{w(C_{l}' | C_{l}, \Bd)}

1062: \right]

1063: \label{eq:accept_prob}

1064: \end{eqnarray}

1065: where the last line follows from the invariance of the quadratic form

1066: under the functional mapping

1067: $\Bs' [\BC']^{-1} \Bs'  =  \Bs \BC^{-1} \Bs$.  Finally, we note that for the special

1068: case of a symmetric proposal matrix where $w(C_{l}' | C_{l}, \Bd) = w(C_{l} | C_{l}', \Bd)$, the

1069: accept probability is completely determined by the (exponentiated) change in $\chi^{2}$

1070: \begin{equation}

1071: A[\Bs', C_{l}' | \Bs, C_{l}] =

1072:   \min \left[ 1,

1073: \frac{e^{- \chi^{2}(\Bs', \Bd)}}{e^{- \chi^{2}(\Bs, \Bd)}} \right]

1074: \end{equation}

1075: As emphasized earlier in the main part of the text, the above allows large

1076: changes to the spectrum precisely where the signal to noise is getting small,

1077: as $\chi^{2}$ does not change much in this regime.

1078:

1079:

1080:

1081: \section{Relation to Gibbs Sampling in a Change of Variables}

1082: \label{app:cov}

1083: We note here another interesting approach to an MCMC algorithm in a

1084: {\it different set of variables} which in fact allows for large

1085: moves in the spectrum in the low signal to noise regime.  We define the

1086: CMB map

1087: \begin{equation}

1088: \Bx = \BC^{-1/2} \Bs

1089: \end{equation}

1090: We therefore have the joint posterior {\it in the new variables} according to

1091: \begin{equation}

1092: p(C_{l}, \Bs | \Bd) d(C_{l}, \Bs) =

1093: p(C_{l}, \Bx | \Bd) \left| \frac{\partial \Bs}{\partial \Bx} \right| d(C_{l}, \Bx)

1094: \end{equation}

1095: which is explicitly, up to a normalization constant

1096: \begin{equation}

1097: -2 \log p(C_{l}, \Bx | \Bd) = (\Bd - \BC^{1/2} \Bx) \BN^{-1}

1098: (\Bd - \BC^{1/2} \Bx) + \| \Bx \|^{2}

1099: \end{equation}

1100: Then {\it traditional Gibbs sampling in the new variables} leads to an accept

1101: probability when changing the spectrum given the change of variable map $\Bx$ as

1102: \begin{equation}

1103: A(C_{l}', \Bx | C_{l}, \Bx) =

1104: \min \left[ 1,

1105: \frac{e^{-(\Bd - [\BC']^{1/2} \Bx) \BN^{-1}(\Bd - [\BC']^{1/2} \Bx)}}

1106: {e^{-(\Bd - \BC^{1/2} \Bx) \BN^{-1}(\Bd - \BC^{1/2} \Bx)}}

1107: \frac{w(C_{l} | \Bx, C_{l}', \Bd)}{w(C_{l}' | \Bx, C_{l}, \Bd)} \right]

1108: \end{equation}

1109: where in the above the proposed variation in the spectrum can now be

1110: conditionally dependent on the current change of variable map $\Bx$.

1111: Assuming a symmetric proposal, or one conditionally independent of $\Bx$

1112: leads to an accept probability which is {\it numerically the same as

1113: Eq.~(\ref{eq:accept_prob})}, and also has the same property: large moves in the

1114: spectrum are possible in the low signal to noise regime.

1115: As a side note, we can see that $\log p(C_{l} | \Bx, \Bd)$

1116: is quadratic in $C_{l}^{1/2}$, which suggests a proposal

1117: given by a Gaussian in $C_{l}^{1/2}$.  However there are two problems with this

1118: scheme - sampling in $C_{l}^{1/2}$ will result in re-introducing a Jacobian

1119: factor given by the ratio of $|C'|^{1/2} / |C|^{1/2}$ which results typically

1120: in low acceptance probabilities, and furthermore we cannot afford to exactly

1121: compute the local ``Fisher'' covariance matrix for each $\Bx$.

1122: Because of these difficulties, we in general need to produce a proposal

1123: for $C_{l}$ and then compute the accept probability above.

1124:

1125: We emphasize an important distinction between MCMC with deterministic

1126: steps {\it in the original variables} $(C_{l}, \Bs)$ and Gibbs sampling

1127: in the change of variables $(C_{l}, \Bx)$.  It is only for the specific

1128: functional form that we have chosen for this paper that the numerical value of the

1129: accept probabilities for $A(C_{l}' , \Bs' | C_{l}, \Bs)$ and $A(C_{l}', \Bx | C_{l}, \Bx)$

1130: are the same.

1131:

1132: At first glance, it might appear

1133: that a random variation in some of the variables followed by a deterministic

1134: change in the complementary set is always equivalent to random variation

1135: in a new set of variables.  For notational convenience, we will assume the state

1136: space is separated into two sets of variables $(\Bx,\By)$, i.e. for the CMB sampling

1137: context we have $(\Bs, C_{l})$.  Now, to make the distinction between a change

1138: of variables and deterministic steps in MCMC more precise, consider a ``global'' change of variables

1139: of the form

1140: \begin{eqnarray}

1141: \Bu & = & F(\Bx,\By) \nonumber \\

1142: \Bv & = & \By

1143: \end{eqnarray}

1144: with Jacobian

1145: \begin{equation}

1146: \left| \begin{array}{cc}

1147: \frac{\partial \Bu}{\partial \Bx} & \frac{\partial \Bv}{\partial \Bx} \\

1148: \frac{\partial \Bu}{\partial \By} & \frac{\partial \Bv}{\partial \By} \end{array}

1149: \right| =

1150: \left| \begin{array}{cc}

1151: \frac{\partial F}{\partial \Bx} & 0 \\

1152: \frac{\partial F}{\partial \By} & \id \end{array}

1153: \right| = \left| \frac{\partial F}{\partial \Bx} \right|

1154: \end{equation}

1155: A Gibbs sampling step varying $\Bv$ with $\Bu$ fixed has accept probability

1156: \begin{eqnarray}

1157: A(\By_{n+1}, \Bu_{n} | \By_{n}, \Bu_{n})

1158: & = & \min \left[ 1,

1159: \frac{\pi(\By_{n+1} | \Bu_{n}, \Bd)}{\pi(\By_{n} | \Bu_{n}, \Bd)}

1160: \frac{w(\By_{n} | \Bu_{n}, \Bd)}{w(\By_{n+1} | \Bu_{n}, \Bd)} \right] \nonumber \\

1161: & = & \min \left[ 1,

1162: \frac{\pi(\By_{n+1}, \Bx_{n+1} | \Bd)}{\pi(\By_{n} , \Bx_{n} | \Bd)}

1163: \left( \left| \frac{\partial F}{\partial \Bx} \right|_{\Bx_{n+1}, \By_{n+1}}

1164: \left| \frac{\partial F}{\partial \Bx} \right|^{-1}_{\Bx_{n}, \By_{n}} \right)

1165: \frac{w(\By_{n} | \Bu_{n}, \Bd)}{w(\By_{n+1} | \Bu_{n}, \Bd)} \right]

1166: \end{eqnarray}

1167: where in the above we have the constraint

1168: \begin{eqnarray}

1169: \Bx_{n+1} & = & F^{-1}(\Bu_{n}, \By_{n+1}) \nonumber \\

1170: \Bx_{n} & = & F^{-1}(\Bu_{n}, \By_{n})

1171: \end{eqnarray}

1172: Now consider an MCMC step in the original variables of the form

1173: \begin{equation}

1174: w(\Bx_{n+1}, \By_{n+1} | \Bx_{n}, \By_{n }) = w(\By_{n+1} | \By_{n}, \Bx_{n}, \Bd)

1175: \delta \left( \Bx_{n+1} - H(\Bx_{n}, \By_{n+1}, \By_{n}) \right)

1176: \end{equation}

1177: with general accept probability, according to the discussion above

1178: \begin{eqnarray}

1179: A(\By_{n+1}, \Bx_{n+1} | \By_{n}, \Bx_{n}) & = & \min \left[ 1,

1180: \frac{\pi(\By_{n+1}, \Bx_{n+1} | \Bd)}{\pi(\By_{n}, \Bx_{n} | \Bd)}

1181: \frac{w(\By_{n} | \Bx_{n+1}, \By_{n+1}, \Bd)}{w(\By_{n+1} | \Bx_{n}, \By_{n}, \Bd)}

1182: \left( \left| \frac{\partial H}{\partial \Bx} \right|^{-1}_{\Bx_{n+1} = H^{-1}(\Bx_{n},\By_{n}, \By_{n+1})} \right)

1183: \right]

1184: \end{eqnarray}

1185: Interestingly enough this suggests that we can set $H$ to be the function

1186: \begin{equation}

1187: H(\Bx, \By_{n+1}, \By_{n}) = F^{-1} \left( F(\Bx, \By_{n}), \By_{n+1} \right)

1188: \end{equation}

1189: Does this function have the correct properties for its inverse?  Assuming we

1190: have computed in the forward direction $\Bx' = H(\Bx, \By_{n+1}, \By_{n})$, we can invert to find $\Bx$ by

1191: computing sequentially

1192: \begin{eqnarray}

1193: F(\Bx', \By_{n+1}) & = & F(\Bx, \By_{n}) \nonumber \\

1194: \Bx & = & F^{-1} \left( F(\Bx', \By_{n+1}), \By_{n} \right) \nonumber \\

1195: & \equiv & H(\Bx', \By_{n}, \By_{n+1})

1196: \end{eqnarray}

1197: where the last line follows from the definition of the forward $H$.

1198: Since we have, by definition

1199: \begin{eqnarray}

1200: \Bx' & = & H(\Bx, \By_{n+1}, \By_{n}) \nonumber \\

1201: \Bx & \equiv & H^{-1}(\Bx', \By_{n+1}, \By_{n}) \nonumber

1202: \end{eqnarray}

1203: we therefore have shown that

1204: \begin{equation}

1205: H^{-1}(\Bx', \By_{n+1}, \By_{n}) = H(\Bx', \By_{n}, \By_{n+1})

1206: \end{equation}

1207: as required for a non-vanishing accept probability.

1208: The above as a function of $\Bx$ has Jacobian

1209: \begin{eqnarray}

1210: \left| \frac{\partial H}{\partial \Bx} \right|

1211: & = & \left| \frac{\partial F^{-1}}{\partial \Bu}  \right|_{( \Bu(\Bx, \By_{n}), \By_{n+1})}

1212: \  \left| \frac{\partial F}{\partial \Bx}  \right|_{(\Bx,\By_{n})} \nonumber \\

1213: & = & \left| \frac{\partial F}{\partial \Bx}  \right|^{-1}_{( \Bx, \By_{n+1})}

1214: \  \left| \frac{\partial F}{\partial \Bx}  \right|_{(\Bx,\By_{n})}

1215: \end{eqnarray}

1216: However, when evaluated at $\Bx_{n+1} = H^{-1}(\Bx_{n}, \By_{n}, \By_{n+1})$, we will not

1217: in general satisfy the equality required for numerical equivalence

1218: \begin{equation}

1219: \left( \left| \frac{\partial H}{\partial \Bx} \right|^{-1}_{\Bx_{n+1} = H^{-1}(\Bx_{n},\By_{n}, \By_{n+1})} \right)

1220: \neq  \left( \left| \frac{\partial F}{\partial \Bx} \right|_{\Bx_{n+1}, \By_{n+1}}

1221: \left| \frac{\partial F}{\partial \Bx} \right|^{-1}_{\Bx_{n}, \By_{n}} \right)

1222: \label{eq:jacobian_identity}

1223: \end{equation}

1224: So in general, while we can use any function $F(\Bx,\By)$ to generate deterministic

1225: moves in the original variables within MCMC, this is not equivalent to

1226: a Gibbs sampling step $p(\By_{n+1} | \Bu_{n}, \Bd)$ in the new variables using $(F(\Bx,\By), \By)$ as a global change of variables.

1227:

1228: However, using the above construction for

1229: the CMB change of variables, we have explicitly

1230: \begin{eqnarray}

1231: F^{-1} \left( F(\Bs, C_{l}), C_{l}' \right) & = &

1232: [\BC']^{1/2} \left( \BC^{-1/2} \Bs \right)

1233: \end{eqnarray}

1234: which is exactly the functional form used for the deterministic MCMC steps.  In this

1235: case, it is because the Jacobian of our deterministic change in the

1236: CMB map is independent of the current CMB map $\Bs$ (and only dependent

1237: on the proposed and current spectra) that we have numerical

1238: equivalence of the accept probabilities.

1239:

1240: So in summary, while we can use any mapping $F(\Bx,\By)$ to generate deterministic

1241: steps for use in MCMC, the accept probability is not equivalent to a conditional

1242: step $p(\By | \Bu, \Bd)$ using $F(\Bx,\By)$ in a change of variables due to the general

1243: ``location'' dependence of the Jacobian.  Furthermore, setting

1244: $H(\Bx, \By', \By) = F^{-1} ( F(\Bx,\By), \By')$ is not the most general form

1245: for a function that satisfies the detailed balance requirement

1246: $H^{-1}(\Bx, \By', \By) = H(\Bx, \By, \By')$.

1247: In this sense then, a change of variables as an approach to more efficiently generating

1248: samples from a probability density is distinct from a strategy of designing an MCMC

1249: algorithm (in any chosen representation of the variables) with deterministic

1250: changes of some of the degrees of freedom.  Both approaches are interesting, and

1251: advances in either approach for Bayesian CMB analysis could lead to improvements over

1252: the approach presented in this paper.

1253:

1254:

1255:

1256:

1257:

1258: \begin{thebibliography}{}

1259:

1260: \bibitem[Abramowitz \& Stegun(1972)]{abramowitz:1972} Abramowitz, M.,

1261:   \& Stegun, I.~A.\ 1972, Handbook of Mathematical Functions, New

1262:   York: Dover

1263:

1264: \bibitem[Bennett et al.(2003)]{bennett:2003} Bennett, C. L., et al.\

1265: 2003, \apjs, 148, 1

1266:

1267: \bibitem[Chu et al.(2005)]{chu:2005} Chu, M., Eriksen, H.~K., Knox,

1268:   L., G{\'o}rski, K.~M., Jewell, J.~B., Larson, D.~L., O'Dwyer, I.~J.,

1269:   \& Wandelt, B.~D.\ 2005, \prd, 71, 103002

1270:

1271: % First method paper

1272: \bibitem[Eriksen et al.(2004)]{eriksen:2004}

1273: Eriksen, H.~K., et al.\ 2004, \apjs, 155, 227

1274:

1275: % WMAP3 reanalysis paper

1276: \bibitem[Eriksen et al.(2007a)]{eriksen:2007a} Eriksen, H.~K., et al.\

1277: 2007a, \apj, 656, 641

1278:

1279: % WMAP3 polarization paper

1280: \bibitem[Eriksen et al.(2007b)]{eriksen:2007b} Eriksen, H.~K., Huey,

1281: G., Banday, A.~J., G{\'o}rski, K.~M., Jewell, J.~B., O'Dwyer, I.~J.,

1282: \& Wandelt, B.~D.\ 2007b, \apjl, 665, L1

1283:

1284: % Foreground method paper

1285: \bibitem[Eriksen et al.(2008a)]{eriksen:2008a} Eriksen, H.~K., Jewell,

1286: J.~B., Dickinson, C., Banday, A.~J., G{\'o}rski, K.~M.,

1287: \& Lawrence, C.~R.\ 2008a, \apj, 676, 10

1288:

1289: % WMAP3 foreground analysis

1290: \bibitem[Eriksen et al.(2008b)]{eriksen:2008b} Eriksen, H.~K.,

1291: Dickinson, C., Jewell, J.~B., Banday, A.~J., G{\'o}rski, K.~M.,

1292: \& Lawrence, C.~R.\ 2008b, \apjl, 672, L87

1293:

1294: \bibitem[Eriksen \& Wehus(2008)]{wehus:2008} Eriksen, H.~K. \&

1295:   Wehus, I.~K.\ 2008, \apjs, submitted, [astro-ph/XXXXXX]

1296:

1297: \bibitem[Gelman \& Rubin(1992)]{gelman:1992}

1298: Gelman, A., \& Rubin, D. 1992, Stat. Sci., 7, 457

1299:

1300: \bibitem[Gold et al.(2008)]{gold:2008} Gold, B., et al.\ 2008,

1301: [arXiv:0803.0715]

1302:

1303: \bibitem[G{\'o}rski et al.(2005)]{gorski:2005}

1304:   G{\' o}rski, K.~M., Hivon, E., Banday, A.~J., Wandelt, B.~D.,

1305:   Hansen, F.~K., Reinecke, M., \& Bartelmann, M. 2005, \apj, 622, 759

1306:

1307: \bibitem[Green(1995)]{Green:1995}

1308: Green, P. 1995, Biometrika, 82, 711--732

1309:

1310: \bibitem[Gupta \& Nagar(2000)]{gupta:2000}

1311:   Gupta, A.~K. \& Nagar, D.~K. 2000, Matrix Variate Distributions

1312:

1313: \bibitem[Hinshaw et al.(2007)]{hinshaw:2007} Hinshaw, G., et al.\

1314: 2007, \apjs, 170, 288

1315:

1316: \bibitem[Hinshaw et al.(2008)]{hinshaw:2008} Hinshaw, G., et al.\

1317: 2008, ApJ, submitted, [arXiv:0803.0732]

1318:

1319: \bibitem[Hivon et al.(2002)]{hivon:2002} Hivon, E., G{\' o}rski,

1320:   K.~M., Netterfield, C.~B., Crill, B.~P., Prunet, S., \& Hansen, F.\

1321:   2002, \apj, 567, 2

1322:

1323: \bibitem[Jewell et al.(2002)]{jewell:2002}

1324:   Jewell, J., Levin, S., \& Anderson, C.~H.\ 2002, [astro-ph/0209560v1]

1325:

1326: \bibitem[Jewell et al.(2004)]{jewell:2004}

1327:   Jewell, J., Levin, S., \& Anderson, C.  H.  2004, \apj, 609, 1

1328:

1329: \bibitem[Komatsu et al.(2008)]{komatsu:2008} Komatsu, E., et al.\

1330: 2008, [arXiv:0803.0547]

1331:

1332: \bibitem[Larson et al.(2007)]{larson:2007} Larson, D.~L., Eriksen,

1333: H.~K., Wandelt, B.~D., G{\'o}rski, K.~M., Huey, G., Jewell, J.~B., \&

1334: O'Dwyer, I.~J.\ 2007, \apj, 656, 653

1335:

1336: \bibitem[Liu(2001)]{liu:2001} Liu, J.~S.\ 2001, Monte Carlo Strategies in

1337:   Scientific Computing (Cambridge, USA: Springer)

1338:

1339: \bibitem[O'Dwyer et al.(2004)]{odwyer:2004} O'Dwyer, I.~J., et al.\

1340: 2004, \apjl, 617, L99

1341:

1342: \bibitem[Page et al.(2007)]{page:2007} Page, L., et al.\ 2007,

1343: \apjs, 170, 335

1344:

1345: \bibitem[Seljak \& Zaldarriaga(1996)]{seljak:1996} Seljak, U., \&

1346:   Zaldarriaga, M.\ 1996, \apj, 469, 437

1347:

1348: \bibitem[Smoot et al.(1992)]{smoot:1992} Smoot, G.~F., et al.\

1349: 1992, \apjl, 396, L1

1350:

1351: \bibitem[Sokal(1989)]{Sokal:1989}

1352: Sokal, A.~D.\ 1989, ``Monte Carlo methods in statistical mechanics: foundations

1353: and new algorithms'', \emph{Cours de Troisi\`{e}me Cycle de la Physique en Suisse Romande},

1354: Lausanne

1355:

1356: \bibitem[Taylor et al.(2007)]{taylor:2007} Taylor, J.~F., Ashdown,

1357: M.~A.~J., \& Hobson, M.~P.\ 2007, MNRAS, submitted, [arXiv:0708.2989]

1358:

1359: \bibitem[Wandelt et al.(2004)]{wandelt:2004}

1360:   Wandelt, B.~D., Larson, D.~L., \& Lakshminarayanan, A.\ 2004, \prd,

1361:   70, 083511

1362:

1363: \bibitem[Zaldarriaga \& Seljak(1997)]{zaldarriaga:1997}

1364: Zaldarriaga, M., \& Seljak, U.\ 1997, \prd, 55, 1830

1365:

1366: \end{thebibliography}

1367:

1368:

1369: \end{document}