0807.0624/ms.tex
1: \documentclass{emulateapj}
2: %\documentclass[12pt, preprint]{aastex}
3: 
4: \usepackage{float}
5: \usepackage{amsmath}
6: \usepackage{epsfig,floatflt}
7: \usepackage{subfigure}
8: 
9: \newcommand{\npix}{N_{\textrm{pix}}}
10: \newcommand{\BA}{\mathbf{A}}
11: \newcommand{\BB}{\mathbf{B}}
12: \newcommand{\BC}{\mathbf{C}}
13: \newcommand{\Ba}{\mathbf{a}}
14: \newcommand{\Bb}{\mathbf{b}}
15: \newcommand{\Bc}{\mathbf{c}}
16: \newcommand{\Bs}{\mathbf{s}}
17: \newcommand{\BS}{\mathbf{S}}
18: \newcommand{\Bn}{\mathbf{n}}
19: \newcommand{\BN}{\mathbf{N}}
20: \newcommand{\Bm}{\mathbf{m}}
21: \newcommand{\Bf}{\mathbf{f}}
22: \newcommand{\BF}{\mathbf{F}}
23: \newcommand{\Bd}{\mathbf{d}}
24: \newcommand{\BW}{\mathbf{W}}
25: \newcommand{\BP}{\mathbf{P}}
26: \newcommand{\id}{\mathbf{1}}
27: \newcommand{\Bx}{\mathbf{x}}
28: \newcommand{\By}{\mathbf{y}}
29: \newcommand{\Br}{\mathbf{r}}
30: \newcommand{\Bu}{\mathbf{u}}
31: \newcommand{\Bv}{\mathbf{v}}
32: \newcommand{\Bt}{\mathbf{t}}
33: \newcommand{\BD}{\mathbf{D}}
34: \newcommand{\BU}{\mathbf{U}}
35: \newcommand{\BM}{\mathbf{M}}
36: \newcommand{\BT}{\mathbf{T}}
37: \newcommand{\BG}{\mathbf{G}}
38: \newcommand{\BPi}{\mathbf{\Pi}}
39: \newcommand{\Cell}{C_{\ell}}
40: \newcommand{\muK}{\mu\textrm{K}}
41: 
42: 
43:     \newtheorem{theorem}{Theorem}[section]
44:     \newtheorem{lemma}[theorem]{Lemma}
45:     \newtheorem{proposition}[theorem]{Proposition}
46:     \newtheorem{corollary}[theorem]{Corollary}
47: 
48:     \newenvironment{proof}[1][Proof]{\begin{trivlist}
49:     \item[\hskip \labelsep {\bfseries #1}]}{\end{trivlist}}
50:     \newenvironment{definition}[1][Definition]{\begin{trivlist}
51:     \item[\hskip \labelsep {\bfseries #1}]}{\end{trivlist}}
52:     \newenvironment{example}[1][Example]{\begin{trivlist}
53:     \item[\hskip \labelsep {\bfseries #1}]}{\end{trivlist}}
54:     \newenvironment{remark}[1][Remark]{\begin{trivlist}
55:     \item[\hskip \labelsep {\bfseries #1}]}{\end{trivlist}}
56: 
57:     \newcommand{\qed}{\nobreak \ifvmode \relax \else
58:           \ifdim\lastskip<1.5em \hskip-\lastskip
59:           \hskip1.5em plus0em minus0.5em \fi \nobreak
60:           \vrule height0.75em width0.5em depth0.25em\fi}
61: 
62: 
63: 
64: \begin{document}
65: 
66: 
67: \title{A Markov Chain Monte Carlo Algorithm for analysis of low
68:   signal-to-noise CMB data} 
69: 
70: \author{J. B.  Jewell\altaffilmark{1, 3, 4}, H.\ K.\
71:   Eriksen\altaffilmark{2,5,6}, B.\ D.\ Wandelt\altaffilmark{7,8}, 
72:  I.\ J.\ O'Dwyer\altaffilmark{3},  Greg Huey\altaffilmark{3}, and K. M.
73:   G\'{o}rski\altaffilmark{3,4,9}}
74: 
75: \altaffiltext{1}{email: Jeffrey.B.Jewell@jpl.nasa.gov}
76: \altaffiltext{2}{email: h.k.k.eriksen@astro.uio.no}
77: 
78: \altaffiltext{3}{Jet Propulsion Laboratory, 4800 Oak
79:   Grove Drive, Pasadena CA 91109} 
80: 
81: \altaffiltext{4}{California Institute of Technology, Pasadena, CA
82:   91125} 
83: 
84: \altaffiltext{5}{Institute of Theoretical Astrophysics, University of
85: Oslo, P.O.\ Box 1029 Blindern, N-0315 Oslo, Norway}
86: 
87: \altaffiltext{6}{Centre of
88: Mathematics for Applications, University of Oslo, P.O.\ Box 1053
89: Blindern, N-0316 Oslo}
90: 
91: \altaffiltext{7}{Department of Physics, University of Illinois,
92:   Urbana, IL 61801}
93: 
94: \altaffiltext{8}{Astronomy Department, University of Illinois at
95:   Urbana-Champaign, IL 61801-3080}
96: 
97: \altaffiltext{9}{Warsaw University Observatory, Aleje Ujazdowskie 4, 00-478 Warszawa,
98:   Poland}
99: 
100: 
101: \date{Received - / Accepted -}
102: 
103: \begin{abstract}
104:   We present a new Markov Chain Monte Carlo algorithm for CMB analysis
105:   in the low signal-to-noise regime. This method builds on and
106:   complements the previously described CMB Gibbs sampler, and
107:   effectively solves the low signal-to-noise inefficiency problem of
108:   the direct Gibbs sampler. The new algorithm is a simple
109:   Metropolis-Hastings sampler with a general proposal rule for the
110:   power spectrum, $C_{\ell}$, followed by a particular deterministic
111:   rescaling operation of the sky signal, $\mathbf{s}$. The acceptance
112:   probability for this joint move depends on the sky map only through
113:   the difference of $\chi^2$'s between the original and proposed sky
114:   sample, which is close to zero in the low signal-to-noise regime.
115:   The algorithm is completed by alternating this move with a standard
116:   Gibbs move. Together, these two proposals constitute a
117:   computationally efficient algorithm for mapping out the full joint
118:   CMB posterior, both in the high and low signal-to-noise regimes.
119: \end{abstract}
120: 
121: \keywords{cosmic microwave background --- cosmology: observations --- 
122: methods: numerical}
123: 
124: \maketitle
125: 
126: \section{Introduction}
127: 
128: Since the detection of anisotropy in the Cosmic Microwave Background
129: (CMB; Smoot et al.\ 1992), there has been an emphasis on likelihood or Bayesian methods
130: for the inference of cosmological parameters and their error bars, or
131: more generally, their confidence intervals. CMB analysis is most
132: suitably addressed in a Bayesian, as opposed to frequentist,
133: framework, simply because the observed microwave sky is interpreted as
134: a single realization of a spatial random process. 
135: 
136: Early measurements of the CMB were limited to signal to noise ratios
137: of order unity at relatively low angular scales, where direct
138: evaluation of the likelihood for the power spectrum or cosmological
139: parameters is possible. However, the ${\cal O}(N^{3})$ scaling of
140: computational expense with pixel number $N$ prohibits direct
141: likelihood evaluation for current and future CMB observations.
142: Motivated by the scientific potential of CMB data with increasingly
143: high spatial resolution, yet beset with systematics including partial
144: sky coverage and foregrounds, an iterative method of sampling from the
145: Bayes posterior, using a special case of Markov Chain Monte Carlo
146: (MCMC) known as Gibbs sampling, was introduced by \citet{jewell:2002,jewell:2004}.
147: The method was later independently discovered and applied to COBE data
148: \citep{wandelt:2004}, numerically extended to high-resolution on the
149: sphere \citep{eriksen:2004}, applied to analysis of the WMAP
150: \citep{bennett:2003, hinshaw:2007, page:2007} data \citep{odwyer:2004,
151:   eriksen:2007a, eriksen:2007b}, as well as generalized to include
152: inference of foreground model parameters \citep{eriksen:2008a,
153:   eriksen:2008b}.
154: 
155: While Gibbs sampling provably converges to the Bayes posterior over
156: the entire range of angular scales probed by the data, the run-time
157: required to generate enough independent samples at the low
158: signal-to-noise, small angular scale regime was found to be
159: prohibitive \citep{eriksen:2004}. The reason for this is that typical
160: variations in the power spectrum from one sample to the next are
161: determined by cosmic variance alone, whereas the posterior itself is
162: given by both cosmic variance and noise.  This results in a long
163: correlation length in the sequence of spectra in the low signal to
164: noise regime, thus requiring a very long run time to generate a
165: sufficient number of independent samples.  
166: 
167: In this paper we generalize the original Gibbs sampling algorithm to
168: include a new type of MCMC step alternating with standard Gibbs
169: sampling, which solves this problem of slow probabilistic convergence
170: in the low signal to noise regime.  This method therefore makes
171: possible an exact Bayesian approach to CMB analysis over the entire
172: range of angular scales probed by current and future experiments.
173: 
174: The paper is organized as follows. We first review the CMB Gibbs
175: sampler, and describe the associated numerical difficulties in
176: analysis at small angular scales.  We then introduce the new MCMC step
177: to the Markov chain, designed specifically to allow large variations
178: in the high-$\ell$ CMB spectrum, precisely where the signal to noise
179: is $\le 1$. We derive the required Metropolis-Hastings acceptance
180: probability, and demonstrate its correctness, in Appendix \ref{app:proof}, and numerically
181: demonstrate the method in Section \ref{sec:simulations}, for both
182: temperature and polarization. Finally, we summarize and conclude in
183: Section \ref{sec:conclusions}.
184: 
185: 
186: \section{Review of Gibbs Sampling}
187: 
188: \subsection{The Joint Posterior}
189: 
190: We begin by assuming that the observed data may be modelled by a
191: signal and a noise term,
192: \begin{equation}
193: \Bd = \BA \Bs + \Bn,
194: \end{equation}
195: where $\Bd$ is a vector containing the data (at every pointing of the
196: detectors), the matrix $\BA$ involves both pointing and beam
197: convolution (and where for this paper we will assume symmetric beams
198: and neglect the details of this operation), and $\Bn$ is additive
199: noise (here in the pixel domain).  We assume both the CMB signal and
200: noise to be Gaussian random fields with vanishing mean and covariance
201: matrices $\BS$ and $\BN$, respectively. In harmonic space, where $\Bs
202: = \sum_{\ell, m} a_{\ell m} Y_{\ell m}$, the CMB temperature
203: covariance matrix is given by $\textrm{C}_{\ell m, \ell' m'} = \langle
204: a_{\ell m}^* a_{\ell' m'}\rangle = C_{\ell} \delta_{\ell \ell'}
205: \delta_{m m'}$, $\Cell$ being the angular power spectrum.  A
206: generalization to polarization merely requires the replacement of the
207: signal matrix diagonal elements with $3 \times 3$ matrices of the form
208: \begin{equation}
209: \BC_{l} = \left[ \begin{array}{ccc}
210: C_{l}^{TT} & C_{l}^{TE} & C_{l}^{TB} \\
211: C_{l}^{ET} & C_{l}^{EE} & C_{l}^{EB} \\
212: C_{l}^{BT} & C_{l}^{BE} & C_{l}^{BB} \end{array} \right]
213: \end{equation}
214: For the discussion in this section, we focus on the temperature case,
215: but note that the generalization to polarization is straightforward
216: and discussed by \citet{larson:2007}.
217: 
218: Given these assumptions, our goal is to quantify what has been learned
219: about the underlying power spectrum of the CMB given the data, or how
220: well the data constrain the cosmological parameters.  One proceeds
221: then, in a Bayesian framework, by writing down the posterior given the
222: data,
223: \begin{equation}
224: P(C_{\ell} | \Bd) \propto \mathcal{L}(\Bd | C_{\ell} ) P(C_{\ell}).
225: \end{equation}
226: Here $\mathcal{L}(\Bd | C_{\ell})$ is the likelihood and $P(C_{\ell})$
227: is a prior on $C_{\ell}$. 
228: %HKE: The next sentence didn't feel very natural to me, given how the next
229: %sentence is formulated. Better to skip it, I think, so it's commented
230: %out for now:
231: %It is important to remember, as will be seen
232: %in what follows, that the likelihood is something to be derived in the
233: %context of our data model and assumptions about the signal and noise
234: %processes.
235: 
236: In order to derive the functional form of the likelihood, one imagines
237: randomly choosing any relevant model [here a power spectrum drawn from
238: $P(C_{\ell})$], and asks what sequence of effects needs to be modeled
239: in order to simulate the data. Here, simulation is understood as
240: conditioning on the chosen model, and leads to a joint density
241: \begin{eqnarray}
242: P(\Bd,\Bs,C_{\ell}) & = & P(\Bd,\Bs | C_{\ell}) P(C_{\ell}) \nonumber \\
243: & = & P(\Bd | \Bs) P(\Bs | C_{\ell})  P(C_{\ell})
244: \end{eqnarray}
245: where the last line follows directly from our data model through the
246: assumption of additive noise. Specifically, the factors in the above
247: are
248: \begin{eqnarray}
249: -2 \log P(\Bs | C_{\ell} ) & = & \Bs^{t} \BC^{-1} \Bs + \log |\BC| \nonumber \\
250: -2 \log P(\Bd | \Bs) & = & (\Bd-\Bs)^{t} \BN^{-1}(\Bd-\Bs) + \log |\BN|
251: \end{eqnarray}
252: which follow from the assumption that both the signal and noise are
253: independent Gaussian processes.
254: 
255: The idea of a ``simulation chain'' provides a conceptually
256: clear approach to constructing a joint density, from which we
257: immediately have the Bayesian posterior
258: \begin{equation}
259: P(C_{\ell} | \Bd) = \int d\Bs \ P(C_{\ell}, \Bs | \Bd)
260: \end{equation}
261: The relevance of the above for this paper lies in relating what we
262: refer to as the {\it joint posterior}, $P(C_{\ell}, \Bs | \Bd)$, and
263: the more familiar likelihood $\mathcal{L}(\Bd | C_{\ell}) \propto
264: P(C_{\ell} | \Bd) / P(C_{\ell})$.
265: 
266: Although we can analytically compute the integral of the joint
267: posterior over the signal for the Gaussian signal and noise processes
268: considered here, and therefore simply write down the functional form
269: of the likelihood, it is too expensive to evaluate it for any
270: specified $C_{l}$ given high-resolution data. Furthermore, for more
271: complicated data models (i.e. including foreground model
272: uncertainties) we will not be able to perform the integrals over the
273: additional degrees of freedom.  Both situations then instead motivate
274: sampling from the joint posterior, and thereby generating samples from
275: $P(C_{\ell} | \Bd)$ without ever evaluating $P(C_{\ell} | \Bd)$.  We
276: now discuss the original Gibbs sampling approach proposed and
277: implemented by \citet{jewell:2004}, \citet{wandelt:2004} and
278: \citet{eriksen:2004}, and then introduce a new MCMC step which
279: directly addresses the previously reported slow probabilistic
280: convergence in the low signal to noise regime \citep{eriksen:2004}.
281: 
282: \subsection{The CMB Gibbs sampler}
283: \label{sec:cmb_sampling}
284: 
285: As stated above, our goal is to sample from the joint posterior,
286: \begin{equation}
287:   - 2  \log P(\Bs, C_{\ell}|\Bd) =
288:   \chi^{2}(\Bs, \Bd) + 
289:   \Bs^{t} \BS^{-1} \Bs + \log | \BS| 
290:   - 2 \log P(C_{\ell}).
291: \label{eq:cmb_posterior}
292: \end{equation}
293: For notational convenience, we have here dropped constant factors of
294: $2\pi$, and also defined
295: \begin{equation}
296: \chi^{2}(\Bs, \Bd) = (\Bd - \Bs)^{t} \BN^{-1}(\Bd -\Bs).
297: \end{equation}
298: One approach to sample from this posterior is to use an algorithm
299: known as Gibbs sampling, where we can alternately sample from the
300: respective conditional densities,
301: \begin{align}
302: \Bs^{i+1} &\leftarrow P(\Bs | C_{\ell}^i, \Bd) \\
303: C_{\ell}^{i+1} &\leftarrow P(C_{\ell} | \Bs^{i+1}, \Bd).
304: \end{align}
305: Here $\leftarrow$ indicates sampling from the distribution on the
306: right-hand side. After some burn-in period, during which all samples
307: must be discarded, the joint samples $(\Bs^i, C_{\ell}^i)$ will be
308: drawn from the desired density. Thus, the problem is reduced to that
309: of sampling from the two \emph{conditional} densities $P(\Bs |
310: C_{\ell}, \Bd)$ and $P(C_{\ell} | \Bs, \Bd)$.
311: 
312: We now describe the sampling algorithms for each of these two
313: conditional distributions, starting with $P(C_{\ell} | \Bs, \Bd)$.
314: First, note that $P(C_{\ell} | \Bs, \Bd) = P(C_{\ell} | \Bs)$ which
315: follows directly from the construction of the joint density of
316: ``everything'' above.  This is also intuitively easy to understand
317: since if we already know the CMB sky signal, the data themselves tell
318: us nothing new about the CMB power spectrum. Next, since the sky is
319: assumed to be Gaussian and isotropic, the distribution reads
320: \begin{equation}
321: P(C_{\ell} | \Bs) \propto P(C_{\ell}) \frac{e^{-\frac{1}{2}
322:     \Bs_{\ell}^{t}\BS_{\ell}^{-1}\Bs_{\ell}}}{\sqrt{|\BS_{\ell}|}} =    
323: P(C_{\ell})
324: \frac{e^{-\frac{2\ell+1}{2} \frac{\sigma_{\ell}}{C_{\ell}}}}{C_{\ell}^{\frac{2\ell+1}{2}}},
325: \end{equation}
326: which, when interpreted as a function of $C_{\ell}$, is known as the
327: inverse Gamma distribution. In this expression, $\sigma_{\ell} =
328: \frac{1}{2\ell+1} \sum_{m} |a_{\ell m}|^2$ denotes the observed power spectrum
329: of $\Bs$. Fortunately, there exists a simple textbook sampling
330: algorithm for this distribution \citep[e.g.,][]{gupta:2000}, and we
331: refer the interested reader to the previous papers for details. For an
332: alternative, and more flexible, sampling algorithm, see
333: \citet{wehus:2008}.
334: 
335: In order to describe the sky signal sampling step, we first define the
336: mean-field map (or Wiener filtered data) to be $\hat{\Bs} = (\BS^{-1}
337: + \BN^{-1})^{-1} \BN^{-1} \Bd$, and note that the conditional sky
338: signal density given the data and $C_{l}$ can be written as
339: \begin{align}
340: P(\Bs | C_{\ell}, \Bd) &\propto e^{-\frac{1}{2} (\Bs-\hat{\Bs})^t (\BS^{-1} + \BN^{-1}) (\Bs-\hat{\Bs})}.
341: \end{align}
342: Thus, $P(\Bs | C_{\ell}, \Bd)$ is a Gaussian distribution with mean
343: equal to $\hat{\Bs}$ and a covariance matrix equal to $(\BS^{-1} +
344: \BN^{-1})^{-1}$.
345: 
346: Sampling from this Gaussian distribution is straightforward, but
347: computationally somewhat cumbersome. First, draw two random white
348: noise maps $\omega_0$ and $\omega_1$ with zero mean and unit
349: variance. Then solve the equation
350: \begin{equation}
351: \left[\BS^{-1} + \BN^{-1}\right] \Bs = \BN^{-1}\Bd + \BS^{-\frac{1}{2}} \omega_0 +
352: \BN^{-\frac{1}{2}} \omega_1.
353: \label{eq:lin_sys}
354: \end{equation}
355: for $\Bs$. Since the white noise maps have zero mean, one immediately
356: sees that $\langle \Bs \rangle = \hat{\Bs}$, while a few more
357: calculations show that $\langle (\Bs - \hat{\Bs}) (\Bs - \hat{\Bs})^{t} \rangle = (\BS^{-1} +
358: \BN^{-1})^{-1}$. 
359: 
360: The problematic part about this sampling step is the solution of the
361: linear system in Equation \ref{eq:lin_sys}. Since this is a $\sim10^6
362: \times 10^6$ system for current CMB data sets, it cannot be solved by
363: brute force. Instead, one must use a method called Conjugate Gradients
364: (CG), which only requires multiplication of the coefficient matrix on
365: the left-hand side, not inversion. For details on these computations,
366: together with some ideas on preconditioning, see \citet{eriksen:2004}.
367: 
368: 
369: \subsection{Convergence issues in the low signal-to-noise regime}
370: 
371: As originally applied to high-resolution CMB data, the Gibbs sampling
372: algorithm as described above has very slow convergence at the
373: high-$\ell$, low signal-to-noise part of the spectrum.  The reason for
374: the slow convergence is easy to understand in light of the above: When
375: sampling from $P(C_{\ell} | \Bs)$, the typical step size is given by
376: cosmic variance at all angular scales. In the high signal-to-noise
377: regime, cosmic variance dominates the noise variance, and we are able
378: to explore the full width of the posterior in only a few Gibbs
379: iterations. However, in the low signal-to-noise end, cosmic variance
380: is far smaller than the posterior variance, and it takes a
381: prohibitively long time to converge probabilistically.  This problem
382: of ``slow mixing'' of the Gibbs sampler is illustrated in figures
383: \ref{fig:TT_trace_plots} and \ref{fig:TT_correlation_length}.  The
384: long correlation length starting at signal-to-noise of unity leads to
385: extremely long run times in order to produce a reasonable number of
386: uncorrelated samples.
387: 
388: 
389: \section{A Low Signal-to-Noise MCMC Sampler}
390: 
391: When sampling from the true posterior, the goal is to produce as many
392: independent samples from $P(C_{\ell}, \Bs | \Bd)$ as possible.  One might
393: intuitively guess that it should be straightforward to establish good
394: approximations to the posterior in the low signal-to-noise regime,
395: since in the limit of vanishing signal to noise we simply recover the
396: prior. This suggests that we look for a sampling scheme in which we
397: first sample a new spectrum from some approximation to the true
398: posterior independent of the current spectrum and CMB map, followed by
399: sampling the CMB map from the conditional $P(\Bs|C_{\ell}, \Bd)$. The
400: problem with such a direct scheme is that the accept probability will
401: involve a ratio of determinants which are too expensive to compute.
402: 
403: We are therefore motivated to look for a sampling scheme in which we
404: can make a large variation in $C_{\ell}$ in the low signal-to-noise
405: regime, and make an associated {\it deterministic change} in the CMB
406: map, while still maintaining a reasonably high acceptance rate. The
407: motivation for a deterministic change is that it will avoid
408: introducing ratios of determinants which we cannot compute. 
409: 
410: \subsection{Proposal rule and acceptance probability}
411: 
412: Assume that we have defined a deterministic
413: sampling scheme for $\Bs$, and that our new CMB map is given by some
414: function
415: \begin{equation}
416: \Bs_{n+1} = F(\Bs_{n}, C_{\ell}^{(n+1)}, C_{\ell}^{(n)} ).
417: \end{equation}
418: Then the condition of detailed balance for our MCMC
419: sampler requires that
420: \begin{equation}
421: F^{-1}(\Bs_{n+1}, C_{\ell}^{(n+1)}, C_{\ell}^{(n)}) = F(\Bs_{n+1}, C_{\ell}^{(n)}, C_{\ell}^{(n+1)}),
422: \end{equation}
423: or, in other words, that the inverse function is given by exchanging
424: the order of the spectra in the function $F$. One simple function which has this property is
425: \begin{equation}
426: \Bs_{n+1} = \left(\frac{C_{\ell}^{(n+1)}}{C_{\ell}^{(n)}}\right)^{\frac{1}{2}} \Bs_{n}
427: \end{equation}
428: The total proposal matrix is then
429: \begin{eqnarray}
430: w(C_{\ell}^{(n+1)}, \Bs_{n+1} | C_{\ell}^{(n)}, \Bs_{n}) & = &  w(C_{\ell}^{(n+1)} | C_{\ell}^{(n)}, \Bd)
431: \nonumber \\
432: & &  \delta \left( \Bs_{n+1} - \left(\frac{C_{\ell}^{(n+1)}}{C_{\ell}^{(n)}}\right)^{\frac{1}{2}} \Bs_{n} \right),
433: \nonumber
434: \end{eqnarray}
435: and the ``reverse'' proposal is
436: \begin{eqnarray}
437:   w(C_{\ell}^{(n)}, \Bs_{n} | C_{\ell}^{(n+1)}, \Bs_{n+1}) & = &  w(C_{\ell}^{(n)} | C_{\ell}^{(n+1)}, \Bd)
438:   \nonumber \\
439:   & &  \delta \left( \Bs_{n} - \left(\frac{C_{\ell}^{(n)}}{C_{\ell}^{(n+1)}}\right)^{\frac{1}{2}} \Bs_{n+1} \right).
440:   \nonumber
441: \end{eqnarray}
442: The condition of detailed balance including deterministic moves
443: requires the consideration of some technical points which we leave
444: to Appendix \ref{app:proof}. There
445: we show that the full Metropolis-Hastings accept probability reads
446: \begin{eqnarray}
447: A & = & \min \left[ 1,
448: \frac{e^{-\frac{1}{2} \chi^{2}(\Bs_{n+1}, \Bd)}}{e^{-\frac{1}{2} \chi^{2}(\Bs_{n}, \Bd)}}
449: \frac{w(C_{\ell}^{(n)} | C_{\ell}^{(n+1)}, \Bd)}{w(C_{\ell}^{(n+1)} | C_{\ell}^{(n)}, \Bd)}
450: \right]
451: \end{eqnarray}
452: The significance of the above is that we can make relatively large
453: changes to the power spectrum in the low signal-to-noise regime, where
454: $\BN^{-1}$ is getting small, since the $\chi^2$ is affected only very
455: mildly by changes in any low signal-to-noise mode.
456: 
457: We note the interesting point (discussed more completely in Appendix
458: \ref{app:cov}) that if one changes variables in the
459: joint posterior from CMB maps, $\Bs$, to whitened maps, $\Bx =
460: \BC_{\ell}^{-\frac{1}{2}} \Bs$, and then Gibbs sample in the new
461: variables $(C_{\ell}, \Bx)$, the resulting accept probability is
462: numerically identical to the above.  However, we note
463: the distinction here to emphasize the difference between MCMC
464: algorithms implementing deterministic proposals of maps given
465: $C_{\ell}$, and those sampling in a different set of variables, as
466: there could be other deterministic proposal schemes or
467: another change of variables which lead to improvements over the approach
468: presented in this paper.
469: 
470: For the numerical demonstration of the  MCMC algorithm presented
471: in this paper, we use a simple symmetric Gaussian proposal, truncated
472: at $C_{\ell}>0$ (or, for polarization, the region where the resulting
473: CMB covariance matrix is positive definite), for the power spectrum,
474: \begin{equation}
475: w(C_{\ell}^{(n+1)} | C_{\ell}^{(n)}, \Bd) \propto e^{-\frac{1}{2}
476:   \left(\frac{C_{\ell}^{(n+1)}-C_{\ell}^{(n)}}{\tau_{\ell}}\right)^2} I(C_{\ell}^{(n+1)}>0),
477: \end{equation}
478: where $\tau_{\ell}$ is a measure of the typical step size taken
479: between two samples. Note that because this proposal density is
480: symmetric, the ratio of $C_{\ell}$ proposals cancels, and the
481: acceptance probability is entirely determined by the change in
482: $\chi^{2}$.
483: 
484: It should be noted that while the above MCMC step satisfies detailed
485: balance, it is not {\it irreducible}, in the sense that there is not a
486: non-vanishing probability in reaching any state from any other state
487: in a finite number of MCMC steps; the phases are unchanged in each
488: MCMC step. However, alternating these steps with a traditional Gibbs
489: sampling step gives a combined ``two-step'' MCMC algorithm which
490: indeed is irreducible, and therefore provably converges to the joint
491: posterior. Once again, the details are left to the appendix for the
492: interested reader.
493: 
494: \subsection{Optimization of the MCMC sampler}
495: 
496: A general advantage of the Gibbs sampler is the fact that it is free
497: of tunable efficiency parameters. The same is not true for the
498: Metropolis-Hastings MCMC algorithm; for satisfactory sampling
499: performance, it typically has to be tuned quite extensively. In this
500: section, we describe three specific features that help in this task,
501: namely 1) step size tuning, 2) slice sampling and 3) binning.
502: 
503: First, we have to ensure that the step size of our Gaussian proposal
504: density roughly matches the width of the target distribution, in order
505: to maintain both a reasonable acceptance rate and high mobility. We do
506: this by performing an initial test run, producing typically a few
507: hundred $C_{\ell}$ samples, and computing the standard deviation of
508: these samples for each $\ell$. These are then adopted as the proposal
509: widths for the main run, scaled by some number less than unity,
510: typically between 0.05 and 0.5. For the initial test run, we
511: approximate the posterior width by the noise variance alone,
512: \begin{equation}
513: \tau_{\ell} = \sqrt{\frac{2}{2\ell+1}} \frac{N_{\ell}}{b_{\ell}^{2}},
514: \end{equation}
515: because the MCMC sampler is used only in the low signal-to-noise
516: regime. In this expression $N_{\ell}$ is the power spectrum of the
517: instrumental noise alone, and $b_{\ell}$ is the product of the 
518: Legendre transform of the beam and the HEALPix window function.
519: 
520: Next, Metropolis-Hastings MCMC is inefficient in spaces with too many
521: free parameters. For this reason, we divide the power spectrum
522: coefficients, $C_{\ell}$, into subsets, each containing typically only
523: 10--20 multipoles. Then we propose changes to one subset at a time,
524: while keeping all other multipoles fixed. Finally, we loop over
525: subsets, and thus effectively implement a multipole slice Gibbs
526: sampler for the full power spectrum.
527: 
528: This is computationally feasible, because a single MCMC proposal only
529: requires a single $\chi^{2}$ evaluation, which has a computational cost
530: of a single spherical harmonic transform. Since drawing a full sky map
531: from $P(\Bs|C_{\ell}, \Bd)$ in the classical Gibbs sampling step
532: requires $\mathcal{O}(10^{2})$ spherical harmonic transforms, we can
533: indeed afford to perform many MCMC proposals for each Gibbs step,
534: without dominating the total cost.
535: 
536: \begin{figure}
537: \mbox{\epsfig{file=trace_plot_phase1a.eps,width=\linewidth,clip=}}
538: \caption{Comparison of $C_{\ell}$ chains produced by standard Gibbs
539:   sampling (black) and by the Gibbs+MCMC hybrid (red) for three
540:   selected multipole bins. The simulation was based on full sky
541:   coverage and uniform noise. See text for full details. }
542: \label{fig:TT_trace_plots}
543: \end{figure}
544: 
545: Nevertheless, for very high-resolution analysis it is often beneficial
546: to bin several $C_{\ell}$'s together, both in order to increase the
547: signal-to-noise of the joint coefficient, and to decrease the number
548: of parameters that needs to be sampled by MCMC. We implement this by
549: defining a new binned spectrum, weighted by $\ell(\ell+1)/2\pi$, as
550: follows,
551: \begin{equation}
552: C_{b} = \frac{1}{N_b}\sum_{\ell \in b} \frac{\ell(\ell+1)}{2\pi} C_{\ell}.
553: \end{equation}
554: Here $b=[\ell_{\textrm{min}}, \ell_{\textrm{max}}]$ denotes the
555: current bin, and $N_{b} = \ell_{\textrm{max}}-\ell_{\textrm{min}}+1$
556: is the number of multipoles within the bin. These new (and fewer)
557: coefficients are then sampled with the above MCMC sampler, after which
558: the original spectrum coefficients are given by
559: \begin{equation}
560: C_{\ell} = \frac{2\pi}{\ell(\ell+1)} C_{b}.
561: \end{equation}
562: 
563: \begin{figure}
564: \mbox{\epsfig{file=correlation_funcs_phase1a.eps,width=\linewidth,clip=}}
565: \caption{Comparison of chain correlation functions for standard Gibbs
566:   sampling (blue) and Gibbs+MCMC (red), computed from the full-sky
567:   uniform noise temperature data set. Note that while the correlation
568:   length goes to infinity with increasing $\ell$ (or equivalently, low
569:   signal-to-noise) for standard Gibbs sampling, it is $\lesssim40$
570:   everywhere for the MCMC hybrid case. }
571: \label{fig:TT_correlation_length}
572: \end{figure}
573: 
574: \section{Testing and Validation}
575: \label{sec:simulations}
576: 
577: We have implemented the new sampling step described above in the
578: previously described Gibbs sampling code called ``Commander''
579: \citep{eriksen:2004,eriksen:2008a}, and in this section we demonstrate
580: its advantages compared to the old sampling algorithm. We consider two
581: different cases, namely high-$\ell$ temperature and low-$\ell$
582: polarization analysis. In the former case, we also analyse two cases,
583: without and with a sky cut. The former allows us to verify the results
584: against an analytically known answer, while the second demonstrates
585: that the sky cut does not degrade the sampling efficiency.
586: 
587: \subsection{Temperature analysis}
588: 
589: The high-$\ell$ temperature simulation is designed to mimic the 5-year
590: WMAP temperature data \citep{hinshaw:2008} with one exception, namely
591: that the noise is assumed spatially uniform, in order to facilitate
592: analytic comparison. Specifically, the CMB realization was drawn from
593: the best-fit $\Lambda$CDM model derived from WMAP alone
594: \citep{komatsu:2008}, including multipoles up to
595: $\ell_{\textrm{max}}=1000$, and then smoothed with the instrumental
596: beam of the WMAP V1 differencing assembly, and pixelized at
597: HEALPix\footnote{http://healpix.jpl.nasa.gov} resolution
598: $N_{\textrm{side}}=512$. Finally, uniform noise of $\sigma_0 =
599: 40\mu\textrm{K}$ RMS was added to each pixel. This corresponds to a
600: signal-to-noise ratio of unity at $\ell \sim 550$, roughly similar to
601: the 5-year WMAP data. We analyse this simulation both with and without
602: the WMAP KQ85 sky cut \citep{gold:2008}.
603: 
604: In both analyses, we adopted the Gaussian proposal density with tuned
605: variances, as described above. We also bin the power spectrum in
606: progressively wider bins, starting at $\ell = 600$, to maintain a
607: reasonable signal-to-noise per sampled power spectrum parameter. Ten
608: bins were sampled jointly per proposal, while all others were kept
609: fixed.
610: 
611: In the full-sky case, we produced a total of 31,800 samples over 60
612: chains, and in the cut sky case a total of 6800 samples. The cost for
613: producing one sample in the latter, and by far most expensive, set was
614: 2.5 CPU hours, for a total of 17\,000 CPU hours. The number of MCMC
615: steps per Gibbs step was one in the former and 20 in the latter. 
616: (Since the signal sampler dominates the cut sky Gibbs chain one can
617: perform more low S/N steps without slowing down the overall code significantly.)
618: In addition to these two main sample sets, we also
619: produced two longer chains with each 3500 samples for the full-sky
620: case, both with and without the new MCMC step turned on, in order to
621: compare the Markov chain correlation lengths before and after
622: including the MCMC sampler.
623: 
624: We first consider the full-sky data set, and in Figure
625: \ref{fig:TT_trace_plots} we show a segment of each of the two longer
626: chains for three selected multipole bins. The top panel shows
627: $\ell=600$, which is the first bin to be sampled by MCMC, the middle
628: panel shows $\ell=732$--$742$, where there is still some signal in the
629: data, and, finally, the bottom panel shows $\ell=855$--$1000$, which is
630: strongly noise dominated. Starting with the top panel, we see that the
631: red curve (Gibbs+MCMC) scatters significantly faster than the black
632: curve (Gibbs only), implying more efficient sampling. This trend
633: becomes even stronger with lower signal-to-noise, until the last case,
634: where the Gibbs-only chain essentially does not move at all, while the
635: MCMC sampler does probe the full range. Note, however, that even the
636: MCMC sampler has a significant correlation length in this range, and
637: this implies that there is still some room for improvement to be made
638: in defining our proposals.
639: 
640: Next, these considerations are quantified in Figure
641: \ref{fig:TT_correlation_length}, where we plot the Markov chain
642: auto-correlation as a function of distance in the chain, for six
643: bins with and without the MCMC sampler. As first reported by
644: \citet{eriksen:2004}, we see that the Gibbs-only correlation length
645: increases dramatically with decreasing signal-to-noise, rendering the
646: algorithm essentially useless in this regime. However, we also see
647: that the new MCMC step effectively resolves this issue, as the
648: correlation length (here defined by having a correlation less than
649: 0.2) now is less than $\sim40$ steps. This is a dramatic improvement,
650: and makes the algorithm useful even in this range. Nevertheless, we
651: once again point out that it is possible to make further improvements
652: by establishing better proposal densities.
653: 
654: \begin{figure}
655: \mbox{\epsfig{file=gr_phase1a.eps,width=\linewidth,clip=}}
656: \caption{Gelman-Rubin statistic for the full-sky, uniform noise
657:   temperature analysis. Note the feature at $\ell=600$, which marks
658:   the transition between standard Gibbs sampling and Gibbs+MCMC.}
659: \label{fig:TT_gr}
660: \end{figure}
661: 
662: In Figure \ref{fig:TT_gr} we consider the convergence properties of
663: the $\sim30$k samples set, by computing the Gelman-Rubin statistic $R$
664: \citep{gelman:1992} as a function of $\ell$. Typically, one recommends
665: that $R$ should be less than, say, 1.2 in order to claim
666: convergence. We see that this holds everywhere for this sample set,
667: and typically it is even less than 1.05. Note also the step at
668: $\ell=600$, showing clearly the beneficial effect of the MCMC
669: sampler. 
670: 
671: 
672: \begin{figure}
673: \mbox{\epsfig{file=post_phase1a.eps,width=\linewidth,clip=}}
674: \caption{High-$\ell$ temperature marginal posteriors computed with
675:   Gibbs+MCMC from the full-sky, uniform noise temperature data set,
676:   compared to analytic results.}
677: \label{fig:TT_posteriors}
678: \end{figure}
679: 
680: Next, in Figure \ref{fig:TT_posteriors} we compare the marginal
681: distributions derived from this sample set with the analytic result,
682: \begin{equation}
683: P(C_{\ell}|\mathbf{d}) \propto \prod_{\ell \in b}
684: \frac{e^{-\frac{2\ell+1}{2}
685:     \frac{\sigma_{\ell}^{\textrm{S+N}}}{b_{\ell}^2
686:         C_{\ell}+N_{\ell}}}}
687:  {(b_{\ell}^2 C_{\ell}+N_{\ell})^{\frac{2\ell+1}{2}}}.
688: \end{equation}
689: Here $b=[\ell_{\textrm{min}}, \ell_{\textrm{max}}]$ indicates a given
690: multipole bin, $b_{\ell}$ denotes the product of the instrumental beam
691: and the HEALPix pixel window, and $\sigma_{\ell}^{\textrm{S+N}}$ is
692: the power spectrum of the noisy data map. We see that the new
693: algorithm reproduces the analytic distributions very well, and this
694: verifies the overall method.
695: 
696: \begin{figure}
697: \mbox{\epsfig{file=spectrum_phase1b.eps,width=\linewidth,clip=}}
698: \caption{Temperature power spectrum estimated from cut sky temperature
699: data. The panels show the same spectrum, but emphasizing different
700: multipole ranges (full-range; S/N$\sim$1 transition region; and
701: high-$\ell$, low S/N).}
702: \label{fig:TT_spectrum}
703: \end{figure}
704: 
705: Finally, the cut-sky power spectrum with one-sigma confidence regions
706: is shown in three panels in Figure \ref{fig:TT_spectrum}, focusing on
707: different $\ell$-ranges, namely all $\ell$'s, the $S/N \sim 1$
708: transition region, and the low $S/N$ region. This completes the
709: high-$\ell$ temperature analysis validation.
710: 
711: \subsection{Polarization analysis}
712: 
713: We now consider polarization analysis, and construct a new low-$\ell$
714: simulation for this purpose. This simulation does not mimic any
715: planned experiment, but is rather designed to highlight the analysis
716: method itself. Specifically, we drew a new CMB realization from the
717: best-fit WMAP $\Lambda$CDM spectrum that includes a non-zero tensor
718: contribution, including multipoles up to $\ell_{\textrm{max}}=150$,
719: and convolved this with a $3^{\circ}$ FWHM Gaussian beam, and
720: pixelized it at $N_{\textrm{side}} = 64$. Uniform noise of
721: $5\mu\textrm{K}$ RMS was added to the temperature component, and
722: $1\mu\textrm{K}$ RMS to the polarization components. The 5-year WMAP
723: polarization sky mask was imposed on the data.
724: 
725: We allowed for non-zero $C_{\ell}^{TT}$, $C_{\ell}^{TE}$,
726: $C_{\ell}^{EE}$ and $C_{\ell}^{BB}$ spectra, but fixed $C_{\ell}^{TB}
727: = C_{\ell}^{EB} = 0$. These spectra were then individually binned to
728: maintain a reasonable signal-to-noise per bin. (Details on how to
729: introduce individual binning of each power spectrum were recently
730: described by Eriksen and Wehus, 2008.) Again, a tuned Gaussian proposal
731: density was used in the MCMC step. A total of 12\,000 samples were
732: produced over 12 chains, and the CPU time per sample was 55 seconds,
733: for a total of $\sim200$ CPU hours.
734: 
735: \begin{figure}
736: \mbox{\epsfig{file=pol_trace_plots.eps,width=\linewidth,clip=}}
737: \caption{$C_{\ell}$ chains generated by Gibbs+MCMC hybrid for the
738:   cut-sky polarization data set. Only the highest multipole bin for
739:   each spectrum is shown ($\ell = 108$--$150$ for TT, $\ell = 88$--$150$ for
740:   TE, $\ell=101$--$150$ for EE and $\ell=61$--$150$ for BB).}
741: \label{fig:pol_trace_plots}
742: \end{figure}
743: 
744: In Figure \ref{fig:pol_trace_plots} we show one $C_{\ell}$ chain for
745: each of the four sampled spectra, for the last (and therefore most
746: difficult) bin in each case. Note that the $C_{\ell}^{EE}$ and
747: $C_{\ell}^{BB}$ spectra have essentially vanishing signal-to-noise,
748: and therefore these chains reach zero values. Clearly, we see that
749: the mixing properties of these chains are satisfactory, and the
750: correlation lengths are quite short.
751: 
752: \begin{figure}
753: \mbox{\epsfig{file=gr_polarization.eps,width=\linewidth,clip=}}
754: \caption{Gelman-Rubin statistic for cut-sky polarization analysis.}
755: \label{fig:gr_polarization}
756: \end{figure}
757: 
758: In Figure \ref{fig:gr_polarization} we show the Gelman-Rubin
759: statistics for each of the four power spectra, and with the single
760: exception of the very last bin of $C_{\ell}^{EE}$, all $R$ values are
761: well below 1.1. Thus, all spectra have converged well everywhere.
762: 
763: \begin{figure}
764: \mbox{\epsfig{file=pol_spectra.eps,width=\linewidth,clip=}}
765: \caption{Marginal $C_{\ell}$ power spectra (red curves) estimated from
766:   cut sky polarization data. Gray bands indicate 68\% confidence
767:   regions, and the black lines show the input spectrum. (Note that the
768:   marginal spectra shown here are not individually unbiased estimators
769:   because of the correlations between TT, TE and EE. Proper treatment
770:   of the full joint polarization density will be considered separately
771:   in a future publication.)}
772: \label{fig:pol_spectrum}
773: \end{figure}
774: 
775: Finally, in Figure \ref{fig:pol_spectrum} we show the reconstructed
776: marginal power spectra for each polarization component, overplotted on
777: the input spectrum. The agreement is very good. Note, however, that
778: these spectra are direct marginals, and not a joint maximum likelihood
779: estimate. They are therefore not individually unbiased estimators. In
780: particular, the marginal $C_{\ell}^{EE}$ power spectrum is biased
781: slightly high because of the combination of the
782: $C_{\ell}^{TT}C_{\ell}^{EE} - (C_{\ell}^{TE})^2 > 0$ positivity
783: constraint and relatively low signal-to-noise. Consideration of the
784: joint polarization posterior, which \emph{is} an unbiased estimator,
785: is postponed to a future publication.
786: 
787: 
788: %\section{WMAP Simulations}
789: %\begin{itemize}
790: %\item Temperature only - we will wait for 5 year to do polarization.
791: %\item  Re-do the WMAP sims. from the Eriksen et al 2004 method
792: %paper, but with appropriate S/N of the 3 and 5 year data.
793: 
794: %\end{itemize}
795: 
796: 
797: 
798: 
799: \section{Conclusions}
800: \label{sec:conclusions}
801: 
802: We have presented a new MCMC algorithm for the high-$\ell$, low
803: signal-to-noise limit of the joint posterior which
804: solves the slow probabilistic convergence of the traditional
805: Gibbs sampler in this regime.  This in principle allows sampling over the
806: joint posterior $p(C_{l}, \Bs | \Bd)$ over the entire range
807: of angular scales probed by current and future CMB experiments.
808: The limiting computational burden is now entirely in the map-making
809: step of Gibbs sampling, for which the cost per Gibbs iteration
810: now scales with the expense of multiplication by the inverse
811: noise matrix $\BN^{-1}$.  Assuming pixel uncorrelated (but scan weighted)
812: noise as a good approximation at small angular scales, the cost of
813: an $\BN^{-1}$ multiplication is that of a forward and inverse spherical
814: harmonic transform, or ${\cal O}(\ell_{\textrm{max}}^{3})$.  Future work will attempt to push
815: the generalized Gibbs + MCMC sampling scheme presented here to smaller
816: angular scales, ultimately limited by the degree to which we can compute
817: harmonic transforms.
818: 
819: 
820: 
821: 
822: \begin{acknowledgements}
823:   We acknowledge use of the
824:   HEALPix\footnote{http://healpix.jpl.nasa.gov} software
825:   \citep{gorski:2005} and analysis package for deriving the results in
826:   this paper. HKE acknowledges financial support from the Research
827:   Council of Norway.
828: \end{acknowledgements}
829: 
830: 
831: 
832: \appendix
833: 
834: \section{Including Deterministic Proposals in MCMC}
835: \label{app:proof}
836: Here we review the derivation of the accept probability in Markov Chain Monte Carlo
837: when using deterministic proposals (or proposals where some of the degrees of freedom
838: are specified as deterministic functions of the past state and/or proposed
839: variations in some other degrees of freedom).  We first briefly review
840: the Metropolis-Hastings Markov Chain Monte Carlo algorithm and the proof of
841: its convergence, and then turn to the special case involving deterministic
842: proposals.  Much of the review of the MCMC algorithm here follows \citet{Sokal:1989}.
843: We also note that similar technical considerations including deterministic
844: elements in proposals are presented in \citet{Green:1995} in the context of
845: MCMC algorithms in which the dimension of the state space itself is included
846: as a random variable to be sampled over.
847: 
848: The goal is the construction of a transition matrix
849: $T(C_{l}, \Bs | C_{l}', \Bs', \Bd)$ such that after initializing
850: the Markov Chain with a sample from any probability density $p_{0}(C_{l}, \Bs | \Bd)$,
851: we generate samples from a sequence of probability densities
852: \begin{equation}
853: p_{n+1}(C_{l}, \Bs | \Bd)  \equiv   \int d(C_{l}', \Bs') \ 
854: T(C_{l}, \Bs | C_{l}', \Bs', \Bd) \ p_{n}(C_{l}', \Bs' | \Bd)
855: \end{equation}
856: which eventually converge to an {\it equilibrium density} $\pi(C_{l}, \Bs | \Bd)$
857: \begin{equation}
858: \pi (C_{l}, \Bs | \Bd) = \lim_{n \rightarrow \infty}  p_{n}(C_{l}, \Bs | \Bd)
859: \end{equation}
860: We remind the reader
861: of the sufficient conditions to establish convergence of an MCMC algorithm:
862: {\it stationarity}, which means that the MCMC transition matrix satisfies
863: \begin{equation}
864: \pi(C_{l}, \Bs | \Bd) = \int d(C_{l}', \Bs') \ T(C_{l}, \Bs | C_{l}', \Bs', \Bd) \ \pi(C_{l}', \Bs' | \Bd)
865: \end{equation}
866: and {\it irreducibility}, which means that for any two states, there is a finite
867: number of iterations which give a non-vanishing probability to transition from one
868: state to the other.  It is well known that these two properties are sufficient
869: to establish convergence, as can be seen simply from the triangle inequality
870: \begin{eqnarray}
871: \int d(C_{l}, \Bs) \ \left| \pi(C_{l}, \Bs | \Bd) - p_{n}(C_{l}, \Bs | \Bd) \right|
872: & = & \int d(C_{l}, \Bs) \ \left| \int d(C_{l}', \Bs') \ T(C_{l}, \Bs | C_{l}', \Bs') 
873: \left( \pi(C_{l}', \Bs' | \Bd) - p_{n-1}(C_{l}', \Bs' | \Bd) \right) \right| \nonumber \\
874: & \le & \int d(C_{l}, \Bs) \  \int d(C_{l}', \Bs') \ T(C_{l}, \Bs | C_{l}', \Bs') 
875: \left| \pi(C_{l}', \Bs' | \Bd) - p_{n-1}(C_{l}', \Bs' | \Bd)  \right| \nonumber \\
876: & = & \int d(C_{l}', \Bs') \ \left( \int d(C_{l}, \Bs) \  T(C_{l}, \Bs | C_{l}', \Bs') \right) 
877: \left| \pi(C_{l}', \Bs' | \Bd) - p_{n-1}(C_{l}', \Bs' | \Bd)  \right| \nonumber \\
878: & = & \int d(C_{l}', \Bs') \ 
879: \left| \pi(C_{l}', \Bs' | \Bd) - p_{n-1}(C_{l}', \Bs' | \Bd)  \right| \nonumber
880: \end{eqnarray}
881: 
882: The Metropolis-Hastings Markov Chain Monte Carlo algorithm is one
883: method of constructing such a transition matrix.  We choose {\it any}
884: proposal matrix $w(C_{l}, \Bs | C_{l}', \Bs', \Bd)$ and then accept
885: the proposed move with a probability
886: \begin{equation}
887: 0 \le A(C_{l}, \Bs | C_{l}', \Bs', \Bd) \le 1
888: \end{equation}
889: while rejecting the proposed move with probability $1 - A$ leads
890: to a ``null transition'' where the next state in the Markov Chain remains the same.
891: Application of this algorithm then leads to the sequence of probability densities
892: which satisfy
893: \begin{eqnarray}
894: p_{n+1}(C_{l}, \Bs | \Bd) & = & 
895: \left( 1 - \int d(C_{l}', \Bs') \ A(C_{l}', \Bs' | C_{l}
896: , \Bs, \Bd) w(C_{l}', \Bs' | C_{l}, \Bs, \Bd) \right)
897: p_{n}(C_{l}, \Bs | \Bd) \nonumber \\
898: & & +
899: \int d(C_{l}', \Bs') \ A(C_{l}, \Bs | C_{l}', \Bs', \Bd) w(C_{l}, \Bs | C_{l}', \Bs', \Bd)
900: p_{n}(C_{l}', \Bs' | \Bd) 
901: \end{eqnarray}
902: where the first term is the contribution to the probability density
903: $p_{n+1}$ if we reject any proposed move, while the second term
904: is the contribution from accepting the proposed move from any possible
905: previous state.  If we demand that, for a chosen proposal matrix, the accept probability satisfies
906: \begin{equation}
907: \pi(C_{l}', \Bs' | \Bd) w(C_{l}, \Bs | C_{l}', \Bs', \Bd) A(C_{l}, \Bs | C_{l}', \Bs', \Bd)
908: = A(C_{l}', \Bs' | C_{l}, \Bs, \Bd) w(C_{l}', \Bs' | C_{l}, \Bs, \Bd) \pi(C_{l}, \Bs | \Bd)
909: \end{equation}
910: then we see that the MH MCMC algorithm satisfies stationarity, i.e. denoting
911: by $T \circ \pi$ the density resulting from one application of the transition matrix
912: to $\pi$, we have directly from detailed balance
913: \begin{eqnarray}
914: T \circ \pi 
915: & = & \left( 1 - \int d(C_{l}', \Bs') \ A(C_{l}', \Bs' | C_{l}
916: , \Bs, \Bd) w(C_{l}', \Bs' | C_{l}, \Bs, \Bd) \right)
917: \pi(C_{l}, \Bs | \Bd) \nonumber \\ 
918: & & +
919: \int d(C_{l}', \Bs') \ A(C_{l}, \Bs | C_{l}', \Bs', \Bd) w(C_{l}, \Bs | C_{l}', \Bs', \Bd)
920: \pi(C_{l}', \Bs' | \Bd) \nonumber \\ 
921: & = & \left( 1 - \int d(C_{l}', \Bs') \ A(C_{l}', \Bs' | C_{l}
922: , \Bs, \Bd) w(C_{l}', \Bs' | C_{l}, \Bs, \Bd) \right)
923: \pi(C_{l}, \Bs | \Bd) \nonumber \\ 
924: & & + \pi (C_{l}, \Bs | \Bd)
925: \int d(C_{l}', \Bs') \ A(C_{l}', \Bs' | C_{l}, \Bs, \Bd) w(C_{l}', \Bs' | C_{l}, \Bs, \Bd)
926:  \nonumber \\ 
927: & = & \pi(C_{l}, \Bs | \Bd)
928: \end{eqnarray}
929: 
930: We now turn to the case where our proposal is of the form
931: \begin{equation}
932: w(\Bs', C_{l}' | \Bs, C_{l}) = \delta \left[ \Bs' - F(\Bs, C_{l}', C_{l}) \right] w(C_{l}' | C_{l}, \Bd)
933: \end{equation}
934: where we randomly propose a new power spectrum, possibly in a manner conditionally
935: dependent on the current spectrum and the data, and then deterministically
936: compute a new CMB map with some function
937: \begin{equation}
938: \Bs' = F(\Bs, C_{l}', C_{l})
939: \end{equation}
940: To satisfy detailed balance with a non-vanishing accept probability
941: our function must satisfy
942: \begin{eqnarray}
943: \Bs' & = & F(\Bs, C_{l}', C_{l}) \nonumber \\
944: \Bs & = & F(\Bs', C_{l}, C_{l}')
945: \end{eqnarray}
946: or, that the inverse function is equivalent to interchanging the order
947: of the power spectrum arguments
948: \begin{equation}
949:  F(\Bs', C_{l}, C_{l}') = F^{-1}(\Bs',C_{l}', C_{l})
950: \end{equation}
951: In this paper, we have chosen one such function, given by
952: \begin{equation}
953: F(\Bs, C_{l}', C_{l}) = [\BC']^{1/2} [\BC]^{-1/2} \Bs
954: \end{equation}
955: where interchanging the spectra in the function above does in fact give
956: the inverse function itself.
957: 
958: Our job now is to {\it derive} the accept probability such that we
959: satisfy stationarity (as discussed above).  For the proposal with deterministic
960: changes to some of the degrees of freedom, stationarity is satisfied if
961: \begin{eqnarray}
962: (T \circ \pi)(C_{l}, \Bs | \Bd)
963: & = & \left[ 1 - \int d(C_{l}'', \Bs'') \ A[C_{l}'', \Bs'' | C_{l}, \Bs]
964: \delta[\Bs'' - F(\Bs, C_{l}'', C_{l})] w(C_{l}'' | C_{l}, \Bd) \right] \pi(C_{l}, \Bs | \Bd) \nonumber \\
965: & & + \int d(C_{l}', \Bs') \ A[\Bs, C_{l} | \Bs', C_{l}'] \delta[\Bs - F(\Bs', C_{l}, C_{l}')]
966: w(C_{l} | C_{l}' , \Bd) \pi(C_{l}', \Bs' | \Bd) \nonumber 
967: \end{eqnarray}
968: In order to determine the integral over the $\delta$-function in the
969: accept term above, we recall the identity for $\delta[G(\Bx)]$, where $G(\Ba) = 0$,
970: \begin{equation}
971: \delta[G(\Bx)] = \frac{\delta(\Bx - \Ba)}{\left| \partial G / \partial \Bx \right|_{\Bx = \Ba} }
972: \end{equation}
973: In our case, we can identify
974: \begin{equation}
975: G(\Bs') = \Bs - F(\Bs', C_{l}, C_{l}')
976: \end{equation}
977: which vanishes at $\Bs' = F^{-1}(\Bs,C_{l}, C_{l}') = F(\Bs, C_{l}', C_{l})$.  We also have the Jacobian
978: \begin{equation}
979: \left| \frac{\partial G}{ \partial \Bs'} \right|_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} 
980: = \left| \frac{\partial F}{ \partial \Bs'} \right|_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} 
981: \end{equation}
982: (i.e. $G(\Bs')$ is considered a function of $\Bs'$ with the other CMB map
983: $\Bs$ considered fixed) which therefore gives
984: \begin{equation}
985: \delta[\Bs - F(\Bs', C_{l}, C_{l}')] = 
986:  \delta[\Bs' - F^{-1}(\Bs, C_{l}, C_{l}')]
987: \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} 
988: \end{equation}
989: Inserting this into the condition for stationarity we have
990: \begin{eqnarray}
991: (T \circ \pi )(C_{l}, \Bs | \Bd)
992: & = & \left[ 1 - \int d(C_{l}'', \Bs'') \ A[C_{l}'', \Bs'' | C_{l}, \Bs]
993: \delta[\Bs'' - F(\Bs, C_{l}'', C_{l})] w(C_{l}'' | C_{l}, \Bd) \right] \pi(C_{l}, \Bs | \Bd) \nonumber \\
994: & & + \int d(C_{l}', \Bs') \ A[\Bs, C_{l} | \Bs', C_{l}'] 
995: \left( \delta[\Bs' - F^{-1}(\Bs, C_{l}, C_{l}')]
996: \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)
997: w(C_{l} | C_{l}' , \Bd) \pi(C_{l}', \Bs' | \Bd) \nonumber \\
998: & = & \left[ 1 - \int d(C_{l}'', \Bs'') \ A[C_{l}'', \Bs'' | C_{l}, \Bs]
999: \delta[\Bs'' - F(\Bs, C_{l}'', C_{l})] w(C_{l}'' | C_{l}, \Bd) \right] \pi(C_{l}, \Bs | \Bd) \nonumber \\
1000: & & + \int d(C_{l}', \Bs') \ A[\Bs, C_{l} | \Bs', C_{l}'] 
1001: \left( \delta[\Bs' - F(\Bs, C_{l}', C_{l})]
1002: \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)
1003: w(C_{l} | C_{l}' , \Bd) \pi(C_{l}', \Bs' | \Bd) \nonumber 
1004: \end{eqnarray}
1005: where in the second line we again used the property that the inverse $F^{-1}$ is equivalent
1006: to $F$ with the spectra arguments interchanged.
1007: We see from the above that a sufficient condition for stationarity is
1008: \begin{equation}
1009: \pi(C_{l}, \Bs | \Bd) w(C_{l}' | C_{l} , \Bd) 
1010: A[\Bs', C_{l}' | \Bs, C_{l}] 
1011: = A[\Bs, C_{l} | \Bs', C_{l}'] 
1012: \left(  \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)
1013: w(C_{l} | C_{l}' , \Bd) \pi(C_{l}', \Bs' | \Bd)
1014: \end{equation}
1015: An accept probability which satisfies this condition therefore gives cancellation
1016: of the integrals over the $\delta$-functions for both the reject and accept
1017: contributions, leaving us exactly with $T \circ  \pi = \pi$.
1018: We therefore have the accept probability
1019: \begin{equation}
1020: A[\Bs', C_{l}' | \Bs, C_{l}] = \min \left[ 1,
1021: \frac{\pi(C_{l}', \Bs' | \Bd)}{\pi(C_{l}, \Bs | \Bd)}
1022: \frac{w(C_{l} | C_{l}', \Bd)}{w(C_{l}' | C_{l}, \Bd)} 
1023: \left(  \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)
1024: \right]
1025: \end{equation}
1026: We give the expression above for the general case of any deterministic
1027: change in the CMB map with a function which satisfies $F(\Bs, C_{l}, C_{l}') = F^{-1}(\Bs, C_{l}', C_{l})$.
1028: We now explicitly evaluate this accept probability for the functional form chosen for this
1029: paper.
1030: 
1031: Since we have $F(\Bs', C_{l}, C_{l}') = [\BC]^{1/2} [\BC']^{-1/2} \Bs'$, we have
1032: \begin{equation}
1033: \left(  \left| \frac{\partial F}{ \partial \Bs'} \right|_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)
1034: = \frac{|\BC|^{1/2}}{|\BC'|^{1/2}}
1035: \end{equation}
1036: Reminding the reader of the functional form of the joint posterior in
1037: eqn. \ref{eq:cmb_posterior}, we have the accept probability given by
1038: \begin{eqnarray}
1039: A[\Bs', C_{l}' | \Bs, C_{l}] 
1040: & = &  \min \left[ 1,
1041: \frac{\pi(C_{l}', \Bs' | \Bd)}{\pi(C_{l}, \Bs | \Bd)}
1042: \frac{w(C_{l} | C_{l}', \Bd)}{w(C_{l}' | C_{l}, \Bd)} 
1043: \left(  \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)
1044: \right] \nonumber \\
1045: & = &  \min \left[ 1,
1046: \frac{e^{- \chi^{2}(\Bs', \Bd)}}{e^{- \chi^{2}(\Bs, \Bd)}}
1047: \frac{e^{- \Bs' [\BC']^{-1} \Bs'}}{e^{- \Bs \BC^{-1} \Bs}}
1048: \frac{|\BC|^{1/2}}{|\BC'|^{1/2}}
1049: \frac{w(C_{l} | C_{l}', \Bd)}{w(C_{l}' | C_{l}, \Bd)} 
1050: \left(  \left| \frac{\partial F}{ \partial \Bs'} \right|^{-1}_{\Bs' = F^{-1}(\Bs, C_{l}, C_{l}')} \right)
1051: \right] \nonumber \\ 
1052: & = &  \min \left[ 1,
1053: \frac{e^{- \chi^{2}(\Bs', \Bd)}}{e^{- \chi^{2}(\Bs, \Bd)}}
1054: \frac{e^{- \Bs' [\BC']^{-1} \Bs'}}{e^{- \Bs \BC^{-1} \Bs}}
1055: \frac{|\BC|^{1/2}}{|\BC'|^{1/2}}
1056: \frac{w(C_{l} | C_{l}', \Bd)}{w(C_{l}' | C_{l}, \Bd)} 
1057: \left(  \frac{|\BC'|^{1/2}}{|\BC|^{1/2}} \right) 
1058: \right] \nonumber \\ 
1059: & = &  \min \left[ 1,
1060: \frac{e^{- \chi^{2}(\Bs', \Bd)}}{e^{- \chi^{2}(\Bs, \Bd)}}
1061: \frac{w(C_{l} | C_{l}', \Bd)}{w(C_{l}' | C_{l}, \Bd)} 
1062: \right] 
1063: \label{eq:accept_prob}
1064: \end{eqnarray}
1065: where the last line follows from the invariance of the quadratic form
1066: under the functional mapping
1067: $\Bs' [\BC']^{-1} \Bs'  =  \Bs \BC^{-1} \Bs$.  Finally, we note that for the special
1068: case of a symmetric proposal matrix where $w(C_{l}' | C_{l}, \Bd) = w(C_{l} | C_{l}', \Bd)$, the
1069: accept probability is completely determined by the (exponentiated) change in $\chi^{2}$
1070: \begin{equation}
1071: A[\Bs', C_{l}' | \Bs, C_{l}] =
1072:   \min \left[ 1,
1073: \frac{e^{- \chi^{2}(\Bs', \Bd)}}{e^{- \chi^{2}(\Bs, \Bd)}} \right] 
1074: \end{equation}
1075: As emphasized earlier in the main part of the text, the above allows large
1076: changes to the spectrum precisely where the signal to noise is getting small,
1077: as $\chi^{2}$ does not change much in this regime.
1078: 
1079: 
1080: 
1081: \section{Relation to Gibbs Sampling in a Change of Variables}
1082: \label{app:cov}
1083: We note here another interesting approach to an MCMC algorithm in a
1084: {\it different set of variables} which in fact allows for large
1085: moves in the spectrum in the low signal to noise regime.  We define the
1086: CMB map
1087: \begin{equation}
1088: \Bx = \BC^{-1/2} \Bs
1089: \end{equation}
1090: We therefore have the joint posterior {\it in the new variables} according to
1091: \begin{equation}
1092: p(C_{l}, \Bs | \Bd) d(C_{l}, \Bs) = 
1093: p(C_{l}, \Bx | \Bd) \left| \frac{\partial \Bs}{\partial \Bx} \right| d(C_{l}, \Bx)
1094: \end{equation}
1095: which is explicitly, up to a normalization constant
1096: \begin{equation}
1097: -2 \log p(C_{l}, \Bx | \Bd) = (\Bd - \BC^{1/2} \Bx) \BN^{-1}
1098: (\Bd - \BC^{1/2} \Bx) + \| \Bx \|^{2}
1099: \end{equation}
1100: Then {\it traditional Gibbs sampling in the new variables} leads to an accept
1101: probability when changing the spectrum given the change of variable map $\Bx$ as
1102: \begin{equation}
1103: A(C_{l}', \Bx | C_{l}, \Bx) =
1104: \min \left[ 1,
1105: \frac{e^{-(\Bd - [\BC']^{1/2} \Bx) \BN^{-1}(\Bd - [\BC']^{1/2} \Bx)}}
1106: {e^{-(\Bd - \BC^{1/2} \Bx) \BN^{-1}(\Bd - \BC^{1/2} \Bx)}}
1107: \frac{w(C_{l} | \Bx, C_{l}', \Bd)}{w(C_{l}' | \Bx, C_{l}, \Bd)} \right]
1108: \end{equation}
1109: where in the above the proposed variation in the spectrum can now be
1110: conditionally dependent on the current change of variable map $\Bx$.
1111: Assuming a symmetric proposal, or one conditionally independent of $\Bx$
1112: leads to an accept probability which is {\it numerically the same as
1113: eqn.~\ref{eq:accept_prob}}, and also has the same property - large moves in the
1114: spectrum are possible in the low signal to noise regime.
1115: As a side note, we can see that $\log p(C_{l} | \Bx, \Bd)$
1116: is quadratic in $C_{l}^{1/2}$, and suggests a proposal
1117: given by a Gaussian in $C_{l}^{1/2}$.  However there are two problems with this
1118: scheme - sampling in $C_{l}^{1/2}$ will result in re-introducing a Jacobian
1119: factor given by the ratio of $|C'|^{1/2} / |C|^{1/2}$ which results typically
1120: in low acceptance probabilities, and furthermore we cannot afford to exactly
1121: compute the local ``Fisher'' covariance matrix for each $\Bx$.
1122: Because of these difficulties, we in general need to produce a proposal
1123: for $C_{l}$ and then compute the accept probability above.
1124: 
1125: We emphasize an important distinction between MCMC with deterministic
1126: steps {\it in the original variables} $(C_{l}, \Bs)$ and Gibbs sampling
1127: in the change of variables $(C_{l}, \Bx)$.  It is only for the specific
1128: functional form that we have chosen for this paper that the numerical value of the
1129: accept probabilities for $A(C_{l}' , \Bs' | C_{l}, \Bs)$ and $A(C_{l}', \Bx | C_{l}, \Bx)$
1130: are the same.
1131: 
1132: At first glance, it might appear
1133: that a random variation in some of the variables followed by a deterministic
1134: change in the complementary set is always equivalent to random variation
1135: in a new set of variables.  For notational convenience, we will assume the state
1136: space is separated into two sets of variables $(\Bx,\By)$, i.e. for the CMB sampling
1137: context we have $(\Bs, C_{l})$.  Now, to make the distinction between a change
1138: of variables and deterministic steps in MCMC more precise, consider a ``global'' change of variables
1139: of the form
1140: \begin{eqnarray}
1141: \Bu & = & F(\Bx,\By) \nonumber \\
1142: \Bv & = & \By
1143: \end{eqnarray}
1144: with Jacobian
1145: \begin{equation}
1146: \left| \begin{array}{cc}
1147: \frac{\partial \Bu}{\partial \Bx} & \frac{\partial \Bv}{\partial \Bx} \\
1148: \frac{\partial \Bu}{\partial \By} & \frac{\partial \Bv}{\partial \By} \end{array}
1149: \right| =
1150: \left| \begin{array}{cc}
1151: \frac{\partial F}{\partial \Bx} & 0 \\
1152: \frac{\partial F}{\partial \By} & \id \end{array}
1153: \right| = \left| \frac{\partial F}{\partial \Bx} \right|
1154: \end{equation}
1155: A Gibbs sampling step varying $\Bv$ with $\Bu$ fixed, has accept probability
1156: \begin{eqnarray}
1157: A(\By_{n+1}, \Bu_{n} | \By_{n}, \Bu_{n}) 
1158: & = & \min \left[ 1,
1159: \frac{\pi(\By_{n+1} | \Bu_{n}, \Bd)}{\pi(\By_{n} | \Bu_{n}, \Bd)}
1160: \frac{w(\By_{n} | \Bu_{n}, \Bd)}{w(\By_{n+1} | \Bu_{n}, \Bd)} \right] \nonumber \\
1161: & = & \min \left[ 1,
1162: \frac{\pi(\By_{n+1}, \Bx_{n+1} | \Bd)}{\pi(\By_{n} , \Bx_{n} | \Bd)}
1163: \left( \left| \frac{\partial F}{\partial \Bx} \right|_{\Bx_{n+1}, \By_{n+1}}
1164: \left| \frac{\partial F}{\partial \Bx} \right|^{-1}_{\Bx_{n}, \By_{n}} \right)
1165: \frac{w(\By_{n} | \Bu_{n}, \Bd)}{w(\By_{n+1} | \Bu_{n}, \Bd)} \right]
1166: \end{eqnarray}
1167: where in the above we have the constraint
1168: \begin{eqnarray}
1169: \Bx_{n+1} & = & F^{-1}(\Bu_{n}, \By_{n+1}) \nonumber \\
1170: \Bx_{n} & = & F^{-1}(\Bu_{n}, \By_{n}) 
1171: \end{eqnarray}
1172: Now consider an MCMC step in the original variables of the form
1173: \begin{equation} 
1174: w(\Bx_{n+1}, \By_{n+1} | \Bx_{n}, \By_{n }) = w(\By_{n+1} | \By_{n}, \Bx_{n}, \Bd)
1175: \delta \left( \Bx_{n+1} - H(\Bx_{n}, \By_{n+1}, \By_{n}) \right)
1176: \end{equation}
1177: with general accept probability, according to the discussion above
1178: \begin{eqnarray}
1179: A(\By_{n+1}, \Bx_{n+1} | \By_{n}, \Bx_{n}) & = & \min \left[ 1,
1180: \frac{\pi(\By_{n+1}, \Bx_{n+1} | \Bd)}{\pi(\By_{n}, \Bx_{n} | \Bd)}
1181: \frac{w(\By_{n} | \Bx_{n+1}, \By_{n+1}, \Bd)}{w(\By_{n+1} | \Bx_{n}, \By_{n}, \Bd)}
1182: \left( \left| \frac{\partial H}{\partial \Bx} \right|^{-1}_{\Bx_{n+1} = H^{-1}(\Bx_{n},\By_{n}, \By_{n+1})} \right)
1183: \right]
1184: \end{eqnarray}
1185: Interestingly enough this suggests that we can set $H$ to be the function
1186: \begin{equation}
1187: H(\Bx, \By_{n+1}, \By_{n}) = F^{-1} \left( F(\Bx, \By_{n}), \By_{n+1} \right)
1188: \end{equation}
1189: Does this function have the correct properties for its inverse?  Assuming we
1190: have computed in the forward direction $\Bx' = H(\Bx, \By_{n+1}, \By_{n})$, we can invert to find $\Bx$ by
1191: computing sequentially
1192: \begin{eqnarray}
1193: F(\Bx', \By_{n+1}) & = & F(\Bx, \By_{n}) \nonumber \\
1194: \Bx & = & F^{-1} \left( F(\Bx', \By_{n+1}), \By_{n} \right) \nonumber \\
1195: & \equiv & H(\Bx', \By_{n}, \By_{n+1})
1196: \end{eqnarray}
1197: where the last line follows from definition of the forward $H$.
1198: Since we have, by definition
1199: \begin{eqnarray}
1200: \Bx' & = & H(\Bx, \By_{n+1}, \By_{n}) \nonumber \\
1201: \Bx & \equiv & H^{-1}(\Bx', \By_{n+1}, \By_{n}) \nonumber 
1202: \end{eqnarray}
1203: we therefore have shown that
1204: \begin{equation}
1205: H^{-1}(\Bx', \By_{n+1}, \By_{n}) = H(\Bx', \By_{n}, \By_{n+1})
1206: \end{equation}
1207: as required for a non-vanishing accept probability.
1208: The above as a function of $\Bx$ has Jacobian
1209: \begin{eqnarray}
1210: \left| \frac{\partial H}{\partial \Bx} \right| 
1211: & = & \left| \frac{\partial F^{-1}}{\partial \Bu}  \right|_{( \Bu(\Bx, \By_{n}), \By_{n+1})}
1212: \  \left| \frac{\partial F}{\partial \Bx}  \right|_{(\Bx,\By_{n})} \nonumber \\
1213: & = & \left| \frac{\partial F}{\partial \Bx}  \right|^{-1}_{( \Bx, \By_{n+1})}
1214: \  \left| \frac{\partial F}{\partial \Bx}  \right|_{(\Bx,\By_{n})}
1215: \end{eqnarray}
1216: However, when evaluated at $\Bx_{n+1} = H^{-1}(\Bx_{n}, \By_{n}, \By_{n+1})$, we will not
1217: in general satisfy the equality required for numerical equivalence
1218: \begin{equation}
1219: \left( \left| \frac{\partial H}{\partial \Bx} \right|^{-1}_{\Bx_{n+1} = H^{-1}(\Bx_{n},\By_{n}, \By_{n+1})} \right)
1220: \neq  \left( \left| \frac{\partial F}{\partial \Bx} \right|_{\Bx_{n+1}, \By_{n+1}}
1221: \left| \frac{\partial F}{\partial \Bx} \right|^{-1}_{\Bx_{n}, \By_{n}} \right)
1222: \label{eq:jacobian_identity}
1223: \end{equation}
1224: So in general, while we can use any function $F(\Bx,\By)$ to generate deterministic
1225: moves in the original variables within MCMC, this is not equivalent to
1226: a Gibbs sampling step $p(\By_{n+1} | \Bu_{n}, \Bd)$ in the new variables using $(F(\Bx,\By), \By)$ as a global change of variables.
1227: 
1228: However, using the above construction for
1229: the CMB change of variables, we have explicitly
1230: \begin{eqnarray}
1231: F^{-1} \left( F(\Bs, C_{l}), C_{l}' \right) & = & 
1232: [\BC']^{1/2} \left( \BC^{-1/2} \Bs \right)
1233: \end{eqnarray}
1234: which is exactly the functional form used for the deterministic MCMC steps.  In this
1235: case, it is because the Jacobian of our deterministic change in the
1236: CMB map is independent of the current CMB map $\Bs$ (and only dependent
1237: on the proposed and current spectra) that we have numerical
1238: equivalence of the accept probabilities.
1239: 
1240: So in summary, while we can use any mapping $F(\Bx,\By)$ to generate deterministic
1241: steps for use in MCMC, the accept probability is not equivalent to a conditional
1242: step $p(\By | \Bu, \Bd)$ using $F(\Bx,\By)$ in a change of variables due to the general
1243: ``location'' dependence of the Jacobian.  Furthermore, setting
1244: $H(\Bx, \By', \By) = F^{-1} ( F(\Bx,\By), \By')$ is not the most general form
1245: for a function that satisfies the detailed balance requirement
1246: $H^{-1}(\Bx, \By', \By) = H(\Bx, \By, \By')$.
1247: In this sense then, a change of variables as an approach to more efficiently generating
1248: samples from a probability density is distinct from a strategy of designing an MCMC
1249: algorithm (in any chosen representation of the variables) with deterministic
1250: changes of some of the degrees of freedom.  Both approaches are interesting, and
1251: advances in either approach for Bayesian CMB analysis could lead to improvements over
1252: the approach presented in this paper.
1253: 
1254: 
1255: 
1256: 
1257: 
1258: \begin{thebibliography}{}
1259: 
1260: \bibitem[Abramowitz \& Stegun(1972)]{abramowitz:1972} Abramowitz, M.,
1261:   \& Stegun, I.~A.\ 1972, Handbook of Mathematical Functions (New
1262:   York: Dover)
1263: 
1264: \bibitem[Bennett et al.(2003)]{bennett:2003} Bennett, C. L., et al.\
1265: 2003a, \apjs, 148, 1
1266: 
1267: \bibitem[Chu et al.(2005)]{chu:2005} Chu, M., Eriksen, H.~K., Knox,
1268:   L., G{\'o}rski, K.~M., Jewell, J.~B., Larson, D.~L., O'Dwyer, I.~J.,
1269:   \& Wandelt, B.~D.\ 2005, \prd, 71, 103002
1270: 
1271: % First method paper
1272: \bibitem[Eriksen et al.(2004)]{eriksen:2004} 
1273: Eriksen, H.~K., et al.\ 2004, \apjs, 155, 227
1274: 
1275: % WMAP3 reanalysis paper
1276: \bibitem[Eriksen et al.(2007a)]{eriksen:2007a} Eriksen, H.~K., et al.\ 
1277: 2007a, \apj, 656, 641
1278: 
1279: % WMAP3 polarization paper
1280: \bibitem[Eriksen et al.(2007b)]{eriksen:2007b} Eriksen, H.~K., Huey, 
1281: G., Banday, A.~J., G{\'o}rski, K.~M., Jewell, J.~B., O'Dwyer, I.~J., 
1282: \& Wandelt, B.~D.\ 2007b, \apjl, 665, L1 
1283: 
1284: % Foreground method paper
1285: \bibitem[Eriksen et al.(2008a)]{eriksen:2008a} Eriksen, H.~K., Jewell, 
1286: J.~B., Dickinson, C., Banday, A.~J., G{\'o}rski, K.~M., 
1287: \& Lawrence, C.~R.\ 2008a, \apj, 676, 10 
1288: 
1289: % WMAP3 foreground analysis
1290: \bibitem[Eriksen et al.(2008b)]{eriksen:2008b} Eriksen, H.~K., 
1291: Dickinson, C., Jewell, J.~B., Banday, A.~J., G{\'o}rski, K.~M., 
1292: \& Lawrence, C.~R.\ 2008b, \apjl, 672, L87 
1293: 
1294: \bibitem[Eriksen \& Wehus(2008)]{wehus:2008} Eriksen, H.~K. \&
1295:   Wehus, I.~K.\ 2008a, \apjs, submitted, [astro-ph/XXXXXX]
1296: 
1297: \bibitem[Gelman \& Rubin(1992)]{gelman:1992}
1298: Gelman, A., \& Rubin, D. 1992, Stat. Sci., 7, 457
1299: 
1300: \bibitem[Gold et al.(2008)]{gold:2008} Gold, B., et al.\ 2008, 
1301: [arXiv:0803.0715]
1302: 
1303: \bibitem[G{\'o}rski et al.(2005)]{gorski:2005} 
1304:   G{\' o}rski, K.~M., Hivon, E., Banday, A.~J., Wandelt, B.~D.,
1305:   Hansen, F.\,K., Reinecke, M., \& Bartelmann, M. 2005, \apj, 622, 759
1306: 
1307: \bibitem[Green (1995)]{Green:1995}
1308: Green, P.\ 1995, Biometrika, 82, 711--732
1309: 
1310: \bibitem[Gupta \& Nagar(2000)]{gupta:2000}
1311:   Gupta, A.~K. \& Nagar, D.~K. 2000, Matrix Variate Distributions
1312: 
1313: \bibitem[Hinshaw et al.(2007)]{hinshaw:2007} Hinshaw, G., et al.\ 
1314: 2007, \apjs, 170, 288 
1315: 
1316: \bibitem[Hinshaw et al.(2008)]{hinshaw:2008} Hinshaw, G., et al.\ 
1317: 2008, ApJ, submitted, [arXiv:0803.0732]
1318: 
1319: \bibitem[Hivon et al.(2002)]{hivon:2002} Hivon, E., G{\' o}rski,
1320:   K.~M., Netterfield, C.~B., Crill, B.~P., Prunet, S., \& Hansen, F.\
1321:   2002, \apj, 567, 2
1322: 
1323: \bibitem[Jewell et al.(2002)]{jewell:2002} 
1324:   Jewell, J., Levin, S., \& Anderson, C.~H.\ 2002, [astro-ph/0209560]
1325: 
1326: \bibitem[Jewell et al.(2004)]{jewell:2004} 
1327:   Jewell, J., Levin, S., \& Anderson, C.  H.  2004, \apj, 609, 1
1328: 
1329: \bibitem[Komatsu et al.(2008)]{komatsu:2008} Komatsu, E., et al.\ 
1330: 2008, [arXiv:0803.0547]
1331: 
1332: \bibitem[Larson et al.(2007)]{larson:2007} Larson, D.~L., Eriksen, 
1333: H.~K., Wandelt, B.~D., G{\'o}rski, K.~M., Huey, G., Jewell, J.~B., \& 
1334: O'Dwyer, I.~J.\ 2007, \apj, 656, 653 
1335: 
1336: \bibitem[Liu(2001)]{liu:2001} Liu, J.~S.\ 2001, Monte Carlo Strategies in
1337:   Scientific Computing (Cambridge, USA: Springer)
1338: 
1339: \bibitem[O'Dwyer et al.(2004)]{odwyer:2004} O'Dwyer, I.~J., et al.\ 
1340: 2004, \apjl, 617, L99 
1341: 
1342: \bibitem[Page et al.(2007)]{page:2007} Page, L., et al.\ 2007, 
1343: \apjs, 170, 335 
1344: 
1345: \bibitem[Seljak \& Zaldarriaga(1996)]{seljak:1996} Seljak, U., \&
1346:   Zaldarriaga, M.\ 1996, \apj, 469, 437 
1347: 
1348: \bibitem[Smoot et al.(1992)]{smoot:1992} Smoot, G.~F., et al.\ 
1349: 1992, \apjl, 396, L1 
1350: 
1351: \bibitem[Sokal (1989)]{Sokal:1989}
1352: Sokal, A.~D.\ 1989, ``Monte Carlo methods in statistical mechanics: foundations
1353: and new algorithms'', \emph{Cours de Troisi\`{e}me Cycle de la Physique en Suisse Romande},
1354: Lausanne.
1355: 
1356: \bibitem[Taylor et al.(2007)]{taylor:2007} Taylor, J.~F., Ashdown, 
1357: M.~A.~J., \& Hobson, M.~P.\ 2007, MNRAS, submitted, [arXiv:0708.2989] 
1358: 
1359: \bibitem[Wandelt et al.(2004)]{wandelt:2004} 
1360:   Wandelt, B.~D., Larson, D.~L., \& Lakshminarayanan, A.\ 2004, \prd,
1361:   70, 083511
1362: 
1363: \bibitem[Zaldarriaga \& Seljak(1997)]{zaldarriaga:1997} 
1364: Zaldarriaga, M., \& Seljak, U.\ 1997, \prd, 55, 1830 
1365: 
1366: \end{thebibliography}
1367: 
1368: 
1369: \end{document}