0703:math0703715/imc.tex

1: % Inferring Markov Chains: Bayesian Estimation,

2: %	Model Comparison, Entropy Rate, and Out-of-class Modeling

3: %

4: % ccs: mar 01, 2007

5: % jpc: mar 10, 2007

6: % ccs: mar 13, 2007

7: % jpc: mar 23, 2007

8:

9: \documentclass[pre,twocolumn,showpacs,superscriptaddress,preprintnumbers,floatfix]{revtex4}

10: %\documentclass[pre,showpacs,superscriptaddress,preprintnumbers,floatfix]{revtex4}

11:

12: %

13: %-packages

14: \usepackage{amssymb,amsmath} % math utilities

15: \usepackage{graphicx}% Include figure files

16: \usepackage{bm}% Bold mat

17: %\usepackage[pstricks1-10]{vaucanson-g} % for FSA diagrams

18: \usepackage{vaucanson-g} % for FSA diagrams, older version of ps-tricks

19:

20: %

21: %-new user commands

22:

23: %% references %%

24: \newcommand{\eqnref}[1]{Eq.~(\ref{#1})}

25: \newcommand{\begeqnref}[1]{Equation~\ref{#1}}

26: \newcommand{\figref}[1]{Fig.~\ref{#1}}

27: \newcommand{\begfigref}[1]{Figure~\ref{#1}}

28: \newcommand{\appref}[1]{App.~\ref{#1}}

29:

30: %% notation %%

31: \newcommand{\hmu}	{h_\mu} % entropy rate

32: \newcommand{\hmuL}	{h_{\mu L}} % entropy rate

33: \newcommand{\hmuk}	{h_{\mu k}} % entropy rate

34: \newcommand{\KLd}	{\mathcal{D}} % entropy rate

35: \newcommand{\EQP}	{E(Q,P)}

36: % size of the alphabet

37: \newcommand{\Asize}[0]{\vert \mathcal{A} \vert}

38: % history of length-k: #1

39: \newcommand{\hk}[0]{ \overleftarrow{s}^k}

40: % history of length l:#2, at time t:#1

41: \newcommand{\htl}[2]{ \overleftarrow{s}_{#1}^{#2}}

42:

43: % specific forms

44: \newcommand{\nsk}[0]{n(\overleftarrow{s}^k)} % number of s^k

45: \newcommand{\nsks}[0]{n(\overleftarrow{s}^k s)} % number of s^k s

46: \newcommand{\ask}[0]{\alpha(\overleftarrow{s}^k)} % alpha s^k

47: \newcommand{\asks}[0]{\alpha(\overleftarrow{s}^k s)} % alpha s^k s

48: % model parameters

49: \newcommand{\MP}[0]{\mathbf{\theta}} %model parameters

50: \newcommand{\MPk}[0]{\mathbf{\theta}_k} %model parameters, k-th order

51: \newcommand{\MC}[0]{\mathbf{M}} % model

52: \newcommand{\MCk}[0]{\mathbf{M}_k} % model, k-th order

53: \newcommand{\MCkprime}[0]{\mathbf{M}_{k}'} % model, k-th order

54:

55: % avg, var, covar

56: \newcommand{\avg}[2]{\mathbf{E}_{#2}[#1]} %expectation values

57: \newcommand{\var}[2]{\mathbf{Var}_{#2}[#1]} % variance

58: \newcommand{\cov}[2]{\mathbf{Cov}[#1,#2]} % covariance

59: % probabilities

60: % true distribution

61: \newcommand{\psk}[0]{p(\overleftarrow{s}^k)}  % True parameter

62: \newcommand{\psks}[0]{p(s\vert \overleftarrow{s}^k)}  % True parameter

63: % pme distribution

64: \newcommand{\qsk}[0]{q(\overleftarrow{s}^k)}

65: \newcommand{\qsks}[0]{q(s\vert \overleftarrow{s}^k)}

66: % prior distribution

67: \newcommand{\rsk}[0]{r(\overleftarrow{s}^k)}

68: \newcommand{\rsks}[0]{r(s\vert \overleftarrow{s}^k)}

69: % uniform distribution

70: \newcommand{\usks}[0]{u(s\vert \overleftarrow{s}^k)}  % Uniform distribution

71: \newcommand{\usk}[0]{u(\overleftarrow{s}^k)}  % Uniform distribution

72:

73: %-inference methods

74: \newcommand{\mle}[0]{MLE} % maximum likelihood estimate

75: \newcommand{\pme}[0]{PME} % posterior mean estimate

76: \newcommand{\map}[0]{MAP} % maximum a-posteriori

77:

78: \begin{document}

79:

80: \preprint{Santa Fe Institute Working Paper 07-03-XXX}

81: \preprint{arxiv.org/xxxxx/0703XXX}

82:

83: \title{Inferring Markov Chains: Bayesian Estimation,\\

84: Model Comparison, Entropy Rate, and Out-of-class Modeling}

85:

86: \author{Christopher~C.~Strelioff}

87: 	\email{streliof@uiuc.edu}

88: 	\affiliation{Center for Computational Science \&

89: 	Engineering and Physics Department,\\

90: 	University of California at Davis, One Shields Avenue, Davis, CA 95616}

91:  	\affiliation{Center for Complex Systems Research

92:  	and Physics Department,\\

93: 	University of Illinois at Urbana-Champaign,

94: 	1110 West Green Street, Urbana, Illinois 61801}

95: \author{James P. Crutchfield}

96: 	\email{chaos@cse.ucdavis.edu}

97: 	\affiliation{Center for Computational Science \&

98: 	Engineering and Physics Department,\\

99: 	University of California at Davis, One Shields Avenue, Davis, CA 95616}

100: \author{Alfred W. H\"{u}bler}

101: 	\email{a-hubler@uiuc.edu}

102:  	\affiliation{Center for Complex Systems Research

103:  	and Physics Department,\\

104: 	University of Illinois at Urbana-Champaign,

105: 	1110 West Green Street, Urbana, Illinois 61801}

106:

107: \begin{abstract}

108: Markov chains are a natural and well understood tool for describing

109: one-dimensional patterns in time or space. We show how to infer $k$-th order

110: Markov chains, for arbitrary $k$, from finite data by applying Bayesian

111: methods to both parameter estimation and model-order selection. Extending

112: existing results for multinomial models of discrete data, we connect inference

113: to statistical mechanics through information-theoretic (type theory) techniques.

114: We establish a direct relationship between Bayesian evidence and the partition

115: function which allows for straightforward calculation of the expectation and

116: variance of the conditional relative entropy and the source entropy rate.

117: Finally, we introduce a novel method that uses finite data-size scaling with

118: model-order comparison to infer the structure of out-of-class processes.

119: \end{abstract}

120:

121: %%% PACS

122: % Inference methods, 02.50.Tt

123: % Markov processes, 02.50.Ga

124: % Stochastic models- in statistical physics and nonlinear dynamics, 05.10.Gg

125: \pacs{02.50.Tt,02.50.Ga,05.10.Gg}

126:

127: \maketitle

128:

129: %

130: % introduction

131: %

132: \section{Introduction}

133:

134: Statistical inference of models from small data samples is a vital tool in

135: the understanding of natural systems.  In many problems of interest data

136: consists of a sequence of \emph{letters} from a finite \emph{alphabet}.

137: Examples include analysis of sequence information in

138: biopolymers~\cite{Avery1999,JSLiu1999}, investigation of

139: one-dimensional spin systems~\cite{Crutchfield1997}, models of natural

140: languages~\cite{MacKay1994}, and coarse-grained models of chaotic

141: dynamics~\cite{Crutchfield1983,BLHao1998}.  This diversity of potential

142: application has resulted in the development of a variety of representations

143: for describing discrete-valued data series.

144:

145: We consider the $k$-th order Markov chain model class which uses the previous

146: $k$ letters in a sequence to predict the next letter. Inference of Markov

147: chains from data has a long history in mathematical statistics.  Early work

148: focused on maximum likelihood methods for estimating the parameters of the

149: Markov chain~\cite{TWAnderson1957,Billingsley1961a,Chatfield1973}. This work

150: often assumed a given fixed model order. That is, no \emph{model comparison}

151: across orders is done. This work also typically relied on the assumed

152: asymptotic normality of the likelihood when estimating regions of

153: confidence and when implementing model comparison.  As a result, the realm

154: of application has been limited to data sources where these conditions are

155: met.  One consequence of these assumptions has been that data sources which

156: exhibit \emph{forbidden words}, symbol sequences which are not allowed, cannot

157: be analyzed with these methods.  This type of data violates the assumed

158: normality of the likelihood function.

159:

160: More recently, model comparison in the maximum likelihood approach has been

161: extended using various \emph{information criteria}. These methods for

162: model-order selection are based on extensions of the likelihood ratio and allow

163: the comparison of more than two candidate models at a time. The most widely used

164: are \emph{Akaike's information criterion} (AIC)~\cite{HTong1975} and the

165: \emph{Bayesian information criterion} (BIC)~\cite{Katz1981}. (Although the

166: latter is called Bayesian, it does not employ Bayesian model comparison in

167: the ways we will present here.) In addition to model selection using information

168: criteria, methods from information theory and machine learning have also been

169: developed.  Two of the most widely employed are \emph{minimum

170: description length} (MDL)~\cite{JRissanen1984} and \emph{structural risk

171: minimization}~\cite{VVapnik1999}.  Note that MDL and Bayesian

172: methods obtain similar results in some situations~\cite{Vitanyi2000}.  However,

173: to the best of our knowledge, structural risk minimization has not been adapted

174: to Markov chain inference.

175:

176: We consider Bayesian inference of the Markov chain model class, extending

177: previous results~\cite{MacKay1994,JSLiu1999,Baldi2001,Durbin1998}. We provide

178: the details necessary to infer a Markov chain of arbitrary order, choose

179: the appropriate order (or weight orders according to their probability),

180: and estimate the data source's entropy rate.  The latter is important for

181: estimating the intrinsic randomness and achievable compression rates for

182: an information source~\cite{Cover1991}.  The ability to weight Markov chain

183: orders according to their probability is unique to Bayesian methods and

184: unavailable in the model selection techniques discussed above.

185:

186: In much of the literature just cited, steps of the inference process

187: are divided into (i) point estimation of model parameters, (ii) model

188: comparison (hypothesis testing), and (iii) estimation of functions of the

189: model parameters. Here we will show that Bayesian inference connects all

190: of these steps, using a unified set of ideas. Parameter estimation is the first

191: step of inference, model comparison a second level, and estimation of the

192: entropy rate a final step, intimately related to the mathematical structure

193: underlying the inference process.  This view of connecting model to data

194: provides a powerful and unique understanding of inference not available in the

195: classical statistics approach to these problems. As we demonstrate, each of

196: these steps is vital and implementation of one step without the others does

197: not provide a complete analysis of the data-model connection.

198:

199: Moreover, the combination of inference of model parameters, comparison of

200: performance across model orders, and estimation of entropy rates provides a

201: powerful tool for understanding Markov chain models themselves. Remarkably,

202: this is true even when the generating data source is outside of the Markov

203: chain model class.

204: Model comparison provides a sense of the structure of the data source, whereas

205: estimates of the entropy rate provide a description of the inherent randomness.

206: Bayesian inference, information theory, and tools from statistical mechanics

207: presented here touch on all of these issues within a unified framework.

208:

209: We develop this as follows, assuming a passing familiarity with Bayesian

210: methods and statistical mechanics. First, we discuss estimation of Markov

211: chain parameters using Bayesian methods, emphasizing the use of the complete

212: marginal posterior density for each parameter, rather than point estimates

213: with error bars. Second, we consider selection of the appropriate memory

214: $k$ given a particular data set, demonstrating that a mixture of orders may

215: often be more appropriate than selecting a single order. This is certainly

216: a more genuinely Bayesian approach. In these first two parts

217: we exploit different forms of Bayes' theorem to connect data and model class.

218:

219: Third, we consider the mathematical structure of the evidence (or marginal

220: likelihood) and draw connections to statistical mechanics.  In this discussion

221: we present a method for estimating entropy rates by taking derivatives of a

222: partition function formed from elements of each step of the inference procedure.

223: Last, we apply these tools to three example information sources of increasing

224: complexity. The first example belongs to the Markov chain model class, but

225: the other two are examples of hidden Markov models (HMMs) that fall outside

226: of that class. We show that the methods developed here provide a powerful tool

227: for understanding data from these sources, even when they do not belong to the

228: model class being assumed.

229:

230: %%

231: %%

232: \section{Inferring Model Parameters}

233:

234: In the first level of Bayesian inference we develop a systematic relation

235: between the data $D$, the chosen \emph{model class} $M$, and the vector of

236: \emph{model parameters} $\MP$. The object of interest in the inference of

237: model parameters is the \emph{posterior probability density}

238: $P\left( \MP \vert D, M \right)$.  This is the probability of the model

239: parameters given the observed data and chosen model. To find the posterior

240: we first consider the joint distribution $P\left( \MP, D \vert M \right)$

241: over the data and model parameters given that one has chosen to model the

242: source with a representation in a certain class $M$. This can be factored in

243: two ways: $P\left( \MP \vert D, M \right)P\left(D \vert M\right)$ or

244: $P\left( D \vert \MP, M \right)P\left(\MP \vert M\right)$.  Setting these

245: equal and solving for the posterior we obtain Bayes' theorem:

246: \begin{equation}

247: \label{eqn:bayes}

248: P\left( \MP \vert D, M \right)

249: 	= \frac{ P\left( D \vert \MP , M \right) \;

250: 	P\left( \MP \vert M \right) }{ P\left( D \vert M \right) }.

251: \end{equation}

252:

253: The \emph{prior} $P\left( \MP \vert M \right)$ specifies our assumptions

254: regarding the model parameters. We take a pragmatic view of the prior,

255: considering its specification to be a statement of assumptions about the

256: chosen model class. The \emph{likelihood} $P\left( D \vert \MP , M \right)$

257: describes the probability of the data given the model parameters.  Finally, the

258: \emph{evidence} (or marginal likelihood) $P\left( D \vert M \right)$ is the

259: probability of the data given the model.  In the following sections we

260: describe each of the quantities in detail on our path to giving an explicit

261: expression for the posterior.

262:

263: %%

264: \subsection{Markov chains}

265:

266: The first step in inference is to clearly state the assumptions that make up

267: the model.  This is the foundation for writing down the likelihood of a data

268: sample and informs the choice of prior. We assume that a single data set of

269: length $N$ is the starting point of the inference and that it consists of

270: \textit{symbols} $s_t$ from a finite alphabet $\mathcal{A}$,

271: \begin{equation}

272: 	\label{eqn:data}

273: 	D = s_0 s_1 \ldots s_{N-1} \; , \; s_t \in \mathcal{A}.

274: \end{equation}

275: We introduce the notation $\htl{t}{k}$ to indicate a length-$k$ sequence of

276: letters ending at position $t$: e.g., $\htl{4}{2}=s_3s_4$.

277:

278: The $k$-th order Markov chain model class assumes finite memory and

279: stationarity in the data source.  The finite memory condition, a

280: generalization of the conventional Markov property, can be written

281: \begin{equation}

282: p(D)	 = p(\htl{k-1}{k}) \prod_{t=k-1}^{N-2} p(s_{t+1} \vert \htl{t}{k}) ~,

283: 	    \label{eqn:markov_condition}

284: \end{equation}

285: thereby factoring into terms which depend only on preceding words of

286: length-$k$. The stationarity condition can be expressed

287: \begin{equation}

288: 	\label{eqn:stationarity}

289: 	p(s_t \vert \htl{t-1}{k}) = p(s_{t+m} \vert \htl{t+m-1}{k}) ~,

290: \end{equation}

291: for any $(t,m)$.  \begeqnref{eqn:stationarity} results in a simplification of

292: the notation because we no longer need to track the position index,

293: $p(s_t = s \vert \htl{t-1}{k} = \hk ) = p( s \vert \hk )$ for any $t$.  Given

294: these two assumptions, the model parameters of the $k$-th order Markov chain

295: $\MCk$ are

296: \begin{equation}

297: 	\label{eqn:model_parameters}

298:  	\MPk  = \left\{ \, p( s \vert \hk ) : s \in \mathcal{A},

299:  	\hk \in \mathcal{A}^k \, \right\}.

300: \end{equation}

301: A normalization constraint is placed on these parameters: $\sum_{s\in

302: \mathcal{A}} p( s \vert \hk ) = 1$ for each word $\hk$.

303:

304: The next step is to write down the elements of Bayes' theorem specific to the

305: $k$-th order Markov chain.

306:

307: %%

308: \subsection{Likelihood}

309:

310: Given a sample of data $D=s_{0}s_{1} \ldots s_{N-1}$, the likelihood can be

311: written down using the Markov property of~\eqnref{eqn:markov_condition} and the

312: stationarity of~\eqnref{eqn:stationarity}.  This results in the form

313: \begin{equation}

314: 	\label{eqn:likelihood}

315: 	P(D\vert \MPk, \MCk) = \prod_{ s \in \mathcal{A} }

316: 	\prod_{ \hk \in \mathcal{A}^{k} } p( s \vert \hk )^{\nsks} ,

317: \end{equation}

318: where $\nsks$ is the number of times the \textit{word} $\hk s$ occurs in the

319: sample $D$.  For future use we also introduce notation for the number of times a

320: word $\hk$ has been observed: $\nsk = \sum_{s \in \mathcal{A}} \nsks$.  We note

321: that~\eqnref{eqn:likelihood} is conditioned on the \emph{start sequence}

322: $\hk = s_0s_1\ldots s_{k-1}$.

323:

324: %%

325: \vspace{-0.125in}

326: \subsection{Prior}

327: \vspace{-0.125in}

328:

329: The prior $P(\MP \vert M)$ is used to specify assumptions about the model to be

330: inferred before the data is considered. Here we use

331: \emph{conjugate priors} for which the posterior distribution has the same

332: functional form as the prior.  Our choice allows us to derive exact expressions

333: for many quantities of interest in inference. This provides a powerful tool for

334: understanding what information is gained during inference and,

335: especially, model comparison.

336:

337: The exact form of the prior is determined by our assignment of

338: \emph{hyperparameters} $\asks$ for the prior which balance the strength of

339: the modeling assumptions encoded in the prior against the weight of the data.

340: For a $k$-th order Markov chain, there is one hyperparameter for each word

341: $\hk s$, given the alphabet under consideration. A useful way to think about

342: the assignment of values to the

343: hyperparameters is to relate them to fake counts $\tilde{n}(\hk s)$, such that

344: $\asks = \tilde{n}(\hk s) + 1$.  In this way, the $\asks$ can be set to reflect

345: knowledge of the data source and the strength of these prior assumptions can be

346: properly weighted in relation to the actual data counts $\nsks$.

347:

348: The conjugate prior for Markov chain inference is a product of Dirichlet

349: distributions, one for each word $\hk$. It restates the finite-memory

350: assumption from the model definition:

351: \begin{eqnarray}

352: 	P(\MPk \vert \MCk )

353: 	& = & \prod_{\hk \in \mathcal{A}^{k}} \left\{

354: 	\frac{ \Gamma( \ask  )}{

355: 	\prod_{s\in\mathcal{A}} \Gamma( \asks ) } \right. \nonumber \\

356:     & \times & \delta \mathbf{(}1-\sum_{s\in\mathcal{A}}

357:     p( s \vert \hk )\mathbf{)} \label{eqn:prior} \\

358: 	& \times & \left. \prod_{s\in\mathcal{A}} p( s \vert \hk )^{\asks-1}

359: 	\right\}. \nonumber

360: \end{eqnarray}

361: (See \appref{app:Dirichlet} for relevant properties of Dirichlet

362: distributions.)

363: The prior's hyperparameters $\{ \asks \}$ must be real and positive.  We

364: also introduce the more compact notation $\ask = \sum_{s \in \mathcal{A}}

365: \asks$.  The function $\Gamma(x)$ is the well known Gamma function, which satisfies $\Gamma(x)=(x-1)!$ for positive integer $x$.  The

366: $\delta$-function constrains the model parameters to be properly normalized:

367: $\sum_{s \in \mathcal{A}} \psks = 1$ for each $\hk$.

368:

369: Given this functional form, there are at least two ways to interpret what the

370: prior says about the Markov chain parameters $\MPk$. In addition to considering

371: fake counts $\tilde{n}( \cdot )$, as discussed above, we can consider the

372: range of fluctuations in the estimated $\psks$. Classical statistics would

373: dictate describing the fluctuations via a single value with error bars. This

374: can be accomplished by finding the average and variance of $\psks$ with

375: respect to the prior. The result is:

376: \begin{eqnarray}

377: 	\label{eqn:prior_mean}

378: 	\avg{\psks}{\rm{prior}} & = & \frac{\asks}{\ask}~, \\

379: 	\label{eqn:prior_variance}

380: 	\var{\psks}{\rm{prior}} & = & \frac{\asks(\ask-\asks)}{\ask^2(1+\ask)} .

381: \end{eqnarray}

382:

383: A second method, more in line with traditional Bayesian estimation, is to

384: consider the marginal distribution for each model parameter. For a Dirichlet

385: distribution, the marginal for any one parameter will be a Beta distribution.

386: With this knowledge, a probability density can be provided for each Markov chain

387: parameter given a particular setting for the hyperparameters $\asks$. In this

388: way, the prior can be assigned and analyzed in substantial detail.

389:

390: A common stance in model inference is to assume all things are a-priori

391: equal.  This can be expressed by assigning $\asks=1$ for all $\hk \in

392: \mathcal{A}^k$ and $s \in \mathcal{A}$, adding \textit{no} fake counts

393: $\tilde{n}(\hk s)$.  This assignment results in a uniform prior distribution

394: over the model parameters and a prior expectation:

395: \begin{equation}

396: \avg{p(s\vert \hk)}{\rm{prior}} = 1/ \vert \mathcal{A} \vert ~.

397: \end{equation}

398:

399: %%

400: \vspace{-0.20in}

401: \subsection{Evidence}

402: \vspace{-0.125in}

403:

404: Given the likelihood and prior derived above, the evidence $P(D|M)$ is seen

405: to be a simple normalization term in Bayes' theorem.  In fact, the evidence

406: provides the probability of the data given the model $\MCk$ and so plays a

407: fundamental role in model comparison.  Formally, the definition is

408: \begin{equation}

409: 	P(D\vert \MCk ) = 	\int \; d\MPk \; P(D\vert \MPk, \MCk)

410: 						P(\MPk \vert \MCk ),

411: 	\label{eqn:evidence_defn}

412: \end{equation}

413: where we can see that this term can be interpreted as an average of the

414: likelihood over the prior distribution.  Applying this to the likelihood

415: in~\eqnref{eqn:likelihood} and the prior in~\eqnref{eqn:prior} produces

416: \begin{eqnarray}

417: 	P(D\vert \MCk) & = & \prod_{\hk \in \mathcal{A}^{k}} \left\{ \;

418: 	\frac{ \Gamma(\ask) }{ \prod_{s\in \mathcal{A}} \Gamma(\asks)}

419: 	\right. \nonumber \\

420: 	& & \label{eqn:evidence} \\

421: 	& \times & \left.

422: 	\frac{ \prod_{s\in \mathcal{A}} \Gamma(\nsks+\asks) }{ \Gamma(\nsk+\ask) }

423: 	\; \right\}. \nonumber

424: \end{eqnarray}

425: As we will see, this analytic expression results in the ability to make useful

426: connections to statistical mechanics techniques when estimating entropy rates.

427: This is another benefit of choosing a conjugate prior with known properties.

428:

429: %%

430: \subsection{Posterior}

431:

432: Using Bayes' theorem, \eqnref{eqn:bayes}, the results of the three previous

433: sections can be combined to obtain the posterior distribution over the

434: parameters of the $k$-th order Markov chain. One finds:

435: \begin{eqnarray}

436: 	P(\MPk\vert D, \MCk) & = & \prod_{\hk \in \mathcal{A}^{k}} \left\{

437: 	\frac{ \Gamma( \nsk + \ask  ) }{

438: 	\prod_{s\in\mathcal{A}} \Gamma( \nsks + \asks ) } \right. \nonumber \\

439: 	& \times & \delta \mathbf{(}1-\sum_{s\in\mathcal{A}} p( s \vert \hk)

440: 	\mathbf{)} \label{eqn:posterior} \\

441: 	& \times & \left. \prod_{s\in\mathcal{A}}

442: 	p( s \vert \hk )^{\nsks + \asks - 1} \right\}. \nonumber

443: \end{eqnarray}

444: As noted in selecting the prior, the resulting form is a Dirichlet

445: distribution with modified parameters.  This is a result of choosing the

446: conjugate prior: cf.\ the forms of \eqnref{eqn:prior} and

447: \eqnref{eqn:posterior}.

448:

449: From~\eqnref{eqn:posterior} the estimation of the model parameters

450: $p(s\vert \hk)$ and the uncertainty of these estimates can be given using the

451: known properties of the Dirichlet distribution.  As with the prior,

452: there are two main ways to understand what the posterior tells us about the

453: fluctuations in the estimated Markov chain parameters. The first uses a point

454: estimate with ``error bars''. We obtain these from the mean and variance of

455: the $\psks$ with respect to the posterior, finding

456: \begin{gather}

457:  	\avg{p(s\vert \hk)}{\rm{post}} =  \frac{ \nsks + \asks }{ \nsk + \ask }

458: 	\label{eqn:posterior_mean} ~, \\ \nonumber \\

459:  	\var{p(s\vert \hk)}{\rm{post}}  =  \frac{ \nsks +\asks }{ ( \nsk + \ask )^2 }

460: 	\nonumber  \\ \label{eqn:posterior_variance} \\

461: 	\times  \frac{ ( \nsk + \ask ) - ( \nsks + \asks  ) }{

462: 	( \nsk + \ask +1 ) }. \nonumber

463: \end{gather}

464: This is the \textit{posterior mean estimate} (\pme) of the model parameters.

465:

466: A deeper understanding of~\eqnref{eqn:posterior_mean} is obtained through a

467: simple factoring:

468: \begin{eqnarray}

469: 	\avg{p(s\vert \hk)}{\rm{post}} & = & \frac{1}{ \nsk + \ask }

470: 	\left[ \nsk \, \left (\frac{\nsks}{\nsk} \right) \right. \nonumber \\

471: 	&& 	\label{eqn:pme_factor} \\

472: 	& + & \left. \ask \, \left(\frac{\asks}{\ask} \right) \right], \nonumber

473: \end{eqnarray}

474: where $\nsks /\nsk $ is the \emph{maximum likelihood estimate} (\mle)

475: of the model parameters and $\asks /\ask$ is the prior expectation given

476: in~\eqnref{eqn:prior_mean}.  In this form, it is

477: apparent that the posterior mean estimate is a weighted sum of the \mle~and

478: prior expectation.  As a result, we can say that the posterior mean and

479: maximum likelihood estimates converge to the same value for $\nsk \gg \ask$.

480: Only when the data is scarce, or the prior is set with strong conviction,

481: does the Bayesian estimate add corrections to the \mle.

482:

483: A second method for analyzing the resulting posterior density is to consider the

484: marginal density for each parameter.  As discussed with the prior, the marginal

485: for a Dirichlet is a Beta distribution.  As a result, we can either provide

486: regions of confidence for each parameter or simply inspect the density function.

487: The latter provides much more information about the inference being made than

488: the point estimation just given.  In our examples, to follow shortly, we

489: plot the marginal posterior density for various parameters of interest

490: to demonstrate the wealth of information this method provides.

491:

492: Before we move on, we make a final point regarding the estimation of inference

493: uncertainty. The form of the posterior is not meant to reflect the potential

494: fluctuations of the data source.  Instead, the width of the distribution

495: reflects the possible Markov chain parameters which are consistent with the

496: observed data sample.  These are distinct notions and should not be conflated.

497:

498: %%

499: \subsection{Predictive distribution}

500:

501: Once we have an inferred model, a common task is to estimate the probability of

502: a new observation $D^{(new)}$ given the previous data and estimated model.

503: This is implemented by taking an average of the likelihood of the new data:

504: \begin{equation}

505: P(D^{(new)}\vert \MPk, \MCk)

506:   = \prod_{\hk \in \mathcal{A}^k, s \in \mathcal{A}} p(s\vert \hk)^{m(\hk s)}

507: \end{equation}

508: with respect to the posterior

509: distribution~\cite{MacKay2003}:

510: \begin{eqnarray}

511: 	\label{eqn:predictive_distribution_defn}

512: 	P(D^{(new)}\vert D,\MCk) & =  & \int  d\MPk  P(D^{(new)}\vert \MPk, \MCk) \\

513: 	& \times & P(\MPk \vert D, \MCk) ~. \nonumber

514: \end{eqnarray}

515: We introduce the notation $m(\hk s)$ to indicate the number of times the word

516: $\hk s$ occurs in $D^{(new)}$, with $m(\hk) = \sum_{s \in \mathcal{A}} m(\hk s)$. This method has the desirable property, compared

517: to point estimates, that it takes into account the uncertainty in the model

518: parameters $\MPk$ as reflected in the form of the posterior distribution.

519:

520: The evaluation of~\eqnref{eqn:predictive_distribution_defn} follows the same

521: path as the calculation for the evidence and produces a similar

522: form; we find:

523: \begin{gather}

524: 	P(D^{(new)}\vert D, \MCk)  =  \prod_{\hk \in \mathcal{A}^{k}} \left\{ \;

525: 	\frac{ \Gamma( \nsk+\ask) }{ \prod_{s\in \mathcal{A}}

526: 	\Gamma( \nsks + \asks)} \right. \nonumber \\

527: 	\label{eqn:predictive_distribution} \\

528: 	\times  \left. \frac{ \prod_{s\in \mathcal{A}}

529: 	\Gamma( \nsks + m(\hk s) + \asks ) }{ \Gamma( \nsk + m(\hk) + \ask ) }

530: 	\; \right\}. \nonumber

531: \end{gather}

532:

533: %%

534: %%

535: \section{Model Comparison}

536:

537: With the ability to infer a Markov chain of a given order $k$, a common-sense

538: question to ask is: how do we choose the correct order given a particular data

539: set?  Bayesian methods have a systematic way to address this through

540: the use of \emph{model comparison}.

541:

542: In many ways, this process is analogous to inferring model parameters

543: themselves, which we just laid out.  We start by enumerating the set of model

544: orders to be compared $\mathcal{M} = \{ \MCk \}_{k_{min}}^{k_{max}}$, where

545: $k_{min}$ and $k_{max}$ correspond to the minimum and maximum order to be

546: inferred, respectively.  Although we will not consider an independent,

547: identically distributed (IID) model ($k=0$) here, we do note that this could

548: be included using the same techniques described below.

549:

550: We start with the joint probability $P(\MCk,D \vert \mathcal{M} )$ of a

551: particular model $\MCk \in \mathcal{M}$ and data sample $D$, factoring it in

552: two ways following Bayes' theorem. Solving for the probability of a particular

553: model class we obtain

554: \begin{equation}

555: 	\label{eqn:model_comparison}

556:  	P(\MCk \vert D , \mathcal{M} ) = \frac{ P(D \vert \MCk, \mathcal{M} )

557:  					P(\MCk \vert \mathcal{M} ) }{ P(D \vert \mathcal{M})} ,

558: \end{equation}

559: where the denominator is the sum given by

560: \begin{equation}

561: P(D \vert \mathcal{M}) =

562:   \sum_{\MCkprime \in \mathcal{M}}

563:   P(D \vert \MCkprime, \mathcal{M} )P(\MCkprime \vert \mathcal{M} ) ~.

564: \end{equation}

565: The probability of a particular model class in the set under consideration is

566: driven by two components: the evidence $P(D \vert \MCk, \mathcal{M})$, derived

567: in \eqnref{eqn:evidence}, and the prior over model classes

568: $P(\MCk \vert \mathcal{M} )$.

569:

570: Two common priors in model comparison are: (i) all models are equally likely

571: and (ii) models should be penalized for the number of free parameters used to

572: fit the data.  In the first instance

573: $P(\MCk \vert \mathcal{M})=1/ \vert \mathcal{M} \vert$ is the same for all

574: orders $k$.  However, this factor cancels out because it appears in both the

575: numerator and denominator.  As a result, the probability of models using this

576: prior becomes

577: \begin{equation}

578: 	\label{eqn:best_model_uniform_prior}

579: 	P(\MCk \vert D , \mathcal{M} ) = \frac{P(D \vert \MCk, \mathcal{M} )

580: 					}{

581: 					\sum_{\MCkprime \in \mathcal{M}}

582: 					P(D \vert \MCkprime, \mathcal{M} )}.

583: \end{equation}

584:

585: In the second case, a common penalty for the number of model parameters is

586: \begin{equation}

587: \label{eqn:df_penalty_prior}

588: P(\MCk \vert \mathcal{M}) = \frac{\exp( - \vert \MCk \vert )

589: 						  }{\sum_{\MCkprime \in \mathcal{M}}

590: 						  \exp( - \vert \MCkprime \vert ) } ~,

591: \end{equation}

592: where $\vert \MCk \vert$ is the number of free parameters in the model. For a

593: $k$-th order Markov chain, the number of free parameters is

594: \begin{equation}

595: \vert \MCk \vert = \vert \mathcal{A} \vert^k(\vert \mathcal{A} \vert-1) ~,

596: \end{equation}

597: where $\vert \mathcal{A} \vert$ is the alphabet size. Thus, model

598: probabilities under this prior take on the form

599: \begin{equation}

600: 	\label{eqn:best_model_df_penalty_prior}

601: 	P(\MCk \vert D , \mathcal{M} ) = \frac{

602: 					P(D \vert \MCk, \mathcal{M} ) \exp( - \vert \MCk \vert )

603: 					}{

604: 					\sum_{\MCkprime}

605: 					P(D \vert \MCkprime, \mathcal{M} )

606: 					\exp( - \vert \MCkprime \vert ) }.

607: \end{equation}

608: We note that the normalization sum in~\eqnref{eqn:df_penalty_prior}

609: cancels because it appears in both the numerator and denominator.

610:

611: Bayesian model comparison has a natural \emph{Occam's razor} in the model

612: comparison process~\cite{MacKay2003}.  This means there is a natural preference

613: for smaller models even when a uniform prior over model orders is applied.  In

614: this light, a penalty for the number of model parameters can be seen as a very

615: cautious form of model comparison.  Both of these priors,

616: \eqnref{eqn:best_model_uniform_prior} and

617: \eqnref{eqn:best_model_df_penalty_prior}, will be considered in

618: the examples to follow.

619:

620: A note is in order on computational implementation. In general, the resulting

621: probabilities can be extremely small, easily resulting in numerical underflow

622: if the equations are not implemented with care. As mentioned

623: in~\cite{Durbin1998}, computation with extended logarithms can be used to

624: alleviate these concerns.

625:

626: %%

627: %%

628: \section{Information Theory, Statistical Mechanics, and Entropy Rates}

629:

630: An important property of an information source is its \emph{entropy rate}

631: $\hmu$, which indicates the degree of intrinsic randomness and controls the

632: achievable compression. A first attempt at estimating a source's entropy rate

633: might consist of plugging a Markov chain's estimated model parameters into the

634: known expression~\cite{Cover1991}. However, this does not

635: accurately reflect the posterior distribution derived above. This observation

636: leaves two realistic alternatives. The first option is to sample model

637: parameters from the posterior distribution. These samples can then be used to

638: calculate a set of entropy rate estimates that reflect the underlying posterior

639: distribution. A second option, which we take here, is to adapt methods from

640: type theory and

641: statistical mechanics previously developed for IID models~\cite{Samengo2002}

642: to Markov chains. To the best of our knowledge this is the first time these

643: ideas have been extended to inferring Markov chains; although cf.

644: \cite{Young1994}.

645:

646: In simple terms, type theory shows that the probability of an observed sequence

647: can be written in terms of the \emph{Kullback-Leibler} (KL) \emph{distance} and

648: the entropy rate.  When applied to the Markov chain inference problem the resulting

649: form suggests a connection to statistical mechanics. For example, we will show

650: that averages of the KL-distance and entropy rate with respect to the posterior

651: are found by taking simple derivatives of a partition function.

652:

653: The connection between inference and information theory starts by considering

654: the product of the prior~\eqnref{eqn:prior} and

655: likelihood~\eqnref{eqn:likelihood}:

656: \begin{equation}

657: P(\MPk\vert \MCk)P( D\vert \MPk, \MCk)=P( D, \MPk\vert \MCk) ~.

658: \end{equation}

659: This forms a joint distribution over the observed data $D$ and model parameters

660: $\MPk$ given the model order $\MCk$. Denoting the normalization constant from

661: the prior as $Z$ to save space, this joint distribution is

662: \begin{equation}

663: 	\label{eqn:product_prior_likelihood}

664:  	P( D, \MPk\vert \MCk) = Z \, \prod_{\hk, s}

665:  	p( s \vert \hk )^{\nsks + \asks - 1}.

666: \end{equation}

667: This form can be written, without approximation, in terms of conditional

668: relative entropies $\KLd [\cdot \| \cdot ]$ and entropy rate $\hmu [\cdot]$:

669: \begin{eqnarray}

670: 	\label{eqn:info_prior_likelihood}

671:  	P( D, \MPk\vert \MCk) & = & Z \, 2^{-\beta_k \mathbf{(} \KLd [Q \| P ]

672:  	 + \hmu [Q]\mathbf{)}} \\

673: 	& \times & 2^{+\Asize^{k+1} \mathbf{(} \KLd [ U \| P ]

674: 	+ \hmu [U]\mathbf{)}} ~, \nonumber

675: \end{eqnarray}

676: where $\beta_k = \sum_{\hk,s} \left[ \nsks + \asks \right]$ and the

677: distribution of true parameters is

678: $P = \{ \psk, \psks \}$. The distributions $Q$ and $U$ are given by

679: \begin{eqnarray}

680: 	\label{eqn:pme_distribution}

681: 	Q & = & \left\{ \qsk = \frac{\nsk+\ask}{\beta_k} , \right. \\

682: 	  & &	\left. \qsks = \frac{\nsks + \asks}{\nsk + \ask} \right\}

683: 	  \nonumber \\

684: 	\label{eqn:uniform_distribution}

685: 	U & = & \left\{ \usk = \frac{1}{\Asize^k}, \usks = \frac{1}{\Asize} \right\}

686: 	~,

687: \end{eqnarray}

688: where $Q$ is the distribution defined by the posterior mean and $U$ is a uniform

689: distribution. The information-theoretic quantities used above are given by

690: \begin{eqnarray}

691: 	\KLd [ Q \| P ] & = & \sum_{s, \hk} \qsk \qsks \log_2 \frac{\qsks}{\psks}

692: 	\label{eqn:conditional_KL_div} \\

693: 	\hmu [ Q ] 	& = & - \sum_{s, \hk} \qsk \qsks \log_2 \qsks ~.

694: 	\label{eqn:entropy_rate_estimate}

695: \end{eqnarray}

696: The form of~\eqnref{eqn:info_prior_likelihood} and its relation to the evidence

697: suggests a connection to statistical mechanics: The evidence

698: $P(D \vert \MCk) = \int d\MPk P( D, \MPk\vert \MCk)$ is a partition function

699: $\mathcal{Z} = P( D \vert \MCk)$.  Using conventional techniques, the

700: expectation and variance of the ``energy''

701: \begin{equation}

702: \label{eqn:info_energy}

703: \EQP = \KLd [Q \| P ] + \hmu [Q]

704: \end{equation}

705: are obtained by taking derivatives of the logarithm of the partition function

706: with respect to $\beta_k$:

707: \begin{eqnarray}

708: 	\avg{\, \EQP \, }{\rm{post}}

709: 	& = &

710: 	- \frac{1}{\log 2}

711: 	\frac{\partial}{\partial \beta_k} \, \log \mathcal{Z}

712: 	\label{eqn:info_mean_energy}\\

713:  	\var{\, \EQP \, }{\rm{post}}

714: 	& = &

715: 	\frac{1}{\log 2}

716:  	\frac{\partial^2}{\partial \beta_k^2} \, \log \mathcal{Z}

717: 	~.

718: 	\label{eqn:info_variance_energy}

719: \end{eqnarray}

720: The factors of $\log 2$ in the above expressions come from the decision to use

721: base 2 logarithms in the definition of our information-theoretic quantities.

722: This results in values in \emph{bits} rather than \emph{nats}~\cite{Cover1991}.

723:

724: To evaluate the above expression, we take advantage of the known form for the

725: evidence provided in~\eqnref{eqn:evidence}.  With the definitions $\alpha_k =

726: \sum_{\hk} \ask$ and

727: \begin{equation}

728: 	\label{eqn:prior_distribution}

729: 	R = \left\{ \rsk = \frac{\ask}{\alpha_k} ,

730: 	\rsks = \frac{\asks}{\ask} \right\}

731: \end{equation}

732: the negative logarithm of the partition function can be written

733: \begin{eqnarray}

734: 	- \log \mathcal{Z} & = & \sum_{\hk,s} \log \Gamma

735: 	\left[ \alpha_k \rsk \rsks \right]

736: 	\\ & - & \sum_{\hk} \log \Gamma \left[ \alpha_k \rsk \right]

737: 	+  \sum_{\hk} \log \Gamma \left[ \beta_k \qsk \right] \nonumber \\

738: 	& - & \sum_{\hk,s} \log \Gamma

739: 	\left[ \beta_k \qsk \qsks \right]. \nonumber

740: \end{eqnarray}

741:

742: From this expression, the desired expectation is found by taking derivatives

743: with respect to $\beta_k$; we find that

744: \begin{gather}

745: 	\avg{\, \EQP \, }{\rm{post}}

746: 		= \frac{1}{\log 2}

747: 		\sum_{\hk} \qsk \psi^{(0)} \left[ \beta_k \qsk \right]

748: 		\nonumber \\

749: 	-  \frac{1}{\log 2} \sum_{\hk,s} \qsk \qsks \psi^{(0)}

750: 	\left[ \beta_k \qsk \qsks \right]~. \nonumber \\

751: 	\label{eqn:average_info}

752: \end{gather}

753: The variance is obtained by taking a second derivative with respect to

754: $\beta_k$, producing

755:

756: \begin{gather}

757: 	\var{\, \EQP \, }{\rm{post}}  =

758: 	- \frac{1}{\log 2} \sum_{\hk} \qsk^2 \psi^{(1)} \left[ \beta_k \qsk \right]

759: 	\nonumber \\

760: 	+  \frac{1}{\log 2} \sum_{\hk,s} \qsk^2 \qsks^2 \psi^{(1)}

761: 	\left[ \beta_k \qsk \qsks \right]. \nonumber \\

762: 	\label{eqn:variance_info}

763: \end{gather}

764: In both of the above the polygamma function is defined as $\psi^{(n)}(x) =

765: d^{n+1}/dx^{n+1} \log \Gamma(x)$. (For further details, consult a reference

766: such as~\cite{Abramowitz1965}.)

767:

768: From the form of~\eqnref{eqn:average_info}

769: and~\eqnref{eqn:variance_info}, the meaning is not immediately clear. We can

770: use an expansion of the $n=0$ polygamma function

771: \begin{equation}

772: \psi^{(0)}(x) = \log x - 1/(2x) + \mathcal{O}(x^{-2}) ~,

773: \end{equation}

774: valid for $x \gg 1$, however, to obtain an asymptotic form

775: for~\eqnref{eqn:average_info}; we find

776: \begin{gather}

777:  	\avg{\, \EQP \, }{\rm{post}} =

778:  	H[ \qsk \qsks ] - H[\qsk] \nonumber \\

779: 	+ \frac{1}{2\beta_k} \Asize^k(\Asize -1 )

780: 	+ \mathcal{O}(1/ \beta_k^2)

781: 	\label{eqn:average_info_asymptotic}.

782: \end{gather}

783: From this we see that the first two terms make up the entropy

784: rate $\hmu [ Q ] = H[ \qsk \qsks ] - H[\qsk]$ and the last

785: term is associated with the conditional relative entropy between the posterior

786: mean distribution $Q$ and true distribution $P$.

787:

788: In summary, we have found the average of conditional relative entropy and

789: entropy rate with respect to the posterior density.  This was accomplished by

790: making connections to statistical mechanics through type theory.  Unlike

791: sampling from the posterior to estimate the entropy rate, this method results

792: in an analytic form which approaches $\hmu [ P ]$ as the inverse of the data

793: size. This method for approximating $\hmu$ also provides a computational

794: benefit. No eigenstates have to be found from the Markov transition matrix,

795: allowing for the storage of values in sparse data structures. This provides

796: a distinct computational advantage when large orders or alphabets are

797: considered.

798:

799: Finally, it might seem awkward to use the expectation

800: of~\eqnref{eqn:info_energy} for estimation of the entropy rate.  This method

801: was chosen because it is the form that naturally appears in writing down the

802: likelihood-prior combination in~\eqnref{eqn:info_prior_likelihood}.  As a result

803: of using this method, most of the results obtained above are without

804: approximation.  We were also able to show this expectation converges to the

805: desired value in a well-behaved manner.

806:

807: %%

808: %%

809: \vspace{-0.125in}

810: \section{Examples}

811: \vspace{-0.125in}

812:

813: To explore how the above produces a robust inference procedure, let's now

814: consider the statistical inference of a series of increasingly complex data

815: sources. The first, called the \emph{golden mean} process, is a first-order

816: Markov chain. The second data source is called the \emph{even process} and

817: cannot be represented by a Markov chain with finite order. However, this source

818: is a deterministic HMM, meaning that the current state and next output symbol

819: uniquely determine the next state.  Finally, we consider the \emph{simple

820: nondeterministic source}, so named since its smallest representation is as

821: a nondeterministic HMM. (Nondeterminism here refers to the HMM structure: the

822: current state and next output symbol do not uniquely determine the next state.

823: This source is represented by an infinite-state deterministic HMM

824: \cite{Crutchfield1994,Upper1997}.)

825:

826: The golden mean, even, and simple nondeterministic processes can all be written

827: down as models with two internal states---call them $A$ and $B$.  However, the

828: complexity of the data generated from each source is of markedly different

829: character. Our goal in this section is to consider the three main steps in

830: inference to analyze them. First, we consider inference of a first-order Markov

831: chain to demonstrate the

832: estimation of model parameters with uncertainty.  Second, we consider model

833: comparison for a range of orders $k$.  This allows us to discover structure in

834: the data source even though the true model class cannot be captured in all

835: cases. Finally, we consider estimation of entropy rates from these data sources,

836: investigating how randomness is expressed in them.

837:

838: While investigating these processes we consider average data counts,

839: rather than sample counts from specific realizations, as we want

840: to focus specifically on the average performance of Bayesian inference.  To

841: do this we take advantage of the known form of the sources. Each is described

842: by a transition matrix $T$, which gives transitions between states

843: $A$ and $B$:

844: \begin{equation}

845: 	\label{eqn:transition_matrix_definition}

846: 	T = \left[ \begin{array}{cc}

847: 	p(A\vert A) & p(B\vert A) \\

848: 	p(A\vert B) & p(B\vert B)

849: 	\end{array}

850: 	\right] \;.

851: \end{equation}

852: Although two of our data sources are not finite Markov chains, the transition

853: matrix between internal states is Markov.  This means the matrix

854: is \emph{stochastic} (all rows sum to one) and we are guaranteed an eigenstate

855: $\vec{\pi}$ with eigenvalue one: $\vec{\pi} \, T = \vec{\pi}$.  This eigenstate

856: describes the asymptotic distribution over internal states:

857: $\vec{\pi} = \left[ p(A), p(B) \right]$.

858:

859: The transition matrix can be divided into labeled matrices $T^{(s)}$ which

860: contain those elements of $T$ that output symbol $s$. For our binary data

861: sources one has

862: \begin{equation}

863: 	\label{eqn:transition_matrix}

864: 	T = T^{(0)} + T^{(1)}.

865: \end{equation}

866: Using these matrices, the average probability of words can be estimated for

867: each process of interest. For example, the probability of word $01$ can be

868: found using

869: \begin{equation}

870: p(01) = \vec{\pi} \, T^{(0)}T^{(1)} \vec{\eta} ~,

871: \end{equation}

872: where $\vec{\eta}$ is a column vector with all $1$'s. In this way, for any

873: data size $N$, we estimate the average count for a word as

874: \begin{equation}

875: \nsks = (N-k)~p(\hk s) ~.

876: \end{equation}

877: Average counts, obtained this way, will be the basis for all of

878: the examples to follow.

879:

880: In the estimation of the true entropy rate for the examples we use the formula

881: \begin{equation}

882: 	h_{\mu} = - \sum_{v \in \{A,B\}} p(v)

883: 	\sum_{s \in \mathcal{A}} ~p(s\vert v) \log_2 p(s\vert v)

884: 	\label{eqn:entropy_rate}

885: \end{equation}

886: for the golden mean and even processes, where

887: $p(s\vert v) = T^{(s)}_{v \cdot}$ is the probability of a letter $s$ given the

888: state $v$ and $p(v)$ is the asymptotic probability of the state $v$ which can be

889: found as noted above. For the simple nondeterministic source this closed-form

890: expression cannot be applied and the entropy rate must be found using more

891: involved methods; see~\cite{Crutchfield1994} for further details.

892:

893: %%

894: %%

895: \subsection{Golden mean process: In-class modeling}

896:

897: The \emph{golden mean process} can be represented by a simple $1$st-order

898: Markov chain over a binary alphabet characterized by a single (shortest)

899: forbidden word $s^2 = 00$. The defining labeled transition matrices for this data

900: source are given by

901: \begin{equation}

902: 	\label{eqn:label_transition_matrix_golden_mean}

903: 	T^{(0)} = \left[ \begin{array}{cc}

904: 	0 & 1/2 \\

905: 	0 & 0

906: 	\end{array}

907: 	\right] \; , \;

908: 	T^{(1)} = \left[ \begin{array}{cc}

909: 	1/2 & 0 \\

910: 	1	& 0

911: 	\end{array}

912: 	\right]	 ~.

913: \end{equation}

914: \begfigref{fig:golden_mean} provides a graphical representation of the

915: corresponding hidden Markov chain. Inspection reveals a simple relation

916: between the \emph{internal states} $A$ and $B$ and the output symbols

917: $0$ and $1$. An observation of $0$ indicates a transition to internal

918: state $B$ and a $1$ corresponds to state $A$, making this process a Markov

919: chain over $0$s and $1$s.

920:

921: %%

922: %% Beamer Implementation

923: %%

924: \begin{figure}[htb]

925: \begin{center}

926: 		%options for the plot:

927: 			%-states

928: 			\SetStateLabelScale{1.6}

929: 			\SetStateLineWidth{1.4pt}

930: 			%-edges

931: 			\SetEdgeLabelScale{1.4}

932: 			\SetEdgeLineWidth{0.75pt}

933:

934: 		\begin{VCPicture}{(0,0)(5,2)}

935: 			%states

936: 			\ChgStateLabelScale{0.8}

937: 				\State[A]{(1,0)}{A}

938: 				\State[B]{(4,0)}{B}

939: 			\ChgEdgeLabelScale{0.7}

940: 			%transitions

941: 				\LoopW{A}{ 1 | 1/2 }

942: 				\LArcR[0.5]{B}{A}{ 1 | 1 }

943: 				\LArcR[0.5]{A}{B}{ 0 | 1/2 }

944: 		\end{VCPicture}

945: \end{center}

946: \vspace{0.5in}

947: \caption{A deterministic hidden Markov chain for the golden mean process.

948:   Edges are labeled with the output symbol and the transition probability:

949:   \emph{symbol} $\vert$ \emph{probability}.

950:   }

951: \label{fig:golden_mean}

952: \end{figure}

953:

954: For the golden mean the eigenstate is $\vec{\pi} = \left[ p(A), p(B)

955: \right] = \left[ 2/3 , 1/3 \right]$.  With this vector and the labeled

956: transition matrices any desired word count can be found as discussed above.

957:

958: %

959: \vspace{-0.125in}

960: \subsubsection{Estimation of $M_1$ Parameters}

961: \vspace{-0.125in}

962:

963: To demonstrate the effective inference of the Markov chain parameters for the

964: golden mean process we consider average counts for a variety of data sizes

965: $N$.  For each size, the marginal posterior for the parameters $p(0\vert 1)$ and

966: $p(1\vert 0)$ is plotted in~\figref{fig:GoldenMean_ParameterEstimates}.  The

967: results demonstrate that the shape of the posterior effectively

968: describes the distribution of possible model parameters at each $N$ and converges

969: to the correct values of $p(0\vert 1)=1/2$ and $p(1\vert 0)=1$ with increasing

970: data.

971:

972: %% details-

973: %%   code: MarginalPosterior.py in MarkovChainPaper_Code folder.

974: %%   parameters: marginal density plotted for N=50,100,200,400.

975: %%

976: \begin{figure}[htbp]

977: 	\centering

978: 	\includegraphics[width=0.98\columnwidth]{MarginalPosterior_GM.eps}

979: 	\caption{A plot of the inference of $M_1$ model parameters for the

980: 	golden mean process.  For each data sample size $N$, the marginal posterior is

981: 	plotted for the parameters of interest: $p(0\vert 1)$ in the top panel and

982: 	$p(1\vert 0)$ in the lower panel.  The \textit{true} values of the parameters

983: 	are $p(0\vert 1)=1/2$ and $p(1\vert 0) = 1$.

984: 	\label{fig:GoldenMean_ParameterEstimates}}

985: \end{figure}

986:

987: Point estimates with a variance can be provided for each of the parameters, but

988: these numbers by themselves can be misleading.  However, the estimates obtained

989: by using the mean and variance of the posterior are a more effective description

990: of the inference process than a maximum likelihood estimate with estimated

991: error given by a Gaussian approximation of the likelihood alone.

992: As~\figref{fig:GoldenMean_ParameterEstimates} demonstrates, in

993: fact, a Gaussian

994: approximation of uncertainty is an ineffective description of our knowledge

995: when the Markov chain parameters are near their upper or lower limits at $0$

996: and $1$. Probably the most effective set of numbers to provide consists of the

997: mean of the posterior and a region of confidence. These would most accurately

998: describe asymmetries in the uncertainty of model parameters. Although we will

999: not do that here, a brief description of finding regions of confidence is

1000: provided in~\appref{app:dirichlet}.

1001:

1002: %

1003: \vspace{-0.125in}

1004: \subsubsection{Selecting the Model Order $k$}

1005: \vspace{-0.125in}

1006:

1007: Now consider the selection of the appropriate order $k$ from golden mean

1008: realizations.  As discussed above, the golden mean process is a first order

1009: Markov chain with $k=1$.  As a result, we would expect model comparison to

1010: select this order from the available possibilities. To demonstrate this,

1011: we consider orders $k=1$--$4$ and perform model comparison with a uniform prior

1012: over orders (\eqnref{eqn:best_model_uniform_prior}) and with a penalty for the

1013: number of model parameters (\eqnref{eqn:best_model_df_penalty_prior}).

1014:

1015: %% details-

1016: %%   code: ModelComparison.py in MarkovChainPaper_Code folder.

1017: %%   parameters: length_min=100, length_max=1000, step=5

1018: %%

1019: \begin{figure}[htbp]

1020: 	\centering

1021: 	\includegraphics[width=0.98\columnwidth]{ModelCompare_GM.eps}

1022: 	\caption{Model comparison for Markov chains of order $k=1$--$4$ using

1023: 	average counts from the golden mean process.  Sample sizes from $N=100$ to

1024: 	$N=1,000$ in steps of $\Delta N=5$ are used to generate these plots.  The top panel

1025: 	displays the model probabilities using a uniform prior over orders $k$.  The

1026: 	bottom panel displays the effect of a penalty for model size.

1027: 	\label{fig:GoldenMean_ModelComparison}}

1028: \end{figure}

1029:

1030: The results of the model comparisons are given

1031: in~\figref{fig:GoldenMean_ModelComparison}.  The top panel shows the probability

1032: for each order $k$ as a function of the sample size, using a uniform prior.  For

1033: this prior over orders, $M_1$ is selected with any reasonable amount of

1034: data.   However, there does seem to be a possibility to over-fit for small data

1035: size $N \leq 100$.  The bottom panel shows the model probability with a penalty

1036: prior over model order $k$.  This removes the over-fitting at small data sizes

1037: and produces an offset which must be overcome by the data before higher $k$ is

1038: selected.  This example is not meant to argue for the penalty prior over model

1039: orders.  In fact, Bayesian model comparison with a uniform prior does an

1040: effective job using a relatively small sample size.

1041:

1042: %

1043: \vspace{-0.125in}

1044: \subsubsection{Estimation of Entropy Rate}

1045: \vspace{-0.125in}

1046:

1047: We can also demonstrate the convergence of  the average for

1048: $\EQP=\KLd [ Q \| P ] + \hmu [Q]$ given in~\eqnref{eqn:average_info} to the

1049: correct entropy rate for the golden mean process.  We choose to show this

1050: convergence for all orders $k=1$--$4$ discussed in the previous section.  This

1051: exercise demonstrates that all orders greater than or equal to $k = 1$

1052: effectively capture the entropy rate. However, the convergence to the correct

1053: values for higher-order $k$ takes more data because of a larger initial value of

1054: $\KLd [ Q \| P ]$.  This larger value is simply due to the larger number of

1055: parameters for higher-order Markov chains.

1056:

1057: %% details-

1058: %%   code: EntropyEstimation.py in MarkovChainPaper_Code folder.

1059: %%   parameters: length_min=50, length_max=5000, step=50, k=1-4

1060: %%

1061: \begin{figure}[htbp]

1062: 	\centering

1063: 	\includegraphics[width=0.98\columnwidth]{EntropyEstimates_GM.eps}

1064: 	\caption{The convergence of $\avg{\, E(Q,P) \, }{\rm{post}}$ to the true

1065: 	entropy rate $\hmu = 2/3$ bits per symbol (indicated by the gray horizontal

1066: 	line) for the golden mean process.  As demonstrated

1067: 	in~\eqnref{eqn:average_info_asymptotic}, the conditional relative

1068: 	entropy $\KLd [Q \| P ] \rightarrow 0$ as $1/N$.  This results in

1069: 	the convergence of $\hmu [Q]$ to the true entropy rate.

1070: 	\label{fig:GoldenMean_InfoTheory}}

1071: \end{figure}

1072:

1073: In evaluating the value of $\KLd [Q \| P ] + \hmu [Q]$ for different sample lengths,

1074: we expect that the \pme \, estimated $Q$ will converge to the true distribution

1075: $P$.  As a result, the conditional relative entropy should go to zero with

1076: increasing $N$.  For the golden mean process, the known value of the entropy

1077: rate is $\hmu =2/3$ bits per symbol.  Inspection

1078: of~\figref{fig:GoldenMean_InfoTheory} demonstrates the expected convergence of the

1079: average from~\eqnref{eqn:average_info} to the true entropy rate.

1080:

1081: The result of our model comparison from the previous section could also be used

1082: in the estimation of the entropy rate.  As we saw

1083: in~\figref{fig:GoldenMean_ModelComparison}, there are ranges of sample length $N$

1084: where the probabilities of orders $k=1,2$ are both nonzero.  In principle, an

1085: estimate of $\hmu$ should be made by weighting the values obtained for each

1086: $k$ by the corresponding order probability $P(\MCk \vert D, \mathcal{M})$.  As

1087: we can see from~\figref{fig:GoldenMean_InfoTheory}, the estimates of the entropy

1088: rate for $k=1,2$ are also very similar in this range of $N$.  As a result, this

1089: additional step would not have a large effect for entropy rate estimation.

1090:

1091: %%

1092: \subsection{Even process: Out-of-class modeling}

1093:

1094: We now consider a more difficult data source called the \emph{even process}.

1095: The defining labeled transition matrices are given by

1096: \begin{equation}

1097: 	\label{eqn:label_transition_matrix_even}

1098: 	T^{(0)} = \left[ \begin{array}{cc}

1099: 	1/2 & 0 \\

1100: 	0	& 0

1101: 	\end{array}

1102: 	\right] \; , \;

1103: 	T^{(1)} = \left[ \begin{array}{cc}

1104: 	0 & 1/2 \\

1105: 	1	& 0

1106: 	\end{array}

1107: 	\right]~.

1108: \end{equation}

1109:

1110: As can be seen in~\figref{fig:even}, the node-edge structure is identical to

1111: the golden mean process but the output symbols on the edges have been changed

1112: slightly.  As a result of this shuffle, the states $A$ and $B$ can no longer be

1113: associated with a simple sequence of $0$'s and $1$'s.  Whereas the golden mean

1114: has the irreducible set of forbidden words $\mathcal{F} = \{00\}$, the even

1115: process has a countably infinite set

1116: $\mathcal{F} = \{01^{2n+1}0: n=0,1,2,\ldots \}$

1117: \cite{Crutchfield1994}.

1118: \begin{figure}[htb]

1119: \begin{center}

1120: 		%options for the plot:

1121: 			%-states

1122: 			\SetStateLabelScale{1.6}

1123: 			\SetStateLineWidth{1.4pt}

1124: 			%-edges

1125: 			\SetEdgeLabelScale{1.4}

1126: 			\SetEdgeLineWidth{0.75pt}

1127:

1128: 		\begin{VCPicture}{(0,0)(5,2)}

1129: 			%states

1130: 			\ChgStateLabelScale{0.8}

1131: 				\State[A]{(1,0)}{A}

1132: 				\State[B]{(4,0)}{B}

1133: 			\ChgEdgeLabelScale{0.7}

1134: 			%transitions

1135: 				\LoopW{A}{ 0 | 1/2 }

1136: 				\LArcR[0.5]{B}{A}{ 1 | 1 }

1137: 				\LArcR[0.5]{A}{B}{ 1 | 1/2 }

1138: 		\end{VCPicture}

1139: \end{center}

1140: \vspace{0.5in}

1141: \caption{Deterministic hidden Markov chain representation of the even process.

1142:   This process cannot be represented as a finite-order (nonhidden) Markov chain

1143:   over the output symbols $0$s and $1$s. The set of irreducible forbidden words

1144:   $\mathcal{F} = \{01^{2n+1}0: n=0,1,2,\ldots \}$ reflects the fact that the

1145:   process generates blocks of $1$'s, bounded by $0$s, that are \emph{even} in

1146:   length, at any length.

1147: \label{fig:even}}

1148: \end{figure}

1149:

1150: In simple terms, the even process produces blocks of $1$'s which are even in

1151: length. This is a much more complicated type of memory than we saw in

1152: the golden mean process.  For the Markov chain model class, where a word of

1153: length $k$ is used to predict the next letter, this would require an

1154: infinite-order $k$. It would be necessary to keep track of all even and odd

1155: strings of $1$'s, irrespective of the length. As a result, the properties of

1156: the even process mean that a finite Markov chain \textit{cannot} represent

1157: this data source.

1158:

1159: This example is then a demonstration of what can be learned in a case of

1160: out-of-class modeling. We are interested, therefore, in how well Markov

1161: chains approximate the even process. We

1162: expect that model comparison will select larger $k$ as the size of the data

1163: sample increases.  Does the model selection tell us anything about the

1164: underlying data source despite the inability to exactly capture its properties?

1165: As we will see, we do obtain intriguing hints of the true nature of the even

1166: process from model comparison.  Finally, can we estimate the entropy rate of

1167: the process with a Markov chain?  As we will see, a high $k$ is needed to do

1168: this effectively.

1169:

1170: %

1171: \subsubsection{Estimation of $M_1$ Parameters}

1172:

1173: In this section we consider an $M_1$ approximation of the even process.

1174: We expect the resulting model to accurately capture length-$2$ word

1175: probabilities as $N$ increases.  In this example, we consider the \emph{true}

1176: model to be the best approximation possible by a $k=1$ Markov chain.  From the

1177: labeled transition matrices given above we can calculate the appropriate

1178: values for $p(0\vert 1)$ and $p(1\vert 0)$ using the methods described above.

1179: Starting from the asymptotic distribution $\vec{\pi} = \left[ p(A), p(B)\right]

1180: = \left[ 2/3, 1/3 \right]$ we obtain $p(0\vert 1)=p(10)/p(1)=1/4$ and $p(1\vert

1181: 0)=p(01)/p(0)=1/2$.

1182:

1183: As we can see from~\figref{fig:Even_ParameterEstimates}, a first-order Markov

1184: chain can be inferred without difficulty.  The values obtained are exactly as

1185: expected.  However, these values do not tell us much about the nature

1186: of the data source by themselves. This points to the important role of model

1187: comparison and entropy rate estimation in understanding this data.

1188:

1189: %% details-

1190: %%   code: MarginalPosterior.py in MarkovChainPaper_Code folder.

1191: %%   parameters: marginal density plotted for N=50,100,200,400.

1192: %%

1193: \begin{figure}[htbp]

1194: 	\centering

1195: 	\includegraphics[width=0.98\columnwidth]{MarginalPosterior_EVEN.eps}

1196: 	\caption{A plot of the inference of $M_1$ model parameters for the even

1197: 	process.  For a variety of sample sizes $N$, the marginal posteriors for

1198: 	$p(0\vert 1)$ (top panel) and $p(1\vert 0)$ (bottom panel) are shown.  The

1199: 	\textit{true} values of the parameters are $p(0\vert 1)=1/4$ and

1200: 	$p(1\vert 0) = 1/2$.

1201: 	\label{fig:Even_ParameterEstimates}}

1202: \end{figure}

1203:

1204: %

1205: \subsubsection{Selecting the Model Order $k$}

1206:

1207: Now consider the selection of Markov chain order $k=1$--$4$ for a range of data

1208: sizes $N$. Recall that the even process cannot be represented by a finite-order

1209: Markov chain over the output symbols $0$ and $1$. As a consequence, we expect

1210: higher $k$ to be selected with increasing data $N$, as more data statistically

1211: justifies more complex models. This is what happens, in fact, but the way in

1212: which orders are selected as we increase $N$ provides structural information

1213: we could not obtain from the inference of a Markov chain of fixed order.

1214:

1215: %% details-

1216: %%   code: ModelComparison.py in MarkovChainPaper_Code folder.

1217: %%   parameters: length_min=100, length_max=1000, step=5

1218: %%

1219: \begin{figure}[htbp]

1220: 	\centering

1221: 	\includegraphics[width=0.98\columnwidth]{ModelCompare_EVEN.eps}

1222: 	\caption{Model comparison for Markov chains of order $k=1$--$4$ for

1223: 	average data from the even process.  The top panel shows the model

1224: 	comparison with a uniform prior over the possible orders $k$.  The bottom

1225: 	panel demonstrates model comparison with a penalty for the number of model

1226: 	parameters.  In both cases the $k=4$ model is chosen over lower orders as the

1227: 	amount of data available increases.

1228: 	\label{fig:Even_ModelComparison}}

1229: \end{figure}

1230:

1231: If we consider~\figref{fig:Even_ModelComparison}, an interesting pattern becomes

1232: apparent.  Orders with even $k$ are preferred over odd. In this way model

1233: selection is hinting at the underlying structure of the source. The Markov

1234: chain model class cannot represent the even process in a compact way, but

1235: inference and model comparison combined provide useful information about

1236: the hidden structure of the source.

1237:

1238: In this example we also have regions where multiple orders $k$

1239: are equally probable.  The sample size at which this occurs depends on the prior

1240: over orders which is employed.  When this happens, properties estimated from the

1241: Markov chain model class should use a weighted sum of the various orders. As we

1242: will see in the estimation of entropy rates, this is not as critical. At

1243: sample sizes where the order probabilities are similar, the estimated entropy

1244: rates are also similar.

1245:

1246: %

1247: \subsubsection{Estimation of Entropy Rate}

1248:

1249: Entropy rate estimation for the even process turns out to be a more

1250: difficult task than one might expect.  In~\figref{fig:Even_InfoTheory} we see

1251: that Markov chains of orders $1$--$6$ are unable to effectively capture the true

1252: entropy rate.  In fact, experience shows that an order $k=10$ Markov chain or

1253: higher is needed to get close to the true value of $\hmu = 2/3$ bits per symbol.

1254: Note also that realizations longer by a factor of $20$ are required compared,

1255: say, to the golden mean example.

1256:

1257: %% details-

1258: %%   code: EntropyEstimation.py in MarkovChainPaper_Code folder.

1259: %%   parameters: length_min=100, length_max=20000, step=100, k=1-6

1260: %%

1261: \begin{figure}[htbp]

1262: 	\centering

1263: 	\includegraphics[width=0.98\columnwidth]{EntropyEstimates_EVEN.eps}

1264: 	\caption{The convergence of $\avg{\, D[Q \| P ] + \hmu [Q] \, }{\rm{post}}$

1265: 	to the true entropy rate $\hmu = 2/3$ bits per symbol for the even

1266: 	process.  The true value is indicated by the horizontal gray line.  Experience

1267: 	shows that a $k=10$ Markov chain is needed to effectively approximate the true

1268: 	value of $\hmu$.

1269: 	\label{fig:Even_InfoTheory}}

1270: \end{figure}

1271:

1272: As discussed above, a weighted sum of

1273: $\avg{\, D[Q \| P ] + \hmu [Q] \, }{\rm{post}}$ could be employed in this

1274: example.  For the estimate this is not critical because the different orders

1275: provide roughly the same value at these points.  In fact, these points

1276: correspond to where the estimates of $E(Q,P)$ cross

1277: in~\figref{fig:Even_InfoTheory}. They are sample sizes where apparent

1278: randomness can be explained by structure and increased order $k$.

1279:

1280:

1281: %%

1282: \subsection{Simple Nondeterministic Source: Out-of-class modeling}

1283:

1284: The simple nondeterministic source adds another level of challenge to inference.

1285: As its name suggests, it is described by a nondeterministic HMM.

1286: Considering~\figref{fig:sns} we can see that a $1$ is produced on every

1287: transition except for the $B \rightarrow A$ edge.  This means there are many

1288: paths through the internal states that produce the same observable sequence of

1289: $0$s and $1$s. The defining labeled transition matrices for this process are

1290: given by

1291: \begin{equation}

1292: 	\label{eqn:label_transition_matrix_sns}

1293: 	T^{(0)} = \left[ \begin{array}{cc}

1294: 	0 & 0 \\

1295: 	1/2	& 0

1296: 	\end{array}

1297: 	\right] \; , \;

1298: 	T^{(1)} = \left[ \begin{array}{cc}

1299: 	1/2 & 1/2 \\

1300: 	0	& 1/2

1301: 	\end{array}

1302: 	\right]~.

1303: \end{equation}

1304:

1305: Using the state-to-state transition matrix $T=T^{(0)}+T^{(1)}$, we find the

1306: asymptotic distribution for the hidden states to be

1307: $\vec{\pi} = \left[ p(A), p(B) \right] = \left[1/2, 1/2 \right]$. Each of

1308: the hidden states is equally likely; however, a $1$ is always produced from

1309: state $A$, while there is an equal chance of obtaining a $0$

1310: or $1$ from state $B$.

1311:

1312: \begin{figure}[htb]

1313: \begin{center}

1314: 		%options for the plot:

1315: 			%-states

1316: 			\SetStateLabelScale{1.6}

1317: 			\SetStateLineWidth{1.4pt}

1318: 			%-edges

1319: 			\SetEdgeLabelScale{1.4}

1320: 			\SetEdgeLineWidth{0.75pt}

1321:

1322: 		\begin{VCPicture}{(0,0)(5,2)}

1323: 			%states

1324: 			\ChgStateLabelScale{0.8}

1325: 				\State[A]{(1,0)}{A}

1326: 				\State[B]{(4,0)}{B}

1327: 			\ChgEdgeLabelScale{0.7}

1328: 			%transitions

1329: 				\LoopW{A}{ 1 | 1/2 }

1330: 				\LoopE{B}{ 1 | 1/2 }

1331: 				\LArcR[0.5]{B}{A}{ 0 | 1/2 }

1332: 				\LArcR[0.5]{A}{B}{ 1 | 1/2 }

1333: 		\end{VCPicture}

1334: \end{center}

1335: \vspace{0.5in}

1336: \caption{A hidden Markov chain representation of the simple nondeterministic

1337:   process. This example also cannot be represented as a finite-order Markov

1338:   chain over outputs $0$ and $1$. It, however, is more complicated than the

1339:   two previous examples: Only the observation of a $0$ provides the observer

1340:   with information regarding the internal state of the underlying process;

1341:   observing a $1$ leaves the internal state ambiguous.

1342: \label{fig:sns}}

1343: \end{figure}

1344:

1345: %

1346: \subsubsection{Estimation of $M_1$ Parameters}

1347:

1348: Using the asymptotic distribution derived above, the parameters of an inferred

1349: first-order Markov chain should approach $p(0\vert 1)=p(10)/p(1)=1/3$ and

1350: $p(1\vert 0)=p(01)/p(0)=1$.  As we can see

1351: from~\figref{fig:SNS_ParameterEstimates}, the inference

1352: process captures these values very effectively despite the out-of-class data

1353: source.

1354:

1355: %% details-

1356: %%   code: MarginalPosterior.py in MarkovChainPaper_Code folder.

1357: %%   parameters: marginal density plotted for N=50,100,200,400.

1358: %%

1359: \begin{figure}[htbp]

1360: 	\centering

1361: 	\includegraphics[width=0.98\columnwidth]{MarginalPosterior_SNS.eps}

1362: 	\caption{Marginal density for $M_1$ model parameters for the

1363: 	simple nondeterministic process:  The curves for each data size $N$

1364: 	demonstrate a well-behaved convergence to the correct values:

1365: 	$p(0\vert 1)=1/3$ and $p(1\vert 0) = 1$.

1366: 	\label{fig:SNS_ParameterEstimates}}

1367: \end{figure}

1368:

1369: %

1370: \subsubsection{Selecting the Model Order $k$}

1371:

1372: Here we consider the comparison of Markov chain models of orders $k=1$--$4$ when

1373: applied to data from the simple nondeterministic source.  As with the even

1374: process, we expect increasing order to be selected as the amount of available

1375: data increases.  In~\figref{fig:SNS_ModelComparison} we see that this is

1376: exactly what happens.

1377:

1378: %% details-

1379: %%   code: ModelComparison.py in MarkovChainPaper_Code folder.

1380: %%   parameters: length_min=100, length_max=1.5e5, step=50

1381: %%

1382: \begin{figure}[htbp]

1383: 	\centering

1384: 	\includegraphics[width=0.98\columnwidth]{ModelCompare_SNS.eps}

1385: 	\caption{Model comparison for Markov chains of order $k=1$--$4$ for

1386: 	data from the simple nondeterministic process.  The top panel

1387: 	shows the model comparison with a uniform prior over the possible orders

1388: 	$k$.  The bottom panel demonstrates model comparison with a penalty for the

1389: 	number of model parameters.  Note the scale on the horizontal axis---it

1390: 	takes much more data for the model comparison to pick out higher orders

1391: 	for this process compared to the previous examples.

1392: 	\label{fig:SNS_ModelComparison}}

1393: \end{figure}

1394:

1395: Unlike the even process, there is no preference for even orders.  Instead, we

1396: observe a systematic increase in order with larger data sets.  We do note that

1397: the amount of data needed to select a higher order does seem to be larger than for

1398: the even process.  Here the distribution over words is more important and more

1399: subtle than the support of the distribution (those words with positive

1400: probability).

1401:

1402: %

1403: \subsubsection{Estimation of Entropy Rate}

1404:

1405: Estimation of the entropy rate for the simple nondeterministic source provides

1406: an interesting contrast to the previous examples. As discussed when introducing

1407: the examples, this data source is a nondeterministic HMM and the entropy rate

1408: cannot be directly calculated using~\eqnref{eqn:entropy_rate}

1409: \cite{Blackwell1957}. However, a

1410: value of $\hmu \approx 0.677867$ bits per symbol has been obtained

1411: in~\cite{Crutchfield1994}.

1412:

1413: %% details-

1414: %%   code: EntropyEstimation.py in MarkovChainPaper_Code folder.

1415: %%   parameters: length_min=100, length_max=20000, step=100, k=1-6

1416: %%

1417: \begin{figure}[htbp]

1418: 	\centering

1419: 	\includegraphics[width=0.98\columnwidth]{EntropyEstimates_SNS.eps}

1420: 	\caption{The convergence of $\avg{\, D[Q \| P ] +

1421: 	\hmu [Q] \, }{\rm{post}}$ to the true entropy rate $\hmu \approx 0.677867$

1422: 	bits per symbol for the simple nondeterministic source.  The true value is

1423: 	indicated by the gray horizontal line.

1424: 	\label{fig:SNS_InfoTheory}}

1425: \end{figure}

1426:

1427: \begfigref{fig:SNS_InfoTheory} shows the results of entropy-rate estimation

1428: using Markov chains of order $k=1$--$6$.  These results demonstrate that the

1429: entropy rate can be effectively estimated with low-order $k$ and relatively

1430: small data samples.  This is an interesting result, as we might expect

1431: estimation of the entropy rate to be most difficult in this example.  Instead we

1432: find that the even process was a more difficult test case.

1433:

1434: %%

1435: %%

1436: \section{Discussion}

1437:

1438: The examples presented above provide several interesting lessons in inference,

1439: model comparison, and estimating randomness. The combination of these three

1440: ideas applied to a data source provides information and intuition about the

1441: structure of the underlying system, even when modeling out-of-class processes.

1442:

1443: In the examples of $\MC_{1}$ estimates for each of the sources we see that

1444: the Bayesian methods provide a powerful and consistent description of Markov

1445: chain model parameters.  The marginal density accurately describes the

1446: uncertainty associated with these estimates, reflecting asymmetries which point

1447: estimation with error bars cannot capture.  In addition, methods described

1448: in~\appref{app:dirichlet} can be used to generate regions of confidence of any

1449: type.

1450:

1451: Although the estimates obtained for the Markov chain model parameters were

1452: consistent with the data source for words up to length $k+1$, they did not capture

1453: the true nature of the system under consideration.  This demonstrates that

1454: estimation of model parameters without some kind of model comparison can be very

1455: misleading.  Only with the comparison of different orders did some indication

1456: of the true properties of the data source become clear.  Without this step,

1457: misguided interpretations are easily obtained.

1458:

1459: For the golden mean process, a $k=1$ Markov chain, the results of model

1460: comparison were predictably uninteresting.  This is a good indication that the

1461: correct model class is being employed.  However, with the even process a much

1462: more complicated model comparison was found.  In this case, a selection of even

1463: $k$ over odd hinted at the distinguishing properties of the source. In a

1464: similar way, the results of model comparison for the simple nondeterministic

1465: source selected increasing order with larger $N$.  In both out-of-class

1466: modeling examples, the increase in selected order without end is a good

1467: indication that the data source is not in the Markov chain class. (A parallel

1468: technique is found in \emph{hierarchical $\epsilon$-machine reconstruction}

1469: \cite{Crutchfield1994}.) Alternatively, there is an indication that

1470: very high-order dependencies are important in the description of the process.

1471: Either way, this information is important since it gives an indication to the

1472: modeler that a more complicated dynamic is at work and all results must be

1473: treated with caution.

1474:

1475: Finally, we considered the estimation of entropy rates for the example data

1476: sources.  In two of the cases, the golden mean process and the simple

1477: nondeterministic source, short data streams were adequate.  This is not

1478: unexpected for the golden mean, but for the simple nondeterministic source this

1479: might be considered surprising.  For the even process, the estimation of the

1480: entropy rate was markedly more difficult.  For this data source, the countably

1481: infinite number of forbidden words makes the support of the word distribution

1482: at a given length important.  As a result, a larger amount of data and a

1483: higher-order Markov chain are needed to find a decent estimate of randomness

1484: from that data source. In this way, each of the steps in Bayesian

1485: inference allows one to separate structure from randomness.

1486:

1487: %%

1488: %%

1489: \section{Conclusion}

1490:

1491: We considered Bayesian inference of $k$-th order Markov chain

1492: models.  This included estimating model parameters for a given $k$, model

1493: comparison between orders, and estimation of randomness in the form of entropy

1494: rates.  In most approaches to inference, these three aspects are treated as

1495: separate, but related endeavors.  However, we find them to be intimately

1496: related.  An estimate of model parameters without a sense of whether the

1497: correct model is being used is misguided at best.  Model comparison

1498: provides a window into this problem by comparing various orders $k$ within the

1499: model class.  Finally, estimating randomness in the form of an entropy rate

1500: provides more information about the trade-off between structure and randomness.

1501: To do this we developed a connection to the statistical mechanical partition

1502: function, from which averages and variances were directly calculable. For the

1503: even process, structure was perceived as randomness and for the simple

1504: nondeterministic source

1505: randomness was easily estimated and structure was more difficult to find.

1506: These insights, despite the out-of-class data, demonstrate the power of

1507: combining these three methods into one effective tool for investigating

1508: structure and randomness in finite strings of discrete data.

1509:

1510: %

1511: % acknowledgments

1512: %

1513: \section*{Acknowledgments}

1514: This work was partially supported at the Center for Computational Science

1515: and Engineering at the University of California at Davis by Intel

1516: Corporation. Work at the Santa Fe Institute was supported under its

1517: Computation, Dynamics, and Inference Program core grants from the

1518: National Science and MacArthur Foundations. C.S. and A.H. acknowledge

1519: support by the National Science Foundation Grant DMS 03-25939 ITR.

1520:

1521: %

1522: % appendices

1523: %

1524: \appendix

1525:

1526: %

1527: % Dirichlet Appendix

1528: %

1529: \section{}

1530: \label{app:Dirichlet}

1531:

1532: \subsection{Dirichlet Distribution\label{app:dirichlet}}

1533:

1534: We supply a brief overview of the Dirichlet distribution for completeness.  For

1535: more information, a reference such as~\cite{Wilks1962} should be consulted.  In

1536: simple terms, the Dirichlet distribution is the multivariate generalization of

1537: the Beta distribution.  The probability density function for $q$ elements is

1538: given by

1539: \begin{equation}

1540: 	\label{eqn:dirichlet_pdf}

1541: 	\text{Dir}( \{ p_{i} \} )

1542: 	=

1543: 	\frac{ \Gamma( \alpha ) }{\prod_{i=0}^{q-1} \Gamma( \alpha_{i} ) }

1544: 	\delta(1-\sum_{i=0}^{q-1} p_{i})

1545: 	\prod_{i=0}^{q-1} p_{i}^{\alpha_{i}-1}.

1546: \end{equation}

1547:

1548: The variates must satisfy $p_i \in [0,1]$ and $\sum_{i=0}^{q-1} p_{i} = 1$. The

1549: hyperparameters $\{ \alpha_{i} \}$ of the distribution must be real and

1550: positive and we use the notation $\alpha = \sum_{i=0}^{q-1} \alpha_{i}$.  The

1551: average, variance, and covariance of the parameters $p_{i}$ are

1552: given by, respectively,

1553: \begin{eqnarray}

1554: 	\avg{p_{j}}{} & = & \frac{ \alpha_{j} }{ \alpha },

1555: 	\label{eqn:dirichlet_average}\\

1556: 	\var{p_{j}}{} & = & \frac{ \alpha_{j}\left( \alpha - \alpha_{j} \right)

1557: 	}{ \alpha^{2} \left( 1+ \alpha \right) },

1558: 	\label{eqn:dirichlet_variance}\\

1559: 	\cov{p_{j}}{p_{l}}	& = & - \frac{ \alpha_{j} \alpha_{l}

1560: 	}{ \alpha^{2} \left( 1+ \alpha \right) } \; , \; j \neq l.

1561: 	\label{eqn:dirichlet_covariance}

1562: \end{eqnarray}

1563:

1564: %%

1565: %%

1566: \subsection{Marginal distributions\label{app:dirichlet_marginal}}

1567:

1568: An important part of understanding uncertainty in the inference process is the

1569: ability to find regions of confidence from a marginal density.  The marginal is

1570: obtained from the posterior by integrating out the dependence on all parameters

1571: except for the parameter of interest.  For a Dirichlet distribution, the

1572: marginal density is known to be a Beta distribution~\cite{Wilks1962},

1573: \begin{equation}

1574: 	\label{eqn:beta_pdf}

1575: 	\text{Beta}( p_{i} )

1576: 	=

1577: 	\frac{ \Gamma( \alpha ) }{\Gamma( \alpha_{i} ) \Gamma( \alpha - \alpha_{i} ) }

1578: 	 p_{i}^{\alpha_{i}-1} \left( 1 - p_{i} \right)^{\alpha - \alpha_{i}-1}.

1579: \end{equation}

1580:

1581: %%

1582: %%

1583: \subsection{Regions of confidence from the marginal density}

1584:

1585: From the marginal density provided in~\eqnref{eqn:beta_pdf} a cumulative

1586: distribution function can be obtained using the incomplete Beta integral

1587: \begin{equation}

1588: 	\Pr(p_{i} \leq x) = \int_{0}^{x} \, dp_{i} \, \text{Beta}(p_{i}) ~.

1589: 	\label{eqn:beta_cdf}

1590: \end{equation}

1591: Using this form, the probability that a Markov chain parameter will be between

1592: $a$ and $b$ can be found using $\Pr( a \leq p_{i} \leq b) = \Pr( p_{i} \leq b) -

1593: \Pr( p_{i} \leq a)$.  For a confidence level $R$, between zero and one, we then

1594: want to find $(a,b)$ such that $R=\Pr( a \leq p_{i} \leq b)$.  The incomplete

1595: Beta integral and its inverse can be found using computational methods,

1596: see~\cite{Majumder1973,Majumder1973a,Cran1977,Berry1990} for details.

1597:

1598: %

1599: % the bibliography

1600: %

1601: \begin{thebibliography}{29}

1602: \expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi

1603: \expandafter\ifx\csname bibnamefont\endcsname\relax

1604:   \def\bibnamefont#1{#1}\fi

1605: \expandafter\ifx\csname bibfnamefont\endcsname\relax

1606:   \def\bibfnamefont#1{#1}\fi

1607: \expandafter\ifx\csname citenamefont\endcsname\relax

1608:   \def\citenamefont#1{#1}\fi

1609: \expandafter\ifx\csname url\endcsname\relax

1610:   \def\url#1{\texttt{#1}}\fi

1611: \expandafter\ifx\csname urlprefix\endcsname\relax\def\urlprefix{URL }\fi

1612: \providecommand{\bibinfo}[2]{#2}

1613: \providecommand{\eprint}[2][]{\url{#2}}

1614:

1615: \bibitem[{\citenamefont{Avery and Henderson}(1999)}]{Avery1999}

1616: \bibinfo{author}{\bibfnamefont{P.~J.} \bibnamefont{Avery}} \bibnamefont{and}

1617:   \bibinfo{author}{\bibfnamefont{D.~A.} \bibnamefont{Henderson}},

1618:   \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{48}},

1619:   \bibinfo{pages}{53 } (\bibinfo{year}{1999}).

1620:

1621: \bibitem[{\citenamefont{Liu and Lawrence}(1999)}]{JSLiu1999}

1622: \bibinfo{author}{\bibfnamefont{J.~S.} \bibnamefont{Liu}} \bibnamefont{and}

1623:   \bibinfo{author}{\bibfnamefont{C.~E.} \bibnamefont{Lawrence}},

1624:   \bibinfo{journal}{Bioinformatics} \textbf{\bibinfo{volume}{15}},

1625:   \bibinfo{pages}{38 } (\bibinfo{year}{1999}).

1626:

1627: \bibitem[{\citenamefont{Crutchfield and Feldman}(1997)}]{Crutchfield1997}

1628: \bibinfo{author}{\bibfnamefont{J.~P.} \bibnamefont{Crutchfield}}

1629:   \bibnamefont{and} \bibinfo{author}{\bibfnamefont{D.~P.}

1630:   \bibnamefont{Feldman}}, \bibinfo{journal}{Phys. Rev. E}

1631:   \textbf{\bibinfo{volume}{55}}, \bibinfo{pages}{R1239 }

1632:   (\bibinfo{year}{1997}).

1633:

1634: \bibitem[{\citenamefont{MacKay and Peto}(1994)}]{MacKay1994}

1635: \bibinfo{author}{\bibfnamefont{D.~J.~C.} \bibnamefont{MacKay}}

1636:   \bibnamefont{and} \bibinfo{author}{\bibfnamefont{L.~C.~B.}

1637:   \bibnamefont{Peto}}, \bibinfo{journal}{Nat. Lang. Eng.}

1638:   \textbf{\bibinfo{volume}{1}} (\bibinfo{year}{1994}).

1639:

1640: \bibitem[{\citenamefont{Crutchfield and Packard}(1983)}]{Crutchfield1983}

1641: \bibinfo{author}{\bibfnamefont{J.~P.} \bibnamefont{Crutchfield}}

1642:   \bibnamefont{and} \bibinfo{author}{\bibfnamefont{N.~H.}

1643:   \bibnamefont{Packard}}, \bibinfo{journal}{Physica D}

1644:   \textbf{\bibinfo{volume}{7D}}, \bibinfo{pages}{201 } (\bibinfo{year}{1983}).

1645:

1646: \bibitem[{\citenamefont{Hao and Zheng}(1998)}]{BLHao1998}

1647: \bibinfo{author}{\bibfnamefont{B.-L.} \bibnamefont{Hao}} \bibnamefont{and}

1648:   \bibinfo{author}{\bibfnamefont{W.-M.} \bibnamefont{Zheng}},

1649:   \emph{\bibinfo{title}{Applied Symbolic Dynamics and Chaos}}

1650:   (\bibinfo{publisher}{World Scientific}, \bibinfo{year}{1998}).

1651:

1652: \bibitem[{\citenamefont{Anderson and Goodman}(1957)}]{TWAnderson1957}

1653: \bibinfo{author}{\bibfnamefont{T.~W.} \bibnamefont{Anderson}} \bibnamefont{and}

1654:   \bibinfo{author}{\bibfnamefont{L.~A.} \bibnamefont{Goodman}},

1655:   \bibinfo{journal}{Ann. Math. Stat.} \textbf{\bibinfo{volume}{28}},

1656:   \bibinfo{pages}{89 } (\bibinfo{year}{1957}).

1657:

1658: \bibitem[{\citenamefont{Billingsley}(1961)}]{Billingsley1961a}

1659: \bibinfo{author}{\bibfnamefont{P.}~\bibnamefont{Billingsley}},

1660:   \bibinfo{journal}{Ann. Math. Stat.} \textbf{\bibinfo{volume}{32}},

1661:   \bibinfo{pages}{12 } (\bibinfo{year}{1961}).

1662:

1663: \bibitem[{\citenamefont{Chatfield}(1973)}]{Chatfield1973}

1664: \bibinfo{author}{\bibfnamefont{C.}~\bibnamefont{Chatfield}},

1665:   \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{22}},

1666:   \bibinfo{pages}{7} (\bibinfo{year}{1973}).

1667:

1668: \bibitem[{\citenamefont{Tong}(1975)}]{HTong1975}

1669: \bibinfo{author}{\bibfnamefont{H.}~\bibnamefont{Tong}}, \bibinfo{journal}{Jour.

1670:   Appl. Prob.} \textbf{\bibinfo{volume}{12}}, \bibinfo{pages}{488 }

1671:   (\bibinfo{year}{1975}).

1672:

1673: \bibitem[{\citenamefont{Katz}(1981)}]{Katz1981}

1674: \bibinfo{author}{\bibfnamefont{R.~W.} \bibnamefont{Katz}},

1675:   \bibinfo{journal}{Technometrics} \textbf{\bibinfo{volume}{23}},

1676:   \bibinfo{pages}{243 } (\bibinfo{year}{1981}).

1677:

1678: \bibitem[{\citenamefont{Rissanen}(1984)}]{JRissanen1984}

1679: \bibinfo{author}{\bibfnamefont{J.}~\bibnamefont{Rissanen}},

1680:   \bibinfo{journal}{IEEE Trans. Inform. Theory} \textbf{\bibinfo{volume}{30}},

1681:   \bibinfo{pages}{629} (\bibinfo{year}{1984}).

1682:

1683: \bibitem[{\citenamefont{Vapnik}(1999)}]{VVapnik1999}

1684: \bibinfo{author}{\bibfnamefont{V.}~\bibnamefont{Vapnik}},

1685:   \bibinfo{journal}{IEEE Trans. Neur. Net.} \textbf{\bibinfo{volume}{10}},

1686:   \bibinfo{pages}{988} (\bibinfo{year}{1999}).

1687:

1688: \bibitem[{\citenamefont{Vit{\'a}nyi and Li}(2000)}]{Vitanyi2000}

1689: \bibinfo{author}{\bibfnamefont{P.~M.} \bibnamefont{Vit{\'a}nyi}}

1690:   \bibnamefont{and} \bibinfo{author}{\bibfnamefont{M.}~\bibnamefont{Li}},

1691:   \bibinfo{journal}{IEEE Trans. Inform. Theory}

1692:   \textbf{\bibinfo{volume}{46(2)}}, \bibinfo{pages}{446}

1693:   (\bibinfo{year}{2000}).

1694:

1695: \bibitem[{\citenamefont{Baldi and Brunak}(2001)}]{Baldi2001}

1696: \bibinfo{author}{\bibfnamefont{P.}~\bibnamefont{Baldi}} \bibnamefont{and}

1697:   \bibinfo{author}{\bibfnamefont{S.}~\bibnamefont{Brunak}},

1698:   \emph{\bibinfo{title}{Bioinformatics: The Machine Learning Approach}}

1699:   (\bibinfo{publisher}{MIT Press}, \bibinfo{address}{Cambridge},

1700:   \bibinfo{year}{2001}).

1701:

1702: \bibitem[{\citenamefont{Durbin et~al.}(1998)\citenamefont{Durbin, Eddy, Krogh,

1703:   and Mitchison}}]{Durbin1998}

1704: \bibinfo{author}{\bibfnamefont{R.}~\bibnamefont{Durbin}},

1705:   \bibinfo{author}{\bibfnamefont{S.}~\bibnamefont{Eddy}},

1706:   \bibinfo{author}{\bibfnamefont{A.}~\bibnamefont{Krogh}}, \bibnamefont{and}

1707:   \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Mitchison}},

1708:   \emph{\bibinfo{title}{Biological Sequence Analysis}}

1709:   (\bibinfo{publisher}{Cambridge University Press},

1710:   \bibinfo{address}{Cambridge}, \bibinfo{year}{1998}).

1711:

1712: \bibitem[{\citenamefont{Cover and Thomas}(1991)}]{Cover1991}

1713: \bibinfo{author}{\bibfnamefont{T.~M.} \bibnamefont{Cover}} \bibnamefont{and}

1714:   \bibinfo{author}{\bibfnamefont{J.~A.} \bibnamefont{Thomas}},

1715:   \emph{\bibinfo{title}{Elements of Information Theory}}

1716:   (\bibinfo{publisher}{Wiley-Interscience}, \bibinfo{address}{New York},

1717:   \bibinfo{year}{1991}).

1718:

1719: \bibitem[{\citenamefont{MacKay}(2003)}]{MacKay2003}

1720: \bibinfo{author}{\bibfnamefont{D.~J.~C.} \bibnamefont{MacKay}},

1721:   \emph{\bibinfo{title}{Information Theory, Inference, and Learning

1722:   Algorithms}} (\bibinfo{publisher}{Cambridge University Press},

1723:   \bibinfo{address}{Cambridge}, \bibinfo{year}{2003}).

1724:

1725: \bibitem[{\citenamefont{Samengo}(2002)}]{Samengo2002}

1726: \bibinfo{author}{\bibfnamefont{I.}~\bibnamefont{Samengo}},

1727:   \bibinfo{journal}{Phys. Rev. E} \textbf{\bibinfo{volume}{65}},

1728:   \bibinfo{pages}{46124} (\bibinfo{year}{2002}).

1729:

1730: \bibitem[{\citenamefont{Young and Crutchfield}(1994)}]{Young1994}

1731: \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Young}} \bibnamefont{and}

1732:   \bibinfo{author}{\bibfnamefont{J.~P.} \bibnamefont{Crutchfield}},

1733:   \bibinfo{journal}{Chaos, Solitons, and Fractals}

1734:   \textbf{\bibinfo{volume}{4}}, \bibinfo{pages}{5 } (\bibinfo{year}{1994}).

1735:

1736: \bibitem[{\citenamefont{Abramowitz and Stegun}(1965)}]{Abramowitz1965}

1737: \bibinfo{author}{\bibfnamefont{M.}~\bibnamefont{Abramowitz}} \bibnamefont{and}

1738:   \bibinfo{author}{\bibfnamefont{I.~A.} \bibnamefont{Stegun}},

1739:   \emph{\bibinfo{title}{Handbook of Mathematical Functions}}

1740:   (\bibinfo{publisher}{Dover}, \bibinfo{address}{New York},

1741:   \bibinfo{year}{1965}).

1742:

1743: \bibitem[{\citenamefont{Crutchfield}(1994)}]{Crutchfield1994}

1744: \bibinfo{author}{\bibfnamefont{J.~P.} \bibnamefont{Crutchfield}},

1745:   \bibinfo{journal}{Physica D} \textbf{\bibinfo{volume}{75}},

1746:   \bibinfo{pages}{11} (\bibinfo{year}{1994}).

1747:

1748: \bibitem[{\citenamefont{Upper}(1997)}]{Upper1997}

1749: \bibinfo{author}{\bibfnamefont{D.~R.} \bibnamefont{Upper}}, Ph.D. thesis,

1750:   \bibinfo{school}{University of California}, \bibinfo{address}{Berkeley}

1751:   (\bibinfo{year}{1997}), \bibinfo{note}{{P}ublished by University Microfilms

1752:   Intl, Ann Arbor, Michigan}.

1753:

1754: \bibitem[{\citenamefont{Blackwell and Koopmans}(1957)}]{Blackwell1957}

1755: \bibinfo{author}{\bibfnamefont{D.}~\bibnamefont{Blackwell}} \bibnamefont{and}

1756:   \bibinfo{author}{\bibfnamefont{L.}~\bibnamefont{Koopmans}},

1757:   \bibinfo{journal}{Ann. Math. Stat.} \textbf{\bibinfo{volume}{28}},

1758:   \bibinfo{pages}{1011} (\bibinfo{year}{1957}).

1759:

1760: \bibitem[{\citenamefont{Wilks}(1962)}]{Wilks1962}

1761: \bibinfo{author}{\bibfnamefont{S.~S.} \bibnamefont{Wilks}},

1762:   \emph{\bibinfo{title}{Mathematical Statistics}} (\bibinfo{publisher}{John

1763:   Wiley \& Sons, Inc.}, \bibinfo{address}{New York}, \bibinfo{year}{1962}).

1764:

1765: \bibitem[{\citenamefont{Majumder and

1766:   Bhattacharjee}(1973{\natexlab{a}})}]{Majumder1973}

1767: \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Majumder}} \bibnamefont{and}

1768:   \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Bhattacharjee}},

1769:   \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{22}},

1770:   \bibinfo{pages}{411} (\bibinfo{year}{1973}{\natexlab{a}}).

1771:

1772: \bibitem[{\citenamefont{Majumder and

1773:   Bhattacharjee}(1973{\natexlab{b}})}]{Majumder1973a}

1774: \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Majumder}} \bibnamefont{and}

1775:   \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Bhattacharjee}},

1776:   \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{22}},

1777:   \bibinfo{pages}{409} (\bibinfo{year}{1973}{\natexlab{b}}).

1778:

1779: \bibitem[{\citenamefont{Cran et~al.}(1977)\citenamefont{Cran, Martin, and

1780:   Thomas}}]{Cran1977}

1781: \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Cran}},

1782:   \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Martin}}, \bibnamefont{and}

1783:   \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Thomas}},

1784:   \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{26}},

1785:   \bibinfo{pages}{111} (\bibinfo{year}{1977}).

1786:

1787: \bibitem[{\citenamefont{Berry et~al.}(1990)\citenamefont{Berry, {P.W. Mielke,

1788:   Jr.}, and Cran}}]{Berry1990}

1789: \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Berry}},

1790:   \bibinfo{author}{\bibnamefont{{P.W. Mielke, Jr.}}}, \bibnamefont{and}

1791:   \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Cran}},

1792:   \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{39}},

1793:   \bibinfo{pages}{309} (\bibinfo{year}{1990}).

1794:

1795: \end{thebibliography}

1796:

1797:

1798: \end{document}

1799:

1800: