math0703715/imc.tex
1: % Inferring Markov Chains: Bayesian Estimation,
2: %	Model Comparison, Entropy Rate, and Out-of-class Modeling
3: % 
4: % ccs: mar 01, 2007
5: % jpc: mar 10, 2007
6: % ccs: mar 13, 2007
7: % jpc: mar 23, 2007
8: 
9: \documentclass[pre,twocolumn,showpacs,superscriptaddress,preprintnumbers,floatfix]{revtex4}
10: %\documentclass[pre,showpacs,superscriptaddress,preprintnumbers,floatfix]{revtex4}
11: 
12: %
13: %-packages
14: \usepackage{amssymb,amsmath} % math utilities
15: \usepackage{graphicx}% Include figure files
16: \usepackage{bm}% Bold math
17: %\usepackage[pstricks1-10]{vaucanson-g} % for FSA diagrams
18: \usepackage{vaucanson-g} % for FSA diagrams, older version of ps-tricks
19: 
20: %
21: %-new user commands
22: 
23: %% references %%
24: \newcommand{\eqnref}[1]{Eq.~(\ref{#1})}
25: \newcommand{\begeqnref}[1]{Equation~(\ref{#1})}
26: \newcommand{\figref}[1]{Fig.~\ref{#1}}
27: \newcommand{\begfigref}[1]{Figure~\ref{#1}}
28: \newcommand{\appref}[1]{App.~\ref{#1}}
29: 
30: %% notation %%
31: \newcommand{\hmu}	{h_\mu} % entropy rate
32: \newcommand{\hmuL}	{h_{\mu L}} % entropy rate
33: \newcommand{\hmuk}	{h_{\mu k}} % entropy rate
34: \newcommand{\KLd}	{\mathcal{D}} % relative entropy (Kullback-Leibler distance)
35: \newcommand{\EQP}	{E(Q,P)}
36: % size of the alphabet
37: \newcommand{\Asize}[0]{\vert \mathcal{A} \vert}
38: % history of length k (takes no arguments)
39: \newcommand{\hk}[0]{ \overleftarrow{s}^k} 
40: % history of length l:#2, at time t:#1
41: \newcommand{\htl}[2]{ \overleftarrow{s}_{#1}^{#2}} 
42: 
43: % specific forms
44: \newcommand{\nsk}[0]{n(\overleftarrow{s}^k)} % number of s^k
45: \newcommand{\nsks}[0]{n(\overleftarrow{s}^k s)} % number of s^k s
46: \newcommand{\ask}[0]{\alpha(\overleftarrow{s}^k)} % alpha s^k
47: \newcommand{\asks}[0]{\alpha(\overleftarrow{s}^k s)} % alpha s^k s
48: % model parameters
49: \newcommand{\MP}[0]{\bm{\theta}} %model parameters
50: \newcommand{\MPk}[0]{\bm{\theta}_k} %model parameters, k-th order
51: \newcommand{\MC}[0]{\mathbf{M}} % model
52: \newcommand{\MCk}[0]{\mathbf{M}_k} % model, k-th order
53: \newcommand{\MCkprime}[0]{\mathbf{M}_{k}'} % model, k-th order
54: 
55: % avg, var, covar
56: \newcommand{\avg}[2]{\mathbf{E}_{#2}[#1]} %expectation values
57: \newcommand{\var}[2]{\mathbf{Var}_{#2}[#1]} % variance
58: \newcommand{\cov}[2]{\mathbf{Cov}[#1,#2]} % covariance
59: % probabilities
60: % true distribution
61: \newcommand{\psk}[0]{p(\overleftarrow{s}^k)}  % True parameter
62: \newcommand{\psks}[0]{p(s\vert \overleftarrow{s}^k)}  % True parameter
63: % pme distribution
64: \newcommand{\qsk}[0]{q(\overleftarrow{s}^k)}  
65: \newcommand{\qsks}[0]{q(s\vert \overleftarrow{s}^k)}
66: % prior distribution
67: \newcommand{\rsk}[0]{r(\overleftarrow{s}^k)}  
68: \newcommand{\rsks}[0]{r(s\vert \overleftarrow{s}^k)}  
69: % uniform distribution
70: \newcommand{\usks}[0]{u(s\vert \overleftarrow{s}^k)}  % Uniform distribution
71: \newcommand{\usk}[0]{u(\overleftarrow{s}^k)}  % Uniform distribution
72: 
73: %-inference methods
74: \newcommand{\mle}[0]{MLE} % maximum likelihood estimate
75: \newcommand{\pme}[0]{PME} % posterior mean estimate
76: \newcommand{\map}[0]{MAP} % maximum a-posteriori
77: 
78: \begin{document}
79: 
80: \preprint{Santa Fe Institute Working Paper 07-03-XXX}
81: \preprint{arxiv.org/xxxxx/0703XXX}
82: 
83: \title{Inferring Markov Chains: Bayesian Estimation,\\
84: Model Comparison, Entropy Rate, and Out-of-class Modeling}
85: 
86: \author{Christopher~C.~Strelioff}
87: 	\email{streliof@uiuc.edu} 
88: 	\affiliation{Center for Computational Science \& 
89: 	Engineering and Physics Department,\\ 
90: 	University of California at Davis, One Shields Avenue, Davis, CA 95616}
91:  	\affiliation{Center for Complex Systems Research 
92:  	and Physics Department,\\
93: 	University of Illinois at Urbana-Champaign, 
94: 	1110 West Green Street, Urbana, Illinois 61801}
95: \author{James P. Crutchfield}
96: 	\email{chaos@cse.ucdavis.edu}
97: 	\affiliation{Center for Computational Science \& 
98: 	Engineering and Physics Department,\\ 
99: 	University of California at Davis, One Shields Avenue, Davis, CA 95616}
100: \author{Alfred W. H\"{u}bler}
101: 	\email{a-hubler@uiuc.edu}
102:  	\affiliation{Center for Complex Systems Research 
103:  	and Physics Department,\\
104: 	University of Illinois at Urbana-Champaign, 
105: 	1110 West Green Street, Urbana, Illinois 61801}
106: 
107: \begin{abstract} 
108: Markov chains are a natural and well understood tool for describing
109: one-dimensional patterns in time or space. We show how to infer $k$-th order
110: Markov chains, for arbitrary $k$, from finite data by applying Bayesian
111: methods to both parameter estimation and model-order selection. Extending
112: existing results for multinomial models of discrete data, we connect inference
113: to statistical mechanics through information-theoretic (type theory) techniques.
114: We establish a direct relationship between Bayesian evidence and the partition
115: function which allows for straightforward calculation of the expectation and
116: variance of the conditional relative entropy and the source entropy rate.
117: Finally, we introduce a novel method that uses finite data-size scaling with
118: model-order comparison to infer the structure of out-of-class processes.
119: \end{abstract}
120: 
121: %%% PACS
122: % Inference methods, 02.50.Tt
123: % Markov processes, 02.50.Ga
124: % Stochastic models- in statistical physics and nonlinear dynamics, 05.10.Gg
125: \pacs{02.50.Tt,02.50.Ga,05.10.Gg}
126:                          
127: \maketitle
128: 
129: %
130: % introduction
131: %
132: \section{Introduction}
133: 
134: Statistical inference of models from small data samples is a vital tool in
135: the understanding of natural systems.  In many problems of interest data
136: consists of a sequence of \emph{letters} from a finite \emph{alphabet}.  
137: Examples include analysis of sequence information in
138: biopolymers~\cite{Avery1999,JSLiu1999}, investigation of
139: one-dimensional spin systems~\cite{Crutchfield1997}, models of natural 
140: languages~\cite{MacKay1994}, and coarse-grained models of chaotic
141: dynamics~\cite{Crutchfield1983,BLHao1998}.  This diversity of potential
142: application has resulted in the development of a variety of representations
143: for describing discrete-valued data series.
144: 
145: We consider the $k$-th order Markov chain model class which uses the previous
146: $k$ letters in a sequence to predict the next letter. Inference of Markov
147: chains from data has a long history in mathematical statistics.  Early work
148: focused on maximum likelihood methods for estimating the parameters of the
149: Markov chain~\cite{TWAnderson1957,Billingsley1961a,Chatfield1973}. This work 
150: often assumed a given fixed model order. That is, no \emph{model comparison}
151: across orders was done. This work also typically relied on the assumed
152: asymptotic normality of the likelihood when estimating regions of
153: confidence and when implementing model comparison.  As a result, the realm
154: of application has been limited to data sources where these conditions are
155: met.  One consequence of these assumptions has been that data sources which
156: exhibit \emph{forbidden words}, symbol sequences which are not allowed, cannot
157: be analyzed with these methods.  This type of data violates the assumed
158: normality of the likelihood function. 
159: 
160: More recently, model comparison in the maximum likelihood approach has been
161: extended using various \emph{information criteria}. These methods for
162: model-order selection are based on extensions of the likelihood ratio and allow
163: the comparison of more than two candidate models at a time. The most widely used
164: are \emph{Akaike's information criterion} (AIC)~\cite{HTong1975} and the
165: \emph{Bayesian information criterion} (BIC)~\cite{Katz1981}. (Although the
166: latter is called Bayesian, it does not employ Bayesian model comparison in
167: the ways we will present here.) In addition to model selection using information
168: criteria, methods from information theory and machine learning have also been
169: developed.  Two of the most widely employed are \emph{minimum
170: description length} (MDL)~\cite{JRissanen1984} and \emph{structural risk
171: minimization}~\cite{VVapnik1999}.  Note that MDL and Bayesian
172: methods obtain similar results in some situations~\cite{Vitanyi2000}.  However,
173: to the best of our knowledge, structural risk minimization has not been adapted
174: to Markov chain inference.
175: 
176: We consider Bayesian inference of the Markov chain model class, extending
177: previous results~\cite{MacKay1994,JSLiu1999,Baldi2001,Durbin1998}. We provide
178: the details necessary to infer a Markov chain of arbitrary order, choose
179: the appropriate order (or weight orders according to their probability),
180: and estimate the data source's entropy rate.  The latter is important for
181: estimating the intrinsic randomness and achievable compression rates for
182: an information source~\cite{Cover1991}.  The ability to weight Markov chain
183: orders according to their probability is unique to Bayesian methods and
184: unavailable in the model selection techniques discussed above.
185: 
186: In much of the literature just cited, steps of the inference process
187: are divided into (i) point estimation of model parameters, (ii) model
188: comparison (hypothesis testing), and (iii) estimation of functions of the
189: model parameters. Here we will show that Bayesian inference connects all
190: of these steps, using a unified set of ideas. Parameter estimation is the first
191: step of inference, model comparison a second level, and estimation of the
192: entropy rate a final step, intimately related to the mathematical structure
193: underlying the inference process.  This view of connecting model to data
194: provides a powerful and unique understanding of inference not available in the
195: classical statistics approach to these problems. As we demonstrate, each of
196: these steps is vital and implementation of one step without the others does
197: not provide a complete analysis of the data-model connection. 
198: 
199: Moreover, the combination of inference of model parameters, comparison of
200: performance across model orders, and estimation of entropy rates provides a
201: powerful tool for understanding Markov chain models themselves. Remarkably,
202: this is true even when the generating data source is outside of the Markov
203: chain model class.
204: Model comparison provides a sense of the structure of the data source, whereas
205: estimates of the entropy rate provide a description of the inherent randomness. 
206: Bayesian inference, information theory, and tools from statistical mechanics
207: presented here touch on all of these issues within a unified framework.
208: 
209: We develop this as follows, assuming a passing familiarity with Bayesian
210: methods and statistical mechanics. First, we discuss estimation of Markov
211: chain parameters using Bayesian methods, emphasizing the use of the complete
212: marginal posterior density for each parameter, rather than point estimates
213: with error bars. Second, we consider selection of the appropriate memory
214: $k$ given a particular data set, demonstrating that a mixture of orders may
215: often be more appropriate than selecting a single order. This is certainly
216: a more genuinely Bayesian approach. In these first two parts
217: we exploit different forms of Bayes' theorem to connect data and model class.
218: 
219: Third, we consider the mathematical structure of the evidence (or marginal
220: likelihood) and draw connections to statistical mechanics.  In this discussion
221: we present a method for estimating entropy rates by taking derivatives of a
222: partition function formed from elements of each step of the inference procedure.
223: Last, we apply these tools to three example information sources of increasing
224: complexity. The first example belongs to the Markov chain model class, but
225: the other two are examples of hidden Markov models (HMMs) that fall outside
226: of that class. We show that the methods developed here provide a powerful tool
227: for understanding data from these sources, even when they do not belong to the
228: model class being assumed.
229: 
230: %%
231: %%
232: \section{Inferring Model Parameters}
233: 
234: In the first level of Bayesian inference we develop a systematic relation
235: between the data $D$, the chosen \emph{model class} $M$, and the vector of
236: \emph{model parameters} $\MP$. The object of interest in the inference of
237: model parameters is the \emph{posterior probability density}
238: $P\left( \MP \vert D, M \right)$.  This is the probability of the model
239: parameters given the observed data and chosen model. To find the posterior
240: we first consider the joint distribution $P\left( \MP, D \vert M \right)$
241: over the data and model parameters given that one has chosen to model the
242: source with a representation in a certain class $M$. This can be factored in
243: two ways: $P\left( \MP \vert D, M \right)P\left(D \vert M\right)$ or
244: $P\left( D \vert \MP, M \right)P\left(\MP \vert M\right)$.  Setting these
245: equal and solving for the posterior we obtain Bayes' theorem:
246: \begin{equation}
247: \label{eqn:bayes}
248: P\left( \MP \vert D, M \right) 
249: 	= \frac{ P\left( D \vert \MP , M \right) \; 
250: 	P\left( \MP \vert M \right) }{ P\left( D \vert M \right) }.
251: \end{equation}
252: 
253: The \emph{prior} $P\left( \MP \vert M \right)$ specifies our assumptions
254: regarding the model parameters. We take a pragmatic view of the prior,
255: considering its specification to be a statement of assumptions about the
256: chosen model class. The \emph{likelihood} $P\left( D \vert \MP , M \right)$
257: describes the probability of the data given the model parameters.  Finally, the
258: \emph{evidence} (or marginal likelihood) $P\left( D \vert M \right)$ is the
259: probability of the data given the model class alone, with the parameters integrated out.  In the following sections we
260: describe each of the quantities in detail on our path to giving an explicit
261: expression for the posterior.
262: 
263: %%
264: \subsection{Markov chains}
265: 
266: The first step in inference is to clearly state the assumptions that make up
267: the model.  This is the foundation for writing down the likelihood of a data
268: sample and informs the choice of prior. We assume that a single data set of
269: length $N$ is the starting point of the inference and that it consists of
270: \textit{symbols} $s_t$ from a finite alphabet $\mathcal{A}$,  
271: \begin{equation}
272: 	\label{eqn:data}
273: 	D = s_0 s_1 \ldots s_{N-1} \; , \; s_t \in \mathcal{A}.
274: \end{equation}
275: We introduce the notation $\htl{t}{k}$ to indicate a length-$k$ sequence of
276: letters ending at position $t$: e.g., $\htl{4}{2}=s_3s_4$.
277: 
278: The $k$-th order Markov chain model class assumes finite memory and 
279: stationarity in the data source.  The finite memory condition, a
280: generalization of the conventional Markov property, can be written
281: \begin{equation}
282: p(D)	 = p(\htl{k-1}{k}) \prod_{t=k-1}^{N-2} p(s_{t+1} \vert \htl{t}{k}) ~,
283: 	    \label{eqn:markov_condition}
284: \end{equation}
285: thereby factoring into terms which depend only on preceding words of
286: length-$k$. The stationarity condition can be expressed
287: \begin{equation}
288: 	\label{eqn:stationarity}
289: 	p(s_t \vert \htl{t-1}{k}) = p(s_{t+m} \vert \htl{t+m-1}{k}) ~, 
290: \end{equation}
291: for any $(t,m)$.  \begeqnref{eqn:stationarity} results in a simplification of 
292: the notation because we no longer need to track the position index, 
293: $p(s_t = s \vert \htl{t-1}{k} = \hk ) = p( s \vert \hk )$ for any $t$.  Given 
294: these two assumptions, the model parameters of the $k$-th order Markov chain
295: $\MCk$ are
296: \begin{equation}
297: 	\label{eqn:model_parameters}
298:  	\MPk  = \left\{ \, p( s \vert \hk ) : s \in \mathcal{A}, 
299:  	\hk \in \mathcal{A}^k \, \right\}.
300: \end{equation}
301: A normalization constraint is placed on these parameters: $\sum_{s\in
302: \mathcal{A}} p( s \vert \hk ) = 1$ for each word $\hk$.
303: 
304: The next step is to write down the elements of Bayes' theorem specific to the
305: $k$-th order Markov chain.
306: 
307: %%
308: \subsection{Likelihood}
309: 
310: Given a sample of data $D=s_{0}s_{1} \ldots s_{N-1}$, the likelihood can be 
311: written down using the Markov property of~\eqnref{eqn:markov_condition} and the 
312: stationarity of~\eqnref{eqn:stationarity}.  This results in the form
313: \begin{equation}
314: 	\label{eqn:likelihood}
315: 	P(D\vert \MPk, \MCk) = \prod_{ s \in \mathcal{A} } 
316: 	\prod_{ \hk \in \mathcal{A}^{k} } p( s \vert \hk )^{\nsks} ,
317: \end{equation}
318: where $\nsks$ is the number of times the \textit{word} $\hk s$ occurs in the
319: sample $D$.  For future use we also introduce notation for the number of times a
320: word $\hk$ has been observed: $\nsk = \sum_{s \in \mathcal{A}} \nsks$.  We note
321: that~\eqnref{eqn:likelihood} is conditioned on the \emph{start sequence}
322: $\hk = s_0s_1\ldots s_{k-1}$.
323: 
324: %%
325: \vspace{-0.125in}
326: \subsection{Prior}
327: \vspace{-0.125in}
328: 
329: The prior $P(\MP \vert M)$ is used to specify assumptions about the model to be
330: inferred before the data is considered. Here we use
331: \emph{conjugate priors} for which the posterior distribution has the same
332: functional form as the prior.  Our choice allows us to derive exact expressions
333: for many quantities of interest in inference. This provides a powerful tool for
334: understanding what information is gained during inference and,
335: especially, model comparison.
336: 
337: The exact form of the prior is determined by our assignment of 
338: \emph{hyperparameters} $\asks$ for the prior which balance the strength of
339: the modeling assumptions encoded in the prior against the weight of the data.
340: For a $k$-th order Markov chain, there is one hyperparameter for each word
341: $\hk s$, given the alphabet under consideration. A useful way to think about
342: the assignment of values to the
343: hyperparameters is to relate them to fake counts $\tilde{n}(\hk s)$, such that
344: $\asks = \tilde{n}(\hk s) + 1$.  In this way, the $\asks$ can be set to reflect
345: knowledge of the data source and the strength of these prior assumptions can be
346: properly weighted in relation to the actual data counts $\nsks$.
347: 
348: The conjugate prior for Markov chain inference is a product of Dirichlet
349: distributions, one for each word $\hk$. It restates the finite-memory
350: assumption from the model definition:
351: \begin{eqnarray}
352: 	P(\MPk \vert \MCk ) 
353: 	& = & \prod_{\hk \in \mathcal{A}^{k}} \left\{
354: 	\frac{ \Gamma( \ask  )}{
355: 	\prod_{s\in\mathcal{A}} \Gamma( \asks ) } \right. \nonumber \\
356:     & \times & \delta \mathbf{(}1-\sum_{s\in\mathcal{A}} 
357:     p( s \vert \hk )\mathbf{)} \label{eqn:prior} \\
358: 	& \times & \left. \prod_{s\in\mathcal{A}} p( s \vert \hk )^{\asks-1} 
359: 	\right\}. \nonumber
360: \end{eqnarray}
361: (See \appref{app:Dirichlet} for relevant properties of Dirichlet
362: distributions.)
363: The prior's hyperparameters $\{ \asks \}$ must be real and positive.  We
364: also introduce the more compact notation $\ask = \sum_{s \in \mathcal{A}}
365: \asks$.  The function $\Gamma(x)$ is the well-known Gamma function, which satisfies $\Gamma(x)=(x-1)!$ for positive integer $x$.  The
366: $\delta$-function constrains the model parameters to be properly normalized:
367: $\sum_{s \in \mathcal{A}} \psks = 1$ for each $\hk$.
368: 
369: Given this functional form, there are at least two ways to interpret what the
370: prior says about the Markov chain parameters $\MPk$. In addition to considering
371: fake counts $\tilde{n}( \cdot )$, as discussed above, we can consider the
372: range of fluctuations in the estimated $\psks$. Classical statistics would
373: dictate describing the fluctuations via a single value with error bars. This
374: can be accomplished by finding the average and variance of $\psks$ with
375: respect to the prior. The result is:
376: \begin{eqnarray}
377: 	\label{eqn:prior_mean}
378: 	\avg{\psks}{\rm{prior}} & = & \frac{\asks}{\ask}~, \\
379: 	\label{eqn:prior_variance}
380: 	\var{\psks}{\rm{prior}} & = & \frac{\asks(\ask-\asks)}{\ask^2(1+\ask)} .
381: \end{eqnarray}
382: 
383: A second method, more in line with traditional Bayesian estimation, is to
384: consider the marginal distribution for each model parameter. For a Dirichlet
385: distribution, the marginal for any one parameter will be a Beta distribution.
386: With this knowledge, a probability density can be provided for each Markov chain
387: parameter given a particular setting for the hyperparameters $\asks$. In this
388: way, the prior can be assigned and analyzed in substantial detail.
389: 
390: A common stance in model inference is to assume all things are a-priori
391: equal.  This can be expressed by assigning $\asks=1$ for all $\hk \in
392: \mathcal{A}^k$ and $s \in \mathcal{A}$, adding \textit{no} fake counts
393: $\tilde{n}(\hk s)$.  This assignment results in a uniform prior distribution
394: over the model parameters and a prior expectation:
395: \begin{equation}
396: \avg{p(s\vert \hk)}{\rm{prior}} = 1/ \vert \mathcal{A} \vert ~.
397: \end{equation}
398: 
399: %%
400: \vspace{-0.20in}
401: \subsection{Evidence}
402: \vspace{-0.125in}
403: 
404: Given the likelihood and prior derived above, the evidence $P(D|M)$ is seen
405: to be a simple normalization term in Bayes' theorem.  In fact, the evidence
406: provides the probability of the data given the model $\MCk$ and so plays a
407: fundamental role in model comparison.  Formally, the definition is
408: \begin{equation}
409: 	P(D\vert \MCk ) = 	\int \; d\MPk \; P(D\vert \MPk, \MCk) 
410: 						P(\MPk \vert \MCk ),
411: 	\label{eqn:evidence_defn}
412: \end{equation}
413: where we can see that this term can be interpreted as an average of the 
414: likelihood over the prior distribution.  Applying this to the likelihood
415: in~\eqnref{eqn:likelihood} and the prior in~\eqnref{eqn:prior} produces 
416: \begin{eqnarray}
417: 	P(D\vert \MCk) & = & \prod_{\hk \in \mathcal{A}^{k}} \left\{ \; 
418: 	\frac{ \Gamma(\ask) }{ \prod_{s\in \mathcal{A}} \Gamma(\asks)} 
419: 	\right. \nonumber \\
420: 	& & \label{eqn:evidence} \\
421: 	& \times & \left. 
422: 	\frac{ \prod_{s\in \mathcal{A}} \Gamma(\nsks+\asks) }{ \Gamma(\nsk+\ask) }
423: 	\; \right\}. \nonumber
424: \end{eqnarray}
425: As we will see, this analytic expression results in the ability to make useful
426: connections to statistical mechanics techniques when estimating entropy rates.
427: This is another benefit of choosing a conjugate prior with known properties.
428:  
429: %%
430: \subsection{Posterior}
431: 
432: Using Bayes' theorem, \eqnref{eqn:bayes}, the results of the three previous 
433: sections can be combined to obtain the posterior distribution over the
434: parameters of the $k$-th order Markov chain. One finds:
435: \begin{eqnarray}
436: 	P(\MPk\vert D, \MCk) & = & \prod_{\hk \in \mathcal{A}^{k}} \left\{
437: 	\frac{ \Gamma( \nsk + \ask  ) }{
438: 	\prod_{s\in\mathcal{A}} \Gamma( \nsks + \asks ) } \right. \nonumber \\
439: 	& \times & \delta \mathbf{(}1-\sum_{s\in\mathcal{A}} p( s \vert \hk)
440: 	\mathbf{)} \label{eqn:posterior} \\
441: 	& \times & \left. \prod_{s\in\mathcal{A}} 
442: 	p( s \vert \hk )^{\nsks + \asks - 1} \right\}. \nonumber
443: \end{eqnarray}
444: As noted in selecting the prior, the resulting form is a Dirichlet 
445: distribution with modified parameters.  This is a result of choosing the 
446: conjugate prior: cf. the forms of \eqnref{eqn:prior} and
447: \eqnref{eqn:posterior}.
448: 
449: From~\eqnref{eqn:posterior} the estimation of the model parameters 
450: $p(s\vert \hk)$ and the uncertainty of these estimates can be given using the
451: known properties of the Dirichlet distribution.  As with the prior,
452: there are two main ways to understand what the posterior tells us about the
453: fluctuations in the estimated Markov chain parameters. The first uses a point
454: estimate with ``error bars''. We obtain these from the mean and variance of
455: the $\psks$ with respect to the posterior, finding
456: \begin{gather}
457:  	\avg{p(s\vert \hk)}{\rm{post}} =  \frac{ \nsks + \asks }{ \nsk + \ask }
458: 	\label{eqn:posterior_mean} ~, \\ \nonumber \\
459:  	\var{p(s\vert \hk)}{\rm{post}}  =  \frac{ \nsks +\asks }{ ( \nsk + \ask )^2 }
460: 	\nonumber  \\ \label{eqn:posterior_variance} \\
461: 	\times  \frac{ ( \nsk + \ask ) - ( \nsks + \asks  ) }{
462: 	( \nsk + \ask +1 ) }. \nonumber
463: \end{gather}
464: The mean in~\eqnref{eqn:posterior_mean} is the \textit{posterior mean estimate} (\pme) of the model parameters.
465: 
466: A deeper understanding of~\eqnref{eqn:posterior_mean} is obtained through a
467: simple factoring:
468: \begin{eqnarray}
469: 	\avg{p(s\vert \hk)}{\rm{post}} & = & \frac{1}{ \nsk + \ask } 
470: 	\left[ \nsk \, \left (\frac{\nsks}{\nsk} \right) \right. \nonumber \\
471: 	&& 	\label{eqn:pme_factor} \\
472: 	& + & \left. \ask \, \left(\frac{\asks}{\ask} \right) \right], \nonumber
473: \end{eqnarray}
474: where $\nsks /\nsk $ is the \emph{maximum likelihood estimate} (\mle) 
475: of the model parameters and $\asks /\ask$ is the prior expectation given
476: in~\eqnref{eqn:prior_mean}.  In this form, it is
477: apparent that the posterior mean estimate is a weighted sum of the \mle~and 
478: prior expectation.  As a result, we can say that the posterior mean and
479: maximum likelihood estimates converge to the same value for $\nsk \gg \ask$. 
480: Only when the data is scarce, or the prior is set with strong conviction, 
481: does the Bayesian estimate add corrections to the \mle.
482: 
483: A second method for analyzing the resulting posterior density is to consider the
484: marginal density for each parameter.  As discussed with the prior, the marginal
485: for a Dirichlet is a Beta distribution.  As a result, we can either provide
486: regions of confidence for each parameter or simply inspect the density function.
487: The latter provides much more information about the inference being made than
488: the point estimation just given.  In our examples, to follow shortly, we
489: plot the marginal posterior density for various parameters of interest
490: to demonstrate the wealth of information this method provides.
491: 
492: Before we move on, we make a final point regarding the estimation of inference 
493: uncertainty. The form of the posterior is not meant to reflect the potential
494: fluctuations of the data source.  Instead, the width of the distribution
495: reflects the possible Markov chain parameters which are consistent with the
496: observed data sample.  These are distinct notions and should not be conflated.
497: 
498: %%
499: \subsection{Predictive distribution}
500: 
501: Once we have an inferred model, a common task is to estimate the probability of
502: a new observation $D^{(new)}$ given the previous data and estimated model.
503: This is implemented by taking an average of the likelihood of the new data:
504: \begin{equation}
505: P(D^{(new)}\vert \MPk, \MCk)
506:   = \prod_{\hk \in \mathcal{A}^k, s \in \mathcal{A}} p(s\vert \hk)^{m(\hk s)}
507: \end{equation}
508: with respect to the posterior
509: distribution~\cite{MacKay2003}:
510: \begin{eqnarray}
511: 	\label{eqn:predictive_distribution_defn}
512: 	P(D^{(new)}\vert D,\MCk) & =  & \int  d\MPk  P(D^{(new)}\vert \MPk, \MCk) \\ 
513: 	& \times & P(\MPk \vert D, \MCk) ~. \nonumber
514: \end{eqnarray}
515: We introduce the notation $m(\hk s)$ to indicate the number of times the word
516: $\hk s$ occurs in $D^{(new)}$, with $m(\hk) = \sum_{s \in \mathcal{A}} m(\hk s)$. This method has the desirable property, compared
517: to point estimates, that it takes into account the uncertainty in the model
518: parameters $\MPk$ as reflected in the form of the posterior distribution.
519: 
520: The evaluation of~\eqnref{eqn:predictive_distribution_defn} follows the same 
521: path as the calculation for the evidence and produces a similar
522: form; we find:
523: \begin{gather}
524: 	P(D^{(new)}\vert D, \MCk)  =  \prod_{\hk \in \mathcal{A}^{k}} \left\{ \; 
525: 	\frac{ \Gamma( \nsk+\ask) }{ \prod_{s\in \mathcal{A}} 
526: 	\Gamma( \nsks + \asks)} \right. \nonumber \\
527: 	\label{eqn:predictive_distribution} \\
528: 	\times  \left. \frac{ \prod_{s\in \mathcal{A}} 
529: 	\Gamma( \nsks + m(\hk s) + \asks ) }{ \Gamma( \nsk + m(\hk) + \ask ) }
530: 	\; \right\}. \nonumber
531: \end{gather}
532: 
533: %%
534: %%
535: \section{Model Comparison}
536: 
537: With the ability to infer a Markov chain of a given order $k$, a common-sense 
538: question arises: How do we choose the correct order given a particular data 
539: set?  Bayesian methods have a systematic way to address this through 
540: the use of \emph{model comparison}.
541: 
542: In many ways, this process is analogous to inferring model parameters
543: themselves, which we just laid out.  We start by enumerating the set of model
544: orders to be compared $\mathcal{M} = \{ \MCk \}_{k_{min}}^{k_{max}}$, where
545: $k_{min}$ and $k_{max}$ correspond to the minimum and maximum order to be
546: inferred, respectively.  Although we will not consider an independent,
547: identically distributed (IID) model ($k=0$) here, we do note that this could
548: be included using the same techniques described below.
549: 
550: We start with the joint probability $P(\MCk,D \vert \mathcal{M} )$ of a
551: particular model $\MCk \in \mathcal{M}$ and data sample $D$, factoring it in
552: two ways following Bayes' theorem. Solving for the probability of a particular
553: model class we obtain
554: \begin{equation}
555: 	\label{eqn:model_comparison}
556:  	P(\MCk \vert D , \mathcal{M} ) = \frac{ P(D \vert \MCk, \mathcal{M} )
557:  					P(\MCk \vert \mathcal{M} ) }{ P(D \vert \mathcal{M})} ,	
558: \end{equation} 
559: where the denominator is the sum given by
560: \begin{equation}
561: P(D \vert \mathcal{M}) = 
562:   \sum_{\MCkprime \in \mathcal{M}}
563:   P(D \vert \MCkprime, \mathcal{M} )P(\MCkprime \vert \mathcal{M} ) ~.
564: \end{equation}
565: The probability of a particular model class in the set under consideration is
566: driven by two components: the evidence $P(D \vert \MCk, \mathcal{M})$, derived
567: in \eqnref{eqn:evidence}, and the prior over model classes
568: $P(\MCk \vert \mathcal{M} )$.
569: 
570: Two common priors in model comparison are: (i) all models are equally likely
571: and (ii) models should be penalized for the number of free parameters used to
572: fit the data.  In the first instance 
573: $P(\MCk \vert \mathcal{M})=1/ \vert \mathcal{M} \vert$ is the same for all 
574: orders $k$.  However, this factor cancels out because it appears in both the
575: numerator and denominator.  As a result, the probability of models using this
576: prior becomes
577: \begin{equation}
578: 	\label{eqn:best_model_uniform_prior}
579: 	P(\MCk \vert D , \mathcal{M} ) = \frac{P(D \vert \MCk, \mathcal{M} )
580: 					}{
581: 					\sum_{\MCkprime \in \mathcal{M}}
582: 					P(D \vert \MCkprime, \mathcal{M} )}.
583: \end{equation}
584: 
585: In the second case, a common penalty for the number of model parameters is 
586: \begin{equation}
587: \label{eqn:df_penalty_prior}
588: P(\MCk \vert \mathcal{M}) = \frac{\exp( - \vert \MCk \vert )
589: 						  }{\sum_{\MCkprime \in \mathcal{M}} 
590: 						  \exp( - \vert \MCkprime \vert ) } ~,
591: \end{equation}
592: where $\vert \MCk \vert$ is the number of free parameters in the model. For a
593: $k$-th order Markov chain, the number of free parameters is
594: \begin{equation}
595: \vert \MCk \vert = \vert \mathcal{A} \vert^k(\vert \mathcal{A} \vert-1) ~,
596: \end{equation}
597: where $\vert \mathcal{A} \vert$ is the alphabet size. Thus, model
598: probabilities under this prior take on the form
599: \begin{equation}
600: 	\label{eqn:best_model_df_penalty_prior}
601: 	P(\MCk \vert D , \mathcal{M} ) = \frac{
602: 					P(D \vert \MCk, \mathcal{M} ) \exp( - \vert \MCk \vert )
603: 					}{
604: 					\sum_{\MCkprime}
605: 					P(D \vert \MCkprime, \mathcal{M} )
606: 					\exp( - \vert \MCkprime \vert ) }.
607: \end{equation}
608: We note that the normalization sum in~\eqnref{eqn:df_penalty_prior}
609: cancels because it appears in both the numerator and denominator.
610: 
611: Bayesian model comparison has a natural \emph{Occam's razor} in the model
612: comparison process~\cite{MacKay2003}.  This means there is a natural preference
613: for smaller models even when a uniform prior over model orders is applied.  In
614: this light, a penalty for the number of model parameters can be seen as a very
615: cautious form of model comparison.  Both of these priors,
616: \eqnref{eqn:best_model_uniform_prior} and  
617: \eqnref{eqn:best_model_df_penalty_prior}, will be considered in
618: the examples to follow.
619: 
620: A note is in order on computational implementation. In general, the resulting
621: probabilities can be extremely small, easily resulting in numerical underflow
622: if the equations are not implemented with care. As mentioned
623: in~\cite{Durbin1998}, computation with extended logarithms can be used to
624: alleviate these concerns.
625: 
626: %%
627: %%
628: \section{Information Theory, Statistical Mechanics, and Entropy Rates}
629: 
630: An important property of an information source is its \emph{entropy rate}
631: $\hmu$, which indicates the degree of intrinsic randomness and controls the
632: achievable compression. A first attempt at estimating a source's entropy rate
633: might consist of plugging a Markov chain's estimated model parameters into the
634: known expression~\cite{Cover1991}. However, this does not
635: accurately reflect the posterior distribution derived above. This observation
636: leaves two realistic alternatives. The first option is to sample model
637: parameters from the posterior distribution. These samples can then be used to
638: calculate a set of entropy rate estimates that reflect the underlying posterior
639: distribution. A second option, which we take here, is to adapt methods from
640: type theory and
641: statistical mechanics previously developed for IID models~\cite{Samengo2002}
642: to Markov chains. To the best of our knowledge this is the first time these
643: ideas have been extended to inferring Markov chains; although
644: cf.~\cite{Young1994}.
645: 
646: In simple terms, type theory shows that the probability of an observed sequence
647: can be written in terms of the \emph{Kullback-Leibler} (KL) \emph{distance} and
648: the entropy rate.  When applied to the Markov chain inference problem the resulting
649: form suggests a connection to statistical mechanics. For example, we will show
650: that averages of the KL-distance and entropy rate with respect to the posterior
651: are found by taking simple derivatives of a partition function.  
652: 
653: The connection between inference and information theory starts by considering
654: the product of the prior~\eqnref{eqn:prior} and
655: likelihood~\eqnref{eqn:likelihood}:
656: \begin{equation}
657: P(\MPk\vert \MCk)P( D\vert \MPk, \MCk)=P( D, \MPk\vert \MCk) ~.
658: \end{equation}
659: This forms a joint distribution over the observed data $D$ and model parameters
660: $\MPk$ given the model order $\MCk$. Denoting the normalization constant from
661: the prior as $Z$ to save space, this joint distribution is
662: \begin{equation}
663: 	\label{eqn:product_prior_likelihood}
664:  	P( D, \MPk\vert \MCk) = Z \, \prod_{\hk, s}  
665:  	p( s \vert \hk )^{\nsks + \asks - 1}.
666: \end{equation}
667: This form can be written, without approximation, in terms of conditional 
668: relative entropies $\KLd [\cdot \| \cdot ]$ and entropy rate $\hmu [\cdot]$:
669: \begin{eqnarray}
670: 	\label{eqn:info_prior_likelihood}
671:  	P( D, \MPk\vert \MCk) & = & Z \, 2^{-\beta_k \mathbf{(} \KLd [Q \| P ]
672:  	 + \hmu [Q]\mathbf{)}} \\
673: 	& \times & 2^{+\Asize^{k+1} \mathbf{(} \KLd [ U \| P ] 
674: 	+ \hmu [U]\mathbf{)}} ~, \nonumber
675: \end{eqnarray}
676: where $\beta_k = \sum_{\hk,s} \left[ \nsks + \asks \right]$ and the
677: distribution of true parameters is
678: $P = \{ \psk, \psks \}$. The distributions $Q$ and $U$ are given by
679: \begin{eqnarray}
680: 	\label{eqn:pme_distribution}
681: 	Q & = & \left\{ \qsk = \frac{\nsk+\ask}{\beta_k} , \right. \\ 
682: 	  & &	\left. \qsks = \frac{\nsks + \asks}{\nsk + \ask} \right\}
683: 	  \nonumber \\
684: 	\label{eqn:uniform_distribution}
685: 	U & = & \left\{ \usk = \frac{1}{\Asize^k}, \usks = \frac{1}{\Asize} \right\}
686: 	~,
687: \end{eqnarray}
688: where $Q$ is the distribution defined by the posterior mean and $U$ is a uniform
689: distribution. The information-theoretic quantities used above are given by
690: \begin{eqnarray}
691: 	\KLd [ Q \| P ] & = & \sum_{s, \hk} \qsk \qsks \log_2 \frac{\qsks}{\psks}
692: 	\label{eqn:conditional_KL_div} \\
693: 	\hmu [ Q ] 	& = & - \sum_{s, \hk} \qsk \qsks \log_2 \qsks ~. 
694: 	\label{eqn:entropy_rate_estimate}
695: \end{eqnarray}
696: The form of~\eqnref{eqn:info_prior_likelihood} and its relation to the evidence 
697: suggests a connection to statistical mechanics: The evidence 
698: $P(D \vert \MCk) = \int d\MPk P( D, \MPk\vert \MCk)$ is a partition function 
699: $\mathcal{Z} = P( D \vert \MCk)$.  Using conventional techniques, the
700: expectation and variance of the ``energy''
701: \begin{equation}
702: \label{eqn:info_energy}
703: \EQP = \KLd [Q \| P ] + \hmu [Q]
704: \end{equation}
705: are obtained by taking derivatives of the logarithm of the partition function
706: with respect to $\beta_k$:
707: \begin{eqnarray}
708: 	\avg{\, \EQP \, }{\rm{post}} 
709: 	& = &
710: 	- \frac{1}{\log 2}
711: 	\frac{\partial}{\partial \beta_k} \, \log \mathcal{Z}
712: 	\label{eqn:info_mean_energy}\\
713:  	\var{\, \EQP \, }{\rm{post}}
714: 	& = &
715: 	\frac{1}{\log 2}
716:  	\frac{\partial^2}{\partial \beta_k^2} \, \log \mathcal{Z}
717: 	~. 
718: 	\label{eqn:info_variance_energy}
719: \end{eqnarray}
720: The factors of $\log 2$ in the above expressions come from the decision to use
721: base 2 logarithms in the definition of our information-theoretic quantities. 
722: This results in values in \emph{bits} rather than \emph{nats}~\cite{Cover1991}. 
723: 
724: To evaluate the above expression, we take advantage of the known form for the 
725: evidence provided in~\eqnref{eqn:evidence}.  With the definitions $\alpha_k =
726: \sum_{\hk} \ask$ and
727: \begin{equation}
728: 	\label{eqn:prior_distribution}
729: 	R = \left\{ \rsk = \frac{\ask}{\alpha_k} , 
730: 	\rsks = \frac{\asks}{\ask} \right\}
731: \end{equation}
732: the negative logarithm of the partition function can be written
733: \begin{eqnarray}
734: 	- \log \mathcal{Z} & = & \sum_{\hk,s} \log \Gamma 
735: 	\left[ \alpha_k \rsk \rsks \right]
736: 	\\ & - & \sum_{\hk} \log \Gamma \left[ \alpha_k \rsk \right] 
737: 	+  \sum_{\hk} \log \Gamma \left[ \beta_k \qsk \right] \nonumber \\
738: 	& - & \sum_{\hk,s} \log \Gamma 
739: 	\left[ \beta_k \qsk \qsks \right]. \nonumber
740: \end{eqnarray}
741: 
742: From this expression, the desired expectation is found by taking derivatives
743: with respect to $\beta_k$; we find that
744: \begin{gather}
745: 	\avg{\, \EQP \, }{\rm{post}} 
746: 		= \frac{1}{\log 2}
747: 		\sum_{\hk} \qsk \psi^{(0)} \left[ \beta_k \qsk \right] 
748: 		\nonumber \\
749: 	-  \frac{1}{\log 2} \sum_{\hk,s} \qsk \qsks \psi^{(0)} 
750: 	\left[ \beta_k \qsk \qsks \right]~. \nonumber \\
751: 	\label{eqn:average_info}
752: \end{gather} 
753: The variance is obtained by taking a second derivative with respect to
754: $\beta_k$, producing
755: 
756: \begin{gather}
757: 	\var{\, \EQP \, }{\rm{post}}  = 
758: 	- \frac{1}{\log 2} \sum_{\hk} \qsk^2 \psi^{(1)} \left[ \beta_k \qsk \right]
759: 	\nonumber \\
760: 	+  \frac{1}{\log 2} \sum_{\hk,s} \qsk^2 \qsks^2 \psi^{(1)} 
761: 	\left[ \beta_k \qsk \qsks \right]. \nonumber \\
762: 	\label{eqn:variance_info}
763: \end{gather} 
764: In both of the above, the polygamma function is defined as $\psi^{(n)}(x) =
765: d^{n+1}/dx^{n+1} \log \Gamma(x)$. (For further details, consult a reference 
766: such as~\cite{Abramowitz1965}.)
767: 
768: The meaning of~\eqnref{eqn:average_info}
769: and~\eqnref{eqn:variance_info} is not immediately clear from their form.
770: However, we can use an expansion of the $n=0$ polygamma function,
771: \begin{equation}
772: \psi^{(0)}(x) = \log x - \frac{1}{2x} + \mathcal{O}(x^{-2}) ~,
773: \end{equation}
774: valid for $x \gg 1$, to obtain an asymptotic form
775: for~\eqnref{eqn:average_info}; we find
776: \begin{gather}
777:  	\avg{\, \EQP \, }{\rm{post}} = 
778:  	H[ \qsk \qsks ] - H[\qsk] \nonumber \\
779: 	+ \frac{1}{2\beta_k} \Asize^k(\Asize -1 )
780: 	+ \mathcal{O}(1/ \beta_k^2)
781: 	\label{eqn:average_info_asymptotic}.
782: \end{gather}
783: From this we see that the first two terms make up the entropy 
784: rate $\hmu [ Q ] = H[ \qsk \qsks ] - H[\qsk]$ and the last 
785: term is associated with the conditional relative entropy between the posterior
786: mean distribution $Q$ and true distribution $P$.
787: 
788: In summary, we have found the average of conditional relative entropy and
789: entropy rate with respect to the posterior density.  This was accomplished by
790: making connections to statistical mechanics through type theory.  Unlike
791: sampling from the posterior to estimate the entropy rate, this method results
792: in an analytic form which approaches $\hmu [ P ]$ as the inverse of the data
793: size. This method for approximating $\hmu$ also provides a computational
794: benefit. No eigenstates have to be found from the Markov transition matrix,
795: allowing for the storage of values in sparse data structures. This provides
796: a distinct computational advantage when large orders or alphabets are
797: considered.  
798: 
799: Finally, it might seem awkward to use the expectation 
800: of~\eqnref{eqn:info_energy} for estimation of the entropy rate.  This method
801: was chosen because it is the form that naturally appears in writing down the
802: likelihood-prior combination in~\eqnref{eqn:info_prior_likelihood}.  As a result
803: of using this method, most of the results obtained above are without
804: approximation.  We were also able to show this expectation converges to the
805: desired value in a well-behaved manner.
806: 
807: %%
808: %%
809: \vspace{-0.125in}
810: \section{Examples}
811: \vspace{-0.125in}
812: 
813: To explore how the above produces a robust inference procedure, let's now
814: consider the statistical inference of a series of increasingly complex data
815: sources. The first, called the \emph{golden mean} process, is a first-order
816: Markov chain. The second data source is called the \emph{even process} and
817: cannot be represented by a Markov chain with finite order. However, this source
818: is a deterministic HMM, meaning that the current state and next output symbol
819: uniquely determine the next state.  Finally, we consider the \emph{simple
820: nondeterministic source}, so named since its smallest representation is as
821: a nondeterministic HMM. (Nondeterminism here refers to the HMM structure: the
822: current state and next output symbol do not uniquely determine the next state. 
823: This source is represented by an infinite-state deterministic HMM
824: \cite{Crutchfield1994,Upper1997}.)
825: 
826: The golden mean, even, and simple nondeterministic processes can all be written
827: down as models with two internal states---call them $A$ and $B$.  However, the
828: complexity of the data generated from each source is of markedly different
829: character. Our goal in this section is to consider the three main steps in
830: inference to analyze them. First, we consider inference of a first-order Markov
831: chain to demonstrate the
832: estimation of model parameters with uncertainty.  Second, we consider model
833: comparison for a range of orders $k$.  This allows us to discover structure in
834: the data source even though the true model class cannot be captured in all 
835: cases. Finally, we consider estimation of entropy rates from these data sources,
836: investigating how randomness is expressed in them.
837: 
838: While investigating these processes we consider average data counts,
839: rather than sample counts from specific realizations, as we want
840: to focus specifically on the average performance of Bayesian inference.  To
841: do this we take advantage of the known form of the sources. Each is described
842: by a transition matrix $T$, which gives transitions between states
843: $A$ and $B$:
844: \begin{equation}
845: 	\label{eqn:transition_matrix_definition}
846: 	T = \left[ \begin{array}{cc}
847: 	p(A\vert A) & p(B\vert A) \\
848: 	p(A\vert B) & p(B\vert B) 
849: 	\end{array}
850: 	\right] \;.
851: \end{equation}
852: Although two of our data sources are not finite Markov chains, the transition
853: matrix between internal states is Markov.  This means the matrix
854: is \emph{stochastic} (all rows sum to one) and we are guaranteed an eigenstate
855: $\vec{\pi}$ with eigenvalue one: $\vec{\pi} \, T = \vec{\pi}$.  This eigenstate
856: describes the asymptotic distribution over internal states:
857: $\vec{\pi} = \left[ p(A), p(B) \right]$.
858: 
859: The transition matrix can be divided into labeled matrices $T^{(s)}$ which
860: contain those elements of $T$ that output symbol $s$. For our binary data
861: sources one has
862: \begin{equation}
863: 	\label{eqn:transition_matrix}
864: 	T = T^{(0)} + T^{(1)}.
865: \end{equation}
866: Using these matrices, the average probability of words can be estimated for
867: each process of interest. For example, the probability of word $01$ can be
868: found using
869: \begin{equation}
870: p(01) = \vec{\pi} \, T^{(0)}T^{(1)} \vec{\eta} ~,
871: \end{equation}
872: where $\vec{\eta}$ is a column vector with all $1$'s. In this way, for any
873: data size $N$, we estimate the average count for a word as
874: \begin{equation}
875: \nsks = (N-k)~p(\hk s) ~.
876: \end{equation}
877: Average counts, obtained this way, will be the basis for all of
878: the examples to follow.
879: 
880: In the estimation of the true entropy rate for the examples we use the formula
881: \begin{equation}
882: 	h_{\mu} = - \sum_{v \in \{A,B\}} p(v)
883: 	\sum_{s \in \mathcal{A}} ~p(s\vert v) \log_2 p(s\vert v)
884: 	\label{eqn:entropy_rate}
885: \end{equation}
886: for the golden mean and even processes, where
887: $p(s\vert v) = T^{(s)}_{v \cdot}$ is the probability of a letter $s$ given the
888: state $v$ and $p(v)$ is the asymptotic probability of the state $v$ which can be
889: found as noted above. For the simple nondeterministic source this closed-form
890: expression cannot be applied and the entropy rate must be found using more
891: involved methods; see~\cite{Crutchfield1994} for further details.
892: 
893: %%
894: %%
895: \subsection{Golden mean process: In-class modeling}
896: 
897: The \emph{golden mean process} can be represented by a simple $1$st-order
898: Markov chain over a binary alphabet characterized by a single (shortest)
899: forbidden word $s^2 = 00$. The defining labeled transition matrices for this data
900: source are given by
901: \begin{equation}
902: 	\label{eqn:label_transition_matrix_golden_mean}
903: 	T^{(0)} = \left[ \begin{array}{cc}
904: 	0 & 1/2 \\
905: 	0 & 0 
906: 	\end{array}
907: 	\right] \; , \;
908: 	T^{(1)} = \left[ \begin{array}{cc}
909: 	1/2 & 0 \\
910: 	1	& 0 
911: 	\end{array}
912: 	\right]	 ~.
913: \end{equation}
914: \begfigref{fig:golden_mean} provides a graphical representation of the
915: corresponding hidden Markov chain. Inspection reveals a simple relation
916: between the \emph{internal states} $A$ and $B$ and the output symbols
917: $0$ and $1$. An observation of $0$ indicates a transition to internal
918: state $B$ and a $1$ corresponds to state $A$, making this process a Markov
919: chain over $0$s and $1$s. 
920: 
921: %%
922: %% Beamer Implementation
923: %%
924: \begin{figure}[htb]
925: \begin{center}
926: 		%options for the plot:
927: 			%-states
928: 			\SetStateLabelScale{1.6}
929: 			\SetStateLineWidth{1.4pt}
930: 			%-edges
931: 			\SetEdgeLabelScale{1.4}
932: 			\SetEdgeLineWidth{0.75pt}
933: 		
934: 		\begin{VCPicture}{(0,0)(5,2)}
935: 			%states
936: 			\ChgStateLabelScale{0.8}
937: 				\State[A]{(1,0)}{A}
938: 				\State[B]{(4,0)}{B}
939: 			\ChgEdgeLabelScale{0.7}
940: 			%transitions
941: 				\LoopW{A}{ 1 | 1/2 } 
942: 				\LArcR[0.5]{B}{A}{ 1 | 1 } 
943: 				\LArcR[0.5]{A}{B}{ 0 | 1/2 }
944: 		\end{VCPicture}
945: \end{center}
946: \vspace{0.5in}
947: \caption{A deterministic hidden Markov chain for the golden mean process.
948:   Edges are labeled with the output symbol and the transition probability:
949:   \emph{symbol} $\vert$ \emph{probability}.
950:   }
951: \label{fig:golden_mean}
952: \end{figure}
953: 
954: For the golden mean the eigenstate is $\vec{\pi} = \left[ p(A), p(B)
955: \right] = \left[ 2/3 , 1/3 \right]$.  With this vector and the labeled
956: transition matrices any desired word count can be found as discussed above.
957: 
958: %
959: \vspace{-0.125in}
960: \subsubsection{Estimation of $M_1$ Parameters}
961: \vspace{-0.125in}
962: 
963: To demonstrate the effective inference of the Markov chain parameters for the 
964: golden mean process we consider average counts for a variety of data sizes 
965: $N$.  For each size, the marginal posterior for the parameters $p(0\vert 1)$ and
966: $p(1\vert 0)$ is plotted in~\figref{fig:GoldenMean_ParameterEstimates}.  The
967: results demonstrate that the shape of the posterior effectively
968: describes the distribution of possible model parameters at each $N$ and converges
969: to the correct values of $p(0\vert 1)=1/2$ and $p(1\vert 0)=1$ with increasing
970: data.
971: 
972: %% details-
973: %%   code: MarginalPosterior.py in MarkovChainPaper_Code folder.
974: %%   parameters: marginal density plotted for N=50,100,200,400.
975: %%
976: \begin{figure}[htbp]
977: 	\centering
978: 	\includegraphics[width=0.98\columnwidth]{MarginalPosterior_GM.eps}
979: 	\caption{A plot of the inference of $M_1$ model parameters for the 
980: 	golden mean process.  For each data sample size $N$, the marginal posterior is
981: 	plotted for the parameters of interest: $p(0\vert 1)$ in the top panel and
982: 	$p(1\vert 0)$ in the lower panel.  The \textit{true} values of the parameters
983: 	are $p(0\vert 1)=1/2$ and $p(1\vert 0) = 1$.
984: 	\label{fig:GoldenMean_ParameterEstimates}}
985: \end{figure}
986: 
987: Point estimates with a variance can be provided for each of the parameters, but
988: these numbers by themselves can be misleading.  However, the estimates obtained
989: by using the mean and variance of the posterior are a more effective description
990: of the inference process than a maximum likelihood estimate with estimated
991: error given by a Gaussian approximation of the likelihood alone.
992: As~\figref{fig:GoldenMean_ParameterEstimates} demonstrates, in
993: fact, a Gaussian
994: approximation of uncertainty is an ineffective description of our knowledge
995: when the Markov chain parameters are near their upper or lower limits at $0$
996: and $1$. Probably the most effective set of numbers to provide consists of the
997: mean of the posterior and a region of confidence. These would most accurately
998: describe asymmetries in the uncertainty of model parameters. Although we will
999: not do that here, a brief description of finding regions of confidence is
1000: provided in~\appref{app:dirichlet}.
1001: 
1002: %
1003: \vspace{-0.125in}
1004: \subsubsection{Selecting the Model Order $k$}
1005: \vspace{-0.125in}
1006: 
1007: Now consider the selection of the appropriate order $k$ from golden mean
1008: realizations.  As discussed above, the golden mean process is a first-order
1009: Markov chain with $k=1$.  As a result, we would expect model comparison to
1010: select this order from the available possibilities. To demonstrate this,
1011: we consider orders $k=1$--$4$ and perform model comparison with a uniform prior
1012: over orders (\eqnref{eqn:best_model_uniform_prior}) and with a penalty for the 
1013: number of model parameters (\eqnref{eqn:best_model_df_penalty_prior}).
1014: 
1015: %% details-
1016: %%   code: ModelComparison.py in MarkovChainPaper_Code folder.
1017: %%   parameters: length_min=100, length_max=1000, step=5
1018: %%
1019: \begin{figure}[htbp]
1020: 	\centering
1021: 	\includegraphics[width=0.98\columnwidth]{ModelCompare_GM.eps}
1022: 	\caption{Model comparison for Markov chains of order $k=1$--$4$ using
1023: 	average counts from the golden mean process.  Sample sizes from $N=100$ to 
1024: 	$N=1,000$ in steps of $\Delta N=5$ are used to generate these plots.  The top panel
1025: 	displays the model probabilities using a uniform prior over orders $k$.  The
1026: 	bottom panel displays the effect of a penalty for model size.
1027: 	\label{fig:GoldenMean_ModelComparison}}
1028: \end{figure}
1029:  
1030: The results of the model comparisons are given 
1031: in~\figref{fig:GoldenMean_ModelComparison}.  The top panel shows the probability
1032: for each order $k$ as a function of the sample size, using a uniform prior.  For
1033: this prior over orders, $M_1$ is selected with any reasonable amount of
1034: data.   However, there does seem to be a possibility to over-fit for small data
1035: size $N \leq 100$.  The bottom panel shows the model probability with a penalty
1036: prior over model order $k$.  This removes the over-fitting at small data sizes
1037: and produces an offset which must be overcome by the data before higher $k$ is
1038: selected.  This example is not meant to argue for the penalty prior over model
1039: orders.  In fact, Bayesian model comparison with a uniform prior does an
1040: effective job using a relatively small sample size.
1041: 
1042: %
1043: \vspace{-0.125in}
1044: \subsubsection{Estimation of Entropy Rate}
1045: \vspace{-0.125in}
1046: 
1047: We can also demonstrate the convergence of the average of
1048: $\EQP=\KLd [ Q \| P ] + \hmu [Q]$ given in~\eqnref{eqn:average_info} to the
1049: correct entropy rate for the golden mean process.  We choose to show this
1050: convergence for all orders $k=1$--$4$ discussed in the previous section.  This
1051: exercise demonstrates that all orders greater than or equal to $k = 1$
1052: effectively capture the entropy rate. However, the convergence to the correct
1053: value for higher orders $k$ takes more data because of a larger initial value of
1054: $\KLd [ Q \| P ]$.  This larger value is simply due to the larger number of
1055: parameters for higher-order Markov chains.
1056: 
1057: %% details-
1058: %%   code: EntropyEstimation.py in MarkovChainPaper_Code folder.
1059: %%   parameters: length_min=50, length_max=5000, step=50, k=1-4
1060: %%
1061: \begin{figure}[htbp]
1062: 	\centering
1063: 	\includegraphics[width=0.98\columnwidth]{EntropyEstimates_GM.eps}
1064: 	\caption{The convergence of $\avg{\, E(Q,P) \, }{\rm{post}}$ to the true 
1065: 	entropy rate $\hmu = 2/3$ bits per symbol (indicated by the gray horizontal 
1066: 	line) for the golden mean process.  As demonstrated
1067: 	in~\eqnref{eqn:average_info_asymptotic}, the conditional relative 
1068: 	entropy $D[Q \| P ] \rightarrow 0$ as $1/N$.  This results in 
1069: 	the convergence of $\hmu [Q]$ to the true entropy rate.
1070: 	\label{fig:GoldenMean_InfoTheory}}
1071: \end{figure}
1072: 
1073: In evaluating the value of $D[Q \| P ] + \hmu [Q]$ for different sample lengths,
1074: we expect that the \pme \, estimated $Q$ will converge to the true distribution
1075: $P$.  As a result, the conditional relative entropy should go to zero with
1076: increasing $N$.  For the golden mean process, the known value of the entropy 
1077: rate is $\hmu =2/3$ bits per symbol.  Inspection 
1078: of~\figref{fig:GoldenMean_InfoTheory} demonstrates the expected convergence of the 
1079: average from~\eqnref{eqn:average_info} to the true entropy rate.
1080: 
1081: The result of our model comparison from the previous section could also be used
1082: in the estimation of the entropy rate.  As we saw 
1083: in~\figref{fig:GoldenMean_ModelComparison}, there are ranges of sample length $N$
1084: where the probabilities of orders $k=1,2$ are both nonzero.  In principle, an
1085: estimate of $\hmu$ should be made by weighting the values obtained for each
1086: $k$ by the corresponding order probability $P(\MCk \vert D, \mathcal{M})$.  As
1087: we can see from~\figref{fig:GoldenMean_InfoTheory}, the estimates of the entropy
1088: rate for $k=1,2$ are also very similar in this range of $N$.  As a result, this
1089: additional step would not have a large effect for entropy rate estimation.
1090: 
1091: %%
1092: \subsection{Even process: Out-of-class modeling}
1093: 
1094: We now consider a more difficult data source called the \emph{even process}.
1095: The defining labeled transition matrices are given by
1096: \begin{equation}
1097: 	\label{eqn:label_transition_matrix_even}
1098: 	T^{(0)} = \left[ \begin{array}{cc}
1099: 	1/2 & 0 \\
1100: 	0	& 0 
1101: 	\end{array}
1102: 	\right] \; , \;
1103: 	T^{(1)} = \left[ \begin{array}{cc}
1104: 	0 & 1/2 \\
1105: 	1	& 0 
1106: 	\end{array}
1107: 	\right]~.
1108: \end{equation}
1109: 
1110: As can be seen in~\figref{fig:even}, the node-edge structure is identical to
1111: the golden mean process but the output symbols on the edges have been changed
1112: slightly.  As a result of this shuffle, the states $A$ and $B$ can no longer be
1113: associated with a simple sequence of $0$'s and $1$'s.  Whereas the golden mean
1114: has the irreducible set of forbidden words $\mathcal{F} = \{00\}$, the even
1115: process has a countably infinite set
1116: $\mathcal{F} = \{01^{2n+1}0: n=0,1,2,\ldots \}$
1117: \cite{Crutchfield1994}.
1118: \begin{figure}[htb]
1119: \begin{center}
1120: 		%options for the plot:
1121: 			%-states
1122: 			\SetStateLabelScale{1.6}
1123: 			\SetStateLineWidth{1.4pt}
1124: 			%-edges
1125: 			\SetEdgeLabelScale{1.4}
1126: 			\SetEdgeLineWidth{0.75pt}
1127: 		
1128: 		\begin{VCPicture}{(0,0)(5,2)}
1129: 			%states
1130: 			\ChgStateLabelScale{0.8}
1131: 				\State[A]{(1,0)}{A}
1132: 				\State[B]{(4,0)}{B}
1133: 			\ChgEdgeLabelScale{0.7}
1134: 			%transitions
1135: 				\LoopW{A}{ 0 | 1/2 } 
1136: 				\LArcR[0.5]{B}{A}{ 1 | 1 } 
1137: 				\LArcR[0.5]{A}{B}{ 1 | 1/2 }
1138: 		\end{VCPicture}
1139: \end{center}
1140: \vspace{0.5in}
1141: \caption{Deterministic hidden Markov chain representation of the even process.
1142:   This process cannot be represented as a finite-order (nonhidden) Markov chain
1143:   over the output symbols $0$ and $1$. The set of irreducible forbidden words
1144:   $\mathcal{F} = \{01^{2n+1}0: n=0,1,2,\ldots \}$ reflects the fact that the
1145:   process generates blocks of $1$'s, bounded by $0$s, that are \emph{even} in
1146:   length, at any length.  
1147: \label{fig:even}}
1148: \end{figure}
1149: 
1150: In simple terms, the even process produces blocks of $1$'s which are even in
1151: length. This is a much more complicated type of memory than we saw in
1152: the golden mean process.  For the Markov chain model class, where a word of
1153: length $k$ is used to predict the next letter, this would require an
1154: infinite-order $k$. It would be necessary to keep track of all even and odd
1155: strings of $1$'s, irrespective of the length. As a result, the properties of
1156: the even process mean that a finite Markov chain \textit{cannot} represent
1157: this data source.  
1158: 
1159: This example is then a demonstration of what can be learned in a case of
1160: out-of-class modeling. We are interested, therefore, in how well Markov
1161: chains approximate the even process. We
1162: expect that model comparison will select larger $k$ as the size of the data 
1163: sample increases.  Does model selection tell us anything about the
1164: underlying data source despite the inability to exactly capture its properties?
1165: As we will see, we do obtain intriguing hints of the true nature of the even
1166: process from model comparison.  Finally, can we estimate the entropy rate of
1167: the process with a Markov chain?  As we will see, a high $k$ is needed to do
1168: this effectively.
1169: 
1170: %
1171: \subsubsection{Estimation of $M_1$ Parameters}
1172: 
1173: In this section we consider an $M_1$ approximation of the even process.
1174: We expect the resulting model to accurately capture length-$2$ word 
1175: probabilities as $N$ increases.  In this example, we consider the \emph{true}
1176: model to be the best approximation possible by a $k=1$ Markov chain.  From the
1177: labeled transition matrices given above we can calculate the appropriate
1178: values for $p(0\vert 1)$ and $p(1\vert 0)$ using the methods described above.  
1179: Starting from the asymptotic distribution $\vec{\pi} = \left[ p(A), p(B)\right]
1180: = \left[ 2/3, 1/3 \right]$ we obtain $p(0\vert 1)=p(10)/p(1)=1/4$ and $p(1\vert
1181: 0)=p(01)/p(0)=1/2$. 
1182: 
1183: As we can see from~\figref{fig:Even_ParameterEstimates}, a first-order Markov
1184: chain can be inferred without difficulty.  The values obtained are exactly as
1185: expected.  However, these values do not tell us much about the nature
1186: of the data source by themselves. This points to the important role of model
1187: comparison and entropy rate estimation in understanding this data.
1188: 
1189: %% details-
1190: %%   code: MarginalPosterior.py in MarkovChainPaper_Code folder.
1191: %%   parameters: marginal density plotted for N=50,100,200,400.
1192: %%
1193: \begin{figure}[htbp]
1194: 	\centering
1195: 	\includegraphics[width=0.98\columnwidth]{MarginalPosterior_EVEN.eps}
1196: 	\caption{A plot of the inference of $M_1$ model parameters for the even 
1197: 	process.  For a variety of sample sizes $N$, the marginal posterior for
1198: 	$p(0\vert 1)$ (top panel) and $p(1\vert 0)$ (bottom panel) are shown.  The
1199: 	\textit{true} values of the parameters are $p(0\vert 1)=1/4$ and 
1200: 	$p(1\vert 0) = 1/2$.
1201: 	\label{fig:Even_ParameterEstimates}}
1202: \end{figure}
1203: 
1204: %
1205: \subsubsection{Selecting the Model Order $k$}
1206: 
1207: Now consider the selection of Markov chain order $k=1$--$4$ for a range of data
1208: sizes $N$. Recall that the even process cannot be represented by a finite-order
1209: Markov chain over the output symbols $0$ and $1$. As a consequence, we expect
1210: higher $k$ to be selected with increasing data $N$, as more data statistically
1211: justifies more complex models. This is what happens, in fact, but the way in
1212: which orders are selected as we increase $N$ provides structural information
1213: we could not obtain from the inference of a Markov chain of fixed order.
1214: 
1215: %% details-
1216: %%   code: ModelComparison.py in MarkovChainPaper_Code folder.
1217: %%   parameters: length_min=100, length_max=1000, step=5
1218: %%
1219: \begin{figure}[htbp]
1220: 	\centering
1221: 	\includegraphics[width=0.98\columnwidth]{ModelCompare_EVEN.eps}
1222: 	\caption{Model comparison for Markov chains of order $k=1$--$4$ for
1223: 	average data from the even process.  The top panel shows the model 
1224: 	comparison with a uniform prior over the possible orders $k$.  The bottom 
1225: 	panel demonstrates model comparison with a penalty for the number of model
1226: 	parameters.  In both cases the $k=4$ model is chosen over lower orders as the
1227: 	amount of data available increases.
1228: 	\label{fig:Even_ModelComparison}}
1229: \end{figure}
1230: 
1231: If we consider~\figref{fig:Even_ModelComparison}, an interesting pattern becomes
1232: apparent.  Orders with even $k$ are preferred over odd. In this way model
1233: selection is hinting at the underlying structure of the source. The Markov
1234: chain model class cannot represent the even process in a compact way, but
1235: inference and model comparison combined provide useful information about
1236: the hidden structure of the source.
1237: 
1238: In this example we also have regions where multiple orders $k$
1239: are equally probable.  The sample size at which this occurs depends on the prior
1240: over orders which is employed.  When this happens, properties estimated from the
1241: Markov chain model class should use a weighted sum of the various orders. As we
1242: will see in the estimation of entropy rates, this is not as critical. At
1243: sample sizes where the order probabilities are similar, the estimated entropy
1244: rates are also similar.
1245: 
1246: %
1247: \subsubsection{Estimation of Entropy Rate}
1248: 
1249: Entropy rate estimation for the even process turns out to be a more
1250: difficult task than one might expect.  In~\figref{fig:Even_InfoTheory} we see
1251: that Markov chains of orders $1$--$6$ are unable to effectively capture the true
1252: entropy rate.  In fact, experience shows that an order $k=10$ Markov chain or
1253: higher is needed to get close to the true value of $\hmu = 2/3$ bits per symbol.
1254: Note also that realizations longer by a factor of $20$ are required compared,
1255: say, to the golden mean example.
1256: 
1257: %% details-
1258: %%   code: EntropyEstimation.py in MarkovChainPaper_Code folder.
1259: %%   parameters: length_min=100, length_max=20000, step=100, k=1-6
1260: %%
1261: \begin{figure}[htbp]
1262: 	\centering
1263: 	\includegraphics[width=0.98\columnwidth]{EntropyEstimates_EVEN.eps}
1264: 	\caption{The convergence of $\avg{\, D[Q \| P ] + \hmu [Q] \, }{\rm{post}}$ 
1265: 	to the true entropy rate $\hmu = 2/3$ bits per symbol for the even
1266: 	process.  The true value is indicated by the horizontal gray line.  Experience
1267: 	shows that a $k=10$ Markov chain is needed to effectively approximate the true
1268: 	value of $\hmu$.
1269: 	\label{fig:Even_InfoTheory}}
1270: \end{figure}
1271: 
1272: As discussed above, a weighted sum of 
1273: $\avg{\, D[Q \| P ] + \hmu [Q] \, }{\rm{post}}$ could be employed in this
1274: example.  For the estimate this is not critical because the different orders
1275: provide roughly the same value at these points.  In fact, these points
1276: correspond to where the estimates of $E(Q,P)$ cross 
1277: in~\figref{fig:Even_InfoTheory}. They are sample sizes where apparent
1278: randomness can be explained by structure and increased order $k$.
1279: 
1280: 
1281: %%
1282: \subsection{Simple Nondeterministic Source: Out-of-class modeling}
1283: 
1284: The simple nondeterministic source adds another level of challenge to inference.
1285: As its name suggests, it is described by a nondeterministic HMM.  
1286: Considering~\figref{fig:sns} we can see that a $1$ is produced on every
1287: transition except for the $B \rightarrow A$ edge.  This means there are many
1288: paths through the internal states that produce the same observable sequence of
1289: $0$s and $1$s. The defining labeled transition matrices for this process are
1290: given by
1291: \begin{equation}
1292: 	\label{eqn:label_transition_matrix_sns}
1293: 	T^{(0)} = \left[ \begin{array}{cc}
1294: 	0 & 0 \\
1295: 	1/2	& 0 
1296: 	\end{array}
1297: 	\right] \; , \;
1298: 	T^{(1)} = \left[ \begin{array}{cc}
1299: 	1/2 & 1/2 \\
1300: 	0	& 1/2 
1301: 	\end{array}
1302: 	\right]~.
1303: \end{equation}
1304: 
1305: Using the state-to-state transition matrix $T=T^{(0)}+T^{(1)}$, we find the
1306: asymptotic distribution for the hidden states to be
1307: $\vec{\pi} = \left[ p(A), p(B) \right] = \left[1/2, 1/2 \right]$. Each of
1308: the hidden states is equally likely; however, a $1$ is always produced from
1309: state $A$, while there is an equal chance of obtaining a $0$
1310: or $1$ from state $B$.
1311: 
1312: \begin{figure}[htb]
1313: \begin{center}
1314: 		%options for the plot:
1315: 			%-states
1316: 			\SetStateLabelScale{1.6}
1317: 			\SetStateLineWidth{1.4pt}
1318: 			%-edges
1319: 			\SetEdgeLabelScale{1.4}
1320: 			\SetEdgeLineWidth{0.75pt}
1321: 		
1322: 		\begin{VCPicture}{(0,0)(5,2)}
1323: 			%states
1324: 			\ChgStateLabelScale{0.8}
1325: 				\State[A]{(1,0)}{A}
1326: 				\State[B]{(4,0)}{B}
1327: 			\ChgEdgeLabelScale{0.7}
1328: 			%transitions
1329: 				\LoopW{A}{ 1 | 1/2 }
1330: 				\LoopE{B}{ 1 | 1/2 } 
1331: 				\LArcR[0.5]{B}{A}{ 0 | 1/2 } 
1332: 				\LArcR[0.5]{A}{B}{ 1 | 1/2 }
1333: 		\end{VCPicture}
1334: \end{center}
1335: \vspace{0.5in}
1336: \caption{A hidden Markov chain representation of the simple nondeterministic
1337:   process. This example also cannot be represented as a finite-order Markov
1338:   chain over outputs $0$ and $1$. It, however, is more complicated than the
1339:   two previous examples: Only the observation of a $0$ provides the observer
1340:   with information regarding the internal state of the underlying process;
1341:   observing a $1$ leaves the internal state ambiguous.
1342: \label{fig:sns}}
1343: \end{figure}
1344: 
1345: %
1346: \subsubsection{Estimation of $M_1$ Parameters}
1347: 
1348: Using the asymptotic distribution derived above, the parameters of an inferred
1349: first-order Markov chain should approach $p(0\vert 1)=p(10)/p(1)=1/3$ and
1350: $p(1\vert 0)=p(01)/p(0)=1$.  As we can see
1351: from~\figref{fig:SNS_ParameterEstimates}, the inference
1352: process captures these values very effectively despite the out-of-class data
1353: source.  
1354: 
1355: %% details-
1356: %%   code: MarginalPosterior.py in MarkovChainPaper_Code folder.
1357: %%   parameters: marginal density plotted for N=50,100,200,400.
1358: %%
1359: \begin{figure}[htbp]
1360: 	\centering
1361: 	\includegraphics[width=0.98\columnwidth]{MarginalPosterior_SNS.eps}
1362: 	\caption{Marginal density for $M_1$ model parameters for the
1363: 	simple nondeterministic process:  The curves for each data size $N$
1364: 	demonstrate a well behaved convergence to the correct values:
1365: 	$p(0\vert 1)=1/3$ and $p(1\vert 0) = 1$.
1366: 	\label{fig:SNS_ParameterEstimates}}
1367: \end{figure}
1368: 
1369: %
1370: \subsubsection{Selecting the Model Order $k$}
1371: 
1372: Here we consider the comparison of Markov chain models of orders $k=1$--$4$ when
1373: applied to data from the simple nondeterministic source.  As with the even
1374: process, we expect increasing order to be selected as the amount of available
1375: data increases.  In~\figref{fig:SNS_ModelComparison} we see that this is
1376: exactly what happens.
1377: 
1378: %% details-
1379: %%   code: ModelComparison.py in MarkovChainPaper_Code folder.
1380: %%   parameters: length_min=100, length_max=1.5e5, step=50
1381: %%
1382: \begin{figure}[htbp]
1383: 	\centering
1384: 	\includegraphics[width=0.98\columnwidth]{ModelCompare_SNS.eps}
1385: 	\caption{Model comparison for Markov chains of orders $k=1$--$4$ for 
1386: 	data from the simple nondeterministic process.  The top panel 
1387: 	shows the model comparison with a uniform prior over the possible orders 
1388: 	$k$.  The bottom panel demonstrates model comparison with a penalty for the 
1389: 	number of model parameters.  Note the scale on the horizontal axis---it
1390: 	takes much more data for the model comparison to pick out higher orders
1391: 	for this process compared to the previous examples.
1392: 	\label{fig:SNS_ModelComparison}}
1393: \end{figure}
1394: 
1395: Unlike the even process, there is no preference for even orders.  Instead, we
1396: observe a systematic increase in order with larger data sets.  We do note that
1397: the amount of data needed to select a higher order does seem to be larger than for
1398: the even process.  Here the distribution over words is more important and more
1399: subtle than the support of the distribution (those words with positive
1400: probability).
1401: 
1402: %
1403: \subsubsection{Estimation of Entropy Rate}
1404: 
1405: Estimation of the entropy rate for the simple nondeterministic source provides
1406: an interesting contrast to the previous examples. As discussed when introducing
1407: the examples, this data source is a nondeterministic HMM and the entropy rate
1408: cannot be directly calculated
1409: using~\eqnref{eqn:entropy_rate}~\cite{Blackwell1957}. However, a
1410: value of $\hmu \approx 0.677867$ bits per symbol has been obtained
1411: in~\cite{Crutchfield1994}. 
1412: 
1413: %% details-
1414: %%   code: EntropyEstimation.py in MarkovChainPaper_Code folder.
1415: %%   parameters: length_min=100, length_max=20000, step=100, k=1-6
1416: %%
1417: \begin{figure}[htbp]
1418: 	\centering
1419: 	\includegraphics[width=0.98\columnwidth]{EntropyEstimates_SNS.eps}
1420: 	\caption{The convergence of $\avg{\, D[Q \| P ] + 
1421: 	\hmu [Q] \, }{\rm{post}}$ to the true entropy rate $\hmu \approx 0.677867$ 
1422: 	bits per symbol for the simple nondeterministic source.  The true value is
1423: 	indicated by the gray horizontal line.
1424: 	\label{fig:SNS_InfoTheory}}
1425: \end{figure}
1426: 
1427: \begfigref{fig:SNS_InfoTheory} shows the results of entropy-rate estimation
1428: using Markov chains of orders $k=1$--$6$.  These results demonstrate that the
1429: entropy rate can be effectively estimated with low order $k$ and relatively
1430: small data samples.  This is an interesting result, as we might expect
1431: estimation of the entropy rate to be most difficult in this example.  Instead we
1432: find that the even process was a more difficult test case.
1433: 
1434: %%
1435: %%
1436: \section{Discussion}
1437: 
1438: The examples presented above provide several interesting lessons in inference,
1439: model comparison, and estimating randomness. The combination of these three
1440: ideas applied to a data source provides information and intuition about the
1441: structure of the underlying system, even when modeling out-of-class processes.
1442: 
1443: In the examples of $\MC_{1}$ estimates for each of the sources we see that
1444: the Bayesian methods provide a powerful and consistent description of Markov
1445: chain model parameters.  The marginal density accurately describes the
1446: uncertainty associated with these estimates, reflecting asymmetries which point
1447: estimation with error bars cannot capture.  In addition, methods described
1448: in~\appref{app:dirichlet} can be used to generate regions of confidence of any
1449: type. 
1450: 
1451: Although the estimates obtained for the Markov chain model parameters were
1452: consistent with the data source for words up to length $k+1$, they did not capture
1453: the true nature of the system under consideration.  This demonstrates that
1454: estimation of model parameters without some kind of model comparison can be very
1455: misleading.  Only with the comparison of different orders did some indication
1456: of the true properties of the data source become clear.  Without this step, 
1457: misguided interpretations are easily obtained.
1458: 
1459: For the golden mean process, a $k=1$ Markov chain, the results of model
1460: comparison were predictably uninteresting.  This is a good indication that the
1461: correct model class is being employed.  However, with the even process a much
1462: more complicated model comparison was found.  In this case, a selection of even
1463: $k$ over odd hinted at the distinguishing properties of the source. In a
1464: similar way, the results of model comparison for the simple nondeterministic
1465: source selected increasing order with larger $N$.  In both out-of-class
1466: modeling examples, the increase in selected order without end is a good
1467: indication that the data source is not in the Markov chain class. (A parallel
1468: technique is found in \emph{hierarchical $\epsilon$-machine reconstruction}
1469: \cite{Crutchfield1994}.) Alternatively, there is an indication that
1470: very high-order dependencies are important in the description of the process. 
1471: Either way, this information is important since it gives an indication to the
1472: modeler that a more complicated dynamic is at work and all results must be
1473: treated with caution.
1474: 
1475: Finally, we considered the estimation of entropy rates for the example data
1476: sources.  In two of the cases, the golden mean process and the simple
1477: nondeterministic source, short data streams were adequate.  This is not
1478: unexpected for the golden mean, but for the simple nondeterministic source this
1479: might be considered surprising.  For the even process, the estimation of the
1480: entropy rate was markedly more difficult.  For this data source, the countably
1481: infinite number of forbidden words makes the support of the word distribution
1482: at a given length important.  As a result, a larger amount of data and a
1483: higher-order Markov chain are needed to find a decent estimate of randomness
1484: from that data source. In this way, each of the steps in Bayesian
1485: inference allows one to separate structure from randomness. 
1486: 
1487: %%
1488: %%
1489: \section{Conclusion}
1490: 
1491: We considered Bayesian inference of $k$-th order Markov chain
1492: models.  This included estimating model parameters for a given $k$, model
1493: comparison between orders, and estimation of randomness in the form of entropy
1494: rates.  In most approaches to inference, these three aspects are treated as
1495: separate but related endeavors.  However, we find them to be intimately
1496: related.  An estimate of model parameters without a sense of whether the
1497: correct model is being used is misguided at best.  Model comparison
1498: provides a window into this problem by comparing various orders $k$ within the
1499: model class.  Finally, estimating randomness in the form of an entropy rate
1500: provides more information about the trade-off between structure and randomness. 
1501: To do this we developed a connection to the statistical mechanical partition
1502: function, from which averages and variances were directly calculable. For the
1503: even process, structure was perceived as randomness, and for the simple
1504: nondeterministic source,
1505: randomness was easily estimated and structure was more difficult to find.
1506: These insights, despite the out-of-class data, demonstrate the power of
1507: combining these three methods into one effective tool for investigating
1508: structure and randomness in finite strings of discrete data.
1509: 
1510: %
1511: % acknowledgments
1512: %
1513: \section*{Acknowledgments}
1514: This work was partially supported at the Center for Computational Science
1515: and Engineering at the University of California at Davis by Intel
1516: Corporation. Work at the Santa Fe Institute was supported under its
1517: Computation, Dynamics, and Inference Program core grants from the
1518: National Science and MacArthur Foundations. C.S. and A.H. acknowledge
1519: support by the National Science Foundation Grant DMS 03-25939 ITR.
1520: 
1521: %
1522: % appendices
1523: %
1524: \appendix
1525: 
1526: %
1527: % Dirichlet Appendix
1528: %
1529: \section{}
1530: \label{app:Dirichlet}
1531: 
1532: \subsection{Dirichlet Distribution\label{app:dirichlet}}
1533: 
1534: We supply a brief overview of the Dirichlet distribution for completeness.  For 
1535: more information, a reference such as~\cite{Wilks1962} should be consulted.  In 
1536: simple terms, the Dirichlet distribution is the multinomial generalization of 
1537: the Beta distribution.  The probability density function for $q$ elements is
1538: given by
1539: \begin{equation}
1540: 	\label{eqn:dirichlet_pdf}
1541: 	\text{Dir}( \{ p_{i} \} )
1542: 	=
1543: 	\frac{ \Gamma( \alpha ) }{\prod_{i=0}^{q-1} \Gamma( \alpha_{i} ) }
1544: 	\delta(1-\sum_{i=0}^{q-1} p_{i})
1545: 	\prod_{i=0}^{q-1} p_{i}^{\alpha_{i}-1}.
1546: \end{equation}
1547: 
1548: The variates must satisfy $p_i \in [0,1]$ and $\sum_{i=0}^{q-1} p_{i} = 1$. The 
1549: hyperparameters $\{ \alpha_{i} \}$ of the distribution must be real and 
1550: positive and we use the notation $\alpha = \sum_{i=0}^{q-1} \alpha_{i}$.  The
1551: average, variance, and covariance of the parameters $p_{i}$ are
1552: given by, respectively,
1553: \begin{eqnarray}
1554: 	\avg{p_{j}}{} & = & \frac{ \alpha_{j} }{ \alpha }, 
1555: 	\label{eqn:dirichlet_average}\\
1556: 	\var{p_{j}}{} & = & \frac{ \alpha_{j}\left( \alpha - \alpha_{j} \right) 
1557: 	}{ \alpha^{2} \left( 1+ \alpha \right) },
1558: 	\label{eqn:dirichlet_variance}\\
1559: 	\cov{p_{j}}{p_{l}}	& = & - \frac{ \alpha_{j} \alpha_{l} 
1560: 	}{ \alpha^{2} \left( 1+ \alpha \right) } \; , \; j \neq l. 
1561: 	\label{eqn:dirichlet_covariance}
1562: \end{eqnarray}
1563: 
1564: %%
1565: %%
1566: \subsection{Marginal distributions\label{app:dirichlet_marginal}}
1567: 
1568: An important part of understanding uncertainty in the inference process is the
1569: ability to find regions of confidence from a marginal density.  The marginal is
1570: obtained from the posterior by integrating out the dependence on all parameters
1571: except for the parameter of interest.  For a Dirichlet distribution, the
1572: marginal density is known to be a Beta distribution~\cite{Wilks1962},
1573: \begin{equation}
1574: 	\label{eqn:beta_pdf}
1575: 	\text{Beta}( p_{i} )
1576: 	=
1577: 	\frac{ \Gamma( \alpha ) }{\Gamma( \alpha_{i} ) \Gamma( \alpha - \alpha_{i} ) }
1578: 	 p_{i}^{\alpha_{i}-1} \left( 1 - p_{i} \right)^{\alpha - \alpha_{i}-1}.
1579: \end{equation}
1580: 
1581: %%
1582: %%
1583: \subsection{Regions of confidence from the marginal density}
1584: 
1585: From the marginal density provided in~\eqnref{eqn:beta_pdf} a cumulative
1586: distribution function can be obtained using the incomplete Beta integral
1587: \begin{equation}
1588: 	\Pr(p_{i} \leq x) = \int_{0}^{x} \, dp_{i} \, \text{Beta}(p_{i}) ~.
1589: 	\label{eqn:beta_cdf}
1590: \end{equation}
1591: Using this form, the probability that a Markov chain parameter will be between
1592: $a$ and $b$ can be found using $\Pr( a \leq p_{i} \leq b) = \Pr( p_{i} \leq b) -
1593: \Pr( p_{i} \leq a)$.  For a confidence level $R$, between zero and one, we then
1594: want to find $(a,b)$ such that $R=\Pr( a \leq p_{i} \leq b)$.  The incomplete
1595: Beta integral and its inverse can be found using computational methods;
1596: see~\cite{Majumder1973,Majumder1973a,Cran1977,Berry1990} for details.
1597: 
1598: %
1599: % the bibliography
1600: %
1601: \begin{thebibliography}{29}
1602: \expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi
1603: \expandafter\ifx\csname bibnamefont\endcsname\relax
1604:   \def\bibnamefont#1{#1}\fi
1605: \expandafter\ifx\csname bibfnamefont\endcsname\relax
1606:   \def\bibfnamefont#1{#1}\fi
1607: \expandafter\ifx\csname citenamefont\endcsname\relax
1608:   \def\citenamefont#1{#1}\fi
1609: \expandafter\ifx\csname url\endcsname\relax
1610:   \def\url#1{\texttt{#1}}\fi
1611: \expandafter\ifx\csname urlprefix\endcsname\relax\def\urlprefix{URL }\fi
1612: \providecommand{\bibinfo}[2]{#2}
1613: \providecommand{\eprint}[2][]{\url{#2}}
1614: 
1615: \bibitem[{\citenamefont{Avery and Henderson}(1999)}]{Avery1999}
1616: \bibinfo{author}{\bibfnamefont{P.~J.} \bibnamefont{Avery}} \bibnamefont{and}
1617:   \bibinfo{author}{\bibfnamefont{D.~A.} \bibnamefont{Henderson}},
1618:   \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{48}},
1619:   \bibinfo{pages}{53 } (\bibinfo{year}{1999}).
1620: 
1621: \bibitem[{\citenamefont{Liu and Lawrence}(1999)}]{JSLiu1999}
1622: \bibinfo{author}{\bibfnamefont{J.~S.} \bibnamefont{Liu}} \bibnamefont{and}
1623:   \bibinfo{author}{\bibfnamefont{C.~E.} \bibnamefont{Lawrence}},
1624:   \bibinfo{journal}{Bioinformatics} \textbf{\bibinfo{volume}{15}},
1625:   \bibinfo{pages}{38 } (\bibinfo{year}{1999}).
1626: 
1627: \bibitem[{\citenamefont{Crutchfield and Feldman}(1997)}]{Crutchfield1997}
1628: \bibinfo{author}{\bibfnamefont{J.~P.} \bibnamefont{Crutchfield}}
1629:   \bibnamefont{and} \bibinfo{author}{\bibfnamefont{D.~P.}
1630:   \bibnamefont{Feldman}}, \bibinfo{journal}{Phys. Rev. E}
1631:   \textbf{\bibinfo{volume}{55}}, \bibinfo{pages}{R1239 }
1632:   (\bibinfo{year}{1997}).
1633: 
1634: \bibitem[{\citenamefont{MacKay and Peto}(1994)}]{MacKay1994}
1635: \bibinfo{author}{\bibfnamefont{D.~J.~C.} \bibnamefont{MacKay}}
1636:   \bibnamefont{and} \bibinfo{author}{\bibfnamefont{L.~C.~B.}
1637:   \bibnamefont{Peto}}, \bibinfo{journal}{Nat. Lang. Eng.}
1638:   \textbf{\bibinfo{volume}{1}} (\bibinfo{year}{1994}).
1639: 
1640: \bibitem[{\citenamefont{Crutchfield and Packard}(1983)}]{Crutchfield1983}
1641: \bibinfo{author}{\bibfnamefont{J.~P.} \bibnamefont{Crutchfield}}
1642:   \bibnamefont{and} \bibinfo{author}{\bibfnamefont{N.~H.}
1643:   \bibnamefont{Packard}}, \bibinfo{journal}{Physica D}
1644:   \textbf{\bibinfo{volume}{7D}}, \bibinfo{pages}{201 } (\bibinfo{year}{1983}).
1645: 
1646: \bibitem[{\citenamefont{Hao and Zheng}(1998)}]{BLHao1998}
1647: \bibinfo{author}{\bibfnamefont{B.-L.} \bibnamefont{Hao}} \bibnamefont{and}
1648:   \bibinfo{author}{\bibfnamefont{W.-M.} \bibnamefont{Zheng}},
1649:   \emph{\bibinfo{title}{Applied Symbolic Dynamics and Chaos}}
1650:   (\bibinfo{publisher}{World Scientific}, \bibinfo{year}{1998}).
1651: 
1652: \bibitem[{\citenamefont{Anderson and Goodman}(1957)}]{TWAnderson1957}
1653: \bibinfo{author}{\bibfnamefont{T.~W.} \bibnamefont{Anderson}} \bibnamefont{and}
1654:   \bibinfo{author}{\bibfnamefont{L.~A.} \bibnamefont{Goodman}},
1655:   \bibinfo{journal}{Ann. Math. Stat.} \textbf{\bibinfo{volume}{28}},
1656:   \bibinfo{pages}{89 } (\bibinfo{year}{1957}).
1657: 
1658: \bibitem[{\citenamefont{Billingsley}(1961)}]{Billingsley1961a}
1659: \bibinfo{author}{\bibfnamefont{P.}~\bibnamefont{Billingsley}},
1660:   \bibinfo{journal}{Ann. Math. Stat.} \textbf{\bibinfo{volume}{32}},
1661:   \bibinfo{pages}{12 } (\bibinfo{year}{1961}).
1662: 
1663: \bibitem[{\citenamefont{Chatfield}(1973)}]{Chatfield1973}
1664: \bibinfo{author}{\bibfnamefont{C.}~\bibnamefont{Chatfield}},
1665:   \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{22}},
1666:   \bibinfo{pages}{7} (\bibinfo{year}{1973}).
1667: 
1668: \bibitem[{\citenamefont{Tong}(1975)}]{HTong1975}
1669: \bibinfo{author}{\bibfnamefont{H.}~\bibnamefont{Tong}}, \bibinfo{journal}{Jour.
1670:   Appl. Prob.} \textbf{\bibinfo{volume}{12}}, \bibinfo{pages}{488 }
1671:   (\bibinfo{year}{1975}).
1672: 
1673: \bibitem[{\citenamefont{Katz}(1981)}]{Katz1981}
1674: \bibinfo{author}{\bibfnamefont{R.~W.} \bibnamefont{Katz}},
1675:   \bibinfo{journal}{Technometrics} \textbf{\bibinfo{volume}{23}},
1676:   \bibinfo{pages}{243 } (\bibinfo{year}{1981}).
1677: 
1678: \bibitem[{\citenamefont{Rissanen}(1984)}]{JRissanen1984}
1679: \bibinfo{author}{\bibfnamefont{J.}~\bibnamefont{Rissanen}},
1680:   \bibinfo{journal}{IEEE Trans. Inform. Theory} \textbf{\bibinfo{volume}{30}},
1681:   \bibinfo{pages}{629} (\bibinfo{year}{1984}).
1682: 
1683: \bibitem[{\citenamefont{Vapnik}(1999)}]{VVapnik1999}
1684: \bibinfo{author}{\bibfnamefont{V.}~\bibnamefont{Vapnik}},
1685:   \bibinfo{journal}{IEEE Trans. Neur. Net.} \textbf{\bibinfo{volume}{10}},
1686:   \bibinfo{pages}{988} (\bibinfo{year}{1999}).
1687: 
1688: \bibitem[{\citenamefont{Vit{\'a}nyi and Li}(2000)}]{Vitanyi2000}
1689: \bibinfo{author}{\bibfnamefont{P.~M.} \bibnamefont{Vit{\'a}nyi}}
1690:   \bibnamefont{and} \bibinfo{author}{\bibfnamefont{M.}~\bibnamefont{Li}},
1691:   \bibinfo{journal}{IEEE Trans. Inform. Theory}
1692:   \textbf{\bibinfo{volume}{46(2)}}, \bibinfo{pages}{446}
1693:   (\bibinfo{year}{2000}).
1694: 
1695: \bibitem[{\citenamefont{Baldi and Brunak}(2001)}]{Baldi2001}
1696: \bibinfo{author}{\bibfnamefont{P.}~\bibnamefont{Baldi}} \bibnamefont{and}
1697:   \bibinfo{author}{\bibfnamefont{S.}~\bibnamefont{Brunak}},
1698:   \emph{\bibinfo{title}{Bioinformatics: The Machine Learning Approach}}
1699:   (\bibinfo{publisher}{MIT Press}, \bibinfo{address}{Cambridge},
1700:   \bibinfo{year}{2001}).
1701: 
1702: \bibitem[{\citenamefont{Durbin et~al.}(1998)\citenamefont{Durbin, Eddy, Krogh,
1703:   and Mitchison}}]{Durbin1998}
1704: \bibinfo{author}{\bibfnamefont{R.}~\bibnamefont{Durbin}},
1705:   \bibinfo{author}{\bibfnamefont{S.}~\bibnamefont{Eddy}},
1706:   \bibinfo{author}{\bibfnamefont{A.}~\bibnamefont{Krogh}}, \bibnamefont{and}
1707:   \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Mitchison}},
1708:   \emph{\bibinfo{title}{Biological Sequence Analysis}}
1709:   (\bibinfo{publisher}{Cambridge University Press},
1710:   \bibinfo{address}{Cambridge}, \bibinfo{year}{1998}).
1711: 
1712: \bibitem[{\citenamefont{Cover and Thomas}(1991)}]{Cover1991}
1713: \bibinfo{author}{\bibfnamefont{T.~M.} \bibnamefont{Cover}} \bibnamefont{and}
1714:   \bibinfo{author}{\bibfnamefont{J.~A.} \bibnamefont{Thomas}},
1715:   \emph{\bibinfo{title}{Elements of Information Theory}}
1716:   (\bibinfo{publisher}{Wiley-Interscience}, \bibinfo{address}{New York},
1717:   \bibinfo{year}{1991}).
1718: 
1719: \bibitem[{\citenamefont{MacKay}(2003)}]{MacKay2003}
1720: \bibinfo{author}{\bibfnamefont{D.~J.~C.} \bibnamefont{MacKay}},
1721:   \emph{\bibinfo{title}{Information Theory, Inference, and Learning
1722:   Algorithms}} (\bibinfo{publisher}{Cambridge University Press},
1723:   \bibinfo{address}{Cambridge}, \bibinfo{year}{2003}).
1724: 
1725: \bibitem[{\citenamefont{Samengo}(2002)}]{Samengo2002}
1726: \bibinfo{author}{\bibfnamefont{I.}~\bibnamefont{Samengo}},
1727:   \bibinfo{journal}{Phys. Rev. E} \textbf{\bibinfo{volume}{65}},
1728:   \bibinfo{pages}{46124} (\bibinfo{year}{2002}).
1729: 
1730: \bibitem[{\citenamefont{Young and Crutchfield}(1994)}]{Young1994}
1731: \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Young}} \bibnamefont{and}
1732:   \bibinfo{author}{\bibfnamefont{J.~P.} \bibnamefont{Crutchfield}},
1733:   \bibinfo{journal}{Chaos, Solitons, and Fractals}
1734:   \textbf{\bibinfo{volume}{4}}, \bibinfo{pages}{5 } (\bibinfo{year}{1994}).
1735: 
1736: \bibitem[{\citenamefont{Abramowitz and Stegun}(1965)}]{Abramowitz1965}
1737: \bibinfo{author}{\bibfnamefont{M.}~\bibnamefont{Abramowitz}} \bibnamefont{and}
1738:   \bibinfo{author}{\bibfnamefont{I.~A.} \bibnamefont{Stegun}},
1739:   \emph{\bibinfo{title}{Handbook of Mathematical Functions}}
1740:   (\bibinfo{publisher}{Dover}, \bibinfo{address}{New York},
1741:   \bibinfo{year}{1965}).
1742: 
1743: \bibitem[{\citenamefont{Crutchfield}(1994)}]{Crutchfield1994}
1744: \bibinfo{author}{\bibfnamefont{J.~P.} \bibnamefont{Crutchfield}},
1745:   \bibinfo{journal}{Physica D} \textbf{\bibinfo{volume}{75}},
1746:   \bibinfo{pages}{11} (\bibinfo{year}{1994}).
1747: 
1748: \bibitem[{\citenamefont{Upper}(1997)}]{Upper1997}
1749: \bibinfo{author}{\bibfnamefont{D.~R.} \bibnamefont{Upper}}, Ph.D. thesis,
1750:   \bibinfo{school}{University of California}, \bibinfo{address}{Berkeley}
1751:   (\bibinfo{year}{1997}), \bibinfo{note}{{P}ublished by University Microfilms
1752:   Intl, Ann Arbor, Michigan}.
1753: 
1754: \bibitem[{\citenamefont{Blackwell and Koopmans}(1957)}]{Blackwell1957}
1755: \bibinfo{author}{\bibfnamefont{D.}~\bibnamefont{Blackwell}} \bibnamefont{and}
1756:   \bibinfo{author}{\bibfnamefont{L.}~\bibnamefont{Koopmans}},
1757:   \bibinfo{journal}{Ann. Math. Stat.} \textbf{\bibinfo{volume}{28}},
1758:   \bibinfo{pages}{1011} (\bibinfo{year}{1957}).
1759: 
1760: \bibitem[{\citenamefont{Wilks}(1962)}]{Wilks1962}
1761: \bibinfo{author}{\bibfnamefont{S.~S.} \bibnamefont{Wilks}},
1762:   \emph{\bibinfo{title}{Mathematical Statistics}} (\bibinfo{publisher}{John
1763:   Wiley \& Sons, Inc.}, \bibinfo{address}{New York}, \bibinfo{year}{1962}).
1764: 
1765: \bibitem[{\citenamefont{Majumder and
1766:   Bhattacharjee}(1973{\natexlab{a}})}]{Majumder1973}
1767: \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Majumder}} \bibnamefont{and}
1768:   \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Bhattacharjee}},
1769:   \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{22}},
1770:   \bibinfo{pages}{411} (\bibinfo{year}{1973}{\natexlab{a}}).
1771: 
1772: \bibitem[{\citenamefont{Majumder and
1773:   Bhattacharjee}(1973{\natexlab{b}})}]{Majumder1973a}
1774: \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Majumder}} \bibnamefont{and}
1775:   \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Bhattacharjee}},
1776:   \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{22}},
1777:   \bibinfo{pages}{409} (\bibinfo{year}{1973}{\natexlab{b}}).
1778: 
1779: \bibitem[{\citenamefont{Cran et~al.}(1977)\citenamefont{Cran, Martin, and
1780:   Thomas}}]{Cran1977}
1781: \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Cran}},
1782:   \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Martin}}, \bibnamefont{and}
1783:   \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Thomas}},
1784:   \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{26}},
1785:   \bibinfo{pages}{111} (\bibinfo{year}{1977}).
1786: 
1787: \bibitem[{\citenamefont{Berry et~al.}(1990)\citenamefont{Berry, {P.W. Mielke,
1788:   Jr.}, and Cran}}]{Berry1990}
1789: \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Berry}},
1790:   \bibinfo{author}{\bibnamefont{{P.W. Mielke, Jr.}}}, \bibnamefont{and}
1791:   \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Cran}},
1792:   \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{39}},
1793:   \bibinfo{pages}{309} (\bibinfo{year}{1990}).
1794: 
1795: \end{thebibliography}
1796: 
1797: 
1798: \end{document}
1799: 
1800: