1: % Inferring Markov Chains: Bayesian Estimation,
2: % Model Comparison, Entropy Rate, and Out-of-class Modeling
3: %
4: % ccs: mar 01, 2007
5: % jpc: mar 10, 2007
6: % ccs: mar 13, 2007
7: % jpc: mar 23, 2007
8:
9: \documentclass[pre,twocolumn,showpacs,superscriptaddress,preprintnumbers,floatfix]{revtex4}
10: %\documentclass[pre,showpacs,superscriptaddress,preprintnumbers,floatfix]{revtex4}
11:
12: %
13: %-packages
14: \usepackage{amssymb,amsmath} % math utilities
15: \usepackage{graphicx}% Include figure files
16: \usepackage{bm}% Bold math
17: %\usepackage[pstricks1-10]{vaucanson-g} % for FSA diagrams
18: \usepackage{vaucanson-g} % for FSA diagrams, older version of ps-tricks
19:
20: %
21: %-new user commands
22:
23: %% references %% (the beg* forms spell the word out, for use at the start of a sentence)
24: \newcommand{\eqnref}[1]{Eq.~(\ref{#1})}% mid-sentence equation reference, e.g. Eq. (3)
25: \newcommand{\begeqnref}[1]{Equation~(\ref{#1})}% sentence-initial form; parenthesized to match \eqnref
26: \newcommand{\figref}[1]{Fig.~\ref{#1}}% mid-sentence figure reference
27: \newcommand{\begfigref}[1]{Figure~\ref{#1}}% sentence-initial figure reference
28: \newcommand{\appref}[1]{App.~\ref{#1}}% appendix reference
29:
30: %% notation %% (all of these are math-mode shorthands)
31: \newcommand{\hmu}{h_{\mu}} % entropy rate
32: \newcommand{\hmuL}{h_{\mu L}} % entropy rate with an extra L subscript (presumably a length-L estimate; confirm at use sites)
33: \newcommand{\hmuk}{h_{\mu k}} % entropy rate with an extra k subscript (presumably for an order-k model; confirm at use sites)
34: \newcommand{\KLd}{\mathcal{D}} % calligraphic D (name suggests a Kullback-Leibler / relative-entropy symbol; confirm at use sites)
35: \newcommand{\EQP}{E(Q,P)} % the literal expression E(Q,P)
36: % size of the alphabet: |A|
37: \newcommand{\Asize}{\vert \mathcal{A} \vert}
38: % history (word) of length k; takes no arguments
39: \newcommand{\hk}{\overleftarrow{s}^k}
40: % history ending at time t (#1) with length l (#2)
41: \newcommand{\htl}[2]{\overleftarrow{s}_{#1}^{#2}}
42:
43: % specific forms: counts n(.) and hyperparameters alpha(.), written in terms of the length-k history \hk
44: \newcommand{\nsk}{n(\hk)} % count of the word s^k
45: \newcommand{\nsks}{n(\hk s)} % count of the word s^k followed by s
46: \newcommand{\ask}{\alpha(\hk)} % hyperparameter for s^k
47: \newcommand{\asks}{\alpha(\hk s)} % hyperparameter for s^k followed by s
48: % model parameters (bold symbols; \bm comes from the bm package loaded above)
49: \newcommand{\MP}{\bm{\theta}} % vector of model parameters; \mathbf leaves lowercase Greek unbolded
50: \newcommand{\MPk}{\bm{\theta}_k} % model parameters, k-th order
51: \newcommand{\MC}{\mathbf{M}} % model
52: \newcommand{\MCk}{\mathbf{M}_k} % model, k-th order
53: \newcommand{\MCkprime}{\mathbf{M}_{k}'} % primed k-th order model (e.g., a summation index distinct from \MCk)
54:
55: % avg, var, covar -- for \avg and \var, #1 is the quantity and #2 the subscript naming the distribution
56: \newcommand{\avg}[2]{\mathbf{E}_{#2}[#1]} % expectation value of #1, subscripted by #2
57: \newcommand{\var}[2]{\mathbf{Var}_{#2}[#1]} % variance of #1, subscripted by #2
58: \newcommand{\cov}[2]{\mathbf{Cov}[#1,#2]} % covariance of #1 and #2 (no subscript)
59: % probabilities: one letter per distribution, each in a word form x(s^k) and a conditional form x(s|s^k)
60: % p: true distribution
61: \newcommand{\psk}{p(\hk)} % probability of the word s^k
62: \newcommand{\psks}{p(s \vert \hk)} % probability of s given s^k
63: % q: pme distribution
64: \newcommand{\qsk}{q(\hk)}
65: \newcommand{\qsks}{q(s \vert \hk)}
66: % r: prior distribution
67: \newcommand{\rsk}{r(\hk)}
68: \newcommand{\rsks}{r(s \vert \hk)}
69: % u: uniform distribution
70: \newcommand{\usks}{u(s \vert \hk)} % conditional form
71: \newcommand{\usk}{u(\hk)} % word form
72:
73: %-inference methods: plain-text abbreviations (follow with ~ or {} when a space is needed after them)
74: \newcommand{\mle}{MLE} % maximum likelihood estimate
75: \newcommand{\pme}{PME} % posterior mean estimate
76: \newcommand{\map}{MAP} % maximum a posteriori
77:
78: \begin{document}
79:
80: \preprint{Santa Fe Institute Working Paper 07-03-XXX}
81: \preprint{arxiv.org/xxxxx/0703XXX}
82:
83: \title{Inferring Markov Chains: Bayesian Estimation,\\
84: Model Comparison, Entropy Rate, and Out-of-class Modeling}
85:
86: \author{Christopher~C.~Strelioff}
87: \email{streliof@uiuc.edu}
88: \affiliation{Center for Computational Science \&
89: Engineering and Physics Department,\\
90: University of California at Davis, One Shields Avenue, Davis, CA 95616}
91: \affiliation{Center for Complex Systems Research
92: and Physics Department,\\
93: University of Illinois at Urbana-Champaign,
94: 1110 West Green Street, Urbana, Illinois 61801}
95: \author{James P. Crutchfield}
96: \email{chaos@cse.ucdavis.edu}
97: \affiliation{Center for Computational Science \&
98: Engineering and Physics Department,\\
99: University of California at Davis, One Shields Avenue, Davis, CA 95616}
100: \author{Alfred W. H\"{u}bler}
101: \email{a-hubler@uiuc.edu}
102: \affiliation{Center for Complex Systems Research
103: and Physics Department,\\
104: University of Illinois at Urbana-Champaign,
105: 1110 West Green Street, Urbana, Illinois 61801}
106:
107: \begin{abstract}
108: Markov chains are a natural and well understood tool for describing
109: one-dimensional patterns in time or space. We show how to infer $k$-th order
110: Markov chains, for arbitrary $k$, from finite data by applying Bayesian
111: methods to both parameter estimation and model-order selection. Extending
112: existing results for multinomial models of discrete data, we connect inference
113: to statistical mechanics through information-theoretic (type theory) techniques.
114: We establish a direct relationship between Bayesian evidence and the partition
115: function which allows for straightforward calculation of the expectation and
116: variance of the conditional relative entropy and the source entropy rate.
117: Finally, we introduce a novel method that uses finite data-size scaling with
118: model-order comparison to infer the structure of out-of-class processes.
119: \end{abstract}
120:
121: %%% PACS
122: % Inference methods, 02.50.Tt
123: % Markov processes, 02.50.Ga
124: % Stochastic models- in statistical physics and nonlinear dynamics, 05.10.Gg
125: \pacs{02.50.Tt,02.50.Ga,05.10.Gg}
126:
127: \maketitle
128:
129: %
130: % introduction
131: %
132: \section{Introduction}
133:
134: Statistical inference of models from small data samples is a vital tool in
135: the understanding of natural systems. In many problems of interest data
136: consists of a sequence of \emph{letters} from a finite \emph{alphabet}.
137: Examples include analysis of sequence information in
138: biopolymers~\cite{Avery1999,JSLiu1999}, investigation of
139: one-dimensional spin systems~\cite{Crutchfield1997}, models of natural
140: languages~\cite{MacKay1994}, and coarse-grained models of chaotic
141: dynamics~\cite{Crutchfield1983,BLHao1998}. This diversity of potential
142: application has resulted in the development of a variety of representations
143: for describing discrete-valued data series.
144:
145: We consider the $k$-th order Markov chain model class which uses the previous
146: $k$ letters in a sequence to predict the next letter. Inference of Markov
147: chains from data has a long history in mathematical statistics. Early work
148: focused on maximum likelihood methods for estimating the parameters of the
149: Markov chain~\cite{TWAnderson1957,Billingsley1961a,Chatfield1973}. This work
150: often assumed a given fixed model order. That is, no \emph{model comparison}
151: across orders is done. This work also typically relied on the assumed
152: asymptotic normality of the likelihood when estimating regions of
153: confidence and when implementing model comparison. As a result, the realm
154: of application has been limited to data sources where these conditions are
155: met. One consequence of these assumptions has been that data sources which
156: exhibit \emph{forbidden words}, symbol sequences which are not allowed, cannot
157: be analyzed with these methods. This type of data violates the assumed
158: normality of the likelihood function.
159:
160: More recently, model comparison in the maximum likelihood approach has been
161: extended using various \emph{information criteria}. These methods for
162: model-order selection are based on extensions of the likelihood ratio and allow
163: the comparison of more than two candidate models at a time. The most widely used
164: are \emph{Akaike's information criterion} (AIC)~\cite{HTong1975} and the
165: \emph{Bayesian information criterion} (BIC)~\cite{Katz1981}. (Although the
166: latter is called Bayesian, it does not employ Bayesian model comparison in
167: the ways we will present here.) In addition to model selection using information
168: criteria, methods from information theory and machine learning have also been
169: developed. Two of the most widely employed are \emph{minimum
170: description length} (MDL)~\cite{JRissanen1984} and \emph{structural risk
171: minimization}~\cite{VVapnik1999}. Note that MDL and Bayesian
172: methods obtain similar results in some situations~\cite{Vitanyi2000}. However,
173: to the best of our knowledge, structural risk minimization has not been adapted
174: to Markov chain inference.
175:
176: We consider Bayesian inference of the Markov chain model class, extending
177: previous results~\cite{MacKay1994,JSLiu1999,Baldi2001,Durbin1998}. We provide
178: the details necessary to infer a Markov chain of arbitrary order, choose
179: the appropriate order (or weight orders according to their probability),
180: and estimate the data source's entropy rate. The latter is important for
181: estimating the intrinsic randomness and achievable compression rates for
182: an information source~\cite{Cover1991}. The ability to weight Markov chain
183: orders according to their probability is unique to Bayesian methods and
184: unavailable in the model selection techniques discussed above.
185:
186: In much of the literature just cited, steps of the inference process
187: are divided into (i) point estimation of model parameters, (ii) model
188: comparison (hypothesis testing), and (iii) estimation of functions of the
189: model parameters. Here we will show that Bayesian inference connects all
190: of these steps, using a unified set of ideas. Parameter estimation is the first
191: step of inference, model comparison a second level, and estimation of the
192: entropy rate a final step, intimately related to the mathematical structure
193: underlying the inference process. This view of connecting model to data
194: provides a powerful and unique understanding of inference not available in the
195: classical statistics approach to these problems. As we demonstrate, each of
196: these steps is vital and implementation of one step without the others does
197: not provide a complete analysis of the data-model connection.
198:
199: Moreover, the combination of inference of model parameters, comparison of
200: performance across model orders, and estimation of entropy rates provides a
201: powerful tool for understanding Markov chain models themselves. Remarkably,
202: this is true even when the generating data source is outside of the Markov
203: chain model class.
204: Model comparison provides a sense of the structure of the data source, whereas
205: estimates of the entropy rate provide a description of the inherent randomness.
206: Bayesian inference, information theory, and tools from statistical mechanics
207: presented here touch on all of these issues within a unified framework.
208:
209: We develop this as follows, assuming a passing familiarity with Bayesian
210: methods and statistical mechanics. First, we discuss estimation of Markov
211: chain parameters using Bayesian methods, emphasizing the use of the complete
212: marginal posterior density for each parameter, rather than point estimates
213: with error bars. Second, we consider selection of the appropriate memory
214: $k$ given a particular data set, demonstrating that a mixture of orders may
215: often be more appropriate than selecting a single order. This is certainly
216: a more genuinely Bayesian approach. In these first two parts
217: we exploit different forms of Bayes' theorem to connect data and model class.
218:
219: Third, we consider the mathematical structure of the evidence (or marginal
220: likelihood) and draw connections to statistical mechanics. In this discussion
221: we present a method for estimating entropy rates by taking derivatives of a
222: partition function formed from elements of each step of the inference procedure.
223: Last, we apply these tools to three example information sources of increasing
224: complexity. The first example belongs to the Markov chain model class, but
225: the other two are examples of hidden Markov models (HMMs) that fall outside
226: of that class. We show that the methods developed here provide a powerful tool
227: for understanding data from these sources, even when they do not belong to the
228: model class being assumed.
229:
230: %%
231: %%
232: \section{Inferring Model Parameters}
233:
234: In the first level of Bayesian inference we develop a systematic relation
235: between the data $D$, the chosen \emph{model class} $M$, and the vector of
236: \emph{model parameters} $\MP$. The object of interest in the inference of
237: model parameters is the \emph{posterior probability density}
238: $P\left( \MP \vert D, M \right)$. This is the probability of the model
239: parameters given the observed data and chosen model. To find the posterior
240: we first consider the joint distribution $P\left( \MP, D \vert M \right)$
241: over the data and model parameters given that one has chosen to model the
242: source with a representation in a certain class $M$. This can be factored in
243: two ways: $P\left( \MP \vert D, M \right)P\left(D \vert M\right)$ or
244: $P\left( D \vert \MP, M \right)P\left(\MP \vert M\right)$. Setting these
245: equal and solving for the posterior we obtain Bayes' theorem:
246: \begin{equation}
247: \label{eqn:bayes}
248: P\left( \MP \vert D, M \right)
249: = \frac{ P\left( D \vert \MP , M \right) \;
250: P\left( \MP \vert M \right) }{ P\left( D \vert M \right) }.
251: \end{equation}
252:
253: The \emph{prior} $P\left( \MP \vert M \right)$ specifies our assumptions
254: regarding the model parameters. We take a pragmatic view of the prior,
255: considering its specification to be a statement of assumptions about the
256: chosen model class. The \emph{likelihood} $P\left( D \vert \MP , M \right)$
257: describes the probability of the data given the model class and a particular setting of its parameters. Finally, the
258: \emph{evidence} (or marginal likelihood) $P\left( D \vert M \right)$ is the
259: probability of the data given the model class alone, averaged over the parameters. In the following sections we
260: describe each of the quantities in detail on our path to giving an explicit
261: expression for the posterior.
262:
263: %%
264: \subsection{Markov chains}
265:
266: The first step in inference is to clearly state the assumptions that make up
267: the model. This is the foundation for writing down the likelihood of a data
268: sample and informs the choice of prior. We assume that a single data set of
269: length $N$ is the starting point of the inference and that it consists of
270: \textit{symbols} $s_t$ from a finite alphabet $\mathcal{A}$,
271: \begin{equation}
272: \label{eqn:data}
273: D = s_0 s_1 \ldots s_{N-1} \; , \; s_t \in \mathcal{A}.
274: \end{equation}
275: We introduce the notation $\htl{t}{k}$ to indicate a length-$k$ sequence of
276: letters ending at position $t$: e.g., $\htl{4}{2}=s_3s_4$.
277:
278: The $k$-th order Markov chain model class assumes finite memory and
279: stationarity in the data source. The finite memory condition, a
280: generalization of the conventional Markov property, can be written
281: \begin{equation}
282: p(D) = p(\htl{k-1}{k}) \prod_{t=k-1}^{N-2} p(s_{t+1} \vert \htl{t}{k}) ~,
283: \label{eqn:markov_condition}
284: \end{equation}
285: thereby factoring into terms which depend only on preceding words of
286: length-$k$. The stationarity condition can be expressed
287: \begin{equation}
288: \label{eqn:stationarity}
289: p(s_t \vert \htl{t-1}{k}) = p(s_{t+m} \vert \htl{t+m-1}{k}) ~,
290: \end{equation}
291: for any $(t,m)$. \begeqnref{eqn:stationarity} results in a simplification of
292: the notation because we no longer need to track the position index,
293: $p(s_t = s \vert \htl{t-1}{k} = \hk ) = p( s \vert \hk )$ for any $t$. Given
294: these two assumptions, the model parameters of the $k$-th order Markov chain
295: $\MCk$ are
296: \begin{equation}
297: \label{eqn:model_parameters}
298: \MPk = \left\{ \, p( s \vert \hk ) : s \in \mathcal{A},
299: \hk \in \mathcal{A}^k \, \right\}.
300: \end{equation}
301: A normalization constraint is placed on these parameters: $\sum_{s\in
302: \mathcal{A}} p( s \vert \hk ) = 1$ for each word $\hk$.
303:
304: The next step is to write down the elements of Bayes' theorem specific to the
305: $k$-th order Markov chain.
306:
307: %%
308: \subsection{Likelihood}
309:
310: Given a sample of data $D=s_{0}s_{1} \ldots s_{N-1}$, the likelihood can be
311: written down using the Markov property of~\eqnref{eqn:markov_condition} and the
312: stationarity of~\eqnref{eqn:stationarity}. This results in the form
313: \begin{equation}
314: \label{eqn:likelihood}
315: P(D\vert \MPk, \MCk) = \prod_{ s \in \mathcal{A} }
316: \prod_{ \hk \in \mathcal{A}^{k} } p( s \vert \hk )^{\nsks} ,
317: \end{equation}
318: where $\nsks$ is the number of times the \textit{word} $\hk s$ occurs in the
319: sample $D$. For future use we also introduce notation for the number of times a
320: word $\hk$ has been observed, $\nsk = \sum_{s \in \mathcal{A}} \nsks$. We note
321: that~\eqnref{eqn:likelihood} is conditioned on the \emph{start sequence}
322: $\hk = s_0s_1\ldots s_{k-1}$.
323:
324: %%
325: \vspace{-0.125in}
326: \subsection{Prior}
327: \vspace{-0.125in}
328:
329: The prior $P(\MP \vert M)$ is used to specify assumptions about the model to be
330: inferred before the data is considered. Here we use
331: \emph{conjugate priors} for which the posterior distribution has the same
332: functional form as the prior. Our choice allows us to derive exact expressions
333: for many quantities of interest in inference. This provides a powerful tool for
334: understanding what information is gained during inference and,
335: especially, model comparison.
336:
337: The exact form of the prior is determined by our assignment of
338: \emph{hyperparameters} $\asks$ for the prior which balance the strength of
339: the modeling assumptions encoded in the prior against the weight of the data.
340: For a $k$-th order Markov chain, there is one hyperparameter for each word
341: $\hk s$, given the alphabet under consideration. A useful way to think about
342: the assignment of values to the
343: hyperparameters is to relate them to fake counts $\tilde{n}(\hk s)$, such that
344: $\asks = \tilde{n}(\hk s) + 1$. In this way, the $\asks$ can be set to reflect
345: knowledge of the data source and the strength of these prior assumptions can be
346: properly weighted in relation to the actual data counts $\nsks$.
347:
348: The conjugate prior for Markov chain inference is a product of Dirichlet
349: distributions, one for each word $\hk$. It restates the finite-memory
350: assumption from the model definition:
351: \begin{eqnarray}
352: P(\MPk \vert \MCk )
353: & = & \prod_{\hk \in \mathcal{A}^{k}} \left\{
354: \frac{ \Gamma( \ask )}{
355: \prod_{s\in\mathcal{A}} \Gamma( \asks ) } \right. \nonumber \\
356: & \times & \delta \mathbf{(}1-\sum_{s\in\mathcal{A}}
357: p( s \vert \hk )\mathbf{)} \label{eqn:prior} \\
358: & \times & \left. \prod_{s\in\mathcal{A}} p( s \vert \hk )^{\asks-1}
359: \right\}. \nonumber
360: \end{eqnarray}
361: (See \appref{app:Dirichlet} for relevant properties of Dirichlet
362: distributions.)
363: The prior's hyperparameters $\{ \asks \}$ must be real and positive. We
364: also introduce the more compact notation $\ask = \sum_{s \in \mathcal{A}}
365: \asks$. The function $\Gamma(x)$ is the well-known Gamma function, with $\Gamma(x)=(x-1)!$ for positive integer $x$. The
366: $\delta$-function constrains the model parameters to be properly normalized:
367: $\sum_{s \in \mathcal{A}} \psks = 1$ for each $\hk$.
368:
369: Given this functional form, there are at least two ways to interpret what the
370: prior says about the Markov chain parameters $\MPk$. In addition to considering
371: fake counts $\tilde{n}( \cdot )$, as discussed above, we can consider the
372: range of fluctuations in the estimated $\psks$. Classical statistics would
373: dictate describing the fluctuations via a single value with error bars. This
374: can be accomplished by finding the average and variance of $\psks$ with
375: respect to the prior. The result is:
376: \begin{eqnarray}
377: \label{eqn:prior_mean}
378: \avg{\psks}{\rm{prior}} & = & \frac{\asks}{\ask}~, \\
379: \label{eqn:prior_variance}
380: \var{\psks}{\rm{prior}} & = & \frac{\asks(\ask-\asks)}{\ask^2(1+\ask)} .
381: \end{eqnarray}
382:
383: A second method, more in line with traditional Bayesian estimation, is to
384: consider the marginal distribution for each model parameter. For a Dirichlet
385: distribution, the marginal for any one parameter will be a Beta distribution.
386: With this knowledge, a probability density can be provided for each Markov chain
387: parameter given a particular setting for the hyperparameters $\asks$. In this
388: way, the prior can be assigned and analyzed in substantial detail.
389:
390: A common stance in model inference is to assume all things are a-priori
391: equal. This can be expressed by assigning $\asks=1$ for all $\hk \in
392: \mathcal{A}^k$ and $s \in \mathcal{A}$, adding \textit{no} fake counts
393: $\tilde{n}(\hk s)$. This assignment results in a uniform prior distribution
394: over the model parameters and a prior expectation:
395: \begin{equation}
396: \avg{p(s\vert \hk)}{\rm{prior}} = 1/ \vert \mathcal{A} \vert ~.
397: \end{equation}
398:
399: %%
400: \vspace{-0.20in}
401: \subsection{Evidence}
402: \vspace{-0.125in}
403:
404: Given the likelihood and prior derived above, the evidence $P(D|M)$ is seen
405: to be a simple normalization term in Bayes' theorem. In fact, the evidence
406: provides the probability of the data given the model $\MCk$ and so plays a
407: fundamental role in model comparison. Formally, the definition is
408: \begin{equation}
409: P(D\vert \MCk ) = \int \; d\MPk \; P(D\vert \MPk, \MCk)
410: P(\MPk \vert \MCk ),
411: \label{eqn:evidence_defn}
412: \end{equation}
413: where we can see that this term can be interpreted as an average of the
414: likelihood over the prior distribution. Applying this to the likelihood
415: in~\eqnref{eqn:likelihood} and the prior in~\eqnref{eqn:prior} produces
416: \begin{eqnarray}
417: P(D\vert \MCk) & = & \prod_{\hk \in \mathcal{A}^{k}} \left\{ \;
418: \frac{ \Gamma(\ask) }{ \prod_{s\in \mathcal{A}} \Gamma(\asks)}
419: \right. \nonumber \\
420: & & \label{eqn:evidence} \\
421: & \times & \left.
422: \frac{ \prod_{s\in \mathcal{A}} \Gamma(\nsks+\asks) }{ \Gamma(\nsk+\ask) }
423: \; \right\}. \nonumber
424: \end{eqnarray}
425: As we will see, this analytic expression results in the ability to make useful
426: connections to statistical mechanics techniques when estimating entropy rates.
427: This is another benefit of choosing a conjugate prior with known properties.
428:
429: %%
430: \subsection{Posterior}
431:
432: Using Bayes' theorem, \eqnref{eqn:bayes}, the results of the three previous
433: sections can be combined to obtain the posterior distribution over the
434: parameters of the $k$-th order Markov chain. One finds:
435: \begin{eqnarray}
436: P(\MPk\vert D, \MCk) & = & \prod_{\hk \in \mathcal{A}^{k}} \left\{
437: \frac{ \Gamma( \nsk + \ask ) }{
438: \prod_{s\in\mathcal{A}} \Gamma( \nsks + \asks ) } \right. \nonumber \\
439: & \times & \delta \mathbf{(}1-\sum_{s\in\mathcal{A}} p( s \vert \hk)
440: \mathbf{)} \label{eqn:posterior} \\
441: & \times & \left. \prod_{s\in\mathcal{A}}
442: p( s \vert \hk )^{\nsks + \asks - 1} \right\}. \nonumber
443: \end{eqnarray}
444: As noted in selecting the prior, the resulting form is a Dirichlet
445: distribution with modified parameters. This is a result of choosing the
446: conjugate prior: cf. the forms of \eqnref{eqn:prior} and
447: \eqnref{eqn:posterior}.
448:
449: From~\eqnref{eqn:posterior} the estimation of the model parameters
450: $p(s\vert \hk)$ and the uncertainty of these estimates can be given using the
451: known properties of the Dirichlet distribution. As with the prior,
452: there are two main ways to understand what the posterior tells us about the
453: fluctuations in the estimated Markov chain parameters. The first uses a point
454: estimate with ``error bars''. We obtain these from the mean and variance of
455: the $\psks$ with respect to the posterior, finding
456: \begin{gather}
457: \avg{p(s\vert \hk)}{\rm{post}} = \frac{ \nsks + \asks }{ \nsk + \ask }
458: \label{eqn:posterior_mean} ~, \\ \nonumber \\
459: \var{p(s\vert \hk)}{\rm{post}} = \frac{ \nsks +\asks }{ ( \nsk + \ask )^2 }
460: \nonumber \\ \label{eqn:posterior_variance} \\
461: \times \frac{ ( \nsk + \ask ) - ( \nsks + \asks ) }{
462: ( \nsk + \ask +1 ) }. \nonumber
463: \end{gather}
464: This is the \textit{posterior mean estimate} (\pme) of the model parameters.
465:
466: A deeper understanding of~\eqnref{eqn:posterior_mean} is obtained through a
467: simple factoring:
468: \begin{eqnarray}
469: \avg{p(s\vert \hk)}{\rm{post}} & = & \frac{1}{ \nsk + \ask }
470: \left[ \nsk \, \left (\frac{\nsks}{\nsk} \right) \right. \nonumber \\
471: && \label{eqn:pme_factor} \\
472: & + & \left. \ask \, \left(\frac{\asks}{\ask} \right) \right], \nonumber
473: \end{eqnarray}
474: where $\nsks /\nsk $ is the \emph{maximum likelihood estimate} (\mle)
475: of the model parameters and $\asks /\ask$ is the prior expectation given
476: in~\eqnref{eqn:prior_mean}. In this form, it is
477: apparent that the posterior mean estimate is a weighted sum of the \mle~and
478: prior expectation. As a result, we can say that the posterior mean and
479: maximum likelihood estimates converge to the same value for $\nsk \gg \ask$.
480: Only when the data is scarce, or the prior is set with strong conviction,
481: does the Bayesian estimate add corrections to the \mle.
482:
483: A second method for analyzing the resulting posterior density is to consider the
484: marginal density for each parameter. As discussed with the prior, the marginal
485: for a Dirichlet is a Beta distribution. As a result, we can either provide
486: regions of confidence for each parameter or simply inspect the density function.
487: The latter provides much more information about the inference being made than
488: the point estimation just given. In our examples, to follow shortly, we
489: plot the marginal posterior density for various parameters of interest
490: to demonstrate the wealth of information this method provides.
491:
492: Before we move on, we make a final point regarding the estimation of inference
493: uncertainty. The form of the posterior is not meant to reflect the potential
494: fluctuations of the data source. Instead, the width of the distribution
495: reflects the possible Markov chain parameters which are consistent with
496: the observed data sample. These are distinct notions and should not be conflated.
497:
498: %%
499: \subsection{Predictive distribution}
500:
501: Once we have an inferred model, a common task is to estimate the probability of
502: a new observation $D^{(new)}$ given the previous data and estimated model.
503: This is implemented by taking an average of the likelihood of the new data:
504: \begin{equation}
505: P(D^{(new)}\vert \MPk, \MCk)
506: = \prod_{\hk \in \mathcal{A}^k, s \in \mathcal{A}} p(s\vert \hk)^{m(\hk s)}
507: \end{equation}
508: with respect to the posterior
509: distribution~\cite{MacKay2003}:
510: \begin{eqnarray}
511: \label{eqn:predictive_distribution_defn}
512: P(D^{(new)}\vert D,\MCk) & = & \int d\MPk P(D^{(new)}\vert \MPk, \MCk) \\
513: & \times & P(\MPk \vert D, \MCk) ~. \nonumber
514: \end{eqnarray}
515: We introduce the notation $m(\hk s)$ to indicate the number of times the word
516: $\hk s$ occurs in $D^{(new)}$, with $m(\hk) = \sum_{s \in \mathcal{A}} m(\hk s)$. This method has the desirable property, compared
517: to point estimates, that it takes into account the uncertainty in the model
518: parameters $\MPk$ as reflected in the form of the posterior distribution.
519:
520: The evaluation of~\eqnref{eqn:predictive_distribution_defn} follows the same
521: path as the calculation for the evidence and produces a similar
522: form; we find:
523: \begin{gather}
524: P(D^{(new)}\vert D, \MCk) = \prod_{\hk \in \mathcal{A}^{k}} \left\{ \;
525: \frac{ \Gamma( \nsk+\ask) }{ \prod_{s\in \mathcal{A}}
526: \Gamma( \nsks + \asks)} \right. \nonumber \\
527: \label{eqn:predictive_distribution} \\
528: \times \left. \frac{ \prod_{s\in \mathcal{A}}
529: \Gamma( \nsks + m(\hk s) + \asks ) }{ \Gamma( \nsk + m(\hk) + \ask ) }
530: \; \right\}. \nonumber
531: \end{gather}
532:
533: %%
534: %%
535: \section{Model Comparison}
536:
537: With the ability to infer a Markov chain of a given order $k$, a natural
538: question is how to choose the correct order given a particular data
539: set. Bayesian methods have a systematic way to address this through
540: the use of \emph{model comparison}.
541:
542: In many ways, this process is analogous to inferring model parameters
543: themselves, which we just laid out. We start by enumerating the set of model
544: orders to be compared $\mathcal{M} = \{ \MCk \}_{k_{min}}^{k_{max}}$, where
545: $k_{min}$ and $k_{max}$ correspond to the minimum and maximum order to be
546: inferred, respectively. Although we will not consider an independent,
547: identically distributed (IID) model ($k=0$) here, we do note that this could
548: be included using the same techniques described below.
549:
550: We start with the joint probability $P(\MCk, D \vert \mathcal{M} )$ of a
551: particular model $\MCk \in \mathcal{M}$ and data sample $D$, factoring it in
552: two ways following Bayes' theorem. Solving for the probability of a particular
553: model class we obtain
554: \begin{equation}
555: \label{eqn:model_comparison}
556: P(\MCk \vert D , \mathcal{M} ) = \frac{ P(D \vert \MCk, \mathcal{M} )
557: P(\MCk \vert \mathcal{M} ) }{ P(D \vert \mathcal{M})} ,
558: \end{equation}
559: where the denominator is the sum given by
560: \begin{equation}
561: P(D \vert \mathcal{M}) =
562: \sum_{\MCkprime \in \mathcal{M}}
563: P(D \vert \MCkprime, \mathcal{M} )P(\MCkprime \vert \mathcal{M} ) ~.
564: \end{equation}
565: The probability of a particular model class in the set under consideration is
566: driven by two components: the evidence $P(D \vert \MCk, \mathcal{M})$, derived
567: in \eqnref{eqn:evidence}, and the prior over model classes
568: $P(\MCk \vert \mathcal{M} )$.
569:
570: Two common priors in model comparison are: (i) all models are equally likely
571: and (ii) models should be penalized for the number of free parameters used to
572: fit the data. In the first instance
573: $P(\MCk \vert \mathcal{M})=1/ \vert \mathcal{M} \vert$ is the same for all
574: orders $k$. However, this factor cancels out because it appears in both the
575: numerator and denominator. As a result, the probability of models using this
576: prior becomes
577: \begin{equation}
578: \label{eqn:best_model_uniform_prior}
579: P(\MCk \vert D , \mathcal{M} ) = \frac{P(D \vert \MCk, \mathcal{M} )
580: }{
581: \sum_{\MCkprime \in \mathcal{M}}
582: P(D \vert \MCkprime, \mathcal{M} )}.
583: \end{equation}
584:
585: In the second case, a common penalty for the number of model parameters is
586: \begin{equation}
587: \label{eqn:df_penalty_prior}
588: P(\MCk \vert \mathcal{M}) = \frac{\exp( - \vert \MCk \vert )
589: }{\sum_{\MCkprime \in \mathcal{M}}
590: \exp( - \vert \MCkprime \vert ) } ~,
591: \end{equation}
592: where $\vert \MCk \vert$ is the number of free parameters in the model. For a
593: $k$-th order Markov chain, the number of free parameters is
594: \begin{equation}
595: \vert \MCk \vert = \vert \mathcal{A} \vert^k(\vert \mathcal{A} \vert-1) ~,
596: \end{equation}
597: where $\vert \mathcal{A} \vert$ is the alphabet size. Thus, model
598: probabilities under this prior take on the form
599: \begin{equation}
600: \label{eqn:best_model_df_penalty_prior}
601: P(\MCk \vert D , \mathcal{M} ) = \frac{
602: P(D \vert \MCk, \mathcal{M} ) \exp( - \vert \MCk \vert )
603: }{
604: \sum_{\MCkprime \in \mathcal{M}}
605: P(D \vert \MCkprime, \mathcal{M} )
606: \exp( - \vert \MCkprime \vert ) }.
607: \end{equation}
608: We note that the normalization sum in~\eqnref{eqn:df_penalty_prior}
609: cancels because it appears in both the numerator and denominator.
610:
611: Bayesian model comparison has a natural \emph{Occam's razor} in the model
612: comparison process~\cite{MacKay2003}. This means there is a natural preference
613: for smaller models even when a uniform prior over model orders is applied. In
614: this light, a penalty for the number of model parameters can be seen as a very
615: cautious form of model comparison. Both of these priors,
616: \eqnref{eqn:best_model_uniform_prior} and
617: \eqnref{eqn:best_model_df_penalty_prior}, will be considered in
618: the examples to follow.
619:
620: A note is in order on computational implementation. In general, the resulting
621: probabilities can be extremely small, easily resulting in numerical underflow
622: if the equations are not implemented with care. As mentioned
623: in~\cite{Durbin1998}, computation with extended logarithms can be used to
624: alleviate these concerns.
625:
626: %%
627: %%
628: \section{Information Theory, Statistical Mechanics, and Entropy Rates}
629:
630: An important property of an information source is its \emph{entropy rate}
631: $\hmu$, which indicates the degree of intrinsic randomness and controls the
632: achievable compression. A first attempt at estimating a source's entropy rate
633: might consist of plugging a Markov chain's estimated model parameters into the
634: known expression~\cite{Cover1991}. However, this does not
635: accurately reflect the posterior distribution derived above. This observation
636: leaves two realistic alternatives. The first option is to sample model
637: parameters from the posterior distribution. These samples can then be used to
638: calculate a set of entropy rate estimates that reflect the underlying posterior
639: distribution. A second option, which we take here, is to adapt methods from
640: type theory and
641: statistical mechanics previously developed for IID models~\cite{Samengo2002}
642: to Markov chains. To the best of our knowledge this is the first time these
ideas have been extended to inferring Markov chains; although
cf.~\cite{Young1994}.
645:
646: In simple terms, type theory shows that the probability of an observed sequence
647: can be written in terms of the \emph{Kullback-Leibler} (KL) \emph{distance} and
648: the entropy rate. When applied to the Markov chain inference problem the resulting
649: form suggests a connection to statistical mechanics. For example, we will show
650: that averages of the KL-distance and entropy rate with respect to the posterior
651: are found by taking simple derivatives of a partition function.
652:
653: The connection between inference and information theory starts by considering
654: the product of the prior~\eqnref{eqn:prior} and
655: likelihood~\eqnref{eqn:likelihood}:
656: \begin{equation}
657: P(\MPk\vert \MCk)P( D\vert \MPk, \MCk)=P( D, \MPk\vert \MCk) ~.
658: \end{equation}
659: This forms a joint distribution over the observed data $D$ and model parameters
660: $\MPk$ given the model order $\MCk$. Denoting the normalization constant from
661: the prior as $Z$ to save space, this joint distribution is
662: \begin{equation}
663: \label{eqn:product_prior_likelihood}
664: P( D, \MPk\vert \MCk) = Z \, \prod_{\hk, s}
665: p( s \vert \hk )^{\nsks + \asks - 1}.
666: \end{equation}
667: This form can be written, without approximation, in terms of conditional
668: relative entropies $\KLd [\cdot \| \cdot ]$ and entropy rate $\hmu [\cdot]$:
669: \begin{eqnarray}
670: \label{eqn:info_prior_likelihood}
671: P( D, \MPk\vert \MCk) & = & Z \, 2^{-\beta_k \mathbf{(} \KLd [Q \| P ]
672: + \hmu [Q]\mathbf{)}} \\
673: & \times & 2^{+\Asize^{k+1} \mathbf{(} \KLd [ U \| P ]
674: + \hmu [U]\mathbf{)}} ~, \nonumber
675: \end{eqnarray}
676: where $\beta_k = \sum_{\hk,s} \left[ \nsks + \asks \right]$ and the
677: distribution of true parameters is
678: $P = \{ \psk, \psks \}$. The distributions $Q$ and $U$ are given by
679: \begin{eqnarray}
680: \label{eqn:pme_distribution}
681: Q & = & \left\{ \qsk = \frac{\nsk+\ask}{\beta_k} , \right. \\
682: & & \left. \qsks = \frac{\nsks + \asks}{\nsk + \ask} \right\}
683: \nonumber \\
684: \label{eqn:uniform_distribution}
685: U & = & \left\{ \usk = \frac{1}{\Asize^k}, \usks = \frac{1}{\Asize} \right\}
686: ~,
687: \end{eqnarray}
688: where $Q$ is the distribution defined by the posterior mean and $U$ is a uniform
689: distribution. The information-theoretic quantities used above are given by
690: \begin{eqnarray}
691: \KLd [ Q \| P ] & = & \sum_{s, \hk} \qsk \qsks \log_2 \frac{\qsks}{\psks}
692: \label{eqn:conditional_KL_div} \\
693: \hmu [ Q ] & = & - \sum_{s, \hk} \qsk \qsks \log_2 \qsks ~.
694: \label{eqn:entropy_rate_estimate}
695: \end{eqnarray}
696: The form of~\eqnref{eqn:info_prior_likelihood} and its relation to the evidence
697: suggests a connection to statistical mechanics: The evidence
698: $P(D \vert \MCk) = \int d\MPk P( D, \MPk\vert \MCk)$ is a partition function
699: $\mathcal{Z} = P( D \vert \MCk)$. Using conventional techniques, the
700: expectation and variance of the ``energy''
701: \begin{equation}
702: \label{eqn:info_energy}
703: \EQP = \KLd [Q \| P ] + \hmu [Q]
704: \end{equation}
705: are obtained by taking derivatives of the logarithm of the partition function
706: with respect to $\beta_k$:
707: \begin{eqnarray}
708: \avg{\, \EQP \, }{\rm{post}}
709: & = &
710: - \frac{1}{\log 2}
711: \frac{\partial}{\partial \beta_k} \, \log \mathcal{Z}
712: \label{eqn:info_mean_energy}\\
713: \var{\, \EQP \, }{\rm{post}}
714: & = &
715: \frac{1}{\log 2}
716: \frac{\partial^2}{\partial \beta_k^2} \, \log \mathcal{Z}
717: ~.
718: \label{eqn:info_variance_energy}
719: \end{eqnarray}
720: The factors of $\log 2$ in the above expressions come from the decision to use
721: base 2 logarithms in the definition of our information-theoretic quantities.
722: This results in values in \emph{bits} rather than \emph{nats}~\cite{Cover1991}.
723:
724: To evaluate the above expression, we take advantage of the known form for the
725: evidence provided in~\eqnref{eqn:evidence}. With the definitions $\alpha_k =
726: \sum_{\hk} \ask$ and
727: \begin{equation}
728: \label{eqn:prior_distribution}
729: R = \left\{ \rsk = \frac{\ask}{\alpha_k} ,
730: \rsks = \frac{\asks}{\ask} \right\}
731: \end{equation}
732: the negative logarithm of the partition function can be written
733: \begin{eqnarray}
734: - \log \mathcal{Z} & = & \sum_{\hk,s} \log \Gamma
735: \left[ \alpha_k \rsk \rsks \right]
736: \\ & - & \sum_{\hk} \log \Gamma \left[ \alpha_k \rsk \right]
737: + \sum_{\hk} \log \Gamma \left[ \beta_k \qsk \right] \nonumber \\
738: & - & \sum_{\hk,s} \log \Gamma
739: \left[ \beta_k \qsk \qsks \right]. \nonumber
740: \end{eqnarray}
741:
742: From this expression, the desired expectation is found by taking derivatives
743: with respect to $\beta_k$; we find that
744: \begin{gather}
745: \avg{\, \EQP \, }{\rm{post}}
746: = \frac{1}{\log 2}
747: \sum_{\hk} \qsk \psi^{(0)} \left[ \beta_k \qsk \right]
748: \nonumber \\
749: - \frac{1}{\log 2} \sum_{\hk,s} \qsk \qsks \psi^{(0)}
750: \left[ \beta_k \qsk \qsks \right]~. \nonumber \\
751: \label{eqn:average_info}
752: \end{gather}
753: The variance is obtained by taking a second derivative with respect to
754: $\beta_k$, producing
755:
756: \begin{gather}
757: \var{\, \EQP \, }{\rm{post}} =
758: - \frac{1}{\log 2} \sum_{\hk} \qsk^2 \psi^{(1)} \left[ \beta_k \qsk \right]
759: \nonumber \\
760: + \frac{1}{\log 2} \sum_{\hk,s} \qsk^2 \qsks^2 \psi^{(1)}
761: \left[ \beta_k \qsk \qsks \right]. \nonumber \\
762: \label{eqn:variance_info}
763: \end{gather}
In both of the above the polygamma function is defined as $\psi^{(n)}(x) =
d^{n+1}/dx^{n+1} \log \Gamma(x)$. (For further details, consult a reference
such as~\cite{Abramowitz1965}.)
767:
768: From the form of~\eqnref{eqn:average_info}
769: and~\eqnref{eqn:variance_info}, the meaning is not immediately clear. We can
770: use an expansion of the $n=0$ polygamma function
771: \begin{equation}
\psi^{(0)}(x) = \log x - \frac{1}{2x} + \mathcal{O}(x^{-2}) ~,
773: \end{equation}
774: valid for $x \gg 1$, however, to obtain an asymptotic form
775: for~\eqnref{eqn:average_info}; we find
776: \begin{gather}
777: \avg{\, \EQP \, }{\rm{post}} =
778: H[ \qsk \qsks ] - H[\qsk] \nonumber \\
+ \frac{1}{2\beta_k \log 2} \Asize^k(\Asize -1 )
780: + \mathcal{O}(1/ \beta_k^2)
781: \label{eqn:average_info_asymptotic}.
782: \end{gather}
783: From this we see that the first two terms make up the entropy
784: rate $\hmu [ Q ] = H[ \qsk \qsks ] - H[\qsk]$ and the last
785: term is associated with the conditional relative entropy between the posterior
786: mean distribution $Q$ and true distribution $P$.
787:
788: In summary, we have found the average of conditional relative entropy and
789: entropy rate with respect to the posterior density. This was accomplished by
790: making connections to statistical mechanics through type theory. Unlike
791: sampling from the posterior to estimate the entropy rate, this method results
792: in an analytic form which approaches $\hmu [ P ]$ as the inverse of the data
793: size. This method for approximating $\hmu$ also provides a computational
794: benefit. No eigenstates have to be found from the Markov transition matrix,
795: allowing for the storage of values in sparse data structures. This provides
796: a distinct computational advantage when large orders or alphabets are
797: considered.
798:
799: Finally, it might seem awkward to use the expectation
800: of~\eqnref{eqn:info_energy} for estimation of the entropy rate. This method
801: was chosen because it is the form that naturally appears in writing down the
802: likelihood-prior combination in~\eqnref{eqn:info_prior_likelihood}. As a result
803: of using this method, most of the results obtained above are without
804: approximation. We were also able to show this expectation converges to the
desired value in a well-behaved manner.
806:
807: %%
808: %%
809: \vspace{-0.125in}
810: \section{Examples}
811: \vspace{-0.125in}
812:
813: To explore how the above produces a robust inference procedure, let's now
814: consider the statistical inference of a series of increasingly complex data
815: sources. The first, called the \emph{golden mean} process, is a first-order
816: Markov chain. The second data source is called the \emph{even process} and
817: cannot be represented by a Markov chain with finite order. However, this source
818: is a deterministic HMM, meaning that the current state and next output symbol
819: uniquely determine the next state. Finally, we consider the \emph{simple
820: nondeterministic source}, so named since its smallest representation is as
821: a nondeterministic HMM. (Nondeterminism here refers to the HMM structure: the
822: current state and next output symbol do not uniquely determine the next state.
823: This source is represented by an infinite-state deterministic HMM
824: \cite{Crutchfield1994,Upper1997}.)
825:
826: The golden mean, even, and simple nondeterministic processes can all be written
827: down as models with two internal states---call them $A$ and $B$. However, the
828: complexity of the data generated from each source is of markedly different
829: character. Our goal in this section is to consider the three main steps in
830: inference to analyze them. First, we consider inference of a first-order Markov
831: chain to demonstrate the
832: estimation of model parameters with uncertainty. Second, we consider model
833: comparison for a range of orders $k$. This allows us to discover structure in
834: the data source even though the true model class cannot be captured in all
835: cases. Finally, we consider estimation of entropy rates from these data sources,
836: investigating how randomness is expressed in them.
837:
838: While investigating these processes we consider average data counts,
839: rather than sample counts from specific realizations, as we want
840: to focus specifically on the average performance of Bayesian inference. To
841: do this we take advantage of the known form of the sources. Each is described
842: by a transition matrix $T$, which gives transitions between states
843: $A$ and $B$:
844: \begin{equation}
845: \label{eqn:transition_matrix_definition}
846: T = \left[ \begin{array}{cc}
847: p(A\vert A) & p(B\vert A) \\
848: p(A\vert B) & p(B\vert B)
849: \end{array}
850: \right] \;.
851: \end{equation}
852: Although two of our data sources are not finite Markov chains, the transition
853: matrix between internal states is Markov. This means the matrix
854: is \emph{stochastic} (all rows sum to one) and we are guaranteed an eigenstate
855: $\vec{\pi}$ with eigenvalue one: $\vec{\pi} \, T = \vec{\pi}$. This eigenstate
856: describes the asymptotic distribution over internal states:
857: $\vec{\pi} = \left[ p(A), p(B) \right]$.
858:
859: The transition matrix can be divided into labeled matrices $T^{(s)}$ which
860: contain those elements of $T$ that output symbol $s$. For our binary data
861: sources one has
862: \begin{equation}
863: \label{eqn:transition_matrix}
864: T = T^{(0)} + T^{(1)}.
865: \end{equation}
866: Using these matrices, the average probability of words can be estimated for
867: each process of interest. For example, the probability of word $01$ can be
868: found using
869: \begin{equation}
870: p(01) = \vec{\pi} \, T^{(0)}T^{(1)} \vec{\eta} ~,
871: \end{equation}
872: where $\vec{\eta}$ is a column vector with all $1$'s. In this way, for any
873: data size $N$, we estimate the average count for a word as
874: \begin{equation}
875: \nsks = (N-k)~p(\hk s) ~.
876: \end{equation}
877: Average counts, obtained this way, will be the basis for all of
878: the examples to follow.
879:
880: In the estimation of the true entropy rate for the examples we use the formula
881: \begin{equation}
882: h_{\mu} = - \sum_{v \in \{A,B\}} p(v)
883: \sum_{s \in \mathcal{A}} ~p(s\vert v) \log_2 p(s\vert v)
884: \label{eqn:entropy_rate}
885: \end{equation}
for the golden mean and even processes, where
887: $p(s\vert v) = T^{(s)}_{v \cdot}$ is the probability of a letter $s$ given the
888: state $v$ and $p(v)$ is the asymptotic probability of the state $v$ which can be
889: found as noted above. For the simple nondeterministic source this closed-form
890: expression cannot be applied and the entropy rate must be found using more
891: involved methods; see~\cite{Crutchfield1994} for further details.
892:
893: %%
894: %%
895: \subsection{Golden mean process: In-class modeling}
896:
897: The \emph{golden mean process} can be represented by a simple $1$st-order
898: Markov chain over a binary alphabet characterized by a single (shortest)
899: forbidden word $s^2 = 00$. The defining labeled transition matrices for this data
900: source are given by
901: \begin{equation}
902: \label{eqn:label_transition_matrix_golden_mean}
903: T^{(0)} = \left[ \begin{array}{cc}
904: 0 & 1/2 \\
905: 0 & 0
906: \end{array}
907: \right] \; , \;
908: T^{(1)} = \left[ \begin{array}{cc}
909: 1/2 & 0 \\
910: 1 & 0
911: \end{array}
912: \right] ~.
913: \end{equation}
914: \begfigref{fig:golden_mean} provides a graphical representation of the
915: corresponding hidden Markov chain. Inspection reveals a simple relation
between the \emph{internal states} $A$ and $B$ and the output symbols
917: $0$ and $1$. An observation of $0$ indicates a transition to internal
918: state $B$ and a $1$ corresponds to state $A$, making this process a Markov
919: chain over $0$s and $1$s.
920:
921: %%
922: %% Beamer Implementation
923: %%
924: \begin{figure}[htb]
925: \begin{center}
926: %options for the plot:
927: %-states
928: \SetStateLabelScale{1.6}
929: \SetStateLineWidth{1.4pt}
930: %-edges
931: \SetEdgeLabelScale{1.4}
932: \SetEdgeLineWidth{0.75pt}
933:
934: \begin{VCPicture}{(0,0)(5,2)}
935: %states
936: \ChgStateLabelScale{0.8}
937: \State[A]{(1,0)}{A}
938: \State[B]{(4,0)}{B}
939: \ChgEdgeLabelScale{0.7}
940: %transitions
941: \LoopW{A}{ 1 | 1/2 }
942: \LArcR[0.5]{B}{A}{ 1 | 1 }
943: \LArcR[0.5]{A}{B}{ 0 | 1/2 }
944: \end{VCPicture}
945: \end{center}
946: \vspace{0.5in}
947: \caption{A deterministic hidden Markov chain for the golden mean process.
948: Edges are labeled with the output symbol and the transition probability:
949: \emph{symbol} $\vert$ \emph{probability}.
950: }
951: \label{fig:golden_mean}
952: \end{figure}
953:
For the golden mean the eigenstate is $\vec{\pi} = \left[ p(A), p(B)
\right] = \left[ 2/3 , 1/3 \right]$. With this vector and the labeled
956: transition matrices any desired word count can be found as discussed above.
957:
958: %
959: \vspace{-0.125in}
960: \subsubsection{Estimation of $M_1$ Parameters}
961: \vspace{-0.125in}
962:
963: To demonstrate the effective inference of the Markov chain parameters for the
964: golden mean process we consider average counts for a variety of data sizes
965: $N$. For each size, the marginal posterior for the parameters $p(0\vert 1)$ and
966: $p(1\vert 0)$ is plotted in~\figref{fig:GoldenMean_ParameterEstimates}. The
967: results demonstrate that the shape of the posterior effectively
968: describes the distribution of possible model parameters at each $N$ and converges
969: to the correct values of $p(0\vert 1)=1/2$ and $p(1\vert 0)=1$ with increasing
970: data.
971:
972: %% details-
973: %% code: MarginalPosterior.py in MarkovChainPaper_Code folder.
974: %% parameters: marginal density plotted for N=50,100,200,400.
975: %%
976: \begin{figure}[htbp]
977: \centering
978: \includegraphics[width=0.98\columnwidth]{MarginalPosterior_GM.eps}
979: \caption{A plot of the inference of $M_1$ model parameters for the
980: golden mean process. For each data sample size $N$, the marginal posterior is
981: plotted for the parameters of interest: $p(0\vert 1)$ in the top panel and
982: $p(1\vert 0)$ in the lower panel. The \textit{true} values of the parameters
983: are $p(0\vert 1)=1/2$ and $p(1\vert 0) = 1$.
984: \label{fig:GoldenMean_ParameterEstimates}}
985: \end{figure}
986:
987: Point estimates with a variance can be provided for each of the parameters, but
these numbers by themselves can be misleading. However, the estimates obtained
by using the mean and variance of the posterior are a more effective description
990: of the inference process than a maximum likelihood estimate with estimated
991: error given by a Gaussian approximation of the likelihood alone.
992: As~\figref{fig:GoldenMean_ParameterEstimates} demonstrates, in
993: fact, a Gaussian
994: approximation of uncertainty is an ineffective description of our knowledge
995: when the Markov chain parameters are near their upper or lower limits at $0$
996: and $1$. Probably the most effective set of numbers to provide consists of the
997: mean of the posterior and a region of confidence. These would most accurately
998: describe asymmetries in the uncertainty of model parameters. Although we will
999: not do that here, a brief description of finding regions of confidence is
1000: provided in~\appref{app:dirichlet}.
1001:
1002: %
1003: \vspace{-0.125in}
1004: \subsubsection{Selecting the Model Order $k$}
1005: \vspace{-0.125in}
1006:
1007: Now consider the selection of the appropriate order $k$ from golden mean
realizations. As discussed above, the golden mean process is a first-order
Markov chain with $k=1$. As a result, we would expect model comparison to
select this order from the available possibilities. To demonstrate this,
we consider orders $k=1$--$4$ and perform model comparison with a uniform prior
1012: over orders (\eqnref{eqn:best_model_uniform_prior}) and with a penalty for the
1013: number of model parameters (\eqnref{eqn:best_model_df_penalty_prior}).
1014:
1015: %% details-
1016: %% code: ModelComparison.py in MarkovChainPaper_Code folder.
1017: %% parameters: length_min=100, length_max=1000, step=5
1018: %%
1019: \begin{figure}[htbp]
1020: \centering
1021: \includegraphics[width=0.98\columnwidth]{ModelCompare_GM.eps}
\caption{Model comparison for Markov chains of order $k=1$--$4$ using
average counts from the golden mean process. Sample sizes from $N=100$ to
$N=1{,}000$ in steps of $\Delta N=5$ are used to generate these plots. The top panel
1025: displays the model probabilities using a uniform prior over orders $k$. The
1026: bottom panel displays the effect of a penalty for model size.
1027: \label{fig:GoldenMean_ModelComparison}}
1028: \end{figure}
1029:
1030: The results of the model comparisons are given
1031: in~\figref{fig:GoldenMean_ModelComparison}. The top panel shows the probability
1032: for each order $k$ as a function of the sample size, using a uniform prior. For
1033: this prior over orders, $M_1$ is selected with any reasonable amount of
1034: data. However, there does seem to be a possibility to over-fit for small data
1035: size $N \leq 100$. The bottom panel shows the model probability with a penalty
1036: prior over model order $k$. This removes the over-fitting at small data sizes
1037: and produces an offset which must be overcome by the data before higher $k$ is
1038: selected. This example is not meant to argue for the penalty prior over model
1039: orders. In fact, Bayesian model comparison with a uniform prior does an
1040: effective job using a relatively small sample size.
1041:
1042: %
1043: \vspace{-0.125in}
1044: \subsubsection{Estimation of Entropy Rate}
1045: \vspace{-0.125in}
1046:
1047: We can also demonstrate the convergence of the average for
$\EQP=\KLd[ Q \| P ] + \hmu [Q]$ given in~\eqnref{eqn:average_info} to the
correct entropy rate for the golden mean process. We choose to show this
convergence for all orders $k=1$--$4$ discussed in the previous section. This
exercise demonstrates that all orders greater than or equal to $k = 1$
effectively capture the entropy rate. However, the convergence to the correct
values for higher-order $k$ takes more data because of a larger initial value of
$\KLd[ Q \| P ]$. This larger value is simply due to the larger number of
1055: parameters for higher-order Markov chains.
1056:
1057: %% details-
1058: %% code: EntropyEstimation.py in MarkovChainPaper_Code folder.
1059: %% parameters: length_min=50, length_max=5000, step=50, k=1-4
1060: %%
1061: \begin{figure}[htbp]
1062: \centering
1063: \includegraphics[width=0.98\columnwidth]{EntropyEstimates_GM.eps}
1064: \caption{The convergence of $\avg{\, E(Q,P) \, }{\rm{post}}$ to the true
1065: entropy rate $\hmu = 2/3$ bits per symbol (indicated by the gray horizontal
line) for the golden mean process. As demonstrated
in~\eqnref{eqn:average_info_asymptotic}, the conditional relative
entropy $\KLd[Q \| P ] \rightarrow 0$ as $1/N$. This results in
1069: the convergence of $\hmu [Q]$ to the true entropy rate.
1070: \label{fig:GoldenMean_InfoTheory}}
1071: \end{figure}
1072:
In evaluating the value of $\KLd[Q \| P ] + \hmu [Q]$ for different sample lengths,
1074: we expect that the \pme \, estimated $Q$ will converge to the true distribution
1075: $P$. As a result, the conditional relative entropy should go to zero with
1076: increasing $N$. For the golden mean process, the known value of the entropy
1077: rate is $\hmu =2/3$ bits per symbol. Inspection
1078: of~\figref{fig:GoldenMean_InfoTheory} demonstrates the expected convergence of the
1079: average from~\eqnref{eqn:average_info} to the true entropy rate.
1080:
1081: The result of our model comparison from the previous section could also be used
1082: in the estimation of the entropy rate. As we saw
1083: in~\figref{fig:GoldenMean_ModelComparison}, there are ranges of sample length $N$
where the probabilities of orders $k=1,2$ are both nonzero. In principle, an
1085: estimate of $\hmu$ should be made by weighting the values obtained for each
1086: $k$ by the corresponding order probability $P(\MCk \vert D, \mathcal{M})$. As
1087: we can see from~\figref{fig:GoldenMean_InfoTheory}, the estimates of the entropy
1088: rate for $k=1,2$ are also very similar in this range of $N$. As a result, this
1089: additional step would not have a large effect for entropy rate estimation.
1090:
1091: %%
1092: \subsection{Even process: Out-of-class modeling}
1093:
1094: We now consider a more difficult data source called the \emph{even process}.
1095: The defining labeled transition matrices are given by
1096: \begin{equation}
1097: \label{eqn:label_transition_matrix_even}
1098: T^{(0)} = \left[ \begin{array}{cc}
1099: 1/2 & 0 \\
1100: 0 & 0
1101: \end{array}
1102: \right] \; , \;
1103: T^{(1)} = \left[ \begin{array}{cc}
1104: 0 & 1/2 \\
1105: 1 & 0
1106: \end{array}
1107: \right]~.
1108: \end{equation}
1109:
1110: As can be seen in~\figref{fig:even}, the node-edge structure is identical to
1111: the golden mean process but the output symbols on the edges have been changed
1112: slightly. As a result of this shuffle, the states $A$ and $B$ can no longer be
1113: associated with a simple sequence of $0$'s and $1$'s. Whereas the golden mean
1114: has the irreducible set of forbidden words $\mathcal{F} = \{00\}$, the even
1115: process has a countably infinite set
1116: $\mathcal{F} = \{01^{2n+1}0: n=0,1,2,\ldots \}$
1117: \cite{Crutchfield1994}.
1118: \begin{figure}[htb]
1119: \begin{center}
1120: %options for the plot:
1121: %-states
1122: \SetStateLabelScale{1.6}
1123: \SetStateLineWidth{1.4pt}
1124: %-edges
1125: \SetEdgeLabelScale{1.4}
1126: \SetEdgeLineWidth{0.75pt}
1127:
1128: \begin{VCPicture}{(0,0)(5,2)}
1129: %states
1130: \ChgStateLabelScale{0.8}
1131: \State[A]{(1,0)}{A}
1132: \State[B]{(4,0)}{B}
1133: \ChgEdgeLabelScale{0.7}
1134: %transitions
1135: \LoopW{A}{ 0 | 1/2 }
1136: \LArcR[0.5]{B}{A}{ 1 | 1 }
1137: \LArcR[0.5]{A}{B}{ 1 | 1/2 }
1138: \end{VCPicture}
1139: \end{center}
1140: \vspace{0.5in}
1141: \caption{Deterministic hidden Markov chain representation of the even process.
1142: This process cannot be represented as a finite-order (nonhidden) Markov chain
1143: over the output symbols $0$s and $1$s. The set of irreducible forbidden words
1144: $\mathcal{F} = \{01^{2n+1}0: n=0,1,2,\ldots \}$ reflects the fact that the
1145: process generates blocks of $1$'s, bounded by $0$s, that are \emph{even} in
1146: length, at any length.
1147: \label{fig:even}}
1148: \end{figure}
1149:
1150: In simple terms, the even process produces blocks of $1$'s which are even in
1151: length. This is a much more complicated type of memory than we saw in
1152: the golden mean process. For the Markov chain model class, where a word of
1153: length $k$ is used to predict the next letter, this would require an
1154: infinite-order $k$. It would be necessary to keep track of all even and odd
1155: strings of $1$'s, irrespective of the length. As a result, the properties of
1156: the even process mean that a finite Markov chain \textit{cannot} represent
1157: this data source.
1158:
1159: This example is then a demonstration of what can be learned in a case of
1160: out-of-class modeling. We are interested, therefore, in how well Markov
1161: chains approximate the even process. We
1162: expect that model comparison will select larger $k$ as the size of the data
sample increases. Does the model selection tell us anything about the
1164: underlying data source despite the inability to exactly capture its properties?
1165: As we will see, we do obtain intriguing hints of the true nature of the even
1166: process from model comparison. Finally, can we estimate the entropy rate of
1167: the process with a Markov chain? As we will see, a high $k$ is needed to do
1168: this effectively.
1169:
1170: %
1171: \subsubsection{Estimation of $M_1$ Parameters}
1172:
1173: In this section we consider an $M_1$ approximation of the even process.
1174: We expect the resulting model to accurately capture length-$2$ word
1175: probabilities as $N$ increases. In this example, we consider the \emph{true}
1176: model to be the best approximation possible by a $k=1$ Markov chain. From the
1177: labeled transition matrices given above we can calculate the appropriate
1178: values for $p(0\vert 1)$ and $p(1\vert 0)$ using the methods described above.
1179: Starting from the asymptotic distribution $\vec{\pi} = \left[ p(A), p(B)\right]
1180: = \left[ 2/3, 1/3 \right]$ we obtain $p(0\vert 1)=p(10)/p(1)=1/4$ and $p(1\vert
1181: 0)=p(01)/p(0)=1/2$.
1182:
1183: As we can see from~\figref{fig:Even_ParameterEstimates}, a first-order Markov
1184: chain can be inferred without difficulty. The values obtained are exactly as
1185: expected. However, these values do not tell us much about the nature
1186: of the data source by themselves. This points to the important role of model
1187: comparison and entropy rate estimation in understanding this data.
1188:
1189: %% details-
1190: %% code: MarginalPosterior.py in MarkovChainPaper_Code folder.
1191: %% parameters: marginal density plotted for N=50,100,200,400.
1192: %%
1193: \begin{figure}[htbp]
1194: \centering
1195: \includegraphics[width=0.98\columnwidth]{MarginalPosterior_EVEN.eps}
1196: \caption{A plot of the inference of $M_1$ model parameters for the even
1197: process. For a variety of sample sizes $N$, the marginal posterior for
1198: $p(0\vert 1)$ (top panel) and $p(1\vert 0)$ (bottom panel) are shown. The
1199: \textit{true} values of the parameters are $p(0\vert 1)=1/4$ and
1200: $p(1\vert 0) = 1/2$.
1201: \label{fig:Even_ParameterEstimates}}
1202: \end{figure}
1203:
1204: %
1205: \subsubsection{Selecting the Model Order $k$}
1206:
Now consider the selection of Markov chain order $k=1$--$4$ for a range of data
1208: sizes $N$. Recall that the even process cannot be represented by a finite-order
1209: Markov chain over the output symbols $0$ and $1$. As a consequence, we expect
1210: higher $k$ to be selected with increasing data $N$, as more data statistically
1211: justifies more complex models. This is what happens, in fact, but the way in
1212: which orders are selected as we increase $N$ provides structural information
1213: we could not obtain from the inference of a Markov chain of fixed order.
1214:
1215: %% details-
1216: %% code: ModelComparison.py in MarkovChainPaper_Code folder.
1217: %% parameters: length_min=100, length_max=1000, step=5
1218: %%
1219: \begin{figure}[htbp]
1220: \centering
1221: \includegraphics[width=0.98\columnwidth]{ModelCompare_EVEN.eps}
\caption{Model comparison for Markov chains of order $k=1$--$4$ for
1223: average data from the even process. The top panel shows the model
1224: comparison with a uniform prior over the possible orders $k$. The bottom
1225: panel demonstrates model comparison with a penalty for the number of model
1226: parameters. In both cases the $k=4$ model is chosen over lower orders as the
1227: amount of data available increases.
1228: \label{fig:Even_ModelComparison}}
1229: \end{figure}
1230:
1231: If we consider~\figref{fig:Even_ModelComparison}, an interesting pattern becomes
1232: apparent. Orders with even $k$ are preferred over odd. In this way model
1233: selection is hinting at the underlying structure of the source. The Markov
1234: chain model class cannot represent the even process in a compact way, but
1235: inference and model comparison combined provide useful information about
1236: the hidden structure of the source.
1237:
In this example we also have regions where multiple orders $k$
are equally probable. The sample size at which this occurs depends on the prior
1240: over orders which is employed. When this happens, properties estimated from the
1241: Markov chain model class should use a weighted sum of the various orders. As we
1242: will see in the estimation of entropy rates, this is not as critical. At
1243: sample sizes where the order probabilities are similar, the estimated entropy
1244: rates are also similar.
1245:
1246: %
1247: \subsubsection{Estimation of Entropy Rate}
1248:
1249: Entropy rate estimation for the even process turns out to be a more
1250: difficult task than one might expect. In~\figref{fig:Even_InfoTheory} we see
1251: that Markov chains of orders $1$--$6$ are unable to effectively capture the true
1252: entropy rate. In fact, experience shows that an order $k=10$ Markov chain or
1253: higher is needed to get close to the true value of $\hmu = 2/3$ bits per symbol.
1254: Note also the factor of $20$ longer realizations that are required compared,
1255: say, to the golden mean example.
1256:
1257: %% details-
1258: %% code: EntropyEstimation.py in MarkovChainPaper_Code folder.
1259: %% parameters: length_min=100, length_max=20000, step=100, k=1-6
1260: %%
1261: \begin{figure}[htbp]
1262: \centering
1263: \includegraphics[width=0.98\columnwidth]{EntropyEstimates_EVEN.eps}
1264: \caption{The convergence of $\avg{\, D[Q \| P ] + \hmu [Q] \, }{\rm{post}}$
1265: to the true entropy rate $\hmu = 2/3$ bits per symbol for the even
1266: process. The true value is indicated by the horizontal gray line. Experience
1267: shows that a $k=10$ Markov chain is needed to effectively approximate the true
1268: value of $\hmu$.
1269: \label{fig:Even_InfoTheory}}
1270: \end{figure}
1271:
1272: As discussed above, a weighted sum of
1273: $\avg{\, D[Q \| P ] + \hmu [Q] \, }{\rm{post}}$ could be employed in this
1274: example. For the estimate this is not critical because the different orders
1275: provide roughly the same value at these points. In fact, these points
1276: correspond to where the estimates of $E(Q,P)$ cross
1277: in~\figref{fig:Even_InfoTheory}. They are sample sizes where apparent
1278: randomness can be explained by structure and increased order $k$.
1279:
1280:
1281: %%
1282: \subsection{Simple Nondeterministic Source: Out-of-class modeling}
1283:
1284: The simple nondeterministic source adds another level of challenge to inference.
1285: As its name suggests, it is described by a nondeterministic HMM.
1286: Considering~\figref{fig:sns} we can see that a $1$ is produced on every
1287: transition except for the $B \rightarrow A$ edge. This means there are many
1288: paths through the internal states that produce the same observable sequence of
1289: $0$s and $1$s. The defining labeled transition matrices for this process are
1290: given by
1291: \begin{equation}
1292: \label{eqn:label_transition_matrix_sns}
1293: T^{(0)} = \left[ \begin{array}{cc}
1294: 0 & 0 \\
1295: 1/2 & 0
1296: \end{array}
1297: \right] \; , \;
1298: T^{(1)} = \left[ \begin{array}{cc}
1299: 1/2 & 1/2 \\
1300: 0 & 1/2
1301: \end{array}
1302: \right]~.
1303: \end{equation}
1304:
1305: Using the state-to-state transition matrix $T=T^{(0)}+T^{(1)}$, we find the
1306: asymptotic distribution for the hidden states to be
1307: $\vec{\pi} = \left[ p(A), p(B) \right] = \left[1/2, 1/2 \right]$. Each of
1308: the hidden states is equally likely; however, a $1$ is always produced from
1309: state $A$, while there is an equal chance of obtaining a $0$
1310: or $1$ from state $B$.
1311:
1312: \begin{figure}[htb]
1313: \begin{center}
1314: %options for the plot:
1315: %-states
1316: \SetStateLabelScale{1.6}
1317: \SetStateLineWidth{1.4pt}
1318: %-edges
1319: \SetEdgeLabelScale{1.4}
1320: \SetEdgeLineWidth{0.75pt}
1321:
1322: \begin{VCPicture}{(0,0)(5,2)}
1323: %states
1324: \ChgStateLabelScale{0.8}
1325: \State[A]{(1,0)}{A}
1326: \State[B]{(4,0)}{B}
1327: \ChgEdgeLabelScale{0.7}
1328: %transitions
1329: \LoopW{A}{ 1 | 1/2 }
1330: \LoopE{B}{ 1 | 1/2 }
1331: \LArcR[0.5]{B}{A}{ 0 | 1/2 }
1332: \LArcR[0.5]{A}{B}{ 1 | 1/2 }
1333: \end{VCPicture}
1334: \end{center}
1335: \vspace{0.5in}
1336: \caption{A hidden Markov chain representation of the simple nondeterministic
1337: process. This example also cannot be represented as a finite-order Markov
1338: chain over outputs $0$ and $1$. It, however, is more complicated than the
1339: two previous examples: Only the observation of a $0$ provides the observer
1340: with information regarding the internal state of the underlying process;
1341: observing a $1$ leaves the internal state ambiguous.
1342: \label{fig:sns}}
1343: \end{figure}
1344:
1345: %
1346: \subsubsection{Estimation of $M_1$ Parameters}
1347:
1348: Using the asymptotic distribution derived above, the parameters of an inferred
1349: first-order Markov chain should approach $p(0\vert 1)=p(10)/p(1)=1/3$ and
1350: $p(1\vert 0)=p(01)/p(0)=1$. As we can see
1351: from~\figref{fig:SNS_ParameterEstimates}, the inference
1352: process captures these values very effectively despite the out-of-class data
1353: source.
1354:
1355: %% details-
1356: %% code: MarginalPosterior.py in MarkovChainPaper_Code folder.
1357: %% parameters: marginal density plotted for N=50,100,200,400.
1358: %%
1359: \begin{figure}[htbp]
1360: \centering
1361: \includegraphics[width=0.98\columnwidth]{MarginalPosterior_SNS.eps}
1362: \caption{Marginal density for $M_1$ model parameters for the
1363: simple nondeterministic process: The curves for each data size $N$
1364: demonstrate a well-behaved convergence to the correct values:
1365: $p(0\vert 1)=1/3$ and $p(1\vert 0) = 1$.
1366: \label{fig:SNS_ParameterEstimates}}
1367: \end{figure}
1368:
1369: %
1370: \subsubsection{Selecting the Model Order $k$}
1371:
1372: Here we consider the comparison of Markov chain models of orders $k=1$--$4$ when
1373: applied to data from the simple nondeterministic source. As with the even
1374: process, we expect increasing order to be selected as the amount of available
1375: data increases. In~\figref{fig:SNS_ModelComparison} we see that this is
1376: exactly what happens.
1377:
1378: %% details-
1379: %% code: ModelComparison.py in MarkovChainPaper_Code folder.
1380: %% parameters: length_min=100, length_max=1.5e5, step=50
1381: %%
1382: \begin{figure}[htbp]
1383: \centering
1384: \includegraphics[width=0.98\columnwidth]{ModelCompare_SNS.eps}
1385: \caption{Model comparison for Markov chains of order $k=1$--$4$ for
1386: data from the simple nondeterministic process. The top panel
1387: shows the model comparison with a uniform prior over the possible orders
1388: $k$. The bottom panel demonstrates model comparison with a penalty for the
1389: number of model parameters. Note the scale on the horizontal axis---it
1390: takes much more data for the model comparison to pick out higher orders
1391: for this process compared to the previous examples.
1392: \label{fig:SNS_ModelComparison}}
1393: \end{figure}
1394:
1395: Unlike the even process, there is no preference for even orders. Instead, we
1396: observe a systematic increase in order with larger data sets. We do note that
1397: the amount of data needed to select a higher order does seem to be larger than for
1398: the even process. Here the distribution over words is more important and more
1399: subtle than the support of the distribution (those words with positive
1400: probability).
1401:
1402: %
1403: \subsubsection{Estimation of Entropy Rate}
1404:
1405: Estimation of the entropy rate for the simple nondeterministic source provides
1406: an interesting contrast to the previous examples. As discussed when introducing
1407: the examples, this data source is a nondeterministic HMM and the entropy rate
1408: cannot be directly calculated using~\eqnref{eqn:entropy_rate}
1409: \cite{Blackwell1957}. However, a
1410: value of $\hmu \approx 0.677867$ bits per symbol has been obtained
1411: in~\cite{Crutchfield1994}.
1412:
1413: %% details-
1414: %% code: EntropyEstimation.py in MarkovChainPaper_Code folder.
1415: %% parameters: length_min=100, length_max=20000, step=100, k=1-6
1416: %%
1417: \begin{figure}[htbp]
1418: \centering
1419: \includegraphics[width=0.98\columnwidth]{EntropyEstimates_SNS.eps}
1420: \caption{The convergence of $\avg{\, D[Q \| P ] +
1421: \hmu [Q] \, }{\rm{post}}$ to the true entropy rate $\hmu \approx 0.677867$
1422: bits per symbol for the simple nondeterministic source. The true value is
1423: indicated by the gray horizontal line.
1424: \label{fig:SNS_InfoTheory}}
1425: \end{figure}
1426:
1427: \begfigref{fig:SNS_InfoTheory} shows the results of entropy-rate estimation
1428: using Markov chains of order $k=1$--$6$. These results demonstrate that the
1429: entropy rate can be effectively estimated with low-order $k$ and relatively
1430: small data samples. This is an interesting result, as we might expect
1431: estimation of the entropy rate to be most difficult in this example. Instead we
1432: find that the even process was a more difficult test case.
1433:
1434: %%
1435: %%
1436: \section{Discussion}
1437:
1438: The examples presented above provide several interesting lessons in inference,
1439: model comparison, and estimating randomness. The combination of these three
1440: ideas applied to a data source provides information and intuition about the
1441: structure of the underlying system, even when modeling out-of-class processes.
1442:
1443: In the examples of $\MC_{1}$ estimates for each of the sources we see that
1444: the Bayesian methods provide a powerful and consistent description of Markov
1445: chain model parameters. The marginal density accurately describes the
1446: uncertainty associated with these estimates, reflecting asymmetries which point
1447: estimation with error bars cannot capture. In addition, methods described
1448: in~\appref{app:dirichlet} can be used to generate regions of confidence of any
1449: type.
1450:
1451: Although the estimates obtained for the Markov chain model parameters were
1452: consistent with the data source for words up to length $k+1$, they did not capture
1453: the true nature of the system under consideration. This demonstrates that
1454: estimation of model parameters without some kind of model comparison can be very
1455: misleading. Only with the comparison of different orders did some indication
1456: of the true properties of the data source become clear. Without this step,
1457: misguided interpretations are easily obtained.
1458:
1459: For the golden mean process, a $k=1$ Markov chain, the results of model
1460: comparison were predictably uninteresting. This is a good indication that the
1461: correct model class is being employed. However, with the even process a much
1462: more complicated model comparison was found. In this case, a selection of even
1463: $k$ over odd hinted at the distinguishing properties of the source. In a
1464: similar way, the results of model comparison for the simple nondeterministic
1465: source selected increasing order with larger $N$. In both out-of-class
1466: modeling examples, the increase in selected order without end is a good
1467: indication that the data source is not in the Markov chain class. (A parallel
1468: technique is found in \emph{hierarchical $\epsilon$-machine reconstruction}
1469: \cite{Crutchfield1994}.) Alternatively, there is an indication that
1470: very high-order dependencies are important in the description of the process.
1471: Either way, this information is important since it gives an indication to the
1472: modeler that a more complicated dynamic is at work and all results must be
1473: treated with caution.
1474:
1475: Finally, we considered the estimation of entropy rates for the example data
1476: sources. In two of the cases, the golden mean process and the simple
1477: nondeterministic source, short data streams were adequate. This is not
1478: unexpected for the golden mean, but for the simple nondeterministic source this
1479: might be considered surprising. For the even process, the estimation of the
1480: entropy rate was markedly more difficult. For this data source, the countably
1481: infinite number of forbidden words makes the support of the word distribution
1482: at a given length important. As a result, a larger amount of data and a
1483: higher-order Markov chain are needed to find a decent estimate of randomness
1484: from that data source. In this way, each of the steps in Bayesian
1485: inference allows one to separate structure from randomness.
1486:
1487: %%
1488: %%
1489: \section{Conclusion}
1490:
1491: We considered Bayesian inference of $k$-th order Markov chain
1492: models. This included estimating model parameters for a given $k$, model
1493: comparison between orders, and estimation of randomness in the form of entropy
1494: rates. In most approaches to inference, these three aspects are treated as
1495: separate, but related endeavors. However, we find them to be intimately
1496: related. An estimate of model parameters without a sense of whether the
1497: correct model is being used is misguided at best. Model comparison
1498: provides a window into this problem by comparing various orders $k$ within the
1499: model class. Finally, estimating randomness in the form of an entropy rate
1500: provides more information about the trade-off between structure and randomness.
1501: To do this we developed a connection to the statistical mechanical partition
1502: function, from which averages and variances were directly calculable. For the
1503: even process, structure was perceived as randomness, whereas for the simple
1504: nondeterministic source
1505: randomness was easily estimated but structure was more difficult to find.
1506: These insights, despite the out-of-class data, demonstrate the power of
1507: combining these three methods into one effective tool for investigating
1508: structure and randomness in finite strings of discrete data.
1509:
1510: %
1511: % acknowledgments
1512: %
1513: \section*{Acknowledgments}
1514: This work was partially supported at the Center for Computational Science
1515: and Engineering at the University of California at Davis by Intel
1516: Corporation. Work at the Santa Fe Institute was supported under its
1517: Computation, Dynamics, and Inference Program core grants from the
1518: National Science and MacArthur Foundations. C.S. and A.H. acknowledge
1519: support by the National Science Foundation Grant DMS 03-25939 ITR.
1520:
1521: %
1522: % appendices
1523: %
1524: \appendix
1525:
1526: %
1527: % Dirichlet Appendix
1528: %
1529: \section{}
1530: \label{app:Dirichlet}
1531:
1532: \subsection{Dirichlet Distribution\label{app:dirichlet}}
1533:
1534: We supply a brief overview of the Dirichlet distribution for completeness. For
1535: more information, a reference such as~\cite{Wilks1962} should be consulted. In
1536: simple terms, the Dirichlet distribution is the multinomial generalization of
1537: the Beta distribution. The probability density function for $q$ elements is
1538: given by
1539: \begin{equation}
1540: \label{eqn:dirichlet_pdf}
1541: \text{Dir}( \{ p_{i} \} )
1542: =
1543: \frac{ \Gamma( \alpha ) }{\prod_{i=0}^{q-1} \Gamma( \alpha_{i} ) }
1544: \delta(1-\sum_{i=0}^{q-1} p_{i})
1545: \prod_{i=0}^{q-1} p_{i}^{\alpha_{i}-1}.
1546: \end{equation}
1547:
1548: The variates must satisfy $p_i \in [0,1]$ and $\sum_{i=0}^{q-1} p_{i} = 1$. The
1549: hyperparameters $\{ \alpha_{i} \}$ of the distribution must be real and
1550: positive, and we use the notation $\alpha = \sum_{i=0}^{q-1} \alpha_{i}$. The
1551: average, variance, and covariance of the parameters $p_{i}$ are
1552: given by, respectively,
1553: \begin{eqnarray}
1554: \avg{p_{j}}{} & = & \frac{ \alpha_{j} }{ \alpha },
1555: \label{eqn:dirichlet_average}\\
1556: \var{p_{j}}{} & = & \frac{ \alpha_{j}\left( \alpha - \alpha_{j} \right)
1557: }{ \alpha^{2} \left( 1+ \alpha \right) },
1558: \label{eqn:dirichlet_variance}\\
1559: \cov{p_{j}}{p_{l}} & = & - \frac{ \alpha_{j} \alpha_{l}
1560: }{ \alpha^{2} \left( 1+ \alpha \right) } \; , \; j \neq l.
1561: \label{eqn:dirichlet_covariance}
1562: \end{eqnarray}
1563:
1564: %%
1565: %%
1566: \subsection{Marginal distributions\label{app:dirichlet_marginal}}
1567:
1568: An important part of understanding uncertainty in the inference process is the
1569: ability to find regions of confidence from a marginal density. The marginal is
1570: obtained from the posterior by integrating out the dependence on all parameters
1571: except for the parameter of interest. For a Dirichlet distribution, the
1572: marginal density is known to be a Beta distribution~\cite{Wilks1962},
1573: \begin{equation}
1574: \label{eqn:beta_pdf}
1575: \text{Beta}( p_{i} )
1576: =
1577: \frac{ \Gamma( \alpha ) }{\Gamma( \alpha_{i} ) \Gamma( \alpha - \alpha_{i} ) }
1578: p_{i}^{\alpha_{i}-1} \left( 1 - p_{i} \right)^{\alpha - \alpha_{i}-1}.
1579: \end{equation}
1580:
1581: %%
1582: %%
1583: \subsection{Regions of confidence from the marginal density}
1584:
1585: From the marginal density provided in~\eqnref{eqn:beta_pdf} a cumulative
1586: distribution function can be obtained using the incomplete Beta integral
1587: \begin{equation}
1588: \Pr(p_{i} \leq x) = \int_{0}^{x} \, dp_{i} \, \text{Beta}(p_{i}) ~.
1589: \label{eqn:beta_cdf}
1590: \end{equation}
1591: Using this form, the probability that a Markov chain parameter will be between
1592: $a$ and $b$ can be found using $\Pr( a \leq p_{i} \leq b) = \Pr( p_{i} \leq b) -
1593: \Pr( p_{i} \leq a)$. For a confidence level $R$, between zero and one, we then
1594: want to find $(a,b)$ such that $R=\Pr( a \leq p_{i} \leq b)$. The incomplete
1595: Beta integral and its inverse can be found using computational methods,
1596: see~\cite{Majumder1973,Majumder1973a,Cran1977,Berry1990} for details.
1597:
1598: %
1599: % the bibliography
1600: %
1601: \begin{thebibliography}{29}
1602: \expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi
1603: \expandafter\ifx\csname bibnamefont\endcsname\relax
1604: \def\bibnamefont#1{#1}\fi
1605: \expandafter\ifx\csname bibfnamefont\endcsname\relax
1606: \def\bibfnamefont#1{#1}\fi
1607: \expandafter\ifx\csname citenamefont\endcsname\relax
1608: \def\citenamefont#1{#1}\fi
1609: \expandafter\ifx\csname url\endcsname\relax
1610: \def\url#1{\texttt{#1}}\fi
1611: \expandafter\ifx\csname urlprefix\endcsname\relax\def\urlprefix{URL }\fi
1612: \providecommand{\bibinfo}[2]{#2}
1613: \providecommand{\eprint}[2][]{\url{#2}}
1614:
1615: \bibitem[{\citenamefont{Avery and Henderson}(1999)}]{Avery1999}
1616: \bibinfo{author}{\bibfnamefont{P.~J.} \bibnamefont{Avery}} \bibnamefont{and}
1617: \bibinfo{author}{\bibfnamefont{D.~A.} \bibnamefont{Henderson}},
1618: \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{48}},
1619: \bibinfo{pages}{53 } (\bibinfo{year}{1999}).
1620:
1621: \bibitem[{\citenamefont{Liu and Lawrence}(1999)}]{JSLiu1999}
1622: \bibinfo{author}{\bibfnamefont{J.~S.} \bibnamefont{Liu}} \bibnamefont{and}
1623: \bibinfo{author}{\bibfnamefont{C.~E.} \bibnamefont{Lawrence}},
1624: \bibinfo{journal}{Bioinformatics} \textbf{\bibinfo{volume}{15}},
1625: \bibinfo{pages}{38 } (\bibinfo{year}{1999}).
1626:
1627: \bibitem[{\citenamefont{Crutchfield and Feldman}(1997)}]{Crutchfield1997}
1628: \bibinfo{author}{\bibfnamefont{J.~P.} \bibnamefont{Crutchfield}}
1629: \bibnamefont{and} \bibinfo{author}{\bibfnamefont{D.~P.}
1630: \bibnamefont{Feldman}}, \bibinfo{journal}{Phys. Rev. E}
1631: \textbf{\bibinfo{volume}{55}}, \bibinfo{pages}{R1239 }
1632: (\bibinfo{year}{1997}).
1633:
1634: \bibitem[{\citenamefont{MacKay and Peto}(1994)}]{MacKay1994}
1635: \bibinfo{author}{\bibfnamefont{D.~J.~C.} \bibnamefont{MacKay}}
1636: \bibnamefont{and} \bibinfo{author}{\bibfnamefont{L.~C.~B.}
1637: \bibnamefont{Peto}}, \bibinfo{journal}{Nat. Lang. Eng.}
1638: \textbf{\bibinfo{volume}{1}} (\bibinfo{year}{1994}).
1639:
1640: \bibitem[{\citenamefont{Crutchfield and Packard}(1983)}]{Crutchfield1983}
1641: \bibinfo{author}{\bibfnamefont{J.~P.} \bibnamefont{Crutchfield}}
1642: \bibnamefont{and} \bibinfo{author}{\bibfnamefont{N.~H.}
1643: \bibnamefont{Packard}}, \bibinfo{journal}{Physica D}
1644: \textbf{\bibinfo{volume}{7D}}, \bibinfo{pages}{201 } (\bibinfo{year}{1983}).
1645:
1646: \bibitem[{\citenamefont{Hao and Zheng}(1998)}]{BLHao1998}
1647: \bibinfo{author}{\bibfnamefont{B.-L.} \bibnamefont{Hao}} \bibnamefont{and}
1648: \bibinfo{author}{\bibfnamefont{W.-M.} \bibnamefont{Zheng}},
1649: \emph{\bibinfo{title}{Applied Symbolic Dynamics and Chaos}}
1650: (\bibinfo{publisher}{World Scientific}, \bibinfo{year}{1998}).
1651:
1652: \bibitem[{\citenamefont{Anderson and Goodman}(1957)}]{TWAnderson1957}
1653: \bibinfo{author}{\bibfnamefont{T.~W.} \bibnamefont{Anderson}} \bibnamefont{and}
1654: \bibinfo{author}{\bibfnamefont{L.~A.} \bibnamefont{Goodman}},
1655: \bibinfo{journal}{Ann. Math. Stat.} \textbf{\bibinfo{volume}{28}},
1656: \bibinfo{pages}{89 } (\bibinfo{year}{1957}).
1657:
1658: \bibitem[{\citenamefont{Billingsley}(1961)}]{Billingsley1961a}
1659: \bibinfo{author}{\bibfnamefont{P.}~\bibnamefont{Billingsley}},
1660: \bibinfo{journal}{Ann. Math. Stat.} \textbf{\bibinfo{volume}{32}},
1661: \bibinfo{pages}{12 } (\bibinfo{year}{1961}).
1662:
1663: \bibitem[{\citenamefont{Chatfield}(1973)}]{Chatfield1973}
1664: \bibinfo{author}{\bibfnamefont{C.}~\bibnamefont{Chatfield}},
1665: \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{22}},
1666: \bibinfo{pages}{7} (\bibinfo{year}{1973}).
1667:
1668: \bibitem[{\citenamefont{Tong}(1975)}]{HTong1975}
1669: \bibinfo{author}{\bibfnamefont{H.}~\bibnamefont{Tong}}, \bibinfo{journal}{Jour.
1670: Appl. Prob.} \textbf{\bibinfo{volume}{12}}, \bibinfo{pages}{488 }
1671: (\bibinfo{year}{1975}).
1672:
1673: \bibitem[{\citenamefont{Katz}(1981)}]{Katz1981}
1674: \bibinfo{author}{\bibfnamefont{R.~W.} \bibnamefont{Katz}},
1675: \bibinfo{journal}{Technometrics} \textbf{\bibinfo{volume}{23}},
1676: \bibinfo{pages}{243 } (\bibinfo{year}{1981}).
1677:
1678: \bibitem[{\citenamefont{Rissanen}(1984)}]{JRissanen1984}
1679: \bibinfo{author}{\bibfnamefont{J.}~\bibnamefont{Rissanen}},
1680: \bibinfo{journal}{IEEE Trans. Inform. Theory} \textbf{\bibinfo{volume}{30}},
1681: \bibinfo{pages}{629} (\bibinfo{year}{1984}).
1682:
1683: \bibitem[{\citenamefont{Vapnik}(1999)}]{VVapnik1999}
1684: \bibinfo{author}{\bibfnamefont{V.}~\bibnamefont{Vapnik}},
1685: \bibinfo{journal}{IEEE Trans. Neur. Net.} \textbf{\bibinfo{volume}{10}},
1686: \bibinfo{pages}{988} (\bibinfo{year}{1999}).
1687:
1688: \bibitem[{\citenamefont{Vit{\'a}nyi and Li}(2000)}]{Vitanyi2000}
1689: \bibinfo{author}{\bibfnamefont{P.~M.} \bibnamefont{Vit{\'a}nyi}}
1690: \bibnamefont{and} \bibinfo{author}{\bibfnamefont{M.}~\bibnamefont{Li}},
1691: \bibinfo{journal}{IEEE Trans. Inform. Theory}
1692: \textbf{\bibinfo{volume}{46(2)}}, \bibinfo{pages}{446}
1693: (\bibinfo{year}{2000}).
1694:
1695: \bibitem[{\citenamefont{Baldi and Brunak}(2001)}]{Baldi2001}
1696: \bibinfo{author}{\bibfnamefont{P.}~\bibnamefont{Baldi}} \bibnamefont{and}
1697: \bibinfo{author}{\bibfnamefont{S.}~\bibnamefont{Brunak}},
1698: \emph{\bibinfo{title}{Bioinformatics: The Machine Learning Approach}}
1699: (\bibinfo{publisher}{MIT Press}, \bibinfo{address}{Cambridge},
1700: \bibinfo{year}{2001}).
1701:
1702: \bibitem[{\citenamefont{Durbin et~al.}(1998)\citenamefont{Durbin, Eddy, Krogh,
1703: and Mitchison}}]{Durbin1998}
1704: \bibinfo{author}{\bibfnamefont{R.}~\bibnamefont{Durbin}},
1705: \bibinfo{author}{\bibfnamefont{S.}~\bibnamefont{Eddy}},
1706: \bibinfo{author}{\bibfnamefont{A.}~\bibnamefont{Krogh}}, \bibnamefont{and}
1707: \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Mitchison}},
1708: \emph{\bibinfo{title}{Biological Sequence Analysis}}
1709: (\bibinfo{publisher}{Cambridge University Press},
1710: \bibinfo{address}{Cambridge}, \bibinfo{year}{1998}).
1711:
1712: \bibitem[{\citenamefont{Cover and Thomas}(1991)}]{Cover1991}
1713: \bibinfo{author}{\bibfnamefont{T.~M.} \bibnamefont{Cover}} \bibnamefont{and}
1714: \bibinfo{author}{\bibfnamefont{J.~A.} \bibnamefont{Thomas}},
1715: \emph{\bibinfo{title}{Elements of Information Theory}}
1716: (\bibinfo{publisher}{Wiley-Interscience}, \bibinfo{address}{New York},
1717: \bibinfo{year}{1991}).
1718:
1719: \bibitem[{\citenamefont{MacKay}(2003)}]{MacKay2003}
1720: \bibinfo{author}{\bibfnamefont{D.~J.~C.} \bibnamefont{MacKay}},
1721: \emph{\bibinfo{title}{Information Theory, Inference, and Learning
1722: Algorithms}} (\bibinfo{publisher}{Cambridge University Press},
1723: \bibinfo{address}{Cambridge}, \bibinfo{year}{2003}).
1724:
1725: \bibitem[{\citenamefont{Samengo}(2002)}]{Samengo2002}
1726: \bibinfo{author}{\bibfnamefont{I.}~\bibnamefont{Samengo}},
1727: \bibinfo{journal}{Phys. Rev. E} \textbf{\bibinfo{volume}{65}},
1728: \bibinfo{pages}{046124} (\bibinfo{year}{2002}).
1729:
1730: \bibitem[{\citenamefont{Young and Crutchfield}(1994)}]{Young1994}
1731: \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Young}} \bibnamefont{and}
1732: \bibinfo{author}{\bibfnamefont{J.~P.} \bibnamefont{Crutchfield}},
1733: \bibinfo{journal}{Chaos, Solitons, and Fractals}
1734: \textbf{\bibinfo{volume}{4}}, \bibinfo{pages}{5 } (\bibinfo{year}{1994}).
1735:
1736: \bibitem[{\citenamefont{Abramowitz and Stegun}(1965)}]{Abramowitz1965}
1737: \bibinfo{author}{\bibfnamefont{M.}~\bibnamefont{Abramowitz}} \bibnamefont{and}
1738: \bibinfo{author}{\bibfnamefont{I.~A.} \bibnamefont{Stegun}},
1739: \emph{\bibinfo{title}{Handbook of Mathematical Functions}}
1740: (\bibinfo{publisher}{Dover}, \bibinfo{address}{New York},
1741: \bibinfo{year}{1965}).
1742:
1743: \bibitem[{\citenamefont{Crutchfield}(1994)}]{Crutchfield1994}
1744: \bibinfo{author}{\bibfnamefont{J.~P.} \bibnamefont{Crutchfield}},
1745: \bibinfo{journal}{Physica D} \textbf{\bibinfo{volume}{75}},
1746: \bibinfo{pages}{11} (\bibinfo{year}{1994}).
1747:
1748: \bibitem[{\citenamefont{Upper}(1997)}]{Upper1997}
1749: \bibinfo{author}{\bibfnamefont{D.~R.} \bibnamefont{Upper}}, Ph.D. thesis,
1750: \bibinfo{school}{University of California}, \bibinfo{address}{Berkeley}
1751: (\bibinfo{year}{1997}), \bibinfo{note}{{P}ublished by University Microfilms
1752: Intl, Ann Arbor, Michigan}.
1753:
1754: \bibitem[{\citenamefont{Blackwell and Koopmans}(1957)}]{Blackwell1957}
1755: \bibinfo{author}{\bibfnamefont{D.}~\bibnamefont{Blackwell}} \bibnamefont{and}
1756: \bibinfo{author}{\bibfnamefont{L.}~\bibnamefont{Koopmans}},
1757: \bibinfo{journal}{Ann. Math. Stat.} \textbf{\bibinfo{volume}{28}},
1758: \bibinfo{pages}{1011} (\bibinfo{year}{1957}).
1759:
1760: \bibitem[{\citenamefont{Wilks}(1962)}]{Wilks1962}
1761: \bibinfo{author}{\bibfnamefont{S.~S.} \bibnamefont{Wilks}},
1762: \emph{\bibinfo{title}{Mathematical Statistics}} (\bibinfo{publisher}{John
1763: Wiley \& Sons, Inc.}, \bibinfo{address}{New York}, \bibinfo{year}{1962}).
1764:
1765: \bibitem[{\citenamefont{Majumder and
1766: Bhattacharjee}(1973{\natexlab{a}})}]{Majumder1973}
1767: \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Majumder}} \bibnamefont{and}
1768: \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Bhattacharjee}},
1769: \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{22}},
1770: \bibinfo{pages}{411} (\bibinfo{year}{1973}{\natexlab{a}}).
1771:
1772: \bibitem[{\citenamefont{Majumder and
1773: Bhattacharjee}(1973{\natexlab{b}})}]{Majumder1973a}
1774: \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Majumder}} \bibnamefont{and}
1775: \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Bhattacharjee}},
1776: \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{22}},
1777: \bibinfo{pages}{409} (\bibinfo{year}{1973}{\natexlab{b}}).
1778:
1779: \bibitem[{\citenamefont{Cran et~al.}(1977)\citenamefont{Cran, Martin, and
1780: Thomas}}]{Cran1977}
1781: \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Cran}},
1782: \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Martin}}, \bibnamefont{and}
1783: \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Thomas}},
1784: \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{26}},
1785: \bibinfo{pages}{111} (\bibinfo{year}{1977}).
1786:
1787: \bibitem[{\citenamefont{Berry et~al.}(1990)\citenamefont{Berry, {P.W. Mielke,
1788: Jr.}, and Cran}}]{Berry1990}
1789: \bibinfo{author}{\bibfnamefont{K.}~\bibnamefont{Berry}},
1790: \bibinfo{author}{\bibnamefont{{P.W. Mielke, Jr.}}}, \bibnamefont{and}
1791: \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Cran}},
1792: \bibinfo{journal}{Appl. Stat.} \textbf{\bibinfo{volume}{39}},
1793: \bibinfo{pages}{309} (\bibinfo{year}{1990}).
1794:
1795: \end{thebibliography}
1796:
1797:
1798: \end{document}
1799:
1800: