0805.2689/bc3.tex
1: \documentclass[aps,pre,twocolumn]{revtex4}
2: 
3: \usepackage{amssymb,amsmath,graphicx}
4: 
5: \begin{document}
6: 
7: \title{Bayesian approach to clustering real value, hypergraph and 
8: bipartite graph data:\\ solution via variational methods}
9: 
10: \author{Alexei Vazquez\\
11: The Simons Center for Systems Biology\\
12: Institute for Advanced Study, Einstein Drive, Princeton, New Jersey 08540, 
13: USA}
14: 
15: \date{\today}
16: 
17: \begin{abstract}
18: 
19: Data clustering, including problems such as finding network communities, 
20: can be put into a systematic framework by means of a Bayesian approach. 
21: The application of Bayesian approaches to real problems can be, however, 
22: quite challenging. In most cases the solution is explored via Monte Carlo 
23: sampling or variational methods. Here we work further on the application 
24: of variational methods to clustering problems. We introduce generative 
25: models based on a hidden group structure and prior distributions. We 
26: extend previous attempts by Jaynes, and derive the prior distributions 
27: based on symmetry arguments. As a case study we address the problems of 
28: two-sides clustering real value data and clustering data represented by a 
29: hypergraph or bipartite graph. From the variational calculations, and 
30: depending on the starting statistical model for the data, we derive a 
31: variational Bayes algorithm, a generalized version of the expectation 
32: maximization algorithm with a built in penalization for model complexity 
33: or bias. We demonstrate the good performance of the variational Bayes 
34: algorithm using test examples.
35: 
36: \end{abstract}
37: 
38: \maketitle
39: 
40: \bibliographystyle{apsrev}
41: 
42: \section{Introduction}
43: 
44: Mixture models provide an intuitive statistical representation of datasets 
45: structured in groups, clusters or classes \cite{maclachlan00}. A complex 
46: dataset is decomposed into the superposition of simpler datasets. The 
47: inverse problem consists in determining the group decomposition and the 
48: statistical parameters characterizing each group. For a fixed number of 
49: groups the expectation maximization (EM) algorithm provides a recursive 
50: solution to the inverse problem \cite{dempster77}. The estimation of the 
51: right number of groups has been, however, a great challenge. Corrections 
52: such as the Akaike information criterion (AIC) \cite{akaike74} and the 
53: Bayesian information criterion (BIC) \cite{schwarz78} have been derived, 
54: penalizing model complexity and overfitting. Yet, the number of groups 
55: estimated from these criteria is in general unsatisfactory.
56: 
57: In contrast, a Bayesian approach would not attempt to estimate what is the 
58: ``optimal'' number of groups, but instead average over models with a 
59: different number of groups \cite{jeffreys39}. The Bayesian approach is 
60: becoming a popular technique to solve problems in data analysis, model 
61: selection and hypothesis testing \cite{spirtes00,mackay03,robert07}. Many 
62: of the original ideas come from the early work of Jeffreys 
63: \cite{jeffreys39}, but it is just recently that they are starting to be 
64: used widely \cite{spirtes00,mackay03,robert07}. The application of 
65: Bayesian approaches to real problems can be, however, quite challenging. 
66: In most cases the solution is explored via Monte Carlo sampling 
67: \cite{chen00,mackay03} or variational methods 
68: \cite{mackay03,beal03,yedidia05}. The application of variational methods 
69: to Bayesian problems results in the variational Bayes (VB) algorithm 
70: \cite{mackay03,beal03}. The VB algorithm is a set of self-consistent 
71: equations analogous to the EM algorithm. They can be solved recursively 
72: obtaining an approximate solution to the inverse inference problem. These 
73: methods have been applied, for example, to Gaussian mixture models for 
74: real value data \cite{maclachlan00,rasmussen00}, Dirichlet mixture models 
75: for categorical data \cite{blei03} and the problem of finding graph 
76: modules \cite{hofman07}.
77: 
78: Here we further study the use of variational methods in the context of 
79: Bayesian approaches, focusing on data clustering problems. In the first two 
80: sections we review the Bayesian approach. In Section \ref{variational} we 
81: revisit the connection between the Bayesian formulation and statistical 
82: mechanics. In section \ref{models} we introduce the generalities of 
83: generative models with a hidden structure at the samples side and at both 
84: the samples and variables side. In Section \ref{S:priors} we extend the 
85: previous work by Jaynes \cite{jaynes68} deriving prior distributions based 
86: on symmetry properties. We report a correction to his result for the model 
87: with a location and scale parameter and an extension of his result for the 
88: binomial model to the multinomial model. In the following Sections we 
89: study the problem of two-sides clustering real value data and of 
90: clustering data represented by a hypergraph or bipartite graph. Depending 
91: on our starting statistical model, we obtain a VB algorithm. Because of 
92: their Bayesian root, the VB algorithms have a built-in correction for model 
93: complexity or bias and, therefore, they do not require the use of 
94: additional complexity criteria. The performance of the VB algorithms is 
95: tested in some examples, obtaining satisfactory results whenever there is 
96: a significant distinction between the groups.
97:  
98: \section{Bayesian approach and variational solution}\label{variational}
99: 
100: The {\em Bayesian approach} is a systematic methodology to interpret 
101: complex datasets and to evaluate model hypotheses. Its main ingredients or 
102: steps are: given a dataset $D$, (i) introduce a statistical model with 
103: model parameters, $\phi$, (ii) write down the likelihood to observe the 
104: data given the proposed model and parameters, $P(D|\phi)$, (iii) determine 
105: the prior distribution for the model parameters based on our current 
106: knowledge, $P(\phi)$, and, finally, (iv) invert the statistical model of 
107: the data given the likelihood and prior distribution to obtain the 
108: posterior distribution of the model parameters given the model and data, 
109: $P(\phi|D)$. The latter step is based on Bayes rule
110: 
111: \begin{equation}\label{Bayes-Theorem}
112: P(\phi|D) = \frac{1}{Z} P(D|\phi)P(\phi)
113: \end{equation}
114: 
115: \noindent where 
116: 
117: \begin{equation}\label{Z}
118: Z = P(D)=\int d\phi P(D|\phi)P(\phi)\ .
119: \end{equation}
120: 
121: \noindent Having obtained the distribution of the model parameters, at 
122: least formally, we can determine other magnitudes. For example, the 
123: average of a quantity $A(\phi)$ is given by
124: 
125: \begin{equation}\label{Ave}
126: \langle A(\phi)\rangle = \int d\phi P(\phi|D) A(\phi)\ .
127: \end{equation}
128: 
129: \noindent In practice calculating (\ref{Z}) or (\ref{Ave}) is a formidable 
130: task. A very powerful approximation scheme is the {\em variational method} 
131: \cite{mackay03,beal03}. The main idea of the variational method is to 
132: approximate the generally difficult to handle distribution $P(\phi|D)$ by 
133: a distribution $Q(\phi|D)$ of a more tractable form. In the following we 
134: omit the dependency of $Q$ on $D$ and just write $Q(\phi)$. Given 
135: $Q(\phi)$ we can obtain a bound for $F=-\ln Z$ using Jensen's inequality
136: 
137: \begin{eqnarray}\label{Jensen}
138: F & = & -\ln Z
139: \nonumber\\
140:  & = & -\ln \int d\phi Q(\phi) 
141: \frac{P(D|\phi)P(\phi)}{Q(\phi)}
142: \nonumber\\
143:  & \leq & -\int d\phi Q(\phi) 
144: \ln \frac{P(D|\phi)P(\phi)}{Q(\phi)}
145: \end{eqnarray}
146: 
147: \noindent The latter equation can be rewritten as \cite{mackay03}
148: 
149: \begin{equation}\label{F}
150: F \leq U - TS
151: \end{equation}
152: 
153: \noindent where $T=1$,
154: 
155: \begin{equation}\label{U}
156: U = -\int d\phi Q(\phi) \ln P(D|\phi)
157: \end{equation}
158: 
159: \noindent is minus the average log likelihood and
160: 
161: \begin{equation}\label{S}
162: S = - \int d\phi Q(\phi) \ln 
163: \frac{Q(\phi)}{P(\phi)}
164: \end{equation}
165: 
166: \noindent is minus the Kullback-Leibler divergence of $Q(\phi)$ relative to the 
167: prior distribution $P(\phi)$ \cite{kullback59}. Equation (\ref{F}) 
168: resembles the usual free energy in statistical mechanics: $F = U - TS$, 
169: where $U$, $S$ and $T$ are the internal energy, entropy and temperature of 
170: the system, the temperature being expressed in units of the Boltzmann 
171: constant $k_{\rm B}$. Minus the average log likelihood plays the role of 
172: the internal energy, minus the Kullback-Leibler divergence of $Q(\phi)$ plays 
173: the role of the entropy and the temperature equals one.
174: 
175: Equation (\ref{F}) emphasizes the two components determining the best 
176: choice of variational distribution $Q(\phi)$:  better fit to the data and 
177: model bias. How well the data is fitted is quantified by the internal 
178: energy $U$ (\ref{U}). To achieve the best fit, or internal energy ground 
179: state, $Q(\phi)$ should be concentrated around the regions of the 
180: parameter space where $P(D|\phi)$ is maximum. The best choice in this 
181: respect will be the maximum likelihood estimate (MLE)
182: 
183: \begin{equation}\label{QMLE}
184: Q_{\rm MLE}(\phi) = \delta(\phi-\phi^*)
185: \end{equation}
186: 
187: \noindent where
188: 
189: \begin{equation}\label{phiMLE}
190: \phi^*=\arg\max_{\phi}P(D|\phi)\ .
191: \end{equation}
192: 
193: \noindent In the opposite extreme, when no data is presented to us, the 
194: best distribution is that maximizing the entropy $S$, i.e. minimizing the Kullback-Leibler divergence 
195: relative to the prior distribution. This maximum entropy (ME) solution is 
196: the prior distribution itself
197: 
198: \begin{equation}\label{QME}
199: Q_{\rm ME}(\phi) = P(\phi)\ .
200: \end{equation}
201: 
202: \noindent In general, the drive to better fit the data is opposed by the 
203: tendency to obtain the least biased model. The variational solution is 
204: therefore in the middle between the one extreme of biased models fitting 
205: the data very well and completely unbiased models giving a bad fit to the 
206: data. It is obtained after minimizing (\ref{F}) with respect to $Q(\phi)$ 
207: over a restricted class of functions. This variational solution $Q(\phi)$ 
208: represents the closest distribution to $P(\phi|D)$ within the class of 
209: functions considered.
210: 
211: \section{Statistical model with a population structure}\label{models}
212: 
213: In this section we present the generalities of statistical models with a 
214: first level population structure. Similar models have been studied in 
215: \cite{blei03,hofman07}. Our working hypothesis is that there is 
216: a hidden population structure, characterized by the subdivision of the 
217: population samples into groups. We assume that we are given a dataset $D$ 
218: which, in some way to be determined, reflects the population structure. 
219: The problem consists in inferring this hidden structure and the associated 
220: model parameters from the data. To tackle this problem we introduce a 
221: statistical model with a built in population structure as a generative 
222: model of the data. The population structure and the model parameters are 
223: then inferred solving the inverse problem. More precisely
224: 
225: \begin{itemize}
226: 
227: \item[(i)] We consider a population composed of $n$ elements divided in 
228: $K$ groups.
229: 
230: \item[(ii)] The samples assignment to groups is generated by a multinomial 
231: model with probabilities $\pi_k$, $k=1,\ldots,K$. Denoting by $g_i$ the 
232: group to which the $i$-th sample belongs, we obtain
233: 
234: \begin{equation}\label{Pgpi}
235: P(g|\pi) = \prod_{i=1}^n\pi_{g_i}\ .
236: \end{equation}
237: 
238: \item[(iii)] Given the group assignments $g_i$, and depending on the 
239: dataset, we write down the likelihood $P(D|g,\theta)$ to observe the data 
240: parametrized by the parameter set $\theta$.
241: 
242: \item[(iv)] Putting all this together we obtain the posterior distribution
243: 
244: \begin{equation}\label{GM1}
245: P(\phi|D) = \frac{1}{Z} P(D|g,\theta)P(g|\pi)P(\theta)P(\pi)P(K)\ ,
246: \end{equation}
247: 
248: \noindent where $\phi=(g,\theta,\pi,K)$ and $P(\theta)$, $P(\pi)$ and 
249: $P(K)$ are the prior distributions of $\theta$, $\pi$ and $K$.
250: 
251: \end{itemize}
252: 
253: \noindent The form of the prior distributions, except for $P(K)$, is the 
254: subject of the next section. The distribution $P(K)$ is irrelevant for 
255: problems with large datasets. The difference between the log-likelihood of 
256: models with different values of $K$ is in general of the order of the 
257: dataset size and, as a consequence, the contribution of $\ln P(K)$ is 
258: negligible. Thus, in the following sections we simply neglect the 
259: contribution given by $P(K)$. Finally, we specify the likelihood 
260: $P(D|g,\theta)$ when addressing specific problems.
261: 
262: In some cases we are going to assume that the variables in our dataset are 
263: also divided in groups. Here we consider a set of $m$ variables divided in 
264: $L$ groups. The variables assignment to groups is generated by a 
265: multinomial model with probabilities $\kappa_l$, $l=1,\ldots,L$. Denoting 
266: by $c_j$, $j=1,\ldots,m$, the variable group to which variable $j$ belongs 
267: we can then write
268: 
269: \begin{equation}\label{Pckappa}
270: P(c|\kappa) = \prod_{j=1}^m \kappa_{c_j}
271: \end{equation}
272: 
273: \noindent After adding this variable group structure, the posterior 
274: distribution (\ref{GM1}) is replaced by
275: 
276: \begin{eqnarray}\label{GM2}
277: P(\phi|D) &=& \frac{1}{Z} P(D|g,c,\theta)P(g|\pi)P(c|\kappa)
278: \nonumber\\
279: &\times& P(\pi)P(\kappa)P(\theta)P(K)\ ,
280: \end{eqnarray}
281: 
282: \noindent where $\phi=(g,c,\theta,\pi,\kappa,K)$ and $P(\kappa)$ is the 
283: prior distribution of $\kappa$.
284: 
285: \section{Prior distributions}\label{S:priors}
286: 
287: \begin{table*}
288: \begin{tabular}{|l|l|l|l|l|}
289: %\display
290: \hline
291: Model & Likelihood & Conjugate prior & Invariant prior & Renormalization limit\\
292: \hline
293: Binomial & $\binom{N}{n} p^n(1-p)^{N-n}$ 
294: & $ {\rm Beta}(p;\tilde{\alpha},\tilde{\beta}) =
295: \frac{ 1 }{ {\rm B}(\tilde{\alpha},\tilde{\beta}) }
296: p^{\tilde{\alpha}-1}(1-p)^{\tilde{\beta}-1} $
297: & ${\rm const.} p^{-1}(1-p)^{-1}$
298: & $\tilde{\alpha}\rightarrow0$, $\tilde{\beta}\rightarrow0$\\
299: Multinomial & $ \frac{ (\sum_{k=1}^Kn_k)! }{ \prod_{k=1}^K n_k! }
300: \prod_{k=1}^K\pi_k^{n_k}$ 
301: & $ {\rm D}(\pi;\tilde{\gamma}) =
302: \frac{1}{{\rm B}(\tilde{\gamma})} \prod_{k=1}^K\pi_k^{\tilde{\gamma}_k-1} $ 
303: & $ {\rm const.} \prod_{i=1}^K\pi_i^{-1}$
304: & $\tilde{\gamma}_k\rightarrow0$\\
305: Normal & $\prod_{i=1}^n 
306: \frac{ 1 }{ \sqrt{2\pi\sigma^2} } e^{ - \frac{ (X_i-\mu)^2 }{ 2\sigma^2 } }$
307: & $\frac{ 2\left(\frac{\tilde{\alpha}}{2}\tilde{\sigma}^2\right)^{ \frac{\tilde{\alpha}}{2} } }
308: { \Gamma\left(\frac{\tilde{\alpha}}{2}\right)\sigma^{\tilde{\alpha}+1} }
309: e^{ -\frac{ \tilde{\alpha}\tilde{\sigma}^2 }{ 2\sigma^2 } }
310: \sqrt{ \frac{ \tilde{\alpha} }{ 2\pi\sigma^2 } }
311: e^{ -\frac{ \tilde{\alpha}(\mu-\mu_0)^2 }{ 2\sigma^2 } }$
312: & $\frac{\rm const.}{\sigma^2}$
313: & $\tilde{\alpha}\rightarrow0$\\
314: \hline
315: \end{tabular}
316: 
317: \caption{{\bf Prior distributions:} Examples of model likelihoods and 
318: their associated conjugate priors and invariant priors. ${\rm 
319: Beta}(x;a,b)$ denotes the probability density function of the beta 
320: distribution, where ${\rm B}(a,b)$ is the beta function. ${\rm 
321: D}(x;\gamma)$ denotes the probability density function of the Dirichlet 
322: distribution, the generalized beta distribution, where ${\rm 
323: B}(\gamma)=\prod_k\Gamma(\gamma_k)/\Gamma(\sum_k\gamma_k)$ is the 
324: generalized beta function. The renormalization limit column indicates the 
325: limit in which the conjugate prior approaches the invariant prior.}
326: \label{priors}
327: \end{table*}
328: 
329: The choice of the prior distribution $P(\phi)$ is probably one of the least 
330: obvious topics in Bayesian analysis. Currently the predominant choice is 
331: the use of conjugate priors. The form of conjugate priors is indicated by 
332: the likelihood, making the prior selection less ambiguous. For example, the 
333: binomial likelihood $P(n|p)\propto p^n (1-p)^{N-n}$ suggests a beta 
334: distribution for $P(p|n)$. Furthermore, by choosing a beta distribution as 
335: a prior, $P(p) \propto p^{\tilde{\alpha}-1}(1-p)^{\tilde{\beta}-1}$, the 
336: posterior distribution remains a beta distribution, but with exponents 
337: $\alpha=\tilde{\alpha}+n$ and $\beta=\tilde{\beta}+N-n$. In this 
338: sense, the beta distribution is the conjugate prior of the binomial 
339: likelihood. A list of conjugate priors relevant for this work is provided 
340: in Table \ref{priors}.
341: 
342: Yet, the fact that the form of conjugate priors is suggested by the 
343: likelihood does not demonstrate that they are the correct choice of 
344: priors. Moreover, even if we accept their use, it is not clear what is the 
345: correct choice for the prior distribution parameters, e.g. 
346: $\tilde{\alpha}$ and $\tilde{\beta}$. Different methods have been 
347: proposed to determine these parameters. In general they are based on {\it 
348: a posteriori} analyses, e.g. calculations, making use of the data in some 
349: way or another. Such methods violate, however, the concept of prior 
350: distribution, defined as the distribution of the model parameters in the 
351: absence of the data.
352: 
353: An alternative approach is that by Jaynes \cite{jaynes68}. According to 
354: Jaynes, in the absence of any data, the priors should be solely determined 
355: based on the symmetries and constraints of the problem under 
356: consideration. In this work we make use of Jaynes's approach to determine 
357: the prior distribution. Below we derive Jaynes's priors for the cases 
358: relevant for this work.
359: 
360: \subsection{Prior for a model with location and scale parameters}\label{LP}
361: 
362: Consider a problem where the data consists of equally distributed random 
363: variables $X_i$, $i=1,\ldots,n$, taking real values. Furthermore let us 
364: assume that the likelihood has the form
365: 
366: \begin{equation}\label{l1}
367: P(X|\mu,\sigma) = \prod_i f\left( \frac{X_i-\mu}{\sigma} \right) \frac{1}{\sigma}\ ,
368: \end{equation}
369: 
370: \noindent where $f(x)$ is a probability density function in the real line 
371: and $\mu$ and $\sigma$ are a location and scale parameter respectively. 
372: Our task consists in determining the prior distribution of $\mu$ and 
373: $\sigma$. Now, suppose $X_i$ represent positions, which could be measured 
374: from different systems of reference and using different units. In this 
375: context the prior distribution should be the same regardless of our system 
376: of reference and units. More precisely, our system is invariant under the 
377: transformations
378: 
379: \begin{eqnarray}\label{t1}
380: x^\prime &=& a(x+b)\nonumber\\
381: \mu^\prime &=& a(\mu+b)\nonumber\\
382: \sigma^\prime &=& a\sigma
383: \end{eqnarray}
384: 
385: \noindent where $b$ represents a translation and $a$ a change of scale or 
386: units. The likelihood is invariant under these transformations and so must 
387: be the prior distribution. Therefore,
388: 
389: \begin{equation}\label{i1}
390: P(\mu^\prime,\sigma^\prime) d\mu^\prime d\sigma^\prime = P(\mu,\sigma) d\mu d\sigma
391: \end{equation}
392: 
393: \noindent The solution to this functional equation is
394: 
395: \begin{equation}\label{p1}
396: P(\mu,\sigma) = \frac{\rm const.}{\sigma^2}\ .
397: \end{equation}
398: 
399: \noindent This analysis was first reported by Jaynes \cite{jaynes68}. He 
400: obtained, however, $P(\mu,\sigma)\propto 1/\sigma$. This discrepancy is 
401: rooted in the fact that Jaynes did not take into account that the location 
402: parameter $\mu$ follows the same rules as $x$ upon the translation and 
403: scale transformations. He assumed $\mu^\prime=\mu+b$ \cite{jaynes68} while 
404: the correct transformation is $\mu^\prime=a(\mu+b)$ (\ref{t1}).
405: 
406: \subsection{Prior for the multinomial model}\label{PM}
407: 
408: Consider the multinomial model with $K$ states
409: 
410: \begin{equation}\label{l2}
411: P(n|\pi) =\frac{ \left( \sum_{k=1}^Kn_k \right)! }{ \prod_{k=1}^K n_k! }
412: \prod_{k=1}^K \pi_k^{n_k}\ ,
413: \end{equation}
414: 
415: \noindent where $n_k$ is the number of times state $k$ was observed and 
416: $\pi_k$ is the probability to observe state $k$ in one trial, 
417: $0\leq\pi_k\leq1$ and $\sum_{k=1}^K\pi_k=1$. Here we extend the approach 
418: followed by Jaynes for the binomial model \cite{jaynes68}.
419: 
420: The probabilities $\pi_k$ may be different depending on our belief, e.g. 
421: all states are equally probable. Different investigators may have 
422: different beliefs, resulting in different choices of $\pi_k$. The main 
423: assumption is that the prior distribution should be independent of what is 
424: our specific belief and, therefore, should be invariant under a belief 
425: transformation.
426: 
427: {\em Belief transformation:} Let us represent by $S_k$ the state $k$, and 
428: let $P(S_k|E)$ and $P(S_k|E^\prime)$ be the probabilities to observe state 
429: $S_k$ in one trial according to beliefs $E$ and $E^\prime$, respectively. 
430: From Bayes rule it follows that
431: 
432: \begin{equation}\label{t2}
433: P(S_k|E^\prime) = \frac{ P(E^\prime|S_k,E)P(S_k|E) }{ \sum_j 
434: P(E^\prime|S_j,E)P(S_j|E) }
435: \end{equation}
436: 
437: \noindent for $k=1,\ldots,K$. The latter equation can be rewritten as
438: 
439: \begin{equation}\label{t3}
440: \pi_k^\prime = \frac{a_k}{A} \pi_k
441: \end{equation}
442: 
443: \noindent for $k=1,\ldots,K-1$ and 
444: $\pi_K^\prime=1-\sum_{k<K}\pi_k^\prime$, where $\pi_k=P(S_k|E)$, 
445: $\pi_k^\prime=P(S_k|E^\prime)$,
446: 
447: \begin{equation}\label{ai}
448: a_k = \frac{ P(E^\prime|S_k,E) }{ P(E^\prime|S_K,E) }
449: \end{equation}
450: 
451: and
452: 
453: \begin{equation}\label{A}
454: A = 1 + \sum_{k<K} (a_k-1)\pi_k\ .
455: \end{equation}
456: 
457: \noindent Equation (\ref{t3}) provides the transformation rules of the 
458: probabilities $\pi_k$ from one system of belief to another.
459: 
460: The invariance under the above transformation leads to the functional equation 
461: 
462: \begin{equation}\label{i2}
463: P(\pi^\prime) d\pi^\prime = P(\pi) d\pi\ .
464: \end{equation}
465: 
466: \noindent To solve this equation we first need to compute the determinant 
467: of the transformation Jacobian. The Jacobian of the transformation 
468: (\ref{t3}) has the matrix elements
469: 
470: \begin{equation}\label{Jij}
471: J_{ij} = \frac{\partial\pi_i^\prime}{\partial\pi_j} 
472: = \frac{a_i\delta_{ij}}{A} - \frac{a_i(a_j-1)\pi_i}{A^2}\ ,
473: \end{equation}
474: 
475: \noindent $i,j=1,\ldots,K-1$. This matrix can be decomposed into the 
476: product $J=BC$, where $B_{ij}=a_i\delta_{ij}/A$ is a diagonal matrix and 
477: $C_{ij}=\delta_{ij}-(a_j-1)\pi_i/A$ has two eigenvalues, 
478: $\lambda_1=A^{-1}$ and a $(K-2)$-fold degenerate eigenvalue $\lambda_2=1$. 
479: Putting all together we obtain
480: 
481: \begin{equation}\label{dJ}
482: |J| = |B|\lambda_1\lambda_2^{K-2} = \frac{1}{A^K} \prod_{k=1}^K a_k\ .
483: \end{equation}
484: 
485: \noindent The solution of (\ref{i2}), with $d\pi^\prime = |J|d\pi$, is 
486: given by
487: 
488: \begin{equation}\label{p2}
489: P(\pi) = {\rm const.} \prod_{i=1}^K \pi_i^{-1}\ .
490: \end{equation}
491: 
492: \noindent Note that for $K=2$, $\pi_1=p$ and $\pi_2=1-p$, we recover the 
493: result by Jaynes for the binomial model
494: 
495: \begin{equation}\label{p3}
496: P(p)\propto p^{-1}(1-p)^{-1}\ .
497: \end{equation}
498: 
499: \subsection{Improper priors renormalization}
500: 
501: The prior distributions (\ref{p1}) and (\ref{p2}) are improper, i.e. their 
502: integral over the parameter space is not finite. At first this may sound 
503: an unsuitable property for a prior distribution. Nevertheless, the 
504: improper nature of these prior distributions is just indicating that the 
505: symmetries in our problem are not sufficient to fully determine them. Data 
506: is required to obtain a proper distribution. The best example for an 
507: intuitive understanding of these arguments is the prior distribution of 
508: the location parameter. In the absence of any data and under the 
509: assumption of translational invariance, it is clear that every value in 
510: the real line is an equally probable value for the location parameter, 
511: resulting in an improper prior.
512: 
513: From the operational point of view, the posterior distribution may be 
514: proper even when the prior is not. Indeed, while the 
515: integral $\int d\phi 
516: P(\phi)$ may be improper, $\int d\phi P(\phi|D) \propto \int d\phi 
517: P(D|\phi)P(\phi)$ may be proper. The posterior distribution can be 
518: improper when the inference problem has not been correctly formulated or 
519: there is not sufficient data to determine the model parameters.
520: 
521: To avoid dealing with improper distributions, we can renormalize improper 
522: priors to some limit of a proper distribution. Since conjugate priors 
523: facilitate analytical calculations they are a good starting point. This is 
524: illustrated in Table \ref{priors} for selected examples. These are the 
525: prior distributions used herein. In particular, for the multinomial 
526: probabilities $\pi$ and $\kappa$ we use the renormalized invariant priors
527: 
528: \begin{equation}\label{Ppi}
529: P(\pi) = \frac{1}{{\rm B}(\tilde{\gamma})} 
530: \prod_{k=1}^K \pi_k^{\tilde{\gamma}_k-1}
531: \end{equation}
532: 
533: \begin{equation}\label{Pkappa}
534: P(\kappa) = \frac{1}{{\rm B}(\tilde{\epsilon})} 
535: \prod_{l=1}^L \kappa_l^{\tilde{\epsilon}_l-1}
536: \end{equation}
537: 
538: \noindent with $\tilde{\gamma}_k\rightarrow0$ and 
539: $\tilde{\epsilon}_l\rightarrow0$.
540: 
541: \section{Mean-field approximation}\label{MF}
542: 
543: In this section we specify the form of the variational function $Q(\phi)$. 
544: To allow for an analytical solution we neglect correlations between the 
545: group assignments and the remaining model parameters. We denote by $p_{ik}$ 
546: the probability that sample $i$ belongs to sample group $k$ and by 
547: $q_{jl}$ the probability that variable $j$ belongs to variable group $l$. 
548: Furthermore, given that $\theta$, $\pi$ and $\kappa$ always appear in 
549: different factors in (\ref{GM1}) or (\ref{GM2}) then their joint 
550: distribution factorizes. Within the mean-field approximation for the group 
551: assignments and the latter factorization the variational function can be 
552: written as
553: 
554: \begin{equation}\label{MF1}
555: Q(\phi) = \prod_i p_{ig_i} R(\theta)R(\pi)
556: \end{equation}
557: 
558: when dealing with the generative model (\ref{GM1}) and
559: 
560: \begin{equation}\label{MF2}
561: Q(\phi) = \prod_i p_{ig_i} \prod_j q_{jc_j} R(\theta)R(\pi)R(\kappa)
562: \end{equation}
563: 
564: \noindent when dealing with the generative model (\ref{GM2}), where $R(x)$ 
565: denotes a generic probability density function of $x$.
566: 
567: Summarizing, in the case studies below, we are going to solve the 
568: generative models (\ref{GM1}) or (\ref{GM2}), making use of renormalized 
569: invariant priors (Table \ref{priors}) and the MF variational function 
570: (\ref{MF1}) or (\ref{MF2}), respectively. This approach is based on the 
571: assumptions that: the population is divided in groups, the group 
572: assignments are generated by a multinomial model, the priors are 
573: renormalized invariant distributions, and a MF approximation of the 
574: variational solution with respect to the group assignments.
575: 
576: \section{Case study: Clustering real value data}\label{real}
577: 
578: Quite often we deal with datasets consisting of a real value measurement 
579: $X_{ij}$ over $i=1,\ldots,n$ samples and $j=1,\ldots,m$ variables, where 
580: the samples and variables are not necessarily independent. For simplicity, 
581: the particular kind of dependency we focus on is the existence of sample 
582: and variable groups. Our problem is to infer the sample and variable 
583: groups and the statistical parameters characterizing them.
584: 
585: To address this problem we consider the generative model (\ref{GM2}) with 
586: a normal likelihood, representing a two-sides Gaussian mixture model. The 
587: two-sides Gaussian mixture model is a natural extension of the Gaussian 
588: mixture model \cite{maclachlan00,rasmussen00} to characterize datasets 
589: with a group structure for both the samples and variables. Our 
590: contributions in this context are the use of prior distributions derived 
591: from symmetry arguments alone and the inclusion of a group structure at 
592: the variables side. The dataset, likelihood and priors associated with our 
593: statistical model are defined as follows:
594: 
595: {\em Data:} Consider $i=1,\ldots,n$ samples, $j=1,\ldots,m$ variables, and 
596: the real value measurements $X_{ij}$.
597: 
598: {\em Likelihood:} We assume that $X_{ij}$ are random variables with a 
599: normal distribution, with group dependent mean $\mu_{g_ic_j}$ and group 
600: independent variance $\sigma^2$, resulting in the likelihood
601: 
602: \begin{equation}\label{Preal}
603: P(X|g,c,\mu,\sigma) = \prod_{ij} \frac{1}{ \sqrt{2\pi\sigma^2} }
604: e^{ - \frac{\left(X_{ij}-\mu_{g_ic_j}\right)^2}{2\sigma^2} }\ .
605: \end{equation}
606: 
607: \noindent Here we are assuming that the main difference between groups is 
608: given by the means while the variance is group independent. The latter is 
609: a good approximation when the source of noise is given by the measurement 
610: itself and it behaves the same independently of the sample and variable 
611: group.
612: 
613: {\it Priors:} For the prior $P(\mu,\sigma)$ we generalize the Normal 
614: distribution prior in Table \ref{priors}. Accounting for more than one 
615: location parameter we obtain
616: 
617: \begin{eqnarray}\label{PGm}
618: P(\mu,\sigma) &=& \frac{ 2\left(\frac{\tilde{\alpha}}{2}\tilde{\sigma}^2\right)
619: ^\frac{\tilde{\alpha}}{2} }{ \Gamma\left(\frac{\tilde{\alpha}}{2}\right)
620: \sigma^{\tilde{\alpha}+1} } e^{ -\tilde{\alpha}\frac{\tilde{\sigma}^2}{2\sigma^2} }\nonumber\\
621: &\times& \prod_{kl} \sqrt{ \frac{\tilde{\alpha}}{2\pi\sigma^2} }
622: e^{ -\frac{\tilde{\alpha}}{2\sigma^2} \left(\mu_{kl}-\tilde{\mu}_{kl}\right)^2 }
623: \end{eqnarray}
624: 
625: \noindent and we work in the limit $\tilde{\alpha}\rightarrow0$.
626: 
627: To apply the variational method we consider the MF approximation 
628: (\ref{MF2}). Substituting the likelihood (\ref{Preal}), the priors 
629: (\ref{Ppi}), (\ref{Pkappa}) and (\ref{PGm}) and the MF variational 
630: function (\ref{MF2}) into (\ref{F}), and integrating over $\phi$ (summing 
631: over $g_i$ and $c_j$ and integrating over $\mu_{kl}$, $\sigma$, $\pi_k$ 
632: and $\kappa_l$) we obtain
633: 
634: \begin{eqnarray}\label{Freal}
635: F &\leq& {\rm const.} + (nm+KL+\tilde{\alpha}+1)\langle\ln\sigma\rangle
636: \nonumber\\
637: &+& \frac{1}{2} 
638: \sum_{ijkl}
639: p_{ik}q_{jl}\left( \langle\frac{1}{\sigma^2}\rangle X_{ij}^2 -
640: 2X_{ij}\langle\frac{\mu_{kl}}{\sigma^2}\rangle
641: + \langle\frac{\mu_{kl}^2}{\sigma^2}\rangle \right)
642: \nonumber\\
643: &+& \frac{\tilde{\alpha}}{2} \left[
644: \langle\frac{1}{\sigma^2}\rangle \tilde{\sigma}^2 
645: + \sum_{kl} \left( \langle\frac{1}{\sigma^2}\rangle \tilde{\mu}_{kl}^2 
646: - 2\tilde{\mu}_{kl}\langle\frac{\mu_{kl}}{\sigma^2}\rangle
647: + \langle\frac{\mu_{kl}^2}{\sigma^2}\rangle \right) \right]
648: \nonumber\\
649: &-&\sum_k\left(\sum_ip_{ik}+\tilde{\gamma}_k-1\right)
650: \langle\ln\pi_k\rangle
651: \nonumber\\
652: &-&\sum_l\left(\sum_jq_{jl}+\tilde{\epsilon}_l-1\right)
653: \langle\ln\kappa_l\rangle
654: \nonumber\\
655: &+& \int d\mu d\sigma R(\mu,\sigma)\ln R(\mu,\sigma)
656: \nonumber\\
657: &+& \int d\pi R(\pi)\ln R(\pi)
658: + \int d\kappa R(\kappa)\ln R(\kappa)
659: \nonumber\\
660: &+&\sum_{ik} p_{ik}\ln p_{ik} + \sum_{jl} q_{jl}\ln q_{jl}
661: \end{eqnarray}
662: 
663: \noindent Minimizing (\ref{Freal}) with respect to $p_{ik}$, $q_{jl}$, 
664: $R(\mu,\sigma)$, $R(\pi)$ and $R(\kappa)$ we obtain (VB-1):
665: 
666: \begin{equation}\label{preal}
667: p_{ik} = \frac{ e^{\langle\ln\pi_k\rangle -\frac{1}{2\sigma_*^2}
668: \sum_{jl} q_{jl}\left(
669: \frac{\sigma_*^2}{\alpha_{kl}} + \left(X_{ij}-\langle \mu_{kl}\rangle\right)^2
670: \right) } }
671: {\sum_s e^{ \langle\ln\pi_s\rangle -\frac{1}{2\sigma_*^2}
672: \sum_{jl} q_{jl}\left(
673: \frac{\sigma_*^2}{\alpha_{sl}} + \left(X_{ij}-\langle \mu_{sl}\rangle\right)^2
674: \right) } }
675: \end{equation}
676: 
677: \begin{equation}\label{qreal}
678: q_{jl} = \frac{ e^{ \langle\ln\kappa_l\rangle -\frac{1}{2\sigma_*^2}
679: \sum_{ik} p_{ik}\left(
680: \frac{\sigma_*^2}{\alpha_{kl}} + \left(X_{ij}-\langle \mu_{kl}\rangle\right)^2
681: \right) } }
682: { \sum_s e^{ \langle\ln\kappa_s\rangle -\frac{1}{2\sigma_*^2}
683: \sum_{ik} p_{ik}\left(
684: \frac{\sigma_*^2}{\alpha_{ks}} + \left(X_{ij}-\langle \mu_{ks}\rangle\right)^2
685: \right) } }
686: \end{equation}
687: 
688: \begin{eqnarray}\label{Rmusigma}
689: R(\mu,\sigma) &=& \frac{ 2\left(\frac{\alpha}{2}\sigma_*^2\right)^{\frac{\alpha}{2}} }
690: { \Gamma\left(\frac{\alpha}{2}\right) \sigma^{\alpha+1} }
691: e^{ -\frac{\alpha\sigma_*^2}{2\sigma^2} }
692: \nonumber\\
693: &\times& \prod_{kl} \sqrt{ \frac{ \alpha_{kl} }{ 2\pi\sigma^2 } }
694: e^{ -\frac{\alpha_{kl}}{2\sigma^2} \left(\mu_{kl}-\langle\mu_{kl}\rangle\right)^2 }
695: \end{eqnarray}
696: 
697: \begin{equation}\label{alphakl}
698: \alpha_{kl} = \tilde{\alpha} + \sum_{ij}p_{ik}q_{jl}
699: \end{equation}
700: 
701: \begin{equation}\label{alpha1}
702: \alpha = \tilde{\alpha}+nm
703: \end{equation}
704: 
705: \begin{equation}\label{mu}
706: \langle\mu_{kl}\rangle = 
707: \frac{ \tilde{\alpha}\tilde{\mu}_{kl} + \sum_{ij} p_{ik}q_{jl}X_{ij} }
708: { \tilde{\alpha} + \sum_{ij} p_{ik}q_{jl} }
709: \end{equation}
710: 
711: \begin{eqnarray}\label{sigma}
712: \sigma_*^2 &=& \frac{1}{\tilde{\alpha}+nm} \left[
713: \tilde{\alpha} \left( \tilde{\sigma}^2
714: +\sum_{kl}\left(\tilde{\mu}_{kl}^2
715: -\langle\mu_{kl}\rangle^2\right) \right) \right.
716: \nonumber\\
717: &+& \left. \sum_{ijkl}p_{ik}q_{jl} \left( X_{ij}^2-\langle\mu_{kl}\rangle^2 \right) \right]
718: \end{eqnarray}
719: 
720: \begin{equation}\label{P_pi}
721: R(\pi)={\rm D}(\pi;\gamma)\ ,\ \ \ \ 
722: \gamma_k = \tilde{\gamma}_k+\sum_ip_{ik}
723: \end{equation}
724: 
725: \begin{equation}\label{P_kappa}
726: R(\kappa)={\rm D}(\kappa;\epsilon)\ ,\ \ \ \
727: \epsilon_l = \tilde{\epsilon}_l+\sum_jq_{jl}
728: \end{equation}
729: 
730: \begin{eqnarray}\label{F_real}
731: F^* &=& {\rm const.} + \sum_{ik} p_{ik}\ln p_{ik} + \sum_{jl} q_{jl}\ln q_{jl}
732: - \ln{\rm B}(\gamma)
733: \nonumber\\
734: &-& \ln{\rm B}(\epsilon) + \frac{1}{2}\sum_{kl}\ln\alpha_{kl}\ .
735: \end{eqnarray}
736: 
737: \noindent These are a set of self-consistent equations which can be solved 
738: recursively to determine the probabilistic group assignments and the 
739: $\mu$, $\sigma$, $\pi$ and $\kappa$ distributions. They are the same in 
740: spirit as those for the EM algorithm \cite{dempster77}. Following 
741: \cite{mackay03,beal03} we refer to them as {\em variational Bayes} (VB) 
742: algorithm.
743: 
744: The main difference between the EM and VB algorithms is that in the former 
745: case we would take the average of the log likelihood over the group 
746: assignments but not over the distributions of $\mu$, $\sigma$, $\pi$ and 
747: $\kappa$. By taking the average over $\mu$ and $\sigma$ we obtain the 
748: additional $1/\alpha_{kl}$ term within the parenthesis in equations 
749: (\ref{preal}) and (\ref{qreal}). According to (\ref{alphakl}) $\alpha_{kl}$ 
750: is equal to $\tilde{\alpha}$ plus the product of the average number of 
751: samples in sample group $k$ ($\sum_ip_{ik}$) and the average number of 
752: variables in variable group $l$ ($\sum_jq_{jl}$). Therefore, 
753: the $1/\alpha_{kl}$ term penalizes assignments to small size groups. And it 
754: balances the contribution of $(X_{ij}-\langle\mu_{kl}\rangle)^2$, which 
755: drives the estimates towards a better fit and consequently groups of 
756: minimal size.
757: 
758: \subsection{VB implementation, real value data}
759: 
760: The actual implementation of the VB-1 algorithm in the context of real 
761: value data proceeds as follows. Set sufficiently large values for $K$ and 
762: $L$, larger than our expectation for the actual values of $K$ and $L$. In 
763: the following test examples we use $K=L=20$. Set the parameters 
764: $\tilde{\alpha}$, $\tilde{\mu}_{kl}$, $\tilde{\sigma}$, $\tilde{\gamma}_k$ 
765: and $\tilde{\epsilon}_l$. We set 
766: $\tilde{\alpha}=\tilde{\gamma}_k=\tilde{\epsilon}_l=10^{-6}$, 
767: $\tilde{\mu}_{kl}=0$ and $\tilde{\sigma}=1$. The choice of 
768: $\tilde{\mu}_{kl}$ and $\tilde{\sigma}$ is practically irrelevant provided 
769: we have chosen a sufficiently small $\tilde{\alpha}$. Set random initial 
770: conditions for $p_{ik}$ and $q_{jl}$. Starting from these random initial 
771: conditions iterate equations (\ref{preal})-(\ref{F_real}) until the 
772: solution converges up to some predefined accuracy. We use relative error 
773: of $F^*$ smaller than $10^{-6}$. In practice, compute 
774: $\langle\mu_{kl}\rangle$, $\alpha_{kl}$, $\sigma_*$, $\gamma_k$, 
775: $\langle\ln\pi_k\rangle$, $\epsilon_l$, $\langle\ln\kappa_l\rangle$, 
776: $p_{ik}$, $q_{jl}$ and $F^*$ in that order. To explore different potential 
777: local minima use different initial conditions and select the solution with 
778: lowest $F^*$. Since this algorithm penalizes groups with few members it 
779: turns out that, for sufficiently large $K$ and $L$, some sample and 
780: condition groups result empty. If this is not the case $K$ and/or $L$ 
781: should be increased until at least one sample group and one variable group 
782: results empty.
783: 
784: \begin{figure}[t]
785: 
786: \centerline{\includegraphics[width=3.2in]{bc.fig.gaussian.eps}}
787: 
788: \caption{{\bf Clustering real value data:} Mutual information 
789: $I=I(p^O,p^*)$ between the original $p^O$ and estimated $p^*$ groups 
790: assignments, relative to its maximum value $I_0$ when $p^*=p^O$. The 
791: original data was made of $n=100$ samples divided in $K$ groups and 
792: $m=100$ conditions divided in $L$ groups. The values of $X_{ij}$ were 
793: extracted from a normal distribution with mean $\mu_{kl}=k+l$ and variance 
794: $\sigma$. The figure shows the mutual information between the original 
795: groups and the group assignment, estimated by the VB-1 algorithm, as a 
796: function of the variance $\sigma$. The dashed-dotted, solid and dashed 
797: lines correspond to the worst, average and best case on 100 test 
798: examples, respectively. In a)  $K=L=2$ and in b) $K=L=4$. In both 
799: cases the mutual information is approximately equal to its maximum $I_0$ 
800: for values of $\sigma$ less than one, the minimum difference between the 
801: original means $\mu_{kl}$.}
802: 
803: \label{fig_real}
804: \end{figure}
805: 
806: \subsection{Test examples}
807: 
808: To test the performance of the VB-1 algorithm, 
809: (\ref{preal})-(\ref{F_real}), we consider test examples generated by 
810: the likelihood (\ref{Preal}) itself. Our aim is to test the variational 
811: result in the context of a relatively small number of samples and 
812: conditions. To quantify the goodness of the group assignment we consider 
813: the mutual information between the original $p^O$ 
814: ($p^O_{ik}=\delta_{g_ik}$) and estimated $p^*$ sample group assignments,
815: 
816: \begin{equation}\label{Ip0p}
817: I(p^O,p^*) = \sum_{kk^\prime} \rho_{kk^\prime} \ln 
818: \frac{ \rho_{kk^\prime} }{ \rho^O_k\rho^*_{k^\prime} }
819: \end{equation}
820: 
821: \noindent where
822: 
823: \begin{equation}\label{rhopp}
824: \rho_{kk^\prime} = \frac{1}{n}\sum_i p^O_{ik} p^*_{ik^\prime}
825: \end{equation}
826: 
827: \begin{equation}\label{rhop0}
828: \rho^O_{k} = \frac{1}{n}\sum_i p^O_{ik}
829: \end{equation}
830: 
831: \begin{equation}\label{rhop}
832: \rho^*_k = \frac{1}{n}\sum_i p^*_{ik}\ .
833: \end{equation}
834: 
835: \noindent Note that $I(p^O,p^*)$ takes its maximum value when $p^*=p^O$, 
836: denoted by $I_0=I(p^O,p^O)$. Of course, the same could be done for the 
837: condition group assignments as well.
838: 
839: In our test examples the original data was made of $n=100$ samples divided 
840: in $K$ groups and $m=100$ conditions divided in $L$ groups. The values 
841: of $X_{ij}$ were extracted from a normal distribution with mean 
842: $\mu_{kl}=k+l$ and variance $\sigma$. We estimate the group assignment 
843: using the VB-1 algorithm, sampling one initial condition. Figure 
844: \ref{fig_real} shows the mutual information between the original and 
845: estimated groups as a function of the variance $\sigma$. In a) $K=L=2$ 
846: and in b) $K=L=4$. In both cases the mutual information is 
847: approximately equal to its maximum $I_0$ for values of $\sigma$ less than 
848: 1. Since 1 is the minimum difference between the original means 
849: $\mu_{kl}$, we conclude that the VB-1 algorithm performs well when there 
850: is a significant difference between the distributions associated with 
851: different groups. For larger values of $\sigma$ the VB-1 algorithm 
852: performance starts to decrease. This is not, however, a deficiency of the 
853: algorithm but an unavoidable consequence of the mixing between the 
854: distributions coming from different groups. It is worth noticing that we 
855: obtain similar results for the case $K=4$ and $L=1$, indicating that 
856: the method works when there is no group structure on one side, in this 
857: case the conditions.
858: 
859: \section{Case study: clustering data represented by hypergraphs and 
860: bipartite graphs}
861: 
862: There are several datasets consisting of a certain number of properties and 
863: the information of whether or not each sample exhibits each of the 
864: properties. For example, the dataset in Fig. \ref{fig_hg_bg} describes a 
865: population of three animals characterized by two attributes, hair and 
866: legs. The attribute hair can take the value YES (has hair) or NO (does not 
867: have hair) while the attribute legs takes the values 2 or 4 (at least 
868: within this dataset). The mathematical treatment of this problem is 
869: significantly simplified if the variables are mapped onto Boolean 
870: variables. To each $S$ states variable we associate $S$ Boolean variables, 
871: each representing the occurrence or not of a specific letter of the 
872: alphabet. For example, the attribute hair is associated with hair-YES and 
873: hair-NO and the attribute legs with legs-2 and legs-4 (Fig. 
874: \ref{fig_hg_bg}b). The outcome of this mapping is represented by the 
875: Boolean matrix $a_{ij}$, taking the value 1 if the answer to the Boolean 
876: variable $j$ is YES on sample $i$ and 0 otherwise.
877: 
878: \begin{figure}[t]
879: 
880: \centerline{\includegraphics[width=3.2in]{fig_hg_bg.eps}}
881: 
882: \caption{{\bf Hypergraph and bipartite graph data representations:} a) An 
883: example of a problem with categorical data. b) Mapping of the categorical 
884: variables onto augmented Boolean variables. c) Hypergraph representation 
885: of the categorical dataset in a). d) Bipartite graph representation of the 
886: categorical dataset in a). e) A graph example. f) Nearest-neighbor mapping 
887: of the graph in e) onto a hypergraph, where each hyper-edge represents a 
888: set of nearest neighbors of a vertex in the original graph, indicated by 
889: (1), (2), (3) and (4). g) Nearest-neighbor mapping of the graph in e) onto 
890: a bipartite graph. The original graph vertices are represented by 1, 2, 3 
891: and 4. The augmented bipartite graph vertices, representing 
892: nearest-neighbor sets, are represented by (1), (2), (3) and (4).}
893: 
894: \label{fig_hg_bg}
895: \end{figure}
896: 
897: Depending on our aim, the Boolean matrix can be represented either by a 
898: hypergraph or a bipartite graph. When we aim to cluster the samples 
899: without attempting to cluster the Boolean variables, $a_{ij}$ is better 
900: interpreted as the adjacency matrix of a hypergraph. A hypergraph is an 
901: intuitive extension of the concept of graph to allow for connections 
902: between more than two elements. In our case, the hypergraph vertices 
903: represent samples and hyper-edges, one associated with each Boolean 
904: variable, represent the set of all samples with the answer YES to the 
905: corresponding Boolean variable (Fig. \ref{fig_hg_bg}c). On the other hand, 
906: when we aim to cluster both the samples and Boolean variables then a 
907: bipartite graph interpretation is more appropriate, with one class of 
908: vertices for the samples and another one for the Boolean variables, and an 
909: edge connecting sample $i$ and variable $j$ whenever $a_{ij}=1$ 
910: (Fig. \ref{fig_hg_bg}d). The differences between these two approaches will 
911: become clear below.
912: 
913: \subsection{One side clustering: Statistical model on hypergraphs}
914: 
915: In this case the samples are assumed to be divided in groups while the 
916: hypergraph edges are modeled as independent. Here we follow the 
917: statistical model introduced in \cite{vazquez08}:
918: 
919: {\em Data:} Consider a hypergraph with a vertex set representing $n$ 
920: samples and $m$ edges characterizing the relationships among them. The 
921: hypergraph is specified by its adjacency matrix $a$, where $a_{ij}=1$ if 
922: element $i$ belongs to edge $j$ and it is 0 otherwise.
923: 
924: {\em Likelihood:} The adjacency matrix elements are generated by a 
925: binomial model with sample group and variable dependent probabilities 
926: $\theta_{kj}$, $k=1,\ldots,K$ and $j=1,\dots,m$, resulting in
927: 
928: \begin{equation}\label{Phg}
929: P(a|g,\theta) = \prod_{ij} \theta_{g_ij}^{a_{ij}}
930: \left(1- \theta_{g_ij}\right)^{1-a_{ij}}\ .
931: \end{equation}
932: 
933: {\em Priors:} As priors we use the renormalized invariant prior of the 
934: binomial model (Table \ref{priors}). Taking into account that we have a 
935: binomial model for each pair of sample group and edge, we obtain
936: 
937: \begin{equation}\label{P_hg}
938: P(\theta) = \prod_{kj}{\rm Beta}(\theta_{kj};\tilde{\alpha}_{kj},\tilde{\beta}_{kj})
939: \end{equation}
940: 
941: \noindent with $\tilde{\alpha}_{kj}\rightarrow0$ and 
942: $\tilde{\beta}_{kj}\rightarrow0$.
943: 
944: Substituting the likelihood (\ref{Phg}), the priors (\ref{Ppi}) and 
945: (\ref{P_hg}), and the MF variational function (\ref{MF1}) into (\ref{F}), 
946: and integrating over $\phi$ (summing over $g_i$ and integrating over 
947: $\theta_{kj}$ and $\pi_k$) we obtain
948: 
949: \begin{eqnarray}\label{F_hg}
950: F &\leq& - \sum_{jk} \left(\sum_ip_{ik}a_{ij}+\tilde{\alpha}_{kj}-1\right) 
951: \langle\ln\theta_{kj}\rangle
952: \nonumber\\
953: &-& \sum_{jk}\left(\sum_ip_{ik}(1-a_{ij})+\tilde{\beta}_{kj}-1\right)
954: \langle\ln(1-\theta_{kj})\rangle
955: \nonumber\\
956: &+& \sum_{ik} p_{ik}\ln p_{ik}
957: + \int d\theta R(\theta)\ln R(\theta)
958: \nonumber\\
959: &+& \int d\pi R(\pi)\ln R(\pi)
960: +{\rm const.}
961: \end{eqnarray}
962: 
963: \noindent Minimizing (\ref{F_hg}) with respect to $p_{ik}$,
964: $R(\theta)$ and $R(\pi)$ we obtain (VB-2)
965: 
966: \begin{equation}\label{p_hg}
967: p_{ik} = \frac{ e^{ \langle\ln\pi_k\rangle +
968: \sum_j \left[
969: a_{ij}\langle\ln\theta_{kj}\rangle
970: +(1-a_{ij})\langle\ln(1-\theta_{kj})\rangle
971: \right] } }
972: { \sum_s e^{  \langle\ln\pi_s\rangle +
973: \sum_j \left[
974: a_{ij}\langle\ln\theta_{sj}\rangle
975: +(1-a_{ij})\langle\ln(1-\theta_{sj})\rangle
976: \right] } }
977: \end{equation}
978: 
979: \begin{equation}\label{Qtheta_hg}
980: R(\theta) = \prod_{kj} 
981: {\rm B}(\theta_{kj};\alpha_{kj},\beta_{kj})\ ,
982: \end{equation}
983: 
984: \begin{equation}\label{alpha_hg}
985: \alpha_{kj} = \tilde{\alpha}_{kj}+\sum_{i}p_{ik}a_{ij}
986: \end{equation}
987: 
988: \begin{equation}\label{beta_hg}
989: \beta_{kj} = \tilde{\beta}_{kj}+\sum_{i}p_{ik}(1-a_{ij})\ .
990: \end{equation}
991: 
992: \begin{equation}\label{Qpi_hg}
993: R(\pi)={\rm D}(\pi;\gamma)\ ,\ \ \ \ 
994: \gamma_k = \tilde{\gamma}_k + \sum_ip_{ik}
995: \end{equation}
996: 
997: \begin{eqnarray}\label{F_hg_min}
998: F^*&=& {\rm const.} +
999: \sum_{ik} p_{ik}\ln p_{ik}
1000: \nonumber\\
1001: &-& \sum_{kj}\ln {\rm B}(\alpha_{kj},\beta_{kj})
1002: -\ln{\rm B}(\gamma)
1003: \end{eqnarray}
1004: 
1005: \noindent These equations represent the VB algorithm for the statistical 
1006: model on hypergraphs. In this case we have not been able to disentangle 
1007: the contributions weighting the fit to the data and the model bias, both 
1008: being included in the averages $\langle\ln(\theta_{kj})\rangle$ and 
1009: $\langle\ln(1-\theta_{kj})\rangle$.
1010: 
1011: \subsection{VB algorithm implementation, statistical model on hypergraphs}
1012: 
1013: The implementation of the VB algorithm for the statistical model on 
1014: hypergraphs proceeds as follows. Set a sufficiently large value for $K$, 
1015: larger than our expectation for the actual value of $K$. We use $K=20$ in 
1016: the following test examples. Set the parameters $\tilde{\alpha}_{kj}$, 
1017: $\tilde{\beta}_{kj}$ and $\tilde{\gamma}_k$. We set the parameters 
1018: $\tilde{\alpha}_{kj}=\tilde{\beta}_{kj}=\tilde{\gamma}_k=10^{-6}$. Set 
1019: random initial conditions for $p_{ik}$. Starting from these initial 
1020: conditions iterate equations (\ref{p_hg})-(\ref{F_hg_min}) until the 
1021: solution converges up to some predefined accuracy. We use relative error 
1022: of $F^*$ smaller than $10^{-6}$. In practice, compute $\alpha_{kj}$, 
1023: $\beta_{kj}$, $\langle\ln\theta_{kj}\rangle$, 
1024: $\langle\ln(1-\theta_{kj})\rangle$, $\gamma_k$, $\langle\ln\pi_k\rangle$, 
1025: $p_{ik}$ and $F^*$ in that order.  To explore different potential local 
1026: minima use different initial conditions and select the solution with 
1027: lowest $F^*$. Since this algorithm penalizes groups with few members it 
1028: turns out that, for sufficiently large $K$, some sample groups result 
1029: empty. If this is not the case then increase $K$ until at 
1030: least one group is empty. A matlab code implementing this algorithm can be 
1031: found at http://www.sns.ias.edu/\textasciitilde{}vazquez/hgc.html.
1032: 
1033: \begin{figure*}[t]
1034: 
1035: \centerline{\includegraphics[width=5.8in]{fig_zoo.eps}}
1036: 
1037: \caption{{\bf Stratification of an animal population:} a) A list of 
1038: animals is given together with certain attributes characterizing them. The 
1039: complete dataset is available from \cite{asuncion07}. Except for the 
1040: attribute - legs - one and zero indicate possession or not, respectively, 
1041: of the corresponding attribute. The problem consists of determining the 
1042: optimal stratification of the animal population based on the provided 
1043: attributes. b) Hypergraph representing the zoo data. Each line corresponds 
1044: with an edge, whose elements are specified within the right column. c) 
1045: Stratification as obtained in \cite{vazquez08}. d) Stratification by the 
1046: VB-2 algorithm. e) and f) Stratification of the animal population e) and 
1047: Boolean variables f) by the VB-3 algorithm.}
1048: 
1049: \label{fig_zoo}
1050: \end{figure*}
1051: 
1052: \subsubsection{Test example: zoo problem}
1053: 
1054: Consider the animal population in Fig. \ref{fig_zoo}a together with their 
1055: attributes: habitat, nutrition behavior, etc. Figure \ref{fig_zoo}b shows 
1056: the mapping of this dataset onto a hypergraph. The hypergraph vertices 
1057: represent animals and the edges represent the association between all 
1058: animals with a given attribute: edge 1, all non-airborne animals; edge 2, 
1059: all airborne animals, and so on.
1060: 
1061: The animal population stratification was already addressed in 
1062: \cite{vazquez08}, finding the solution in Fig. \ref{fig_zoo}c. Although 
1063: the starting statistical model is the same, the solution in 
1064: \cite{vazquez08} was found assuming fixed the number of groups and 
1065: estimating the group assignment using the EM algorithm (essentially a 
1066: maximum likelihood estimate). Then, in an attempt to focus on the 
1067: solution with better consensus, solutions for different number of groups 
1068: were obtained and the most representative solution was selected.
1069: 
1070: Here we address the same problem using a Bayesian approach and the 
1071: variational solution. We start from the same statistical model on 
1072: hypergraphs but now obtain a solution using the VB-2 algorithm 
1073: (\ref{p_hg})-(\ref{F_hg_min}), sampling 10,000 initial conditions as in 
1074: \cite{vazquez08}. The solution found by the VB-2 algorithm (Fig. 
1075: \ref{fig_zoo}d) is quite similar to that previously found in 
1076: \cite{vazquez08} (Fig. \ref{fig_zoo}c). The main differences are the 
1077: splitting of the terrestrial mammals, the exclusion of the platypus and 
1078: the tortoise from the amphibia-reptiles group and the scorpion from the 
1079: terrestrial arthropods. More important, in both cases the main groups 
1080: represent terrestrial mammals, aquatic mammals, birds, fishes, 
1081: amphibia-reptiles, terrestrial arthropods and aquatic arthropods. The VB-2 
1082: algorithm (\ref{p_hg})-(\ref{F_hg_min}) represents, however, a significant 
1083: improvement over the approach followed in \cite{vazquez08}. It finds the 
1084: consensus solution in one run, because it has built in the balance between 
1085: better fitting and less bias.
1086: 
1087: \begin{figure}
1088: 
1089: \centerline{\includegraphics[width=3.2in]{bc.fig.hypergraph.eps}}
1090: 
1091: \caption{{\bf Finding graph modules, hypergraph model:} Mutual information 
1092: $I=I(p^O,p^*)$ between the original $p^O$ and estimated $p^*$ groups 
1093: assignments, relative to its maximum value $I_0$ when $p^*=p^O$. The 
1094: original data was made of a graph with $n=100$ vertices divided in $K=2$ 
1095: groups, with intra- and inter-community connection probabilities $p_1$ 
1096: and $p_2$, respectively. The figure shows the mutual information, between 
1097: the original groups and the group assignment estimated by the VB-2 
1098: algorithm (\ref{p_hg})-(\ref{F_hg_min}), as a function of the 
1099: inter-community connectivity $p_2$. The dashed-dotted, solid and dashed 
1100: lines correspond to the worst, average and best case on 100 test 
1101: examples. In a) we deal with dense communities ($p_1=0.9$)  and the 
1102: algorithm performs well ($I/I_0\approx1$) for small values of the 
1103: inter-community connectivity probability $p_2$.  In b) we deal with sparse 
1104: communities ($p_1=0.1$) and the algorithm performs well for large values 
1105: of the inter-community connectivity probability $p_2$.}
1106: 
1107: \label{fig_hg}
1108: \end{figure}
1109: 
1110: \subsubsection{Test example: finding network modules}
1111: 
1112: The work by Newman and Leicht \cite{newman07} provides a hint on how to 
1113: apply the hypergraph clustering to the problem of finding modules or 
1114: communities in a graph or network. A graph is made by a set of vertices 
1115: and a set of edges, the latter being pairs of connected vertices. The idea 
1116: of Newman and Leicht is a ``guilt by association'' principle: vertices 
1117: within the same module of a graph will tend to have connections to the 
1118: same other vertices. This problem can be translated to a hypergraph 
1119: problem, where the vertices are the graph's vertices, the hyper-edges are 
1120: the sets of nearest neighbors and the Boolean variables characterize 
1121: whether or not a vertex belongs to a set of nearest neighbors 
1122: \cite{vazquez08} (Fig. \ref{fig_hg_bg}e and f). More precisely, to each 
1123: vertex we associate a hyper-edge, given by the set of its nearest 
1124: neighbors.  Therefore, there are $m=n$ hyper-edges, one for every vertex 
1125: in the original graph. The hypergraph adjacency matrix has the matrix 
1126: element $a_{ij}=1$ if vertex $i$ belongs to hyper-edge $j$, i.e. if vertex 
1127: $i$ belongs to the nearest-neighbor set of vertex $j$, and $a_{ij}=0$ 
1128: otherwise. If we label the nearest-neighbor sets with the same label as 
1129: the vertices then the hypergraph adjacency matrix coincides with the 
1130: adjacency matrix of the original graph. Thus, there is an exact mapping 
1131: from the statistical model proposed by Newman and Leicht \cite{newman07} 
1132: to the statistical model on hypergraphs.
1133: 
1134: Having specified this mapping we use the VB-2 algorithm 
1135: (\ref{p_hg})-(\ref{F_hg_min}), sampling one initial condition, to find the 
1136: graph modules in the original graph. To illustrate its performance we 
1137: consider as a case study a graph composed by two communities, with 
1138: probabilities $p_1$ and $p_2$ that two vertices within the same or 
1139: different communities are connected, respectively. As already anticipated 
1140: by Newman and Leicht \cite{newman07}, the nearest-neighbor approach can 
1141: resolve both dense communities with lesser inter-community connections 
1142: ($p_1\gg p_2$) and sparse communities with more inter-community connections 
1143: ($p_1\ll p_2$).  Figure \ref{fig_hg} shows that the VB-2 algorithm 
1144: performs quite well in those two regimes.
1145: 
1146: \subsection{Two sides clustering: statistical model on bipartite graphs}
1147: 
1148: We can face situations where there are groups of Boolean variables as well, requiring the 
1149: clustering of both samples and Boolean variables. In this case the bipartite graph 
1150: representation is more appropriate, with a class of vertices representing the samples and a 
1151: class of vertices representing the Boolean variables. More precisely,
1152: 
1153: {\em Data:} Consider a bipartite graph with two vertex subsets, representing $n$ samples and 
1154: $m$ Boolean variables. The graph is specified by its adjacency matrix $a$, where $a_{ij}=1$ 
1155: when sample $i$ is connected to Boolean variable $j$, i.e. if Boolean variable $j$ is true for 
1156: sample $i$, and $a_{ij}=0$ otherwise.
1157: 
1158: {\em Likelihood:} The adjacency matrix elements are generated by a binomial model with sample 
1159: group and variable group dependent probabilities $\theta_{kl}$, $k=1,\ldots,K$ and 
1160: $l=1,\dots,L$, resulting in
1161: 
1162: \begin{equation}\label{Pbg}
1163: P(a|g,c,\theta) = \prod_{ij} \theta_{g_ic_j}^{a_{ij}}
1164: \left(1- \theta_{g_ic_j}\right)^{1-a_{ij}}\ .
1165: \end{equation}
1166: 
1167: {\em Priors:} For $P(\theta)$ we use the renormalized invariant prior of the 
1168: binomial model. Taking into account that we have one binomial model per each pair 
1169: of sample and variable group we obtain
1170: 
1171: \begin{equation}\label{P_bg}
1172: P(\theta) =
1173: \prod_{kl}{\rm B}(\theta_{kl};\tilde{\alpha}_{kl},\tilde{\beta}_{kl})
1174: \end{equation}
1175: 
1176: \noindent with $\tilde{\alpha}_{kl}\rightarrow0$ and $\tilde{\beta}_{kl}\rightarrow0$.
1177: 
1178: The likelihood (\ref{Pbg}) is quite similar to (\ref{Phg}), the main difference 
1179: being that now the statistical properties of the Boolean variables appear through their 
1180: corresponding group assignments $c_j$. This increases the model complexity by considering a 
1181: group structure for the Boolean variables and, at the same time, reduces the number of 
1182: $\theta$ parameters. Furthermore, (\ref{Pbg}) contains (\ref{Phg}) as the particular case 
1183: where $L=m$ and one group is associated with each Boolean variable.
1184: 
1185: Substituting the likelihood (\ref{Pbg}), the priors (\ref{P_bg}), (\ref{Ppi}) and 
1186: (\ref{Pkappa}), and the MF variational function (\ref{MF2}) in (\ref{F}), and integrating over 
1187: $\phi$ (summing over $g_i$ and $c_j$ and integrating over $\theta_{kl}$, $\pi_k$ and 
1188: $\kappa_l$) we obtain
1189: 
1190: \begin{eqnarray}\label{F_bg}
1191: F &\leq& - \sum_{kl} 
1192: \left(\sum_{ij}p_{ik}q_{jl}a_{ij}+\tilde{\alpha}_{kl}-1\right) 
1193: \langle\ln\theta_{kl}\rangle
1194: \nonumber\\
1195: &-& 
1196: \sum_{kl}\left(\sum_{ij}p_{ik}q_{jl}(1-a_{ij})+\tilde{\beta}_{kl}-1\right)
1197: \langle\ln(1-\theta_{kl})\rangle
1198: \nonumber\\
1199: &+& \sum_{ik} p_{ik}\ln p_{ik} +
1200: \sum_{jl} q_{jl}\ln q_{jl}
1201: \nonumber\\
1202: &+& \int d\theta R(\theta)\ln R(\theta)
1203: + \int d\pi R(\pi)\ln R(\pi)
1204: \nonumber\\
1205: &+& \int d\kappa R(\kappa)\ln R(\kappa)
1206: +{\rm const.}
1207: \end{eqnarray}
1208: 
1209: \noindent Minimizing (\ref{F_bg}) with respect to $p_{ik}$, $q_{jl}$,
1210: $R(\theta)$, $R(\pi)$ and $R(\kappa)$ we obtain (VB-3)
1211: 
1212: \begin{equation}\label{p_bg}
1213: p_{ik} = \frac{ e^{ \langle\ln\pi_k\rangle +
1214: \sum_{jl} q_{jl} \left[
1215: a_{ij}\langle\ln\theta_{kl}\rangle
1216: +(1-a_{ij})\langle\ln(1-\theta_{kl})\rangle
1217: \right] } }
1218: { \sum_s e^{ \langle\ln\pi_s\rangle +
1219: \sum_{jl} q_{jl} \left[
1220: a_{ij}\langle\ln\theta_{sl}\rangle
1221: +(1-a_{ij})\langle\ln(1-\theta_{sl})\rangle
1222: \right] } }
1223: \end{equation}
1224: 
1225: \begin{equation}\label{q_bg}
1226: q_{jl} = \frac{ e^{ \langle\ln\kappa_l\rangle +
1227: \sum_{ik} p_{ik} \left[
1228: a_{ij}\langle\ln\theta_{kl}\rangle
1229: +(1-a_{ij})\langle\ln(1-\theta_{kl})\rangle
1230: \right] } }
1231: { \sum_s e^{ \langle\ln\kappa_s\rangle +
1232: \sum_{ik} p_{ik} \left[
1233: a_{ij}\langle\ln\theta_{ks}\rangle
1234: +(1-a_{ij})\langle\ln(1-\theta_{ks})\rangle
1235: \right] } }
1236: \end{equation}
1237: 
1238: \begin{equation}\label{Qtheta_bg}
1239: R(\theta) = \prod_{kl} 
1240: {\rm B}(\theta_{kl};\alpha_{kl},\beta_{kl})\ ,
1241: \end{equation}
1242: 
1243: \begin{equation}\label{alpha_bg}
1244: \alpha_{kl} = \tilde{\alpha}_{kl} + \sum_{ij}p_{ik}q_{jl}a_{ij}
1245: \end{equation}
1246: 
1247: \begin{equation}\label{beta_bg}
1248: \beta_{kl} = \tilde{\beta}_{kl} + \sum_{ij}p_{ik}q_{jl}(1-a_{ij})\ .
1249: \end{equation}
1250: 
1251: \begin{equation}\label{P_pi_bg}
1252: R(\pi)={\rm D}(\pi;\gamma)\ ,\ \ \ \ 
1253: \gamma_k = \tilde{\gamma}_k + \sum_ip_{ik}
1254: \end{equation}
1255: 
1256: \begin{equation}\label{P_kappa_bg}
1257: R(\kappa)={\rm D}(\kappa;\epsilon)\ ,\ \ \ \ 
1258: \epsilon_l = \tilde{\epsilon}_l + \sum_jq_{jl}
1259: \end{equation}
1260: 
1261: \begin{eqnarray}\label{F_bg_min}
1262: F^* &=& {\rm const.} +
1263: \sum_{ik} p_{ik}\ln p_{ik} + \sum_{jl} q_{jl}\ln q_{jl}
1264: \nonumber\\
1265: &-& \sum_{kl}\ln {\rm B}(\alpha_{kl},\beta_{kl})
1266: - \ln{\rm B}(\gamma) -\ln{\rm B}(\epsilon)
1267: \end{eqnarray}
1268: 
1269: \noindent Equations (\ref{p_bg})-(\ref{F_bg_min}) represent the VB 
1270: algorithm for the statistical model on bipartite graphs. They can be used 
1271: to find modules or communities in graphs with a bipartite structure, 
1272: including those representing samples and Boolean variables.
1273: 
1274: \subsection{VB algorithm implementation, statistical model on bipartite 
1275: graphs}
1276: 
1277: The implementation of the VB-3 algorithm (\ref{p_bg})-(\ref{F_bg_min}) for the statistical 
1278: model on bipartite graphs proceeds as follows. Set sufficiently large values for $K$ and $L$, 
1279: larger than our expectation for the actual values of $K$ and $L$. Set the parameters 
1280: $\tilde{\alpha}_{kl}$, $\tilde{\beta}_{kl}$, $\tilde{\gamma}_k$ and $\tilde{\epsilon}_l$. We 
1281: set the parameters $\tilde{\alpha}_{kl} = \tilde{\beta}_{kl} = \tilde{\gamma}_k = 
1282: \tilde{\epsilon}_l = 10^{-6}$. Set random initial conditions for $p_{ik}$ and $q_{jl}$. 
1283: Starting from these initial conditions iterate equations (\ref{p_bg})-(\ref{F_bg_min}) until 
1284: the solution converges up to some predefined accuracy. We use relative error of $F^*$ smaller 
1285: than $10^{-6}$. In practice, compute $\alpha_{kl}$, $\beta_{kl}$, 
1286: $\langle\ln\theta_{kl}\rangle$, $\langle\ln(1-\theta_{kl})\rangle$, $\gamma_k$, 
1287: $\langle\ln\pi_k\rangle$, $\epsilon_l$, $\langle\ln\kappa_l\rangle$, $p_{ik}$, $q_{jl}$ and 
1288: $F^*$ in that order. To explore different potential local minima use different initial 
1289: conditions and select the solution with lowest $F^*$. Since this algorithm penalizes groups 
1290: with few members it turns out that, for sufficiently large $K$ and $L$, some sample and/or 
1291: variable groups end up empty. If this is not the case, increase $K$ and/or $L$ until at least 
1292: one sample group and one variable group end up empty.
1293: 
1294: \subsubsection{Test example: zoo problem}
1295: 
1296: Let us go back to the zoo problem (Fig. \ref{fig_zoo}a). Now we represent 
1297: this dataset by a bipartite graph, with one class of vertices 
1298: representing the animals and the other class the Boolean variables (e.g. 
1299: Fig. \ref{fig_hg_bg}a,b and d). Using the VB-3 algorithm 
1300: (\ref{p_bg})-(\ref{F_bg_min}), sampling 10,000 initial conditions as in 
1301: \cite{vazquez08}, we perform a two-sides clustering of the bipartite graph 
1302: obtaining the animal population stratification in Fig. \ref{fig_zoo}e and 
1303: the Boolean variables stratification in Fig. \ref{fig_zoo}f. The animal 
1304: clusters are similar to those previously obtained using the statistical 
1305: model on hypergraphs (Fig. \ref{fig_zoo}c and d). The main difference is 
1306: the more refined subdivision of terrestrial mammals, now split in four 
1307: groups (1, 2, 3 and 4).
1308: 
1309: In addition to the animal population stratification the two-sides 
1310: clustering provides association groups between the Boolean variables (Fig. 
1311: \ref{fig_zoo}f). These associations reflect the fact that not all Boolean 
1312: variables are independent, some of them are linked. For example, group 2 
1313: clusters four typical attributes of terrestrial mammals: they have hair, do 
1314: not lay eggs, produce milk and have four legs. In the same way, group 3 clusters 
1315: attributes of fishes and group 4 of birds. Thus, in general, the 
1316: bipartite graph model and the resulting two-sides clustering provide more 
1317: information than the hypergraph approach.
1318: 
1319: \begin{figure}
1320: 
1321: \centerline{\includegraphics[width=3.2in]{bc.fig.bipartite_graph.eps}}
1322: 
1323: \caption{{\bf Finding graph modules, bipartite model:} Mutual information 
1324: $I=I(p^O,p^*)$ between the original $p^O$ and estimated $p^*$ groups 
1325: assignments, relative to its maximum value $I_0$ when $p^*=p^O$. The 
1326: original data was made of a graph with $n=100$ vertices divided in $K=2$ 
1327: groups, with intra- and inter-community connection probabilities $p_1$ 
1328: and $p_2$, respectively. The figure shows the mutual information, between 
1329: the original groups and the group assignment estimated by the VB-3 
1330: algorithm (\ref{p_bg})-(\ref{F_bg_min}), as a function of the 
1331: inter-community connectivity $p_2$. The dashed-dotted, solid and dashed 
1332: lines correspond to the worst, average and best case on 100 test 
1333: examples. In a) we deal with dense communities ($p_1=0.9$)  and the 
1334: algorithm performs well ($I/I_0\approx1$) for small values of the 
1335: inter-community connectivity probability $p_2$.  In b) we deal with sparse 
1336: communities ($p_1=0.1$) and the algorithm performs well for large values 
1337: of the inter-community connectivity probability $p_2$.}
1338: 
1339: \label{fig_bg}
1340: \end{figure}
1341: 
1342: \subsubsection{Test example: finding network modules}
1343: 
1344: The bipartite graph model can be used to find network modules as well. In 
1345: this case one class of vertices represents the original graph vertices and 
1346: the other represents sets of nearest neighbors (Fig. \ref{fig_hg_bg}g). 
1347: The two-sides clustering thus attempts to cluster both the original graph 
1348: vertices and the sets of nearest neighbors. When the original graph is 
1349: undirected the problem is symmetric (e.g. see Fig. \ref{fig_hg_bg}g). 
1350: Indeed, if vertex $i$ belongs to the nearest-neighbor set of vertex $j$ 
1351: then vertex $j$ belongs to the nearest-neighbor set of vertex $i$. As a 
1352: consequence the clustering on the original vertices side cannot be 
1353: differentiated from the clustering of nearest-neighbor sets. 
1354: Intuitively this means that when two vertices belong to the same graph 
1355: module we can say that their nearest-neighbor sets belong to the same 
1356: nearest-neighbor set group.
1357: 
1358: Having specified this mapping we use the VB-3 algorithm 
1359: (\ref{p_bg})-(\ref{F_bg_min}), sampling one initial condition, to find the 
1360: graph modules in the original graph. To illustrate its performance we 
1361: consider once again a graph composed by two communities, with 
1362: probabilities $p_1$ and $p_2$ that two vertices within the same or 
1363: different communities are connected, respectively. Figure \ref{fig_bg} 
1364: shows that the VB-3 algorithm can resolve both dense communities with 
1365: lesser inter-community connections ($p_1\gg p_2$) and sparse communities 
1366: with more inter-community connections ($p_1\ll p_2$).
1367: 
1368: The comparison of Fig. \ref{fig_bg} and \ref{fig_hg} indicates that the 
1369: bipartite graph model performs slightly better than the hypergraph model. 
1370: For example, focusing on the average performance, for $p_1=0.9$ the VB-3 
1371: algorithm performs almost perfectly till $p_2=0.6$, while the VB-2 
1372: algorithm does till $p_2=0.5$. This could be, however, specific to the 
1373: tested set of examples. Further research is required to determine which 
1374: version performs better depending on the dataset under consideration.
1375: 
1376: \section{Discussion and conclusions}
1377: 
1378: The Bayesian approach allows for a systematic solution of data analysis 
1379: problems. Its starting point is a statistical model of the data under 
1380: consideration. From there, using Bayes rule, we can invert the statistical 
1381: model to obtain the posterior distribution of the model parameters. The 
1382: latter can be used, in principle, to calculate or compute averages or other 
1383: magnitudes of interest.
1384: 
1385: One of the main criticisms of the Bayesian approach is the apparent 
1386: ambiguity in selecting the prior distributions. Here we have worked 
1387: further on Jaynes' method \cite{jaynes68}, claiming that the prior 
1388: distributions are given by the most general distribution dictated by the 
1389: symmetries of the problem under consideration. One undesired consequence 
1390: of this method is that when the symmetries are not sufficient constraints 
1391: we obtain improper prior distributions. Yet, the use of improper priors 
1392: can be avoided by working with renormalized distributions that are proper, 
1393: and approach the improper prior in a certain limit. Using this approach we 
1394: report here a correction to Jaynes prior for a likelihood with translation 
1395: and scale invariance and a generalization of Jaynes prior for the binomial 
1396: model to the multinomial model.
1397: 
1398: Having resolved the issue of the prior distributions, we can proceed to 
1399: the application of the Bayesian approach to resolve a population 
1400: structure. Taking inspiration from mixture models \cite{maclachlan00}, in 
1401: particular Dirichlet mixture models \cite{blei03}, we introduce general 
1402: statistical models with a built in population structure at the sample, and 
1403: sample and variable, level. The model with a structure at the sample level 
1404: aims at one-side clustering problems, where the variables are assumed to be 
1405: independent measurements. The model with a structure at both sample and 
1406: variable level aims at two-side clustering problems, where there are classes 
1407: of variables. These statistical models are then postulated as generative 
1408: models of some dataset. Introducing a MF approximation as variational 
1409: function, we then resolve the population structure by solving the inverse 
1410: problem, i.e. determining the sample and/or variable groups and model 
1411: parameters from the data.
1412: 
1413: To illustrate the applicability and systematicity of the variational 
1414: method, here we study the problem of data clustering, in the context of 
1415: real value and Boolean variables. The outcome is a variational Bayes (VB) 
1416: algorithm, a self-consistent set of equations to determine the group 
1417: assignments and the model parameters. The VB algorithm is based on 
1418: recursive equations similar to those for the EM algorithm, but with some 
1419: intrinsic penalization for model bias. In the case of real value data, and 
1420: under the assumption of normal distributions, the contributions favoring 
1421: fitting and penalizing model bias are clearly disentangled. The fitting is 
1422: quantified, as it is expected for normally distributed variables, by the 
1423: mean square deviation. The model bias is quantified by the inverse of the 
1424: square root of the mean cluster sizes. The tendency to reduce the mean 
1425: square deviation is thus balanced by a tendency to increase the cluster 
1426: sizes.
1427: 
1428: In the case of Boolean variables our analysis is based on a mapping into a 
1429: hypergraph or bipartite graph. When we cluster the samples but not the 
1430: Boolean variables the problem is mapped onto a statistical model on 
1431: hypergraphs \cite{vazquez08}. On the other hand, when we perform a 
1432: two-side clustering, clustering both the samples and the Boolean 
1433: variables, the problem is mapped onto a statistical model on bipartite 
1434: graphs.
1435: 
1436: The VB algorithms associated with the statistical model on hypergraphs and 
1437: bipartite graphs can be used to find modules on a graph. Starting from an 
1438: idea by Newman and Leicht \cite{newman07}, we show that the problem of 
1439: graph modules can be mapped onto the problem of finding hypergraph modules 
1440: or bipartite graph modules, where the hypergraph edges and the augmented 
1441: bipartite graph vertices represent nearest-neighbor sets in the original 
1442: graph. The resulting VB algorithms represent a significant improvement 
1443: over the maximum likelihood approaches followed in \cite{newman07} and 
1444: \cite{vazquez08}, by including a self-consistent correction for model 
1445: complexity and bias.
1446: 
1447: It is worth mentioning that, depending on the starting statistical model, 
1448: we could arrive at different versions of the VB algorithm. Indeed, for the 
1449: problem of finding graph modules we could use both the hypergraph and 
1450: bipartite graph models. Furthermore, Hofman and Wiggins \cite{hofman07} 
1451: have obtained another version based on a statistical model with different 
1452: intra and inter-community connection probabilities. These approaches 
1453: differ in the definition of what constitutes a group, community or module. 
1454: We use the definition by Newman and Leicht \cite{newman07} based on 
1455: topological similarity, i.e. two vertices are topologically identical if 
1456: they are connected to the same other vertices in the graph. Thus, we 
1457: obtain groups of vertices whose patterns of connectivity are similar. On 
1458: the other hand, the definition used by Hofman and Wiggins \cite{hofman07} 
1459: is based on the existence of two edge densities, characterizing the 
1460: tendency of having an edge between intra- and inter-group pairs of 
1461: vertices. Depending on the problem and the question we are asking we may 
1462: adopt one or the other definition, and use the corresponding clustering 
1463: method.
1464: 
1465: 
1466: %\bibliography{network}
1467: 
1468: \begin{thebibliography}{19}
1469: \expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi
1470: \expandafter\ifx\csname bibnamefont\endcsname\relax
1471:   \def\bibnamefont#1{#1}\fi
1472: \expandafter\ifx\csname bibfnamefont\endcsname\relax
1473:   \def\bibfnamefont#1{#1}\fi
1474: \expandafter\ifx\csname citenamefont\endcsname\relax
1475:   \def\citenamefont#1{#1}\fi
1476: \expandafter\ifx\csname url\endcsname\relax
1477:   \def\url#1{\texttt{#1}}\fi
1478: \expandafter\ifx\csname urlprefix\endcsname\relax\def\urlprefix{URL }\fi
1479: \providecommand{\bibinfo}[2]{#2}
1480: \providecommand{\eprint}[2][]{\url{#2}}
1481: 
1482: \bibitem[{\citenamefont{McLachlan and Peel}(2000)}]{maclachlan00}
1483: \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{McLachlan}} \bibnamefont{and}
1484:   \bibinfo{author}{\bibfnamefont{D.}~\bibnamefont{Peel}},
1485:   \emph{\bibinfo{title}{Finite Mixture Models}} (\bibinfo{publisher}{John Wiley
1486:   \& Sons, Inc., New York}, \bibinfo{year}{2000}).
1487: 
1488: \bibitem[{\citenamefont{Dempster et~al.}(1977)\citenamefont{Dempster, Laird,
1489:   and Rubin}}]{dempster77}
1490: \bibinfo{author}{\bibfnamefont{A.}~\bibnamefont{Dempster}},
1491:   \bibinfo{author}{\bibfnamefont{N.}~\bibnamefont{Laird}}, \bibnamefont{and}
1492:   \bibinfo{author}{\bibfnamefont{D.}~\bibnamefont{Rubin}}, \bibinfo{journal}{J
1493:   R Statist Soc B} \textbf{\bibinfo{volume}{39}}, \bibinfo{pages}{1}
1494:   (\bibinfo{year}{1977}).
1495: 
1496: \bibitem[{\citenamefont{Akaike}(1974)}]{akaike74}
1497: \bibinfo{author}{\bibfnamefont{H.}~\bibnamefont{Akaike}},
1498:   \bibinfo{journal}{IEEE Trans. Aut. Control} \textbf{\bibinfo{volume}{19}},
1499:   \bibinfo{pages}{716} (\bibinfo{year}{1974}).
1500: 
1501: \bibitem[{\citenamefont{Schwarz}(1978)}]{schwarz78}
1502: \bibinfo{author}{\bibfnamefont{G.}~\bibnamefont{Schwarz}},
1503:   \bibinfo{journal}{Ann. Statist.} \textbf{\bibinfo{volume}{6}},
1504:   \bibinfo{pages}{461} (\bibinfo{year}{1978}).
1505: 
1506: \bibitem[{\citenamefont{Jeffreys}(1939)}]{jeffreys39}
1507: \bibinfo{author}{\bibfnamefont{H.}~\bibnamefont{Jeffreys}},
1508:   \emph{\bibinfo{title}{Theory of Probability}} (\bibinfo{publisher}{Oxford
1509:   University Press, Oxford}, \bibinfo{year}{1939}).
1510: 
1511: \bibitem[{\citenamefont{Spirtes et~al.}(2000)\citenamefont{Spirtes, Glymour,
1512:   and Scheines}}]{spirtes00}
1513: \bibinfo{author}{\bibfnamefont{P.}~\bibnamefont{Spirtes}},
1514:   \bibinfo{author}{\bibfnamefont{C.}~\bibnamefont{Glymour}}, \bibnamefont{and}
1515:   \bibinfo{author}{\bibfnamefont{R.}~\bibnamefont{Scheines}},
1516:   \emph{\bibinfo{title}{Causation, prediction, and search}}
1517:   (\bibinfo{publisher}{MIT Press, Cambridge}, \bibinfo{year}{2000}).
1518: 
1519: \bibitem[{\citenamefont{MacKay}(2003)}]{mackay03}
1520: \bibinfo{author}{\bibfnamefont{D.~J.~C.} \bibnamefont{MacKay}},
1521:   \emph{\bibinfo{title}{Information theory, inference, and learning
1522:   algorithms}} (\bibinfo{publisher}{Cambridge University Press, Cambridge},
1523:   \bibinfo{year}{2003}).
1524: 
1525: \bibitem[{\citenamefont{Robert}(2007)}]{robert07}
1526: \bibinfo{author}{\bibfnamefont{C.~P.} \bibnamefont{Robert}},
1527:   \emph{\bibinfo{title}{The Bayesian choice}} (\bibinfo{publisher}{Springer,
1528:   New York}, \bibinfo{year}{2007}).
1529: 
1530: \bibitem[{\citenamefont{Chen et~al.}(2000)\citenamefont{Chen, Shao, and
1531:   Ibrahim}}]{chen00}
1532: \bibinfo{author}{\bibfnamefont{M.-H.} \bibnamefont{Chen}},
1533:   \bibinfo{author}{\bibfnamefont{Q.-M.} \bibnamefont{Shao}}, \bibnamefont{and}
1534:   \bibinfo{author}{\bibfnamefont{J.~G.} \bibnamefont{Ibrahim}},
1535:   \emph{\bibinfo{title}{Monte Carlo methods in Bayesian Computation}}
1536:   (\bibinfo{publisher}{Springer-Verlag, New York}, \bibinfo{year}{2000}).
1537: 
1538: \bibitem[{\citenamefont{Beal}()}]{beal03}
1539: \bibinfo{author}{\bibfnamefont{M.~J.} \bibnamefont{Beal}}, \bibinfo{note}{Ph.D.
1540:   Thesis, Gatsby Computational Neuroscience Unit, University College London,
1541:   2003}.
1542: 
1543: \bibitem[{\citenamefont{Yedidia et~al.}(2005)\citenamefont{Yedidia, Freeman,
1544:   and Weiss}}]{yedidia05}
1545: \bibinfo{author}{\bibfnamefont{J.~S.} \bibnamefont{Yedidia}},
1546:   \bibinfo{author}{\bibfnamefont{W.~T.} \bibnamefont{Freeman}},
1547:   \bibnamefont{and} \bibinfo{author}{\bibfnamefont{Y.}~\bibnamefont{Weiss}},
1548:   \bibinfo{journal}{IEEE Trans Inf Theor} \textbf{\bibinfo{volume}{51}},
1549:   \bibinfo{pages}{2282} (\bibinfo{year}{2005}).
1550: 
1551: \bibitem[{\citenamefont{Rasmussen}(2000)}]{rasmussen00}
1552: \bibinfo{author}{\bibfnamefont{C.~E.} \bibnamefont{Rasmussen}},
1553:   \bibinfo{journal}{Adv. Neural Inf. Proc. Syst.}
1554:   \textbf{\bibinfo{volume}{12}}, \bibinfo{pages}{554} (\bibinfo{year}{2000}).
1555: 
1556: \bibitem[{\citenamefont{Blei et~al.}(2003)\citenamefont{Blei, Ng, and
1557:   Jordan}}]{blei03}
1558: \bibinfo{author}{\bibfnamefont{D.~M.} \bibnamefont{Blei}},
1559:   \bibinfo{author}{\bibfnamefont{A.~Y.} \bibnamefont{Ng}}, \bibnamefont{and}
1560:   \bibinfo{author}{\bibfnamefont{M.~I.} \bibnamefont{Jordan}},
1561:   \bibinfo{journal}{J. Machine Learn Res} \textbf{\bibinfo{volume}{3}},
1562:   \bibinfo{pages}{993} (\bibinfo{year}{2003}).
1563: 
1564: \bibitem[{\citenamefont{Hofman and Wiggins}()}]{hofman07}
1565: \bibinfo{author}{\bibfnamefont{J.~M.} \bibnamefont{Hofman}} \bibnamefont{and}
1566:   \bibinfo{author}{\bibfnamefont{C.~H.} \bibnamefont{Wiggins}},
1567:   \bibinfo{note}{arXiv:0709.3512v2 [physics.data-an]}.
1568: 
1569: \bibitem[{\citenamefont{Jaynes}(1968)}]{jaynes68}
1570: \bibinfo{author}{\bibfnamefont{E.~T.} \bibnamefont{Jaynes}},
1571:   \bibinfo{journal}{IEEE Trans. Syst. Sci. and Cybernet.}
1572:   \textbf{\bibinfo{volume}{SSC-4}}, \bibinfo{pages}{227}
1573:   (\bibinfo{year}{1968}).
1574: 
1575: \bibitem[{\citenamefont{Kullback}(1959)}]{kullback59}
1576: \bibinfo{author}{\bibfnamefont{S.}~\bibnamefont{Kullback}},
1577:   \emph{\bibinfo{title}{Information Theory and Statistics}}
1578:   (\bibinfo{publisher}{Wiley, New York}, \bibinfo{year}{1959}).
1579: 
1580: \bibitem[{\citenamefont{Vazquez}()}]{vazquez08}
1581: \bibinfo{author}{\bibfnamefont{A.}~\bibnamefont{Vazquez}}, \bibinfo{note}{Phys.
1582:   Rev. E (In press); arXiv:0712.1365v1 [q-bio.PE]}.
1583: 
1584: \bibitem[{\citenamefont{Asuncion and Newman}(2007)}]{asuncion07}
1585: \bibinfo{author}{\bibfnamefont{A.}~\bibnamefont{Asuncion}} \bibnamefont{and}
1586:   \bibinfo{author}{\bibfnamefont{D.}~\bibnamefont{Newman}},
1587:   \emph{\bibinfo{title}{{UCI} machine learning repository}}
1588:   (\bibinfo{year}{2007}),
1589:   \urlprefix\url{http://www.ics.uci.edu/~mlearn/MLRepository.html}.
1590: 
1591: \bibitem[{\citenamefont{Newman and Leicht}(2007)}]{newman07}
1592: \bibinfo{author}{\bibfnamefont{M.}~\bibnamefont{Newman}} \bibnamefont{and}
1593:   \bibinfo{author}{\bibfnamefont{E.}~\bibnamefont{Leicht}},
1594:   \bibinfo{journal}{Proc. Natl. Acad. Sci. USA} \textbf{\bibinfo{volume}{104}},
1595:   \bibinfo{pages}{564} (\bibinfo{year}{2007}).
1596: 
1597: \end{thebibliography}
1598: 
1599: \end{document}
1600: