q-bio0311017/text.tex
1: %For Discussion: advantage of including both gene-score signs in modules.
2: %In Methods IIA change notation w -> m., lose subscript g in Eq. (5)?
3: %E_perp -> E_C  and  E_0 -> E_G ?
4: %Figs. 1 and 2 are fine in black and white, and solid line for module 2 in 1(a).
5: \documentclass[aps,pre,twocolumn,notitlepage,nofootinbib,superscriptaddress]{revtex4}
6: %[aps,pre,preprint,notitlepage,nofootinbib,superscriptaddress]{revtex4}
7: %\documentclass[aps,prl,twocolumn,superscriptaddress]{revtex4}
8: %\documentclass[aps,prl,preprint,superscriptaddress]{revtex4}
9: %\documentclass[twocolumn,prl,aps]{revtex4}
10: %\documentclass[11pt]{article}
11: %\topmargin-0.5in
12: %\textheight60pc
13: %\textwidth44pc
14: \newif\ifpdf\ifx\pdfoutput\undefined\pdffalse\else\pdfoutput=1\pdftrue\fi
15: \newcommand{\pdfgraphics}{\ifpdf\DeclareGraphicsExtensions{.pdf,.jpg}\else\fi}
16: \usepackage{graphicx}% Include figure files
17: \usepackage{color}
18: \newcommand{\avg}[1]{\langle{#1}\rangle}
19: \newcommand{\var}[1]{\mathrm{Var}\left({#1}\right)}
20: \newcommand{\abs}[1]{\left|{#1}\right|}
21: \newcommand{\req}[1]{(\ref{#1})}
22: \newcommand{\beq}{\begin{equation}}
23: \newcommand{\eeq}{\end{equation}} 
24: \newcommand{\del}{\partial}
25: \newcommand{\beqar}{\begin{eqnarray}}
26: \newcommand{\eeqar}{\end{eqnarray}} 
27: \newcommand{\ignore}[1]{}
28: %\newcommand{\comment}[1]{#1}
29: \newcommand{\comment}[1]{}
30: \newcommand{\nignore}[1]{}
31: \newcommand{\vect}[1]{{\bf #1}}
32: \newcommand{\mat}[1]{{\bf #1}}
33: \newcommand{\mr}[1]{{\mathrm{#1}}}
34: \newcommand{\WT}{\mathrm{WT}}
35: \newcommand{\OS}{\mathrm{SS}} 
36: %\newcommand{\MR}{M\!R}
37: \newcommand{\tmax}{t_{\rm{M}}}
38: \newcommand{\Tm}{{\bf M}}
39: \newcommand{\tm}{{\bf m}}
40: \newcommand{\transp}{^{\rm T}}
41: \newcommand{\tG}{t_{\rm G}}
42: %\newcommand{\NG}{N_{\rm G}}
43: \newcommand{\Eort}{\mat{E}_{\rm C}}
44: \newcommand{\Eorti}{\mat{E}_{\rm C,0}}
45: \newcommand{\Eg}{\mat{E}_{\rm G}}
46: %\newcommand{\choose}[2]{\left(\stackrel{#1}{#2}\right)}
47: \newcommand{\gene}[2]{{\fcolorbox{black}{fun#1}{\makebox[30pt]{#2$_{}^{}$}}}}
48: \definecolor{yel}{rgb}{1,1,0}
49: \newcommand{\moduleannotation}[1]{\mbox{\textcolor{yel}{#1}}}
50: \newcommand{\vspac}[1]{\begin{picture}(0,#1)(0,0)\end{picture}}
51: \newcommand{\genesize}{\tiny}
52: \newcommand{\modulesize}{\normalsize}
53: \newcommand{\moduletitlesize}{\large}
54: \newcommand{\module}[1]{\setlength{\unitlength}{0.6pt}\fbox{\begin{minipage}{3.200in}\begin{flushleft}#1\end{flushleft}\end{minipage}}}
55: %\newcommand{\xgdist}{35}
56: %\newcommand{\ygdist}{35}
57: %\newcommand{\gput}[2]{\put(0,35)}
58: %\newcommand{\gput}[2]{\put(#1\xgdist,#2\ygdist)}
59: %\renewcommand{\baselinestretch}{2}
60: %\renewcommand{\thefootnote}{\fnsymbol{footnote}}
61:  
62: \definecolor{fun0}{rgb}{1,1,1}
63: \definecolor{black}{rgb}{0,0,0}
64: \definecolor{fun1}{rgb}{0.5,1,0.5}
65: \definecolor{fun2}{rgb}{0.5,0.5,1}
66: \definecolor{fun3}{rgb}{0.8,0.6,0.8}
67: \definecolor{fun4}{rgb}{1,0.5,0.6}
68: 
69: \begin{document}
70: \title{Finding regulatory modules through large-scale gene-expression data analysis} 
71: \author{Morten Kloster}
72: \affiliation{Department of Physics, Princeton University, Princeton, New Jersey 08544}
73: \affiliation{NEC Laboratories America, Inc., 4 Independence Way, Princeton, New Jersey 08540}
74: \author{Chao Tang}
75: \email[To whom correspondence should be addressed. E-mail:]{tang@nec-labs.com}
76: \affiliation{NEC Laboratories America, Inc., 4 Independence Way, Princeton, New Jersey 08540}
77: \affiliation{Center for Theoretical Biology, Peking University, Beijing 100871, China}
78: \author{Ned Wingreen}
79: \affiliation{NEC Laboratories America, Inc., 4 Independence Way, Princeton, New Jersey 08540}
80: 
81: \date{\today}
82: 
83: \begin{abstract}
84: \end{abstract}
85: 
86: %\pacs{87.10.+e, 87.23.Kg}
87: 
88: \begin{abstract}
89: The use of gene microchips has enabled a rapid accumulation 
90: of gene-expression data.
91: One of the major challenges of analyzing this data is the diversity,
92: in both size and signal strength, of
93: the various modules in the gene regulatory networks of organisms. Based on
94: the Iterative Signature
95: Algorithm [Bergmann, S., Ihmels, J. \& Barkai, N. (2003) {\it Phys. Rev.
96: E} {\bf 67}, 031902],
97: we present an algorithm---the Progressive Iterative Signature Algorithm
98: (PISA)---that, by sequentially eliminating
99: modules, allows unsupervised identification of both large and 
100: small regulatory modules. We applied PISA to a large set of
101: yeast gene-expression data, and, using the Gene Ontology database
102: as a reference, found that the algorithm is much better able to 
103: identify regulatory modules than methods based on
104: high-throughput transcription-factor binding experiments or 
105: on comparative genomics.
106: \end{abstract}
107: 
108: \maketitle
109: 
110: \section{Introduction}
111: 
112: The introduction of DNA microarray technology has made it possible to
113: acquire vast 
114: amounts of gene expression data, raising the issue of how best to
115: extract information from this data.
116: While basic clustering algorithms have been successful at finding
117: genes 
118: that are coregulated for a small, specific set of experimental
119: conditions%
120: ~\cite{Alon_Broad_patterns,Eisen_Cluster_analysis,Tamayo_Interpreting_patterns},
121: these algorithms are less effective when applied to large data sets due
122: to two
123: well-recognized limitations. First, standard clustering algorithms
124: assign each gene to a single cluster, while many genes in fact 
125: belong to multiple transcriptional 
126: regulons~\cite{Bittner_Data_analysis,Cheng_Biclustering_of,Gasch_Exploring_the,SA}.
127: Second, each transcriptional regulon may only be active in a few
128: experiments, and the remaining experiments will only contribute to the
129: noise~\cite{Getz_Coupled_two-way,Cheng_Biclustering_of,SA}.
130: 
131: A number of approaches have been proposed to overcome one or both of
132: these problems
133: \cite{Getz_Coupled_two-way,Califano_Analysis_of,Cheng_Biclustering_of,Owen_Gene_Recommender,Gasch_Exploring_the,Lazzeroni_Plaid_Models}.
134: \ignore{
135: , including iteratively clustering on subsets of the
136: genes/conditions~\cite{Getz_Coupled_two-way}, searching for
137: patterns/submatrices
138: of the expression data with low mean squared
139: residue[MEANING?]~\cite{Califano_Analysis_of, Cheng_Biclustering_of},
140: complementing sets of query genes
141: based on ranks[MEANING?]~\cite{Owen_Gene_Recommender}, fuzzy
142: clustering~\cite{Gasch_Exploring_the} and plaid
143: models[MEANING?]~\cite{Lazzeroni_Plaid_Models}.
144: [WE NEED TO SAY SOMETHING ABOUT WHY THESE APPROACHES ARE NOT AS GOOD AS
145: SA/ISA -
146: CAN WE SAY THAT THESE CLUSTERING METHODS DON'T DIRECTLY EXPLOIT THE
147: BIOLOGY, I.E.
148: REGULONS CONTROLLED BY TRANSCRIPTION FACTORS, WITH MANY TO MANY
149: MAPPING?]
150: }
151: A particularly promising approach, the Signature Algorithm (SA), was
152: introduced in 2002 by Ihmels {\it et al.}~\cite{SA}. Based on input sets
153: of related genes, SA identifies ``transcription modules'' (TMs), {\it
154: i.e.} sets of coregulated genes along with the sets of conditions for
155: which the genes are strongly coregulated. 
156: SA is well grounded in the biology of gene regulation. Typically, a
157: single transcription factor regulates multiple genes; a TM naturally
158: corresponds to a set of such genes and the conditions under which the
159: transcription factor is active. The authors tested the algorithm on a
160: large data set for the yeast {\it Saccharomyces cerevisiae}. By applying
161: SA to various sets of genes that were known or believed to be related,
162: they identified a large number of TMs.
163: 
164: Soon after, Bergmann {\it et al.}~\cite{ISA} published the Iterative
165: Signature Algorithm (ISA), which uses the output of SA as the input for
166: additional runs of SA until a fixed point is reached. By applying ISA to
167: random input sets and varying the threshold coefficient $t_{\rm{G}}$
168: (see below), the authors found almost all the TMs that had been
169: identified using SA, as well as a number of new modules. Many of these
170: modules proved to be in excellent agreement with existing knowledge of
171: yeast gene regulation.
172: 
173: While ISA can identify many transcriptional regulons from
174: gene-expression data, the algorithm has significant limitations. The
175: modules found depend strongly on the value of a threshold coefficient
176: $\tG$ used in the algorithm. To find all the relevant modules, a large
177: range of threshold values must be considered, and for each threshold the
178: algorithm may find thousands of fixed points, many of which are
179: spurious. While the largest, strongest modules are easily identified,
180: among the smaller, weaker modules it is a major challenge to identify
181: the real transcriptional regulons. Weak modules can even be completely
182: ``absorbed'' by stronger modules.
183: 
184: The performance limitations of ISA are related to a number of
185: algorithmic limitations. The need for a large range of thresholds is
186: partially due to the threshold definition, and the large number of fixed
187: points is due to the large positive feedback in the algorithm. The main
188: conceptual limitation of ISA, however, is that it only considers one
189: transcription module at a time. The algorithm does not use knowledge of
190: already identified modules to help it find new modules, and it may find
191: a strong module hundreds of times before it finds a given weak module.
192: An even worse case is shown in Fig.~\ref{AbsorbedFixedPointFig}: When a
193: strong and a weak module are coexpressed for a significant fraction of
194: conditions, it may be impossible to find the weak module by itself---ISA
195: will find only a single stable fixed point, dominated by the strong module.  
196: 
197: \begin{figure}
198: \includegraphics[width=3.375in]{ToyExample3}
199: \caption{
200: A toy model with only two transcription modules.  (a) Module 1 is
201: upregulated under condition A, while module 2, a larger, stronger
202: module, is upregulated under conditions A and B. (b) Normalized
203: histograms of the gene scores given by the Signature Algorithm (SA) for
204: the background (solid fill), module 1 (solid line) and module 2 (dotted
205: fill), when using the true condition vector for either module 1 (condition A)
206: or module 2 (conditions A+B). Even starting with the true condition
207: vectors, SA does not resolve the two modules. Nor
208: can the Iterative Signature Algorithm (ISA) resolve module 1, even if it
209: receives the module itself as input gene set, as the genes from module 2
210: have higher scores also for condition A (there is only one fixed point
211: of ISA). Due to the background noise, 
212: it is also impossible to separate the modules
213: by varying the ISA gene threshold coefficient $\tG$.}
214: \label{AbsorbedFixedPointFig}
215: \end{figure}
216: 
217: A simple way to ensure that the same module is not found repeatedly is
218: to directly subtract the module from the expression
219: data (this approach is used in~\cite{Lazzeroni_Plaid_Models}). A more
220: robust approach is to require the condition vector, {\it i.e.} the
221: weighted condition set, of each new transcription module to be
222: orthogonal to the condition vectors of all previously found modules. In
223: essence, this procedure corresponds to successively removing
224: transcription factors to reveal smaller and weaker transcription
225: modules. The successive removal of condition vectors is the central new
226: feature in our approach, and it is illustrated schematically in
227: Fig.~\ref{PISAtoyex}. We call the modified algorithm the Progressive
228: Iterative Signature Algorithm (PISA). Returning to the example in
229: Figs.~\ref{AbsorbedFixedPointFig}~and~\ref{PISAtoyex}, one finds that
230: PISA can easily identify both TMs: it first finds the strong module, 
231: removes its condition vector, and
232: then the only signal left is that of the weak module.
233: 
234: \begin{figure}
235: \includegraphics[width=3.375in]{PISAExample3}
236: \caption{Once the Progressive Iterative Signature Algorithm (PISA) has
237: eliminated the combined module 1+2 from Fig.~\ref{AbsorbedFixedPointFig}
238: (dashed line), the remaining signal makes it easy to separate the genes
239: of module 1 from the genes of module 2. (a) Remaining signal for each
240: module. (b) Actual gene scores for the new fixed point found by PISA.
241: Genes of module 1 (solid line) have been separated from genes of module
242: 2 (dotted fill) and the background (solid fill).}
243: \label{PISAtoyex}
244: \end{figure}
245: 
246: Progressively eliminating transcription modules {\it \`{a} la} PISA can
247: also improve the prospects for finding unrelated modules. The gene
248: regulation from one module will contribute to the
249: background noise for all unrelated modules. Therefore, eliminating large,
250: strong modules can significantly improve the signal-to-noise ratio of
251: the remaining modules.
252: 
253: 
254: \section {Methods}
255: 
256: \subsection {The Algorithms SA/ISA}
257: 
258: We briefly review the algorithms SA and ISA.
259: \ignore{The algorithm is based on the assumption that the logarithmic
260: gene expression ratios for the organism in question are, to a rough
261: approximation, given by the sum of contributions from a number of
262: transcription modules (plus noise):}
263: A transcription module $\Tm$ can be specified by a condition vector
264: (``experiment signature'') $\tm^{\rm C}$ and a gene vector (``gene
265: signature'') $\tm^{\rm G}$, where nonzero entries in the vectors indicate
266: conditions/genes that belong to the transcription module (TM).
267: \ignore{The (presumed) contribution of $\Tm$ to the expression level of
268: a gene $g$ for condition $c$ is then simply the product of the
269: corresponding vector entries: $({\bf \Delta E}(\Tm))_{gc}=(\tm^{\rm
270: C})_c (\tm^{\rm G})_g$.}
271: 
272: Given an appropriately normalized\footnote{SA actually uses two matrices
273: with different normalizations~\cite{SA}.} matrix $\mat{E}$ of log-ratio
274: gene expression data \comment{[MORTEN - IS "LOG-RATIO GENE-EXPRESSION
275: DATA" STANDARD TERMINOLOGY?]}
276: and an input set $G_{\rm I}$ of genes, SA scores all the conditions in
277: the data set according to how much each condition upregulates the genes
278: in the input set (downregulation gives a negative score). The result is
279: a condition-score vector $\vect{s}^{\rm C}$:
280: \beq
281:   \vect{s}^{\rm C} \equiv \frac{\mat{E}\transp \vect{m}_{\rm in}^{\rm
282: G}}{\abs{\vect{m}_{\rm in}^{\rm G}}},
283: \eeq
284: where $\mat{E}\transp$ is the transpose of $\mat{E}$ and
285: \beq
286: (\vect{m}_{\rm in}^{\rm{G}})_g=
287:   \left\{\begin{array}{ll} 1\;\;\;\;\;& g\in G_{\rm I} \\
288:     0 & g\notin G_{\rm I}\end{array}
289:   \right.
290: \eeq
291: is the gene vector corresponding to the input set. The entries of
292: $\vect{s}^{\rm C}$ that are above/below a threshold $\pm t_{\rm{C}}$
293: constitute the condition vector $\tm^{\rm C}$:
294: \beq
295:   (\vect{m}^{\rm C})_c \equiv (\vect{s}^{\rm C})_c
296: \cdot\Theta(\abs{(\vect{s}^{\rm C})_c}-t_{\rm C}),
297: \eeq
298: where $\Theta(x)=1$ for $x\geq 0$ and $\Theta(x)=0$ for $x<0$.
299: 
300: Similarly, the gene-score vector $\vect{s}^{\rm G}$ measures how much
301: each gene is upregulated by the conditions in $\tm^{\rm C}$, using the
302: entries of $\tm^{\rm C}$ as weights:
303: \beq
304:   \vect{s}^{\rm G} \equiv \frac{\mat{E}\; \vect{m}^{\rm
305: C}}{\abs{\vect{m}^{\rm C}}}.
306:   \label{GeneScoreEq}
307: \eeq
308: The entries of the gene-score vector $\vect{s}^{\rm G}$ that are more
309: than $\tG$ standard deviations $\sigma_{\vect{s}^{\rm G}}^{_{}}$ above
310: the mean gene score in the vector $\vect{s}^{\rm G}$ constitute the gene
311: vector $\tm^{\rm G}$:
312: \beq
313:   (\vect{m}^{\rm G})_g \equiv (\vect{s}^{\rm G})_g
314: \cdot\Theta((\vect{s}^{\rm G})_g-\avg{(\vect{s}^{\rm G})_g}_g-t_{\rm
315: G}\sigma_{\vect{s}^{\rm G}}^{_{}}).
316: \eeq
317: 
318: ISA uses $\vect{m}^{\rm G}$ as the input $\vect{m}_{\rm in}^{\rm G}$ for
319: the next iteration, {\it i.e.} the genes are now weighted according to
320: their gene scores, until a fixed point is reached.
321: 
322: \subsection {The Algorithm PISA}
323: 
324: 
325: \noindent {\it Orthogonalization.}
326: Within PISA, each condition-score vector $\vect{s}^{\rm C}$ is required
327: to be orthogonal to the condition-score vectors of all previously found
328: transcription modules (TMs). Therefore, whenever PISA finds a TM and its
329: associated condition-score vector $\vect{s}^{\rm C}$, the component
330: along $\vect{s}^{\rm C}$ of each gene is removed from the gene
331: expression matrix (see {\it Implementation of PISA} below). This
332: requirement of orthogonality in PISA conflicts with the condition-score
333: threshold as used in ISA. If we make the condition-score vector
334: orthogonal first and then apply the threshold, the vector will no longer
335: be orthogonal, whereas if we apply the threshold first,
336: orthogonalization will give nonzero weight to all conditions,
337: eliminating the noise-filtering benefit of thresholding. We have chosen
338: to eliminate the condition-score threshold completely. In any event,
339: conditions that in ISA would fall below the threshold will have low
340: weight and will give only a small contribution to the noise.\\
341: 
342: \noindent {\it The gene-score threshold.}
343: In ISA, to find all modules, it is necessary to run the algorithm with
344: many different threshold coefficients $\tG$.
345: \ignore{(in~\cite{ISA}, the authors used thresholds from 1.8 to 4.0,
346: but this is not enough to find all possible modules).}
347: For low thresholds one finds a few very large modules (many genes),
348: while for high thresholds one finds many small modules (few genes).
349: Without prior knowledge of the module one is searching for, it is
350: difficult to know what $\tG$ to use.
351: \ignore{This raises the issue of which value of the threshold to
352: use for PISA---}
353: Within PISA, we wish to find all the modules using a single threshold.
354: \ignore{which appears to be difficult. Fortunately, this problem is to a
355: large extent caused by}
356: This requires modifying the threshold definition. In ISA, the gene-score
357: threshold is $\tG \sigma^{\rm ISA}$, where the standard deviation
358: $\sigma^{\rm ISA}$ is computed using the full distribution of gene
359: scores, and includes contributions both from the background and from the
360: module of interest (Fig.~\ref{geneeffectivethresholdfig}). For large,
361: strong modules, the module contribution may be larger than the
362: background contribution. As a result, $\sigma^{\rm ISA}$ is module
363: dependent and $\tG$ must be adjusted to prevent false positives from the
364: background.
365: 
366: 
367: 
368: 
369: %\begin{figure*}
370: \begin{figure}
371: \includegraphics[width=3.375in]{StdDeviations3}
372: \caption{Means and standard deviations as used in ISA and in PISA
373: algorithms, calculated using all the genes (top bars) or only the non-module
374: genes (bottom bars). The mean $\avg{x}^{70\%}$ and standard deviation
375: $\sigma^{70\%}$ from PISA, using only the distribution within the
376: shortest interval that contains 70\% of all genes, are almost identical
377: to the ideal values $\avg{x}^{\rm bg}$ and $0.56\sigma^{\rm bg}$ of the
378: background noise (non-module genes). In ISA, the mean and standard
379: deviation are calculated from the whole distribution and so are strongly
380: module dependent.
381: \ignore{ is much larger. Within ISA, to get an effective threshold of
382: 4.0 $\sigma^{\rm bg}$ the nominal threshold must be $\approx 2.5
383: \sigma^{\rm ISA}$.}
384: This example uses generated data for a module of 300 genes out of 6206
385: total genes. The non-module genes have a normal distribution of
386: gene-expression levels.}
387: \label{geneeffectivethresholdfig}
388: \end{figure}
389: %\end{figure*}
390: 
391: We eliminate this problem in PISA by specifying the threshold relative
392: to the background, which we estimate using the mean, $\avg{x}^{\rm
393: 70\%}$, and the standard deviation, $\sigma^{\rm 70\%}$, of the gene
394: scores within the shortest interval that contains at least 70\% of all
395: the gene scores. By excluding extreme gene scores in this way, we
396: minimize the influence of the module on the means and standard
397: deviations of gene scores (Fig.~\ref{geneeffectivethresholdfig}). As a
398: test, we used $\sigma^{\rm 70\%}$ in place of $\sigma^{\rm ISA}$ in ISA
399: and found both very large and very small modules with a single value of
400: $\tG$.
401: 
402: We need to be conservative when selecting the gene-score threshold
403: because, if PISA misidentifies a module, elimination of its condition
404: vector can lead to errors in other modules. Therefore, the number of
405: genes included in modules due to noise should be very low. We have used
406: a threshold of $7.0\sigma^{\rm 70\%}$, which for a Gaussian distribution
407: corresponds to about $3.9\sigma$. The chance of including a gene due to
408: noise is about $10^{-4}$ per gene, {\it e.g.} with the 6206 genes in the
409: yeast data set, the average number of genes included by mistake in each
410: module would be about 0.62. Using a high threshold means that we may
411: miss genes that should belong to a module; however, this is less risky
412: than including genes by mistake. As PISA proceeds by eliminating
413: condition-score vectors, it does not matter whether we identify all the
414: genes in a module, as long as the condition-score vector is accurate.
415: Once
416: PISA has finished, we can easily see which genes would be included
417: when using various gene-score thresholds for the same condition-score
418: vector.
419: 
420: ISA only considers sets of genes that have {\it high} gene scores, {\it
421: i.e.} positive signs. As discussed in~\cite{SA}, this can lead to two
422: modules that are regulated by the same conditions but with opposite
423: sign. In contrast, PISA includes all genes with sufficiently extreme
424: scores in a single module, and the relative signs of gene scores specify
425: whether the genes are coregulated or counter-regulated.\\ 
426: 
427: 
428: \noindent {\it Implementation of PISA.}
429: To begin, PISA requires a matrix {\bf E} of log-ratio gene-expression
430: data, with zero average for each condition.
431: Two matrices are obtained from {\bf E}: The first $\Eg$ is
432: normalized for each gene 
433: $$\avg{(\Eg)_{gc}}_c=0,\;\;
434: \avg{(\Eg)_{gc}^2}_c=1\;\;\;\;\;\;\forall g\in G.$$
435: Normalization of $\Eg$ is essential so that the gene-score threshold
436: can be applied to all genes on an equal footing. 
437: The second matrix $\Eort$ is obtained from $\Eg$ by normalizing for
438: each condition, $ \avg{(\Eorti)_{gc}^2}_g=1$, 
439: where $\Eorti$ denotes the initial $\Eort$.
440: (Note that this is essentially the opposite of the notation used
441: in~\cite{SA}.)
442: We then apply a modified version of ISA, mISA (see below), a large
443: number of times (typically 10,000), and whenever mISA finds a module, we
444: remove from $\Eort$ the components along the module's condition-score vector $\vect{s}^{\rm C}$:
445: \beq
446:   \Eort^{\mr{new}} \equiv \Eort- \Eort \frac{\vect{s}^{\rm{C}}
447: (\vect{s}^{\rm{C}})\transp}{\abs{\vect{s}^{\rm{C}}}^2}
448: \eeq
449: \ignore{The new matrix $\Eort^{\rm new}$, which is used for the next
450: applications of mISA, will still have zero average over genes $g$,
451: %\avg{(\Eort^{\rm new})_{gc}}_g = 0$, but the variance,
452: %\avg{(\Eort^{\rm new})2_{gc}}_g$, may be different for different
453: conditions $c$.}
454: 
455: \ignore {The matrix $\Eg$, which is used to calculate the gene scores,
456: stays properly normalized throughout the algorithm, thus the scores for
457: different genes will always be comparable. As we don't use a condition
458: threshold, we don't have to worry about whether or not the condition
459: scores, calculated using $\Eort$, are comparable or not.}
460: 
461: As mISA is repeatedly applied, new modules are found less and less
462: frequently. For example, one run of 10,000 applications of mISA found
463: 496 modules, and 287 of them were found in the first 1,000 applications.
464: \ignore{711 modules, and 504 of them were found in the first 2,000} 
465: As the later modules are also generally smaller and less reliable, the
466: exact number of times mISA is applied is not very important.\\
467: 
468: \noindent {\it mISA.}
469: As input, the modified Iterative Signature Algorithm (mISA) requires the
470: two matrices $\Eort$ and $\Eg$. We start each application of mISA by
471: generating a random set of genes $G_0$ and a corresponding gene vector
472: $\vect{m}_0^{\rm{G}}$:
473: \ignore{The number of genes in $G_0$ is currently a random number $2\leq
474: \abs{G_0}\leq 51$.}
475: %
476: $$(\vect{m}_0^{\rm{G}})_g=\left\{\begin{array}{ll} 1\;\;\;\;\;& g\in G_0
477: \\ 0 & 
478: g\notin 
479: G_0\end{array} \right.$$
480: Each iteration $i$ within mISA consists of multiplying the transpose of
481: $\Eort$ by the gene vector $\vect{m}_i^{\rm G}$ to produce the
482: condition-score vector $\vect{s}_i^{\rm C}$:
483: $$\vect{s}_i^{\rm{C}} \equiv \Eort\transp \vect{m}_{i}^{\rm{G}},$$
484: and then multiplying $\Eg$ by the normalized condition-score
485: vector to produce the gene-score vector $\vect{s}_i^{\rm G}$:
486: $$\vect{s}_i^{\rm{G}} \equiv \frac{\Eg 
487: \vect{s}_i^{\rm{C}}}{\abs{\vect{s}_i^{\rm{C}}}}.$$
488: From $\vect{s}_i^{\rm G}$, one calculates the gene vector
489: $\vect{m}_{i+1}^{\rm G}$ for the next iteration:
490: $$(\vect{m}_{i+1}^{\rm{G}})_g \equiv (\vect{s}_i^{\rm{G}})_g\, 
491: \Theta\left(\abs{(\vect{s}_i^{\rm{G}})_g-\avg{(\vect{s}_i^{\rm{G}})_g}_g^{70\%}}
492: -t_{\rm{G}}
493: \sigma_{\vect{s}^{\rm{G}}_i}^{70\%}\right).$$
494: 
495: We iterate until: (a) $(\vect{m}^{\rm{G}}_i)_g$ and $(\vect{m}^{\rm{G}}_{i+1})_g$ have the same sign (0, $+$ or $-$) for all $g$, (b) the iteration number is $i=20$, or (c) fewer than two genes have nonzero weight. If fewer than five genes have nonzero weight (for (a) or (b)), the result is discarded; otherwise we have found a module with condition-score vector
496: $\vect{s}^{\rm{C}}=\vect{s}^{\rm{C}}_i$, gene-score vector
497: $\vect{s}^{\rm{G}}=\vect{s}^{\rm{G}}_i$,  and gene vector
498: $\tm^{\rm{G}}=\vect{m}^{\rm{G}}_{i+1}$.
499: 
500: We chose a threshold coefficient $t_{\rm{G}} = 7.0$ so that the expected
501: number of genes included in each module due to background noise would be
502: less than one. However, with this high threshold, starting from a random
503: set of genes there was only a very low chance that two or more genes
504: would score above the threshold in the first iteration\footnote{This is
505: not an issue in ISA, where the condition threshold helps to pick out
506: the, possibly very small, signal from the noise.}. To increase the
507: chance of finding a module, we used a different formula for
508: $\vect{m}_{1}^{\rm{G}}$. Instead of selecting only genes with scores
509: above the threshold, we kept a random number $2\leq n\leq 51$ of the
510: genes with the most extreme scores. This procedure was generally
511: adequate to produce a correlated set of genes for the next iteration.\\
512: 
513: 
514: \noindent {\it Consistent modules.}
515: ISA typically finds many different fixed points corresponding to the
516: same module, each differing by a few genes. PISA only finds each module
517: once during a run, but the precise genes in the module depend on the
518: random input set of genes and also on which modules were already found
519: and eliminated. Furthermore, PISA sometimes finds a module by itself,
520: while other times it may find the module joined with another module, or
521: PISA may find only part of a module, or not find the module at all. To
522: get a reliable set of modules, it was necessary to perform a number of
523: runs of PISA and identify the modules that were consistent from run to
524: run. To identify consistent modules, we first tabulated preliminary
525: modules---transcription modules found by individual runs of PISA. A
526: preliminary module contributes to a consistent module if the preliminary
527: module 
528: contains more than half the genes in the full module, regardless of
529: gene-score sign, 
530: and these genes constitute at least 20\% of the genes in the preliminary
531: module.
532: A gene is included in the consistent module if the gene occurs in more
533: than 50\% of the contributing
534: preliminary modules, always with the same gene-score sign. \\
535: 
536: \noindent{\it Correlations between condition-score vectors.}
537: Once we identified a consistent module, $\vect{m}^{\rm G}$, we
538: calculated the raw condition-score vector $\vect{r}=\Eorti \transp
539: \vect{m}^{\rm G}$, using the initial value of the gene-expression-data
540: matrix $\Eort$. From the $\vect{r}$'s we evaluated the condition
541: correlations
542: $\vect{r}\cdot\vect{r}^\prime/(\abs{\vect{r}}\abs{\vect{r}^\prime})$
543: between different modules. 
544: \ignore{(This correlation can %be much larger than the average correlation
545: between individual genes from the two modules, as we here average over
546: many genes, reducing the noise.)}
547: \ignore{(This correlation can be much larger that the average
548: correlation between individual genes from the two modules: For the
549: modules we have found for (1) iron (and other metals) metabolism and (2)
550: phosphate metabolism, the average correlation between genes is 0.0135,
551: while the correlation between the modules is 0.11. These values,
552: however, depend on the normalization used.)}
553: \\
554: 
555: \noindent Additional details are discussed in the supporting material.
556: \ignore{\noindent {\it Details.}[I'D PREFER TO SHORTEN THIS TO JUST
557: "ADDITIONAL
558: DETAILS ARE DISCUSSED IN 
559: THE SUPPORTING MATERIAL". INSTEAD OF PUTTING THIS IN ITS OWN SECTION,
560: CAN IT GO AT THE BOTTOM OF
561: THE IMPLEMENTATION OF PISA SUBSECTION?]
562: There are some additional, relatively minor changes that we have made to
563: ISA,either to improve performance or to compensate for removing the
564: threshold $t_{\rm C}$. These are discussed in the supporting material.}
565: 
566: \subsection{$p$-Values}
567: 
568: Given a set containing $m$ genes out of the total of $N_{\rm G}$, the
569: $p$-value for having at least $n$ genes in common with a Gene Ontology
570: (GO) category containing $c$ of the $N_{\rm G}$ genes is
571: \beq
572:   p = \sum_{i=n}^{\min\{c,m\}} \frac{{{c}\choose{i}}{{N_{\rm
573: G}-c}\choose{m-i}}}{{{N_{\rm G}}\choose{m}}},
574: \eeq
575: \ignore{\beq
576:   p = 1-\sum_{i=0}^{n-1} \frac{{{c}\choose{i}}{{N_{\rm
577: G}-c}\choose{m-i}}}{{{N_{\rm G}}\choose{m}}}.
578: \eeq}
579: We ignore any genes that are not present in our expression data when counting $c$.
580: 
581: 
582: \section{Results}
583: 
584: We applied PISA to the yeast data set used in~\cite{ISA}, which consists
585: of log-ratio gene-expression data for $N_{\rm G}=6206$ genes and $N_{\rm
586: C}=1011$ experimental conditions (approximately 10\% of the values are
587: missing or invalid). Normalization gives the matrices $\Eg$ and $\Eort$
588: (see Methods for details).
589: 
590: As a preliminary test, we repeatedly applied PISA to one fully scrambled
591: version of the matrix $\Eg$ 
592: (and the corresponding $\Eort$). From run to run, the algorithm
593: identified many large modules derived almost entirely 
594: from a single condition, as expected in light of the broad distribution
595: of the raw gene-expression data (Fig.~\ref{distributions} in supporting material). 
596: \ignore{This is not too surprising, as the distributions for raw expression
597: data
598: %%(Fig.~\ref{distributions}(a))
599: %and for normalized data
600: %%(Fig.~\ref{distributions}(b))
601: %both have long tails (Fig. 1 in supporting material), thus for modules
602: that derive from only a very small number of %conditions, the noise
603: statistics will be far from Gaussian (for most modules, the gene scores
604: are averages of many %contributions, and the noise will be Gaussian).
605: }
606: PISA also found many small modules, but these differed from one run to
607: the next.
608: We were able to eliminate both these classes of false positives using
609: filters for consistency, recurrence, and number of contributing
610: conditions (Fig.~\ref{modulequality} in supporting material).
611: 
612: 
613: We performed 30 runs of PISA on the yeast data set
614: \ignore{(the same data set used in~\cite{ISA})}
615: and identified the modules that appeared consistently, using the filters
616: derived above.
617: At the start of each run, only a few modules could be found with our
618: single choice of gene threshold $\tG$. Nevertheless, PISA did
619: consistently find new modules after eliminating others, demonstrating
620: that removing the condition vectors of found modules improves the 
621: signal-to-noise ratio for the remaining ones.
622: 
623: For most of the modules we found, the genes were coregulated, {\it
624: i.e.} all the gene scores had the same sign. (In contrast, the modules
625: that were eliminated by the filters often had about equal numbers of
626: genes of either sign.) There were, however, a significant number of
627: modules with a few gene scores differing in sign from the rest, and a
628: few modules 
629: with many gene scores of both signs, {\it e.g.} the a/$\alpha$ pheromone
630: production/detection module. Furthermore, many of the modules found by
631: PISA agreed closely with modules identified by ISA at various
632: thresholds, while other PISA modules were subsets of ISA modules. Some
633: PISA modules, for example the de novo purine synthesis module
634: (Fig.~\ref{PurineModule}), were significantly more complete than the
635: ones found by ISA (at any threshold).
636: 
637: \begin{figure}
638: %\setlength{\unitlength}{0.6pt}
639: %\fbox{
640: %\begin{minipage}{3.200in}
641: %\begin{flushleft}
642: \module{
643: \moduletitlesize
644: 
645: Module: De novo purine biosynthesis
646: \modulesize
647: 
648: \vspace{10pt}
649: Number of genes: 32
650: 
651: Average number of contributing conditions: 14.6
652: 
653: Consistency: 0.83
654: 
655: Best ISA overlap: 0.59 at threshold 5.0, frequency 16
656: 
657: \vspace{10pt}
658: \genesize
659: \begin{picture}(300,150)(0,0)
660: \ignore{
661:        MTD1  28  490.002
662:        ADE2  28  478.121
663: 
664:        SHM2  28  468.812
665:       ADE17  28  446.213
666:       ADE13  28  419.476
667:        ADE1  28  414.622
668:        GCV1  28  403.635
669:        ADE4  28  339.101
670:      ADE5,7  26  334.772
671:        CEM1  28  314.833
672:        GCV2  28  310.791
673:        SER1  28  310.702
674:     YGL186C  28  310.375
675:        GCV3  28  286.458
676:       ADE12  28  284.974
677:        HIS4  28  273.873
678:        ADE8  27  265.844
679:     YDR089W  27  243.694
680:        ADE6  25  235.064
681:        SER2  26  231.39
682:        SER3  26  228.487
683:       SER33  27  222.383
684:        ADE3  26  217.567
685:        MET6  26  204.301
686:     YPR004C  25  203.972
687:        AIP2  24  186.574
688:    ETF-BETA  23  153.927
689:        HIS7  21  135.631
690:        URA4  21  128.485
691:        SSU1  18  115.747
692:        HIS1  17  108.011
693:        HIS5  16  96.4565
694: }
695: \put(  65,  75){\gene{2}{GCV3}}
696: \put( 325, 125){\gene{1}{ADE1}}
697: \put( 195,  25){\gene{3}{HIS7}}
698: \put( 195,  75){\gene{3}{HIS4}}
699: \put(  65,  25){\gene{4}{AIP2}}
700: \put(   0, 100){\gene{2}{GCV1}}
701: \put( 325,  75){\gene{0}{YDR089W}}
702: \put( 260,  75){\gene{1}{ADE8}}
703: \put(   0,   0){\gene{3}{HIS1}}
704: \put( 195, 100){\gene{4}{CEM1}}
705: \put( 130,  50){\gene{2}{SER3}}
706: \put( 325,  50){\gene{2}{MET6}}
707: \put(   0,  75){\gene{1}{YGL186C}}
708: \put( 130, 100){\gene{1}{ADE5,7}}
709: \put(   0,  50){\gene{1}{ADE6}}
710: \put( 260,  50){\gene{1}{ADE3}}
711: \put( 130,  25){\gene{4}{ETF-BETA}}
712: \put(  65,  50){\gene{2}{SER2}}
713: \put( 195,  50){\gene{2}{SER33}}
714: \put(  65,   0){\gene{3}{HIS5}}
715: \put(   0, 125){\gene{2}{MTD1}}
716: \put( 130, 125){\gene{2}{SHM2}}
717: \put( 260, 125){\gene{1}{ADE13}}
718: \put( 260,  25){\gene{4}{URA4}}
719: \put( 195, 125){\gene{1}{ADE17}}
720: \put( 260, 100){\gene{2}{GCV2}}
721: \put(  65, 100){\gene{1}{ADE4}}
722: \put( 130,  75){\gene{1}{ADE12}}
723: \put(  65, 125){\gene{1}{ADE2}}
724: \put( 325, 100){\gene{2}{SER1}}
725: \put( 325,  25){\gene{4}{SSU1}}
726: \put(   0,  25){\gene{4}{YPR004C}}
727: \end{picture}
728: 
729: 
730: \vspace{20pt}
731: \gene{0}{0} \modulesize Unknown \genesize
732: 
733: \vspace{5pt}
734: \gene{1}{1} \modulesize Purine synthesis/transport \genesize
735: 
736: \vspace{5pt}
737: \gene{2}{2} \modulesize Tetrahydrofolate activation \genesize
738: 
739: \vspace{5pt}
740: \gene{3}{3} \modulesize Histidine biosynthesis \genesize
741: 
742: \vspace{5pt}
743: \gene{4}{4} \modulesize Other \genesize
744: 
745: \vspace{10pt}
746: \includegraphics[width=3.2in]{Purine}
747: %\end{flushleft}
748: %\end{minipage}
749: }
750: \caption{The de novo purine synthesis module found with PISA.}
751: \label{PurineModule}
752: \end{figure}
753: 
754: 
755: \ignore{
756: We also find many modules that derive almost entirely from a single
757: condition. For these modules, the statistical propertiThis means that,
758: once the earlier modules had eliminated, the expression pattern caused
759: by this mutation was not similar to any other conditions. These modules
760: are not very reliable (as the the gene scores are hardly averaged at
761: all), and certainly don't have much relevance under physiological
762: conditions.
763: }
764: 
765: 
766: \begin{figure}
767: \begin{picture}(240,160)(0,0)
768: \put(0,0){\includegraphics[width=3.375in]{ModuleCorrelations}}
769: \comment{\multiput (0,0)(0,10){16}{\line(1,0){2}}
770: \multiput (0,0)(10,0){24}{\line(0,1){2}}}
771: \put(42,10){\moduleannotation{$\left.\mbox{\vspac{10}}\right\}$Biosynthesis}}
772: \put(105,3.5){\moduleannotation{-Amino acid (general)}} % height*3.6, roughly
773: \put(190,7.1){\moduleannotation{-Arginine}}
774: \put(105,10.7){\moduleannotation{-Biotin}}
775: \put(190,14.3){\moduleannotation{-Lysine}}
776: \put(105,17.9){\moduleannotation{-De novo purine}}
777: \put(90,31){\moduleannotation{$\left.\mbox{\vspac{14}}\right\}$}}
778: \put(98,22){\rotatebox{90}{\moduleannotation{Stress}}}
779: \put(170,21.5){\moduleannotation{-Oxidative stress}}
780: \put(170,28.6){\moduleannotation{-Proteolysis}}
781: \put(170,35.8){\moduleannotation{-COS genes}}
782: \put(170,43.0){\moduleannotation{-S-S bond repair}}
783: \put(110,25.1){\moduleannotation{-AAD genes}}
784: \put(110,32.2){\moduleannotation{-Trehalose++}}
785: \put(110,39.4){\moduleannotation{-Heat shock}}
786: %\put(110,46.5){\moduleannotation{-Calmodulin}}
787: %\put(110,143.5){\moduleannotation{-Calmodulin}}
788: \put(105,95){\moduleannotation{Mating$\left\{\mbox{\vspac{8}}\right.$}}
789: \put(10,89.7){\moduleannotation{\makebox[90pt][r]{$\alpha$/a
790: difference-}}}
791: \put(10,96.8){\moduleannotation{\makebox[90pt][r]{Mating type a
792: genes-}}}
793: %\put(50,2){Amino acid biosynthesis}
794: %\put(90,6){Arginine biosynthesis}
795: %\put(50,10){Biotin synthesis \& transport}
796: \end{picture}
797: \caption{Correlations between modules identified by PISA. The modules
798: are ordered to form clusters: In the lower left corner are the main
799: amino acid biosynthesis module and several smaller, more specific
800: biosynthesis modules; the other main clusters are, roughly, stress
801: response, mating, and ribosomal proteins/rRNA processing.}
802: \label{Correlations}
803: \end{figure}
804: 
805: \ignore{Examples of correlated modules are: ribosomal proteins (104
806: genes) and RNA-related genes (144 genes)---correlation 0.76 and with no
807: common genes (in ISA, these modules are combined at sufficiently low
808: $\tG$). The protein synthesis module (104 genes) is correlated both with
809: de novo purine synthesis (28 genes) with 6 common
810: genes and a correlation of 0.51, and with nitrogen/sulfur metabolism (58
811: genes, correlation 0.57), but the correlation between the last two
812: modules is only 0.30, {\it i.e.} they seem to be related only through
813: the protein-synthesis module.
814: }
815: 
816: \ignore{
817: The galactose induced module turns on GAL genes and also, as a weaker
818: effect, represses a number of hexose transporters. These hexose
819: transporters also occur in another module (which is consistently found
820: after the galactose induced module) which consists almost entirely of
821: hexose transporters, and this module also contains GAL2, the galactose
822: permease, but in this module it is coregulated with the other hexose
823: transporters, whereas they were counter-regulated in the galactose
824: induced module.
825: }
826: 
827: PISA found several small modules that agree very well with 
828: known gene regulation in yeast. For example, the arginine-biosynthesis
829: module
830: consists of ARG1, ARG3, ARG5,6, ARG8, CPA1, CTF13, and CAR2; out of
831: these,
832: CAR2 has a negative gene score, {\it i.e.} it is counter-regulated
833: relative
834: to the others. The first five genes are precisely the arginine-synthesis
835: genes known to be repressed by arginine, while CAR2 and CAR1
836: (which is the 2nd highest scoring gene that failed to make the
837: threshold)
838: are catabolic genes known to be induced by
839: arginine~\cite{Messenguy_Regulation_Of}.
840: 
841: PISA also found a
842: zinc (zap1 regulated) module consisting of ZRT1, ZRT2, ZRT3, ZAP1,
843: YOL154, INO1, ADH4, and YNL254C. These are almost exactly the highest
844: scoring genes in a microarray experiment comparing expression under
845: zinc starvation of a zap1 mutant vs. wild
846: type~\cite{Lyons_Genome-wide_characterization}; however, our data set
847: does not include this or any other zinc starvation (or zap1 mutant)
848: experiment---indeed, there are no experimental conditions that have a
849: remarkably high score for this module, although conditions from the
850: Rosetta compendium~\cite{Rosetta}, most of which are deletion mutant
851: experiments, tend to have much higher scores than the other conditions
852: (see supporting material). This module, as well as the starvation
853: experiments in~\cite{Lyons_Genome-wide_characterization} and direct
854: transcription factor binding experiments (see below), all indicate that
855: YNL254C is regulated by zap1, and it probably has some function related
856: to zinc starvation/uptake.
857: 
858: In order to evaluate the overall performance of PISA, we compared our
859: modules to the categories in the Gene Ontology (GO) curated 
860: database~\cite{GO}. For the set of genes in each of our modules 
861: we calculated the $p$-value for the overlap with the set of genes in
862: every GO 
863: category (see Methods). The $p$-value is the probability that an 
864: observed overlap occurred by chance. The lowest $p$-value we found was
865: $3.5\cdot 10^{-216}$, for the GO category ``cytosolic ribosome'', 
866: and we found $p$-values
867: below $10^{-20}$ for more than 140 other GO categories. (The modules
868: that were
869: removed by our filters mostly did not have significant $p$-values, and
870: none were below $10^{-10}$.)
871: We used the $p$-values between our PISA modules and the GO categories to
872: compare PISA 
873: to other means of identifying transcriptional modules. Specifically, we 
874: compared PISA to two different databases of genes predicted to be
875: regulated by single transcription factors. Database ``A'' contains
876: genes that were enriched through immunoprecipitation with tagged
877: transcriptional regulators~\cite{Lee_Transcriptional_Regulatory}, while
878: Database ``B'' has genes sharing regulatory sequences derived by
879: comparative genomics~\cite{Kellis_Sequencing_and}.
880: Figure~\ref{GOcomparisons} shows the $p$-values between GO and PISA
881: compared to
882: the $p$-values between GO and each of these two databases.\footnote{We
883: used an internal $p$-value threshold of
884: 0.001 for Database A, as suggested
885: in~\cite{Lee_Transcriptional_Regulatory}.} The lower $p$-values for PISA
886: indicate a consistently
887: better agreement between GO and PISA than between GO and the other
888: databases. For a few GO categories
889: Database B has a lower $p$-value than PISA, but these categories are all
890: close to the root of the GO tree 
891: and each contains more than half the genes in yeast.
892: 
893: 
894: 
895: \begin{figure}
896: \includegraphics[width=3.375in]{GOcomparisons_log3}
897: \caption{Best $p$-values onto every Gene Ontology (GO) category.
898: In each panel, we include only GO categories for which at least one
899: $p$-value is below
900: $10^{-10}$. (a) PISA vs. Database A. (b) PISA vs. Database B. (a) inset:
901: Database A vs.
902: Database B---there are very few GO categories onto which both A and B
903: have
904: low $p$-values.}
905: \label{GOcomparisons}
906: \end{figure}
907: 
908: 
909: Compared to microarray data, both Database A and Database B 
910: have a clear disadvantage: their binding sites are assigned to
911: intergenic
912: regions, and if the two genes bordering an intergenic region are
913: divergently transcribed, then the databases do not identify which of the 
914: genes is regulated. In many cases, we found that by comparing sets of
915: genes in Database A to PISA modules, we
916: could decide which of the divergently transcribed genes were
917: actually regulated. For example, Database A lists 6 intergenic regions
918: as binding sites for zap1 at an internal $p$-value threshold of $10^{-5}$,
919: and 4 of these lie
920: between divergently transcribed genes. However, 5 of the 6 intergenic
921: regions border the genes
922: ZRT1, ZRT2, ZRT3, ZAP1, and YNL254C, which PISA identifies as part of the
923: zinc module.
924: 
925: Database A appears to have an additional source of false positives.
926: Intergenic regions that are close to intergenic regions with very low
927: $p$-values often have low $p$-values themselves, even when there is no
928: apparent connection between the genes and no evidence of a binding site in
929: the DNA sequence.  For example, for the de novo purine-biosynthesis
930: module, which is primarily regulated by the bas1 transcription factor,
931: the intergenic region controlling GCV2 has the lowest $p$-value within
932: Database A, $1.1\cdot 10^{-16}$, and all the four closest intergenic 
933: regions have $p$-values below $10^{-5}$. Comparison to PISA 
934: modules can help eliminate these potential false positives: 
935: out of the 29 genes assigned a $p$-value below $10^{-4}$ for bas1 
936: binding in Database A, 13 belong to a single PISA module,
937: 4 others are divergently transcribed adjacent genes, and 6 others 
938: are genes transcribed from nearby intergenic regions. 
939: 
940: %and the 
941: %remaining 6 genes are 3 pairs of divergently transcribed genes 
942: %that we can not explain.
943: 
944: 
945: \ignore{
946: The agreement between the PISA modules and the two databases was mixed,
947: however the PISA modules agreed significantly better with database B
948: than the two databases did with each other. (Note that while both of the
949: databases attempt to identify genes that bind single transcription
950: factors, the PISA modules are effective modules, i.e. they may involve
951: combinatorial patterns of many transcription factors, and we should not
952: expect perfect agreement with the databases.) While some modules show
953: excellent agreement with database B (e.g. the ribosomal proteins
954: module), other modules (e.g. phosphate starvation), while clearly very
955: good, have no significant overlap with the databases. The agreement with
956: the database B was much better for the modules that we kept than for the
957: modules that were discarded by the filters: The geometric average of the
958: probability of the most significant overlap was $10^{-6}$ for the
959: modules we kept, but only 1/58 for the modules that were discarded (when
960: comparing to 113 different database sets).
961: }
962: 
963: 
964: \section{Discussion}
965: 
966: The Progressive Iterative Signature Algorithm
967: (PISA) embodies a new approach to analysis of large gene-expression 
968: data sets. The central new feature in PISA is the robust elimination 
969: of transcription modules as they are found, by removing their 
970: condition-score vectors. Also new to PISA, compared to its 
971: precursors SA \cite{SA} and ISA \cite{ISA}, is the inclusion of both 
972: coregulated and counter-regulated genes in a single module, and the
973: use of a single gene-score threshold.
974: 
975: Altogether, these new features result in an algorithm that 
976: can reliably identify both large and small regulatory modules, 
977: without supervision. We confirmed the performance of PISA by
978: comparison to the Gene Ontology (GO) database -- PISA performed
979: considerably better against GO than either high-throughput 
980: binding experiments or comparative genomics. PISA therefore 
981: provides a practical means to identify new regulatory modules
982: and to add new genes to known modules. 
983: 
984: %At one level, the modular organization of gene regulation in
985: %an organism can be viewed simply as its ``wiring diagram''. 
986: %However, at a deeper level, this organization must reflect
987: %the organism's physiology. 
988: 
989: Can PISA shed any light on the organization of gene expression 
990: beyond the level of individual transcription modules?
991: In~\cite{ISA}, the authors argued that they could trace the relationship
992: between modules from the effects of changing the threshold $\tG$. For
993: instance, a large module might split into two smaller ones as $\tG$ was
994: increased. With PISA, we were able to use a more direct approach. Once
995: we identified the modules, we computed the ``raw'' ({\it i.e.}
996: pre-eliminations) condition-score vector
997: $\vect{r}$ for each module, and from these raw condition-score vectors,
998: we evaluated the condition correlations between modules (see Methods). 
999: Figure~\ref{Correlations} shows the condition correlations between 40 of
1000: the
1001: modules that we can put a name to. A large, positive correlation between
1002: two modules can either indicate that the modules have many genes in
1003: common, {\it e.g.} the genes of the arginine-biosynthesis module are
1004: essentially a subset of the genes of the amino-acid-biosynthesis module, 
1005: or, as in the toy model in 
1006: Figs.~\ref{AbsorbedFixedPointFig}~and~\ref{PISAtoyex},
1007: the modules have few/no genes in common, but the two
1008: sets of genes are similarly regulated under many
1009: conditions. In the toy model, the raw condition-score vectors
1010: $\vect{r}_1$ and $\vect{r}_2$ correspond to the vectors in
1011: Fig.~\ref{AbsorbedFixedPointFig}(a) and their correlation,
1012: $\vect{r}_1\cdot\vect{r}_2/(\abs{\vect{r}_1}\abs{\vect{r}_2})$,
1013: is simply the cosine of the angle between them.
1014: A real example of this second type of correlation is provided by 
1015: the ribosomal-protein module (104 genes) and the rRNA-processing module
1016: (144 genes). They have no genes in
1017: common, but the correlation between them is very high, 0.76.
1018: 
1019: \ignore{The ribosomal proteins module (136 genes) and the rRNA
1020: processing module (102 genes) are a good example of the latter: They
1021: have no genes in common, but the correlation between them is
1022: 0.73\ignore{Average correlation between genes is 0.52}.}
1023: 
1024: Out of the 6206 genes included in the expression data, 2626 genes appeared in at least one module, and 923 genes appeared in more than one module.\footnote{We have adjusted for the fact that for some modules there are several versions that are very similar.} No genes appeared in more than 4 different modules.
1025: 
1026: 
1027: \ignore{\section{Medical Applications}
1028: 
1029: The module-finding algorithms discussed here, including SA/ISA and
1030: PISA have potential applications in medicine. The ``conditions'' in
1031: which gene-expression data is obtained can equally well represent
1032: different tissues and/or different patients. Application of the
1033: algorithms in these cases could help uncover ``disease modules'',
1034: {\it i.e.} sets of genes coregulated in certain tissues 
1035: under certain disease conditions. Knowledge of these modules could
1036: prove valuable in diagnosis and treatment of the disease conditions.
1037: For example, the gene-expression profile of a new patient could
1038: be scanned for patterns of expression corresponding to previously
1039: identified disease modules. This would directly aid in disease
1040: diagnosis. Very accurate diagnosis obtained in this way could help 
1041: guide treatment protocols, particularly if a database of diagnoses,
1042: treatments, and outcomes for previous patients was available for
1043: reference.
1044: }
1045: 
1046: \section{Acknowledgements}
1047: 
1048: We wish to thank J. Ihmels and N. Barkai for sharing their data set,
1049: and Rahul Kulkarni for valuable discussions.
1050: 
1051: C.T. acknowledges support from the National Key Basic Research Project
1052: of China (No. 2003CB715900).
1053: 
1054: 
1055: 
1056: 
1057: \ignore{This breaks the symmetry between genes and conditions, as
1058: pointed out
1059: in~\cite{Barkai2}, but in reality genes and conditions are not
1060: symmetrical
1061: at all: Each gene belongs to a fixed (and presumably fairly small)
1062: set of transcription modules, while the conditions are whatever the
1063: experiment design says - they could affect very few or very many
1064: modules.
1065: If the experiments from which the data is collected, generally affect
1066: only
1067: a few modules each, then thresholding on condition scores makes sense.
1068: If, however, the experiments are designed to affect many modules
1069: simultaneously, in different combinations (note that this gives the most
1070: information from the least number of experiments, although that
1071: information
1072: is not easily extracted just by looking at the data), then thresholding
1073: the
1074: condition scores is not a good idea.}
1075: 
1076: 
1077: 
1078: 
1079: \begin{thebibliography}{99}
1080: 
1081: \bibitem{Eisen_Cluster_analysis}
1082: Eisen, M.B., Spellman, P.T., Brown, P.O. \& Botstein, D. (1998) {\it
1083: Proc.
1084: Natl. Acad. Sci.} {\bf 95}, 14863-14868.
1085: 
1086: \bibitem{Alon_Broad_patterns}
1087: Alon, U., Barkai, N., Notterman, D.A., Gish, K., Ybarra, S., Mack, D. \&
1088: Levine, A.J. (1999) {\it Proc. Natl. Acad. Sci.} {\bf 96}, 6745-6750.
1089: 
1090: \bibitem{Tamayo_Interpreting_patterns}
1091: Tamayo, P., Slonim, D., Mesirov, J., Zhu, Q., Kitareewan, S.,
1092: Dmitrovsky, E., Lander, E.S. \& Golub, T.R. (1999) {\it Proc. Natl.
1093: Acad.
1094: Sci.} {\bf 96}, 2907-2912.
1095: 
1096: \bibitem{Bittner_Data_analysis}
1097: Bittner, M., Meltzer, P. \& Trent, J. (1999) {\it Nature Genet.} {\bf
1098: 22},
1099: 213-215.
1100: 
1101: \bibitem{Getz_Coupled_two-way}
1102: Getz, G., Levine, E. \& Domany, E. (2000) {\it Proc. Natl. Acad. Sci.}
1103: {\bf
1104: 97}, 12079-12084.
1105: 
1106: \bibitem{Califano_Analysis_of}
1107: Califano, A., Stolovitzky, G. \& Tu, Y. (2000) {\it ISMB} {\bf 8},
1108: 75-85.
1109: 
1110: \bibitem{Cheng_Biclustering_of}
1111: Cheng, Y. \& Church, G. (2000) {\it Proc. Int. Conf. Intell. Syst. Mol.
1112: Biol.} {\bf 8}, 93-103.
1113: 
1114: \bibitem{Gasch_Exploring_the}
1115: Gasch, A. \& Eisen, M.B. (2002) {\it Gen. Biol.} {\bf
1116: 3(11)}:research0059.1-0059.22.
1117: 
1118: \bibitem{Owen_Gene_Recommender}
1119: Owen, A.B., Stuart, J., Mach, K., Villeneuve, A.M. \& Kim, S. (2003)
1120: {\it Gen. Res.} {\bf 13}, 1828-1837.
1121: 
1122: \bibitem{Lazzeroni_Plaid_Models}
1123: Lazzeroni, L. \& Owen, A. (2002) {\it Statistica Sinica}, {\bf 12(1)},
1124: 61-86.
1125: 
1126: \bibitem{SA}
1127: Ihmels, J., Friedlander, G., Bergmann, S., Sarig, O., Ziv, Y. \& Barkai,
1128: N. (2002)
1129: {\it Nature Genet.} {\bf 31}, 370-377.
1130: 
1131: \bibitem{ISA}
1132: Bergmann, S., Ihmels, J. \& Barkai, N. (2003) {\it Phys. Rev. E} {\bf
1133: 67}, 031902.
1134: 
1135: \bibitem{Lee_Transcriptional_Regulatory}
1136: Lee, T. I. {\it et al.} (2002), {\it Science} {\bf 298}, 799-804.
1137: 
1138: \bibitem{Kellis_Sequencing_and}
1139: Kellis, M., Patterson, N., Endrizzi, M., Birren, B. \& Lander, E. S.
1140: (2003)
1141: {\it Nature} {\bf 423}, 241-254.
1142: 
1143: \bibitem{Lyons_Genome-wide_characterization}
1144: Lyons, T. J., Gasch, A. P., Gaither, L. A., Botstein, D., Brown, P. O.
1145: \& Eide, D. J. (2000), {\it Proc. Natl. Acad. Sci.} {\bf 97}, 7957-7962.
1146: 
1147: \bibitem{Rosetta}
1148: Hughes, T. R. {\it et al.} (2000) {\it Cell} {\bf 102}, 109-126.
1149: 
1150: \bibitem{GO}
1151: The Gene Ontology Consortium (2001) {\it Genome Res.} {\bf 11},
1152: 1425-1433.
1153: 
1154: \bibitem{Messenguy_Regulation_Of}
1155: Messenguy, F. \& Dubois, E. (2000) {\it Food tech. biotech.} {\bf 38},
1156: 277-285.
1157: 
1158: \end{thebibliography}
1159: 
1160: 
1161: 
1162: 
1163: 
1164: \clearpage
1165: \setcounter{figure}{0}
1166: \setcounter{equation}{0}
1167: \setcounter{section}{0}
1168: \renewcommand{\thesection}{}
1169: \renewcommand{\thefigure}{S\arabic{figure}}
1170: \renewcommand{\theequation}{S\arabic{equation}}
1171: \section{Supporting material}
1172: 
1173: \subsection{Normalization}
1174: 
1175: Here we review in detail the normalization procedure employed in PISA. The most obvious requirement for the normalization is that scores for different genes must be comparable. The procedure itself is as follows: Given a matrix $\mat{E}$ of log-ratio gene-expression data, we first set the average to zero for each condition,
1176: \beq
1177:   (\mat{E}^\prime)_{gc} = (\mat{E})_{gc}-\avg{(\mat{E})_{g^\prime c}}_{g^\prime},
1178:   \label{RemoveConditionAverage}
1179: \eeq
1180: and then normalize to zero mean and unit variance for each gene, giving $\Eg$, which is used in PISA to calculate gene scores:
1181: \beqar
1182:   (\mat{E}^{\prime\prime})_{gc} & = & (\mat{E^\prime})_{gc}-\avg{(\mat{E^\prime})_{gc^\prime}}_{c^\prime} \label{RemoveGeneAverage} \\
1183:   (\Eg)_{gc} & = & (\mat{E}^{\prime\prime})_{gc}/\sqrt{\avg{(\mat{E}^{\prime\prime})_{gc^\prime}^2}_{c^\prime}}.
1184:   \label{NormalizeByGene}
1185: \eeqar
1186: For this normalization to be consistent through the iterations in mISA, the different condition scores must also be comparable. To get the initial value $\Eorti$ of the matrix used to calculate condition scores, we divide $\Eg$ by the rms value for each condition:
1187: \beq
1188:   (\Eorti)_{gc} = (\Eg)_{gc}/\sqrt{\avg{(\Eg)_{g^\prime c}^2}_{g^\prime}}.
1189:   \label{NormalizeByCondition}
1190: \eeq
1191: 
1192: Note that a simple approach would be to normalize for both genes and conditions simultaneously and thus use only a single set of data\footnote{If $\Eg=\Eorti$ initially, then it is equivalent to keep $\Eg$ constant or use $\Eg=\Eort$, which is updated every time PISA finds a module.}---this could be easily accomplished by alternately normalizing over conditions and genes a few times; the data converge quickly. There is, however, a risk of losing significant features of the data through excessive normalization. For some conditions, the typical change in expression levels may be very large, while for others it may be negligible, and it would be misleading to always normalize these to the same level; at the very least, this would give a lower signal to noise ratio. Therefore, we have chosen to normalize $\Eg$ over genes but not conditions, allowing conditions with large changes in expression level to make a proportionately larger contribution to gene scores. For genes, however, it is reasonable to always normalize to the same level. If two genes are in the same module, then there is little reason to consider the gene with the larger dynamical range to be more reliable than the other. That is why we use $\Eg$ to calculate $\Eorti$.
1193: 
1194: Also note the difference between genes and conditions: The variance for a gene often depends on a small number of outlying values, and normalizing over genes prevents these from dominating. In contrast, the variance for a condition typically depends on many genes, and as such is a far more reliable quantity.
1195: 
1196: \ignore{
1197: In~\cite{SA,ISA}, the authors used separate matrices for calculating gene scores and condition scores, due to the different requirements: When calculating gene scores, the data must be normalized, with zero mean and unit average, for each gene, in order for the scores for all the different genes to be comparable to the threshold. Similarly, the data used for calculating conditions scores must be normalized for each condition. A somewhat disturbing consequence of this approach is that some data points end up with different signs in the two matrices, {\it i.e.} the normalized data sets contradict each other! (Both values are typically very small in these cases.) One way to avoid this would be to normalize for both genes and conditions simultaneously and thus use only a single set of data---this is easily accomplished by alternately normalizing over conditions and genes a few times; the data converge quickly.
1198: }
1199: 
1200: 
1201: \subsection{Avoiding Positive Feedback}
1202: 
1203: The basic principle of SA, or an iteration of ISA/mISA, is to find the set of genes whose expression profiles most resemble those of the genes in the input set, either for all conditions (mISA) or for a selected subset of conditions (SA/ISA). Of course, the gene whose expression profile most resembles that of a given gene is the gene itself, thus
1204: \ignore{Ignoring the difference between $\Eort$ and $\Eg$, the gene scores $(\vect{s}_i^{\rm G})_g$ for iteration $i$ in mISA are proportional to the weighted sums of the correlations between gene $g$ and all the genes in the input set $\vect{m}_i^{\rm G}$ for that iteration. Thus,}
1205: there is a potential for significant positive feedback. Adding one gene to the input set would typically increase the score of that gene far more than the score of any other gene. As a consequence of positive feedback, adding one gene to the gene vector of a fixed point would have a considerable chance of yielding another fixed point, and a small set of genes could be a fixed point even if the genes were completely uncorrelated.
1206: 
1207: In PISA, we only find each module (or combination) once for each run, and it is important to be as certain as possible that we have the correct genes. We avoid positive feedback by using leave-one-out scoring for genes that had nonzero weight at the start of the iteration, {\it i.e.} we remove the contribution from gene $g$ from the condition scores $\vect{s}_i^{\rm{C}}$ before we use these scores to calculate the new score for gene $g$:
1208: 
1209: $$(\vect{s}_i^{\rm{G}})_g \equiv
1210: \frac{(\Eg)_{g\_} 
1211: [\vect{s}_i^{\rm{C}}-(\Eort\transp)_{\_g}(\vect{m}_i^{\rm{G}})_g]} 
1212: {\abs{\vect{s}_i^{\rm{C}}-(\Eort\transp)_{\_g}(\vect{m}_i^{\rm{G}})_g}},$$
1213: where $(\mat{A})_{j\_}$ is row $j$ of matrix $\mat{A}$, and $(\mat{A})_{\_j}$ is column $j$ of matrix $\mat{A}$. With a Gaussian distribution of the background noise, this approach is very close to neutral, {\it i.e.} adding a gene will neither affect that gene's score, nor will it significantly change $\sigma^{\rm 70\%}$ of the gene-score distribution.
1214: 
1215: Without positive feedback, fixed points may be marginally stable (or even unstable, {\it i.e.} a limit cycle), thus we do not require a true fixed point; we accept any gene vector reached after 20 iterations in mISA, as long as it contains at least 5 genes.
1216: 
1217: In SA/ISA, the authors do not eliminate positive feedback. Indeed it would be difficult to do so, as adding/removing a gene can change which conditions have scores exceeding the condition threshold. Apart from this complication, the feedback in SA/ISA is proportional to the number of conditions that make the threshold. For small modules, typically only a small fraction of the conditions have scores above the threshold, thus the feedback is lower than it would have been for PISA, which includes all conditions. For large modules, the feedback is only a minor effect in the first place. Nevertheless, the total number of fixed points for ISA is huge due to positive feedback---at a gene threshold coefficient $\tG=4.0$, there are, at a minimum, more than a million fixed points.
1218: 
1219: 
1220: \subsection{Filters}
1221: 
1222: We chose the gene-score threshold as $7.0\sigma^{\rm 70\%}$ so that, on average, less than one gene would be included in a module purely due to background noise. This estimate assumed that the background noise had a Gaussian distribution. For most modules, the gene scores are the sums of contributions from many different conditions, and if these contributions are independent, as they should be for background noise, then the total background noise will have approximately a Gaussian distribution, regardless of the distribution for a single condition (central limit theorem). For modules that derive almost entirely from one or very few conditions, however, the distribution of gene scores may not be Gaussian.
1223: 
1224: While we do not know the true distribution of the background noise, it is reasonable to use the full distribution of the data as a worst case scenario. As shown in Fig.~\ref{distributions}, this distribution is far from Gaussian: it has a fairly sharp cusp at zero and long tails, even after normalization. For this distribution, more than 3.5\% of the values are outside the threshold $\pm 7.0\sigma^{\rm 70\%}$ (this is partially because the long tails contain many genes, and partially because $\sigma^{\rm 70\%}$ is small due to the sharp cusp), {\it i.e.} with a gene-expression matrix randomly drawn from this distribution, for any single condition one would expect to find a module with about 200 genes!
1225: 
1226: \begin{figure}[htb]
1227: \includegraphics[width=3.375in]{DataDistributions}
1228: \caption{Distributions of the yeast microarray data used
1229: (6206 genes/ORFs, 1011 conditions). Roughly 10\% of the data was
1230: invalid/missing
1231: (not included in the distributions). The distribution is sharply cusped
1232: and
1233: has long tails, both before and after normalization (Eqs.~\ref{RemoveConditionAverage}--\ref{NormalizeByGene}).}
1234: \label{distributions}
1235: \end{figure}
1236: 
1237: We applied PISA to a matrix $\Eg$ that had been fully scrambled after normalization\footnote{Scrambling the matrix {\it after} normalization ensured that the distribution remained the same. The data were no longer exactly normalized for each gene, but the deviations were insignificant. Scrambling the data before normalization gave similar results.}. As shown in Fig.~\ref{modulequality}, PISA found many large modules that were based almost entirely on a single condition (however, as the modules were not based on {\it only} one condition, they were not as large as our estimate of 200, above), whereas modules based on many conditions were much smaller. We also applied PISA to a random matrix generated from a Gaussian distribution, and in that case PISA did not find any large modules (in 30 runs, PISA found 8 modules with 20 or more genes; the largest contained 26 genes). In both cases, the small modules found by PISA varied from run to run.
1238: 
1239: \comment{[This is an even more severe problem for ISA: As they use a condition-score threshold, it is possible to find modules that {\it really} depend on only a very few conditions. ISA ignores any module that does not have at least 5 (I think) conditions with at least 70\% (I think) of the score of the highest-scoring condition---this is (in my opinion) a somewhat more arbitrary way to implement a similar filter, and they don't say why they do this.]}
1240: 
1241: \begin{figure}[b]
1242: \includegraphics[width=3.375in]{ModulesQuality2}
1243: \caption{The number of genes $n_{\Tm}^{\rm G}$ in a module $\Tm$ and the number of contributing conditions $n_{\Tm}^{\rm C}$ (see text) were two of the properties we used in our filters to eliminate false modules. PISA applied to a scrambled expression matrix (black) only yielded modules close to the axes (small $n_{\Tm}^{\rm G}$ or small $n_{\Tm}^{\rm C}$), while PISA run on the real data (green) yielded modules with both large $n_{\Tm}^{\rm G}$ and large $n_{\Tm}^{\rm C}$.}
1244: \label{modulequality}
1245: \end{figure}
1246: 
1247: In order to eliminate these false modules we introduced a set of filters. For each preliminary module $\Tm$ we calculated the ``number of contributing conditions'', given as $n_{\Tm}^{\rm C} = \sum_{c}(\vect{s}^{\rm C})_c^2/(\max\{(\vect{s}^{\rm C})_c\})^2$. We ignored any module for which the median of the numbers of contributing conditions for its preliminary modules was below 6 (this threshold worked well; it is somewhat above the threshold required to remove the false positives for the scrambled matrix). We also ignored all modules that had fewer than 5 genes or fewer than 5 contributing preliminary modules, and for modules with fewer than 10 genes we required that the ``consistency'', defined as the average fraction of the genes in the preliminary modules that are in the full module, was above 0.55 (during post processing, we required that this fraction was above 0.2 for {\it each} preliminary module). These filters removed all but one of the modules found by PISA when applied to the scrambled matrix.
1248: 
1249: 
1250: \ignore{
1251: \subsection{More Results}
1252: 
1253: We here discuss the modules found by PISA in more detail. A total of 260 different modules passed our filters; however, some of these were very similar to each other. We assigned each module a weight according to how much overlap there was with other modules that passed the filter, and using this weight, the modules we found roughly correspond to 143 unrelated modules. Figure~\ref{ModuleSizeDistribution} shows the distribution of module sizes.
1254: 
1255: \begin{figure}[htb]
1256: \includegraphics[width=3.375in]{ModuleSizeDistribution}
1257: \caption{The distribution of sizes of modules found by PISA.}
1258: \label{ModuleSizeDistribution}
1259: \end{figure}
1260: 
1261: 
1262: \comment{[HOW TO TAKE DATA?]}
1263: }
1264: 
1265: 
1266: \begin{table*}[h]
1267: \begin{tabular}{|c|c|c|c|c|c|c|}
1268: \hline
1269:  & \# & \# & & Over.& Best & \\
1270: Function & genes & cond. & Cons. & w/ISA & $t_{\rm G}$ & Freq.\\ \hline
1271: Amino acid biosynthesis & 96 & 31.2 & 0.83 & 0.89 & 3.7 & 10090 \\
1272: Arginine biosynthesis & 6 & 5.7 & 0.72 & 0.83 & 6.0 & 60 \\
1273: Biotin synthesis \& transport & 6 & 6.5 & 0.80 & 0.67 & 5.5 & 7 \\
1274: Lysine biosynthesis & 11 & 9.0 & 0.82 & 0.82 & 4.6 & 10 \\
1275: De novo purine biosynthesis & 32 & 13.1 & 0.83 & 0.59 & 5.0 & 16 \\
1276: Oxidative stress response & 69 & 23.8 & 0.91 & 0.32 & 3.4 & (1) \\
1277: Aryl alcohol dehydrogenases & 6 & 15.4 & 0.62 & 0.83 & 4.9 & 8 \\
1278: Proteolysis & 27 & 82.1 & 0.80 & 0.86 & 3.6 & 1661 \\
1279: Trehalose \& hexose metabolism/conversion & 21 & 34.9 & 0.55 & 0.67 &
1280: 3.2 & 910 \\
1281: COS genes & 11 & 9.2 & 0.49 & 1.00 & 3.3 & 756 \\
1282: Heat shock & 52 & 42.8 & 0.78 & 0.38 & 3.2 & (1) \\
1283: Repair of disulphide bonds & 26 & 41.6 & 0.73 & 0.58 & 3.5 & 15 \\
1284: Calcium-calmodulin related & 41 & 32.5 & 0.78 & 0.73 & 3.0 & 2198 \\
1285: Oxidative phosphorylation & 42 & 48.3 & 0.89 & 0.95 & 3.7 & 2600 \\
1286: Gluconeogenesis, fatty acid beta-oxidation & 38 & 18.2 & 0.81 & 0.63 &
1287: 2.9 & 264\\
1288: Mitochondrial ribosomal genes & 52 & 57.6 & 0.79 & 0.89 & 3.3 & 2291 \\
1289: Transcription (RNA polymerase etc.)++ & 22 & 70.4 & 0.59 & 0.52 & 3.2 &
1290: 1 \\
1291: Subtelomerically-encoded proteins & 36 & 48.2 & 0.94 & 1.00 & 3.9 & 6174
1292: \\
1293: Iron/copper uptake & 38 & 10.8 & 0.82 & 0.79 & 3.7 & 1704 \\
1294: Coated vesicles/secretion & 25 & 47.6 & 0.61 & 0.64 & 3.7 & 4 \\
1295: Phosphoglycerides biosynthesis & 33 & 36.1 & 0.86 & 0.61 & 2.9 & 27 \\
1296: Hexose transporters & 10 & 33.9 & 0.74 & 0.60 & 3.8 & 41 \\
1297: Galactose utilization & 23 & 17.4 & 0.84 & 0.74 & 3.2 & 686 \\
1298: Mid sporulation & 97 & 11.7 & 0.90 & 0.70 & 2.7 & 6556 \\
1299: Mating factors/receptors: a/$\alpha$ difference & 26 & 15.8 & 0.57 &
1300: 0.58 & 3.8 & 6\\
1301: Mating & 110 & 31.1 & 0.89 & 0.75 & 2.7 & 24622 \\
1302: Mating type a signaling genes & 6 & 18.6 & 0.26 & 0.83 & 5.5 & 22 \\
1303: Mating genes for mating type a & 15 & 13.6 & 0.41 & 0.53 & 8.0 & 16 \\
1304: Phosphate utilization & 27 & 24.4 & 0.89 & 0.81 & 3.3 & 5796 \\
1305: Glycolysis & 19 & 26.9 & 0.54 & 0.89 & 3.7 & 91 \\
1306: Ergosterol biosynthesis & 36 & 28.3 & 0.89 & 0.69 & 3.1 & 57 \\
1307: Cell cycle G1/S & 66 & 39.1 & 0.80 & 0.81 & 3.7 & 4382 \\
1308: Cell wall (bud emergence) & 17 & 42.7 & 0.76 & 0.94 & 4.0 & 63 \\
1309: Cell cycle M/G1 & 35 & 31.4 & 0.82 & 0.89 & 3.9 & 952 \\
1310: Cell cycle G2/M & 31 & 25.0 & 0.82 & 0.90 & 3.7 & 1258 \\
1311: Uracil synthesis/permeases & 8 & 11.4 & 0.75 & 0.88 & 3.5 & 19 \\
1312: Fatty acid synthesis++ & 22 & 49.4 & 0.86 & 0.50 & 3.1 & 2 \\
1313: Histones & 19 & 34.6 & 0.67 & 0.53 & 3.4 & 2972 \\
1314: Ribosomal proteins & 126 & 49.2 & 0.91 & 0.87 & 3.0 & 18661 \\
1315: rRNA processing & 117 & 46.0 & 0.85 & 0.64 & 2.7 & 13355 \\ \hline
1316: \end{tabular}
1317: \caption{40 of the modules found by PISA that we could assign a name to.
1318: For each module we list the number of genes in the module, the number of
1319: conditions that had a significant contribution to the module, how
1320: consistent
1321: the module was from each run to the next, the maximal overlap with a
1322: module
1323: found by ISA (using 200,000 seeds at each threshold from 1.8 to 15.0),
1324: the threshold value $t_{\rm G}$ at which that overlap was found, and how
1325: many times such an ISA module was found.}
1326: \end{table*}
1327: 
1328: 
1329: \begin{figure}
1330: \setlength{\unitlength}{0.6pt}
1331: \fbox{
1332: \begin{minipage}{3.200in}
1333: \begin{flushleft}
1334: \moduletitlesize
1335: Module: Galactose induced genes
1336: \modulesize
1337: 
1338: \vspace{10pt}
1339: 
1340: Number of genes: 23
1341: 
1342: Average number of contributing conditions: 18.1
1343: 
1344: Consistency: 0.84
1345: 
1346: Best ISA overlap: 0.74 at threshold 3.2, frequency 686
1347: \genesize
1348: \vspace{10pt}
1349: 
1350: \begin{picture}(300,100)(0,0)
1351: \ignore{
1352:       GAL10  30  651.086
1353:        GAL7  30  638.409
1354:        GAL1  30  632.368
1355:        GAL3  30  492.239
1356:        GAL2  30  424.969
1357:     YPL066W  30  412.222
1358:     YOR121C  30  370.898
1359:       GAL80  30  362.546
1360:       PCL10  30  350.308
1361:        GCY1  30  339.059
1362:        MLF3  30  306.93
1363:     YDR010C  24  227.32
1364:     YLR201C  22  200.349
1365:        FUR4  21  188.24
1366:        MUP3  18  178.833
1367:      MRPL24  18  161.74
1368:        OPT2  17  146.191
1369:     YEL057C  16  135.229
1370: }
1371: \put(  65,  75){\gene{1}{GAL7}}
1372: \put(   0,  75){\gene{1}{GAL10}}
1373: \put( 130,  75){\gene{1}{GAL1}}
1374: \put(  65,  25){\gene{4}{FUR4}}
1375: \put( 195,  75){\gene{1}{GAL3}}
1376: \put( 325,  50){\gene{0}{YDR010C}}
1377: \put( 260,   0){\gene{2}{HXT3}}
1378: \put( 325,  25){\gene{0}{YEL057C}}
1379: \put( 130,  50){\gene{4}{PCL10}}
1380: \put( 130,  25){\gene{4}{MUP3}}
1381: \put( 130,   0){\gene{2}{HXT4}}
1382: \put(   0,   0){\gene{2}{HXT1}}
1383: \put( 195,   0){\gene{3}{HSL1}}
1384: \put( 260,  75){\gene{1}{GAL2}}
1385: \put(   0,  25){\gene{0}{YLR201C}}
1386: \put(  65,  50){\gene{1}{GAL80}}
1387: \put(  65,   0){\gene{2}{HXT2}}
1388: \put( 195,  25){\gene{4}{MRPL24}}
1389: \put( 260,  50){\gene{4}{MLF3}}
1390: \put( 195,  50){\gene{1}{GCY1}}
1391: \put(   0,  50){\gene{0}{YOR121C}}
1392: \put( 325,  75){\gene{0}{YPL066W}}
1393: \put( 260,  25){\gene{4}{OPT2}}
1394: \ignore{
1395: \put(  65,  75){\gene{1}{GAL7}}
1396: \put(   0,  75){\gene{1}{GAL10}}
1397: \put( 260,  75){\gene{1}{GAL1}}
1398: \put( 195,  50){\gene{4}{FUR4}}
1399: \put(   0,  50){\gene{1}{GAL3}}
1400: \put( 325,  75){\gene{0}{YDR010C}}
1401: \put( 260,   0){\gene{2}{HXT3}}
1402: \put( 260,  50){\gene{0}{YEL057C}}
1403: \put(  65,  25){\gene{4}{PCL10}}
1404: \put( 325,  25){\gene{4}{MUP3}}
1405: \put(  65,   0){\gene{2}{HXT4}}
1406: \put( 195,   0){\gene{2}{HXT1}}
1407: \put(   0,   0){\gene{3}{HSL1}}
1408: \put( 130,  50){\gene{1}{GAL2}}
1409: \put( 195,  25){\gene{0}{YLR201C}}
1410: \put(   0,  25){\gene{1}{GAL80}}
1411: \put( 130,   0){\gene{2}{HXT2}}
1412: \put( 130,  25){\gene{4}{MRPL24}}
1413: \put( 325,  50){\gene{4}{MLF3}}
1414: \put( 260,  25){\gene{1}{GCY1}}
1415: \put( 195,  75){\gene{0}{YOR121C}}
1416: \put( 130,  75){\gene{0}{YPL066W}}
1417: \put(  65,  50){\gene{4}{OPT2}}
1418: }
1419: \end{picture}
1420: 
1421: \vspace{20pt}
1422: \gene{0}{0} \modulesize Unknown \genesize
1423: 
1424: \vspace{5pt}
1425: \gene{1}{1} \modulesize Galactose induced genes \genesize
1426: 
1427: \vspace{5pt}
1428: \gene{2}{2} \modulesize Hexose transporters (downregulated) \genesize
1429: 
1430: \vspace{5pt}
1431: \gene{3}{3} \modulesize Other, downregulated \genesize
1432: 
1433: \vspace{5pt}
1434: \gene{4}{4} \modulesize Other \genesize
1435: 
1436: \vspace{10pt}
1437: \includegraphics[width=3.2in]{Galactose}
1438: \end{flushleft}
1439: \end{minipage}
1440: }
1441: \caption{The galactose induced module found with PISA. This module turns
1442: on
1443: GAL genes and also, as a weaker effect, represses a number of hexose
1444: transporters.}
1445: \label{GalactoseModule}
1446: \end{figure}
1447: 
1448: \begin{figure}
1449: \setlength{\unitlength}{0.6pt}
1450: \fbox{
1451: \begin{minipage}{3.200in}
1452: \begin{flushleft}
1453: \moduletitlesize
1454: Module: Hexose transporters
1455: \modulesize
1456: 
1457: \vspace{10pt}
1458: 
1459: Number of genes: 10
1460: 
1461: Average number of contributing conditions: 33.7
1462: 
1463: Consistency: 0.74
1464: 
1465: Best ISA overlap: 0.60 at threshold 3.8, frequency 41
1466: \genesize
1467: \vspace{10pt}
1468: 
1469: \begin{picture}(300,50)(0,0)
1470: \ignore{
1471:        HXT3  28  292.002
1472:        HXT4  28  260.904
1473:        HXT2  27  231.856
1474:        HXT6  26  228.681
1475:        HXT7  25  192.247
1476:        GAL2  25  182.687
1477:     YKR075C  21  167.997
1478:        HXT1  19  131.751
1479:        MIG2  15  109.509
1480:        HXT8  18  102.121
1481: }
1482: \put( 260,  25){\gene{1}{HXT7}}
1483: \put( 195,  25){\gene{1}{HXT6}}
1484: \put(   0,  25){\gene{1}{HXT3}}
1485: \put( 130,   0){\gene{3}{MIG2}}
1486: \put(  65,  25){\gene{1}{HXT4}}
1487: \put(  65,   0){\gene{1}{HXT1}}
1488: \put( 195,   0){\gene{1}{HXT8}}
1489: \put(   0,   0){\gene{4}{YKR075C}}
1490: \put( 325,  25){\gene{2}{GAL2}}
1491: \put( 130,  25){\gene{1}{HXT2}}
1492: \ignore{
1493: \put( 130,  25){\gene{1}{HXT7}}
1494: \put(   0,  25){\gene{1}{HXT6}}
1495: \put(  65,  25){\gene{1}{HXT3}}
1496: \put(   0,   0){\gene{3}{MIG2}}
1497: \put( 195,  25){\gene{1}{HXT4}}
1498: \put( 325,  25){\gene{1}{HXT1}}
1499: \put( 195,   0){\gene{1}{HXT8}}
1500: \put( 260,  25){\gene{4}{YKR075C}}
1501: \put(  65,   0){\gene{2}{GAL2}}
1502: \put( 130,   0){\gene{1}{HXT2}}
1503: }
1504: \end{picture}
1505: 
1506: \vspace{20pt}
1507: \gene{1}{1} \modulesize Glucose transporter \genesize
1508: 
1509: \vspace{5pt}
1510: \gene{2}{2} \modulesize Galactose/glucose transporter \genesize
1511: 
1512: \vspace{5pt}
1513: \gene{3}{3} \modulesize Glucose suppression regulator \genesize
1514: 
1515: \vspace{5pt}
1516: \gene{4}{4} \modulesize Similar to glucose suppression regulator
1517: \genesize
1518: 
1519: \vspace{10pt}
1520: \includegraphics[width=3.2in]{Hexose}
1521: \end{flushleft}
1522: \end{minipage}
1523: }
1524: \caption{The hexose transporter module found with PISA. In this module
1525: (which is consistently found after the galactose induced module), the
1526: hexose transporter genes are co-regulated with GAL2, the galactose
1527: permease,
1528: whereas they were counter-regulated in the galactose induced module.
1529: }
1530: \label{HexoseModule}
1531: \end{figure}
1532: 
1533: \begin{figure}
1534: \setlength{\unitlength}{0.6pt}
1535: \fbox{
1536: \begin{minipage}{3.250in}
1537: \begin{flushleft}
1538: \moduletitlesize
1539: Module: Peroxide shock
1540: \modulesize
1541: 
1542: \vspace{10pt}
1543: 
1544: Number of genes: 69
1545: 
1546: Average number of contributing conditions: 23.9
1547: 
1548: Consistency: 0.91
1549: 
1550: Best ISA overlap: 0.34 at threshold 3.4, frequency (1)
1551: \genesize
1552: \vspace{10pt}
1553: %412224201144122444041014204410000433410440110031311420021003230420134
1554: \begin{picture}(300,300)(0,0)
1555: \ignore{
1556:     YKL071W  27  440.999
1557:        GPX2  27  439.203
1558:     YCR102C  27  407.171
1559:        FLR1  27  404.604
1560:        AAD6  27  395.01
1561:     YLR108C  27  393.525
1562:     YDR132C  27  382.926
1563:        GSH1  27  376.947
1564:     YLR460C  27  359.603
1565:        ISU2  27  352.729
1566:     YML131W  27  351.048
1567:     YFL057C  27  349.464
1568:       AAD15  27  343.579
1569:        AAD4  27  340.833
1570:        GTT2  27  328.238
1571:        FRE1  27  301.459
1572:        ATR1  27  300.928
1573:     YKR071C  27  299.813
1574:        TRR1  27  296.053
1575:       AAD14  27  293.316
1576:        SFA1  27  287.805
1577:     YNL134C  27  285.864
1578:     YDR453C  27  284.169
1579:        SDL1  27  280.931
1580:     YOR225W  27  279.217
1581:     YNL260C  27  278.871
1582:        CCP1  27  272.347
1583:     YOL150C  27  270.386
1584:        YSR3  27  268.748
1585:     YGL114W  27  260.416
1586:     YNR074C  27  259.628
1587:        AAD3  27  257.615
1588:        ISA2  27  252.627
1589:        OYE3  27  249.483
1590:        MRS4  27  237.294
1591:        YAP1  27  233.256
1592:        CYT2  27  230.506
1593:        ECM4  27  228.089
1594:        TRX2  26  228.088
1595:     YKL070W  27  224.577
1596:       LYS20  26  217.823
1597:        RIB3  26  214.765
1598:        MMT1  25  209.659
1599:       TAH18  25  209.175
1600:     YMR318C  25  198.007
1601:       TRS31  25  197.644
1602:        GRE2  24  189.743
1603:     YGR223C  24  188.606
1604:        OYE2  23  179.3
1605:     YGR011W  24  177.868
1606:        SOD2  23  170.43
1607:        TTR1  22  166.371
1608:        KSS1  21  150.661
1609:     YKL086W  19  144.809
1610:     YOL029C  19  143.744
1611:        KTR2  20  143.164
1612:       NBP35  19  139.893
1613:        CIN5  18  134.513
1614:        SOD1  18  132.523
1615:        AHP1  17  125.702
1616:        NFU1  17  123.387
1617:        LYS7  17  123.159
1618:        TSA1  16  115.828
1619:        ROX1  16  115.701
1620:     YGR010W  16  113.785
1621:      CDC123  15  107.395
1622:     YHR199C  15  106.447
1623:     YHR111W  14  101.71
1624:     YPL202C  14  96.9853
1625: }
1626: \put( 195, 275){\gene{4}{FLR1}}
1627: \put(  65, 275){\gene{1}{GPX2}}
1628: \put( 130, 275){\gene{2}{YCR102C}}
1629: \put(  65, 150){\gene{2}{AAD3}}
1630: \put( 130, 200){\gene{2}{SFA1}}
1631: \put( 260, 125){\gene{4}{LYS20}}
1632: \put(  65, 225){\gene{2}{AAD4}}
1633: \put(   0, 250){\gene{0}{YDR132C}}
1634: \put(   0, 200){\gene{1}{TRR1}}
1635: \put( 260, 200){\gene{1}{YDR453C}}
1636: \put( 195, 100){\gene{4}{TRS31}}
1637: \put( 325, 125){\gene{4}{RIB3}}
1638: \put( 195,  75){\gene{1}{TTR1}}
1639: \put( 260, 275){\gene{2}{AAD6}}
1640: \put( 325, 250){\gene{2}{YFL057C}}
1641: \put( 130,  50){\gene{4}{NBP35}}
1642: \put( 325, 175){\gene{4}{YGL114W}}
1643: \put( 260,  25){\gene{4}{YGR010W}}
1644: \put(  65,  75){\gene{0}{YGR011W}}
1645: \put( 260,  75){\gene{4}{KSS1}}
1646: \put( 130, 125){\gene{1}{TRX2}}
1647: \put( 325, 100){\gene{0}{YGR223C}}
1648: \put( 130,  75){\gene{1}{SOD2}}
1649: \put(  65,   0){\gene{4}{YHR111W}}
1650: \put(   0,  75){\gene{2}{OYE2}}
1651: \put(   0,   0){\gene{0}{YHR199C}}
1652: \put( 325, 200){\gene{4}{SDL1}}
1653: \put(  65, 250){\gene{4}{GSH1}}
1654: \put( 260,  50){\gene{1}{SOD1}}
1655: \put(   0,  25){\gene{0}{NFU1}}
1656: \put( 195, 125){\gene{0}{YKL070W}}
1657: \put(   0, 275){\gene{0}{YKL071W}}
1658: \put( 325,  75){\gene{0}{YKL086W}}
1659: \put(   0, 125){\gene{4}{CYT2}}
1660: \put( 260, 150){\gene{3}{MRS4}}
1661: \put( 260, 175){\gene{3}{YSR3}}
1662: \put(  65,  50){\gene{4}{KTR2}}
1663: \put( 130, 175){\gene{1}{CCP1}}
1664: \put( 325, 225){\gene{0}{YKR071C}}
1665: \put(  65, 125){\gene{4}{ECM4}}
1666: \put( 130, 225){\gene{4}{GTT2}}
1667: \put( 325, 275){\gene{0}{YLR108C}}
1668: \put( 325,  50){\gene{1}{AHP1}}
1669: \put( 195, 225){\gene{1}{FRE1}}
1670: \put( 325,  25){\gene{0}{CDC123}}
1671: \put( 130, 250){\gene{0}{YLR460C}}
1672: \put( 325, 150){\gene{3}{YAP1}}
1673: \put( 130,  25){\gene{1}{TSA1}}
1674: \put( 260, 225){\gene{3}{ATR1}}
1675: \put( 260, 250){\gene{1}{YML131W}}
1676: \put(  65,  25){\gene{1}{LYS7}}
1677: \put(   0, 100){\gene{4}{MMT1}}
1678: \put( 130, 100){\gene{2}{YMR318C}}
1679: \put( 195, 200){\gene{0}{YNL134C}}
1680: \put(  65, 175){\gene{0}{YNL260C}}
1681: \put(  65, 200){\gene{2}{AAD14}}
1682: \put(   0, 150){\gene{1}{YNR074C}}
1683: \put(   0,  50){\gene{0}{YOL029C}}
1684: \put( 195, 175){\gene{0}{YOL150C}}
1685: \put( 260, 100){\gene{3}{GRE2}}
1686: \put(   0, 225){\gene{2}{AAD15}}
1687: \put( 195,  50){\gene{3}{CIN5}}
1688: \put(   0, 175){\gene{0}{YOR225W}}
1689: \put( 195, 250){\gene{4}{ISU2}}
1690: \put( 195, 150){\gene{2}{OYE3}}
1691: \put( 130,   0){\gene{0}{YPL202C}}
1692: \put(  65, 100){\gene{1}{TAH18}}
1693: \put( 195,  25){\gene{3}{ROX1}}
1694: \put( 130, 150){\gene{4}{ISA2}}
1695: \ignore{
1696: \put(  65, 275){\gene{4}{FLR1}}
1697: \put(   0, 275){\gene{1}{GPX2}}
1698: \put( 130, 275){\gene{2}{YCR102C}}
1699: \put(  65, 250){\gene{2}{AAD3}}
1700: \put( 195, 275){\gene{2}{SFA1}}
1701: \put(   0, 250){\gene{4}{LYS20}}
1702: \put(  65, 175){\gene{2}{AAD4}}
1703: \put( 325, 275){\gene{0}{YDR132C}}
1704: \put( 260, 250){\gene{1}{TRR1}}
1705: \put(   0, 225){\gene{1}{YDR453C}}
1706: \put(  65, 200){\gene{4}{TRS31}}
1707: \put( 130, 200){\gene{4}{RIB3}}
1708: \put( 325, 225){\gene{1}{TTR1}}
1709: \put( 260, 275){\gene{2}{AAD6}}
1710: \put( 325, 250){\gene{2}{YFL057C}}
1711: \put( 260, 225){\gene{4}{NBP35}}
1712: \put( 195, 175){\gene{4}{YGL114W}}
1713: \put(   0, 200){\gene{4}{YGR010W}}
1714: \put( 195, 225){\gene{0}{YGR011W}}
1715: \put( 325, 200){\gene{4}{KSS1}}
1716: \put( 195, 250){\gene{1}{TRX2}}
1717: \put( 130, 250){\gene{0}{YGR223C}}
1718: \put(  65, 225){\gene{1}{SOD2}}
1719: \put( 325, 175){\gene{4}{YHR111W}}
1720: \put( 130, 225){\gene{2}{OYE2}}
1721: \put( 130, 175){\gene{0}{YHR199C}}
1722: \put( 260, 200){\gene{4}{SDL1}}
1723: \put(   0, 175){\gene{4}{GSH1}}
1724: \put( 195, 200){\gene{1}{SOD1}}
1725: \put( 260, 175){\gene{0}{NFU1}}
1726: \put( 130, 125){\gene{0}{YKL070W}}
1727: \put(   0, 150){\gene{0}{YKL071W}}
1728: \put( 130,  50){\gene{0}{YKL086W}}
1729: \put( 130,  25){\gene{4}{CYT2}}
1730: \put(  65,  50){\gene{3}{MRS4}}
1731: \put( 260,  75){\gene{3}{YSR3}}
1732: \put( 260,  25){\gene{4}{KTR2}}
1733: \put( 195,  50){\gene{1}{CCP1}}
1734: \put( 195,  75){\gene{0}{YKR071C}}
1735: \put(   0,  25){\gene{4}{ECM4}}
1736: \put( 325,  75){\gene{4}{GTT2}}
1737: \put( 325, 100){\gene{0}{YLR108C}}
1738: \put( 325,  50){\gene{1}{AHP1}}
1739: \put( 260, 100){\gene{1}{FRE1}}
1740: \put(   0,  75){\gene{0}{CDC123}}
1741: \put( 260, 125){\gene{0}{YLR460C}}
1742: \put( 325, 125){\gene{3}{YAP1}}
1743: \put( 325,  25){\gene{1}{TSA1}}
1744: \put(  65, 100){\gene{3}{ATR1}}
1745: \put( 195, 125){\gene{1}{YML131W}}
1746: \put(  65,  75){\gene{1}{LYS7}}
1747: \put( 130, 100){\gene{4}{MMT1}}
1748: \put( 195, 100){\gene{2}{YMR318C}}
1749: \put(   0, 100){\gene{0}{YNL134C}}
1750: \put( 195, 150){\gene{0}{YNL260C}}
1751: \put( 260, 150){\gene{2}{AAD14}}
1752: \put(   0, 125){\gene{1}{YNR074C}}
1753: \put( 325, 150){\gene{0}{YOL029C}}
1754: \put( 130, 150){\gene{0}{YOL150C}}
1755: \put(  65, 125){\gene{3}{GRE2}}
1756: \put(  65, 150){\gene{2}{AAD15}}
1757: \put(   0,   0){\gene{3}{CIN5}}
1758: \put(   0,  50){\gene{0}{YOR225W}}
1759: \put( 130,  75){\gene{4}{ISU2}}
1760: \put( 130,   0){\gene{2}{OYE3}}
1761: \put(  65,   0){\gene{0}{YPL202C}}
1762: \put(  65,  25){\gene{1}{TAH18}}
1763: \put( 195,  25){\gene{3}{ROX1}}
1764: \put( 260,  50){\gene{4}{ISA2}}
1765: }
1766: \end{picture}
1767: 
1768: \vspace{20pt}
1769: \gene{0}{0} \modulesize Unknown \genesize
1770: 
1771: \vspace{5pt}
1772: \gene{1}{1} \modulesize Peroxidase, superoxide dismutase, reductase
1773: \genesize
1774: 
1775: \vspace{5pt}
1776: \gene{2}{2} \modulesize Dehydrogenase \genesize
1777: 
1778: \vspace{5pt}
1779: \gene{3}{3} \modulesize Other stress related genes \genesize
1780: 
1781: \vspace{5pt}
1782: \gene{4}{4} \modulesize Other \genesize
1783: %\end{figure}
1784: 
1785: \vspace{10pt}
1786: \includegraphics[width=3.2in]{Peroxide}
1787: \end{flushleft}
1788: \end{minipage}
1789: }
1790: \caption{The oxidative stress response module found with PISA. This
1791: module
1792: is significantly more complete than the modules of comparable size found
1793: by ISA.
1794: }
1795: \label{PeroxideShockModule}
1796: \end{figure}
1797: 
1798: 
1799: \ignore{
1800: \begin{figure}
1801: \setlength{\unitlength}{0.6pt}
1802: \fbox{
1803: \begin{minipage}{3.200in}
1804: \begin{flushleft}
1805: \moduletitlesize
1806: 
1807: Module: De novo purine biosynthesis
1808: \modulesize
1809: 
1810: \vspace{10pt}
1811: Number of genes: 29
1812: 
1813: Average number of contributing conditions: 13.797
1814: 
1815: Consistency: 0.859429
1816: 
1817: Best ISA overlap:  at threshold , frequency
1818: 
1819: \vspace{10pt}
1820: \genesize
1821: \begin{picture}(300,150)(0,0)
1822: \put(  65,  50){\gene{2}{GCV3}}
1823: \put( 260, 100){\gene{1}{ADE1}}
1824: \put( 260,   0){\gene{3}{HIS7}}
1825: \put( 325,  50){\gene{3}{HIS4}}
1826: \put( 260,  25){\gene{4}{AIP2}}
1827: \put(   0,  75){\gene{2}{GCV1}}
1828: \put(  65,  25){\gene{0}{YDR089W}}
1829: \put( 195,  50){\gene{1}{ADE8}}
1830: \put( 195,  75){\gene{4}{CEM1}}
1831: \put(   0,  25){\gene{2}{SER3}}
1832: \put(  65,   0){\gene{2}{MET6}}
1833: \put( 325,  75){\gene{1}{YGL186C}}
1834: \put(  65,  75){\gene{1}{ADE5,7}}
1835: \put( 260,  50){\gene{1}{ADE6}}
1836: \put( 325,  25){\gene{1}{ADE3}}
1837: \put( 130,   0){\gene{4}{ETF-BETA}}
1838: \put( 130,  25){\gene{2}{SER2}}
1839: \put(   0,   0){\gene{2}{SER33}}
1840: \put(   0, 100){\gene{2}{MTD1}}
1841: \put( 130, 100){\gene{2}{SHM2}}
1842: \put( 325, 100){\gene{1}{ADE13}}
1843: \put( 195,   0){\gene{4}{URA4}}
1844: \put( 195, 100){\gene{1}{ADE17}}
1845: \put(   0,  50){\gene{2}{GCV2}}
1846: \put( 130,  75){\gene{1}{ADE4}}
1847: \put( 130,  50){\gene{1}{ADE12}}
1848: \put(  65, 100){\gene{1}{ADE2}}
1849: \put( 260,  75){\gene{2}{SER1}}
1850: \put( 195,  25){\gene{4}{YPR004C}}
1851: \end{picture}
1852: 
1853: 
1854: \vspace{20pt}
1855: \gene{0}{0} \modulesize Unknown \genesize
1856: 
1857: \vspace{5pt}
1858: \gene{1}{1} \modulesize Purine synthesis/transport \genesize
1859: 
1860: \vspace{5pt}
1861: \gene{2}{2} \modulesize Tetrahydrofolate activation \genesize
1862: 
1863: \vspace{5pt}
1864: \gene{3}{3} \modulesize Histidine biosynthesis \genesize
1865: 
1866: \vspace{5pt}
1867: \gene{4}{4} \modulesize Other \genesize
1868: 
1869: \vspace{10pt}
1870: \includegraphics[width=3.2in]{Purine}
1871: \end{flushleft}
1872: \end{minipage}
1873: }
1874: \caption{The de novo purine synthesis module found with PISA.}
1875: \label{PurineModule}
1876: \end{figure}
1877: 
1878: 
1879: \begin{figure}
1880: \setlength{\unitlength}{0.6pt}
1881: \fbox{
1882: \begin{minipage}{3.200in}
1883: \begin{flushleft}
1884: \moduletitlesize
1885: Module: Galactose induced genes
1886: \modulesize
1887: 
1888: \vspace{10pt}
1889: 
1890: Number of genes: 22
1891: 
1892: Average number of contributing conditions: 17.6144
1893: 
1894: Consistency: 0.863624
1895: 
1896: Best ISA overlap:  at threshold , frequency
1897: \genesize
1898: \vspace{10pt}
1899: 
1900: \begin{picture}(300,100)(0,0)
1901: \put(  65,  75){\gene{1}{GAL7}}
1902: \put(   0,  75){\gene{1}{GAL10}}
1903: \put( 130,  75){\gene{1}{GAL1}}
1904: \put( 195,  25){\gene{4}{FUR4}}
1905: \put( 195,  75){\gene{1}{GAL3}}
1906: \put(  65,  25){\gene{0}{YDR010C}}
1907: \put( 195,   0){\gene{2}{HXT3}}
1908: \put( 325,  25){\gene{0}{YEL057C}}
1909: \put(  65,  50){\gene{4}{PCL10}}
1910: \put( 130,  25){\gene{4}{MUP3}}
1911: \put(  65,   0){\gene{2}{HXT4}}
1912: \put( 130,   0){\gene{3}{HSL1}}
1913: \put( 260,  75){\gene{1}{GAL2}}
1914: \put(   0,  25){\gene{0}{YLR201C}}
1915: \put( 130,  50){\gene{1}{GAL80}}
1916: \put(   0,   0){\gene{2}{HXT2}}
1917: \put( 325,  50){\gene{4}{MRPL24}}
1918: \put( 260,  50){\gene{4}{MLF3}}
1919: \put( 195,  50){\gene{1}{GCY1}}
1920: \put(   0,  50){\gene{0}{YOR121C}}
1921: \put( 325,  75){\gene{0}{YPL066W}}
1922: \put( 260,  25){\gene{4}{OPT2}}
1923: \end{picture}
1924: 
1925: \vspace{20pt}
1926: \gene{0}{0} \modulesize Unknown \genesize
1927: 
1928: \vspace{5pt}
1929: \gene{1}{1} \modulesize Galactose induced genes \genesize
1930: 
1931: \vspace{5pt}
1932: \gene{2}{2} \modulesize Hexose transporters (downregulated) \genesize
1933: 
1934: \vspace{5pt}
1935: \gene{3}{3} \modulesize Other, downregulated \genesize
1936: 
1937: \vspace{5pt}
1938: \gene{4}{4} \modulesize Other \genesize
1939: 
1940: \vspace{10pt}
1941: \includegraphics[width=3.2in]{Galactose}
1942: \end{flushleft}
1943: \end{minipage}
1944: }
1945: \caption{The galactose induced module found with PISA. This module turns
1946: on
1947: GAL genes and also, as a weaker effect, represses a number of hexose
1948: transporters.}
1949: \label{GalactoseModule}
1950: \end{figure}
1951: 
1952: 
1953: \begin{figure}
1954: \setlength{\unitlength}{0.6pt}
1955: \fbox{
1956: \begin{minipage}{3.250in}
1957: \begin{flushleft}
1958: \moduletitlesize
1959: Module: Peroxide shock
1960: \modulesize
1961: 
1962: \vspace{10pt}
1963: 
1964: Number of genes: 56
1965: 
1966: Average number of contributing conditions: 25.3795
1967: 
1968: Consistency: 0.836174
1969: 
1970: Best ISA overlap:  at threshold , frequency
1971: \genesize
1972: \vspace{10pt}
1973: %412224201144122444041014204410000433410440110031311420021003230420134
1974: \begin{picture}(300,300)(0,0)
1975: \put( 260, 225){\gene{4}{FLR1}}
1976: \put(  65, 225){\gene{1}{GPX2}}
1977: \put( 130, 225){\gene{2}{YCR102C}}
1978: \put(  65, 125){\gene{2}{AAD3}}
1979: \put(   0, 150){\gene{2}{SFA1}}
1980: \put( 260,  75){\gene{4}{LYS20}}
1981: \put(  65, 175){\gene{2}{AAD4}}
1982: \put(   0, 200){\gene{0}{YDR132C}}
1983: \put( 260, 150){\gene{1}{TRR1}}
1984: \put( 195, 150){\gene{1}{YDR453C}}
1985: \put( 260,  50){\gene{4}{TRS31}}
1986: \put(   0,  50){\gene{4}{RIB3}}
1987: \put(   0,   0){\gene{1}{TTR1}}
1988: \put( 195, 225){\gene{2}{AAD6}}
1989: \put(   0, 175){\gene{2}{YFL057C}}
1990: \put(  65, 100){\gene{4}{YGL114W}}
1991: \put(  65,  25){\gene{0}{YGR011W}}
1992: \put(   0,  25){\gene{4}{KSS1}}
1993: \put(   0,  75){\gene{1}{TRX2}}
1994: \put( 195,  50){\gene{0}{YGR223C}}
1995: \put( 325,  25){\gene{1}{SOD2}}
1996: \put( 260,  25){\gene{2}{OYE2}}
1997: \put( 260, 125){\gene{4}{SDL1}}
1998: \put(  65, 200){\gene{4}{GSH1}}
1999: \put( 195,  75){\gene{0}{YKL070W}}
2000: \put(   0, 225){\gene{0}{YKL071W}}
2001: \put(  65,  75){\gene{4}{CYT2}}
2002: \put( 325, 100){\gene{3}{MRS4}}
2003: \put( 325, 125){\gene{3}{YSR3}}
2004: \put(   0, 125){\gene{1}{CCP1}}
2005: \put( 325, 150){\gene{0}{YKR071C}}
2006: \put( 195, 100){\gene{4}{ECM4}}
2007: \put( 130, 175){\gene{4}{GTT2}}
2008: \put( 325, 225){\gene{0}{YLR108C}}
2009: \put( 130, 150){\gene{1}{FRE1}}
2010: \put( 130, 200){\gene{0}{YLR460C}}
2011: \put( 130,  75){\gene{3}{YAP1}}
2012: \put( 260, 175){\gene{3}{ATR1}}
2013: \put( 325, 200){\gene{1}{YML131W}}
2014: \put( 325,  75){\gene{4}{MMT1}}
2015: \put( 325,  50){\gene{2}{YMR318C}}
2016: \put(  65, 150){\gene{0}{YNL134C}}
2017: \put( 195, 125){\gene{0}{YNL260C}}
2018: \put( 195, 175){\gene{2}{AAD14}}
2019: \put(   0, 100){\gene{1}{YNR074C}}
2020: \put( 130,  25){\gene{0}{YOL029C}}
2021: \put( 130, 125){\gene{0}{YOL150C}}
2022: \put( 130,  50){\gene{3}{GRE2}}
2023: \put( 260, 200){\gene{2}{AAD15}}
2024: \put( 195,  25){\gene{3}{CIN5}}
2025: \put( 325, 175){\gene{0}{YOR225W}}
2026: \put( 195, 200){\gene{4}{ISU2}}
2027: \put( 260, 100){\gene{2}{OYE3}}
2028: \put(  65,  50){\gene{1}{TAH18}}
2029: \put(  65,   0){\gene{3}{ROX1}}
2030: \put( 130, 100){\gene{4}{ISA2}}
2031: \end{picture}
2032: 
2033: \vspace{20pt}
2034: \gene{0}{0} \modulesize Unknown \genesize
2035: 
2036: \vspace{5pt}
2037: \gene{1}{1} \modulesize Peroxidase, superoxide dismutase, reductase
2038: \genesize
2039: 
2040: \vspace{5pt}
2041: \gene{2}{2} \modulesize Dehydrogenase \genesize
2042: 
2043: \vspace{5pt}
2044: \gene{3}{3} \modulesize Other stress related genes \genesize
2045: 
2046: \vspace{5pt}
2047: \gene{4}{4} \modulesize Other \genesize
2048: %\end{figure}
2049: 
2050: \vspace{10pt}
2051: \includegraphics[width=3.2in]{Peroxide}
2052: \end{flushleft}
2053: \end{minipage}
2054: }
2055: \caption{The oxidative stress response module found with PISA. This
2056: module
2057: is significantly more complete than the modules of comparable size found
2058: by ISA.
2059: }
2060: \label{PeroxideShockModule}
2061: \end{figure}
2062: 
2063: 
2064: \begin{figure}
2065: \setlength{\unitlength}{0.6pt}
2066: \fbox{
2067: \begin{minipage}{3.200in}
2068: \begin{flushleft}
2069: \moduletitlesize
2070: Module: Hexose transporters
2071: \modulesize
2072: 
2073: \vspace{10pt}
2074: 
2075: Number of genes: 12\par
2076: Average number of contributing conditions: 33.1968\par
2077: Consistency: 0.78495\par
2078: Best ISA overlap:  at threshold , frequency
2079: \genesize
2080: \vspace{10pt}\par
2081: \begin{picture}(300,50)(0,0)
2082: \put( 260,   0){\gene{3}{MTH1}}
2083: \put( 325,  25){\gene{1}{HXT7}}
2084: \put( 195,  25){\gene{1}{HXT6}}
2085: \put(   0,  25){\gene{1}{HXT3}}
2086: \put( 130,   0){\gene{3}{MIG2}}
2087: \put(  65,  25){\gene{1}{HXT4}}
2088: \put(  65,   0){\gene{1}{HXT1}}
2089: \put( 195,   0){\gene{1}{HXT8}}
2090: \put(   0,   0){\gene{3}{YKR075C}}
2091: \put( 260,  25){\gene{2}{GAL2}}
2092: \put( 130,  25){\gene{1}{HXT2}}
2093: \put( 325,   0){\gene{4}{YPR127W}}
2094: \end{picture}\par
2095: \vspace{20pt}
2096: \gene{1}{1} \modulesize Glucose transporter \genesize\par
2097: \vspace{5pt}
2098: \gene{2}{2} \modulesize Galactose/glucose transporter \genesize\par
2099: \vspace{5pt}
2100: \gene{3}{3} \modulesize Glucose suppression regulator \genesize\par
2101: \vspace{5pt}
2102: \gene{4}{4} \modulesize Other, downregulated \genesize\par
2103: \vspace{10pt}
2104: \includegraphics[width=3.2in]{Hexose}
2105: \end{flushleft}
2106: \end{minipage}
2107: }
2108: \caption{The hexose transporter module found with PISA. In this module
2109: (which is consistently found after the galactose induced module), the
2110: hexose transporter genes are co-regulated with GAL2, the galactose
2111: permease,
2112: whereas they are counter-regulated in the galactose induced module. Many
2113: of the genes that had a score slightly below the threshold are also
2114: glucose suppression related.
2115: }
2116: \label{HexoseModule}
2117: \end{figure}
2118: 
2119: 
2120: 
2121: \begin{figure}
2122: \setlength{\unitlength}{0.6pt}
2123: \fbox{
2124: \begin{minipage}{3.200in}
2125: \begin{flushleft}
2126: \moduletitlesize
2127: Module: Flocculation
2128: \modulesize\par
2129: \vspace{10pt}
2130: Number of genes: 6\par
2131: Average number of contributing conditions: 15.5088\par
2132: Consistency: 0.552039\par
2133: Best ISA overlap:  at threshold , frequency
2134: \genesize
2135: \vspace{10pt}\par
2136: \begin{picture}(300,25)(0,0)
2137: \ignore{
2138:       FLO5  11  63.1642
2139:       FLO9  11  57.3713
2140:    YAL065C  11  52.6247
2141:    YAR062W  11  50.6705
2142:       FLO1   8  39.5147
2143:    YHR213W   6  21.7364
2144: }
2145: \put(  65,   0){\gene{2}{FLO9}}
2146: \put( 130,   0){\gene{2}{YAL065C}}
2147: \put( 260,   0){\gene{1}{FLO1}}
2148: \put( 195,   0){\gene{3}{YAR062W}}
2149: \put(   0,   0){\gene{1}{FLO5}}
2150: \put( 325,   0){\gene{3}{YHR213W}}
2151: \end{picture}
2152: 
2153: \vspace{20pt}
2154: \gene{1}{1} \modulesize Flocculin \genesize
2155: 
2156: \vspace{5pt}
2157: \gene{2}{2} \modulesize Similar to Flo1p \genesize
2158: 
2159: \vspace{5pt}
2160: \gene{3}{3} \modulesize Similar to N-terminus of Flo1p \genesize
2161: 
2162: \vspace{10pt}
2163: \includegraphics[width=3.2in]{Flocculation}
2164: \end{flushleft}
2165: \end{minipage}
2166: }
2167: \caption{The flocculation module found with PISA. 
2168: \comment{There is some support for this in database A.
2169: unknown genes for other modules
2170: Use our modules to guide p-values of TF database.}
2171: }
2172: \label{FlocculationModule}
2173: \end{figure}
2174: 
2175: 
2176: 
2177: \begin{figure}
2178: \setlength{\unitlength}{0.6pt}
2179: \fbox{
2180: \begin{minipage}{3.200in}
2181: \begin{flushleft}
2182: \moduletitlesize
2183: Module: Zinc regulated genes
2184: \modulesize
2185: 
2186: \vspace{10pt}
2187: 
2188: Number of genes: 8
2189: 
2190: Average number of contributing conditions: 29.0683
2191: 
2192: Consistency: 0.638515
2193: 
2194: Best ISA overlap: 0.88 at threshold 4.6, frequency 2
2195: \genesize
2196: \vspace{10pt}
2197: 
2198: \begin{picture}(300,50)(0,0)
2199: \ignore{
2200:        ZRT1  22  264.338
2201:        ZRT3  22  202.015
2202:        ZAP1  22  195.191
2203:        ZRT2  18  128.968
2204:     YOL154W  18  113.449
2205:        INO1  17  109.054
2206:        ADH4  19  108.413
2207:     YNL254C  20  103.622
2208: }
2209: \put(   0,  25){\gene{1}{ZRT1}}
2210: \put(   0,   0){\gene{4}{ADH4}}
2211: \put( 130,  25){\gene{2}{ZAP1}}
2212: \put( 325,  25){\gene{4}{INO1}}
2213: \put(  65,  25){\gene{1}{ZRT3}}
2214: \put( 195,  25){\gene{1}{ZRT2}}
2215: \put(  65,   0){\gene{0}{YNL254C}}
2216: \put( 260,  25){\gene{3}{YOL154W}}
2217: \end{picture}
2218: 
2219: \vspace{20pt}
2220: \gene{0}{0} \modulesize Unknown \genesize
2221: 
2222: \vspace{5pt}
2223: \gene{1}{1} \modulesize Zinc transport/storage \genesize
2224: 
2225: \vspace{5pt}
2226: \gene{2}{2} \modulesize Zinc-responsive transcription factor \genesize
2227: 
2228: \vspace{5pt}
2229: \gene{3}{3} \modulesize Zinc metalloproteinase \genesize
2230: 
2231: \vspace{5pt}
2232: \gene{4}{4} \modulesize Other \genesize
2233: 
2234: \vspace{10pt}
2235: \includegraphics[width=3.2in]{Zinc}
2236: \end{flushleft}
2237: \end{minipage}
2238: }
2239: \caption{The zinc module found with PISA. This module has a high overlap
2240: with
2241: the group of genes bound by ZAP1 in database A (at $p$-value 0.001): The
2242: ZRT1, ZRT2, ZRT3, ZAP1 and YNL254C genes make up 5 of the 6 lowest
2243: $p$-values
2244: (counting each pair of divergently transcribed genes only once), and the
2245: remaining hits from database A (most with $p$-values above $10^{-4}$)
2246: are likely to be mostly false positives. Based on this, it seems very
2247: likely
2248: that YNL254C, if functional, is regulated by and related to zinc. (ADH4
2249: has also been shown to be zinc-regulated elsewhere.)
2250: }
2251: \label{ZincModule}
2252: \end{figure}
2253: 
2254: 
2255: \begin{figure}
2256: \setlength{\unitlength}{0.6pt}
2257: \fbox{
2258: \begin{minipage}{3.200in}
2259: \begin{flushleft}
2260: \moduletitlesize
2261: Module: Arginine regulation
2262: \modulesize
2263: 
2264: \vspace{10pt}
2265: 
2266: Number of genes: 7
2267: 
2268: Average number of contributing conditions: 14.0667
2269: 
2270: Consistency: 0.548283
2271: 
2272: Best ISA overlap: 0.71 at threshold 6.0, frequency 60
2273: \genesize
2274: \vspace{10pt}
2275: 
2276: \begin{picture}(300,50)(0,0)
2277: \put( 325,  25){\gene{1}{ARG5,6}}
2278: \put(  65,  25){\gene{1}{ARG3}}
2279: \put(   0,   0){\gene{2}{CAR2}}
2280: \put( 260,  25){\gene{4}{CTF13}}
2281: \put( 130,  25){\gene{1}{ARG1}}
2282: \put(   0,  25){\gene{1}{ARG8}}
2283: \put( 195,  25){\gene{1}{CPA1}}
2284: \end{picture}
2285: 
2286: \vspace{20pt}
2287: \gene{1}{1} \modulesize Arginine biosynthesis \genesize
2288: 
2289: \vspace{5pt}
2290: \gene{2}{2} \modulesize Arginine degradation, downregulated \genesize
2291: 
2292: \vspace{5pt}
2293: \gene{4}{4} \modulesize Other \genesize
2294: 
2295: \vspace{10pt}
2296: \includegraphics[width=3.2in]{Arginine}
2297: \end{flushleft}
2298: \end{minipage}
2299: }
2300: \caption{The arginine regulated module found with PISA. The module
2301: agrees very well with what is known about regulation of arginine
2302: metabolism
2303: [F. Messenguy and E. Dubois (2000) {\it Food Technol. Biotechnol.} {\bf 38},
2304: 277--285]:
2305: ARG1, ARG3, ARG5,6 and ARG8 are repressed by arginine through the
2306: Arg80/Arg81/Mcm1 complex, while CAR2 (and CAR1, which is the 2nd highest
2307: scoring gene that failed to make the module) is activated by the same
2308: complex.
2309: We also find CPA1, which is claimed to be regulated by arginine at the
2310: translational
2311: level---the mRNA is destabilized by a small peptide in the presence of
2312: arginine. However, database A indicates that ARG1, ARG3, ARG5,6, ARG8
2313: and CPA1 are
2314: all bound by the Arg80/Arg81/Mcm1 complex.
2315: }
2316: \label{ArginineModule}
2317: \end{figure}
2318: 
2319: 
2320: \ignore{
2321: \begin{figure}
2322: \setlength{\unitlength}{0.6pt}
2323: \fbox{
2324: \begin{minipage}{3.200in}
2325: \begin{flushleft}
2326: \moduletitlesize
2327: Module: IMP dehydrogenase
2328: \modulesize
2329: 
2330: \vspace{10pt}
2331: 
2332: Number of genes: 5
2333: 
2334: Average number of contributing conditions: 17.6909
2335: 
2336: Consistency: 0.792111
2337: 
2338: Best ISA overlap:  at threshold , frequency
2339: \genesize
2340: \vspace{10pt}
2341: 
2342: \begin{picture}(300,25)(0,0)
2343: \put(   0,   0){\gene{2}{IMD1}}
2344: \put( 130,   0){\gene{2}{YAR075W}}
2345: \put( 260,   0){\gene{3}{GLN3}}
2346: \put(  65,   0){\gene{1}{IMD2}}
2347: \put( 195,   0){\gene{2}{IMD3}}
2348: \end{picture}
2349: 
2350: \vspace{20pt}
2351: \vspace{5pt}
2352: \gene{1}{1} \modulesize Inosine-5'-monophosphate dehydrogenase \genesize
2353: 
2354: \vspace{5pt}
2355: \gene{2}{2} \modulesize Protein with similarity to Imd2p \genesize
2356: 
2357: \vspace{5pt}
2358: \gene{3}{3} \modulesize Nitrogen regulatory protein, downregulated
2359: \genesize
2360: 
2361: \vspace{10pt}
2362: \includegraphics[width=3.2in]{IMD}
2363: \end{flushleft}
2364: \end{minipage}
2365: }
2366: \caption{The last IMD gene, IMD4, is the highest scoring gene not
2367: included in
2368: the module (but the score is far below that of GLN3).
2369: }
2370: \label{NewModule}
2371: \end{figure}
2372: 
2373: 
2374: \ignore{
2375: \begin{figure}
2376: \setlength{\unitlength}{0.6pt}
2377: \fbox{
2378: \begin{minipage}{3.200in}
2379: \begin{flushleft}
2380: \moduletitlesize
2381: \begin{picture}(300,50)(0,0)
2382: \end{picture}
2383: 
2384: \vspace{20pt}
2385: \gene{0}{0} \modulesize Unknown \genesize
2386: 
2387: \vspace{5pt}
2388: \gene{1}{1} \modulesize Zinc transport/storage \genesize
2389: 
2390: \vspace{5pt}
2391: \gene{2}{2} \modulesize Zinc-responsive transcription factor \genesize
2392: 
2393: \vspace{5pt}
2394: \gene{3}{3} \modulesize Zinc metalloproteinase \genesize
2395: 
2396: \vspace{5pt}
2397: \gene{4}{4} \modulesize Other \genesize
2398: 
2399: \vspace{10pt}
2400: \includegraphics[width=3.2in]{Arginine}
2401: \end{flushleft}
2402: \end{minipage}
2403: }
2404: \caption{
2405: }
2406: \label{NewModule}
2407: \end{figure}
2408: }
2409: }
2410: 
2411: 
2412: 
2413: 
2414: \end{document}
2415: 
2416: 
2417: 
2418: