0709.3719/RF.tex
1: % Template article for preprint document class `elsart'
2: % with harvard style bibliographic references
3: % SP 2001/01/05
4: 
5: \documentclass{elsart}
6: % Use the option doublespacing or reviewcopy to obtain double line spacing
7: %\documentclass[doublespacing]{elsart}
8: \journal{NIM}
9: % the natbib package allows both number and author-year (Harvard)
10: % style referencing;
11: 
12: %\usepackage{natbib}
13: % if you use PostScript figures in your article
14: % use the graphics package for simple commands
15: % \usepackage{graphics}
16: % or use the graphicx package for more complicated commands
17: % \usepackage{graphicx}
18: % or use the epsfig package if you prefer to use the old commands
19: \usepackage{epsfig}
20: 
21: % The amssymb package provides various useful mathematical symbols
22: \usepackage{amssymb}
23: \usepackage{amsmath,amsfonts,verbatim,graphicx,float}
24: \usepackage{subfigure}
25: %\usepackage[english]{babel}
26: %\usepackage[latin1]{inputenc}
27: \usepackage{cite}
28: %\usepackage{subfigure}
29: \pdfoutput=1
30: 
31: \newfont{\tensy}{cmsy10}
32: \newcommand{\chemical}[1]{{$\fontdimen16\tensy=3.0pt\fontdimen17\tensy=3.0pt 
33: \mathrm{#1}$}}
34: 
35: %\usepackage[mediumqspace,squaren,textstyle]{SIunits}
36: \renewcommand{\arraystretch}{1.3}   %% arrays look nicer now
37: 
38: %\newcommand{\na}{\chemical{\mbox{}^{22}Na}}
39: 
40: 
41: \begin{document}
42:  
43: \begin{frontmatter}
44: 
45: % Title, authors and addresses
46: 
47: % use the thanksref command within \title, \author or \address for footnotes;
48: % use the corauthref command within \author for corresponding author footnotes;
49: % use the ead command for the email address,
50: % and the form \ead[url] for the home page:
51: 
52: \title{Implementation of the Random Forest Method for the
53: Imaging Atmospheric Cherenkov Telescope MAGIC}
54: 
55:  \author[a]{J.~Albert}, 
56:  \author[b]{E.~Aliu}, 
57:  \author[c]{H.~Anderhub}, 
58:  \author[d]{P.~Antoranz}, 
59:  \author[b]{A.~Armada}, 
60:  \author[d]{M.~Asensio}, 
61:  \author[e]{C.~Baixeras}, 
62:  \author[d]{J.~A.~Barrio},
63:  \author[f]{H.~Bartko}, 
64:  \author[g]{D.~Bastieri}, 
65:  \author[h]{J.~Becker},   
66:  \author[i]{W.~Bednarek}, 
67:  \author[a]{K.~Berger}, 
68:  \author[g]{C.~Bigongiari}, 
69:  \author[c]{A.~Biland}, 
70:  \author[f,g]{R.~K.~Bock}, 
71:  \author[j]{P.~Bordas},
72:  \author[j]{V.~Bosch-Ramon},
73:  \author[a]{T.~Bretz}, 
74:  \author[c]{I.~Britvitch}, 
75:  \author[d]{M.~Camara}, 
76:  \author[f]{E.~Carmona}, 
77:  \author[k]{A.~Chilingarian}, 
78:  \author[l]{S.~Ciprini}, 
79:  \author[f]{J.~A.~Coarasa}, 
80:  \author[c]{S.~Commichau}, 
81:  \author[d]{J.~L.~Contreras}, 
82:  \author[b]{J.~Cortina}, 
83:  \author[m,v]{M.~T.~Costado},
84:  \author[h]{V.~Curtef}, 
85:  \author[k]{V.~Danielyan}, 
86:  \author[g]{F.~Dazzi}, 
87:  \author[n]{A.~De Angelis}, 
88:  \author[m]{C.~Delgado}, 
89:  \author[d]{R.~de~los~Reyes}, 
90:  \author[n]{B.~De Lotto}, 
91:  \author[b]{E.~Domingo-Santamar\'\i a}, 
92:  \author[a]{D.~Dorner}, 
93:  \author[g]{M.~Doro}, 
94:  \author[b]{M.~Errando}, 
95:  \author[o]{M.~Fagiolini}, 
96:  \author[p]{D.~Ferenc}, 
97:  \author[b]{E.~Fern\'andez}, 
98:  \author[b]{R.~Firpo}, 
99:  \author[b]{J.~Flix}, 
100:  \author[d]{M.~V.~Fonseca}, 
101:  \author[e]{L.~Font}, 
102:  \author[f]{M.~Fuchs},
103:  \author[f]{N.~Galante},  
104:  \author[m,v]{R.~J.~Garc\'{\i}a-L\'opez}, 
105:  \author[f]{M.~Garczarczyk}, 
106:  \author[m]{M.~Gaug}, 
107:  \author[i]{M.~Giller}, 
108:  \author[f]{F.~Goebel}, 
109:  \author[k]{D.~Hakobyan}, 
110:  \author[f]{M.~Hayashida}, 
111:  \author[q]{T.~Hengstebeck\corauthref{cor1}},
112:  \ead{hengsteb@o2online.de}   
113:  \author[m,v]{A.~Herrero}, 
114:  \author[a]{D.~H\"ohne}, 
115:  \author[f]{J.~Hose},
116:  \author[a]{S.~Huber}, 
117:  \author[f]{C.~C.~Hsu}, 
118:  \author[i]{P.~Jacon},  
119:  \author[f]{T.~Jogler},  
120:  \author[f]{R.~Kosyra},
121:  \author[c]{D.~Kranich}, 
122:  \author[a]{R.~Kritzer},
123:  \author[p]{A.~Laille},  
124:  \author[l]{E.~Lindfors}, 
125:  \author[g]{S.~Lombardi},
126:  \author[n]{F.~Longo}, 
127:  \author[b]{J.~L\'opez}, 
128:  \author[d]{M.~L\'opez}, 
129:  \author[c,f]{E.~Lorenz}, 
130:  \author[f]{P.~Majumdar}, 
131:  \author[r]{G.~Maneva}, 
132:  \author[a]{K.~Mannheim}, 
133:  \author[g]{M.~Mariotti}, 
134:  \author[b]{M.~Mart\'\i nez}, 
135:  \author[b]{D.~Mazin},
136:  \author[f]{C.~Merck}, 
137:  \author[o]{M.~Meucci}, 
138:  \author[a]{M.~Meyer}, 
139:  \author[d]{J.~M.~Miranda}, 
140:  \author[f]{R.~Mirzoyan}, 
141:  \author[f]{S.~Mizobuchi}, 
142:  \author[b]{A.~Moralejo},  
143:  \author[d]{D.~Nieto}, 
144:  \author[l]{K.~Nilsson}, 
145:  \author[f]{J.~Ninkovic}, 
146:  \author[b]{E.~O\~na-Wilhelmi},  
147:  \author[f,q]{N.~Otte}, 
148:  \author[d]{I.~Oya}, 
149:  \author[m,x]{M.~Panniello},
150:  \author[o]{R.~Paoletti},   
151:  \author[j]{J.~M.~Paredes},
152:  \author[l]{M.~Pasanen}, 
153:  \author[g]{D.~Pascoli}, 
154:  \author[c]{F.~Pauss}, 
155:  \author[o]{R.~Pegna}, 
156:  \author[n,s]{M.~Persic}, 
157:  \author[g]{L.~Peruzzo}, 
158:  \author[o]{A.~Piccioli}, 
159:  \author[b]{N.~Puchades},  
160:  \author[g]{E.~Prandini}, 
161:  \author[k]{A.~Raymers},  
162:  \author[h]{W.~Rhode},  
163:  \author[j]{M.~Rib\'o},
164:  \author[b]{J.~Rico},  
165:  \author[c]{M.~Rissi}, 
166:  \author[e]{A.~Robert}, 
167:  \author[a]{S.~R\"ugamer}, 
168:  \author[g]{A.~Saggion},
169:  \author[f]{T.~Y.~Saito}, 
170:  \author[e]{A.~S\'anchez}, 
171:  \author[g]{P.~Sartori}, 
172:  \author[g]{V.~Scalzotto}, 
173:  \author[n]{V.~Scapin},
174:  \author[a]{R.~Schmitt}, 
175:  \author[f]{T.~Schweizer}, 
176:  \author[q,f]{M.~Shayduk}, 
177:  \author[f]{K.~Shinozaki}, 
178:  \author[t]{S.~N.~Shore}, 
179:  \author[b]{N.~Sidro}, 
180:  \author[l]{A.~Sillanp\"a\"a}, 
181:  \author[i]{D.~Sobczynska}, 
182:  \author[a]{F.~Spanier}, 
183:  \author[o]{A.~Stamerra}, 
184:  \author[c]{L.~S.~Stark}, 
185:  \author[l]{L.~Takalo}, 
186:  \author[r]{P.~Temnikov}, 
187:  \author[b]{D.~Tescaro}, 
188:  \author[f]{M.~Teshima},   
189:  \author[u]{D.~F.~Torres}, 
190:  \author[o]{N.~Turini}, 
191:  \author[r]{H.~Vankov},
192:  \author[n]{A.~Venturini},
193:  \author[n]{V.~Vitale}, 
194:  \author[f]{R.~M.~Wagner}, 
195:  \author[i]{T.~Wibig}, 
196:  \author[f]{W.~Wittek},
197:  \author[g]{F.~Zandanel},
198:  \author[b]{R.~Zanin},
199:  \author[e]{J.~Zapatero} 
200: 
201: 
202:  \address[a]{Universit\"at W\"urzburg, D-97074 W\"urzburg, Germany}
203:  \address[b]{Institut de F\'\i sica d'Altes Energies, Edifici Cn., E-08193 Bellaterra (Barcelona), Spain}
204:  \address[c]{ETH Zurich, CH-8093 Switzerland}
205:  \address[d]{Universidad Complutense, E-28040 Madrid, Spain}
206:  \address[e]{Universitat Aut\`onoma de Barcelona, E-08193 Bellaterra, Spain}
207:  \address[f]{Max-Planck-Institut f\"ur Physik, D-80805 M\"unchen, Germany}
208:  \address[g]{Universit\`a di Padova and INFN, I-35131 Padova, Italy} 
209:  \address[h]{Universit\"at Dortmund, D-44227 Dortmund, Germany} 
210:  \address[i]{University of \L \'od\'z, PL-90236 Lodz, Poland} 
211:  \address[j]{Universitat de Barcelona, E-08028 Barcelona, Spain}
212:  \address[k]{Yerevan Physics Institute, AM-375036 Yerevan, Armenia}
213:  \address[l]{Tuorla Observatory, FI-21500 Piikki\"o, Finland}
214:  \address[m]{Inst. de Astrofisica de Canarias, E-38200, La Laguna, Tenerife, Spain}
215:  \address[n]{Universit\`a di Udine, and INFN Trieste, I-33100 Udine, Italy}
216:  \address[o]{Universit\`a  di Siena, and INFN Pisa, I-53100 Siena, Italy}
217:  \address[p]{University of California, Davis, CA-95616-8677, USA}
218:  \address[q]{Humboldt-Universit\"at zu Berlin, D-12489 Berlin, Germany} 
219:  \address[r]{Institute for Nuclear Research and Nuclear Energy, BG-1784 Sofia, Bulgaria}
220:  \address[s]{INAF/Osservatorio Astronomico and INFN Trieste, I-34131 Trieste, Italy} 
221:  \address[t]{Universit\`a  di Pisa, and INFN Pisa, I-56126 Pisa, Italy}
222:  \address[u]{ICREA \& Institut de Ci\`encies de l'Espai (CSIC-IEEC), E-08193 Bellaterra, Spain}
223:  \address[v]{Depto. de Astrofisica, Universidad, E-38206, La Laguna, Tenerife, Spain}
224:  \address[x]{deceased}
225: 
226: 
227:  \corauth[cor1]{Corresponding author.}
228: 
229: 
230: 
231: %% abstract %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
232: \begin{abstract}
233: The paper describes an application of the tree classification method Random Forest (RF),
234: as used in the analysis of data from the ground-based gamma telescope MAGIC. 
235: In such telescopes, cosmic $\gamma$-rays are observed and have to be discriminated
236: against a dominating background of hadronic cosmic-ray particles. 
237: We describe the application of RF for this gamma/hadron separation. 
238: The RF method often shows superior performance in comparison with 
239: traditional semi-empirical techniques.
240: Critical issues of the method and its implementation are discussed.
241: An application of the RF method for estimation of a continuous parameter
242: from related variables, rather than discrete classes, is also discussed.
243: \end{abstract}
244: 
245: 
246: \begin{keyword}
247: % keywords here, in the form: keyword \sep keyword
248: discrimination \sep classification \sep decision tree
249: \end{keyword}
250: 
251: \end{frontmatter}
252: 
253: 
254: % main text
255: \section{Introduction}
256: Ground-based gamma-ray astronomy has in recent years proven to be a 
257: source of  spectacular discoveries, 
258: constraining the evolution of the universe and contributing 
259: to the understanding of the origin of cosmic rays. 
260: Observations are based on Imaging Atmospheric Cherenkov Telescopes 
261: (IACTs), which take advantage of
262: the Cherenkov radiation emanating from the electromagnetic showers 
263: that develop during the absorption of gamma-rays 
264: in the atmosphere. The faint Cherenkov light flashes are collected in 
265: a large-diameter mirror, and recorded in a pixelized camera. 
266: 
267: Several IACT systems are in successful
268: operation today, both in the Northern (MAGIC, VERITAS) and Southern 
269: (HESS, CANGAROO) hemisphere; all but MAGIC are implemented as multi-telescope arrays. 
270: Their scientific goals include galactic and extragalactic sources:
271: Supernova remnants, Pulsars, X-ray binaries, Microquasars,
272: Active Galactic Nuclei (blazars or radio galaxies), Starburst galaxies
273: and potentially also Gamma Ray Bursts. Due to their small aperture IACTs can only 
274: perform scans over small areas, and usually concentrate 
275: on sources that have been identified at other wavelengths; however, the number of 
276: known gamma-ray emitters is increasing fast, and they provide essential
277: contributions to the understanding of the non-thermal universe. 
278: 
279: Events seen by an IACT have a very short ($\approx 2$\,ns) duration, and 
280: the shower image is recorded as a compact cluster of pixels
281: in the camera of the IACT. A principal component analysis
282: allows the characteristics of this cluster to be expressed as image parameters,
283: which will present statistically different properties for the
284: (interesting) gamma-rays and the (dominating) hadronic background.
285: IACTs provide raw data with a signal-to-noise ratio much smaller than  $1\%$, 
286: even for bright gamma sources. Establishing powerful methods of hadronic 
287: background rejection thus is a prerequisite for the effective utilization of 
288: observations with the Cherenkov technique. The fact was recognized with the advent of
289: the IACT technique, and has been given ample room in the literature, both for telescope
290: arrays and single telescopes, e.g. \cite{hillas,hillas1,fegan,aharonian,kraw}. 
291: Multivariate methods using global test statistics 
292: (e.g. likelihood ratios or artificial neural networks) are specifically mentioned in 
293: \cite{fegan} and \cite{kraw}.
294: 
295: A case study for and comparison of different advanced classification methods 
296: for a single-dish IACT can be found in \cite{bock}. In the same article the main 
297: features of Cherenkov images measured by gamma-ray telescopes are addressed 
298: and explained, and the image parameters used in the $\gamma$/h separation are defined.
299: 
300: In this paper, largely derived from chapter 5 of \cite{hengst}, we limit ourselves to the 
301: implementation, usage, and functionality of the RF method for the single-dish system
302: MAGIC \cite{lorenz}. In \cite{hengst}, a more detailed discussion of the RF method
303: and comprehensive MC studies are given.
304: The implementation closely follows the method described
305: by L. Breiman \cite{breiman1}. The application 
306: in $\gamma$/h separation is discussed in detail. Recent MAGIC publications
307: (e.g. \cite{albert1, albert2, albert3}) use the RF technique, and \cite{albert4}
308: discusses it in the context of the reference observations of the Crab nebula.
309: A short comparative study with the 
310: established method of cuts in scaled image parameters is given. 
311: We also discuss an application of the RF method in estimating the 
312: gamma energy, a continuous 
313: variable, in terms of the observed image parameters. 
314: In the following chapter \ref{sec_basic} the Random Forest 
315: method will be described in 
316: detail, since existing mathematical treatments show only few practically 
317: useful aspects, if any. 
318: The reader not interested in these details may regard RF as a 
319: black-box tree classification method, and continue 
320: with the results in section \ref{results}. 
321: 
322: \section{Basics of the Random Forest (RF) method}
323: \label{sec_basic}
324: The Random Forest method is based on a collection of decision trees, built
325: up with some elements of random choices.
326: Like many other classification and regression methods, a Random Forest  
327: is constructed on the basis of training samples suitable for the application.
328: For the purpose of $\gamma$/h separation, the training samples contain the two 
329: classes of gammas (usually Monte Carlo (MC) data) 
330: and hadrons (usually OFF data, also ON\footnote{ON and OFF data are telescope 
331: data obtained by pointing at the source or on a nearby, sourceless region of the sky, 
332: respectively} or MC data are possible).
333: In the further discussion, the following definitions will be used:
334: We call the elements of the training sample {\it events}. 
335: Each event is characterized by a vector whose components are {\it image parameters}
336: obtained by analyzing the camera pixels. We use the familiar Hillas parameters \cite{hillas} 
337: and some additional parameters, but also
338: observation- and detector-related parameters, like $\cos(\theta)$, 
339: $\theta$ being the zenith angle of the source. 
340: The space spanned by the event vectors is multi-dimensional. One can consider the 
341: training samples of gammas and hadrons
342: as a single labeled training sample, viz. each event has an integer label 
343: (called {\it hadronness}) indicating if the event belongs to the class of gammas (hadronness 0) 
344: or to the class of hadrons (hadronness 1).
345: 
346: From this sample, a binary decision tree can be constructed, subdividing the parameter 
347: space first in two parts depending on one of the parameters, and subsequently repeating 
348: the process again and again for each part. The best choice of parameter and the
349: criteria for subdividing are discussed below.
350: Using a single tree for classification purposes, however, usually gives mediocre results. 
351: The tree is overoptimized on the training sample, and there is only poor generalization, 
352: i.e.\ new events will be classified rather badly. 
353: This is shown in figure~\ref{fig_pattern1}. Note, however, that even a set of trees 
354: (forest) results in some sparsely populated areas, where the hadronness 
355: necessarily is
356: not well defined, and the probability of misclassification may be substantial.
357: 
358: \begin{figure}[h]
359: \begin{minipage} [c] {0.50\textwidth}
360: \includegraphics[totalheight=4.0cm]{pattern2.jpg}
361: \end{minipage}
362: \begin{minipage} [c] {0.5\textwidth}
363: \includegraphics[totalheight=4.5cm]{pattern1.pdf}
364: \end{minipage}
365: \caption{{\it Left: Illustration of the RF method for a simple 2-dimensional model case. 
366: The black and white points are the observed points in class `gamma' and `hadrons', 
367: respectively. They are distributed according to two different, but overlapping 2-dimensional
368: Gaussians. The result of separation in terms of hadronness is shown in colour. 
369: Right: The result of using a single tree on the same data gives no probability 
370: measure like hadronness, but only y/n answers. Its performance is inadequate.}}
371: \label{fig_pattern1}
372: \end{figure}
373: 
374: There is no pruning (tree simplification by removing some branches considered irrelevant)
375: of the trees in the Random Forest algorithm. Instead, the RF creates
376: a set of largely uncorrelated trees, and combines their results to form a 
377: generalized predictor. Two random elements prior to and within the tree growing process serve to 
378: approximate ideally uncorrelated trees; they are described in the following sections.
379: 
380: \subsection{Bootstrap aggregating (bagging)}
381: \label{subsec_bagging}
382: There is usually a single data sample in each class used for training. 
383: A straightforward solution to obtain independent trees is to 
384: split the training sample into as many non-overlapping subsamples as trees should be grown. 
385: However, there are usually not enough training data available for this approach. This is especially the case if dealing with 
386: air shower data, which are always costly to generate (w.r.t. computer time and storage space).
387: A different way is to produce a bootstrap sample for each tree by sampling $n$ times with replacement from the 
388: original training sample containing $n$ events. This procedure guarantees that the events' image parameter 
389: distributions are statistically identical for all bootstrap samples 
390: (and equal to the image parameter distributions of the 
391: original training sample, since the probability of selecting an event is constantly $1/n$ for the `sampling with replacement 
392: procedure'), while the bootstrap samples do not contain the same events. It may (and will) happen that certain 
393: events are taken more than just once:
394: The probability of not selecting a certain event in a single draw is equal to $(1 - 1/n)$, 
395: which becomes $(1-1/n)^n$ when repeating the 
396: selection process $n$ times. As $\lim_{n\rightarrow\infty} (1 + x/n)^n = e^x$, 
397: the probability of not selecting an
398: event in the bootstrap procedure becomes $e^{-1}\approx1/3$ for large $n$. Thus, in each bootstrap sample there will be on average 
399: a fraction $(1 - 1/e)\approx 2/3$ of the original training events; the rest (also kept in the sample) are copies.
400: 
401: \subsection{Tree growing and random split selection}
402: The tree growing begins with the complete sample contained in a single node, the so called root node, 
403: which is identical to the complete image parameter space. In the following the $\gamma$/h separation is achieved by 
404: splitting (or cutting) each node into two successor nodes using one of the image parameters at a time, with a 
405: cut value optimized to separate the sample into its classes (in our case two: gammas and hadrons). This corresponds to a 
406: successive division of the image parameter space into hypercubes.
407: In order to measure the classification power (separation ability) of an image parameter and to 
408: optimize the cut value, the Gini index is used. The Gini index is a frequently used 
409: measure in dealing with classifiers, originally in economics. Named after the Italian economist 
410: Corrado Gini,
411: it measures the inequality of two distributions, 
412: e.g. gamma acceptance and hadron acceptance as function of a cut in a variable. 
413: It is defined as the ratio between a) the area spanned
414: by the observed cumulative distribution and the hypothetical cumulative distribution 
415: for a non-discriminating variable (uniform distribution, 45-degree line), and b) the 
416: area under this uniform distribution. It is a variable between zero and one;
417: a low Gini coefficient indicates more equal distributions, a 
418: high Gini coefficient shows unequal distribution. 
419:  
420: The choice of the parameter 
421: taken for splitting is randomized (see below for details).
422: The splitting process stops if the node size (events per node) falls below a limit specified by the 
423: user, or if there are only events of one class (only gammas or only hadrons) left in the node, which 
424: therefore need not be split further. 
425: These terminal nodes can also be called elementary hypercubes, they cover the entire image parameter space 
426: without intersections or gaps. To each terminal node the remaining training events assign a 
427: class label $l$ (0 for gammas, 1 for hadrons).
428: For terminal nodes still containing a mixture of events of different classes, 
429: a mean value is calculated for $l$, taking into account the 
430: class populations $N_h$ of hadrons and $N_{\gamma}$ of gammas: $l = N_h / (N_h + N_{\gamma})$.
431: The original program \cite{breiman2} uses a majority vote, and does not calculate mean values.
432: 
433: Before going into more details, the classification process is briefly described:
434: One can take a completely grown tree as starting point 
435: (see figure~\ref{fig_tree}). 
436: \begin{figure}[h]
437: \begin{center}
438: \includegraphics[totalheight=5cm]{decisiontree.pdf}
439: \caption{{\it Sketch of a tree structure for the classification of an event $v$ with
440: components $v_{length}$, $v_{width}$, and $v_{size}$. 
441: One can follow the decision path through the tree, 
442: leading to classification of the event as hadron.}}
443: \label{fig_tree}
444: \end{center}
445: \end{figure}
446: The task is to classify an event
447: characterized by a vector $v$ in the image parameter space. $v$ is fed into the decision tree; 
448: at the first (highest level) node 
449: there is a split in a certain image parameter (e.g. `length'). Depending
450: on the component (image parameter) `length' in $v$, the event $v$ proceeds to the left node 
451: (length $<$ split value) or to the 
452: right node (length $\geq$ split value) at the next lower level.
453: This node again splits in some other (or by chance the same) component, and the process continues. 
454: The result is that $v$ follows a track through the tree determined by the numerical values 
455: of its components, 
456: and the tree nodes' cut values, until it will end up in a terminal node.
457: This terminal node assigns a class label $l$ to $v$, which  can now be denoted 
458: as $l_i(v)$, where $i$ is the tree number.
459: 
460: The vector $v$ will be classified by all trees. Due to the randomization involved, 
461: different trees will often give different results, 
462: hence the name `Random Forest'. From these results, a mean classification is calculated:
463: \begin{equation}
464: h(v) = \frac{\sum_{i=1}^{n_{trees}}l_i(v)}{n_{trees}}
465: \end{equation}
466: This mean classification is called Hadronness, and is used as the only test statistic (split-parameter) 
467: in the $\gamma$/h separation (see figure~\ref{fig_had}).
468: \begin{figure}[h] 
469: \begin{center}
470: \includegraphics[totalheight=7cm]{rfoutput.pdf}
471: \caption{{\it Mean hadronness for two test samples of gammas (left peak, black) and hadrons
472: (right peak, red). Hadronness is the final and only test statistic in $\gamma$/h separation.}}
473: \label{fig_had}
474: \end{center}
475: \end{figure}
476: 
477: 
478: The splitting process is somewhat randomized by a feature called random split selection. The parameter 
479: candidates for a split are chosen randomly from the total number of available parameters. 
480: Among the candidates, the parameter and corresponding cut value to be used for splitting 
481: are chosen by the minimal Gini index. 
482: In the case of two classes, the Gini index $Q_{Gini}$ can be referred to as binomial 
483: variance of the sample 
484: scaled to the interval $[0, 1]$.
485: The Gini index (or GINI coefficient) can be expressed in terms of the node 
486: class populations $N_{\gamma}$, $N_h$ 
487: and the total node population $N$:
488: \begin{equation}
489: Q_{Gini} =  \frac{4}{N}\sigma^2_{binomial} =  4 \frac{N_{\gamma}}{N} \frac{N_h}{N} 
490: = 4 \frac{N_{\gamma}(N-N_{\gamma})}{N^2}   \in [0,1] 
491: \end{equation}
492: $Q_{Gini}$ of a node is zero for the ideal case that only one class is present in the node
493: ($N_{\gamma}=0$ or $N_h=0$). The Gini index of the split is calculated by adding the
494: Gini indices of the two successor nodes (denoted by left and right node) and
495: scaling the result to [0,1]:
496: \begin{equation}
497: Q_{Gini} = 2  \left( \frac{N_{\gamma left}}{N_{left}} \frac{N_{h left}}{N_{left}} + 
498: \frac{N_{\gamma right}}{N_{right}} \frac{N_{h right}}{N_{right}} \right)    \in [0,1] 
499: \end{equation}
500: 
501: Choosing the smallest $Q_{Gini}$ corresponds to minimizing the variance of the 
502: population of gammas and hadrons, and naturally purifies the sample.
503: Minimization of the Gini index provides both the choice of the image parameter
504: and the split value to be used.
505: 
506: More details concerning the Random Forest method can be found in \cite{breiman2}. 
507: The original program 
508: was modified to calculate the mean hadronness instead of a $0$ or $1$ majority
509: vote for a class. Calculating the arithmetic mean by using 
510: weights (e.g. using the Gini index of terminal nodes) 
511: did not further improve the results \cite{bock},\cite{hengst}.
512: 
513: \section{Control of the training process}
514: \label{sec_control}
515: In this chapter we address some specific aspects of RF related to the training process. 
516: Proper training depends on several parameters, steering the growing of trees, 
517: which the user should be aware of. In the following these parameters are described.
518: \begin{itemize}
519: \item 
520: Number of trees:
521: the number of trees must be chosen large enough to ensure the convergence of the error $\sigma$, given by
522: \begin{equation}
523: \sigma(n_{tree}) = \sqrt{\frac{\sum_{i=1}^{n_{sample}}(h_i^{est}(n_{tree}) - h_i^{true})^2}{n_{sample}}}
524: \end{equation}
525: $\sigma(n_{tree})$ is the rms error of the estimated hadronness. $h_i^{est}(n_{tree})$ denotes the estimated 
526: hadronness (which depends 
527: on the number $n_{tree}$ of combined trees) and $h_i^{true}$ is the true hadronness
528: of event $i$ in the sample, which contains $n_{sample}$ events in total.
529: The convergence process is shown in figure~\ref{fig_conv} for the training of 
530: RF on an MC gamma and MC hadron sample. 
531: 
532: \begin{figure}[h]
533: \begin{center}
534: \includegraphics[totalheight=6cm]{noftrees.pdf}
535: \caption{{\it Error (rms, = $\sqrt{\sigma^2}$) of the estimated hadronness as a function of the
536: number of trees used. Also shown is the variance of each single tree.}}
537: \label{fig_conv}
538: \end{center}
539: \end{figure}
540: 
541: Care was taken that the test sample, for which the figure was produced, 
542: is disjoint from the training sample. 
543: When taking events already used in the training process, $\sigma$ would be underestimated.
544: From figure~\ref{fig_conv}, the following practical method can be deduced:
545: One generates a reasonably high number of trees (100 trees is usually sufficient), performs the training process,
546: and then finds decisions for a test sample using a diminishing number of trees, to
547: judge how many trees still give satisfactory results. Trees generated during the training 
548: process are stored successively in a file. For the classification task one can read in the actually needed number of trees.
549: If no test sample is available, one can take $\sigma(n_{tree})$ as calculated from the so-called out-of-bag 
550: data during the training. 
551: The out-of-bag data are the `residue' of the bagging procedure, as explained in the following. In the bagging procedure 
552: (generating of bootstrap samples, see chapter \ref{sec_basic})  there are data for each tree which have 
553: not been used for the tree's  bootstrap sample. Being independent, they can be used as test data for the corresponding tree. 
554: In other words, each event of the original training sample can be used as test data for $\approx 1/3$ of the trees.
555: If one observes a sufficient convergence of $\sigma$  calculated from out-of-bag data after, 
556: say, 150 trees, actually 50 trees are needed.
557: \item
558: Overtraining: During tree growing, the cut values of the parameters are adjusted according to the training sample. 
559: This overtraining is not a major drawback, it affects merely the training sample, which provides these 
560: exact cut values.
561: According to \cite{breiman2} the overtraining (or overoptimization) vanishes in case of an infinite number of trees. 
562: The practical method described above favours a minimal forest, with a number of trees sufficiently large to 
563: ensure a classification error (of a test sample), which is not significantly decreased by adding more trees.
564: Such a forest still shows overtraining: when applying $\gamma$/h separation to the training data, the classes of gammas 
565: and hadrons can usually be well separated by a cut in hadronness = 0.5. In other words, each tree `learned by heart'
566: the training events, and the same is true for the entire forest. 
567: The situation is the same with classical cuts: the cut values are optimized on a certain 
568: observed data set from a gamma source or on Monte Carlo data, and later on applied to the data  
569: to be analyzed, which must not contain the training data.
570: \item
571: Number of trials in random split selection: This concerns the parameters considered
572: for splitting. A good empirical value for their number 
573: is $\sqrt{N}$ where $N$ is the total number of parameters used 
574: in tree growing \cite{breiman2}.
575: \item
576: Node size: this is the minimum size of node at which further splitting stops.
577: For correctly labeled training events $nodesize = 1$ can be used, for
578: partly incorrectly labeled data (e.g. using ON-data as hadrons) $nodesize > 1$ is preferable, 
579: since data are not intended to be split completely. Experience tells that a small number $< 10$ is best.
580: \end{itemize}
581: 
582: \section{Application of RF in $\gamma$/h separation}
583: \subsection{Remarks concerning the training process}
584: In this chapter some features related to the Random Forest method will be briefly addressed. 
585: Some of these remarks are valid also for many other advanced classification methods in need of a training 
586: process, like Neural Networks or linear discriminant analysis.
587: \begin{itemize}
588: \item
589: Training data for Cherenkov telescopes:
590: We have used OFF data and MC gammas (correctly labeled samples) or ON data and MC gammas 
591: (partly wrongly labeled hadron sample). It is usually advisable not to use MC hadrons, 
592: since hadronic showers are 
593: difficult to simulate (unlike gamma showers which have a pure electromagnetic nature), 
594: so that MC hadrons are difficult to match in all details with 
595: real data. In fact, there is no need to use MC hadrons, when OFF or ON data are available.
596: Choosing ON data for training has the advantage of obviating OFF data taking, and of using data
597: taken under identical observational conditions. The
598: Random Forest algorithm is stable enough to deal with a hadron sample containing up to 1\% of gammas,
599: as shown in figure~\ref{fig_contam}, where the training was performed 
600: using OFF data with variable artificial contamination for the hadrons, 
601: and MC data for the gamma sample. 
602: \begin{figure}[h]
603: \begin{center}
604: \includegraphics[totalheight=7cm]{contam.pdf}
605: \caption{{\it Neyman-Pearson or ROC diagrams of hadron training samples with 
606: a contamination of (mislabeled) gamma events. A hadron sample with 1\% gammas
607: introduces a negligible loss in selection efficiency.}}
608: \label{fig_contam}
609: \end{center}
610: \end{figure}
611: In order to simulate ON data, the OFF data were contaminated with MC gammas, i.e. the degree of 
612: contamination was known. For all simulated gamma admixtures the reduction of the
613: separation efficiency
614: becomes visible only in a region of low gamma acceptances, in which it is usually not advisable to 
615: operate (too low gamma efficiency). Depending on the set of image parameters used for training,
616: a generalization of this result may not be possible. 
617: \item
618: Types of parameters:
619: All parameters are treated in the same way, which means that in particular detector-related or observational 
620: parameters like $\cos(\theta)$ ($\theta$ being the zenith angle), $\bar{\sigma}$ (image noise, 
621: averaged over all pixels), or size  (integrated signal of the image),  must be 
622: used with care. The sense of using such parameters is that cuts in other image parameters will depend on them, 
623: but not that they should be used for cuts.
624: Thus, in  general, one can distinguish between parameters to be used for cuts, and 
625: parameters on which the cuts in other parameters may depend. 
626: To circumvent the problem, the training data must be chosen not to permit a classification using these parameters alone 
627: (e.g.\ by using the same (flat) distribution of $\cos(\theta)$ in both training samples). 
628: Splits in these parameters, in training samples prepared this way,
629: cannot directly serve for separating gammas and hadrons.
630: Additional attention must also be paid if e.g.\ the gamma data have discrete $\cos(\theta)$ values for technical reasons 
631: in the Monte Carlo production. In this case the $\cos(\theta)$ values appearing in the hadron sample must be 
632: rounded to the same values (binned), or the Monte Carlo data artificially spread to become continuous.
633: \end{itemize}
634: 
635: \subsection{Comparison with direct cuts in image parameters}
636: \label{results}
637: An extensive comparison of methods applied to Monte Carlo data sets for training and
638: test samples was given in \cite{bock}. One of the methods described there (called 
639: {\it Direct Selection}) was based on using simple AND/OR cuts in the multi-dimensional
640: space of image parameters. The choice of parameters or functions thereof
641: offers many possibilities for tuning.
642: We repeat here a similar comparison, again using Monte Carlo data, using {\it scaled} image 
643: parameters. Like in \cite{bock}, no claim can be made
644: that this result, found in favor of the RF method, 
645: can be generalized to all parameter choices or to real data.
646: Exhaustive comparisons with real data are lengthy, due to the high 
647: dimensionality of the problem,
648: which includes data selection and image cleaning steps even 
649: before image parameters are obtained.
650: Quality comparisons using real data are also influenced by the unavoidable changes in 
651: operating conditions, which are reflected in data corrections whose effects on separation 
652: methods are difficult to evaluate. A comparative study with comprehensive
653: MAGIC data samples is, however, in preparation.
654: 
655: For this comparison we used independent training and test samples, of 15000 events each. 
656: {\it Hadrons} were simulated with the parameters:
657: energy range $200\,\mathrm{GeV}<E<30\,\mathrm{TeV}$; spectral index $a = -2.7$; zenith angle range 
658: $0<\theta<30^\circ$; impact parameter range $0< R<400\,\mathrm{m}$; viewing
659: cone $5^\circ$.
660: The {\it gamma} simulation settings were:
661: energy range $50\,\mathrm{GeV}<E<30\,\mathrm{TeV}$; Crab-like spectral index $a = -2.6$; 
662: zenith angle range $0<\theta<30^\circ$; impact parameter range $0<R< 200\,\mathrm{m}$.
663: Figure~\ref{fig_hill} shows the corresponding 
664: distributions of the image parameters width [deg] and length[deg] as functions of
665: size [phe], for gammas and hadrons. 
666: All data were pre-cut to obtain high-quality training and 
667: test samples, requiring leakage\footnote{this parameter, not defined in \cite{bock},
668: uses an estimate of fractional energy escaping the camera} 
669: $<0.1$, dist $>0.3^\circ$, size $>200$~phe.
670: \begin{figure}[h]
671: \begin{center}
672: \includegraphics[totalheight=7cm]{hillaspar.jpg}
673: \caption{{\it Distribution of the Hillas parameters width (top) and length (bottom) 
674: as function of log(size), 
675: for gammas (left) and hadrons (right), as used in the training samples. 
676: The profiles are shown in red (gammas) and black (hadrons), showing that
677: both parameters are good separators for size values above 200 photoelectrons 
678: (corresponding to about 100 GeV)}}
679: \label{fig_hill}
680: \end{center}
681: \end{figure}
682: Clearly, width and length are good separation parameters, at least for values of size 
683: exceeding 200~phe (photoelectrons), 
684: which corresponds approximately to energies above $100\,\mathrm{GeV}$. 
685: The size dependence of width and length can be dealt with
686: by using scaled parameters:
687: The size range (of MC gamma data) is divided into bins, and 
688: for each bin $i$ mean and variance of the 
689: width distribution ($\bar{w_i}$ and $\sigma^2_{w_i}$)
690: are calculated. The scaled width 
691: $w_{i,scaled}$ for each bin is then obtained by 
692: $w_{i,scaled} = (w_i - \bar{w_i}) / \sigma_{w_i}$.
693:  
694: The same procedure is used for the length parameter. As a result one obtains a normalized 
695: width and length distribution for gammas: they follow a pdf (probability density function) 
696: with mean 0 and variance 1.
697: In these variables, static (size-independent) cuts are used for $\gamma$/h separation. 
698: In order to find optimal cuts, a maximization of the $Q$-value which relates the relative 
699: acceptances of gamma-rays and hadrons
700: ($Q = \epsilon_{\gamma} / \sqrt{\epsilon_h}$) 
701: was performed, using the Metropolis minimization package\footnote{which 
702: includes random perturbations in the search, thus avoiding convergence to local minima}
703: followed by a SIMPLEX minimization. Both packages are part of TMinuit
704: in the ROOT analysis environment \cite{brun}. 
705: 
706: Both the Random Forest and the scaled parameter method used independent data
707: for training and testing. Only the parameters size, dist, width, and length were used.
708: The results are compared in the Neyman-Pearson or ROC (receiver 
709: operating characteristic) diagrams of figure~\ref{fig_comp1}; these diagrams
710: show gamma acceptance as a function of hadron acceptance.
711: \begin{figure}[h]
712: \begin{center}
713: \includegraphics[totalheight=6cm]{effcomp1.pdf}
714: \caption{{\it ROC curves for $\gamma$/h separation in the test sample, by the 
715: RF method (higher curve) and by cuts in scaled parameters, 
716: using the same parameters.}}
717: \label{fig_comp1}
718: \end{center}
719: \end{figure} 
720: In order to obtain for the scaled parameter method more than a single point
721: (that of overall maximum $Q$) in the ROC diagram, a 
722: regularizer $a (\epsilon_h - p)^2$ was introduced (a generalization of 
723: the method used in \cite{bock}).
724: Here $p$ denotes a target acceptance for hadrons, and  $\epsilon_h$ is the freely variable 
725: hadron acceptance, which is obtained from the maximization of $Q$ and different for each $p$. 
726: We used a high scaling number $a = 1000$ to ensure that the optimization will give as a
727: result a set of cuts with  $\epsilon_h$ close to $p$. 
728: 
729: These results are shown as the lower curve in figure~\ref{fig_comp1}. 
730: We should stress again that this comparison can in no way show a general 
731: superiority of the RF method; practical experience shows that for a given 
732: data sample other methods (also including direct selection
733: as in the above example) can, at an effort, be fine-tuned to give results 
734: comparable to the RF method. However, in no case has the RF result been shown inferior,
735: and much less tuning is needed (and possible) with the RF method.
736: More comparisons (including also MAGIC data) can be found in \cite{zimmermann}.
737: 
738: \section{Using a Random Forest estimator for a continuous variable}
739: The RF method also permits the construction of an algorithm for estimating a 
740: continuous quantity rather than a discrete class 
741: membership, as dealt with in previous sections. We have used this method 
742: to estimate non-analytically the particle energy from the
743: measured image parameters. Two main approaches are possible:
744: \begin{itemize}
745: \item Forced division into classes:
746: Class labels are assigned to the training events
747: according to an energy grid. As a result, multiple classes
748: $E_0, E_1, \ldots, E_{n-1}$ are created. 
749: In the RF training process the related class populations are taken into account
750: together with a more general Gini index \cite{breiman1}
751: \begin{equation}
752: p_i = N_i / N
753: \end{equation}
754: \begin{equation}
755: Q_{Gini} = 1 - \sum_{i=0}^{n-1}p_i^2
756: \end{equation}
757: Here $i$ is the class index ($0 \leq i \leq n-1$). As already shown above, the Gini index of
758: a split is evaluated as sum of the two Gini indices obtained after the split, and minimized.
759: After the training procedure, the class populations
760: inside a terminal node are used to calculate the 
761: estimated energy  corresponding to the terminal node:
762: \begin{equation}
763: E_{est} = \frac{\sum_{i=0}^{n-1}E_iN_i} {\sum_{i=0}^{n-1}N_i}
764: \end{equation}
765: In this application of RF each tree returns an estimated energy and the overall mean
766: is calculated as the final estimated energy.
767: 
768: \item
769: A splitting rule based on the continuous quantity:
770: It is possible to completely avoid the use of classes by introducing a splitting rule,
771: which does not rely on class populations.
772: The idea of the Gini index (with its interpretation as binomial variance of the
773: classes) as split rule is a purification of the class populations, i.e. a separation
774: of the classes, in the subsamples after the split process. Similarly, when using the
775: variance in energy as a splitting criterion, the subsamples are purified with respect to
776: their energy distribution.
777: \begin{equation}
778: \sigma^2(E) = \frac {1}{N-1} \sum_{i=1}^{N}(E_i-\bar{E})^2 = 
779: \frac{1}{N-1} \left[ \left( \sum_{i=1}^{N}E_i^2\right) - N\bar{E}^2\right].
780: \end{equation}
781: In analogy to the Gini index of the split, the `variance' of the split is calculated by
782: adding the `subsample energy variances', taking into account the node populations
783: as weights:
784: \begin{equation}
785: \sigma^2(E) = \frac{1}{N_L+N_R}\left(N_L\sigma_L^2(E) + N_R\sigma_R^2(E) \right)
786: \end{equation}
787: \end{itemize}
788: 
789: 
790: We have used both approaches for a set of Monte Carlo data. 
791: With 100 classes for the first (classification) method,
792: it produces results nearly identical to those of the second (regression) approach.
793: The results of this latter RF approximation for energy
794: can be seen from figure~\ref{fig_energy}.
795: The linearity is perfect, and the energy resolution (as defined by
796: the rms error $\sigma_E/E$) comes out 26\% at 100~GeV and
797: 19\% at 1~TeV,
798: very fair values for a single telescope (telescope arrays can reach better resolution).
799: We have not found an analytical parameterization for energy expressed in terms of image 
800: parameters giving a result better than with the RF representation; with extensive tuning,
801: results comparable in quality have been found, though.
802: \begin{figure}[h]
803: %\begin{minipage} [c] {0.5\textwidth}
804: %\includegraphics[totalheight=5cm]{energy1.pdf}
805: %\end{minipage}
806: %\begin{minipage} [c] {0.5\textwidth}
807: %\includegraphics[totalheight=5cm]{energy2.pdf}
808: %\end{minipage}
809: \includegraphics[totalheight=4.5cm]{RFEEst.jpg}
810: \caption{{\it Left: The relation between the RF-estimated energy (horizontal)
811: and initial Monte Carlo energy (vertical axis) is perfectly linear.
812: Right: The rms error $\sigma_E/E$ as function of initial energy.}}
813: \label{fig_energy}
814: \end{figure}
815: 
816: 
817: \section{Conclusions}
818: The Random Forest (RF) method based on multiple decision trees 
819: was extensively tested as an analysis tool in the $\gamma$/h separation 
820: for data obtained with the MAGIC telescope.
821: In this paper we discuss many implementation details and the 
822: parameters a user has to become familiar with.
823: We also compare the performance of RF with the more
824: conventional technique of cuts in scaled image parameters, using MC
825: data. It could be shown that RF in this comparison is superior
826: to the classical method. This comparison does not
827: imply a general superiority of the RF method; practical experience
828: shows that for a given data sample the conventional methods (like
829: dynamical cuts or cuts in scaled image parameters) may be tuned
830: to give results comparable (but not superior) to the RF method.
831: A dedicated comparative
832: study using MAGIC experimental data is still under way. 
833: 
834: The RF method does produce stable results and
835: is robust with respect to input parameters, even if strongly correlated. The method
836: adjusts itself to the available multi-dimensional space, 
837: with a minimum of human intervention:
838: there are only few tunable parameters, which can be chosen according to simple criteria 
839: (number of trees, trials in random split selection and final node size). 
840: This simpler control and tuning can then be seen as a general advantage
841: over conventional methods.
842: Proper training samples, however, are important, as in any advanced
843: method requiring a training process, i.e. 
844: one has to rely on a good Monte Carlo simulation. Using OFF or ON data as hadron 
845: sample limits the MC dependence to the gamma showers, better understood 
846: than hadron showers. 
847: There remains, however, the need to correctly treat 
848: atmospheric conditions under different zenith angles, 
849: and good knowledge of the detector. 
850: 
851: Training and classification are fast: benchmarks using
852: a 1.5~GHz PC (Athlon XP), with training and test samples each containing 10\,000~events, 
853: a total of 10~image parameters used,
854: 100~trees used for classification, each tree completely grown (nodesize=1), 
855: 3~trials in random split selection, give one minute for training and 2~ms/event for classification.
856: A comparable analysis technique like Neural Networks demands substantially 
857: more computer time for training.
858: 
859: 
860: % The Appendices part is started with the command \appendix;
861: % appendix sections are then done as normal sections
862: % \appendix
863: 
864: \section*{Acknowledgement}
865: We thank Jens Zimmermann for fruitful discussions about the RF method
866: and for  comparisons of the RF method with a Neural Net approach.
867: 
868: % Bibliographic references with the natbib package:
869: % Parenthetical: \citep{Bai92} produces (Bailyn 1992).
870: % Textual: \citet{Bai95} produces Bailyn et al. (1995).
871: % An affix and part of a reference:
872: %   \citep[e.g.][Ch. 2]{Bar76}
873: %   produces (e.g. Barnes et al. 1976, Ch. 2).
874: 
875: %\bibliographystyle{elsart-num.bst}
876: %%%%%%%%%\begin{thebibliography}{10}
877: 
878: \begin{thebibliography}{10}
879: \bibitem{hillas}A.M.Hillas: Proceedings of the 19th International 
880: Cosmic Ray Conference, ICRC 1985 La Jolla , 3 (1985) 445
881: \bibitem{hillas1}A.M.Hillas: Space Science Rev. 75 (1996) 17
882: \bibitem{fegan}D.J.Fegan: J.Phys.G, Nucl.Part.Phys. 23 (1997) 1013
883: \bibitem{aharonian}F.Aharonian et al.: Astropart.Phys. 6 (1997) 343
884: \bibitem{kraw}H.Krawczynski et al.: Astropart.Phys. 25 (2006) 380
885: \bibitem{bock}R.K.Bock, A.Chilingarian, M.Gaug, et al., 
886: Nucl. Inst. and Methods A 516 (2004) 511
887: \bibitem{hengst}T.~Hengstebeck, PhD thesis,
888: Mathematisch-Naturwissenschaftliche Fakult\"at I,
889: Humboldt-Universit\"at zu Berlin, M\"arz 2007.
890: Available at URL http://edoc.hu-berlin.de/docviews/abstract.php?id=28015
891: \bibitem{lorenz} E. Lorenz, New Astron. Rev. 48 (2004) 339
892: \bibitem{breiman1}L.~Breiman, J.~H.~Friedman, R.~A.~Olshen, C.~J.~Stone: 
893: Classification and Regression Trees, Wadsworth, 1983
894: \bibitem{albert1} J.Albert et al., Astroph. Journal 664 (2007) L87
895: \bibitem{albert2} J.Albert et al., Astroph. Journal 665 (2007) L51
896: \bibitem{albert3} J.Albert et al., Astroph. Journal 669 (2007) 1143
897: \bibitem{albert4} J.Albert et al., to be published in Astroph. Journal,
898: preprint available at http://de.arxiv.org/abs/0705.3244
899: \bibitem{breiman2}L.Breiman, FORTRAN program Random Forests, Version 3.1, and
900: L.Breiman, Manual On Setting Up, Using, And Understanding Random Forests V3. 1, 
901: both available at http://oz.berkeley.edu/users/breiman
902: \bibitem{brun}R.~Brun, F.~Rademakers, http://root.cern.ch/
903: \bibitem{zimmermann} J.~Zimmermann, PhD thesis, Fakult\"at f\"ur Physik,
904: Ludwig-Maximilians-Universit\"at M\"unchen, Juni 2005.
905: Available at URL http://edoc.mpg.de/274832
906: 
907: 
908: \end{thebibliography}
909: 
910: \end{document}
911: