1: % Template article for preprint document class `elsart'
2: % with harvard style bibliographic references
3: % SP 2001/01/05
4:
5: \documentclass{elsart}
6: % Use the option doublespacing or reviewcopy to obtain double line spacing
7: %\documentclass[doublespacing]{elsart}
8: \journal{NIM}
9: % the natbib package allows both number and author-year (Harvard)
10: % style referencing;
11:
12: %\usepackage{natbib}
13: % if you use PostScript figures in your article
14: % use the graphics package for simple commands
15: % \usepackage{graphics}
16: % or use the graphicx package for more complicated commands
17: % \usepackage{graphicx}
18: % or use the epsfig package if you prefer to use the old commands
19: \usepackage{epsfig}
20:
21: % The amssymb package provides various useful mathematical symbols
22: \usepackage{amssymb}
23: \usepackage{amsmath,amsfonts,verbatim,graphicx,float}
24: \usepackage{subfigure}
25: %\usepackage[english]{babel}
26: %\usepackage[latin1]{inputenc}
27: \usepackage{cite}
28: %\usepackage{subfigure}
29: \pdfoutput=1
30:
31: \newfont{\tensy}{cmsy10}
32: \newcommand{\chemical}[1]{{$\fontdimen16\tensy=3.0pt\fontdimen17\tensy=3.0pt
33: \mathrm{#1}$}}
34:
35: %\usepackage[mediumqspace,squaren,textstyle]{SIunits}
36: \renewcommand{\arraystretch}{1.3} %% arrays look nicer now
37:
38: %\newcommand{\na}{\chemical{\mbox{}^{22}Na}}
39:
40:
41: \begin{document}
42:
43: \begin{frontmatter}
44:
45: % Title, authors and addresses
46:
47: % use the thanksref command within \title, \author or \address for footnotes;
48: % use the corauthref command within \author for corresponding author footnotes;
49: % use the ead command for the email address,
50: % and the form \ead[url] for the home page:
51:
52: \title{Implementation of the Random Forest Method for the
53: Imaging Atmospheric Cherenkov Telescope MAGIC}
54:
55: \author[a]{J.~Albert},
56: \author[b]{E.~Aliu},
57: \author[c]{H.~Anderhub},
58: \author[d]{P.~Antoranz},
59: \author[b]{A.~Armada},
60: \author[d]{M.~Asensio},
61: \author[e]{C.~Baixeras},
62: \author[d]{J.~A.~Barrio},
63: \author[f]{H.~Bartko},
64: \author[g]{D.~Bastieri},
65: \author[h]{J.~Becker},
66: \author[i]{W.~Bednarek},
67: \author[a]{K.~Berger},
68: \author[g]{C.~Bigongiari},
69: \author[c]{A.~Biland},
70: \author[f,g]{R.~K.~Bock},
71: \author[j]{P.~Bordas},
72: \author[j]{V.~Bosch-Ramon},
73: \author[a]{T.~Bretz},
74: \author[c]{I.~Britvitch},
75: \author[d]{M.~Camara},
76: \author[f]{E.~Carmona},
77: \author[k]{A.~Chilingarian},
78: \author[l]{S.~Ciprini},
79: \author[f]{J.~A.~Coarasa},
80: \author[c]{S.~Commichau},
81: \author[d]{J.~L.~Contreras},
82: \author[b]{J.~Cortina},
83: \author[m,v]{M.~T.~Costado},
84: \author[h]{V.~Curtef},
85: \author[k]{V.~Danielyan},
86: \author[g]{F.~Dazzi},
87: \author[n]{A.~De Angelis},
88: \author[m]{C.~Delgado},
89: \author[d]{R.~de~los~Reyes},
90: \author[n]{B.~De Lotto},
91: \author[b]{E.~Domingo-Santamar\'\i a},
92: \author[a]{D.~Dorner},
93: \author[g]{M.~Doro},
94: \author[b]{M.~Errando},
95: \author[o]{M.~Fagiolini},
96: \author[p]{D.~Ferenc},
97: \author[b]{E.~Fern\'andez},
98: \author[b]{R.~Firpo},
99: \author[b]{J.~Flix},
100: \author[d]{M.~V.~Fonseca},
101: \author[e]{L.~Font},
102: \author[f]{M.~Fuchs},
103: \author[f]{N.~Galante},
104: \author[m,v]{R.~J.~Garc\'{\i}a-L\'opez},
105: \author[f]{M.~Garczarczyk},
106: \author[m]{M.~Gaug},
107: \author[i]{M.~Giller},
108: \author[f]{F.~Goebel},
109: \author[k]{D.~Hakobyan},
110: \author[f]{M.~Hayashida},
111: \author[q]{T.~Hengstebeck\corauthref{cor1}},
112: \ead{hengsteb@o2online.de}
113: \author[m,v]{A.~Herrero},
114: \author[a]{D.~H\"ohne},
115: \author[f]{J.~Hose},
116: \author[a]{S.~Huber},
117: \author[f]{C.~C.~Hsu},
118: \author[i]{P.~Jacon},
119: \author[f]{T.~Jogler},
120: \author[f]{R.~Kosyra},
121: \author[c]{D.~Kranich},
122: \author[a]{R.~Kritzer},
123: \author[p]{A.~Laille},
124: \author[l]{E.~Lindfors},
125: \author[g]{S.~Lombardi},
126: \author[n]{F.~Longo},
127: \author[b]{J.~L\'opez},
128: \author[d]{M.~L\'opez},
129: \author[c,f]{E.~Lorenz},
130: \author[f]{P.~Majumdar},
131: \author[r]{G.~Maneva},
132: \author[a]{K.~Mannheim},
133: \author[g]{M.~Mariotti},
134: \author[b]{M.~Mart\'\i nez},
135: \author[b]{D.~Mazin},
136: \author[f]{C.~Merck},
137: \author[o]{M.~Meucci},
138: \author[a]{M.~Meyer},
139: \author[d]{J.~M.~Miranda},
140: \author[f]{R.~Mirzoyan},
141: \author[f]{S.~Mizobuchi},
142: \author[b]{A.~Moralejo},
143: \author[d]{D.~Nieto},
144: \author[l]{K.~Nilsson},
145: \author[f]{J.~Ninkovic},
146: \author[b]{E.~O\~na-Wilhelmi},
147: \author[f,q]{N.~Otte},
148: \author[d]{I.~Oya},
149: \author[m,x]{M.~Panniello},
150: \author[o]{R.~Paoletti},
151: \author[j]{J.~M.~Paredes},
152: \author[l]{M.~Pasanen},
153: \author[g]{D.~Pascoli},
154: \author[c]{F.~Pauss},
155: \author[o]{R.~Pegna},
156: \author[n,s]{M.~Persic},
157: \author[g]{L.~Peruzzo},
158: \author[o]{A.~Piccioli},
159: \author[b]{N.~Puchades},
160: \author[g]{E.~Prandini},
161: \author[k]{A.~Raymers},
162: \author[h]{W.~Rhode},
163: \author[j]{M.~Rib\'o},
164: \author[b]{J.~Rico},
165: \author[c]{M.~Rissi},
166: \author[e]{A.~Robert},
167: \author[a]{S.~R\"ugamer},
168: \author[g]{A.~Saggion},
169: \author[f]{T.~Y.~Saito},
170: \author[e]{A.~S\'anchez},
171: \author[g]{P.~Sartori},
172: \author[g]{V.~Scalzotto},
173: \author[n]{V.~Scapin},
174: \author[a]{R.~Schmitt},
175: \author[f]{T.~Schweizer},
176: \author[q,f]{M.~Shayduk},
177: \author[f]{K.~Shinozaki},
178: \author[t]{S.~N.~Shore},
179: \author[b]{N.~Sidro},
180: \author[l]{A.~Sillanp\"a\"a},
181: \author[i]{D.~Sobczynska},
182: \author[a]{F.~Spanier},
183: \author[o]{A.~Stamerra},
184: \author[c]{L.~S.~Stark},
185: \author[l]{L.~Takalo},
186: \author[r]{P.~Temnikov},
187: \author[b]{D.~Tescaro},
188: \author[f]{M.~Teshima},
189: \author[u]{D.~F.~Torres},
190: \author[o]{N.~Turini},
191: \author[r]{H.~Vankov},
192: \author[n]{A.~Venturini},
193: \author[n]{V.~Vitale},
194: \author[f]{R.~M.~Wagner},
195: \author[i]{T.~Wibig},
196: \author[f]{W.~Wittek},
197: \author[g]{F.~Zandanel},
198: \author[b]{R.~Zanin},
199: \author[e]{J.~Zapatero}
200:
201:
202: \address[a]{Universit\"at W\"urzburg, D-97074 W\"urzburg, Germany}
203: \address[b]{Institut de F\'\i sica d'Altes Energies, Edifici Cn., E-08193 Bellaterra (Barcelona), Spain}
204: \address[c]{ETH Zurich, CH-8093 Switzerland}
205: \address[d]{Universidad Complutense, E-28040 Madrid, Spain}
206: \address[e]{Universitat Aut\`onoma de Barcelona, E-08193 Bellaterra, Spain}
207: \address[f]{Max-Planck-Institut f\"ur Physik, D-80805 M\"unchen, Germany}
208: \address[g]{Universit\`a di Padova and INFN, I-35131 Padova, Italy}
209: \address[h]{Universit\"at Dortmund, D-44227 Dortmund, Germany}
210: \address[i]{University of \L \'od\'z, PL-90236 Lodz, Poland}
211: \address[j]{Universitat de Barcelona, E-08028 Barcelona, Spain}
212: \address[k]{Yerevan Physics Institute, AM-375036 Yerevan, Armenia}
213: \address[l]{Tuorla Observatory, FI-21500 Piikki\"o, Finland}
214: \address[m]{Inst. de Astrofisica de Canarias, E-38200, La Laguna, Tenerife, Spain}
215: \address[n]{Universit\`a di Udine, and INFN Trieste, I-33100 Udine, Italy}
216: \address[o]{Universit\`a di Siena, and INFN Pisa, I-53100 Siena, Italy}
217: \address[p]{University of California, Davis, CA-95616-8677, USA}
218: \address[q]{Humboldt-Universit\"at zu Berlin, D-12489 Berlin, Germany}
219: \address[r]{Institute for Nuclear Research and Nuclear Energy, BG-1784 Sofia, Bulgaria}
220: \address[s]{INAF/Osservatorio Astronomico and INFN Trieste, I-34131 Trieste, Italy}
221: \address[t]{Universit\`a di Pisa, and INFN Pisa, I-56126 Pisa, Italy}
222: \address[u]{ICREA \& Institut de Ci\`encies de l'Espai (CSIC-IEEC), E-08193 Bellaterra, Spain}
223: \address[v]{Depto. de Astrofisica, Universidad, E-38206, La Laguna, Tenerife, Spain}
224: \address[x]{deceased}
225:
226:
227: \corauth[cor1]{Corresponding author.}
228:
229:
230:
231: %% abstract %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
232: \begin{abstract}
233: The paper describes an application of the tree classification method Random Forest (RF),
234: as used in the analysis of data from the ground-based gamma telescope MAGIC.
235: In such telescopes, cosmic $\gamma$-rays are observed and have to be discriminated
236: against a dominating background of hadronic cosmic-ray particles.
237: We describe the application of RF for this gamma/hadron separation.
238: The RF method often shows superior performance in comparison with
239: traditional semi-empirical techniques.
240: Critical issues of the method and its implementation are discussed.
241: An application of the RF method for estimation of a continuous parameter
242: from related variables, rather than discrete classes, is also discussed.
243: \end{abstract}
244:
245:
246: \begin{keyword}
247: % keywords here, in the form: keyword \sep keyword
248: discrimination \sep classification \sep decision tree
249: \end{keyword}
250:
251: \end{frontmatter}
252:
253:
254: % main text
255: \section{Introduction}
256: Ground-based gamma-ray astronomy has in recent years proven to be a
257: source of spectacular discoveries,
258: constraining the evolution of the universe and contributing
259: to the understanding of the origin of cosmic rays.
260: Observations are based on Imaging Atmospheric Cherenkov Telescopes
261: (IACTs), which take advantage of
262: the Cherenkov radiation emanating from the electromagnetic showers
263: that develop during the absorption of gamma-rays
264: in the atmosphere. The faint Cherenkov light flashes are collected in
265: a large-diameter mirror, and recorded in a pixelized camera.
266:
267: Several IACT systems are in successful
268: operation today, both in the Northern (MAGIC, VERITAS) and Southern
269: (HESS, CANGAROO) hemisphere; all but MAGIC are implemented as multi-telescope arrays.
270: Their scientific goals include galactic and extragalactic sources:
271: Supernova remnants, Pulsars, X-ray binaries, Microquasars,
272: Active Galactic Nuclei (blazars or radio galaxies), Starburst galaxies
273: and potentially also Gamma Ray Bursts. Due to their small aperture, IACTs can only
274: perform scans over small areas, and usually concentrate
275: on sources that have been identified at other wavelengths; however, the number of
276: known gamma-ray emitters is increasing fast, and they provide essential
277: contributions to the understanding of the non-thermal universe.
278:
279: Events seen by an IACT have a very short ($\approx 2$\,ns) duration, and
280: the shower image is recorded as a compact cluster of pixels
281: in the camera of the IACT. A principal component analysis
282: allows the characteristics of this cluster to be expressed in terms of image parameters,
283: which will present statistically different properties for the
284: (interesting) gamma-rays and the (dominating) hadronic background.
285: IACTs provide raw data with a signal-to-noise ratio much smaller than $1\%$,
286: even for bright gamma sources. Establishing powerful methods of hadronic
287: background rejection thus is a prerequisite for the effective utilization of
288: observations with the Cherenkov technique. The fact was recognized with the advent of
289: the IACT technique, and has been given ample room in the literature, both for telescope
290: arrays and single telescopes, e.g. \cite{hillas,hillas1,fegan,aharonian,kraw}.
291: Multivariate methods using global test statistics
292: (e.g. likelihood ratios or artificial neural networks) are specifically mentioned in
293: \cite{fegan} and \cite{kraw}.
294:
295: A case study for and comparison of different advanced classification methods
296: for a single-dish IACT can be found in \cite{bock}. In the same article the main
297: features of Cherenkov images measured by gamma-ray telescopes are addressed
298: and explained, and the image parameters used in the $\gamma$/h separation are defined.
299:
300: In this paper, largely derived from chapter 5 of \cite{hengst}, we limit ourselves to the
301: implementation, usage, and functionality of the RF method for the single-dish system
302: MAGIC \cite{lorenz}. In \cite{hengst}, a more detailed discussion of the RF method
303: and comprehensive MC studies are given.
304: The implementation closely follows the method described
305: by L. Breiman \cite{breiman1}. The application
306: in $\gamma$/h separation is discussed in detail. Recent MAGIC publications
307: (e.g. \cite{albert1, albert2, albert3}) use the RF technique, and \cite{albert4}
308: discusses it in the context of the reference observations of the Crab nebula.
309: A short comparative study with the
310: established method of cuts in scaled image parameters is given.
311: We also discuss an application of the RF method in estimating the
312: gamma energy, a continuous
313: variable, in terms of the observed image parameters.
314: In the following section~\ref{sec_basic} the Random Forest
315: method will be described in
316: detail, since existing mathematical treatments show only few practically
317: useful aspects, if any.
318: The reader not interested in these details may regard RF as a
319: black-box tree classification method, and continue
320: with the results in section \ref{results}.
321:
322: \section{Basics of the Random Forest (RF) method}
323: \label{sec_basic}
324: The Random Forest method is based on a collection of decision trees, built
325: up with some elements of random choices.
326: Like many other classification and regression methods, a Random Forest
327: is constructed on the basis of training samples suitable for the application.
328: For the purpose of $\gamma$/h separation, the training samples contain the two
329: classes of gammas (usually Monte Carlo (MC) data)
330: and hadrons (usually OFF data, also ON\footnote{ON and OFF data are telescope
331: data obtained by pointing at the source or on a nearby, sourceless region of the sky,
332: respectively} or MC data are possible).
333: In the further discussion, the following definitions will be used:
334: We call the elements of the training sample {\it events}.
335: Each event is characterized by a vector whose components are {\it image parameters}
336: obtained by analyzing the camera pixels. We use the familiar Hillas parameters \cite{hillas}
337: and some additional parameters, but also
338: observation- and detector-related parameters, like $\cos(\theta)$,
339: $\theta$ being the zenith angle of the source.
340: The space spanned by the event vectors is multi-dimensional. One can consider the
341: training samples of gammas and hadrons
342: as a single labeled training sample, viz. each event has an integer label
343: (called {\it hadronness}) indicating if the event belongs to the class of gammas (hadronness 0)
344: or to the class of hadrons (hadronness 1).
345:
346: From this sample, a binary decision tree can be constructed, subdividing the parameter
347: space first in two parts depending on one of the parameters, and subsequently repeating
348: the process again and again for each part. The best choice of parameter and the
349: criteria for subdividing are discussed below.
350: Using a single tree for classification purposes, however, usually gives mediocre results.
351: The tree is overoptimized on the training sample, and there is only poor generalization
352: viz. new events will be classified rather badly.
353: This is shown in figure~\ref{fig_pattern1}. Note, however, that even a set of trees
354: (forest) results in some sparsely populated areas, where the hadronness
355: necessarily is
356: not well defined, and the probability of misclassification may be substantial.
357:
358: \begin{figure}[h]
359: \begin{minipage} [c] {0.50\textwidth}
360: \includegraphics[totalheight=4.0cm]{pattern2.jpg}
361: \end{minipage}
362: \begin{minipage} [c] {0.5\textwidth}
363: \includegraphics[totalheight=4.5cm]{pattern1.pdf}
364: \end{minipage}
365: \caption{{\it Left: Illustration of the RF method for a simple 2-dimensional model case.
366: The black and white points are the observed points in class `gamma' and `hadrons',
367: respectively. They are distributed according to two different, but overlapping 2-dimensional
368: Gaussians. The result of separation in terms of hadronness is shown in colour.
369: Right: The result of using a single tree on the same data gives no probability
370: measure like hadronness, but only y/n answers. Its performance is inadequate.}}
371: \label{fig_pattern1}
372: \end{figure}
373:
374: There is no pruning (tree simplification by removing some branches considered irrelevant)
375: of the trees in the Random Forest algorithm. Instead, the RF creates
376: a set of largely uncorrelated trees, and combines their results to form a
377: generalized predictor. Two random elements prior to and within the tree growing process serve to
378: approximate ideally uncorrelated trees; they are described in the following sections.
379:
380: \subsection{Bootstrap aggregating (bagging)}
381: \label{subsec_bagging}
382: There is usually a single data sample in each class used for training.
383: A straightforward solution to obtain independent trees is to
384: split the training sample into as many non-overlapping subsamples as trees should be grown.
385: However, there are usually not enough training data available for this approach. This is especially the case if dealing with
386: air shower data, which are always costly to generate (w.r.t. computer time and storage space).
387: A different way is to produce a bootstrap sample for each tree by sampling n times with replacement from the
388: original training sample containing n events. This procedure guarantees that the events' image parameter
389: distributions are statistically identical for all bootstrap samples
390: (and equal to the image parameter distributions of the
391: original training sample, since the probability of selecting an event is constantly $1/n$ for the `sampling with replacement
392: procedure'), while the bootstrap samples do not contain the same events. It may (and will) happen that certain
393: events are taken more than just once:
394: The probability of not selecting a certain event in a single draw is equal to $(1 - 1/n)$,
395: which becomes $(1-1/n)^n$ when repeating the
396: selection process $n$ times. As $\lim_{n\rightarrow\infty} (1 + x/n)^n = e^x$,
397: the probability of not selecting an
398: event in the bootstrap procedure becomes $e^{-1}\approx1/3$. Thus, each bootstrap sample will contain on average
399: a fraction $(1 - 1/e) \approx 2/3$ of the original training events; the remaining entries (also kept in the sample) are copies.
400:
401: \subsection{Tree growing and random split selection}
402: The tree growing begins with the complete sample contained in a single node, the so called root node,
403: which is identical to the complete image parameter space. In the following the $\gamma$/h separation is achieved by
404: splitting (or cutting) each node into two successor nodes using one of the image parameters at a time, with a
405: cut value optimized to separate the sample into its classes (in our case two: gammas and hadrons). This corresponds to a
406: successive division of the image parameter space into hypercubes.
407: In order to measure the classification power (separation ability) of an image parameter and to
408: optimize the cut value, the Gini index is used. The Gini index is a frequently used
409: measure in dealing with classifiers, originally in economics. Named after the Italian economist
410: Corrado Gini,
411: it measures the inequality of two distributions,
412: e.g. gamma acceptance and hadron acceptance as function of a cut in a variable.
413: It is defined as the ratio between a) the area spanned
414: by the observed cumulative distribution and the hypothetical cumulative distribution
415: for a non-discriminating variable (uniform distribution, 45-degree line), and b) the
416: area under this uniform distribution. It is a variable between zero and one;
417: a low Gini coefficient indicates more equal distributions, a
418: high Gini coefficient shows unequal distribution.
419:
420: The choice of the parameter
421: taken for splitting is randomized (see below for details).
422: The splitting process stops if the node size (events per node) falls below a limit specified by the
423: user, or if there are only events of one class (only gammas or only hadrons) left in the node, which
424: therefore need not be split further.
425: These terminal nodes can also be called elementary hypercubes; they cover the entire image parameter space
426: without intersections or gaps. To each terminal node the remaining training events assign a
427: class label $l$ (0 for gammas, 1 for hadrons).
428: For terminal nodes still containing a mixture of events of different classes,
429: a mean value is calculated for $l$, taking into account the
430: class populations $N_h$ of hadrons and $N_{\gamma}$ of gammas: $l = N_h / (N_h + N_{\gamma})$.
431: The original program \cite{breiman2} uses a majority vote, and does not calculate mean values.
432:
433: Before going into more details, the classification process is briefly described:
434: One can take a completely grown tree as starting point
435: (see figure~\ref{fig_tree}).
436: \begin{figure}[h]
437: \begin{center}
438: \includegraphics[totalheight=5cm]{decisiontree.pdf}
439: \caption{{\it Sketch of a tree structure for the classification of an event $v$ with
440: components $v_{length}$, $v_{width}$, and $v_{size}$.
441: One can follow the decision path through the tree,
442: leading to classification of the event as hadron.}}
443: \label{fig_tree}
444: \end{center}
445: \end{figure}
446: The task is to classify an event
447: characterized by a vector $v$ in the image parameter space. $v$ is fed into the decision tree;
448: at the first (highest level) node
449: there is a split in a certain image parameter (e.g. `length'). Depending
450: on the component (image parameter) `length' in $v$, the event $v$ proceeds to the left node
451: (length $<$ split value) or to the
452: right node (length $\geq$ split value) at the next lower level.
453: This node again splits in some other (or by chance the same) component, and the process continues.
454: The result is that $v$ follows a track through the tree determined by the numerical values
455: of its components,
456: and the tree nodes' cut values, until it will end up in a terminal node.
457: This terminal node assigns a class label $l$ to $v$, which can now be denoted
458: as $l_i(v)$, where $i$ is the tree number.
459:
460: The vector $v$ will be classified by all trees. Due to the randomization involved,
461: different trees will often give different results,
462: hence the name `Random Forest'. From these results, a mean classification is calculated:
463: \begin{equation}
464: h(v) = \frac{\sum_{i=1}^{n_{trees}}l_i(v)}{n_{trees}}
465: \end{equation}
466: This mean classification is called Hadronness, and is used as the only test statistic (split-parameter)
467: in the $\gamma$/h separation (see figure~\ref{fig_had}).
468: \begin{figure}[h]
469: \begin{center}
470: \includegraphics[totalheight=7cm]{rfoutput.pdf}
471: \caption{{\it Mean hadronness for two test samples of gammas (left peak, black) and hadrons
472: (right peak, red). Hadronness is the final and only test statistic in $\gamma$/h separation.}}
473: \label{fig_had}
474: \end{center}
475: \end{figure}
476:
477:
478: The splitting process is somewhat randomized by a feature called random split selection. The parameter
479: candidates for a split are chosen randomly from the total number of available parameters.
480: Among the candidates, the parameter and corresponding cut value to be used for splitting
481: are chosen by the minimal Gini index.
482: In the case of two classes, the Gini index $Q_{Gini}$ can be referred to as binomial
483: variance of the sample
484: scaled to the interval $[0, 1]$.
485: The Gini index (or GINI coefficient) can be expressed in terms of the node
486: class populations $N_{\gamma}$, $N_h$
487: and the total node population $N$:
488: \begin{equation}
489: Q_{Gini} = \frac{4}{N}\sigma^2_{binomial} = 4 \frac{N_{\gamma}}{N} \frac{N_h}{N}
490: = 4 \frac{N_{\gamma}(N-N_{\gamma})}{N^2} \in [0,1]
491: \end{equation}
492: $Q_{Gini}$ of a node is zero for the ideal case that only one class is present in the node
493: ($N_{\gamma}=0$ or $N_h=0$). The Gini index of the split is calculated by adding the
494: Gini indices of the two successor nodes (denoted by left and right node) and
495: scaling the result to [0,1]:
496: \begin{equation}
497: Q_{Gini} = 2 \left( \frac{N_{\gamma left}}{N_{left}} \frac{N_{h left}}{N_{left}} +
498: \frac{N_{\gamma right}}{N_{right}} \frac{N_{h right}}{N_{right}} \right) \in [0,1]
499: \end{equation}
500:
501: Choosing the smallest $Q_{Gini}$ corresponds to minimizing the variance of the
502: population of gammas and hadrons, and naturally purifies the sample.
503: Minimization of the Gini index provides both the choice of the image parameter
504: and the split value to be used.
505:
506: More details concerning the Random Forest method can be found in \cite{breiman2}.
507: The original program
508: was modified to calculate the mean hadronness instead of a $0$ or $1$ majority
509: vote for a class. Calculating the arithmetic mean by using
510: weights (e.g. using the Gini index of terminal nodes)
511: did not further improve the results \cite{bock},\cite{hengst}.
512:
513: \section{Control of the training process}
514: \label{sec_control}
515: In this section we address some specific aspects of RF related to the training process.
516: Proper training depends on several parameters, steering the growing of trees,
517: which the user should be aware of. In the following these parameters are described.
518: \begin{itemize}
519: \item
520: Number of trees:
521: the number of trees must be chosen large enough to ensure the convergence of the error $\sigma$, given by
522: \begin{equation}
523: \sigma(n_{tree}) = \sqrt{\frac{\sum_{i=1}^{n_{sample}}(h_i^{est}(n_{tree}) - h_i^{true})^2}{n_{sample}}}
524: \end{equation}
525: $\sigma(n_{tree})$ is the rms error of the estimated hadronness. $h_i^{est}(n_{tree})$ denotes the estimated
526: hadronness (which depends
527: on the number $n_{tree}$ of combined trees) and $h_i^{true}$ is the true hadronness
528: of event $i$ in the sample, which contains $n_{sample}$ events in total.
529: The convergence process is shown in figure~\ref{fig_conv} for the training of
530: RF on an MC gamma and MC hadron sample.
531:
532: \begin{figure}[h]
533: \begin{center}
534: \includegraphics[totalheight=6cm]{noftrees.pdf}
535: \caption{{\it Error (rms, = $\sqrt{\sigma^2}$) of the estimated hadronness as function of the
536: number of trees used. Also shown is the variance of each single tree.}}
537: \label{fig_conv}
538: \end{center}
539: \end{figure}
540:
541: Care was taken that the test sample, for which the figure was produced,
542: is disjoint from the training sample.
543: When taking events already used in the training process, $\sigma$ would be underestimated.
544: From figure~\ref{fig_conv}, the following practical method can be deduced:
545: One generates a reasonably high number of trees (100 trees is usually sufficient), performs the training process,
546: and then finds decisions for a test sample using a diminishing number of trees, to
547: judge how many trees still give satisfactory results. Trees generated during the training
548: process are stored successively in a file. For the classification task one can read in the actually needed number of trees.
549: If no test sample is available, one can take $\sigma(n_{tree})$ as calculated from the so-called out-of-bag
550: data during the training.
551: The out-of-bag data are the `residue' of the bagging procedure, as explained in the following. In the bagging procedure
552: (generating of bootstrap samples, see section~\ref{subsec_bagging}) there are data for each tree which have
553: not been used for the tree's bootstrap sample. Being independent, they can be used as test data for the corresponding tree.
554: In other words, each event of the original training sample can be used as test data for $\approx 1/3$ of the trees.
555: If one observes a sufficient convergence of $\sigma$ calculated from out-of-bag data after,
556: say, 150 trees, actually only about 50 trees are needed, since each event was tested with only about one third of the trees.
557: \item
558: Overtraining: During tree growing, the cut values of the parameters are adjusted according to the training sample.
559: This overtraining is not a major drawback, it affects merely the training sample, which provides these
560: exact cut values.
561: According to \cite{breiman2} the overtraining (or overoptimization) vanishes in case of an infinite number of trees.
562: The practical method described above favours a minimal forest, with a number of trees sufficiently large to
563: ensure a classification error (of a test sample), which is not significantly decreased by adding more trees.
564: Such a forest still shows overtraining: when applying $\gamma$/h separation to the training data, the classes of gammas
565: and hadrons can usually be well separated by a cut in hadronness = 0.5. In other words, each tree `learned by heart'
566: the training events, and the same is true for the entire forest.
567: The situation is the same with classical cuts: the cut values are optimized on a certain
568: observed data set from a gamma source or on Monte Carlo data, and later on applied to the data
569: to be analyzed, which must not contain the training data.
570: \item
571: Number of trials in random split selection: This concerns the parameters considered
572: for splitting. A good empirical value for their number
573: is $\sqrt{N}$ where $N$ is the total number of parameters used
574: in tree growing \cite{breiman2}.
575: \item
576: Node size: this is the minimum size of node at which further splitting stops.
577: For correctly labeled training events $\mathit{nodesize} = 1$ can be used, for
578: partly incorrectly labeled data (e.g. using ON-data as hadrons) $\mathit{nodesize} > 1$ is preferable,
579: since data are not intended to be split completely. Experience tells that a small number $< 10$ is best.
580: \end{itemize}
581:
582: \section{Application of RF in $\gamma$/h separation}
583: \subsection{Remarks concerning the training process}
584: In this section some features related to the Random Forest method will be briefly addressed.
585: Some of these remarks are valid also for many other advanced classification methods in need of a training
586: process, like Neural Networks or linear discriminant analysis.
587: \begin{itemize}
588: \item
589: Training data for Cherenkov telescopes:
590: We have used OFF data and MC gammas (correctly labeled samples) or ON data and MC gammas
591: (partly wrongly labeled hadron sample). It is usually advisable not to use MC hadrons,
592: since hadronic showers are
593: difficult to simulate (unlike gamma showers which have a pure electromagnetic nature),
594: so that MC hadrons are difficult to match in all details with
595: real data. In fact, there is no need to use MC hadrons, when OFF or ON data are available.
596: Choosing ON data for training has the advantage of obviating OFF data taking, and of using data
597: taken under identical observational conditions. The
598: Random Forest algorithm is stable enough to deal with a hadron sample containing up to 1\% of gammas,
599: as shown in figure~\ref{fig_contam}, where the training was performed
600: using OFF data with variable artificial contamination for the hadrons,
601: and MC data for the gamma sample.
602: \begin{figure}[h]
603: \begin{center}
604: \includegraphics[totalheight=7cm]{contam.pdf}
605: \caption{{\it Neyman-Pearson or ROC diagrams of hadron training samples with
606: a contamination of (mislabeled) gamma events. A hadron sample with 1\% gammas
607: introduces a negligible loss in selection efficiency.}}
608: \label{fig_contam}
609: \end{center}
610: \end{figure}
In order to simulate ON data, the OFF data were contaminated with MC gammas, i.e.\ the degree of
contamination was known. For all simulated gamma admixtures the reduction of the
separation efficiency
becomes visible only in a region of low gamma acceptances, in which it is usually not advisable to
operate (too low gamma efficiency). Depending on the set of image parameters used for training,
a generalization of this result may not be possible.
617: \item
618: Types of parameters:
All parameters are treated in the same way, which means that in particular detector-related or observational
parameters like $\cos(\theta)$ ($\theta$ being the zenith angle), $\bar{\sigma}$ (image noise,
averaged over all pixels), or size (integrated signal of the image), must be
622: used with care. The sense of using such parameters is that cuts in other image parameters will depend on them,
623: but not that they should be used for cuts.
624: Thus, in general, one can distinguish between parameters to be used for cuts, and
625: parameters on which the cuts in other parameters may depend.
To circumvent the problem, the training data must be chosen not to permit a classification using these parameters alone
(e.g.\ by using the same (flat) distribution of $\cos(\theta)$ in both training samples).
Splits in these parameters, in training samples prepared this way,
cannot directly serve for separating gammas and hadrons.
Additional attention must also be paid if e.g.\ the gamma data have discrete $\cos(\theta)$ values for technical reasons
in the Monte Carlo production. In this case the $\cos(\theta)$ values appearing in the hadron sample must be
rounded to the same values (binned), or the Monte Carlo data artificially spread to become continuous.
633: \end{itemize}
634:
635: \subsection{Comparison with direct cuts in image parameters}
636: \label{results}
637: An extensive comparison of methods applied to Monte Carlo data sets for training and
638: test samples was given in \cite{bock}. One of the methods described there (called
639: {\it Direct Selection}) was based on using simple AND/OR cuts in the multi-dimensional
640: space of image parameters. The choice of parameters or functions thereof
641: offers many possibilities for tuning.
642: We repeat here a similar comparison, again using Monte Carlo data, using {\it scaled} image
643: parameters. Like in \cite{bock}, no claim can be made
644: that this result, found in favor of the RF method,
645: can be generalized to all parameter choices or to real data.
646: Exhaustive comparisons with real data are lengthy, due to the high
647: dimensionality of the problem,
648: which includes data selection and image cleaning steps even
649: before image parameters are obtained.
Quality comparisons using real data are also influenced by the unavoidable changes in
operation conditions, which are reflected in data corrections whose effect on separation
methods is difficult to evaluate. A comparative study with comprehensive
MAGIC data samples is, however, in preparation.
654:
655: For this comparison we used independent training and test samples, of 15000 events each.
{\it Hadrons} were simulated with the parameters:
energy range $200\,\mathrm{GeV}<E<30\,\mathrm{TeV}$; spectral index $a = -2.7$; zenith angle range
$0<\theta<30^\circ$; impact parameter range $0< R<400\,\mathrm{m}$; viewing
cone $5^\circ$.
The {\it gamma} simulation settings were:
energy range $50\,\mathrm{GeV}<E<30\,\mathrm{TeV}$; spectral index Crab-like $a = -2.6$;
zenith angle range $0<\theta<30^\circ$; impact parameter range $0<R< 200\,\mathrm{m}$.
Figure~\ref{fig_hill} shows the corresponding
distributions of the image parameters width [deg] and length [deg] as functions of
size [phe], for gammas and hadrons.
All data were pre-cut to obtain high-quality training and
test samples, requiring leakage\footnote{this parameter, not defined in \cite{bock},
uses an estimate of fractional energy escaping the camera}
$<0.1$, dist$>0.3^\circ$, size $>200$~phe.
670: \begin{figure}[h]
671: \begin{center}
672: \includegraphics[totalheight=7cm]{hillaspar.jpg}
673: \caption{{\it Distribution of the Hillas parameters width (top) and length (bottom)
674: as function of log(size),
675: for gammas (left) and hadrons (right), as used in the training samples.
676: The profiles are shown in red (gammas) and black (hadrons), showing that
677: both parameters are good separators for size values above 200 photoelectrons
678: (corresponding to about 100 GeV)}}
679: \label{fig_hill}
680: \end{center}
681: \end{figure}
Clearly, width and length are good separation parameters, at least for values of size
exceeding 200 phe (photoelectrons),
which corresponds approximately to energies above $100\,\mathrm{GeV}$.
685: The size dependence of width and length can be dealt with
686: by using scaled parameters:
687: The size range (of MC gamma data) is divided into bins, and
688: for each bin $i$ mean and variance of the
689: width distribution ($\bar{w_i}$ and $\sigma^2_{w_i}$)
690: are calculated. The scaled width
691: $w_{i,scaled}$ for each bin is then obtained by
692: $w_{i,scaled} = (w_i - \bar{w_i}) / \sigma_{w_i}$.
693:
694: The same procedure is used for the length parameter. As a result one obtains a normalized
695: width and length distribution for gammas: they follow a pdf (probability density function)
696: with mean 0 and variance 1.
697: In these variables, static (size-independent) cuts are used for $\gamma$/h separation.
In order to find optimal cuts, a maximization of the $Q$-value, which relates the relative
acceptances of gamma-rays and hadrons
($Q = \epsilon_{\gamma} / \sqrt{\epsilon_h}$),
was performed, using the Metropolis minimization package\footnote{which
includes random perturbations in the search, thus avoiding getting trapped in local minima}
followed by a SIMPLEX minimization. Both packages are part of TMinuit
in the ROOT analysis environment \cite{brun}.
705:
706: Both the Random Forest and the scaled parameter method used independent data
707: for training and testing. Only the parameters size, dist, width, and length were used.
The results are compared in the Neyman-Pearson or ROC (Receiver
Operating Characteristic) diagrams of figure~\ref{fig_comp1}; these diagrams
show gamma acceptance as a function of hadron acceptance.
711: \begin{figure}[h]
712: \begin{center}
713: \includegraphics[totalheight=6cm]{effcomp1.pdf}
714: \caption{{\it ROC curves for $\gamma$/h separation in the test sample, by the
715: RF method (higher curve) and by cuts in scaled parameters,
716: using the same parameters.}}
717: \label{fig_comp1}
718: \end{center}
719: \end{figure}
720: In order to obtain for the scaled parameter method more than a single point
721: (that of overall maximum $Q$) in the ROC diagram, a
722: regularizer $a (\epsilon_h - p)^2$ was introduced (a generalization of
723: the method used in \cite{bock}).
724: Here $p$ denotes a target acceptance for hadrons, and $\epsilon_h$ is the freely variable
725: hadron acceptance, which is obtained from the maximization of $Q$ and different for each $p$.
726: We used a high scaling number $a = 1000$ to ensure that the optimization will give as a
727: result a set of cuts with $\epsilon_h$ close to $p$.
728:
729: These results are shown as the lower curve in figure~\ref{fig_comp1}.
730: We should stress again that this comparison can in no way show a general
731: superiority of the RF method; practical experience shows that for a given
732: data sample other methods (also including direct selection
733: as in the above example) can, at an effort, be fine-tuned to give results
734: comparable to the RF method. However, in no case has the RF result been shown inferior,
735: and much less tuning is needed (and possible) with the RF method.
736: More comparisons (including also MAGIC data) can be found in \cite{zimmermann}.
737:
738: \section{Using a Random Forest estimator for a continuous variable}
The RF method also permits the construction of an algorithm for estimating a
continuous quantity rather than a discrete class
membership, as dealt with in the previous sections. We have used this method
to estimate non-analytically the particle energy from the
measured image parameters. Two main approaches are possible:
744: \begin{itemize}
745: \item Forced division into classes:
746: Class labels are assigned to the training events
747: according to an energy grid. As a result, multiple classes
$E_0, E_1, \ldots, E_{n-1}$ are created.
749: In the RF training process the related class populations are taken into account
750: together with a more general Gini index \cite{breiman1}
751: \begin{equation}
752: p_i = N_i / N
753: \end{equation}
754: \begin{equation}
755: Q_{Gini} = 1 - \sum_{i=0}^{n-1}p_i^2
756: \end{equation}
757: Here $i$ is the class index ($0 \leq i \leq n-1$). As already shown above, the Gini index of
758: a split is evaluated as sum of the two Gini indices obtained after the split, and minimized.
759: After the training procedure, the class populations
760: inside a terminal node are used to calculate the
761: estimated energy corresponding to the terminal node:
762: \begin{equation}
763: E_{est} = \frac{\sum_{i=0}^{n-1}E_iN_i} {\sum_{i=0}^{n-1}N_i}
764: \end{equation}
765: In this application of RF each tree returns an estimated energy and the overall mean
766: is calculated as the final estimated energy.
767:
768: \item
769: A splitting rule based on the continuous quantity:
770: It is possible to completely avoid the use of classes by introducing a splitting rule,
771: which does not rely on class populations.
772: The idea of the Gini index (with its interpretation as binomial variance of the
773: classes) as split rule is a purification of the class populations, i.e. a separation
774: of the classes, in the subsamples after the split process. Similarly, when using the
775: variance in energy as a splitting criterion, the subsamples are purified with respect to
776: their energy distribution.
777: \begin{equation}
\sigma^2(E) = \frac {1}{N-1} \sum_{i=1}^{N}(E_i-\bar{E})^2 =
\frac{1}{N-1} \left[ \left( \sum_{i=1}^{N}E_i^2\right) - N\bar{E}^2\right].
780: \end{equation}
In analogy to the Gini index of the split, the `variance' of the split is calculated by
adding the `subsample energy variances', taking into account the node populations
as weights:
784: \begin{equation}
785: \sigma^2(E) = \frac{1}{N_L+N_R}\left(N_L\sigma_L^2(E) + N_R\sigma_R^2(E) \right)
786: \end{equation}
787: \end{itemize}
788:
789:
790: We have used both approaches for a set of Monte Carlo data.
791: With 100 classes for the first (classification) method,
792: it produces results nearly identical to those of the second (regression) approach.
793: The results of this latter RF approximation for energy
794: can be seen from figure~\ref{fig_energy}.
795: The linearity is perfect, and the energy resolution (as defined by
796: the rms error $\sigma_E/E$) comes out 26\% at 100~GeV and
797: 19\% at 1~TeV,
798: very fair values for a single telescope (telescope arrays can reach better resolution).
799: We have not found an analytical parameterization for energy expressed in terms of image
800: parameters giving a result better than with the RF representation; with extensive tuning,
801: results comparable in quality have been found, though.
802: \begin{figure}[h]
803: %\begin{minipage} [c] {0.5\textwidth}
804: %\includegraphics[totalheight=5cm]{energy1.pdf}
805: %\end{minipage}
806: %\begin{minipage} [c] {0.5\textwidth}
807: %\includegraphics[totalheight=5cm]{energy2.pdf}
808: %\end{minipage}
809: \includegraphics[totalheight=4.5cm]{RFEEst.jpg}
810: \caption{{\it Left: The relation between the RF-estimated energy (horizontal)
811: and initial Monte Carlo energy (vertical axis) is perfectly linear.
812: Right: The rms error $\sigma_E/E$ as function of initial energy.}}
813: \label{fig_energy}
814: \end{figure}
815:
816:
817: \section{Conclusions}
818: The Random Forest (RF) method based on multiple decision trees
819: was extensively tested as an analysis tool in the $\gamma$/h separation
820: for data obtained with the MAGIC telescope.
821: In this paper we discuss many implementation details and the
822: parameters a user has to become familiar with.
823: We also compare the performance of RF with the more
824: conventional technique of cuts in scaled image parameters, using MC
825: data. It could be shown that RF in this comparison is superior
826: to the classical method. This comparison does not
827: imply a general superiority of the RF method; practical experience
828: shows that for a given data sample the conventional methods (like
829: dynamical cuts or cuts in scaled image parameters) may be tuned
830: to give results comparable (but not superior) to the RF method.
831: A dedicated comparative
832: study using MAGIC experimental data is still under way.
833:
834: The RF method does produce stable results and
835: is robust with respect to input parameters, even if strongly correlated. The method
836: adjusts itself to the available multi-dimensional space,
837: with a minimum of human intervention:
838: there are only few tunable parameters, which can be chosen according to simple criteria
839: (number of trees, trials in random split selection and final node size).
840: This simpler control and tuning can then be seen as a general advantage
841: over conventional methods.
842: Proper training samples, however, are important, as in any advanced
843: method requiring a training process, i.e.
844: one has to rely on a good Monte Carlo simulation. Using OFF or ON data as hadron
845: sample limits the MC dependence to the gamma showers, better understood
846: than hadron showers.
847: There remains, however, the need to correctly treat
848: atmospheric conditions under different zenith angles,
849: and good knowledge of the detector.
850:
851: Training and classification are fast: benchmarks using
a 1.5~GHz PC (Athlon XP), with training and test samples each containing 10\,000~events,
853: a total of 10~image parameters used,
854: 100~trees used for classification, each tree completely grown (nodesize=1),
855: 3~trials in random split selection, give one minute for training and 2~ms/event for classification.
856: A comparable analysis technique like Neural Networks demands substantially
857: more computer time for training.
858:
859:
860: % The Appendices part is started with the command \appendix;
861: % appendix sections are then done as normal sections
862: % \appendix
863:
864: \section*{Acknowledgement}
865: We thank Jens Zimmermann for fruitful discussions about the RF method
866: and for comparisons of the RF method with a Neural Net approach.
867:
868: % Bibliographic references with the natbib package:
869: % Parenthetical: \citep{Bai92} produces (Bailyn 1992).
870: % Textual: \citet{Bai95} produces Bailyn et al. (1995).
871: % An affix and part of a reference:
872: % \citep[e.g.][Ch. 2]{Bar76}
873: % produces (e.g. Barnes et al. 1976, Ch. 2).
874:
875: %\bibliographystyle{elsart-num.bst}
876: %%%%%%%%%\begin{thebibliography}{10}
877:
878: \begin{thebibliography}{10}
879: \bibitem{hillas}A.M.Hillas: Proceedings of the 19th International
880: Cosmic Ray Conference, ICRC 1985 La Jolla , 3 (1985) 445
881: \bibitem{hillas1}A.M.Hillas: Space Science Rev. 75 (1996) 17
882: \bibitem{fegan}D.J.Fegan: J.Phys.G, Nucl.Part.Phys. 23 (1997) 1013
883: \bibitem{aharonian}F.Aharonian et al.: Astropart.Phys. 6 (1997) 343
884: \bibitem{kraw}H.Krawczynski et al.: Astropart.Phys. 25 (2006) 380
885: \bibitem{bock}R.K.Bock, A.Chilingarian, M.Gaug, et al.,
886: Nucl. Inst. and Methods A 516 (2004) 511
887: \bibitem{hengst}T.~Hengstebeck, PhD thesis,
888: Mathematisch-Naturwissenschaftliche Fakult\"at I,
889: Humboldt-Universit\"at zu Berlin, M\"arz 2007.
890: Available at URL http://edoc.hu-berlin.de/docviews/abstract.php?id=28015
891: \bibitem{lorenz} E. Lorenz, New Astron. Rev. 48 (2004) 339
\bibitem{breiman1}L.~Breiman, J.~H.~Friedman, R.~A.~Olshen, C.~J.~Stone:
Classification and Regression Trees, Wadsworth, 1983
894: \bibitem{albert1} J.Albert et al., Astroph. Journal 664 (2007) L87
895: \bibitem{albert2} J.Albert et al., Astroph. Journal 665 (2007) L51
896: \bibitem{albert3} J.Albert et al., Astroph. Journal 669 (2007) 1143
897: \bibitem{albert4} J.Albert et al., to be published in Astroph. Journal,
898: preprint available at http://de.arxiv.org/abs/0705.3244
899: \bibitem{breiman2}L.Breiman, FORTRAN program Random Forests, Version 3.1, and
900: L.Breiman, Manual On Setting Up, Using, And Understanding Random Forests V3. 1,
901: both available at http://oz.berkeley.edu/users/breiman
902: \bibitem{brun}R.~Brun, F.~Rademakers, http://root.cern.ch/
903: \bibitem{zimmermann} J.~Zimmermann, PhD thesis, Fakult\"at f\"ur Physik,
904: Ludwig-Maximilians-Universit\"at M\"unchen, Juni 2005.
905: Available at URL http://edoc.mpg.de/274832
906:
907:
908: \end{thebibliography}
909:
910: \end{document}
911: