0709:0709.3719/RF.tex

1: % Template article for preprint document class `elsart'

2: % with harvard style bibliographic references

3: % SP 2001/01/05

4:

5: \documentclass{elsart}

6: % Use the option doublespacing or reviewcopy to obtain double line spacing

7: %\documentclass[doublespacing]{elsart}

8: \journal{NIM}

9: % the natbib package allows both number and author-year (Harvard)

10: % style referencing;

11:

12: %\usepackage{natbib}

13: % if you use PostScript figures in your article

14: % use the graphics package for simple commands

15: % \usepackage{graphics}

16: % or use the graphicx package for more complicated commands

17: % \usepackage{graphicx}

18: % or use the epsfig package if you prefer to use the old commands

19: \usepackage{epsfig}

20:

21: % The amssymb package provides various useful mathematical symbols

22: \usepackage{amssymb}

23: \usepackage{amsmath,amsfonts,verbatim,graphicx,float}

24: \usepackage{subfigure}

25: %\usepackage[english]{babel}

26: %\usepackage[latin1]{inputenc}

27: \usepackage{cite}

28: %\usepackage{subfigure}

29: \pdfoutput=1

30:

31: \newfont{\tensy}{cmsy10}

32: \newcommand{\chemical}[1]{{$\fontdimen16\tensy=3.0pt\fontdimen17\tensy=3.0pt

33: \mathrm{#1}$}}

34:

35: %\usepackage[mediumqspace,squaren,textstyle]{SIunits}

36: \renewcommand{\arraystretch}{1.3}   %% arrays look nicer now

37:

38: %\newcommand{\na}{\chemical{\mbox{}^{22}Na}}

39:

40:

41: \begin{document}

42:

43: \begin{frontmatter}

44:

45: % Title, authors and addresses

46:

47: % use the thanksref command within \title, \author or \address for footnotes;

48: % use the corauthref command within \author for corresponding author footnotes;

49: % use the ead command for the email address,

50: % and the form \ead[url] for the home page:

51:

52: \title{Implementation of the Random Forest Method for the

53: Imaging Atmospheric Cherenkov Telescope MAGIC}

54:

55:  \author[a]{J.~Albert},

56:  \author[b]{E.~Aliu},

57:  \author[c]{H.~Anderhub},

58:  \author[d]{P.~Antoranz},

59:  \author[b]{A.~Armada},

60:  \author[d]{M.~Asensio},

61:  \author[e]{C.~Baixeras},

62:  \author[d]{J.~A.~Barrio},

63:  \author[f]{H.~Bartko},

64:  \author[g]{D.~Bastieri},

65:  \author[h]{J.~Becker},

66:  \author[i]{W.~Bednarek},

67:  \author[a]{K.~Berger},

68:  \author[g]{C.~Bigongiari},

69:  \author[c]{A.~Biland},

70:  \author[f,g]{R.~K.~Bock},

71:  \author[j]{P.~Bordas},

72:  \author[j]{V.~Bosch-Ramon},

73:  \author[a]{T.~Bretz},

74:  \author[c]{I.~Britvitch},

75:  \author[d]{M.~Camara},

76:  \author[f]{E.~Carmona},

77:  \author[k]{A.~Chilingarian},

78:  \author[l]{S.~Ciprini},

79:  \author[f]{J.~A.~Coarasa},

80:  \author[c]{S.~Commichau},

81:  \author[d]{J.~L.~Contreras},

82:  \author[b]{J.~Cortina},

83:  \author[m,v]{M.~T.~Costado},

84:  \author[h]{V.~Curtef},

85:  \author[k]{V.~Danielyan},

86:  \author[g]{F.~Dazzi},

87:  \author[n]{A.~De Angelis},

88:  \author[m]{C.~Delgado},

89:  \author[d]{R.~de~los~Reyes},

90:  \author[n]{B.~De Lotto},

91:  \author[b]{E.~Domingo-Santamar\'\i a},

92:  \author[a]{D.~Dorner},

93:  \author[g]{M.~Doro},

94:  \author[b]{M.~Errando},

95:  \author[o]{M.~Fagiolini},

96:  \author[p]{D.~Ferenc},

97:  \author[b]{E.~Fern\'andez},

98:  \author[b]{R.~Firpo},

99:  \author[b]{J.~Flix},

100:  \author[d]{M.~V.~Fonseca},

101:  \author[e]{L.~Font},

102:  \author[f]{M.~Fuchs},

103:  \author[f]{N.~Galante},

104:  \author[m,v]{R.~J.~Garc\'{\i}a-L\'opez},

105:  \author[f]{M.~Garczarczyk},

106:  \author[m]{M.~Gaug},

107:  \author[i]{M.~Giller},

108:  \author[f]{F.~Goebel},

109:  \author[k]{D.~Hakobyan},

110:  \author[f]{M.~Hayashida},

111:  \author[q]{T.~Hengstebeck\corauthref{cor1}},

112:  \ead{hengsteb@o2online.de}

113:  \author[m,v]{A.~Herrero},

114:  \author[a]{D.~H\"ohne},

115:  \author[f]{J.~Hose},

116:  \author[a]{S.~Huber},

117:  \author[f]{C.~C.~Hsu},

118:  \author[i]{P.~Jacon},

119:  \author[f]{T.~Jogler},

120:  \author[f]{R.~Kosyra},

121:  \author[c]{D.~Kranich},

122:  \author[a]{R.~Kritzer},

123:  \author[p]{A.~Laille},

124:  \author[l]{E.~Lindfors},

125:  \author[g]{S.~Lombardi},

126:  \author[n]{F.~Longo},

127:  \author[b]{J.~L\'opez},

128:  \author[d]{M.~L\'opez},

129:  \author[c,f]{E.~Lorenz},

130:  \author[f]{P.~Majumdar},

131:  \author[r]{G.~Maneva},

132:  \author[a]{K.~Mannheim},

133:  \author[g]{M.~Mariotti},

134:  \author[b]{M.~Mart\'\i nez},

135:  \author[b]{D.~Mazin},

136:  \author[f]{C.~Merck},

137:  \author[o]{M.~Meucci},

138:  \author[a]{M.~Meyer},

139:  \author[d]{J.~M.~Miranda},

140:  \author[f]{R.~Mirzoyan},

141:  \author[f]{S.~Mizobuchi},

142:  \author[b]{A.~Moralejo},

143:  \author[d]{D.~Nieto},

144:  \author[l]{K.~Nilsson},

145:  \author[f]{J.~Ninkovic},

146:  \author[b]{E.~O\~na-Wilhelmi},

147:  \author[f,q]{N.~Otte},

148:  \author[d]{I.~Oya},

149:  \author[m,x]{M.~Panniello},

150:  \author[o]{R.~Paoletti},

151:  \author[j]{J.~M.~Paredes},

152:  \author[l]{M.~Pasanen},

153:  \author[g]{D.~Pascoli},

154:  \author[c]{F.~Pauss},

155:  \author[o]{R.~Pegna},

156:  \author[n,s]{M.~Persic},

157:  \author[g]{L.~Peruzzo},

158:  \author[o]{A.~Piccioli},

159:  \author[b]{N.~Puchades},

160:  \author[g]{E.~Prandini},

161:  \author[k]{A.~Raymers},

162:  \author[h]{W.~Rhode},

163:  \author[j]{M.~Rib\'o},

164:  \author[b]{J.~Rico},

165:  \author[c]{M.~Rissi},

166:  \author[e]{A.~Robert},

167:  \author[a]{S.~R\"ugamer},

168:  \author[g]{A.~Saggion},

169:  \author[f]{T.~Y.~Saito},

170:  \author[e]{A.~S\'anchez},

171:  \author[g]{P.~Sartori},

172:  \author[g]{V.~Scalzotto},

173:  \author[n]{V.~Scapin},

174:  \author[a]{R.~Schmitt},

175:  \author[f]{T.~Schweizer},

176:  \author[q,f]{M.~Shayduk},

177:  \author[f]{K.~Shinozaki},

178:  \author[t]{S.~N.~Shore},

179:  \author[b]{N.~Sidro},

180:  \author[l]{A.~Sillanp\"a\"a},

181:  \author[i]{D.~Sobczynska},

182:  \author[a]{F.~Spanier},

183:  \author[o]{A.~Stamerra},

184:  \author[c]{L.~S.~Stark},

185:  \author[l]{L.~Takalo},

186:  \author[r]{P.~Temnikov},

187:  \author[b]{D.~Tescaro},

188:  \author[f]{M.~Teshima},

189:  \author[u]{D.~F.~Torres},

190:  \author[o]{N.~Turini},

191:  \author[r]{H.~Vankov},

192:  \author[n]{A.~Venturini},

193:  \author[n]{V.~Vitale},

194:  \author[f]{R.~M.~Wagner},

195:  \author[i]{T.~Wibig},

196:  \author[f]{W.~Wittek},

197:  \author[g]{F.~Zandanel},

198:  \author[b]{R.~Zanin},

199:  \author[e]{J.~Zapatero}

200:

201:

202:  \address[a]{Universit\"at W\"urzburg, D-97074 W\"urzburg, Germany}

203:  \address[b]{Institut de F\'\i sica d'Altes Energies, Edifici Cn., E-08193 Bellaterra (Barcelona), Spain}

204:  \address[c]{ETH Zurich, CH-8093 Switzerland}

205:  \address[d]{Universidad Complutense, E-28040 Madrid, Spain}

206:  \address[e]{Universitat Aut\`onoma de Barcelona, E-08193 Bellaterra, Spain}

207:  \address[f]{Max-Planck-Institut f\"ur Physik, D-80805 M\"unchen, Germany}

208:  \address[g]{Universit\`a di Padova and INFN, I-35131 Padova, Italy}

209:  \address[h]{Universit\"at Dortmund, D-44227 Dortmund, Germany}

210:  \address[i]{University of \L \'od\'z, PL-90236 Lodz, Poland}

211:  \address[j]{Universitat de Barcelona, E-08028 Barcelona, Spain}

212:  \address[k]{Yerevan Physics Institute, AM-375036 Yerevan, Armenia}

213:  \address[l]{Tuorla Observatory, FI-21500 Piikki\"o, Finland}

214:  \address[m]{Inst. de Astrofisica de Canarias, E-38200, La Laguna, Tenerife, Spain}

215:  \address[n]{Universit\`a di Udine, and INFN Trieste, I-33100 Udine, Italy}

216:  \address[o]{Universit\`a  di Siena, and INFN Pisa, I-53100 Siena, Italy}

217:  \address[p]{University of California, Davis, CA-95616-8677, USA}

218:  \address[q]{Humboldt-Universit\"at zu Berlin, D-12489 Berlin, Germany}

219:  \address[r]{Institute for Nuclear Research and Nuclear Energy, BG-1784 Sofia, Bulgaria}

220:  \address[s]{INAF/Osservatorio Astronomico and INFN Trieste, I-34131 Trieste, Italy}

221:  \address[t]{Universit\`a  di Pisa, and INFN Pisa, I-56126 Pisa, Italy}

222:  \address[u]{ICREA \& Institut de Ci\`encies de l'Espai (CSIC-IEEC), E-08193 Bellaterra, Spain}

223:  \address[v]{Depto. de Astrofisica, Universidad, E-38206, La Laguna, Tenerife, Spain}

224:  \address[x]{deceased}

225:

226:

227:  \corauth[cor1]{Corresponding author.}

228:

229:

230:

231: %% abstract %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

232: \begin{abstract}

233: The paper describes an application of the tree classification method Random Forest (RF),

234: as used in the analysis of data from the ground-based gamma telescope MAGIC.

235: In such telescopes, cosmic $\gamma$-rays are observed and have to be discriminated

236: against a dominating background of hadronic cosmic-ray particles.

237: We describe the application of RF for this gamma/hadron separation.

238: The RF method often shows superior performance in comparison with

239: traditional semi-empirical techniques.

240: Critical issues of the method and its implementation are discussed.

241: An application of the RF method for estimation of a continuous parameter

242: from related variables, rather than discrete classes, is also discussed.

243: \end{abstract}

244:

245:

246: \begin{keyword}

247: % keywords here, in the form: keyword \sep keyword

248: discrimination \sep classification \sep decision tree

249: \end{keyword}

250:

251: \end{frontmatter}

252:

253:

254: % main text

255: \section{Introduction}

256: Ground-based gamma-ray astronomy has in recent years proven to be a

257: source of  spectacular discoveries,

258: constraining the evolution of the universe and contributing

259: to the understanding of the origin of cosmic rays.

260: Observations are based on Imaging Atmospheric Cherenkov Telescopes

261: (IACTs), which take advantage of

262: the Cherenkov radiation emanating from the electromagnetic showers

263: that develop during the absorption of gamma-rays

264: in the atmosphere. The faint Cherenkov light flashes are collected in

265: a large-diameter mirror, and recorded in a pixelized camera.

266:

267: Several IACT systems are in successful

268: operation today, both in the Northern (MAGIC, VERITAS) and Southern

269: (HESS, CANGAROO) hemisphere; all but MAGIC are implemented as multi-telescope arrays.

270: Their scientific goals include galactic and extragalactic sources:

271: Supernova remnants, Pulsars, X-ray binaries, Microquasars,

272: Active Galactic Nuclei (blazars or radio galaxies), Starburst galaxies

273: and potentially also Gamma Ray Bursts. Due to their small aperture IACTs can only

274: perform scans over small areas, and usually concentrate

275: on sources that have been identified at other wavelengths; however, the number of

276: known gamma-ray emitters is increasing fast, and they provide essential

277: contributions to the understanding of the non-thermal universe.

278:

279: Events seen by an IACT have a very short ($\approx 2$\,ns) duration, and

280: the shower image is recorded as a compact cluster of pixels

281: in the camera of the IACT. A principal component analysis

282: permits to express the characteristics of this cluster in image parameters,

283: which will present statistically different properties for the

284: (interesting) gamma-rays and the (dominating) hadronic background.

285: IACTs provide raw data with a signal-to-noise ratio much smaller than  $1\%$,

286: even for bright gamma sources. Establishing powerful methods of hadronic

287: background rejection thus is a prerequisite for the effective utilization of

288: observations with the Cherenkov technique. The fact was recognized with the advent of

289: the IACT technique, and has been given ample room in the literature, both for telescope

290: arrays and single telescopes, e.g. \cite{hillas,hillas1,fegan,aharonian,kraw}.

291: Multivariate methods using global test statistics

292: (e.g. likelihood ratios or artificial neural networks) are specifically mentioned in

293: \cite{fegan} and \cite{kraw}.

294:

295: A case study for and comparison of different advanced classification methods

296: for a single-dish IACT can be found in \cite{bock}. In the same article the main

297: features of Cherenkov images measured by gamma-ray telescopes are addressed

298: and explained, and the image parameters used in the $\gamma$/h separation are defined.

299:

300: In this paper, largely derived from chapter 5 of \cite{hengst}, we limit ourselves to the

301: implementation, usage, and functionality of the RF method for the single-dish system

302: MAGIC \cite{lorenz}. In \cite{hengst}, a more detailed discussion of the RF method

303: and comprehensive MC studies are given.

304: The implementation closely follows the method described

305: by L. Breiman \cite{breiman1}. The application

306: in $\gamma$/h separation is discussed in detail. Recent MAGIC publications

307: (e.g. \cite{albert1, albert2, albert3}) use the RF technique, and \cite{albert4}

308: discusses it in the context of the reference observations of the Crab nebula.

309: A short comparative study with the

310: established method of cuts in scaled image parameters is given.

311: We also discuss an application of the RF method in estimating the

312: gamma energy, a continuous

313: variable, in terms of the observed image parameters.

314: In the following section~\ref{sec_basic} the Random Forest

315: method will be described in

316: detail, since existing mathematical treatments show only few practically

317: useful aspects, if any.

318: The reader not interested in these details may regard RF as a

319: black-box tree classification method, and continue

320: with the results in section \ref{results}.

321:

322: \section{Basics of the Random Forest (RF) method}

323: \label{sec_basic}

324: The Random Forest method is based on a collection of decision trees, built

325: up with some elements of random choices.

326: Like many other classification and regression methods, a Random Forest

327: is constructed on the basis of training samples suitable for the application.

328: For the purpose of $\gamma$/h separation, the training samples contain the two

329: classes of gammas (usually Monte Carlo (MC) data)

330: and hadrons (usually OFF data, also ON\footnote{ON and OFF data are telescope

331: data obtained by pointing at the source or on a nearby, sourceless region of the sky,

332: respectively} or MC data are possible).

333: In the further discussion, the following definitions will be used:

334: We call the elements of the training sample {\it events}.

335: Each event is characterized by a vector whose components are {\it image parameters}

336: obtained by analyzing the camera pixels. We use the familiar Hillas parameters \cite{hillas}

337: and some additional parameters, but also

338: observation- and detector-related parameters, like $\cos(\theta)$,

339: $\theta$ being the zenith angle of the source.

340: The space spanned by the event vectors is multi-dimensional. One can consider the

341: training samples of gammas and hadrons

342: as a single labeled training sample, viz. each event has an integer label

343: (called {\it hadronness}) indicating if the event belongs to the class of gammas (hadronness 0)

344: or to the class of hadrons (hadronness 1).

345:

346: From this sample, a binary decision tree can be constructed, subdividing the parameter

347: space first in two parts depending on one of the parameters, and subsequently repeating

348: the process again and again for each part. The best choice of parameter and the

349: criteria for subdividing are discussed below.

350: Using a single tree for classification purposes, however, usually gives mediocre results.

351: The tree is overoptimized on the training sample, and there is only poor generalization

352: viz. new events will be classified rather badly.

353: This is shown in figure~\ref{fig_pattern1}. Note, however, that even a set of trees

354: (forest) results in some sparsely populated areas, where the hadronness

355: necessarily is

356: not well defined, and the probability of misclassification may be substantial.

357:

358: \begin{figure}[h]

359: \begin{minipage} [c] {0.50\textwidth}

360: \includegraphics[totalheight=4.0cm]{pattern2.jpg}

361: \end{minipage}

362: \begin{minipage} [c] {0.5\textwidth}

363: \includegraphics[totalheight=4.5cm]{pattern1.pdf}

364: \end{minipage}

365: \caption{{\it Left: Illustration of the RF method for a simple 2-dimensional model case.

366: The black and white points are the observed points in class ``gamma'' and ``hadrons'',

367: respectively. They are distributed according to two different, but overlapping 2-dimensional

368: Gaussians. The result of separation in terms of hadronness is shown in colour.

369: Right: The result of using a single tree on the same data gives no probability

370: measure like hadronness, but only y/n answers. Its performance is inadequate.}}

371: \label{fig_pattern1}

372: \end{figure}

373:

374: There is no pruning (tree simplification by removing some branches considered irrelevant)

375: of the trees in the Random Forest algorithm. Instead, the RF creates

376: a set of largely uncorrelated trees, and combines their results to form a

377: generalized predictor. Two random elements prior to and within the tree growing process serve to

378: approximate ideally uncorrelated trees; they are described in the following sections.

379:

380: \subsection{Bootstrap aggregating (bagging)}

381: \label{subsec_bagging}

382: There is usually a single data sample in each class used for training.

383: A straightforward solution to obtain independent trees is to

384: split the training sample into as many non-overlapping subsamples as trees should be grown.

385: However, there are usually not enough training data available for this approach. This is especially the case if dealing with

386: air shower data, which are always costly to generate (w.r.t. computer time and storage space).

387: A different way is to produce a bootstrap sample for each tree by sampling n times with replacement from the

388: original training sample containing n events. This procedure guarantees that the events' image parameter

389: distributions are statistically identical for all bootstrap samples

390: (and equal to the image parameter distributions of the

391: original training sample, since the probability of selecting an event is constantly 1/n for the ``sampling with replacement

392: procedure''), while the bootstrap samples do not contain the same events. It may (and will) happen that certain

393: events are taken more than just once:

394: The probability of not selecting a certain event is equal to $(1 - 1/n)$,

395: which becomes $(1-1/n)^n$ when repeating the

396: selection process n times. As $\lim_{n\rightarrow\infty} (1 + x/n)^n = e^x$,

397: the probability of not selecting an

398: event in the bootstrap procedure becomes $e^{-1}\approx1/3$. Thus, in each bootstrap sample there will be on average

399: $(1 - 1/e)\,n \approx 2n/3$ distinct original training events, the rest (also kept in the sample) are copies.

400:

401: \subsection{Tree growing and random split selection}

402: The tree growing begins with the complete sample contained in a single node, the so called root node,

403: which is identical to the complete image parameter space. In the following the $\gamma$/h separation is achieved by

404: splitting (or cutting) each node into two successor nodes using one of the image parameters at a time, with a

405: cut value optimized to separate the sample into its classes (in our case two: gammas and hadrons). This corresponds to a

406: successive division of the image parameter space into hypercubes.

407: In order to measure the classification power (separation ability) of an image parameter and to

408: optimize the cut value, the Gini index is used. The Gini index is a frequently used

409: measure in dealing with classifiers, originally in economics. Named after the Italian economist

410: Corrado Gini,

411: it measures the inequality of two distributions,

412: e.g. gamma acceptance and hadron acceptance as function of a cut in a variable.

413: It is defined as the ratio between a) the area spanned

414: by the observed cumulative distribution and the hypothetical cumulative distribution

415: for a non-discriminating variable (uniform distribution, 45-degree line), and b) the

416: area under this uniform distribution. It is a variable between zero and one;

417: a low Gini coefficient indicates more equal distributions, a

418: high Gini coefficient shows unequal distribution.

419:

420: The choice of the parameter

421: taken for splitting is randomized (see below for details).

422: The splitting process stops if the node size (events per node) falls below a limit specified by the

423: user, or if there are only events of one class (only gammas or only hadrons) left in the node, which

424: therefore need not be split further.

425: These terminal nodes can also be called elementary hypercubes, they cover the entire image parameter space

426: without intersections or gaps. To each terminal node the remaining training events assign a

427: class label $l$ (0 for gammas, 1 for hadrons).

428: For terminal nodes still containing a mixture of events of different classes,

429: a mean value is calculated for $l$, taking into account the

430: class populations $N_h$ of hadrons and $N_{\gamma}$ of gammas: $l = N_h / (N_h + N_{\gamma})$.

431: The original program \cite{breiman2} uses a majority vote, and does not calculate mean values.

432:

433: Before going into more details, the classification process is briefly described:

434: One can take a completely grown tree as starting point

435: (see figure~\ref{fig_tree}).

436: \begin{figure}[h]

437: \begin{center}

438: \includegraphics[totalheight=5cm]{decisiontree.pdf}

439: \caption{{\it Sketch of a tree structure for the classification of an event $v$ with

440: components $v_{length}$, $v_{width}$, and $v_{size}$.

441: One can follow the decision path through the tree,

442: leading to classification of the event as hadron.}}

443: \label{fig_tree}

444: \end{center}

445: \end{figure}

446: The task is to classify an event

447: characterized by a vector $v$ in the image parameter space. $v$ is fed into the decision tree;

448: at the first (highest level) node

449: there is a split in a certain image parameter (e.g. `length'). Depending

450: on the component (image parameter) `length' in $v$, the event $v$ proceeds to the left node

451: (length $<$ split value) or to the

452: right node (length $\geq$ split value) at the next lower level.

453: This node again splits in some other (or by chance the same) component, and the process continues.

454: The result is that $v$ follows a track through the tree determined by the numerical values

455: of its components,

456: and the tree nodes' cut values, until it will end up in a terminal node.

457: This terminal node assigns a class label $l$ to $v$, which  can now be denoted

458: as $l_i(v)$, where $i$ is the tree number.

459:

460: The vector $v$ will be classified by all trees. Due to the randomization involved,

461: different trees will often give different results,

462: hence the name `Random Forest'. From these results, a mean classification is calculated:

463: \begin{equation}

464: h(v) = \frac{\sum_{i=1}^{n_{trees}}l_i(v)}{n_{trees}}

465: \end{equation}

466: This mean classification is called Hadronness, and is used as the only test statistic (split-parameter)

467: in the $\gamma$/h separation (see figure~\ref{fig_had}).

468: \begin{figure}[h]

469: \begin{center}

470: \includegraphics[totalheight=7cm]{rfoutput.pdf}

471: \caption{{\it Mean hadronness for two test samples of gammas (left peak, black) and hadrons

472: (right peak, red). Hadronness is the final and only test statistic in $\gamma$/h separation.}}

473: \label{fig_had}

474: \end{center}

475: \end{figure}

476:

477:

478: The splitting process is somewhat randomized by a feature called random split selection. The parameter

479: candidates for a split are chosen randomly from the total number of available parameters.

480: Among the candidates, the parameter and corresponding cut value to be used for splitting

481: are chosen by the minimal Gini index.

482: In the case of two classes, the Gini index $Q_{Gini}$ can be referred to as binomial

483: variance of the sample

484: scaled to the interval $[0, 1]$.

485: The Gini index (or GINI coefficient) can be expressed in terms of the node

486: class populations $N_{\gamma}$, $N_h$

487: and the total node population $N$:

488: \begin{equation}

489: Q_{Gini} =  \frac{4}{N}\sigma^2_{binomial} =  4 \frac{N_{\gamma}}{N} \frac{N_h}{N}

490: = 4 \frac{N_{\gamma}(N-N_{\gamma})}{N^2}   \in [0,1]

491: \end{equation}

492: $Q_{Gini}$ of a node is zero for the ideal case that only one class is present in the node

493: ($N_{\gamma}=0$ or $N_h=0$). The Gini index of the split is calculated by adding the

494: Gini indices of the two successor nodes (denoted by left and right node) and

495: scaling the result to [0,1]:

496: \begin{equation}

497: Q_{Gini} = 2  \left( \frac{N_{\gamma left}}{N_{left}} \frac{N_{h left}}{N_{left}} +

498: \frac{N_{\gamma right}}{N_{right}} \frac{N_{h right}}{N_{right}} \right)    \in [0,1]

499: \end{equation}

500:

501: Choosing the smallest $Q_{Gini}$ corresponds to minimizing the variance of the

502: population of gammas and hadrons, and naturally purifies the sample.

503: Minimization of the Gini index provides both the choice of the image parameter

504: and the split value to be used.

505:

506: More details concerning the Random Forest method can be found in \cite{breiman2}.

507: The original program

508: was modified to calculate the mean hadronness instead of a $0$ or $1$ majority

509: vote for a class. Calculating the arithmetic mean by using

510: weights (e.g. using the Gini index of terminal nodes)

511: did not further improve the results \cite{bock},\cite{hengst}.

512:

513: \section{Control of the training process}

514: \label{sec_control}

515: In this chapter we address some specific aspects of RF related to the training process.

516: Proper training depends on several parameters, steering the growing of trees,

517: which the user should be aware of. In the following these parameters are described.

518: \begin{itemize}

519: \item

520: Number of trees:

521: the number of trees must be chosen large enough to ensure the convergence of the error $\sigma$, given by

522: \begin{equation}

523: \sigma(n_{tree}) = \sqrt{\frac{\sum_{i=1}^{n_{sample}}(h_i^{est}(n_{tree}) - h_i^{true})^2}{n_{sample}}}

524: \end{equation}

525: $\sigma(n_{tree})$ is the rms error of the estimated hadronness. $h_i^{est}(n_{tree})$ denotes the estimated

526: hadronness (which depends

527: on the number $n_{tree}$ of combined trees) and $h_i^{true}$ is the true hadronness

528: of event $i$ in the sample, which contains $n_{sample}$ events in total.

529: The convergence process is shown in figure~\ref{fig_conv} for the training of

530: RF on an MC gamma and MC hadron sample.

531:

532: \begin{figure}[h]

533: \begin{center}

534: \includegraphics[totalheight=6cm]{noftrees.pdf}

535: \caption{{\it Error (rms, = $\sqrt{\sigma^2}$) of the estimated hadronness as function of the

536: number of trees used. Also shown is the variance of each single tree.}}

537: \label{fig_conv}

538: \end{center}

539: \end{figure}

540:

541: Care was taken that the test sample, for which the figure was produced,

542: is disjoint from the training sample.

543: When taking events already used in the training process, $\sigma$ would be underestimated.

544: From figure~\ref{fig_conv}, the following practical method can be deduced:

545: One generates a reasonably high number of trees (100 trees is usually sufficient), performs the training process,

546: and then finds decisions for a test sample using a diminishing number of trees, to

547: judge how many trees still give satisfactory results. Trees generated during the training

548: process are stored successively in a file. For the classification task one can read in the actually needed number of trees.

549: If no test sample is available, one can take $\sigma(n_{tree})$ as calculated from the so-called out-of-bag

550: data during the training.

551: The out-of-bag data are the `residue' of the bagging procedure, as explained in the following. In the bagging procedure

552: (generating of bootstrap samples, see chapter \ref{sec_basic})  there are data for each tree which have

553: not been used for the tree's  bootstrap sample. Being independent, they can be used as test data for the corresponding tree.

554: In other words, each event of the original training sample can be used as test data for $\approx 1/3$ of the trees.

555: If one observes a sufficient convergence of $\sigma$  calculated from out-of-bag data after,

556: say, 150 trees, only about 50 trees are actually needed, since each event is tested by only $\approx 1/3$ of the trees.

557: \item

558: Overtraining: During tree growing, the cut values of the parameters are adjusted according to the training sample.

559: This overtraining is not a major drawback, it affects merely the training sample, which provides these

560: exact cut values.

561: According to \cite{breiman2} the overtraining (or overoptimization) vanishes in case of an infinite number of trees.

562: The practical method described above favours a minimal forest, with a number of trees sufficiently large to

563: ensure a classification error (of a test sample), which is not significantly decreased by adding more trees.

564: Such a forest still shows overtraining: when applying $\gamma$/h separation to the training data, the classes of gammas

565: and hadrons can usually be well separated by a cut in hadronness = 0.5. In other words, each tree `learned by heart'

566: the training events, and the same is true for the entire forest.

567: The situation is the same with classical cuts: the cut values are optimized on a certain

568: observed data set from a gamma source or on Monte Carlo data, and later on applied to the data

569: to be analyzed, which must not contain the training data.

570: \item

571: Number of trials in random split selection: This concerns the parameters considered

572: for splitting. A good empirical value for their number

573: is $\sqrt{N}$ where $N$ is the total number of parameters used

574: in tree growing \cite{breiman2}.

575: \item

576: Node size: this is the minimum size of node at which further splitting stops.

577: For correctly labeled training events $nodesize = 1$ can be used, for

578: partly incorrectly labeled data (e.g. using ON-data as hadrons) $nodesize > 1$ is preferable,

579: since data are not intended to be split completely. Experience tells that a small number $< 10$ is best.

580: \end{itemize}

581:

582: \section{Application of RF in $\gamma$/h separation}

583: \subsection{Remarks concerning the training process}

584: In this chapter some features related to the Random Forest method will be briefly addressed.

585: Some of these remarks are valid also for many other advanced classification methods in need of a training

586: process, like Neural Networks or linear discriminant analysis.

587: \begin{itemize}

588: \item

589: Training data for Cherenkov telescopes:

590: We have used OFF data and MC gammas (correctly labeled samples) or ON data and MC gammas

591: (partly wrongly labeled hadron sample). It is usually advisable not to use MC hadrons,

592: since hadronic showers are

593: difficult to simulate (unlike gamma showers which have a pure electromagnetic nature),

594: so that MC hadrons are difficult to match in all details with

595: real data. In fact, there is no need to use MC hadrons, when OFF or ON data are available.

596: Choosing ON data for training has the advantage of obviating OFF data taking, and of using data

597: taken under identical observational conditions. The

598: Random Forest algorithm is stable enough to deal with a hadron sample containing up to 1\% of gammas,

599: as shown in figure~\ref{fig_contam}, where the training was performed

600: using OFF data with variable artificial contamination for the hadrons,

601: and MC data for the gamma sample.

602: \begin{figure}[h]

603: \begin{center}

604: \includegraphics[totalheight=7cm]{contam.pdf}

605: \caption{{\it Neyman-Pearson or ROC diagrams of hadron training samples with

606: a contamination of (mislabeled) gamma events. A hadron sample with 1\% gammas

607: introduces a negligible loss in selection efficiency.}}

608: \label{fig_contam}

609: \end{center}

610: \end{figure}

611: In order to simulate ON data, the OFF data were contaminated with MC gammas, i.e. the degree of

612: contamination was known. For all simulated gamma admixtures the reduction of the

613: separation efficiency

614: becomes visible only in a region of low gamma acceptances, which is usually not advisable to

615: operate in (too low gamma efficiency). Depending on the set of image parameters used for training,

616: a generalization of this result may not be possible.

617: \item

618: Types of parameters:

619: All parameters are treated in the same way, which means that in particular detector-related or observational

620: parameters like $\cos(\theta)$ ($\theta$ being the zenith angle), $\bar{\sigma}$ (image noise,

621: averaged over all pixels), or size  (integrated signal of the image),  must be

622: used with care. The sense of using such parameters is that cuts in other image parameters will depend on them,

623: but not that they should be used for cuts.

624: Thus, in  general, one can distinguish between parameters to be used for cuts, and

625: parameters on which the cuts in other parameters may depend.

626: To circumvent the problem, the training data must be chosen not to permit a classification using these parameters alone

627: (e.g. by using the same (flat) distribution of $\cos(\theta)$ in both training samples).

628: Splits in these parameters, in training samples prepared this way,

629: can not directly serve for separating gammas and hadrons.

630: Additional attention must also be paid if e.g. the gamma data have discrete $\cos(\theta)$ values for technical reasons

631: in the Monte Carlo production. In this case the $\cos(\theta)$ values appearing in the hadron sample must be

632: rounded to the same values (binned), or the Monte Carlo data artificially spread to become continuous.

633: \end{itemize}

634:

635: \subsection{Comparison with direct cuts in image parameters}

636: \label{results}

637: An extensive comparison of methods applied to Monte Carlo data sets for training and

638: test samples was given in \cite{bock}. One of the methods described there (called

639: {\it Direct Selection}) was based on using simple AND/OR cuts in the multi-dimensional

640: space of image parameters. The choice of parameters or functions thereof

641: offers many possibilities for tuning.

642: We repeat here a similar comparison, again using Monte Carlo data, using {\it scaled} image

643: parameters. Like in \cite{bock}, no claim can be made

644: that this result, found in favor of the RF method,

645: can be generalized to all parameter choices or to real data.

646: Exhaustive comparisons with real data are lengthy, due to the high

647: dimensionality of the problem,

648: which includes data selection and image cleaning steps even

649: before image parameters are obtained.

650: Quality comparisons using real data are also influenced by the unavoidable changes in

651: operation conditions, that are reflected in data corrections whose effect on separation

652: methods is difficult to evaluate. A comparative study with comprehensive

653: MAGIC data samples is, however, in preparation.

654:

655: For this comparison we used independent training and test samples, of 15000 events each.

656: {\it Hadrons} were simulated with the parameters:

657: energy range $200\,\mathrm{GeV}<E<30\,\mathrm{TeV}$; spectral index $a = -2.7$; zenith angle range

658: $0<\theta<30^\circ$; impact parameter range $0< R<400\,\mathrm{m}$; viewing

659: cone $5^\circ$.

660: The {\it gamma} simulation settings were:

661: energy range $50\,\mathrm{GeV}<E<30\,\mathrm{TeV}$; spectral index Crab-like $a = -2.6$;

662: zenith angle range $0<\theta<30^\circ$; impact parameter range $0<R< 200\,\mathrm{m}$.

663: Figure~\ref{fig_hill} shows the corresponding

664: distributions of the image parameters width [deg] and length[deg] as functions of

665: size [phe], for gammas and hadrons.

666: All data were pre-cut to obtain high-quality training and

667: test samples, requiring leakage\footnote{this parameter, not defined in \cite{bock},

668: uses an estimate of fractional energy escaping the camera}

669: $<0.1$, dist$>0.3^\circ$, size $>200\,\mathrm{phe}$.

670: \begin{figure}[h]

671: \begin{center}

672: \includegraphics[totalheight=7cm]{hillaspar.jpg}

673: \caption{{\it Distribution of the Hillas parameters width (top) and length (bottom)

674: as function of log(size),

675: for gammas (left) and hadrons (right), as used in the training samples.

676: The profiles are shown in red (gammas) and black (hadrons), showing that

677: both parameters are good separators for size values above 200 photoelectrons

678: (corresponding to about 100 GeV)}}

679: \label{fig_hill}

680: \end{center}

681: \end{figure}

682: Clearly, width and length are good separation parameters, at least for values of size

683: exceeding 200 phe (photo electrons),

684: which corresponds approximately to energies above $100\,\mathrm{GeV}$.

685: The size dependence of width and length can be dealt with

686: by using scaled parameters:

687: The size range (of MC gamma data) is divided into bins, and

688: for each bin $i$ mean and variance of the

689: width distribution ($\bar{w_i}$ and $\sigma^2_{w_i}$)

690: are calculated. The scaled width

691: $w_{i,scaled}$ for each bin is then obtained by

692: $w_{i,scaled} = (w_i - \bar{w_i}) / \sigma_{w_i}$.

693:

694: The same procedure is used for the length parameter. As a result one obtains a normalized

695: width and length distribution for gammas: they follow a pdf (probability density function)

696: with mean 0 and variance 1.

697: In these variables, static (size-independent) cuts are used for $\gamma$/h separation.

698: In order to find optimal cuts, a maximization of the $Q$-value which relates the relative

699: acceptances of gamma-rays and hadrons

700: ($Q = \epsilon_{\gamma} / \sqrt{\epsilon_h}$)

701: was performed, using the Metropolis minimization package\footnote{which

702: includes random perturbations in the search, thus avoiding convergence to local minima}

703: followed by a SIMPLEX minimization. Both packages are part of TMinuit

704: in the ROOT analysis environment \cite{brun}.

705:

706: Both the Random Forest and the scaled parameter method used independent data

707: for training and testing. Only the parameters size, dist, width, and length were used.

708: The results are compared in the Neyman-Pearson or ROC (receiver

709: operating characteristic) diagrams of  figure~\ref{fig_comp1}; these diagrams

710: show gamma acceptance as function of hadron acceptance.

711: \begin{figure}[h]

712: \begin{center}

713: \includegraphics[totalheight=6cm]{effcomp1.pdf}

714: \caption{{\it ROC curves for $\gamma$/h separation in the test sample, by the

715: RF method (higher curve) and by cuts in scaled parameters,

716: using the same parameters.}}

717: \label{fig_comp1}

718: \end{center}

719: \end{figure}

720: In order to obtain for the scaled parameter method more than a single point

721: (that of overall maximum $Q$) in the ROC diagram, a

722: regularizer $a (\epsilon_h - p)^2$ was introduced (a generalization of

723: the method used in \cite{bock}).

724: Here $p$ denotes a target acceptance for hadrons, and  $\epsilon_h$ is the freely variable

725: hadron acceptance, which is obtained from the maximization of $Q$ and different for each $p$.

726: We used a high scaling number $a = 1000$ to ensure that the optimization will give as a

727: result a set of cuts with  $\epsilon_h$ close to $p$.

728:

729: These results are shown as the lower curve in figure~\ref{fig_comp1}.

730: We should stress again that this comparison can in no way show a general

731: superiority of the RF method; practical experience shows that for a given

732: data sample other methods (also including direct selection

733: as in the above example) can, at an effort, be fine-tuned to give results

734: comparable to the RF method. However, in no case has the RF result been shown inferior,

735: and much less tuning is needed (and possible) with the RF method.

736: More comparisons (including also MAGIC data) can be found in \cite{zimmermann}.

737:

738: \section{Using a Random Forest estimator for a continuous variable}

739: The RF method also permits the construction of an algorithm for estimating a

740: continuous quantity rather than a discrete class

741: membership, dealt with in previous sections. We have used this method

742: to estimate non-analytically the particle energy from the

743: measured image parameters. Two main approaches are possible:

744: \begin{itemize}

745: \item Forced division into classes:

746: Class labels are assigned to the training events

747: according to an energy grid. As a result, multiple classes

748: $E_0, E_1, \ldots, E_{n-1}$ are created.

749: In the RF training process the related class populations are taken into account

750: together with a more general Gini index \cite{breiman1}

751: \begin{equation}

752: p_i = N_i / N

753: \end{equation}

754: \begin{equation}

755: Q_{Gini} = 1 - \sum_{i=0}^{n-1}p_i^2

756: \end{equation}

757: Here $i$ is the class index ($0 \leq i \leq n-1$). As already shown above, the Gini index of

758: a split is evaluated as sum of the two Gini indices obtained after the split, and minimized.

759: After the training procedure, the class populations

760: inside a terminal node are used to calculate the

761: estimated energy  corresponding to the terminal node:

762: \begin{equation}

763: E_{est} = \frac{\sum_{i=0}^{n-1}E_iN_i} {\sum_{i=0}^{n-1}N_i}

764: \end{equation}

765: In this application of RF each tree returns an estimated energy and the overall mean

766: is calculated as the final estimated energy.

767:

768: \item

769: A splitting rule based on the continuous quantity:

770: It is possible to completely avoid the use of classes by introducing a splitting rule,

771: which does not rely on class populations.

772: The idea of the Gini index (with its interpretation as binomial variance of the

773: classes) as split rule is a purification of the class populations, i.e. a separation

774: of the classes, in the subsamples after the split process. Similarly, when using the

775: variance in energy as a splitting criterion, the subsamples are purified with respect to

776: their energy distribution.

777: \begin{equation}

778: \sigma^2(E) = \frac {1}{N-1} \sum_{i=1}^{N}(E_i-\bar{E})^2 =

779: \frac{1}{N-1} \left[ \left( \sum_{i=1}^{N}E_i^2\right) - N\bar{E}^2\right].

780: \end{equation}

781: In analogy to the Gini index of the split, the ``variance'' of the split is calculated by

782: adding the ``subsample energy variances'', taking into account the node populations

783: as weights:

784: \begin{equation}

785: \sigma^2(E) = \frac{1}{N_L+N_R}\left(N_L\sigma_L^2(E) + N_R\sigma_R^2(E) \right)

786: \end{equation}

787: \end{itemize}

788:

789:

790: We have used both approaches for a set of Monte Carlo data.

791: With 100 classes for the first (classification) method,

792: it produces results nearly identical to those of the second (regression) approach.

793: The results of this latter RF approximation for energy

794: can be seen from figure~\ref{fig_energy}.

795: The linearity is perfect, and the energy resolution (as defined by

796: the rms error $\sigma_E/E$) comes out 26\% at 100~GeV and

797: 19\% at 1~TeV,

798: very fair values for a single telescope (telescope arrays can reach better resolution).

799: We have not found an analytical parameterization for energy expressed in terms of image

800: parameters giving a result better than with the RF representation; with extensive tuning,

801: results comparable in quality have been found, though.

802: \begin{figure}[h]

803: %\begin{minipage} [c] {0.5\textwidth}

804: %\includegraphics[totalheight=5cm]{energy1.pdf}

805: %\end{minipage}

806: %\begin{minipage} [c] {0.5\textwidth}

807: %\includegraphics[totalheight=5cm]{energy2.pdf}

808: %\end{minipage}

809: \includegraphics[totalheight=4.5cm]{RFEEst.jpg}

810: \caption{{\it Left: The relation between the RF-estimated energy (horizontal)

811: and initial Monte Carlo energy (vertical axis) is perfectly linear.

812: Right: The rms error $\sigma_E/E$ as function of initial energy.}}

813: \label{fig_energy}

814: \end{figure}

815:

816:

817: \section{Conclusions}

818: The Random Forest (RF) method based on multiple decision trees

819: was extensively tested as an analysis tool in the $\gamma$/h separation

820: for data obtained with the MAGIC telescope.

821: In this paper we discuss many implementation details and the

822: parameters a user has to become familiar with.

823: We also compare the performance of RF with the more

824: conventional technique of cuts in scaled image parameters, using MC

825: data. It could be shown that RF in this comparison is superior

826: to the classical method. This comparison does not

827: imply a general superiority of the RF method; practical experience

828: shows that for a given data sample the conventional methods (like

829: dynamical cuts or cuts in scaled image parameters) may be tuned

830: to give results comparable (but not superior) to the RF method.

831: A dedicated comparative

832: study using MAGIC experimental data is still under way.

833:

834: The RF method does produce stable results and

835: is robust with respect to input parameters, even if strongly correlated. The method

836: adjusts itself to the available multi-dimensional space,

837: with a minimum of human intervention:

838: there are only few tunable parameters, which can be chosen according to simple criteria

839: (number of trees, trials in random split selection and final node size).

840: This simpler control and tuning can then be seen as a general advantage

841: over conventional methods.

842: Proper training samples, however, are important, as in any advanced

843: method requiring a training process, i.e.

844: one has to rely on a good Monte Carlo simulation. Using OFF or ON data as hadron

845: sample limits the MC dependence to the gamma showers, better understood

846: than hadron showers.

847: There remains, however, the need to correctly treat

848: atmospheric conditions under different zenith angles,

849: and good knowledge of the detector.

850:

851: Training and classification are fast: benchmarks using

852: a 1.5~GHz PC (Athlon XP), with training and test samples each containing 10\,000~events,

853: a total of 10~image parameters used,

854: 100~trees used for classification, each tree completely grown (nodesize=1),

855: 3~trials in random split selection, give one minute for training and 2~ms/event for classification.

856: A comparable analysis technique like Neural Networks demands substantially

857: more computer time for training.

858:

859:

860: % The Appendices part is started with the command \appendix;

861: % appendix sections are then done as normal sections

862: % \appendix

863:

864: \section*{Acknowledgement}

865: We thank Jens Zimmermann for fruitful discussions about the RF method

866: and for  comparisons of the RF method with a Neural Net approach.

867:

868: % Bibliographic references with the natbib package:

869: % Parenthetical: \citep{Bai92} produces (Bailyn 1992).

870: % Textual: \citet{Bai95} produces Bailyn et al. (1995).

871: % An affix and part of a reference:

872: %   \citep[e.g.][Ch. 2]{Bar76}

873: %   produces (e.g. Barnes et al. 1976, Ch. 2).

874:

875: %\bibliographystyle{elsart-num.bst}

876: %%%%%%%%%\begin{thebibliography}{10}

877:

878: \begin{thebibliography}{10}

879: \bibitem{hillas}A.M.Hillas: Proceedings of the 19th International

880: Cosmic Ray Conference, ICRC 1985 La Jolla , 3 (1985) 445

881: \bibitem{hillas1}A.M.Hillas: Space Science Rev. 75 (1996) 17

882: \bibitem{fegan}D.J.Fegan: J.Phys.G, Nucl.Part.Phys. 23 (1997) 1013

883: \bibitem{aharonian}F.Aharonian et al.: Astropart.Phys. 6 (1997) 343

884: \bibitem{kraw}H.Krawczynski et al.: Astropart.Phys. 25 (2006) 380

885: \bibitem{bock}R.K.Bock, A.Chilingarian, M.Gaug, et al.,

886: Nucl. Inst. and Methods A 516 (2004) 511

887: \bibitem{hengst}T.~Hengstebeck, PhD thesis,

888: Mathematisch-Naturwissenschaftliche Fakult\"at I,

889: Humboldt-Universit\"at zu Berlin, M\"arz 2007.

890: Available at URL http://edoc.hu-berlin.de/docviews/abstract.php?id=28015

891: \bibitem{lorenz} E. Lorenz, New Astron. Rev. 48 (2004) 339

892: \bibitem{breiman1}L.~Breiman, J.~H.~Friedman, R.~A.~Olshen, C.~J.~Stone:

893: Classification and Regression Trees, Wadsworth, 1983

894: \bibitem{albert1} J.Albert et al., Astroph. Journal 664 (2007) L87

895: \bibitem{albert2} J.Albert et al., Astroph. Journal 665 (2007) L51

896: \bibitem{albert3} J.Albert et al., Astroph. Journal 669 (2007) 1143

897: \bibitem{albert4} J.Albert et al., to be published in Astroph. Journal,

898: preprint available at http://de.arxiv.org/abs/0705.3244

899: \bibitem{breiman2}L.Breiman, FORTRAN program Random Forests, Version 3.1, and

900: L.Breiman, Manual On Setting Up, Using, And Understanding Random Forests V3. 1,

901: both available at http://oz.berkeley.edu/users/breiman

902: \bibitem{brun}R.~Brun, F.~Rademakers, http://root.cern.ch/

903: \bibitem{zimmermann} J.~Zimmermann, PhD thesis, Fakult\"at f\"ur Physik,

904: Ludwig-Maximilians-Universit\"at M\"unchen, Juni 2005.

905: Available at URL http://edoc.mpg.de/274832

906:

907:

908: \end{thebibliography}

909:

910: \end{document}

911: