0202:nlin0202038/z.tex

1: %------------------------------------------------------------------------------

2: % standard

3:

4: \newcommand{\stdpack}{ % Shared preamble: loads the common packages, configures the fancyhdr header/footer, and defines opt-in helper commands.

5:   \usepackage{amssymb}

6:   \usepackage{amsmath}

7:   \usepackage{eucal}

8: %  \usepackage{/home/mt/usr/tex/calrsfs}

9: %  \usepackage{/home/mt/usr/tex/calligra}

10:   \usepackage[final]{graphicx}

11:   \usepackage{psfrag}

12:   \usepackage{fancyhdr}

13:   \renewcommand{\headrulewidth}{0pt}\lhead{}\cfoot{}\rfoot{\thepage} % no header rule; left header and centre footer cleared; page number in the right footer

14:

15:   \newcommand{\draft}{\usepackage[light,first]{draftcopy}\draftcopyName{draft}{350}} % opt-in helper: the package is loaded only when \draft is invoked

16:   \newcommand{\labels}{\usepackage{showlabels}} % opt-in helper: loads showlabels when invoked

17:   \newcommand{\maple}{\usepackage{maple2e}} % opt-in helper: loads maple2e when invoked

18:   \newcommand{\makeidx}{\usepackage{makeidx}\makeindex} % opt-in helper: loads makeidx and calls \makeindex when invoked

19: }

20: \newcommand{\std}[1]{ % Standard document setup; the single argument becomes \baselinestretch. Must run in the preamble because it calls \usepackage.

21:   \stdpack

22:   \usepackage{a4,a4wide}

23:  %\renewcommand{\labelenumi}{\textbf{(\roman{enumi})}}

24:  %\renewcommand{\labelenumi}{${}^{\bf (\roman{enumi})}$}

25:  %\renewcommand{\labelitemi}{\bf $\cdot$}

26:   \newcommand{\blockindent}{3ex} % indent used by the block and rblock environments defined later in this file

27:   \renewcommand{\baselinestretch}{#1}

28:   \renewcommand{\arraystretch}{1.2}

29:   \hoffset -1cm  \addtolength{\textwidth}{2cm} % shift the page 1cm left and widen the text by 2cm

30:   \voffset -0cm  \addtolength{\textheight}{0cm} % vertical counterpart; currently a no-op (zero offsets)

31:

32:   \usepackage{./mt}

33:   \columnsep 5ex

34:   \parindent 3ex

35:   \parskip 1ex

36:   \macros % installs the math shorthands (\a, \FF, \RRR, ...) defined in \macros below

37:   \pagestyle{fancy}

38:   \bibliographystyle{./chicago}

39: \renewenvironment{abstract}{\paragraph{Abstract}\begin{rblock}\small}{\end{rblock}} % abstract is typeset as a run-in paragraph heading plus an rblock; \small here is \footnotesize because \macros redefines it

40: }

41:

42: %------------------------------------------------------------------------------

43: % styles

44:

45: \newcommand{\article}[2]{ % \article{<size>}{<stretch>}: article class with option <size>pt, the chicago package, then \std{<stretch>}; used below as \article{10}{1}

46:   \documentclass[#1pt,twoside,fleqn]{article}\usepackage{chicago}\std{#2} }

47: \newcommand{\nips}{

48:   \documentclass{article} \usepackage{nips01e,times} \stdpack\macros }

49: \newcommand{\ijcnn}{

50:   \documentclass[10pt,twocolumn]{/home/mt/usr/tex/ijcnn}

51:   %\documentclass[10pt,twocolumn]{article}\usepackage{/home/mt/usr/tex/wcci}

52:   \stdpack\macros

53:   \bibliographystyle{abbrv}

54: }

55: \newcommand{\foga}{

56:   \documentclass{article}

57: %  \documentclass[10pt,twocolumn]{/home/mt/usr/tex/foga}

58:   %\documentclass[10pt,twocolumn]{article}\usepackage{/home/mt/usr/tex/wcci}

59:   \stdpack

60:   \usepackage{/home/mt/usr/tex/foga}

61:   \macros

62: }

63:

64: \newcommand{\book}[2]{

65:   \documentclass[#1pt,twoside,fleqn]{book}\usepackage{chicago}\std{#2} }

66:

67: \newcommand{\foils}[1]{

68:   \documentclass[12pt,fleqn]{article}

69:   \std{#1}

70:   %\renewcommand{\arraystretch}{1.5}

71:   %\setlength{\voffset}{-3cm}%{-6cm}

72:   %\setlength{\hoffset}{-4cm}

73:   %\setlength{\textheight}{267mm}%{29cm}

74:   %\setlength{\textwidth}{19cm}

75:   %\hoffset -1cm  \addtolength{\textwidth}{2cm}

76:   \voffset -1cm  \addtolength{\textheight}{2cm}

77:   \renewcommand{\footskip}{2cm}

78:   %\parindent 0ex

79:   %\parskip 0ex %8ex

80:   %\pagestyle{plain}

81:   %\macros

82:   \begin{document}

83:   \large

84: }

85: \newcommand{\landfoils}[1]{

86:   \documentclass[fleqn]{article}

87:   \stdpack

88:   \renewcommand{\baselinestretch}{#1}

89:   \renewcommand{\arraystretch}{1}

90:   \setlength{\hoffset}{-3.5cm}%{-5cm}

91:   \setlength{\voffset}{-3.5cm}

92:   \setlength{\textwidth}{27cm}%{29cm}

93:   \setlength{\textheight}{19cm}

94:   \parindent 0ex

95:   \parskip 0ex %8ex

96:   \pagestyle{empty}

97:   \macros

98:   \begin{document}

99:   \huge

100: }

101: \newcommand{\landfolien}[1]{

102:   \documentclass[fleqn]{article}

103:   \usepackage{german}

104:   \stdpack

105:   \renewcommand{\baselinestretch}{#1}

106:   \renewcommand{\arraystretch}{1.5}

107:   \setlength{\hoffset}{-5cm}%{-6cm}

108:   \setlength{\voffset}{-1.5cm}

109:   \setlength{\textwidth}{27cm}%{29cm}

110:   \setlength{\textheight}{19cm}

111:   \parindent 0ex

112:   \parskip 0ex %8ex

113:   \pagestyle{plain}

114:   \begin{document}

115:   \huge

116: }

117:

118:

119: %------------------------------------------------------------------------------

120: % titles

121:

122: \newcommand{\addressCologne}{

123:   Institute for Theoretical Physics\\

124:   University of Cologne\\

125:   50923 K\"oln---Germany\\

126:   {\tt mt@thp.uni-koeln.de}\\

127:   {\tt www.thp.uni-koeln.de/\~{}mt/}

128: }

129:

130: \newcommand{\homepage}{{\tt www.neuroinformatik.ruhr-uni-bochum.de/PEOPLE/mt/}}

131: \newcommand{\email}{{\rm mt@neuroinformatik.ruhr-uni-bochum.de}}

132:

133: \newcommand{\address}{\small\it

134:   Institut f\"ur Neuroinformatik,

135:   Ruhr-Universit\"at Bochum, ND 04,

136:   44780 Bochum---Germany\\

137:   \email

138:   %\homepage

139: }

140:

141: \newcommand{\mytitle}{ % Custom title block: rule, title, author/date/address, rule. Used below as the argument of \twocolumn[...].

142:   \thispagestyle{empty}

143:   \rhead{\it Marc Toussaint---\today}

144:   \hrule height2pt

145:   \begin{list}{}{\leftmargin2ex \rightmargin2ex \topsep2ex }\item[]

146:     {\Large\bf \thetitle} % NOTE(review): \thetitle is not defined in this file; presumably provided by the mt package -- TODO confirm

147:   \end{list}

148:   \begin{list}{}{\leftmargin7ex \rightmargin7ex \topsep0ex }\item[]

149:     Marc Toussaint \quad\today

150:

151:     \address % expands to the Bochum address block defined above (\small\it, includes \email)

152:   \end{list}

153:   \vspace{2ex}

154:   \hrule height1pt

155:   \vspace{5ex}

156: }

157:

158: \newcommand{\contents}{{\small \parskip 0ex \tableofcontents \parskip 2ex }}

159:

160:

161: %------------------------------------------------------------------------------

162: % environments / commands

163:

164:

165: \newcommand{\sepline}{

166:   \begin{center} \begin{picture}(200,0)

167:     \line(1,0){200}

168:   \end{picture}\end{center}

169: }

170:

171: \newcommand{\partsection}[1]{ % \partsection{<title>}: unnumbered, centred part heading in small caps that is also listed in the table of contents

172:   \vspace{5ex}

173:   \centerline{\sc\LARGE #1}

174:   \addtocontents{toc}{\protect\contentsline{section}{{\sc #1}}{}} % \protect keeps \contentsline from being expanded when the entry is written to the .toc file; the page-number field is left empty

175: }

176:

177: \newcommand{\intro}[1]{\textbf{#1}\index{#1}}

178:

179: \newtheorem{definition}{Definition}

180: \newtheorem{statement}{Statement}

181: \newtheorem{theorem}{Theorem}

182: \newtheorem{hypothesis}{Hypothesis}

183: \newenvironment{remark}{\noindent\emph{Remark.}}{}

184: \newenvironment{example}[1][]{\begin{block}[Example {#1}]}{\end{block}~}

185:

186: \newcounter{parac}

187: \newcommand{\para}{\refstepcounter{parac}{\bf [{\roman{parac}}]}~~}

188: \newcommand{\Pref}[1]{[\emph{\ref{#1}}\,]}

189:

190: \newenvironment{block}[1][]{{\noindent\bf #1} % block[<heading>]: optional bold run-in heading followed by a left-indented body

191: \begin{list}{}{\leftmargin\blockindent \topsep-\parskip} % \blockindent is defined inside \std, so \std must have run; the negative \topsep offsets \parskip

192: \item[]

193: }{

194: \end{list}

195: }

196:

197: \newenvironment{rblock}{

198: \begin{list}{}{\leftmargin\blockindent \rightmargin\blockindent \topsep-\parskip}

199: \item[]

200: }{

201: \end{list}

202: }

203:

204: \newenvironment{keywords}{\paragraph{Keywords}\begin{rblock}\small}{\end{rblock}}

205:

206: \newenvironment{colpage}{

207: \addtolength{\columnwidth}{-3ex}

208: \begin{minipage}{\columnwidth}

209: \vspace{.5ex}

210: }{

211: \vspace{.5ex}

212: \end{minipage}

213: }

214:

215: \newenvironment{enum}{

216: \begin{list}{}{\leftmargin3ex \topsep0ex \itemsep0ex}

217: \item[\labelenumi]

218: }{

219: \end{list}

220: }

221:

222: \newenvironment{cramp}{

223: \begin{quote} \begin{picture}(0,0)

224:         \put(-5,0){\line(1,0){20}}

225:         \put(-5,0){\line(0,-1){20}}

226: \end{picture}

227: }{

228: \begin{picture}(0,0)

229:         \put(-5,5){\line(1,0){20}}

230:         \put(-5,5){\line(0,1){20}}

231: \end{picture} \end{quote}

232: }

233:

234:

235: \newcommand{\inputReduce}[1]{

236: {\sc\hspace{\fill} REDUCE file: #1}

237: %  \input{#1.tex}

238: %  \sepline

239: %  \input{../tex/tridefs}

240: %  \input{#1.out}

241: %  \redefinemath

242: }

243:

244: \newcommand{\inputReduceInput}[1]{

245:   {\sc\hspace{\fill} REDUCE input - file: #1}

246:   \input{#1.tex}

247: }

248:

249: \newcommand{\inputReduceOutput}[1]{

250:   {\sc\hspace{\fill} REDUCE output - file: #1}

251: %  \input{../tex/tridefs}

252: %  \input{#1.out}

253: %  \redefinemath

254: }

255:

256: \newcommand{\todo}[1]{{\bf[#1]}}

257:

258: %------------------------------------------------------------------------------

259: % macros

260:

261: \newcommand{\macros}{

262:   \newcommand{\0}{{\hat 0}}

263:   \newcommand{\1}{{\hat 1}}

264:   \newcommand{\2}{{\hat 2}}

265:   \newcommand{\3}{{\hat 3}}

266:   \newcommand{\5}{{\hat 5}}

267:

268:   \renewcommand{\a}{\alpha}

269:   \renewcommand{\b}{\beta}

270:   \renewcommand{\c}{\gamma}

271:   \renewcommand{\d}{\delta}

272:     \newcommand{\D}{\Delta}

273:     \newcommand{\e}{\epsilon}

274:     \newcommand{\g}{\gamma}

275:     \newcommand{\G}{\Gamma}

276:   \renewcommand{\l}{\lambda}

277:   \renewcommand{\L}{\Lambda}

278:     \newcommand{\m}{\mu}

279:     \newcommand{\n}{\nu}

280:     \newcommand{\N}{\nabla}

281:   \renewcommand{\k}{\kappa}

282:   %\renewcommand{\o}{\omega}

283:   \renewcommand{\O}{\Omega}

284:     \newcommand{\p}{\phi}

285:     \newcommand{\ph}{\varphi}

286:   \renewcommand{\P}{\Phi}

287:   \renewcommand{\r}{\varrho}

288:     \newcommand{\s}{\sigma}

289:     \newcommand{\Si}{\Sigma}

290:   \renewcommand{\t}{\theta}

291:     \newcommand{\T}{\Theta}

292:   \renewcommand{\v}{\vartheta}

293:     \newcommand{\X}{\Xi}

294:     \newcommand{\Y}{\Upsilon}

295:

296:   \renewcommand{\AA}{{\cal A}}

297:     \newcommand{\GG}{{\cal G}}

298:   \renewcommand{\SS}{{\cal S}}

299:     \newcommand{\TT}{{\cal T}}

300:     \newcommand{\EE}{{\cal E}}

301:     \newcommand{\FF}{{\cal F}}

302:     \newcommand{\HH}{{\cal H}}

303:     \newcommand{\II}{{\cal I}}

304:     \newcommand{\KK}{{\cal K}}

305:     \newcommand{\LL}{{\cal L}}

306:     \newcommand{\MM}{{\cal M}}

307:     \newcommand{\NN}{{\cal N}}

308:     \newcommand{\CC}{{\cal C}}

309:     \newcommand{\PP}{{\cal P}}

310:     \newcommand{\QQ}{{\cal Q}}

311:     \newcommand{\RR}{{\cal R}}

312:     \newcommand{\UU}{{\cal U}}

313:     \newcommand{\YY}{{\cal Y}}

314:     \newcommand{\SOSO}{{\cal SO}}

315:     \newcommand{\GLGL}{{\cal GL}}

316:

317:   \newcommand{\NNN}{{\mathbb{N}}}

318:   \newcommand{\ZZZ}{{\mathbb{Z}}}

319:   %\newcommand{\RRR}{{\mathrm{I\!R}}}

320:   \newcommand{\RRR}{{\mathbb{R}}}

321:   \newcommand{\CCC}{{\mathbb{C}}}

322:   \newcommand{\one}{{{\bf 1}}}

323:

324:   \newcommand{\<}{{\ensuremath\langle}}

325:   \renewcommand{\>}{{\ensuremath\rangle}}

326:   \newcommand{\Aut}{{\rm Aut}}

327:   \newcommand{\cor}{{\rm cor}}

328:   \newcommand{\corr}{{\rm corr}}

329:   \newcommand{\cov}{{\rm cov}}

330:   \newcommand{\sd}{{\rm sd}}

331:   \newcommand{\tr}{{\rm tr}}

332:   \newcommand{\lag}{\mathcal{L}}

333:   \newcommand{\inn}{\rfloor}

334:   \newcommand{\lie}{\pounds}

335:   \newcommand{\longto}{\longrightarrow}

336:   \newcommand{\speer}{\parbox{0.4ex}{\raisebox{0.8ex}{$\nearrow$}}}

337:   \renewcommand{\dag}{ {}^\dagger }

338:   \newcommand{\h}{{}^\star}

339:   \newcommand{\w}{\wedge}

340:   \newcommand{\too}{\longrightarrow}

341:   \newcommand{\To}{\Rightarrow}

342:   \newcommand{\Too}{\;\Longrightarrow\;}

343:   \newcommand{\ow}{\stackrel{\circ}\wedge}

344:   \newcommand{\feed}{\nonumber \\}

345:   \newcommand{\comma}{\; , \quad}

346:   \newcommand{\period}{\; . \quad}

347:   \newcommand{\del}{\partial}

348: %  \newcommand{\quabla}{\Delta}

349:   \newcommand{\point}{$\bullet~~$}

350:   \newcommand{\doubletilde}{

351:   ~ \raisebox{0.3ex}{$\widetilde {}$} \raisebox{0.6ex}{$\widetilde {}$} \!\!

352:   }

353:   \newcommand{\topcirc}{\parbox{0ex}{~\raisebox{2.5ex}{${}^\circ$}}}

354:   \newcommand{\sym}{\topcirc}

355:

356:   \newcommand{\half}{\frac{1}{2}}

357:   \newcommand{\third}{\frac{1}{3}}

358:   \newcommand{\fourth}{\frac{1}{4}}

359:

360:   \renewcommand{\_}{\underset}

361:   \renewcommand{\^}{\overset}

362:

363:   \renewcommand{\small}{\footnotesize}

364: }

365:

366: \newcommand{\argmax}[1]{\text{arg}\underset{#1}\max}

367: \newcommand{\argmin}[1]{\text{arg}\underset{#1}\min}

368: \newcommand{\kld}[2]{D\!\left(\,#1\,|\!|\,#2\,\right)}

369:

370: %\newcommand{\path}{\pathmt}

371: \newcommand{\pathmt}{./}

372: \newcommand{\basepath}{./}

373: \newcommand{\setpath}[1]{\renewcommand{\pathmt}{#1}\renewcommand{\basepath}{#1}}

374: \newcommand{\pathinput}[2]{ % \pathinput{<subdir>}{<file>}: \input <file> from <subdir> taken relative to \basepath

375:   \renewcommand{\pathmt}{\basepath #1} % while the file is read, \pathmt is \basepath followed by the first argument

376:   \input{\pathmt #2} \renewcommand{\pathmt}{\basepath}} % read the file, then reset \pathmt to \basepath

377:

378: \newcommand{\hide}[1]{[\small #1 \normalsize]}

379: \newcommand{\color}[2][1]{}

380: %\newcommand{\url}[1]{{\tt #1}}

381:

382: \article{10}{1}

383:

384: %\draft

385: %\labels

386:

387: %\ijcnn

388: %\renewcommand{\paragraph}[1]{{\bf #1}}

389: %\newcommand{\citeNP}[1]{\cite{#1}}

390: %\newcommand{\citeyear}[1]{\cite{#1}}

391: %\rfoot{}

392:

393:

394:

395: \newcommand{\df}{\d\!f}

396:

397: \title{\Large\textbf{On model selection and the disability of neural networks\\ to decompose tasks}}

398:

399: \author{\normalsize Marc Toussaint\\

400:  \sizeix Institut f\"ur Neuroinformatik, Ruhr-Universit\"at Bochum\\

401: \sizeix 44780 Bochum, Germany\\

402: \textit{\sizeix Marc.Toussaint@neuroinformatik.ruhr-uni-bochum.de}}

403:

404: \date{}

405:

406:

407:

408:

409: \begin{document}

410:

411: %\pagestyle{empty}

412: %\maketitle\thispagestyle{fancy}

413:

414: \twocolumn[\mytitle]\thispagestyle{fancy}

415:

416: \rhead{\it Proceedings of the International Joint Conference on Neural

417:   Networks (IJCNN 2002)}

418:

419:

420: \begin{abstract}%

421:   A neural network with fixed topology can be regarded as a

422:   parametrization of functions, which decides on the correlations

423:   between functional variations when parameters are adapted. We

424:   propose an analysis, based on a differential geometry point of view,

425:   that allows one to calculate these correlations. In practice, this

426:   describes how one response is unlearned while another is trained.

427:   Concerning conventional feed-forward neural networks we find that

428:   they generically introduce strong correlations, are predisposed to

429:   forgetting, and inappropriate for task decomposition.  Perspectives

430:   to solve these problems are discussed.

431:

432: %   Consider the functional responses of a neural network on two

433: %   different stimuli. When the weights are varied or adapted, we ask:

434: %   Do both of the responses vary? Do they always vary in correlation?

435: %   Or are their variations decorrelated, e.g., because they depend on

436: %   different weights?

437:

438: %   These questions actually address the way of parametrization of the

439: %   function space and may thus be embedded in the model selection

440: %   problem. We formalize these ideas and propose an analysis of the

441: %   parametrization that is based on a differential geometry point of

442: %   view and allows to predict the correlations in functional

443: %   variations. In practise, this allows a prediction of how one

444: %   response is unlearned while the other is trained.

445:

446: %   Finally, an important result is that we can generically classify

447: %   conventional feed-forward neural networks as introducing much

448: %   correlations, being predisposed to forgetting, and inappropriate for

449: %   task decomposition. Perspectives to solve these problems are

450: %   discussed.

451: \end{abstract}

452:

453:

454:

455:

456: \section{Introduction}\label{intro}

457:

458: Following Kearns et al.\ \citeyear{kearns:95}, the problem of model

459: selection may be defined as follows: Given a finite set of data

460: points, find a function (or conditional probability distribution, also

461: called hypothesis) such that the expected generalization error is

462: minimized.  Typically, the search space $\FF$ (the space of functions

463: or conditional probability distributions) is assumed to be organized

464: as a nested sequence of subspaces $\FF_1 \subseteq \dots \subseteq \FF_d

465: \subseteq \dots \subseteq \FF$ of increasing complexity.  For instance,

466: the index $d$ may denote the number of parameters or the

467: Vapnik-Chervonenkis dimension \cite{vapnik:95}. Finding the function

468: with minimal generalization error then amounts to finding the

469: appropriate sub-search-space before applying ordinary optimization

470: schemes.  Many approaches introduce a penalty term related to

471: complexity which has to be minimized together with the training error.

472: Penalty terms are, for example, the number of parameters of the model,

473: the number of \emph{effective} model parameters, the

474: Vapnik-Chervonenkis dimension, or the description length

475: \cite{akaike:74,amari:93,moody:91,rissanen:78,vapnik:95}. An

476: alternative based on geometric arguments is presented by

477: Schuurmans \citeyear{schuurmans:97}.

478:

479: The emphasis of our investigations is different to these classical

480: approaches.  The choice of a specific model (e.g., a neural network)

481: to represent a function has \emph{two} implications: it defines the

482: space $\FF_d$ of representable functions, but it also defines a

483: \emph{parametrization} of this space, where parametrization is not

484: meant in the sense of `finding parameters' but in the sense of

485: introducing coordinates on that space, i.e., introducing a mapping

486: $\P:\, \RRR^m \to \FF_d$ from some coordinate space $\RRR^m$ onto the

487: sub-search-space. To avoid confusion, we use the term \emph{model

488:   class} for the sub-search-space $\FF_d$, and \emph{model

489:   parametrization} for the parametrization $\P$ of this

490: sub-search-space.  For example, an artificial neural network with $m$

491: free parameters, fixed topology, and fixed activation functions

492: defines a model class (the subspace of functions it can

493: realize---which, if the topology is appropriate, includes an

494: approximation of any function \cite{hornik:89}) but it also defines a

495: model parametrization (the mapping from its parameters to the

496: corresponding function).

497:

498: Our emphasis is on the implications of a specific model

499: parametrization instead of the choice of a certain model class. It is

500: important to have a closer look at this parametrization in order to

501: allow for an analytical description of the adaptation dynamics, rather

502: than just analyzing the complexity of a model class. In particular,

503: the precise relation between variations of parameters and functional

504: variations of the system is of fundamental interest because it

505: decides, e.g., on the way of ``extrapolation'', or on how the system

506: forgets previously learned data. This relation can be derived from the

507: model parametrization and our goal is to extract such features

508: analytically.  We focus on forgetting as a specific character of

509: adaptation dynamics and develop an analysis of the model

510: parametrization that allows us to approximate the rate of forgetting.

511: This analysis is based on a differential geometry point of view and is

512: related to a large pool of research, including the discussions of

513: \emph{cross-talk} \cite{jacobs:90} and \emph{catastrophic forgetting}

514: \cite{french:99}, the information geometry point of view on parameter

515: adaptation \cite{amari:00}, and perfectly analogous ideas in the

516: context of evolutionary adaptation \cite{toussaint:01}.  Section

517: \ref{ana} includes a discussion of these relations.

518:

519: We apply our method of analyzing the model para\-metrization on the

520: class of standard feed-forward neural networks (FFNNs). We find that

521: the variety of FFNNs with arbitrary topology is actually not a great

522: variety with respect to certain characters of the model

523: parametrization. In particular, FFNNs generically introduce strong

524: correlations between functional variations and thereby are predisposed

525: to forget previously learned data.  Hence, using FFNNs as a function

526: model means a limitation---not with respect to representable functions

527: but with respect to learning characteristics. A simple example

528: compares a standard FFNN with a network that includes competitive

529: interactions.  The results validate our analytical predictions and

530: illustrate their implications. We conclude that a generalization of

531: the class of FFNNs is necessary and that the introduction of

532: competitive interactions between neurons is a promising approach to

533: solve these problems.

534:

535: Section \ref{def} introduces the formalism our investigations

536: are based on and, in section \ref{ana}, we describe the analysis of

537: the model parametrization. Section \ref{emp} presents the examples

538: and in section 5 we give an outlook concerning the evolutionary

539: perspective on model selection and discuss the relevance of the

540: limitedness of FFNN models. The conclusion follows up.

541:

542:

543:

544:

545: \section{Definitions}\label{def}

546:

547: \subsection{The functional point of view}

548:

549: Let $\FF$ be the search space. Here, $\FF$ shall be the space of all

550: functions mapping from a finite space $X$ to $Y \subseteq \RRR^n$.

551: However, all results can be transferred to the search space of

552: conditional probabilities, as we discuss below.

553:

554: The space of functions $f:\, X \to Y$ can be written as $Y^X$, which

555: is isomorphic to $\RRR^{n\cdot|X|}$. Thus, let a function $f \in

556: Y^X$ be represented by $n\!\cdot\!|X|$ components $f^a \in \RRR$,

557: where the index $a$ refers to a specific point in $X$ \emph{and} a

558: $Y$-dimension. (The components $f^a$ may be regarded as entries of a

559: lookup-table representation of $f$.) On this representation, we

560: describe an online adaptation step as a probabilistic transition to a

561: new function as follows: Assume that adaptation is initiated by the

562: observation of a target value $t^a$ for a functional component $f^a$.

563: A transition occurs as a variation $\df \in \RRR^{n|X|}$ with

564: probability $p(\df\, |\, f^a,t^a)$. The interesting point is that

565: functional components of which no target value has been observed may

566: vary as well. Let $a$ be a random variable and consider the density

567: $p(\df)=\sum_a p(\df\, |\, f^a,t^a)\, p(a)$. We will refer to the respective

568: covariance between two variation components as the \emph{functional

569:   covariance matrix}

570: \begin{align}

571:   C^{bc} := \cov_{p(\df)}(\df^b,\df^c) \;.

572: \end{align}

573:

574: This matrix is a first order description of how the adaptation of the

575: observed functional component results in a \emph{coadaptation} of a

576: functional component which has not been observed. For example,

577: assuming a linear dependence between $\df^a$ and $\df^b$, we have

578: $\df^b \stackrel{\cdot}= \<\df^b\> + \frac{C^{ab}}{\s^2}\, \big( \df^a

579: - \<\df^a\>\big)$, where $\s^2$ is the variance of $\df^a$. Whether

580: this coadaptation is desirable or not depends on the problem.

581: Coadaptation is also an explicit description of the ``way of

582: generalization''\footnote{By ``way of generalization'' we do not refer

583:   to the generalization error but to the way of extrapolation from

584:   observed data to unobserved.}: unobserved functional components

585: (i.e., the functional response on stimuli that have not been observed)

586: are coadapted depending on the adaptation of observed functional

587: components.  In general, one would like to choose from a variety of

588: different coadaptation schemes, i.e., one would like to select a model

589: from a variety of models with different kinds of coadaptation. We will

590: find that this refers to the selection of a model parametrization.

591:

592: When the set of functional components can be separated in two disjoint

593: subsets such that $C^{ab}$ vanishes for two components $f^a$ and $f^b$

594: of different subsets, then we speak of \emph{adaptation

595:   decomposition}. During online learning, adaptation decomposition

596: means that the development of two such components during successive

597: adaptation is not correlated. In terms of homogeneous Markov

598: processes, successive adaptation is described by the transition

599: probability $p(\df\, |\, f^a,t^a)$ (assuming that the draw of $a$ from

600: $p(a)$ is independent at each time), and adaptation is decomposed if

601: $p(\df^a,\df^b)=p(\df^a)\, p(\df^b)$.

602:

603:

604:

605: \subsection{The parameter point of view}

606:

607: We now address the \emph{modeling} of functions. Let $\P$ be a

608: $m$-dimensional, differentiable parametrization of a subset $\P(W)$

609: of functions:

610: \begin{align}

611: & \P:\, W \to \FF \comma W\subseteq\RRR^m \;,\\

612: & \P(W) := \bigcup_{w \in W} \{\P(w)\} \quad \subseteq \FF \;.

613: \end{align}

614: We call $\P$ the \emph{model parametrization} and $\P(W)$ the

615: \emph{model class}.  In terms of differential geometry, $\P$ is the

616: inverse of a coordinate map (or chart, or atlas) for $\P(W)$. Since

617: this map is differentiable, it induces a metric on $\P(W)$ if one on

618: $W$ is given and vice versa. We define the \emph{functional metric}

619: $g^{ab}(w)$ on $\P(W)$ as the lift of the Euclidean metric on $W$,

620: \begin{align}

621: g^{ab}(w) := \sum_i \frac{d \P(w)^a}{d w^i}\, \frac{d \P(w)^b}{d w^i} \;;

622: \label{funcMet}

623: \end{align}

624: and we define the \emph{parameter metric} $g_{ij}(w)$ on $W$ (actually

625: on the \emph{dual} tangent spaces of $W$) as the pull-back of the

626: Euclidean metric on $\P(W)$,

627: \begin{align}

628: g_{ij}(w) := \sum_a \frac{d \P(w)^a}{d w^i}\, \frac{d \P(w)^a}{d w^j} \;.

629: \label{parMet}

630: \end{align}

631: As usual in differential geometry, the metrics depend on the locality

632: given by $w$. These metrics describe the relation between parameter

633: variations and functional variations as we explore in more detail in

634: the next section.

635:

636:

637:

638:

639: \section{Analysis of the model para\-metrization}\label{ana}

640:

641: In the previous section we defined the functional covariance matrix $C^{ab}$ on

642: the functional level. Now we analyze what the choice of a model

643: parametrization $\P$ implies on this functional level. Given $\P$ and

644: parameters $w$, we write $f^a=\P(w)^a$. Assume that a target $t^a$ was

645: observed and adaptation of the parameters takes place by a gradient

646: descent,

647: \begin{align}

648: \d w^i = 2 \a\; \frac{d f^a}{d w^i}\, (t^a - f^a) \;,

649: \end{align}

650: which corresponds to the gradient of the squared error multiplied by

651: an adaptation rate $\a$. In first order approximation, this induces a

652: functional variation

653: \begin{align}

654: \df^b

655:   = \sum_i \frac{d f^b}{d w^i}\, \d w^i

656:   = 2 \a\; g^{ab}\, (t^a - f^a) \;,

657: \label{deltaF}

658: \end{align}

659: using definition (\ref{funcMet}). Thus, the functional metric $g^{ab}$

660: describes the variation of a functional component $f^b$ when $t^a$ is

661: observed.  This gives a first order description of coadaptation and of

662: how the model generalizes the experience of a target value $t^a$ in

663: order to adapt also functional components $f^b$. In this approximation

664: the functional covariance reads

665: \begin{align}\label{covari}

666: C^{bc} = 4 \a^2\; \sum_a p(a)\; g^{ba}\, g^{ca}\, (t^a - f^a)^2 -\<\df^b\>\<\df^c\>\;.

667: \end{align}

668: To discuss this expression, let us assume that the second term

669: vanishes, $\<\df^b\>\<\df^c\>=0$. Concerning the first term, the

670: product $g^{ba}\, g^{ca}$ vanishes for all $a$ if and only if the

671: functional metric is a block matrix and $b$ and $c$ refer to different

672: blocks:

673: \begin{align*}

674: g^{ab} = \left(\begin{array}{cc}

675:            A \in \RRR^{\m\times\m} & 0\\

676:            0 & B\in \RRR^{\n\times\n}

677:     \end{array}\right) \comma b\le\m \comma c>\m \;,

678: %\bigg(\!\begin{array}{c}\text{\input{blocks1.fig}}\end{array}\!\!\!\!\!\bigg)

679: \end{align*}

680: where $A$ and $B$ are arbitrary symmetric matrices and

681: $\m+\n=n\cdot|X|$. Thus, adaptation is decomposed into two subsets of

682: functional components exactly if the functional metric is a block

683: matrix and the functional component subsets correspond to these

684: blocks.\footnote{Note the relation to group theory: A group

685:   representation is said to be reducible if all group generators can

686:   be represented as a block matrix (such that all of them fit in the

687:   same block template). On this basis, physics defines the notion of

688:   an elementary particle as corresponding to an irreducible

689:   representation, whereas physical systems that correspond to a

690:   reducible representation (a block matrix) are considered as

691:   \emph{composed} of particles. A system of which the adaptation

692:   dynamics (instead of physical interactions) can be decomposed in the

693:   sense of a block matrix can analogously be thought of as composed of

694:   subsystems.

695:

696:   More formally, the observation of a target $t^a$ can be identified

697:   with an element of a group that applies on the functional

698:   components. Adaptation dynamics is now interpreted as successive

699:   application of group elements. The group representation (i.e., the

700:   way the group elements apply on the functional components) is

701:   determined by the model parametrization. If adaptation is

702:   decomposed, this representation is reducible.}

703:

704:

705:

706:

707:

708: \subsection{Reference to related research}

709:

710:

711: \paragraph{Cross-talk.}

712: The inspiring work by Jacobs et al. \citeyear{jacobs:90} discusses the

713: implication of the choice of a multi-expert model on the learning

714: speed and generalization behavior. They formulate the idea of spatial

715: and temporal crosstalk, which denotes the statistical dependence

716: between the states of two different neurons or between the states of a

717: neuron at two different times. In our formalism, this crosstalk is

718: captured by the functional covariance---spatial for two indices $a$

719: and $b$ belonging to the same input $x \in X$, and temporal for two

720: indices of different inputs. They argue that such a crosstalk may be

721: undesirable and is avoided by explicitly separating neurons in

722: disjoint experts. As we will see below, selecting a multi-expert model

723: is a very intuitive way to explicitly declare an independence of

724: functional components and realize decomposed adaptation. In fact, the

725: separation into experts corresponds to a block matrix type functional

726: metric. (If the gating is also adaptive, the functional metric is

727: actually not a completely clean block matrix.)

728:

729: In the context of artificial neural networks, the term

730: \emph{catastrophic forgetting} has been used to describe negative

731: effects of coadaptation. See \cite{french:99} for a review.

732:

733: \paragraph{Information geometry.}

734: The methods applied in this paper are related to information geometry.

735: Let $Y=S_\n=[0,1]^{2^\n-1}$ be the $2^\n-1$ dimensional manifold of

736: probability distributions over $\{0,1\}^\n$ as defined by Amari

737: \citeyear{amari:00}. Then, the search space $\FF$ of mappings $X \to Y$

738: is the space of all conditional probabilities $p(y|x)$, $x\in X, y\in

739: Y$. Usually, one assumes the Fisher metric on $\FF$, not the

740: Euclidean. Thus, we would have to change the definition (\ref{parMet})

741: of the parameter metric into

742: \begin{align}

743: g_{ij}(w)=E\left[

744:   \frac{\del\log p(x,y;w)}{\del w^i}\;

745:   \frac{\del\log p(x,y;w)}{\del w^j}\right]\;,

746: \end{align}

747: where $E[.]$ denotes the expectation and $p(x,y;w)=p(y|x;w)\,p(x)$,

748: $p(y|x;w)=\P(w) \in \FF$. Amari \citeyear{amari:98} uses this metric

749: to define the natural gradient descent on the parameter space (which

750: actually is the covariant derivative instead of the contravariant).

751: The use of the natural gradient can also be motivated by a

752: spatio-temporal decorrelation \cite{choi:00}.

753:

754:

755: \paragraph{Evolutionary computation.}

756: It seems that in the field of evolutionary computation the discussion

757: of the covariance structure in the search space is much more

758: elaborate than in the field of neural computation (see

759: \citeNP{toussaint:01}). Roughly speaking, the goal of evolutionary

760: computation is to maximize the probability of good mutations during

761: evolutionary search. Eventually, fitness requires some phenotypic

762: traits to be mutated in correlation. Such correlations (coadaptation)

763: may be modeled explicitly in the search density of evolutionary

764: algorithms \cite{baluja:97,hansen:01,muehlenbein:99,pelikan:99}.

765: Alternatively, they may be induced implicitly by the choice of a good

766: parametrization of phenotypic traits---by a genotype-phenotype

767: mapping, which is in perfect analogy to the model parametrization

768: $\P$. Many research efforts focus on the choice or the understanding

769: of the genotype-phenotype mapping

770: \cite{stephens:99,toussaint:01,wagner:96}.  In this view, functional

771: components $f^a$ may be compared to phenotypic traits, whereas

772: parameters relate to the genotype.

773:

774:

775:

776:

777:

778: \section{Example}\label{emp}

779:

780: Our test of the learning behavior is very simple: a regression of only

781: two patterns in $\{0,1\}^3$ has to be learned by mapping the first

782: pattern on $+1$ and the second on $-1$. However, we impose that these

783: patterns have to be learned \emph{online} where they alternate only

784: after they have been exposed for $100$ times in

785: succession.\footnote{This task is not meant as a performance test but

786:   as an experimental setup to test our analytical methods. However,

787:   similar effects of learning and unlearning occur in online learning

788:   when a specific response is unlearned during the course of training

789:   other responses for several time steps. In real world simulations it

790:   is also plausible that stimuli remain unchanged for many time

791:   steps.} We test two systems on this task: a standard feed-forward

792: neural network as described in detail in table \ref{FFN}, and a system

793: that involves a softmax layer as described in table \ref{monet}. The

794: parameters of both systems are initialized randomly by the normal

795: distribution $\NN(0,0.1)$ around zero with standard deviation $0.1$.

796: The two patterns were chosen as $110$ and $010$. Learning is realized

797: by a slow gradient descent with adaptation rate $2\cdot10^{-3}$ and

798: momentum $0.5$. The metric components are calculated from the

799: gradients.

800:

801: \begin{table}[t]

802:   \fbox{\begin{colpage} The feed-forward neural network we investigate

803:     here is 3-4-1-layered; layers are completely connected; the output

804:     neurons are linear, the hidden ones implement the sigmoid

805:     $\frac{1}{1+\exp(-10\, x)}$; only the hidden neurons have bias

806:     terms.

807: \end{colpage}}

808: \caption{The Standard model}

809: \label{FFN}

810: \end{table}

811:

812: \begin{table}[t]

813:   \fbox{\begin{colpage} The softmax model is the same as the standard

814:     model with the exception that the four neurons in the hidden layer

815:     \emph{compete} for activation: their output activations $y_i$ are

816:     given by

817: \begin{align}

818: y_i=\frac{e^{30\, x_i}}{X} \comma

819: &x_i=\sum_{j \in\, \text{input}} w_{ij} y_j + w_i \;,\feed

820: &X=\sum_{i \in\, \text{hidden}} e^{30\, x_i} \;.

821: \label{gating}

822: \end{align}

823: Here, $w_{ij}$ and $w_i$ denote weight and bias parameters. The

824: exponent factor $30$ may be interpreted as rather low temperature,

825: i.e., high competition. The calculation of the gradient is a little

826: more involved than ordinary back-propagation but straightforward and

827: of the same computational cost (see \citeNP{toussaint:02b}).

828: \end{colpage}}

829: \caption{The Softmax model}

830: \label{monet}

831: \end{table}

832:

833: \begin{figure}[t]\centering

834: \psfrag{g_00}{\!\small $g^{00}$}

835: \psfrag{g_01}{\!\small $g^{01}$}

836: \psfrag{g_11}{\!\small $g^{11}$}

837: \psfrag{empirical}{\small measured}

838: \psfrag{estimated}{\small calculated}

839: \psfrag{trained}{\small trained}

840: \psfrag{untrained}{\small untrained}

841: %\includegraphics[scale=0.36]{data/forget3c.eps}

842: \includegraphics[width=\columnwidth]{forget3c.eps}

843: \caption{\emph{Test of the standard model.}\newline

844:   \small For all four graphs the abscissa denotes the time

845:   step.\newline \emph{Top:} The learning curves (errors) with respect

846:   to both patterns are displayed. Only one of the patterns is

847:   trained---alternating every 100 time steps.  The error of the

848:   untrained pattern increases.\newline \emph{Second:} The slope

849:   (change of error per time step) of the untrained learning curve is

850:   displayed. The dotted line refers to the measured slope of the upper

851:   curve, the normal line is calculated according to equation

852:   (\ref{deltaF}).\newline \emph{Third:} The slope (measured and

853:   calculated) of the trained learning curve.\newline \emph{Bottom:}

854:   The three components of the functional metric $g^{00}$, $g^{01}$,

855:   $g^{11}$ are displayed in logarithmic scale. In particular the

856:   cross-component $g^{01}$ is clearly non-vanishing.}

857: \label{curvesStd}

858: \end{figure}

859:

860:

861: \begin{figure}[t]\centering

862: \psfrag{g_00}{\!\small $g^{00}$}

863: \psfrag{g_01}{\!\small $g^{01}$}

864: \psfrag{g_11}{\!\small $g^{11}$}

865: \psfrag{empirical}{\small measured}

866: \psfrag{estimated}{\small calculated}

867: \psfrag{trained}{\small trained}

868: \psfrag{untrained}{\small untrained}

869: %\includegraphics[scale=0.36]{data/forget2d.eps}

870: \includegraphics[width=\columnwidth]{forget2d.eps}

871: \caption{\emph{Test of the softmax model.} \newline

872:   \small\emph{Top:} The learning curves (errors) with respect to both

873:   patterns are displayed. The untrained pattern is scarcely

874:   forgotten.\newline \emph{Second:} The slope (measured and

875:   calculated) of the untrained learning curve nearly vanishes.\newline

876:   \emph{Third:} The slope (measured and calculated) of the trained

877:   learning curve.\newline \emph{Bottom:} The three components of the

878:   functional metric $g^{00}$, $g^{01}$, $g^{11}$ (in logarithmic

879:   scale). The cross-component $g^{01}$ is small; it decreases

880:   significantly at time step 200.}

881: \label{curvesSoft}

882: \end{figure}

883:

884:

885: Please see Figures \ref{curvesStd} and \ref{curvesSoft} for the

886: results. For the standard neural model we observe some forgetting of

887: the untrained pattern during the training of the other. For the

888: softmax model, the error of the untrained pattern hardly increases.

889: The rate of forgetting, given by the slope of the error curve, is well

890: described by equation (\ref{deltaF}) and demonstrated by the graphs in

891: the middle. The bottom graphs display the functional metric components

892: and generally exhibit that the cross-component $g^{01}$, which is

893: responsible for coadaptation and forgetting, is quite large for the

894: standard model compared to the softmax model. Further, the softmax

895: model seems to learn the adaptation decomposition, as defined in

896: section \ref{def}, after the 200th time step. All these results reveal

897: that the standard model is not well-suited to solve the simple task

898: given and that the analysis of the model's functional metric provides

899: a formal way of understanding this phenomenon. Remarkably also, the

900: components $g^{00}$ and $g^{11}$ become significantly greater than $1$

901: during the training phase of the respective functional component. By

902: equation (\ref{deltaF}), this means that the ``effective'' adaptation

903: rate is larger than $2\cdot10^{-3}$.

904:

905:

906: One might object that the results given above rely on the random

907: initialization and on the specific task we chose. To analyze both

908: types of models in a more general way we perform another test. We

909: investigate the distribution of the functional metric components when

910: parameters are normally distributed by $\NN(0,0.1)$.  Figure

911: \ref{distri} shows the distributions for both models. Clearly, the

912: standard model exhibits a Gauss-like distribution of the

913: cross-component $g^{01}$ with mean around $1.5$; a vanishing

914: cross-component $g^{01}$ is not very likely.  On the other hand, the

915: softmax model exhibits two strong peaks at $g^{01}=0$ and $g^{01}=1$,

916: such that the probability for $g^{01}<0.1$ is larger than 10\%. These

917: distributions are generic properties of the two models.

918:

919:

920:

921:

922: \begin{figure}[t]\centering

923: \psfrag{g00}{\!\small $g^{00}$}

924: \psfrag{g01}{\!\small $g^{01}$}

925: \psfrag{g11}{\!\small $g^{11}$}

926: \includegraphics[width=\columnwidth]{stat2.eps}

927: \caption{\emph{Distribution of metric components.}\newline

928:   The distribution was calculated as a histogram of $1$ million

929:   samples by using bins of size $\frac{1}{100}$. The ordinate is

930:   scaled in ``percent of samples that fell into the bin''.\newline

931:   \emph{Top:} The standard model. The probability of vanishing

932:   cross-component $g^{01}$ is very small.\newline \emph{Bottom:} The

933:   softmax model. The inset graph is in logarithmic scale. The

934:   probability of vanishing cross-component $g^{01}$ is fairly high.}

935: \label{distri}

936: \end{figure}

937:

938:

939:

940:

941:

942: \section{Toward evolutionary model selection}

943:

944: Finally, the question of how to select an appropriate model has not

945: yet been addressed. As discussed in the introduction, classical

946: approaches to model selection commonly introduce a penalty term in

947: order to reduce the model's complexity. Following this tradition we

948: could introduce a penalty term that reduces forgetting. Consider

949: \begin{align}

950: \sum_{ab} (g^{ab})^2 - \sum_a (g^{aa})^2 \;.

951: \end{align}

952: This is a measure of the cross-components in the functional metric.

953: Unfortunately, we cannot present any experiments with this model

954: selection criterion here. This approach is postponed to future

955: research.

956:

957: The original motivation for this work, though, was not to develop a

958: new model selection criterion as given by the above penalty term.

959: Instead we believe that the evolution of neural networks, as it

960: has recently become an elaborate branch of research (see \citeNP{yao:99}

961: for a review), is actually a promising method of model selection.

962: However, most of these approaches focus on standard neural models,

963: i.e., the evolutionary search space is the space of ordinary

964: feed-forward neural networks (FFNNs) with arbitrary topology. The

965: belief is that the variety of topologies offers a variety of

966: functionally different models. The present paper is a critique of this

967: belief because it supports the claim that the functional metric inherent in

968: FFNNs comprises significantly non-vanishing cross-components. This

969: implies that the variety of FFNNs with arbitrary topology is actually

970: not a great variety with respect to the functional metric. E.g., it

971: hardly includes models with vanishing cross-components and low rate of

972: forgetting. In conclusion, the search space has to be generalized to

973: contain also models with arbitrary functional metric in order to allow

974: for the selection of more optimal models. The presented softmax model

975: involving competitive interactions between neurons is a step in this

976: direction, but much motivation is left for future research toward the

977: generalization of the model search space and evolutionary methods to

978: select good models from this great variety. The model presented in

979: \cite{toussaint:02b} is one approach.

980:

981:

982:

983:

984: \section{Conclusion}

985:

986: We developed a new analytical approach to characterize a function

987: model and describe its learning properties. We focussed on functional

988: correlations in the adaptation process and derived the relation to the

989: functional metric of the model parametrization. The analysis can in

990: principle be applied to any kind of differentiable model (also

991: probabilistic, when formulated in terms of information geometry). Our

992: empirical studies illustrate the approach and demonstrate that

993: conventional neural network models are rather limited with respect to

994: their adaptation behavior: a task separation, i.e., decorrelated

995: adaptation to decorrelated data, is hardly possible. In contrast, a

996: model involving competitive interactions is more predisposed for task

997: decomposition. Thus, as we pointed out in the previous section, the

998: evolutionary approach to model selection should generalize the search

999: space to include not only standard feed-forward neural networks, but

1000: also models with arbitrary functional metrics, e.g., by allowing for

1001: competitive interactions.

1002:

1003:

1004: \subsection*{Acknowledgment}

1005:

1006: The author acknowledges support by the German Research Foundation DFG

1007: under grant \emph{SoleSys} SE 251/41-1.

1008:

1009: \small

1010: \bibliography{/home/mt/bibtex/bibs}

1011: \end{document}

1012: