0107:cs0107025/corr.tex

1: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

2: %

3: % File:         Expansions/OnLine/Texte/jacm.tex

4: %

5: % Author(s):    Marc Daumas

6: %               <Marc.Daumas@Lip.Ens-Lyon.Fr>

7: %               Laboratoire de l'Informatique du Parallelisme

8: %

9: % Last modified on Fri May 05 15:10:03 GMT MET DST 2000 by daumas

10: %

11:

12: %%%%%%%%%%%%%%%%

13: %%%%% TODO %%%%%

14: %%%%%%%%%%%%%%%%

15: % Rechercher les TODO

16:

17: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

18:

19: \documentclass[a4paper]{article}

20: %\documentclass[a4paper,twoside]{lipreport}

21:

22: %\usepackage{RR}

23:

24: \usepackage[latin1]{inputenc}

25: \usepackage[T1]{fontenc}

26: \usepackage[english]{babel}

27:

28: %\usepackage{styles}

29: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

30: %

31: % File:         styles.sty

32: %

33: % Author(s):    Marc Daumas

34: %               <Marc.Daumas@Lip.Ens-Lyon.Fr>

35: %               Laboratoire de l'Informatique du Parallelisme

36: %

37: % Last modified on Thu Jan 28 14:33:02 MET 1999 by daumas

38: %      modified on Wed Feb 26 09:02:03 MET 1997 by mdaumas

39: %

40:

41: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

42: %

43: % Detection LaTeX vs. LaTeX2e

44: %

45:

46: \ifx\usepackage\undefined

47: \def\inca#1{\input #1.sty}

48: \def\incb#1#2{\input #1.sty}

49: \else

50: \newcommand{\inca}[1]{\usepackage{#1}}

51: \newcommand{\incb}[2]{\usepackage[#2]{#1}}

52: \fi

53:

54: %\if@compatibility

55: %\def\inca#1{\input #1.sty}

56: %\def\incb#1#2{\input #1.sty}

57: %\fi

58:

59: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

60: %

61: % Styles usuels dans un ordre que LaTeX accepte

62: %

63:

64: \ifx\pdfoutput\undefined

65:   \incb{graphics}{dvips}

66:   \incb{epsfig}{dvips}

67:   \def\dvipsselector#1#2{#1}

68:   \def\dvipsoption{dvips}

69: \else

70:   \incb{graphics}{pdftex}

71:   \def\dvipsselector#1#2{#2}

72:   \def\dvipsoption{pdftex}

73: \fi

74:

75: %\inca{tgrind}

76: \inca{boxedminipage,url}

77: \inca{amstext,amsfonts,latexsym}

78: \ifx\usepackage\undefined\else\inca{latexsym}\fi

79: \inca{epic}

80: % \inca{eepic}

81:

82: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

83: %

84: % Macro personnelles

85: %

86:

87: %\inca{macro}

88: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

89: %

90: % File:         macro.sty

91: %

92: % Author(s):    Marc Daumas

93: %               <Marc.Daumas@Lip.Ens-Lyon.Fr>

94: %               Laboratoire de l'Informatique du Parallelisme

95: %

96: % Last modified on Wed Mar 01 12:44:17 GMT MET 2000 by daumas

97: %      modified on Fri Feb 28 16:22:11 MET 1997 by mdaumas

98: %

99:

100: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

101: %

102: % Quelques alias utiles

103: %

104:

105: \def\ov#1{\overline{#1}}

106: \def\mc#1#2#3{\multicolumn{#1}{#2}{#3}}

107: \def\ita{\vspace{-\itemsep}\item}

108:

109: % \newcommand{\interligne}[1]{\baselineskip = #1pt}

110: % \renewcommand{\baselinestretch}{1.5}

111:

112: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

113: %

114: % Environnements

115: %

116:

117: \ifx\frenchname\undefined

118: \newtheorem{defin}{Definition}

119: \newtheorem{prop}{Proposition}

120: \newtheorem{lem}{Lemma}

121: \newtheorem{theo}{Theorem}

122: \newtheorem{corol}{Corollary}

123: \newtheorem{conj}{Conjecture}

124: \newtheorem{observ}{Observation}

125: \newtheorem{exemple}{Example}

126: \else

127: \newtheorem{defin}{D�finition}

128: \newtheorem{prop}{Proposition}

129: \newtheorem{lem}{Lemme}

130: \newtheorem{theo}{Th�or�me}

131: \newtheorem{corol}{Corollaire}

132: \newtheorem{conj}{Conjecture}

133: \newtheorem{observ}{Observation}

134: \newtheorem{exemple}{Exemple}

135: \fi

136:

137: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

138: %

139: % Ensembles math�matiques usuels

140: %

141:

142: \def\R{\mathbb R}

143: \def\Q{\mathbb Q}

144: \def\Z{\mathbb Z}

145: \def\N{\mathbb N}

146: \def\F{\mathbb F}

147: \def\M{\mathbb M}

148: \def\K{\mathbb K}

149: \def\C{\mathbb C}

150: \def\Fb{\overline{\mathbb F}}

151: \def\Rb{\overline{\mathbb R}}

152:

153: \ifx\mathbb\undefined

154: \def\mathbb{\Bbb}

155: \fi

156:

157: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

158: %

159: % Fonction math�matiques utilis�es en informatique

160: %

161:

162: \def\mod{\mathop{\rm mod}\nolimits}

163: \def\ulp{\mathop{\rm ulp}\nolimits}

164: \def\sign{\mathop{\rm sign}\nolimits}

165: \def\trunc{\mathop{\rm Trunc}\nolimits}

166: \def\tronc{\mathop{\rm Tronc}\nolimits}

167:

168: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

169: %

170: % Expressions math�matiques

171: %

172:

173: \def\ceil#1{ \left\lceil  #1 \right\rceil }

174: \def\floor#1{\left\lfloor #1 \right\rfloor}

175: \def\fl#1{\left\lfloor \log #1 \right\rfloor}

176: \def\cl#1{\left\lceil  \log #1 \right\rceil }

177:

178: \def\CG{\left[}   \def\CD{\right]}

179: \def\PG{\left(}   \def\PD{\right)}

180: \def\AG{\left\{}  \def\AD{\right\}}

181:

182: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

183: %

184: % Modes d'arrondi normalis�s IEEE

185: %

186:

187: \def\Rx{\mathop{\Box}\nolimits}

188:

189: \def\RO{\mathop{\bowtie}\nolimits}

190: \def\RU{\mathop{\triangle}\nolimits}

191: \def\RN{\mathop{\circ}\nolimits}

192: \def\RD{\mathop{\nabla}\nolimits}

193:

194: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

195: %

196: % Inclusion de figures

197: %

198:

199: \newenvironment{pfigure}{\begin{figure}}{\end{figure}}

200: \def\figdir{Fig} \def\figepsdir{\figdir}

201: \def\figext{\dvipsselector{pstex_t}{pdftex_t}}

202: \def\epsdir{../EPS}

203:

204: \newcommand{\figtex}[2]{%

205:   \begin{pfigure}

206:     \begin{center}

207:       \input \figdir/#1.\figext

208:     \end{center}

209:     \caption{#2}

210:     \label{fig/#1}

211:     \label{fig:#1}

212:   \end{pfigure}

213: }

214:

215: \newcommand{\figeps}[2]{%

216:   \begin{pfigure}

217:     \centerline{\epsfig{file=\epsdir/#1.eps}}

218:     \caption{#2}

219:     \label{fig/#1}

220:     \label{fig:#1}

221:   \end{pfigure}

222: }

223:

224: \newcommand{\figpic}[2]{%

225:   \begin{pfigure}

226:     \begin{center}

227:       \input \figdir/#1.eepic

228:     \end{center}

229:     \caption{#2}

230:     \label{fig/#1}

231:     \label{fig:#1}

232:   \end{pfigure}

233: }

234:

235: \newcommand{\figinc}[2]{%

236:   \begin{pfigure}

237:     \begin{center}

238:       \input \figdir/#1.tex

239:     \end{center}

240:     \caption{#2}

241:     \label{fig/#1}

242:     \label{fig:#1}

243:   \end{pfigure}

244: }

245:

246: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

247: %

248: % Dessins

249: %

250:

251: \newcommand{\pictex}[1]{%

252:   \begin{center}

253:     \input \figdir/#1.\figext

254:   \end{center}

255:   }

256:

257: \newcommand{\piceps}[1]{\centerline{\epsfig{file=\epsdir/#1.eps}}}

258:

259: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

260: %

261: % Encarts, listings

262: %

263:

264: \newcommand{\boxlabel}{}

265: \newcommand{\boxcaption}{}

266:

267: \newenvironment{encart}[2]{%

268:   \fboxsep=5mm

269:   \renewcommand{\boxlabel}{#1}

270:   \renewcommand{\boxcaption}{#2}

271:   \begin{pfigure}

272:     \begin{center}

273:       \begin{boxedminipage}{.9\textwidth}

274: }{%

275:         \vspace{-1.5\baselineskip}

276:         \hspace*{\fill}

277:       \end{boxedminipage}

278:     \end{center}

279:     \caption{\boxcaption}

280:     \label{enc/\boxlabel}

281:   \end{pfigure}

282: }

283:

284: \newenvironment{encart2}[2]{%

285:   \fboxsep=5mm

286:   \renewcommand{\boxlabel}{#1}

287:   \renewcommand{\boxcaption}{#2}

288:   \begin{pfigure}

289:     \begin{center}

290:       \begin{boxedminipage}{.475\textwidth}

291: }{%

292:         \vspace{-1.5\baselineskip}

293:         \hspace*{\fill}

294:       \end{boxedminipage}

295:     \end{center}

296:     \caption{\boxcaption}

297:     \label{enc/\boxlabel}

298:   \end{pfigure}

299: }

300:

301: \usepackage{times,palatino}

302: %\usepackage{fullpage}

303: %\usepackage{rri}

304: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

305: %

306: % File:         rri.sty

307: %

308: % Author(s):    Marc Daumas

309: %               <Marc.Daumas@Lip.Ens-Lyon.Fr>

310: %               Laboratoire de l'Informatique du Parallelisme

311: %

312: % Last modified on Wed Mar 01 12:44:17 GMT MET 2000 by daumas

313: %      modified on Fri Feb 28 16:22:11 MET 1997 by mdaumas

314: %

315:

316: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

317: %

318: % Environnements

319: %

320:

321: \ifx\titleLIP\undefined

322:   \ifx\RRetitle\undefined

323:     \newcommand{\RRItitle}[1]{}

324:     \newcommand{\RRItitre}[1]{}

325:     \newcommand{\RRIdate}[1]{}

326:     \newcommand{\RRIauthor}[1]{}

327:     \newcommand{\RRIthead}[1]{}

328:     \newcommand{\RRIahead}[1]{}

329:     \newcommand{\RRIabstract}[1]{\newcommand{\RRIpre}{\begin{abstract}#1\end{abstract}}}

330:     \newcommand{\RRIresume}[1]{}

331:     \newcommand{\RRIkeywords}[1]{}

332:     \newcommand{\RRImotscles}[1]{}

333:     \newcommand{\RRIbegin}{}

334:     \newcommand{\RRInumber}[1]{}

335:     \newcommand{\RRItheme}[1]{}

336:     \newcommand{\RRIprojet}[1]{}

337:     \newcommand{\RRIno}[1]{#1}

338:     \newcommand{\RRIyes}[1]{}

339:     \newcommand{\RRIinria}[1]{}

340:     \newcommand{\RRIlip}[1]{}

341:   \else

342:     \newcommand{\RRItitle}[1]{\RRetitle{#1}}

343:     \newcommand{\RRItitre}[1]{\RRtitle{#1}}

344:     \newcommand{\RRIdate}[1]{\RRdate{#1}}

345:     \newcommand{\RRIauthor}[1]{\RRauthor{#1}}

346:     \newcommand{\RRIthead}[1]{\titlehead{#1}}

347:     \newcommand{\RRIahead}[1]{\authorhead{#1}}

348:     \newcommand{\RRIabstract}[1]{\RRabstract{#1}}

349:     \newcommand{\RRIresume}[1]{\RRresume{#1}}

350:     \newcommand{\RRIkeywords}[1]{\RRkeyword{#1}}

351:     \newcommand{\RRImotscles}[1]{\RRmotcle{#1}}

352:     \newcommand{\RRIbegin}{\URRhoneAlpes \makeRR}

353:     \newcommand{\RRIpre}{}

354:     \newcommand{\RRInumber}[1]{}

355:     \newcommand{\RRItheme}[1]{\RRtheme{#1}}

356:     \newcommand{\RRIprojet}[1]{\RRprojet{#1}}

357:     \newcommand{\RRIno}[1]{}

358:     \newcommand{\RRIyes}[1]{#1}

359:     \newcommand{\RRIinria}[1]{#1}

360:     \newcommand{\RRIlip}[1]{}

361:     \newcommand{\french}{}

362:     \newcommand{\english}{}

363:   \fi

364: \else

365:     \usepackage{fancyheadings}

366:     \newcommand{\RRItitle}[1]{\titleLIP{#1}}

367:     \newcommand{\RRItitre}[1]{}

368:     \newcommand{\RRIdate}[1]{\dateLIP{#1}}

369:     \newcommand{\RRIauthor}[1]{\authorLIP{#1}}

370:     \newcommand{\RRIthead}[1]{\newcommand{\RRItheadv}{#1}}

371:     \newcommand{\RRIahead}[1]{\newcommand{\RRIaheadv}{#1}}

372:     \newcommand{\RRIabstract}[1]{\abstractLIP{#1}}

373:     \newcommand{\RRIresume}[1]{\resumeLIP{#1}}

374:     \newcommand{\RRIkeywords}[1]{\keywordsLIP{#1}}

375:     \newcommand{\RRImotscles}[1]{\motsclesLIP{#1}}

376:     \newcommand{\RRIbegin}{\rapportLIP\pagenumbering{arabic}\pagestyle{fancy}\rhead[\RRIaheadv]{\thepage}\lhead[\thepage]{\RRItheadv}\cfoot{}}

377:     \newcommand{\RRIpre}{}

378:     \newcommand{\RRInumber}[1]{\numberLIP{#1}}

379:     \newcommand{\RRItheme}[1]{}

380:     \newcommand{\RRIprojet}[1]{}

381:     \newcommand{\RRIno}[1]{}

382:     \newcommand{\RRIyes}[1]{#1}

383:     \newcommand{\RRIinria}[1]{}

384:     \newcommand{\RRIlip}[1]{#1}

385: \fi

386:

387:

388: \usepackage{color,amssymb}

389:

390: \bibliographystyle{plain}

391:

392: \RRItitle{Computer validated proofs of \\ a toolset for adaptable arithmetic}

393:

394: \RRItitre{Bo�te � outils pour l'arithm�tique num�rique \\ valid�e par un preuve sur ordinateur}

395:

396: \RRIdate{Janvier 2001}

397:

398: \RRIauthor{%

399:   Sylvie Boldo\RRIinria{, LIP} \\

400:   Marc Daumas\RRIinria{, CNRS} \\

401:   Claire Moreau-Finot\RRIinria{, LIP} \\

402:   Laurent Th�ry\RRIinria{, INRIA \\~}

403: }%

404:

405: \RRIthead{Computer validated proofs of a toolset for adaptable arithmetic}

406: \RRIahead{Sylvie Boldo, Marc Daumas, Claire Moreau-Finot and Laurent Th�ry}

407:

408: \RRInumber{2001-01}

409:

410: \RRIabstract{%

411:   Most  existing  implementations  of  multiple  precision  arithmetic

412:   demand  that the  user  sets  the precision  {\em  a priori}.   Some

413:   libraries  are said  adaptable in  the sense  that  they dynamically

414:   change the precision of  each intermediate operation individually to

415:   deliver  the target  accuracy according  to the  actual  inputs.  We

416:   present in this text a new adaptable numeric core inspired both from

417:   floating point expansions and from on-line arithmetic.

418:

419:   The numeric core is cut down  to four tools.  The tool that contains

420:   arithmetic operations is proved to  be correct. The proofs have been

421:   formally checked  by the Coq  assistant.  Developing the  proofs, we

422:   have formally proved many results published in the literature and we

423:   have extended  a few of  them. This work  may let users  (i) develop

424:   application specific adaptable libraries  based on the toolset and /

425:   or (ii) write new formal proofs based on the set of validated facts.

426: }%

427:

428: \RRIresume{%

429:   La plupart  des codes �  pr�cision multiple existants  demandent que

430:   l'utilisateur sp�cifie la pr�cision des calculs {\em a priori}.  Une

431:   solution est d'utiliser pour les calculs interm�diaires la pr�cision

432:   la  plus  �lev�e  pour  atteindre  la  pr�cision  demand�e  pour  le

433:   r�sultat. On  dit que certaines biblioth�ques  sont adaptatives dans

434:   le sens  qu'elles changent dynamiquement la pr�cision  de chacun des

435:   calculs  interm�diaires  en  fonction  des  valeurs  effectives  des

436:   entr�es pour atteindre la pr�cision demand�e pour le r�sultat.  Nous

437:   pr�sentons  dans  ce  texte  un nouveau  noyau  num�rique  adaptatif

438:   inspir�  �  la  fois  des  expansions de  nombres  flottants  et  de

439:   l'arithm�tique en-ligne.

440:

441:   Le noyau  num�rique se d�compose  en cinq outils.  Le  premier outil

442:   qui contient de nombreuses op�rations arithm�tiques est prouv�.  Les

443:   preuves  ont  �t� valid�es  formellement  par  l'assistant Coq.   En

444:   d�veloppant  les  preuves, nous  avons  red�velopp� formellement  de

445:   nombreux r�sultats d�j� publi�s dans la litt�rature et nous en avons

446:   �tendu  certains.   Ce travail  permettra  aux  utilisateurs de  (i)

447:   d�velopper   des  biblioth�ques   adaptatives   sp�cifiques  �   une

448:   application bas�es  sur la  bo�te �  outils et /  ou (ii)  �crire de

449:   nouvelles preuves formelles bas�es sur notre socle de faits valid�s.

450: }%

451:

452: \RRIkeywords{Multiple  precision,  expansion,  on-line, formal  proof, Coq.}

453:

454: \RRImotscles{Pr�cision multiple, expansion, en-ligne, preuve formelle, Coq.}

455:

456: \RRItheme{2}

457:

458: \RRIprojet{Ar�naire}

459:

460: \RRIinria{\RRnote{%

461:     This  text  is  also  available   as  a  research  report  of  the

462:     Laboratoire    de     l'Informatique    du    Parall�lisme    {\tt

463:       http://www.ens-lyon.fr/LIP}.

464: }}%

465:

466: \begin{document}

467:

468: \RRIbegin

469:

470: \newtheorem{proc}{Lemma}

471: \newcommand{\ptys}[2]{{\noindent \bf{#1}}{\it{#2}}}

472:

473: \title{%

474:   {\bf

475:     Computer validated proofs of \\

476:     a toolset for adaptable arithmetic}\thanks{%

477:     Authors  may be  reached via  e-mail  at Sylvie.Boldo@ENS-Lyon.Fr,

478:     Marc.Daumas@ENS-Lyon.Fr,       Claire.Moreau@ENS-Lyon.Fr       and

479:     Laurent.Thery@INRIA.Fr.  \RRIlip{This text  is also available as a

480:       research  report  of  the  Institut  National  de  Recherche  en

481:       Informatique et en Automatique {\tt http://www.inria.fr}.}

482:     }%

483: }%

484:

485: \author{

486:   {\bf Sylvie Boldo, Marc Daumas, Claire Moreau-Finot} \\

487:   Laboratoire de l'Informatique du Parall�lisme        \\

488:   UMR 5668 - CNRS - ENS de Lyon - INRIA                \\[24pt]

489:   {\bf Laurent Th�ry}                                  \\

490:   INRIA Nice---Sophia-Antipolis

491: }%

492: \date{}

493:

494: \maketitle

495: \RRIpre

496:

497: \section*{Introduction}

498:

499: We will  first present examples of numerical  computations coming from

500: four  fields  of application.   With  each  example  we introduce  new

501: properties, new goals and some needed references.  As the subject will

502: become clear, we  will present the contributions of  this text and its

503: summary.

504:

505: \subsection*{Motivation}

506:

507: As an undergraduate student, we  have all learnt to compute a quantity

508: with a few significant digits first  as 5 miles is about 8 kilometers.

509: If the question is ``Are 4.995 miles more than 8 kilometers?'', we are

510: close to  a threshold. To answer  the question, we  would compute more

511: (less significant) digits  to find that 4.995 miles  are a little over

512: 38  meters   longer  than   8  kilometers.   We   might  first   do  a

513: multiplication of 5 by 1.6  and then the exact multiplication of 4.995

514: by 1.609344.  However, we could save that 5 times 1.6 is exactly 8 and

515: afterward  just multiply  5 by  9  to obtain  45 meters  with a  small

516: additional  value  that is  less  than  5  meters.  That  answers  the

517: question since 0.005 miles is less than 10 meters.

518:

519: Numerical  analysts have long  designed algorithms  that react  to the

520: accuracy aimed by the  user.  Countless specific algorithms reduce the

521: error on the final result by iterative refinement in solving algebraic

522: equations,  linear systems, ODEs,  PDEs and  so on  \cite{Hig96}. Such

523: methods are  so robust  that the final  solution can be  very accurate

524: even if the first attempt before any refinement is not close.

525:

526: All  the  above-mentioned   solutions  assume  that  the  intermediate

527: arithmetic operations are performed up  to a fixed level of precision.

528: This  level,  usually  the  one  given  by  machine  double  precision

529: arithmetic, is used for all  the operations and is sufficient to store

530: the result to  the accuracy aimed by the  user.  Recent work presented

531: in \cite{Hig96,SorTan91,Bar93}  proposes new enhanced  routines if two

532: levels of precision are available on the machine. The second level can

533: be obtained  by a careful use  of the tools presented  in this article

534: \cite{BolDau01}.  All  these algorithms use the  machine arithmetic as

535: an  error  prone  tool  that  must  be  circumvented  by  mathematical

536: analysis.

537:

538: % For the solutions presented above, adaptability is defined at the {\em

539: %   application level}, we will see  in the remaining of this text, {\em

540: %   arithmetic level} adaptability.

541:

542: \subsection*{Adaptability {\em versus} multiple precision}

543:

544: A  numerical  algorithm  is   only  slightly  modified  to  accommodate

545: adaptable arithmetic or  multiple precision arithmetic.  With multiple

546: precision arithmetic, each operator is  fired only once and the result

547: is computed  with the precision  fixed at compile time.   Static error

548: analysis is  used to guarantee that  the final result is  known to the

549: accuracy specified by  the user.  Being static, the  error analysis is

550: often too pessimistic.  As it is readily available, multiple precision

551: arithmetic is often chosen by users.

552:

553: When  a code  has  been  modified to  become  adaptable, every  single

554: operator is started with a very limited precision.  The runtime system

555: freezes the operator but it is able to resume its work when necessary.

556: The  program is  not ended  unless the  final result  is known  to the

557: accuracy specified by the user. The evaluation continues until this is

558: the  case,  with  the  system  increasing  automatically  the  working

559: precision of each operator independently.

560:

561:

562: Coherency  is key  to the  field of  computational geometry  where one

563: single tiny error  may change a convex hull  to a non-convex, possibly

564: non-planar,  graph.   Numerical  routines  are  used  to  answer  such

565: predicates as ``Is  a point in a circle defined by  3 other points?''.

566: A Boolean answer cannot be  approximated, it is either right or wrong,

567: and one single wrong answer might  lead a program to an abstract state

568: that cannot  occur in Euclidian geometry.  Recent  works have proposed

569: clever    implementations    of    multiple    precision    arithmetic

570: \cite{ForVan93,KarLiPecYap99}  to  answer  correctly such  questions.

571: They reinforce the existing  set of general purpose multiple precision

572: libraries \cite{Bre78a,SerVuiHer89,Bai93,Bai95,Gra2Kb}.  Some marginal

573: work does implement iterative  refinements of the geometric predicates

574: \cite{AvnBoiDevPreYvi97}  or change  the algorithm  to  work correctly

575: with limited predicates \cite{BoiPre2K}.

576:

577: Adaptability is used for our last example. It is the central scheme of

578: Ziv's paper \cite{Ziv91} where the author implements correctly rounded

579: elementary functions (circular and exponential).  Correct rounding for

580: the  floating   point  numbers   is  mathematically  defined   by  two

581: international      standards      (ANSI-IEEE-ISO~754      and      854

582: \cite{Ste.81,Ste.87,Gol91,CodKar.84}).  It  is a monotonous projection

583: from the  real set to the set  of the floating point  numbers.  If all

584: the  numbers of  a given  interval round  to the  same  floating point

585: value,  you can  safely  round this  interval  to the  common result.

586: Otherwise, you cannot round the interval.

587:

588: An approximated algorithm such as  the ones used for the evaluation of

589: the elementary  functions \cite{Mul97}  does not produce  directly the

590: rounded result  but it  computes an approximated  result and  an error

591: bound.  This pair defines an interval that contains the exact result.

592: \begin{itemize}

593: \item  If the interval  is narrow,  it rounds  to one  single floating

594:   point  value  that is  the  correctly  rounded  value of  the  exact

595:   elementary  function.

596: \item If we discover from  the current approximation that the interval

597:   contains a  discriminating point, the approximation  must be refined

598:   to  reduce its  width.  This  process continues  until  a satisfying

599:   approximation is obtained to round the interval to one single value.

600:   The set of the discriminating  point is the set of the representable

601:   numbers for the directed roundings and the set of the midpoints when

602:   rounding to the nearest.

603: \end{itemize}

604:

605: With the first  iteration of Ziv's scheme, machine  precision is used.

606: For the  result interval to  get sharper, the approximation  scheme is

607: only slightly modified  but the target precision is  increased and the

608: arithmetic  operations become  increasingly slow.  Cases where  a high

609: precision result  is needed are  very infrequent and the  running time

610: seen by  a user will most  probably be the  time for the first  or the

611: second iteration.

612:

613: Adaptable behaviour  is sometime recreated with Mathematica  or Maple.

614: These two softwares let the user change the working precision and they

615: provide some criteria to estimate the final accuracy of many numerical

616: routines \cite{ChaGedGonMonWat91}.  Users set  the number of digits of

617: the intermediate precision after a few educated random attempts.

618:

619: \subsection*{Prior art}

620:

621: \paragraph{Adaptability}

622:

623: Some  few  authors  already  presented  a  general  purpose  adaptable

624: arithmetic                                                      library

625: \cite{Boe87,Men94,Men95,BerDau97,MicMor97,BurFleMehSch99},   but  some

626: others stopped with one or two adaptable attempts followed by an exact

627: evaluation   \cite{ForVan93,She97}.     These   last   solutions   are

628: appropriate in many cases because  the first few attempts are fast and

629: they succeed  by far in the  most frequent case  for many applications

630: \cite{DevPre98}. These authors where  forced to stop because they used

631: a  data structure  that  is not  appropriate  for adaptability.   They

632: forced  one  or  two  rounds  of  adaptability  with  great  efforts.

633:

634: \paragraph{Floating point expansion}

635:

636: This  data structure was  introduced by  Priest \cite{Pri91}  based on

637: some earlier  operations \cite{Knu69,Knu73}.  Actually,  M{\o}ller and

638: Dekker were the first in the past to propose such techniques to extend

639: precision  on  the  floating  point unit  \cite{Mol65a,Mol65b,Dek71}.

640: Kahan  and  Pichat  applied  similar  techniques  for  other  purposes

641: \cite{Kah65,Pic72}.

642:

643:

644: Fast algorithms, tightly connected  to the machines, are possible with

645: IEEE-754 compatible commercial floating point units.  Shewchuk was the

646: first one  to present a working  library available on the  net with an

647: actual  application  to  computational  geometry  \cite{She96,She97}.

648: Assuming that  the floating point unit  is IEEE compatible  gives us a

649: powerful set of axiomatic properties on floating point operations.  We

650: have proposed in the past some algorithms and their implementation for

651: multiplication        and         division        on        expansions

652: \cite{Dau99a,DauFin98,DauFin99a}.  Developments are still undergone as

653: new research teams get  interested by this approach \cite{Sei2K}.  Our

654: contribution forces  us to define a  new kind of  expansions.  We call

655: them pseudo-expansions.

656:

657: % This     definition    yields     a     more    redundant     notation

658: % \cite{Mat82,NieKor99}.  Although  the proof of  \cite{Maz93a} does not

659: % apply here, it is sensible  that more redundancy in the notation eases

660: % the implementation  of most significant digit  first (MSDF) arithmetic

661: % \cite{Avi61,Mul94a,Fro91}.

662:

663: \paragraph{On-line arithmetic}

664:

665: We will  present in  this text that  we can  reduce the growth  of the

666: running time by setting up an appropriate data structure that will not

667: restart  an  adaptable evaluation  from  scratch  as the  intermediate

668: precision is modified.

669:

670: This  data   structure  is   inspired  by  on-line   arithmetic  where

671: adaptability is natural.  On line  arithmetic was targeted in the past

672: to   hardware  design   with  small   radices  (typically   2   or  4)

673: \cite{TriErc77,CorDupHocMul91,LouErc93,DauMulVui94,DauMulTis97} but it

674: has    proved    to   be    suitable    for   software    applications

675: \cite{MazMer93,BerDau97}.

676:

677: \subsection*{Contributions}

678:

679: The first contribution  of this work is a  set of primitives available

680: soon  on   the  Internet  through  our   home  web  site\footnote{{\tt

681:     http://www.ens-lyon.fr/LIP/Arenaire/}.}   and   through  the  {\sc

682:   netlib} repository\footnote{{\tt  http://www.netlib.org/} among many

683:   access protocols.}.  A toy example shows that high level programming

684: with  object  interface  is  too  slow for  the  kernel  of  numerical

685: softwares \cite{Gog2K}.   We ran the  same FLOPS routines  three times

686: \cite{Don92,Don99}.  The first run was performed with object automatic

687: allocation and destruction.  The second run used in-line function calls

688: and no object allocation.  The final code used explicit code in-lining.

689: Removing  object programming  increased the  performance of  133\% and

690: using code  in-lining yielded  another 56\%.  Our  contributed building

691: blocks are pieces  of code assembled by the  designer according to its

692: needs.   They present a  straight interface  to get  a good  trade off

693: between simplicity and speed.

694:

695: The second  contribution is a  set of validated proofs.   To mechanise

696: our    proofs,     we    have    been    using     the    Coq    proof

697: assistant~\cite{HueKahPau97}.   Systems  like Coq  allow  the user  to

698: define  new objects and  to derive  consequences of  these definitions

699: formally. The language of Coq  is based on a higher-order logic.  With

700: such an expressive logic, it  is possible to state properties in their

701: most  general form.   For example,  universal quantification  has been

702: used to state  properties that are true for an  arbitrary format or an

703: arbitrary  rounding  mode.    Proofs  are  built  interactively  using

704: high-level  tactics. At the  end of  each proof,  Coq records  a proof

705: object that  contains all  the details of  the derivation  and ensures

706: that  the  theorem  is   valid.  Theorem  provers  have  already  been

707: successfully   used   to  mechanically   check   the  correctness   of

708: floating-point algorithms~\cite{Rus98,Har97a}.

709:

710: Our    formal    development    is    freely    available    on    the

711: Internet\footnote{{\tt  http://www-sop.inria.fr/lemme/AOC/coq/}.}  and

712: it  will  soon  be   available  as  a  Coq  contribution\footnote{{\tt

713:     http://coq.inria.fr/}.}.  A clickage map of the hierarchy makes it

714: possible to browse  into the different components.  At  the moment, it

715: is 22000 line  long and it contains 106 Definitions  and 545 Theorems.

716: No proof is presented in detail in this manuscript, but we outline the

717: main results  used for the  proofs. Since the proofs  are mechanically

718: validated, the  reader is  sure that the  annoying special  cases have

719: been checked and that all the conditions of the properties used in the

720: proof have been met.  \RRIyes{All the Definitions and Theorems of this

721:   development  are  presented  in  annexes  A through  W.}   For  each

722: mentioned  theorem  we give  its  name as  it  appears  in the  formal

723: development and the file where it is proved.

724:

725: We first present definitions and validated properties for the floating

726: point numbers,  the expansions  and the pseudo-expansions.   Section 2

727: presents  the  addition, multiplication  and  division toolset.   This

728: presentation ends with concluding remarks in the last Section.

729:

730: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

731:

732: \newcommand{\rr}{\mathbb{R}}

733: \newcommand{\nn}{\mathbb{N}}

734: \newcommand{\ideal}[1]{<\!\!#1\!\!>}

735: \newcommand{\lcm}[2]{#1\!\hbox{\^{}}#2}

736:

737: \newlength{\ppwd}

738: \newlength{\ppht}

739: \newcommand{\rpp}[1]{

740: \settowidth{\ppwd}{#1}

741: \settoheight{\ppht}{#1}

742: \raisebox{\ppht}[0pt]{\makebox[0pt][l]{

743: \hspace{.15\ppwd}{\tiny \tt >}}}#1}

744: \newcommand\pp{\rpp{+}}

745: \newcommand{\rppa}[1]{

746: \settowidth{\ppwd}{#1}

747: \settoheight{\ppht}{#1}

748: \raisebox{\ppht}[0pt]{\makebox[0pt][r]{

749: {\tiny \tt >}\hspace{-5pt}}}#1}

750: \newcommand\ppa{\rppa{+}}

751: %\newcommand\pp{\hbox{\raise6pt\hbox{${\hbox{\raise-5pt\hbox{\tiny \tt >}}}\atop+$}}}

752: \newcommand{\limp}{\Rightarrow}

753: \newcommand{\reduce}{\rightarrow^{\ }_S}

754: \newcommand{\reducep}{\rightarrow^+_S}

755: \newcommand{\reduces}{\rightarrow^*_S}

756: \newcommand{\mon}{M_n}

757: \newcommand{\lm}{\le_{M_n}}

758: \newlength{\hsbw}

759: \newcommand\MSpacing{13pt}

760: \newenvironment{cboxed}{\begin{flushleft}

761: \setlength{\hsbw}{\textwidth}

762: \addtolength{\hsbw}{-\arrayrulewidth}

763: \addtolength{\hsbw}{-\tabcolsep}

764:  \begin{tabular}{@{}|c@{}|@{}}\hline

765:  \begin{minipage}[b]{\hsbw}

766:  \vspace*{.06in}

767:  \begingroup\small\baselineskip\MSpacing}{\endgroup\end{minipage}\\ \hline

768:  \end{tabular}

769:  \end{flushleft}}

770: \newcommand{\term}{{\it term}}

771: \newcommand{\pol}{{\it poly}}

772: \newcommand{\grob}{\hbox{\it Gr�bner}}

773: \newcommand{\cset}{{\it Set}}

774: \newcommand{\cprop}{{\it Prop}}

775: \newcommand{\cfun}{\rightarrow}

776: \newcommand{\zerop}[1]{{\it zeroP}(#1)}

777: \newcommand{\lpol}{({\it list}\, \pol)}

778: \newcommand{\lterm}{({\it list}\, \term)}

779: \newcommand{\cano}{\mathcal{C}}

780: \newcommand{\olist}{\mathcal{O}}

781: \newcommand{\cforall}[3]{\forall #1\!\!:#2.\, #3}

782: \newcommand{\cforalln}[2]{\forall #1\!\!:#2}

783: \newcommand{\clambda}[3]{\lambda #1\!\!:#2.\, #3}

784: \newcommand{\cexists}[3]{\exists #1\!\!:#2.\, #3}

785: \newcommand{\cin}[2]{#1\,\,{\it in}\,#2}

786: \newcommand{\cdef}[3]{\hbox{{\bf Definition}$\,\, #1{\bf :}\, #2\,{\bf :=}\, \,#3$.}}

787: \newcommand{\cdefq}[2]{\hbox{{\bf Definition}$\,\, #1\,{\bf :=}\, \,#2$.}}

788: \newcommand{\cdefqn}[2]{\hbox{{\bf Definition}$\,\, #1\,{\bf :=}\, \,#2$}}

789: \newcommand{\cdeffn}[3]{\hbox{{\bf Definition}$\,\, #1{\bf :}\, #2\,{\bf :=}\,\,  #3$}}

790: \newcommand{\cdefp}[2]{\hbox{{\bf Definition}$\,\, #1\, {\bf :}\, \, #2$.}}

791: \newcommand{\cdefpn}[2]{\hbox{{\bf Definition}$\,\, #1\, {\bf :}\, \, #2$}}

792: \newcommand{\cdefrn}[2]{\hbox{{\bf Definition}$\,\, #1\, {\bf :}\, \, #2\,{\bf :=}$}}

793: \newcommand{\cthm}[2]{\hbox{{\bf Theorem}$\,\,  #1 {\bf :}\, #2$.}}

794: \newcommand{\cthmn}[2]{\hbox{{\bf Theorem}$\,\,  #1 {\bf :}\, #2$}}

795: \newcommand{\cthmrn}[1]{\hbox{{\bf Theorem}$\,\,  #1 {\bf :}$}}

796: \newcommand{\clem}[2]{\hbox{{\bf Lemma}$\,\, #1{\bf :}\quad  #2$.}}

797: \newcommand{\clemn}[2]{\hbox{{\bf Lemma}$\,\, #1{\bf :}\quad #2$}}

798: \newcommand{\cdefn}[2]{\hbox{{\bf Definition}$\,\, #1{\bf :}\quad  #2\, {\bf :=}$}}

799: \newcommand{\cfix}[3]{\hbox{{\bf Fixpoint}$\,\, #1\,{\bf [}#2{\bf ]:}\quad  #3\, {\bf :=}$}

800: }

801: \newcommand{\cfixn}[4]{\hbox{{\bf Fixpoint}$\,\, #1\,{\bf [}#2{\bf ]:}\quad  #3\, {\bf :=} #4$}

802: }

803: \newcommand{\caxiom}[2]{\hbox{{\bf Axiom}$\,\, #1{\bf :}\quad  #2$.}}

804: \newcommand{\caxiomn}[2]{\hbox{{\bf Axiom}$\,\, #1{\bf :}\quad  #2$}}

805: \newcommand{\cparam}[2]{\hbox{{\bf Parameter}$\,\, #1{\bf :}\quad  #2$.}}

806: \newcommand{\cparamn}[2]{\hbox{{\bf Parameter}$\,\, #1{\bf :}\quad  #2$}}

807: \newcommand{\ccas}[1]{\hbox{$\quad${\bf Cases}$\,\, #1\,\,{\bf of}$}}

808: \newcommand{\cend}{\hbox{$\quad${\bf end}}}

809: \newcommand{\cp}[1]{\hbox{#1.}}

810: \newcommand{\ccasf}[2]{\hbox{$\quad \ \, #1\,\, {\bf \Longrightarrow}\,\, #2$}}

811: \newcommand{\ccasi}[2]{\hbox{$\quad |\, #1\,\, {\bf \Longrightarrow}\,\, #2$}}

812: \newcommand{\clet}[2]{\hbox{$\quad${\bf let }$\, #1=#2\, {\bf in}$}}

813: \newcommand{\clin}[1]{\hbox{$\quad #1$}}

814: \newcommand{\cvar}[2]{\hbox{{\bf Variable}$\,\, #1{\bf :}\, #2$.}}

815: \newcommand{\chyp}[2]{\hbox{{\bf Hypothesis}$\,\, #1{\bf :}\, #2$.}}

816: \newcommand{\chypn}[2]{\hbox{{\bf Hypothesis}$\,\, #1{\bf :}\, #2$}}

817: \newcommand{\cind}[2]{\hbox{{\bf Inductive}$\,\, #1{\bf :}\, #2 :=$}}

818: \newcommand{\cindu}[3]{\hbox{{\bf Inductive}$\,\, #1[#2]{\bf :}\, #3 :=$}}

819: \newcommand{\cicn}[2]{\hbox{${\bf |}\quad #1\,{\bf :}\,#2$}}

820: \newcommand{\cicnf}[2]{\hbox{$\ \quad #1\,{\bf :}\,#2$}}

821: \newcommand{\cicnl}[2]{\hbox{${\bf |}\quad #1\,{\bf :}\,#2$.}}

822: \newcommand{\cicnfl}[2]{\hbox{$\ \quad #1\,{\bf :}\,#2$}}

823: \newcommand{\cic}[1]{\hbox{${\bf |}\quad #1$}}

824: \newcommand{\cicf}[1]{\hbox{$\ \quad #1$}}

825: \newcommand{\cicl}[1]{\hbox{${\bf |}\quad #1$.}}

826: \newcommand{\cicfl}[1]{\hbox{$\ \quad #1$.}}

827:

828: \section{Definitions and validated properties}

829:

830: \subsection{Floating point numbers}

831:

832: An IEEE double precision floating  point number is built from 3 binary

833: fields:  the sign  (1  bit), the  fraction  (52 bits)  and the  biased

834: exponent  (11 bits).   Its  normal interpretation  $x$  as a  rational

835: number is given below.

836: $$

837: \begin{array}{r c l}

838:   \text{mantissa} & = & 1.\text{fraction} \\

839:   \text{exponent} & = & \text{biased exponent} - \text{bias} \\

840:                x  & = & (-1)^{\text{sign}} \times \text{mantissa} \times 2^{\text{exponent}}

841: \end{array}

842: $$

843:

844: %\figtex{fp}{Binary fields of a floating point number.}

845:

846: We  will use  in this  text  the more  general notation  $x= n  \times

847: \beta^{e}$ to define a floating point number of radix $\beta$ with two

848: integers, the  significand $n$ and  the amplitude $e$.   This relation

849: gives a  signification to  any pair  $(n, e)$.  We  will use  the name

850: floating point  number or {\bf float}  to represent such  a pair.  Two

851: floats $(n,  e)$ and $(n', e')$  are equivalent if they  have the same

852: value as real  numbers, we write $(n, e) \equiv  (n', e')$.  We extend

853: this notation to the represented real value and we may write $n \times

854: \beta^e  \equiv  (n  ,e)$.   We  will also  use  the  equivalence  for

855: expression whenever there is no ambiguity.  No order is defined on the

856: floats and inequalities are given implicitly on their interpretation.

857:

858: Given a  pair $(n_{\max}, e_{\min}) \in  \N_*^2$, we say  that a float

859: $(n, e)$ is {\bf bounded} if and only if it satisfies

860: \begin{equation}

861: \label{eqn/bounded}

862:   |n| \le n_{\max}

863: ~~~~~ \text{and} ~~~~~

864:   - e_{\min} \le e.

865: \end{equation}

866:

867: We do  not use  the overflow bound  of $e_{\max}$.  The  IEEE standard

868: defines an overflow  when the rounded value of  the result exceeds the

869: largest  representable  number.   It  implies that  the  rounding  is

870: defined  on a data  type that  does not  have an  upper bound  for the

871: exponent.   All the  results of  this text  are true  provided neither

872: overflow  nor   exact  infinity  (division  by  0)   occurred  in  the

873: computation.  If this is not the case, infinity and {\em not a number}

874: quantities will proliferate in the results.

875:

876: The bound for an IEEE standard inspired representation with $p$ digits

877: of mantissa and $r$ digits  of exponent is given Table~\ref{tab/ieee}.

878: We will use the common fraction notation for the examples although the

879: integer  notation  is  used  for   the  proofs.   We  use  only  radix

880: independent generic IEEE standard inspired  bounds in the rest of this

881: text.   A  bounded  float is  {\bf  normal}  if  $|n| \times  \beta  >

882: n_{\max}$.  It is {\bf de-normal} if $|n| \times \beta \le n_{\max}$. A

883: de-normal float such that $e = -e_{\min}$ is called {\bf subnormal}. We

884: define the  {\bf ulp} value as  one unit in  the last place of  1 that

885: equals $\beta / (n_{\max} + 1)$.

886:

887: \begin{table}

888: \caption{IEEE standard inspired bounds}

889: \label{tab/ieee}

890: \begin{center}

891: \begin{tabular}{| c | c | c | c |}\hline

892:            & Radix independent                            & \mc{2}{| c |}{$\beta = 2$}                     \\

893:            & (Generic)                                    & Single                & Double                 \\ \hline

894: $p$        & Mantissa width                               & 24                    & 53                     \\

895: $n_{\max}$ & $\beta^p - 1$                                & 16~777~215            & 9~007~199~254~740~991  \\

896: $r$        & Exponent width                               & 8                     & 11                     \\

897: bias      & $\lceil  \beta^r / 2 \rceil  - 1$            & 127                   & 1023                   \\

898: $e_{\min}$ & $\lceil  \beta^r / 2 \rceil  + p - 3$        & 149                   & 1074                   \\

899: $e_{\max}$ & $\lfloor \beta^r / 2 \rfloor - p$            & 124                   & 971                    \\

900: ulp        & $\beta^{1 - p}$                              & $1.19 \times 10^{-7}$ & $2.22 \times 10^{-16}$ \\ \hline

901: \end{tabular}

902: \end{center}

903: \end{table}

904:

905: \begin{theo}[FnormalizeCanonic and FcanonicUnique in Form]

906:   ~ Any bounded float has one unique equivalent float $(n, e)$ that is

907:   normal or subnormal.  This float  is the direct transcription of the

908:   number in machine with

909:   $$

910:   \begin{array}{r c l}

911:     n & = &  \text{mantissa} \times \beta^{p - 1}            \\

912:     e & = & (\text{biased exponent} - \text{biais}) - p + 1. \\

913:   \end{array}

914:   $$%

915:   It  is  later   referred  in  this  text  as   the  {\bf  canonical}

916:   representation.

917: \end{theo}

918:

919: The IEEE  standard describes four  rounding modes but the  rounding to

920: the nearest floating point number is the rounding mode used by default

921: in most  computers.  A  rounding is generally  defined as  a monotonous

922: projection onto the set of  the canonical floats.  Considering all the

923: floats rather than just the  canonical ones, we would rather define it

924: as a projection onto subsets of bounded floats.

925:

926: For  any real  number  $x$, we  define  its bounded  floor $\lfloor  x

927: \rfloor$ as the class of the  maximum bounded floats smaller than $x$.

928: It means  that a bounded float $(n,  e)$ is in $\lfloor  x \rfloor$ if

929: and only if, for all the bounded floats $(n', e')$

930: $$

931: (n', e') \le x ~~~~~ \Longrightarrow ~~~~~ (n', e') \le (n, e)

932: $$%

933: We define identically its bounded  ceil $\lceil x \rceil$ as the class

934: of  the  minimum  bounded  floats  larger than  $x$.   We  define  the

935: truncated subset $F(x)$ as follows.

936: $$

937: F(x) = \left\{\begin{array}{l c}

938:   \lfloor x \rfloor                     & x > 0 \\

939:   \lceil  x \rceil                      & x < 0 \\

940:   \{ (0, e), ~~ -e_{\min} \le e \}      & x = 0 \\

941: \end{array}\right.

942: $$

943:

944: We also define the class of the bounded floats nearest to $x$ as $\Box

945: (x)$  with $(n,  e)$ is  in $\Box  (x)$ if  and only  if, for  all the

946: bounded floats $(n', e')$

947: $$

948: | (n, e) - x | \le | (n', e') - x|

949: $$

950:

951: When $\Box  (x)$ contains two distinct equivalence  classes of bounded

952: floats,  we  define $\circ(x)$  as  the class  of  $\Box  (x)$ of  the

953: elements where  the canonical representation has an  even significand.

954: In other cases,  $\circ(x) = \Box (x)$.  The  following theorem states

955: that this definition is appropriate.

956:

957: \begin{theo}[*RoundedModeP in Fround and Closest]

958:   ~ The  relations rounding down  $\lfloor \cdot \rfloor$,  up $\lceil

959:   \cdot \rceil$, to zero $F  (\cdot)$ and to the nearest (even) $\circ

960:   (\cdot)$ define  each a total  monotonous projection from  the reals

961:   onto the  bounded floats compatible  with the interpretation  of the

962:   floats.

963: \end{theo}

964:

965: We now give a few results on the rounding to the nearest.  The theorem

966: could be defined  on the canonical representation but  it is better to

967: prove it for any bounded float.

968:

969: \begin{theo}[ClosestErrorBound in ClosestProp]

970:   \label{theo/Round/Error}%

971:   Let $x$ be any real number and $(n, e)$ any float of $\Box (x)$,

972:   $$

973:   |x - n \beta^e| \le \beta^e  / 2.

974:   $$

975:   That relation  is independent of the rounding being  even or not.

976:   Even rounding provides that if both  $(n, e) \in \circ(x)$ and $|x -

977:   (n, e)| \equiv \beta^e / 2$ then $n$ is even.

978: \end{theo}

979:

980: The following theorem gives a relation for every representation of the

981: rounded value and the error provided that the error is non zero and it

982: does have  a bounded representation.  The universal  quantifier is the

983: key to prove the theorems~\ref{theo/TS/Exp} and \ref{theo/M/Exp}.

984:

985: \begin{theo}[RoundedModeErrorExpStrict in FRoundProp]

986:   \label{theo/Round/Error/Strict}

987:   Given an arbitrary rounding mode, let  $x$ be a real number and $(n,

988:   e)$  a  bounded  float  rounded  from  $x$, such  that  $x$  is  not

989:   represented by  $(n, e)$  ({\em ie.}  $x  \neq n \beta^e$).  For all

990:   bounded  float  $(n',  e')$ that  is  a  representation  of $x  -  n

991:   \beta^e$,

992:   $$

993:   e' < e.

994:   $$

995:   The bound  is tight as $(n', e')  = (1, e - 1)$  is an acceptable

996:   round-off error.

997: \end{theo}

998:

999: The  result of  any implemented  operation, namely  the  addition, the

1000: multiplication, the  division and the  square root extraction,  is the

1001: rounded result  of the exact mathematical operation.   For example, if

1002: $a \oplus b$ is  the machine addition then $a \oplus b  \in \circ (a +

1003: b)$.  The machine  operation could  have  been defined  to return  the

1004: canonical  float  in  $\circ(a  +  b)$.  We  preferred  to  define  an

1005: acceptable implementation as a  function that return any bounded float

1006: in $\circ(a + b)$.

1007:

1008: When a theorem is independent of the tie breaking rule implemented, we

1009: use the boxed symbols  $\boxplus$, $\boxminus$ and $\boxtimes$ for the

1010: addition, the  subtraction and the  multiplication in the  hypothesis.

1011: Figure~\ref{fig:fpop}  presents  the   symbol  of  the  four  standard

1012: floating point operators used in  this work.  All the figures use even

1013: rounding as no other implementation exists.

1014:

1015: \figtex{fpop}{Standard  floating  point   operations  rounded  to  the

1016:   nearest (even) value.}

1017:

1018: % Given  a single  or  double precision,  we  can define  the {\bf  ulp}

1019: % function that stands  for the weight of one unit in  the last place of

1020: % $x$ and that is the weight of the last bit of the mantissa of $x$. Any

1021: % non  zero  normalized  bounded  floating point  number  $x$  satisfies

1022: % $\omega (x)  \ge ulp  (x) = \alpha  (x) ulp  (1)$.  We will  later use

1023: % these equations  and the fact  that $\alpha$ and $ulp$  are monotonous

1024: % functions.  The last equality  does not hold for denormalized numbers,

1025: % yet treating these  numbers is beyond our goal in  this work.  For any

1026: % non zero floating point number, we define $m(x)=\frac{x}{\alpha(x)}$ ;

1027: % $m(x)$ is the mantissa of $x$. We know that $|m(x)| \in [1,2-ulp]$.

1028:

1029: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1030:

1031: \subsection{Exact operations}

1032:

1033: We used for our proofs of the exact operation a result first published

1034: in \cite{Ste74}. It was later  presented in \cite{Gol91} for the radix

1035: 2 notation.  This fact is true for any bounded representation with any

1036: radix.

1037:

1038: \begin{theo}[Sterbenz in Fprop]

1039:   Given two  bounded floats $x$ and $y$  such that ${y \over  2} \le x

1040:     \le 2  y$, the rational  $x - y$  can be represented by  a bounded

1041:     float.

1042: \end{theo}

1043:

1044: We immediately prove  the next result from its  negation after setting

1045: $\beta$ to be 2. We use it later in our proofs.

1046:

1047: \begin{theo}[plusClosestLowerBound in Closest2Plus]

1048:   Given two bounded floats radix 2, $x$ and $y$, such that $x + y \neq

1049:   x \oplus y$, we know that $|x \oplus y| \ge \max (|x|, |y|) / 2$.

1050: \end{theo}

1051:

1052: M{\o}ller  first,  then  Knuth   proposed  the  common  exact  sum  of

1053: Figure~\ref{fig:badd}-(a) involving  4 floating point  additions and 2

1054: floating point subtractions on an IEEE compliant computer.  The result

1055: is a pair of  floats $(a', b')$ such that $a' = a  \oplus b$ and $a' +

1056: b' \equiv a  + b$.

1057:

1058: \figtex{badd}{Exact sum}

1059:

1060: We have proved this algorithm  correct (Theorem {\bf Knuth in TwoSum})

1061: using  the proof given  by Priest  in~\cite{Pri91}.  Actually  we will

1062: never use  directly the algorithm but  only the property  that the sum

1063: and the  rounding error  can be represented.   This is  proved picking

1064: bounded representations that have special properties:

1065:

1066: % We did not  validate this long published algorithm, but  we set up its

1067: % specification  and   we  deduced  the  following   properties  on  its

1068: % outputs\footnote{In  fact, the  correct behavior  of the  algorithm is

1069: %   proved for radix 2 in file {\bf TwoSum} although it is not necessary

1070: %   in  the course  of this  work \cite{DauRidThe01}.}.   Since  we only

1071: % prove that there exists  one bounded representation, the theorem gives

1072: % some properties of this special representation.

1073:

1074: % \begin{theo}[errorBoundedPlus in ClosestPlus]

1075: %   \label{theo/TS/exists}

1076: %   Given two bounded floats $a =  (n_a, e_a)$ and $b = (n_b, e_b)$, for

1077: %   any $a' \in a \boxplus b$, there exists $n'$ such that $(n', e')$ is

1078: %   a bounded float with

1079: %   $$

1080: %     a' + (n', e') \equiv a + b

1081: %     ~~~~~ \text{and} ~~~~~

1082: %     e' = \min (e_a, e_b).

1083: %   $$

1084: % \end{theo}

1085:

1086: %The rounded  sum $a \oplus  b$ and  the additional error  $a + b  - (a

1087: %\oplus b)$  are both  integer multiple of  $\beta^l$ where $l$  is the

1088: %lowest of  the ulps  of $a$  and $b$.  No  significant bit  is created

1089: %right of the rightmost non zero  significant bit of the inputs and the

1090: %ulp of the sum is limited as we can see in the next fact.

1091:

1092: \begin{theo}[plusExactExp in ClosestPlus]

1093:   \label{theo/TS/Exp}

1094:   Given two bounded floats $a =  (n_a, e_a)$ and $b = (n_b, e_b)$, for

1095:   all $a' \in a \boxplus b$, we  can define $(n, e) \equiv a'$ and $b'

1096:   = (n', e')$, bounded floats, such that

1097:   $$

1098:     a' + b' \equiv a + b

1099:     ~~~~~ \text{and} ~~~~~

1100:     \min (e_a, e_b) = e' \le e \le \max (e_a, e_b) + 1.

1101:   $$

1102: \end{theo}

1103:

1104: This theorem uses the Theorem~\ref{theo/Round/Error} and Theorems {\bf

1105:   errorBoundedPlus}, {\bf  plusExpBound} not presented  here to define

1106: an    existing    representation    of    $a'$    and    $b'$.     The

1107: theorem~\ref{theo/Round/Error/Strict} is used  to connect both results

1108: since it is true for any  representation of $a'$ and $b'$. The special

1109: case where $b' = 0$ is handled independently.

1110:

1111: % \begin{theo}[plusExact1 and plusExact1 in FroundPlus]

1112: %   Given two  bounded representations $(n_a, e_a)$ and  $(n_b, e_b)$ of

1113: %   the floating  point numbers $a$ and  $b$. Let $(n, e)$  be a bounded

1114: %   representation of $\circ  (a + b)$.  The rounded  sum is exact, {\em

1115: %     id}  $\circ (a  + b)  = a  +  b$, when  any one  of the  following

1116: %   conditions is satisfied:

1117: %   \begin{itemize}

1118: %   \item $e \le  \min (e_a, e_b)$.

1119: %   \item $(n_a, e_a)$ is canonical and $e < e_a - 1$

1120: %   \end{itemize}

1121: % \end{theo}

1122:

1123: The last Theorem prompts the fact  that the addition is accurate up to

1124: one unit  in the last  place. Such result  is needed to show  that the

1125: numerical tools of the next Section actually do some work.

1126:

1127: \begin{theo}[plusErrorBoundUlp in ClosestPlus]

1128:   \label{theo/work}

1129:   Given two bounded floats $a$ and  $b$, for all $a' \in a \boxplus b$

1130:   such that $a' \not\equiv 0$, we know that

1131:   $$

1132:   |a + b - a'| \le       |a'|      {\ulp  \over 2}

1133:                \le \max (|a|, |b|)  \ulp.

1134:   $$%

1135:   The last  inequality is true only  working radix 2. It  is proved in

1136:   theorem plusErrorBound2 in file Closest2Plus.

1137: \end{theo}

1138:

1139: It has also  been proved by M{\o}ller and  presented in Knuth's second

1140: edition that one can get a correct pair $(a', b')$ by an early exit of

1141: the  exact  sum  provided  $|b|  \le |a|$.   The  conditional  sum  of

1142: Figure~\ref{fig:badd}-(b) returns an exact pair $(a', b'') \equiv (a',

1143: b')$  with only  2 floating  point  additions and  one floating  point

1144: subtraction.

1145:

1146: Knuth suggested that the early exit  is still valid if $a$ and $b$ are

1147: both canonical  and share the same  exponent.  We prove  here a result

1148: rephrased from \cite{Dau99a} that $a$  and $b$ just have to be bounded

1149: and the exponent be in order.

1150:

1151: \begin{theo}[ExtDekker in EFast2Sum]

1152:   On a  radix 2 IEEE inspired  floating point unit,  given two bounded

1153:   floats $a  = (n_a, e_a)$ and  $b = (n_b, e_b)$,  the conditional sum

1154:   presented Figure~\ref{fig:badd}-(b) returns an exact pair $(a \oplus

1155:   b, a + b - a \oplus b)$ provided

1156:   $$e_b \le e_a.$$

1157: \end{theo}

1158:

1159: This  theorem is  proved only  for radix  two notations  as it  is not

1160: necessarily  correct for other  radices.  An  example where  the early

1161: exit is  not correct radix 10 is  given for $n_{\max} =  99$ with both

1162: inputs  being  $9.9$.   The theorem  is  correct  for  radix 2  and  3

1163: \cite{Knu73}. One can  check one last example radix  4: $n_{\max} = 3$

1164: and both inputs are 3.

1165:

1166: An exact multiplication  is also available.  It computes  a pair $(a',

1167: b')$ such that  $a' = a \otimes b$ and  $a' + b' = a  \times b$ with 7

1168: floating  point  multiplications, 5  floating  point  additions and  5

1169: floating  point   subtractions.   These  operators   are  surveyed  in

1170: \cite{Dau99a}.   Here are  some  properties obtained  from the  desired

1171: specification of $a'$ and $b'$, not from the algorithm.  The condition

1172: on the exponent is not necessary but it is sufficient to discard cases

1173: of underflow.

1174:

1175: \begin{theo}[multExactExpCan in ClosestMult]

1176:   \label{theo/M/Exp}

1177:   Given two bounded floats $a = (n_a, e_a)$ and $b = (n_b, e_b)$, with

1178:   $e_a +  e_b \ge -e_{\min}  + p$, for  all $a'$ rounded to  a nearest

1179:   float of $a \times b$, we  can define $(n, e) \equiv a'$ canonic and

1180:   $b' = (n', e')$ bounded float, such that

1181:   $$

1182:     a' + b' \equiv a \times b

1183:     ~~~~~ \text{and} ~~~~~

1184:     e' = e - p.$$

1185: \end{theo}

1186:

1187: It was proved  that comparable error quantities, that  always fit in a

1188: common  floating point  number, may  be defined  for the  division and

1189: square root \cite{BohWalKorMat91}.

1190:

1191: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1192:

1193: \subsection{Floating point expansions}

1194:

1195: A floating point {\bf expansion} is defined as a finite sequence ${\bf

1196:   x} = (x_0, x_1, \cdots , x_{n - 1})$ of floating point numbers.  The

1197: value represented  is the  exact, not rounded,  sum of  its components

1198: $\sum  x_i$.   The  length of  the  expansion  is  the number  of  its

1199: components.

1200:

1201: %We say that  $\widehat{x}$ is a {\bf fair  most significant component}

1202: %of ${\bf  x}$ if  $\widehat{x}$ equals to  the rounded to  the nearest

1203: %value of $x'$ with $|x' -  \sum x_i| \leq \beta^e / n_{\max}$ and $(n,

1204: %e)$ is  the canonical representation of  $\widehat{x}$.  This includes

1205: %the case where  $\widehat{x}$ equals the rounded to  the nearest value

1206: %of the sum of the component of ${\bf x}$.

1207:

1208: Any  component  of  an  expansion  may  be  equal  to  zero,  but  the

1209: subsequence of  the non-zero components $x_i$ must  be non overlapping

1210: and ordered  by magnitude.  The  non overlapping condition  means that

1211: two components cannot have significant  bits with the same weight that

1212: is specified for each $i > 0$ there exists a bounded float $(n, e)$

1213: such that

1214: $$

1215: x_i \equiv (n, e) ~~~~~ \text{and} ~~~~~ |x_{i + 1}| < \beta^e.

1216: $$

1217:

1218: %\figtex{superp}{Floating point numbers A and B do not overlap whereas B and C do.}

1219:

1220: %A {\bf  non-adjacent expansion}  is an expansion  ${\bf x}$  where the

1221: %non-zero  subsequence has non-adjacent  components.  The  condition is

1222: %specified below.  There is at least one  bit set to 0  between the two

1223: %nearest non-zero (signed) bits of two consecutive components.

1224: %$$\exists (n, e) \in \F_{\cal  R}

1225: %%    ~~~ \text{such that} ~~~ (n, e) \equiv x_i

1226: %    ~~~ \text{and}       ~~~ |x_{i - 1}| < \beta^e / 2

1227: %$$

1228:

1229: We have  already presented two examples in  \cite{Dau99a} showing that

1230: any float  $(n, e)$  where $e  \ge -e_{\min}$ can  be expressed  as an

1231: expansion.

1232:

1233: As noted in the introduction,  previous works by Priest, Shewchuck and

1234: ourselves have produced arithmetic  operators on expansions and useful

1235: primitives for computational geometry.  In this process, we recognized

1236: that the  operators for the addition and  the multiplication computing

1237: the  least significant  digits  first do  not  produce length  optimal

1238: results  but  tend  to  break  the  expansion in  a  large  number  of

1239: components with  small significands. A compression routine  is used to

1240: group  together the  components and  reduce the  length of  the result

1241: expansion.

1242:

1243: % (see figure \ref{fig:lsdf}(a))

1244: % The

1245: % program has to compress the result to reduce the memory occupation and

1246: % the length of  the expansion.  The exact floating  point addition puts

1247: % the 52 most  significant bits in the first word of  the result and any

1248: % remaining  bit   in  the  correcting  word.    Thereafter,  using  the

1249: % correcting  word as the  output of  a process  will definitely  have a

1250: % tendency to produce long expansions  of components with a small number

1251: % of significant bits.

1252:

1253: % \begin{figure}[ht]

1254: % \begin{center}

1255: % \input{lsdf.pstex_t}

1256: % \caption{(a) LSDF operators : the result is a not optimal expansion (b) MSDF operator : the result is a pseudo-expansion}

1257: % \label{fig:lsdf}

1258: % \end{center}

1259: % \end{figure}

1260:

1261: It is  possible to compute the arithmetic  operations most significant

1262: digits first and compute directly components with a large significand.

1263: The  common,  both restoring  and  non  restoring, division  operators

1264: compute the components of the quotient like this.  As a drawback, some

1265: of  the   components  may  overlap  slightly.   In   our  former  work

1266: \cite{DauFin99a}, the sequence of the  quotient digits is cleaned by a

1267: compression routine that produces an acceptable expansion.

1268:

1269: % (see figure \ref{fig:lsdf}(b))

1270:

1271: We define {\bf pseudo-expansions}  as slightly overlapping expansions.

1272: The condition on the subsequence of non zero components as that

1273: $$|x_{i  + 1}|  \le \epsilon |x_i|.$$%

1274: We do not give the value of $\epsilon < {1 \over 2}$ now as it will be

1275: fixed  in the next  section considering  the numerical  algorithms. We

1276: deduce that

1277: $$|\sum_{j > i} x_j| \le {\epsilon \over 1 - \epsilon} |x_i|.$$

1278: Conversely, given a sequence where

1279: $$|\sum_{j > i} x_j| \le \lambda |x_i|,$$

1280: for all $i$, we deduce that

1281: $$|x_{i  + 1}|  \le {\lambda \over 1 - \lambda} |x_i|.$$%

1282:

1283: % The  division of  pseudo-expansions is  copied from  the  existing one

1284: % except  that  it does  not  require  the  final cleaning  loops.   The

1285: % addition  and  the  multiplication   are  inspired  from  the  on-line

1286: % operators  to  produce the  result  most  significant  digit first  as

1287: % components are available  and safe.  As we do  for on-line arithmetic,

1288: % we recognize  that for the  three operators, least  significant digits

1289: % are not  necessary at the beginning  of the process  and the operators

1290: % are  transformed  from most  significant  digit  first operators  that

1291: % require its  inputs to  be exactly known  to adaptable  operators that

1292: % need only a  limited number of most significant  components to deliver

1293: % the first components of the output.

1294:

1295: % \begin{theo}[errorBoundedMult in FroundMult]

1296: %   \label{theo/seq}

1297: %   Given  a sequence $(x_i)_{i \in \N}$ and provided $\epsilon < 1$. We know that

1298: % \begin{itemize}

1299: % \item $

1300: %   \displaystyle \text{If} ~~~~~

1301: %   \forall i \in \N, ~~~~~ |x_{i + 1}| \le \epsilon |x_i|,

1302: %   ~~~~~ \text{then} ~~~~~

1303: %   \forall i \in \N, ~~~~~ |\sum_{j \ge i} x_j| \le \epsilon' |x_i|$.

1304: % \item $

1305: %   \displaystyle \text{If} ~~~~~

1306: %   \forall i \in \N, ~~~~~ |\sum_{j \ge i} x_j| \le \epsilon |x_i|,

1307: %   ~~~~~ \text{then} ~~~~~

1308: %   \forall i \in \N, ~~~~~ |x_{i + 1}| \le \epsilon' |x_i|$.

1309: % \end{itemize}

1310: % \end{theo}

1311:

1312: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1313:

1314: \section{Toolset for the addition, the multiplication and the division}

1315:

1316: We combine in this Section  tools to create the arithmetic operations:

1317: MQ,  PP, PQ  and $\Sigma_3$.  The tools  have been  implemented in  C.

1318: Among them, the numerical behavior of $\Sigma_3$ alone is difficult to

1319: analyze as the  three other tools only sort  and produce components as

1320: they are needed.

1321:

1322: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1323:

1324: \subsection{Addition}

1325:

1326: \figtex{addition}{Representation of the addition.}

1327: \figtex{insert3}{Definition of the $\Sigma_3$ operator.}

1328:

1329: Figure~\ref{fig/addition}  represents  the  algorithm  implemented  to

1330: compute  the  sum  of  pseudo-expansions  $A$ and  $B$.   We  use  two

1331: primitive operators MQ (merge queue) and $\Sigma_3$ (insert and sum of

1332: Figure~\ref{fig/insert3}).  Arrows represent streams of floating point

1333: number flowing  most significant digits first.  The  components of the

1334: result are in the  stream named $C$ in Figure~\ref{fig/addition}.  The

1335: output  is  only a  pseudo-expansion  since  the  $a'$ are  sorted  by

1336: decreasing  order of  magnitude but  they may  partially  overlap each

1337: other.

1338:

1339: MQ is an  operator fired only if it  has a value on both  inputs or if

1340: the flow  of one input  is finished  and it has  a value on  the other

1341: input. The  largest input,  in magnitude, is  then transmitted  to the

1342: output.  Cases of tie  are not  critical. The  MQ operator  merges two

1343: flows sorted by magnitude and produces a single flow once again sorted

1344: by magnitude.  The following theorem  states that this flow  of values

1345: can be represented  by a flow of floats to be  fed into the $\Sigma_3$

1346: operator.

1347:

1348: \begin{theo}[IsRleExpRevIsExp in FexpAdd]

1349:   Given a list of floats $(x_i)_{i \in L}$ sorted by magnitude, we can

1350:   define the bounded floats $(n_i, e_i)$ equivalent to the $x_i$s such

1351:   that the list $(e_i)_{i \in L}$ is sorted.

1352: \end{theo}

1353:

1354: We deduce the correct behavior  of the operation by induction from the

1355: following Theorem.   The property on  the input floating  point number

1356: $c$ is inherited from the fact  that the two input streams $A$ and $B$

1357: are pseudo-expansions merged by the MQ operator by order of magnitude.

1358:

1359: \begin{theo}[bound3Sum and exp3Sum in ThreeSum2]

1360:   \label{theo/sum3}

1361:   Let $a = (n_a,  e_a)$, $b = (n_b, e_b)$ and $c  = (n_c, e_c)$ be the

1362:   bounded     inputs    of     the     $\Sigma_3$    operator     (see

1363:   Figure~\ref{fig/insert3}).  Provided that $e_a  \ge e_b \ge e_c$ the

1364:   $\Sigma_3$   operator  returns  three   numbers  $a'$,   $b'$,  $c'$

1365:   represented  by $(n_a',  e_a')$, $(n_b',  e_b')$ and  $(n_c', e_c')$

1366:   such that $e_a' \ge e_b' \ge e_c'  = e_c$ and finaly either $c' = 0$

1367:   or

1368:   $$|b' + c'|   \le  3 \ulp                   |a'|,$$

1369:   $$\beta^{e_c} \le {3 \ulp^2 \over 2 - \ulp} |a'|.$$

1370:

1371:   If $a = 0$ or $|b| + n_{\max} \beta^{e_c} \le n_{\max} \beta^{e_a}$,

1372:   all the exact additions can use the early exit.

1373: \end{theo}

1374:

1375: If $c'$  equals to zero, $a'$  and $b'$ are not  relevant enough. They

1376: remain in the operator as $a$  and $b$ for the next iteration.  On the

1377: opposite, if $c'$  is not equal to zero, $a'$ is  relevant.  It is one

1378: component of  the result, $b'$ and  $c'$ are kept in  the operator for

1379: the next iteration.

1380:

1381: \paragraph{\em Proof sketch:}

1382:

1383: We first define $u$, $v$, $a'$,  $b'$ and $c'$ as they are computed in

1384: Figure~\ref{fig/insert3}. We are interested in the case where $c' \neq

1385: 0$. It implies that $w \neq 0$. We bound

1386: $$

1387: \begin{array}{r c l}

1388:   |w| & \le & {\ulp \over 2} |a'| \\

1389:   |v| & \le & \ulp \max (|u|, |a|)

1390: \end{array}

1391: $$

1392: Since $w \neq 0$, we know that

1393: $$|a'| \ge {\max (|u|, |a|) \over 2}.$$

1394: The first bound follows since

1395: $$|b' + c'| = |v + w| \le |v| + |w|.$$

1396:

1397: We continue for the second bound

1398: $$

1399: 3 \ulp |a'| \ge |b' + c'|

1400:             \ge |b'| - |c'|

1401:             \ge \left( 1 - {\ulp \over 2} \right)|b'|.

1402: $$

1403: We conclude with  the representation  of Theorem~\ref{theo/TS/Exp}

1404: $$

1405: \beta^{e_c} = \beta^{e_c'}

1406:             \le |c'|

1407:             \le {\ulp \over 2} |b'|

1408:             \le {3 \ulp^2 \over 2 - \ulp} |a'|.

1409: $$

1410:

1411: \begin{flushright}

1412: $\Box$

1413: \end{flushright}

1414:

1415: The last theorem does the induction over time. It is written from both

1416: statement and proof of the theorem in Coq.

1417:

1418: \begin{theo}[FexpAdd in FexpAdd]

1419:   Given a list $(n_i, e_i)_{i \in  L}$ of $\#L$ bounded floats so that

1420:   $(e_i)_{i   \in   L}$  is   sorted   the   addition  operator   (see

1421:   Figure~\ref{fig/addition}) produces  a list $(c_i)_{i \in  L}$ of at

1422:   most $(\#L + 1)$ floats with

1423:   $$

1424:   |c_{i + 1}| \le \epsilon |c_i|

1425:   ~~~~~ \text{with} ~~~~~

1426:   \epsilon \le {6 \#L + 6 \over n_{\max} - 1 - 6\#L}

1427:   $$

1428:   under the condition that $\epsilon < 1$.

1429: \end{theo}

1430:

1431: \paragraph{\em Proof sketch:}

1432:

1433: We have  to show that the  input conditions are met  at each iteration

1434: using  pre-conditions  and  post-conditions.   In the  same  time,  we

1435: establish that

1436: $$|\sum_{j > i} c_j| \le 3(1 + 2 \#L) \ulp |c_i|,$$

1437: and

1438: $$|c_{i  + 1}|       \le 6(1 + 2 \#L) \ulp |c_i|.$$%

1439: Being more careful, we proved the tighter bound in Coq.

1440:

1441: \begin{flushright}

1442: $\Box$

1443: \end{flushright}

1444:

1445: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1446:

1447: \subsection{Multiplication}

1448:

1449: \figtex{multiplication3}{Representation of the multiplication.}

1450:

1451: The  multiplication  of  the   expansions  $A$  (size  $n_A$)  by  the

1452: expansions  $B$ (size $n_B$)  generates the  $n_A \times  n_B$ partial

1453: products and  computes their sum. The  problem is to  sort the partial

1454: products  $a_{i_j} \times  b_j$  knowing that  the  lists $(a_i)$  and

1455: $(b_j)$ are  sorted by  magnitude. It  is related to  $X +  Y$ sorting

1456: \cite{HarPaySavStr75,SteStr94}.        Figure~\ref{fig/multiplication3}

1457: represents  the  algorithm  implemented.   We  use  new  and  extended

1458: primitives: PP, a  partial product generator, PQ and  a priority queue

1459: extending the MQ.

1460:

1461: % \figtex{prods}{Generation of partial product and priority queue.}

1462:

1463: Working with pseudo-expansions, we have  to make sure that the list of

1464: the  partial products  is sorted  by magnitude.   As $b_j$  arrives at

1465: position  $j$,  the PP  tool  initializes the  $i_j$  index  to 0  and

1466: computes $a_{i_j} \otimes b_j$.  Its  magnitude is inserted at the end

1467: of PQ that is updated in $\lfloor \log j \rfloor$ operations. When the

1468: partial  product $a_{i_j}  \otimes  b_j$  is output  by  PQ, the  tool

1469: increments  $i_j$.  The  generation  is frozen  until both  components

1470: $a_{i_j}$  and  $b_{j +  1}$  are  known.   Then the  cell  containing

1471: $|a_{i_j} \otimes b_j|$  in PQ is updated in  $\lfloor \log j \rfloor$

1472: operation and the tools can generate the next partial product.

1473:

1474: % \figtex{somme5}{Definition of the $\Sigma_5$ operator.}

1475:

1476: Numbers  flow by  pair since  $a_i \times  b_j$ produces  two floating

1477: point  numbers. The  upper part  $p_{ij} =  a_i \otimes  b_j$  is used

1478: immediately. The  lower part $a_i \times  b_j - p_{ij}$ is  put into a

1479: waiting  queue.   The  two  streams  are merged  together  by  the  MQ

1480: operators. It is fired as long as it has an input on both entries.  It

1481: does not compare the values but  the keys that are associated with the

1482: floats.   The operator does  not produce  a list  of floats  sorted by

1483: magnitude but directly a list sorted by amplitude as we use the second

1484: part of theorem~\ref{theo/M/Exp}.

1485:

1486: As $a_i \times  b_j - p_{ij}$ cannot be  selected before $p_{ij}$, its

1487: insertion  into  the  waiting queue  is  not  in  the critical  path.

1488: Instruction reordering will allow these operation to be hidden in dead

1489: cycles of the $\Sigma_3$ operator.   The waiting list cannot hold more

1490: than $2n_B$ elements since the bound on $\epsilon$ is a few ulps.

1491:

1492: % $(A_i \times  B_j)_{High}$ that  is inserted in  $c$ and  $(A_i \times

1493: % B_j)_{Low}$    inserted    in   $e$.     A    formal   extension    of

1494: % Theorem~\ref{theo/sum3} will guarantee that this operator produces the

1495: % desired output.

1496:

1497: % If $d'$ equals to zero, then $e'  = 0$ and $a'$, $b'$ and $c'$ are not

1498: % relevant enough. They  remain in the operator for  the next iteration.

1499: % If $d' \neq 0$, $a'$ is relevant.   It is one component of the result.

1500: % If  $e'$ is  also  non zero,  $b'$  is immediately  used  as the  next

1501: % component of  the result. The  three remaining floating  point numbers

1502: % are kept in the operator for the next iteration.

1503:

1504: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1505:

1506: \subsection{Division}

1507:

1508: % \subsubsection{Common division}

1509:

1510: \figtex{division3}{Representation of the non-restoring division.}

1511:

1512: Figure~\ref{fig/division3}  represents  the division  of  $R$ by  $D$,

1513: pseudo-expansions. We want to compute $Q$  such that $Q = R / D$, {\em

1514:   ie.}  $R - QD = 0$. Each  component $q_i$ of $Q$ is deduced from the

1515: previous ones by computing

1516: $$R_i = R - \sum_{j < i} q_j \times D,$$%

1517: before evaluation $q_i \approx R_i /  D$. We will see later that $d_0$

1518: is handled separately, so our scheme computes $Q \times (d_0 - D)$.

1519:

1520: The  goal is  to ensure  that the  values $c$  used as  inputs  of the

1521: $\Sigma_3$ operator  are sorted by  amplitude.  The PP-PQ-MQ  chain is

1522: close   to   the   one    used   for   the   multiplication   in   the

1523: Figure~\ref{fig/multiplication3}. The  generation is frozen  only when

1524: $d_{j + 1}$ is not known.

1525:

1526: The test  on $q_{i_j}$ is replaced  by a new  condition: the operation

1527: stops before it may insert a value $c$  such that $c < 4 |a + b| \ulp$.

1528: A new  approximate quotient  digit $q_i$ is  then guessed from  a fair

1529: most significant  component $d_0$ of the  divisor $D$ and  a fair most

1530: significant component $a' \oplus b'$.

1531:

1532: %  of the remainder $R_i$: $q_i =

1533: % \circ(a' / d_l)$.

1534: % The less  significant components of the dividend  are stored unchanged

1535: % with all the components of the terms $-q_i \cdot D$ that have not been

1536: % reduced so far.   A priority queue (PQ) extracts  the most significant

1537: % components  among this  set.  These  numbers are  accumulated  until a

1538: % relevant $a'$ can be estimated again.

1539:

1540: % Since  $R_i$  is  at  least  divided  by a  constant  factor  at  each

1541: % iteration, the algorithm is converging.

1542: % $$

1543: % \displaystyle \frac{R_0}{D}=\sum_{i=0}^{n-1}{q_i}+\frac{R_n}{D}

1544: % $$

1545:

1546: % We  use four  primitive operators.   The partial  product PP,  and the

1547: % priority  queue PQ,  sort  by  magnitude the  pairs  generated by  the

1548: % product of $q_i$  by $D - d_l$.  The merging queue  MQ, takes the most

1549: % significant numbers between the  components of the numerator that have

1550: % not been used so far and the numbers given by the priority queue.  The

1551: % $\Sigma_5$  operator   adds  the   new  component  to   the  left-over

1552: % components.  The number  $a'$ is significant enough when  $d' \neq 0$.

1553: % This  process guarantees  that  $a'$ is  a  relevant approximation  of

1554: % $R_i$. We will prove that $e' = 0$.

1555:

1556: The following  theorem states that one  division step is  very robust.

1557: For a reasonable value of $\epsilon$, it guarantees that the remainder

1558: decreases almost linearly.

1559:

1560: \begin{theo}[DivConv in FexpDiv]

1561:   \label{theo/Div/Converge}

1562:   Let $W$ and $D$ be two non zero real numbers approximated by $w$ and

1563:   $d$ with  a relative error  bound $\epsilon <  1$ and let $q$  be an

1564:   approximation of $w / d$ with the same relative error bound.

1565:   $$

1566:   |W - q D | \le \epsilon {3 + \epsilon \over 1 - \epsilon} |W|

1567:   $$

1568: \end{theo}

1569:

1570: To reduce $\epsilon$, we replace the initial values $d_0$ and $d_1$ by

1571: $d_0 \oplus d_1$  and $d_0 + d_1 - d_0 \oplus  d_1$. As a consequence,

1572: the worst  error on the Theorem~\ref{theo/Div/Converge} is  the one on

1573: the  remainder and  it  is bounded  by  construction by  4 ulps.  This

1574: guarantees with the  loop condition that any term  $q_i \times d_j$ is

1575: smaller than  all the  terms already produced  by the  partial product

1576: generation chain except  $q_i \times d_0$. This last  value is handled

1577: separately as $a'$ and  $b'$ are replaced by $a' + b'  - q_i d_0$. The

1578: loop condition also ensures that $c' = 0$.

1579:

1580: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1581:

1582: % \subsubsection{Pre-scaled division}

1583:

1584: % In  modern computers,  the  floating  point division  is  a very  slow

1585: % operator.   For  example,  Intel  Pentium processors  need  39  cycles

1586: % \cite{Int95a}  whereas  multiplications   and  additions  may  have  a

1587: % throughput of 1 cycle.  Prescaled  division, can be used to reduce the

1588: % number of floating point division to 1.

1589:

1590: % As a  price, we have to  exactly prescale the  numerator $R$ involving

1591: % $n_R$ multiplications.  As  a matter of fact, the  prescaling fits the

1592: % size of the second input of the MQ operator that is waiting for a pair

1593: % of floating point digits from the PQ and now also from the numerator.

1594:

1595: % \begin{figure}[ht]

1596: % \begin{center}

1597: % \input{division2.pstex_t}

1598: % \caption{Representation of the pre-scaled division}

1599: % \label{fig:div2}

1600: % \end{center}

1601: % \end{figure}

1602:

1603: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

1604:

1605: \section*{Conclusion}

1606:

1607: We  have presented  a new  adaptable numeric  core inspired  both from

1608: floating point expansions and from on-line arithmetic.  Our choice was

1609: to  present only  the arithmetic  core as  we have  presented  in past

1610: publications  how a  core can  be used  efficiently to  compute matrix

1611: determinants for example \cite{DauFin99a}.  Building a general purpose

1612: runtime environment raises question  in areas like parallelism (demand

1613: driven {\em vs.}  data flow...)  and in compiler techniques (efficiency

1614: of  object oriented  code generated  by existing  compilers...)  among

1615: others.

1616:

1617: The numeric core  is cut down to four  tools.  The $\Sigma_3$ operator

1618: that  contains  many arithmetic  operations  is  proved correct.   The

1619: proofs  have been  validated  by the  Coq  assistant.  Developing  the

1620: proofs,  we have  formally proved  many result  long published  in the

1621: literature and  we have  extended a  few of them.   This work  may let

1622: users

1623: \begin{itemize}

1624: \item Develop  application specific  adaptable libraries based  on the

1625:   toolset.

1626: \item  Easily write  new formal  proofs based  on the  growing  set of

1627:   sensible validated facts.

1628: \end{itemize}

1629:

1630: The set  of the validated facts  is now sufficient to  shield the user

1631: from   the  difficult   implementation  details   of   floating  point

1632: arithmetic.  The proof  of the key Theorem on  the $\Sigma_3$ operator

1633: does not  use any low  level result. It  just uses the many  lower and

1634: upper bounds defined in the Theorems of Section~1.

1635:

1636: \small

1637:

1638: %\bibliography{/home/arenaire/public_html/Bibliographies/daumas_ref}

1639: %\bibliography{p:/Arenaire/Bibliographies/daumas_ref}

1640:

1641: \begin{thebibliography}{10}

1642:

1643: \bibitem{AvnBoiDevPreYvi97}

1644: Francis Avnaim, Jean-Daniel Boissonnat, Olivier Devillers, Franco Preparata,

1645:   and Mariette Yvinec.

1646: \newblock Evaluating signs of determinants using single precision arithmetic.

1647: \newblock {\em Algorithmica}, 17(2):111--132, 1997.

1648:

1649: \bibitem{Bai93}

1650: David~H. Bailey.

1651: \newblock Algorithm 719, multiprecision translation and execution of fortran

1652:   programs.

1653: \newblock {\em ACM Transactions on Mathematical Software}, 19(3):288--319,

1654:   1993.

1655:

1656: \bibitem{Bai95}

1657: David~H. Bailey.

1658: \newblock A {F}ortran-90 based multiprecision system.

1659: \newblock {\em ACM Transactions on Mathematical Software}, 21(4):379--387,

1660:   1995.

1661:

1662: \bibitem{Bar93}

1663: Jesse~L. Barlow.

1664: \newblock Error analysis of update methods for the symmetric eigenvalue

1665:   problem.

1666: \newblock {\em SIAM Journal of Matrix Analysis and Applications},

1667:   14(2):598--618, 1993.

1668:

1669: \bibitem{BerDau97}

1670: David Berthelot and Marc Daumas.

1671: \newblock Computing on sequences of embedded intervals.

1672: \newblock {\em Reliable Computing}, 3(3):219--227, 1997.

1673:

1674: \bibitem{Boe87}

1675: Hans-Juergen Boehm.

1676: \newblock Constructive real interpretation of numerical programs.

1677: \newblock {\em ACM SIGPLAN Notices}, 22(7):214--221, 1987.

1678:

1679: \bibitem{BohWalKorMat91}

1680: Gerd Bohlender, Wolfgang Walter, Peter Kornerup, and David~W. Matula.

1681: \newblock Semantics for exact floating point operations.

1682: \newblock In Peter Kornerup and David Matula, editors, {\em Proceedings of the

1683:   10th Symposium on Computer Arithmetic}, pages 22--26, Grenoble, France, 1991.

1684:

1685: \bibitem{BoiPre2K}

1686: Jean-Daniel Boissonnat and Franco~P. Preparata.

1687: \newblock Robust plane sweep for intersecting segments.

1688: \newblock {\em SIAM Journal on Computing}, 29(5):1401--1421, 2000.

1689:

1690: \bibitem{BolDau01}

1691: Sylvie Boldo and Marc Daumas.

1692: \newblock Performances d'implantations de l'addition en pr{\'e}cision

1693:   quad-double sur diff{\'e}rentes machines.

1694: \newblock In {\em 7{\`e}me Symposium sur les Architectures Nouvelles de

1695:   Machines}, Paris, France, 2001.

1696:

1697: \bibitem{Bre78a}

1698: Richard~P. Brent.

1699: \newblock A fortran multiple-precision arithmetic package.

1700: \newblock {\em ACM Transactions on Mathematical Software}, 4(1):57--70, 1978.

1701:

1702: \bibitem{BurFleMehSch99}

1703: C.~Burnikel, R.~Fleischer, K.~Mehlhorn, and S.~Schirra.

1704: \newblock Efficient exact geometric computation made easy.

1705: \newblock In {\em Proceedings of the 15th Annual ACM Symposium on Computational

1706:   Geometry}, pages 341--350, Miami, Florida, 1999.

1707:

1708: \bibitem{ChaGedGonMonWat91}

1709: Bruce~W. Char, Keith~O. Geddes, Gaston~H. Gonnet, Michael~B. Monagan, and

1710:   Stephen~M. Watt.

1711: \newblock {\em Maple {V}: langage reference manual}.

1712: \newblock Springer Verlag, 1991.

1713:

1714: \bibitem{CodKar.84}

1715: William~J. Cody, Richard Karpinski, et~al.

1716: \newblock A proposed radix and word-length independent standard for floating

1717:   point arithmetic.

1718: \newblock {\em IEEE Micro}, 4(4):86--100, 1984.

1719:

1720: \bibitem{CorDupHocMul91}

1721: G.~Corbaz, Jean Duprat, Bertrand Hochet, and Jean-Michel Muller.

1722: \newblock Implementation of a {VLSI} polynomial evaluator for real-time

1723:   applications.

1724: \newblock In {\em International Conference on Application-Specific Array

1725:   Processors}, 1991.

1726:

1727: \bibitem{Dau99a}

1728: Marc Daumas.

1729: \newblock Multiplications of floating point expansions.

1730: \newblock In Israel Koren and Peter Kornerup, editors, {\em Proceedings of the

1731:   14th Symposium on Computer Arithmetic}, pages 250--257, Adelaide, Australia,

1732:   1999.

1733:

1734: \bibitem{DauFin98}

1735: Marc Daumas and Claire Finot.

1736: \newblock Division of floating point expansions.

1737: \newblock In {\em IMACS-GAMMM International Symposium on Scientific Computing,

1738:   Computer Arithmetic and Validated Numerics}, pages 45--46, Budapest,

1739:   Hungaria, 1998.

1740:

1741: \bibitem{DauFin99a}

1742: Marc Daumas and Claire Finot.

1743: \newblock Division of floating point expansions with an application to the

1744:   computation of a determinant.

1745: \newblock {\em Journal of Universal Computer Science}, 5(6):323--338, 1999.

1746:

1747: \bibitem{DauMulTis97}

1748: Marc Daumas, Jean-Michel Muller, and Arnaud Tisserand.

1749: \newblock Very high radix on-line arithmetic for accurate computations.

1750: \newblock In {\em Proceedings of the 15th IMACS World Congress on Computational

1751:   and Applied Mathematics}, volume~2, pages 347--352, Berlin, Germany, 1997.

1752:

1753: \bibitem{DauMulVui94}

1754: Marc Daumas, Jean-Michel Muller, and Jean Vuillemin.

1755: \newblock Implementing on-line arithmetic on {PAM}.

1756: \newblock In Reiner~W. Hartenstein and Michael~Z. Serv{\'\i}t, editors, {\em

1757:   Field-Programmable Logic: Architectures Synthesis and Applications}, pages

1758:   196--207, Prague, Czech Republic, 1994. Springer Verlag.

1759:

1760: \bibitem{Dek71}

1761: Theodorus~J. Dekker.

1762: \newblock A floating point technique for extending the available precision.

1763: \newblock {\em Numerische Mathematik}, 18(3):224--242, 1971.

1764:

1765: \bibitem{DevPre98}

1766: Olivier Devillers and Franco Preparata.

1767: \newblock A probabilistic analysis of the power of arithmetic filters.

1768: \newblock {\em Discrete and Computational Geometry}, 20(4):523--547, 1998.

1769:

1770: \bibitem{Don92}

1771: Jack~J. Dongarra.

1772: \newblock Performance of various computers using standard linear equations

1773:   software.

1774: \newblock {\em Computer architecture news}, 20(3):22--44, 1992.

1775:

1776: \bibitem{Don99}

1777: Jack~J. Dongarra.

1778: \newblock Performance of various computers using standard linear equations

1779:   software.

1780: \newblock Report CS-89-85, University of Tennessee, 1999.

1781:

1782: \bibitem{ForVan93}

1783: Steven Fortune and Christopher~J. {Van Wyk}.

1784: \newblock Efficient exact arithmetic for computational geometry.

1785: \newblock In {\em Proceedings of the 9th ACM Symposium on Computational

1786:   Geometry}, pages 163--172, 1993.

1787:

1788: \bibitem{Gog2K}

1789: Brice Goglin.

1790: \newblock Calcul sur des expansions de taille fixe.

1791: \newblock Technical report, {\'E}cole Normale Sup{\'e}rieure de Lyon, 2000.

1792:

1793: \bibitem{Gol91}

1794: David Goldberg.

1795: \newblock What every computer scientist should know about floating point

1796:   arithmetic.

1797: \newblock {\em ACM Computing Surveys}, 23(1):5--47, 1991.

1798:

1799: \bibitem{Gra2Kb}

1800: Tobj\"orn Granlund.

1801: \newblock {\em The {GNU} multiple precision arithmetic library}, 2000.

1802: \newblock Version 3.1.

1803:

1804: \bibitem{HarPaySavStr75}

1805: L.~H. Harper, Terry~H. Payne, John~E. Savage, and E.~Straus.

1806: \newblock Sorting $x + y$.

1807: \newblock {\em Communications of the ACM}, 18(6):347--349, 1975.

1808:

1809: \bibitem{Har97a}

1810: John Harrison.

1811: \newblock Floating point verification in {HOL} light: the exponential function.

1812: \newblock Technical Report 428, University of Cambridge Computer Laboratory,

1813:   1997.

1814:

1815: \bibitem{Hig96}

1816: Nicholas~J. Higham.

1817: \newblock {\em Accuracy and stability of numerical algorithms}.

1818: \newblock SIAM, 1996.

1819:

1820: \bibitem{HueKahPau97}

1821: G{\'e}rard Huet, Gilles Kahn, and Christine Paulin-Mohring.

1822: \newblock The {Coq} {P}roof {A}ssistant: {A} {T}utorial: Version 6.1.

1823: \newblock Technical Report 204, Institut National de Recherche en Informatique

1824:   et en Automatique, Le Chesnay, France, 1997.

1825:

1826: \bibitem{Kah65}

1827: William Kahan.

1828: \newblock Further remarks on reducing truncation errors.

1829: \newblock {\em Communications of the ACM}, 8(1):40, 1965.

1830:

1831: \bibitem{KarLiPecYap99}

1832: V.~Karamcheti, C.~Li, I.~Pechtchanski, and Chee Yap.

1833: \newblock A core library for robust numeric and geometric computation.

1834: \newblock In {\em Proceedings of the 15th Annual ACM Symposium on Computational

1835:   Geometry}, pages 351--359, Miami, Florida, 1999.

1836:

1837: \bibitem{Knu69}

1838: Donald~E. Knuth.

1839: \newblock {\em The art of computer programming: Seminumerical Algorithms}.

1840: \newblock Addison Wesley, 1969.

1841: \newblock First edition.

1842:

1843: \bibitem{Knu73}

1844: Donald~E. Knuth.

1845: \newblock {\em The art of computer programming: Seminumerical Algorithms}.

1846: \newblock Addison Wesley, 1973.

1847: \newblock Second edition.

1848:

1849: \bibitem{LouErc93}

1850: Marianne~E. Louie and Milo\v{s}~D. Ercegovac.

1851: \newblock On digit recurrence division implementation for field programmable

1852:   gate arrays.

1853: \newblock In Earl Swartzlander, Mary~Jane Irwin, and Graham Jullien, editors,

1854:   {\em Proceedings of the 11th Symposium on Computer Arithmetic}, pages

1855:   202--209, Windsor, Ontario, 1993.

1856:

1857: \bibitem{MazMer93}

1858: Christophe Mazenc and Xavier Merrheim.

1859: \newblock Handling numbers in high precision and controlling the numerical

1860:   precision in a serial system on a parallel {MIMD} machine.

1861: \newblock In {\em 26th International Conference on System Sciences}, pages

1862:   1196--1198, Honolulu, Hawaii, 1993.

1863:

1864: \bibitem{Men94}

1865: Val{\'e}rie M{\'e}nissier.

1866: \newblock {\em Arithm{\'e}tique Exacte~: Conception, Algorithmique et

1867:   Performances d'une Implantation Informatique en Pr{\'e}cision Arbitraire}.

1868: \newblock PhD thesis, Universit{\'e} Paris VI, Paris, France, 1994.

1869:

1870: \bibitem{Men95}

1871: Val\'erie M\'enissier-Morain.

1872: \newblock Conception et algorithmique d'une repr\'esentation d'arithm\'etique

1873:   r\'eelle en pr\'ecision arbitraire.

1874: \newblock In {\em 1st Real Numbers and Computers Conference}, pages 47--64,

1875:   St-\'Etienne, France, 1995.

1876:

1877: \bibitem{MicMor97}

1878: Dominique Michelucci and Jean-Michel Moreau.

1879: \newblock Lazy arithmetic.

1880: \newblock {\em IEEE Transactions on Computers}, 46(9):961--975, 1997.

1881:

1882: \bibitem{Mol65b}

1883: Ole M{\o}ller.

1884: \newblock Note on quasi double-precision.

1885: \newblock {\em BIT}, 5(4):251--255, 1965.

1886:

1887: \bibitem{Mol65a}

1888: Ole M{\o}ller.

1889: \newblock Quasi double-precision in floating point addition.

1890: \newblock {\em BIT}, 5(1):37--50, 1965.

1891:

1892: \bibitem{Mul97}

1893: Jean-Michel Muller.

1894: \newblock {\em Elementary functions, algorithms and implementation}.

1895: \newblock Birkhauser, 1997.

1896:

1897: \bibitem{Pic72}

1898: Mich\`ele Pichat.

1899: \newblock Correction d'une somme en arithm\'etique \`a virgule flottante.

1900: \newblock {\em Numerische Mathematik}, 19:400--406, 1972.

1901:

1902: \bibitem{Pri91}

1903: Douglas~M. Priest.

1904: \newblock Algorithms for arbitrary precision floating point arithmetic.

1905: \newblock In Peter Kornerup and David Matula, editors, {\em Proceedings of the

1906:   10th Symposium on Computer Arithmetic}, pages 132--144, Grenoble, France,

1907:   1991.

1908:

1909: \bibitem{Rus98}

1910: David~M. Russinoff.

1911: \newblock A mechanically checked proof of {IEEE} compliance of the floating

1912:   point multiplication, division and square root algorithms of the {AMD-K7}

1913:   processor.

1914: \newblock {\em LMS Journal of Computation and Mathematics}, 1:148--200, 1998.

1915:

1916: \bibitem{Sei2K}

1917: Peter-Michael Seidel.

1918: \newblock Exact arithmetic based on floating-point numbers.

1919: \newblock In {\em IMACS-GAMMM International Symposium on Scientific Computing,

1920:   Computer Arithmetic and Validated Numerics}, page 123, Karlsruhe, Germany,

1921:   2000.

1922:

1923: \bibitem{SerVuiHer89}

1924: Bernard Serpette, Jean Vuillemin, and Jean-Claude Herv\'{e}.

1925: \newblock {BigNum}: a portable and efficient package for arbitrary-precision

1926:   arithmetic.

1927: \newblock Technical Report~2, Digital Equipment Corporation, Paris Research

1928:   Laboratory, 1989.

1929:

1930: \bibitem{She96}

1931: Jonathan~R. Shewchuk.

1932: \newblock Robust adaptative floating point geometric predicates.

1933: \newblock In {\em Proceedings of the 12th Annual ACM Symposium on Computational

1934:   Geometry}, pages 141--150, Philadelphia, Pensylvania, 1996.

1935:

1936: \bibitem{She97}

1937: Jonathan~R. Shewchuk.

1938: \newblock Adaptive precision floating-point arithmetic and fast robust

1939:   geometric predicates.

1940: \newblock In {\em Discrete and Computational Geometry}, volume~18, pages

1941:   305--363, 1997.

1942:

1943: \bibitem{SorTan91}

1944: Danny~C. S{\o}rensen and Ping Tak~Peter Tang.

1945: \newblock On the orthogonality of eigenvectors computed by divide and conquer

1946:   methods techniques.

1947: \newblock {\em SIAM Journal of Numerical Analysis}, 28(6):1752--1775, 1991.

1948:

1949: \bibitem{SteStr94}

1950: William Steiger and Ileana Streinu.

1951: \newblock A pseudo-algorithmic separation of line from pseudo-lines.

1952: \newblock In {\em Proceedings of the Sixth Canadian Conference on Computational

1953:   Geometry}, Saskatoon, Saskatchewan, 1994.

1954:

1955: \bibitem{Ste74}

1956: Pat~H. Sterbenz.

1957: \newblock {\em Floating point computation}.

1958: \newblock Prentice Hall, 1974.

1959:

1960: \bibitem{Ste.81}

1961: David Stevenson et~al.

1962: \newblock A proposed standard for binary floating point arithmetic.

1963: \newblock {\em IEEE Computer}, 14(3):51--62, 1981.

1964:

1965: \bibitem{Ste.87}

1966: David Stevenson et~al.

1967: \newblock An american national standard: {IEEE} standard for binary floating

1968:   point arithmetic.

1969: \newblock {\em ACM SIGPLAN Notices}, 22(2):9--25, 1987.

1970:

1971: \bibitem{TriErc77}

1972: Kishor~S. Trivedi and Milo\v{s}~D. Ercegovac.

1973: \newblock On-line algorithms for division and multiplication.

1974: \newblock {\em IEEE Transactions on Computers}, 26(7):681--687, 1977.

1975:

1976: \bibitem{Ziv91}

1977: Abraham Ziv.

1978: \newblock Fast evaluation of elementary mathematical functions with correctly

1979:   rounded last bit.

1980: \newblock {\em ACM Transactions on Mathematical Software}, 17(3):410--423,

1981:   1991.

1982:

1983: \end{thebibliography}

1984:

1985:

1986: \appendix

1987: \section{A brief overview of Coq}

1988:

1989: In this appendix,  we give a quick overview of the  Coq system.  For a

1990: more complete introduction, we refer the reader to~\cite{HueKahPau97}.

1991:

1992: Coq is a generic prover based on type-theory. In this system,

1993: users can define new objects {\it and} prove properties that

1994: derive logically from these definitions. Objects in Coq are

1995: typed and functions are first-class objects using a Lisp-like

1996: notation. The system is distributed with standard libraires that

1997: define types like {\tt nat}, {\tt Z} and {\tt R} which correspond to

1998: the natural numbers, the relative numbers and the reals respectively.

1999: To give an example, the addition for natural number is represented by

2000: the function {\tt plus}

2001: whose type is {\tt nat $\rightarrow$ nat $\rightarrow$ nat}.

2002: A function {\tt plus3} that does the sum of 3 natural numbers is

2003: defined by the following command:

2004: {\small \begin{verbatim}

2005: Definition plus3 := [a,b,c:nat] (plus a (plus b c)).

2006: \end{verbatim}}

2007: \noindent

2008: Arguments between square brackets provide the parameters of the function.

2009: The type of {\tt plus3} is the expected one: {\tt nat $\rightarrow$  nat $\rightarrow$

2010: nat $\rightarrow$ nat}.

2011:

2012: In our formalization, we first define the type {\tt float} that

2013: represents the record composed of the mantissa and the exponent

2014: by the command:

2015: {\small \begin{verbatim}

2016: Record float: Set := Float {Fnum: Z; Fexp: Z}.

2017: \end{verbatim}}

2018: \noindent

2019: This command creates a new type {\tt float}, a constructor {\tt Float}

2020: of type {\tt Z $\rightarrow$  Z $\rightarrow$ float} and two

2021: destructors {\tt Fnum} of type {\tt  float $\rightarrow$ Z} and

2022: {\tt Fexp} of type {\tt  float $\rightarrow$ Z}. So for example,

2023: we can define the function {\tt Fzero} that takes an object of

2024: type {\tt float} and returns an object of same exponent but

2025: with the mantissa set to zero:

2026: {\small \begin{verbatim}

2027: Definition Fzero := [p:float](Float 0 (Fexp p)).

2028: \end{verbatim}}

2029: \noindent

2030: In a similar way, we define the notion of bound. It contains

2031: two integers: one for the mantissa and one for the exponent:

2032: {\small \begin{verbatim}

2033: Record Fbound: Set := Bound {vNum: nat;dExp: nat}.

2034: \end{verbatim}}

2035: \noindent

2036: In order to prove properties that are independent of a particular

2037: bound, we add the command:

2038: {\small \begin{verbatim}

2039: Variable b:Fbound.

2040: \end{verbatim}}

2041: \noindent

2042: With this arbitrary bound, the notion of being bounded is defined as:

2043: {\small \begin{verbatim}

2044: Definition Fbounded := [p:float]

2045:     ((Zle (Zopp (vNum b)) (Fnum p))  /\ (Zle (Fnum p) (vNum b))) /\

2046:     (Zle (Zopp (dExp b)) (Fexp p)).

2047: \end{verbatim}}

2048: \noindent

2049: A pretty-printed version of this definition could be:\\

2050: \vbox{

2051: \vskip5pt

2052: \hbox{{\tt Definition Fbounded := $\lambda$p:float.\,}}

2053: \hbox{ {\tt -(vNum b) $\le$ (Fnum p) $\le$ (vNum b) $\land$ -(dExp b) $\le$ (Fexp p).}}

2054: \vskip5pt

2055: }\\

2056: Now that we have these definitions we can prove a very simple

2057: first theorem:

2058: {\small \begin{verbatim}

2059: Theorem BoundedZero: (p:float) (Fbounded p) ->(Fbounded (Fzero p)).

2060: \end{verbatim}}

2061: \noindent

2062: A pretty-printed version of this theorem could be:\\

2063: \vbox{

2064: \vskip5pt

2065: \hbox{{\tt Theorem BoundedZero: $\forall$p:float.\, (Fbounded p) $\Rightarrow$ (Fbounded (Fzero p)).}}

2066: \vskip5pt

2067: }\\

2068: When the previous command is received by the system, it enters

2069: the proof mode. The statement of the theorem is put om the top

2070: of the goal stack. By applying a command called {\it tactic} we

2071: replace the goal at the top of the stack by a (possibly empty)

2072: list of subgoals. The proof is finished when the stack is empty.

2073: Our initial stack has only one goal. Each goal contains a list

2074: of hypothesis and a conclusion separated by a bar:

2075: {\small \begin{verbatim}

2076: 1 subgoal

2077:

2078:   ============================

2079:    (p:float)(Fbounded p)->(Fbounded (Fzero p))

2080: \end{verbatim}}

2081: \noindent

2082: The first step is to introduce the hypothesis. For this,

2083: we apply the tactic {\tt Intros}.

2084: {\small \begin{verbatim}

2085: Intros p H.

2086: \end{verbatim}}

2087: \noindent

2088: We are now reduce to prove {\tt (Fbounded (Fzero p))} in

2089: the contect where {\tt p} is a {\tt float} and {\tt H} is a

2090: proof that p is bounded:

2091: {\small \begin{verbatim}

2092: 1 subgoal

2093:

2094:   p : float

2095:   H : (Fbounded p)

2096:   ============================

2097:    (Fbounded (Fzero p))

2098: \end{verbatim}}

2099: \noindent

2100: Next we expand the definition of {\tt Fbounded} in the goal. For this,

2101: we use the tactic {\tt Red} and get the new goal:

2102: {\small \begin{verbatim}

2103: 1 subgoal

2104:

2105:   p : float

2106:   H : (Fbounded p)

2107:   ============================

2108:    ((Zle (Zopp (vNum b)) (Fnum (Fzero p)) /\

2109:     (Zle (Fnum (Fzero p)) (vNum b))) /\

2110:     (Zle (Zopp (dExp b)) (Fexp (Fzero p)))

2111: \end{verbatim}}

2112: \noindent

2113: This goal can be simplified using the definition of

2114: {\tt Fzero} by the tactic {\tt Simpl}:

2115: {\small \begin{verbatim}

2116: 1 subgoal

2117:

2118:   p : float

2119:   H : (Fbounded p)

2120:   ============================

2121:    ((Zle (Zopp (vNum b)) 0) /\ (Zle 0 (vNum b))) /\

2122:     (Zle (Zopp (dExp b)) (Fexp p))

2123: \end{verbatim}}

2124: \noindent

2125: We now need to break the conjunctions into separate

2126: subgoals. This is done by the tactic {\tt Split}. As

2127: the conjunction is nested, this tactic needs to be

2128: applied repeatedly:

2129: {\small \begin{verbatim}

2130: Repeat Split.

2131: \end{verbatim}}

2132: \noindent

2133: We get the expected three subgoals.

2134: {\small \begin{verbatim}

2135: 3 subgoals

2136:

2137:   p : float

2138:   H : (Fbounded p)

2139:   ============================

2140:    (Zle (Zopp (vNum b)) 0)

2141:

2142: subgoal 2 is:

2143:    (Zle 0 (vNum b))

2144:

2145: subgoal 3 is:

2146:    (Zle (Zopp (dExp b)) (Fexp p))

2147: \end{verbatim}}

2148: \noindent

2149: The first goal is simple enough to be proved automatically

2150: by the following tactic {\tt Intuition}. We are left with two subgoals.

2151: {\small \begin{verbatim}

2152: 2 subgoals

2153:

2154:   p : float

2155:   H : (Fbounded p)

2156:   ============================

2157:    (Zle 0 (vNum b))

2158:

2159: subgoal 2 is:

2160:    (Zle (Zopp (dExp b)) (Fexp p))

2161: \end{verbatim}}

2162: \noindent

2163: Applying the same tactic to these two subgoals ends the proof.

2164: The final proof script looks like:

2165: {\small \begin{verbatim}

2166: Theorem BoundedZero: (p:float) (Fbounded p) ->(Fbounded (Fzero p)).

2167: Intros p H.

2168: Red.

2169: Simpl.

2170: Repeat Split.

2171: Intuition.

2172: Intuition.

2173: Intuition.

2174: Qed.

2175: \end{verbatim}}

2176: \noindent

2177: Using the composition of tactics ";", it can be shorten to:

2178: {\small \begin{verbatim}

2179: Theorem BoundedZero: (p:float) (Fbounded p) ->(Fbounded (Fzero p)).

2180: Intros p H; Red; Simpl; Repeat Split; Intuition.

2181: Qed.

2182: \end{verbatim}}

2183:

2184: \end{document}

2185:

2186: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

2187: