1: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2: %
3: % File: Expansions/OnLine/Texte/jacm.tex
4: %
5: % Author(s): Marc Daumas
6: % <Marc.Daumas@Lip.Ens-Lyon.Fr>
7: % Laboratoire de l'Informatique du Parallelisme
8: %
9: % Last modified on Fri May 05 15:10:03 GMT MET DST 2000 by daumas
10: %
11:
12: %%%%%%%%%%%%%%%%
13: %%%%% TODO %%%%%
14: %%%%%%%%%%%%%%%%
15: % Rechercher les TODO
16:
17: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
18:
19: \documentclass[a4paper]{article}
20: %\documentclass[a4paper,twoside]{lipreport}
21:
22: %\usepackage{RR}
23:
24: \usepackage[latin1]{inputenc}
25: \usepackage[T1]{fontenc}
26: \usepackage[english]{babel}
27:
28: %\usepackage{styles}
29: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
30: %
31: % File: styles.sty
32: %
33: % Author(s): Marc Daumas
34: % <Marc.Daumas@Lip.Ens-Lyon.Fr>
35: % Laboratoire de l'Informatique du Parallelisme
36: %
37: % Last modified on Thu Jan 28 14:33:02 MET 1999 by daumas
38: % modified on Wed Feb 26 09:02:03 MET 1997 by mdaumas
39: %
40:
41: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
42: %
43: % Detection LaTeX vs. LaTeX2e
44: %
45:
46: \ifx\usepackage\undefined
47: \def\inca#1{\input #1.sty}
48: \def\incb#1#2{\input #1.sty}
49: \else
50: \newcommand{\inca}[1]{\usepackage{#1}}
51: \newcommand{\incb}[2]{\usepackage[#2]{#1}}
52: \fi
53:
54: %\if@compatibility
55: %\def\inca#1{\input #1.sty}
56: %\def\incb#1#2{\input #1.sty}
57: %\fi
58:
59: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
60: %
61: % Styles usuels dans un ordre que LaTeX accepte
62: %
63:
64: \ifx\pdfoutput\undefined
65: \incb{graphics}{dvips}
66: \incb{epsfig}{dvips}
67: \def\dvipsselector#1#2{#1}
68: \def\dvipsoption{dvips}
69: \else
70: \incb{graphics}{pdftex}
71: \def\dvipsselector#1#2{#2}
72: \def\dvipsoption{pdftex}
73: \fi
74:
75: %\inca{tgrind}
76: \inca{boxedminipage,url}
77: \inca{amstext,amsfonts,latexsym}
78: \ifx\usepackage\undefined\else\inca{latexsym}\fi
79: \inca{epic}
80: % \inca{eepic}
81:
82: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
83: %
84: % Macro personnelles
85: %
86:
87: %\inca{macro}
88: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
89: %
90: % File: macro.sty
91: %
92: % Author(s): Marc Daumas
93: % <Marc.Daumas@Lip.Ens-Lyon.Fr>
94: % Laboratoire de l'Informatique du Parallelisme
95: %
96: % Last modified on Wed Mar 01 12:44:17 GMT MET 2000 by daumas
97: % modified on Fri Feb 28 16:22:11 MET 1997 by mdaumas
98: %
99:
100: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
101: %
102: % Quelques alias utiles
103: %
104:
105: \def\ov#1{\overline{#1}}
106: \def\mc#1#2#3{\multicolumn{#1}{#2}{#3}}
107: \def\ita{\vspace{-\itemsep}\item}
108:
109: % \newcommand{\interligne}[1]{\baselineskip = #1pt}
110: % \renewcommand{\baselinestretch}{1.5}
111:
112: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
113: %
114: % Environnements
115: %
116:
117: \ifx\frenchname\undefined
118: \newtheorem{defin}{Definition}
119: \newtheorem{prop}{Proposition}
120: \newtheorem{lem}{Lemma}
121: \newtheorem{theo}{Theorem}
122: \newtheorem{corol}{Corollary}
123: \newtheorem{conj}{Conjecture}
124: \newtheorem{observ}{Observation}
125: \newtheorem{exemple}{Example}
126: \else
127: \newtheorem{defin}{Définition}
128: \newtheorem{prop}{Proposition}
129: \newtheorem{lem}{Lemme}
130: \newtheorem{theo}{Théorème}
131: \newtheorem{corol}{Corollaire}
132: \newtheorem{conj}{Conjecture}
133: \newtheorem{observ}{Observation}
134: \newtheorem{exemple}{Exemple}
135: \fi
136:
137: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
138: %
139: % Ensembles mathématiques usuels
140: %
141:
142: \def\R{\mathbb R}
143: \def\Q{\mathbb Q}
144: \def\Z{\mathbb Z}
145: \def\N{\mathbb N}
146: \def\F{\mathbb F}
147: \def\M{\mathbb M}
148: \def\K{\mathbb K}
149: \def\C{\mathbb C}
150: \def\Fb{\overline{\mathbb F}}
151: \def\Rb{\overline{\mathbb R}}
152:
153: \ifx\mathbb\undefined
154: \def\mathbb{\Bbb}
155: \fi
156:
157: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
158: %
159: % Fonction mathématiques utilisées en informatique
160: %
161:
162: \def\mod{\mathop{\rm mod}\nolimits}
163: \def\ulp{\mathop{\rm ulp}\nolimits}
164: \def\sign{\mathop{\rm sign}\nolimits}
165: \def\trunc{\mathop{\rm Trunc}\nolimits}
166: \def\tronc{\mathop{\rm Tronc}\nolimits}
167:
168: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
169: %
170: % Expressions mathématiques
171: %
172:
173: \def\ceil#1{ \left\lceil #1 \right\rceil }
174: \def\floor#1{\left\lfloor #1 \right\rfloor}
175: \def\fl#1{\left\lfloor \log #1 \right\rfloor}
176: \def\cl#1{\left\lceil \log #1 \right\rceil }
177:
178: \def\CG{\left[} \def\CD{\right]}
179: \def\PG{\left(} \def\PD{\right)}
180: \def\AG{\left\{} \def\AD{\right\}}
181:
182: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
183: %
184: % Modes d'arrondi normalisés IEEE
185: %
186:
187: \def\Rx{\mathop{\Box}\nolimits}
188:
189: \def\RO{\mathop{\bowtie}\nolimits}
190: \def\RU{\mathop{\triangle}\nolimits}
191: \def\RN{\mathop{\circ}\nolimits}
192: \def\RD{\mathop{\nabla}\nolimits}
193:
194: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
195: %
196: % Inclusion de figures
197: %
198:
199: \newenvironment{pfigure}{\begin{figure}}{\end{figure}}
200: \def\figdir{Fig} \def\figepsdir{\figdir}
201: \def\figext{\dvipsselector{pstex_t}{pdftex_t}}
202: \def\epsdir{../EPS}
203:
204: \newcommand{\figtex}[2]{%
205: \begin{pfigure}
206: \begin{center}
207: \input \figdir/#1.\figext
208: \end{center}
209: \caption{#2}
210: \label{fig/#1}
211: \label{fig:#1}
212: \end{pfigure}
213: }
214:
215: \newcommand{\figeps}[2]{%
216: \begin{pfigure}
217: \centerline{\epsfig{file=\epsdir/#1.eps}}
218: \caption{#2}
219: \label{fig/#1}
220: \label{fig:#1}
221: \end{pfigure}
222: }
223:
224: \newcommand{\figpic}[2]{%
225: \begin{pfigure}
226: \begin{center}
227: \input \figdir/#1.eepic
228: \end{center}
229: \caption{#2}
230: \label{fig/#1}
231: \label{fig:#1}
232: \end{pfigure}
233: }
234:
235: \newcommand{\figinc}[2]{%
236: \begin{pfigure}
237: \begin{center}
238: \input \figdir/#1.tex
239: \end{center}
240: \caption{#2}
241: \label{fig/#1}
242: \label{fig:#1}
243: \end{pfigure}
244: }
245:
246: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
247: %
248: % Dessins
249: %
250:
251: \newcommand{\pictex}[1]{%
252: \begin{center}
253: \input \figdir/#1.\figext
254: \end{center}
255: }
256:
257: \newcommand{\piceps}[1]{\centerline{\epsfig{file=\epsdir/#1.eps}}}
258:
259: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
260: %
261: % Encarts, listings
262: %
263:
264: \newcommand{\boxlabel}{}
265: \newcommand{\boxcaption}{}
266:
267: \newenvironment{encart}[2]{%
268: \fboxsep=5mm
269: \renewcommand{\boxlabel}{#1}
270: \renewcommand{\boxcaption}{#2}
271: \begin{pfigure}
272: \begin{center}
273: \begin{boxedminipage}{.9\textwidth}
274: }{%
275: \vspace{-1.5\baselineskip}
276: \hspace*{\fill}
277: \end{boxedminipage}
278: \end{center}
279: \caption{\boxcaption}
280: \label{enc/\boxlabel}
281: \end{pfigure}
282: }
283:
284: \newenvironment{encart2}[2]{%
285: \fboxsep=5mm
286: \renewcommand{\boxlabel}{#1}
287: \renewcommand{\boxcaption}{#2}
288: \begin{pfigure}
289: \begin{center}
290: \begin{boxedminipage}{.475\textwidth}
291: }{%
292: \vspace{-1.5\baselineskip}
293: \hspace*{\fill}
294: \end{boxedminipage}
295: \end{center}
296: \caption{\boxcaption}
297: \label{enc/\boxlabel}
298: \end{pfigure}
299: }
300:
301: \usepackage{times,palatino}
302: %\usepackage{fullpage}
303: %\usepackage{rri}
304: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
305: %
306: % File: rri.sty
307: %
308: % Author(s): Marc Daumas
309: % <Marc.Daumas@Lip.Ens-Lyon.Fr>
310: % Laboratoire de l'Informatique du Parallelisme
311: %
312: % Last modified on Wed Mar 01 12:44:17 GMT MET 2000 by daumas
313: % modified on Fri Feb 28 16:22:11 MET 1997 by mdaumas
314: %
315:
316: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
317: %
318: % Environnements
319: %
320:
321: \ifx\titleLIP\undefined
322: \ifx\RRetitle\undefined
323: \newcommand{\RRItitle}[1]{}
324: \newcommand{\RRItitre}[1]{}
325: \newcommand{\RRIdate}[1]{}
326: \newcommand{\RRIauthor}[1]{}
327: \newcommand{\RRIthead}[1]{}
328: \newcommand{\RRIahead}[1]{}
329: \newcommand{\RRIabstract}[1]{\newcommand{\RRIpre}{\begin{abstract}#1\end{abstract}}}
330: \newcommand{\RRIresume}[1]{}
331: \newcommand{\RRIkeywords}[1]{}
332: \newcommand{\RRImotscles}[1]{}
333: \newcommand{\RRIbegin}{}
334: \newcommand{\RRInumber}[1]{}
335: \newcommand{\RRItheme}[1]{}
336: \newcommand{\RRIprojet}[1]{}
337: \newcommand{\RRIno}[1]{#1}
338: \newcommand{\RRIyes}[1]{}
339: \newcommand{\RRIinria}[1]{}
340: \newcommand{\RRIlip}[1]{}
341: \else
342: \newcommand{\RRItitle}[1]{\RRetitle{#1}}
343: \newcommand{\RRItitre}[1]{\RRtitle{#1}}
344: \newcommand{\RRIdate}[1]{\RRdate{#1}}
345: \newcommand{\RRIauthor}[1]{\RRauthor{#1}}
346: \newcommand{\RRIthead}[1]{\titlehead{#1}}
347: \newcommand{\RRIahead}[1]{\authorhead{#1}}
348: \newcommand{\RRIabstract}[1]{\RRabstract{#1}}
349: \newcommand{\RRIresume}[1]{\RRresume{#1}}
350: \newcommand{\RRIkeywords}[1]{\RRkeyword{#1}}
351: \newcommand{\RRImotscles}[1]{\RRmotcle{#1}}
352: \newcommand{\RRIbegin}{\URRhoneAlpes \makeRR}
353: \newcommand{\RRIpre}{}
354: \newcommand{\RRInumber}[1]{}
355: \newcommand{\RRItheme}[1]{\RRtheme{#1}}
356: \newcommand{\RRIprojet}[1]{\RRprojet{#1}}
357: \newcommand{\RRIno}[1]{}
358: \newcommand{\RRIyes}[1]{#1}
359: \newcommand{\RRIinria}[1]{#1}
360: \newcommand{\RRIlip}[1]{}
361: \newcommand{\french}{}
362: \newcommand{\english}{}
363: \fi
364: \else
365: \usepackage{fancyheadings}
366: \newcommand{\RRItitle}[1]{\titleLIP{#1}}
367: \newcommand{\RRItitre}[1]{}
368: \newcommand{\RRIdate}[1]{\dateLIP{#1}}
369: \newcommand{\RRIauthor}[1]{\authorLIP{#1}}
370: \newcommand{\RRIthead}[1]{\newcommand{\RRItheadv}{#1}}
371: \newcommand{\RRIahead}[1]{\newcommand{\RRIaheadv}{#1}}
372: \newcommand{\RRIabstract}[1]{\abstractLIP{#1}}
373: \newcommand{\RRIresume}[1]{\resumeLIP{#1}}
374: \newcommand{\RRIkeywords}[1]{\keywordsLIP{#1}}
375: \newcommand{\RRImotscles}[1]{\motsclesLIP{#1}}
376: \newcommand{\RRIbegin}{\rapportLIP\pagenumbering{arabic}\pagestyle{fancy}\rhead[\RRIaheadv]{\thepage}\lhead[\thepage]{\RRItheadv}\cfoot{}}
377: \newcommand{\RRIpre}{}
378: \newcommand{\RRInumber}[1]{\numberLIP{#1}}
379: \newcommand{\RRItheme}[1]{}
380: \newcommand{\RRIprojet}[1]{}
381: \newcommand{\RRIno}[1]{}
382: \newcommand{\RRIyes}[1]{#1}
383: \newcommand{\RRIinria}[1]{}
384: \newcommand{\RRIlip}[1]{#1}
385: \fi
386:
387:
388: \usepackage{color,amssymb}
389:
390: \bibliographystyle{plain}
391:
392: \RRItitle{Computer validated proofs of \\ a toolset for adaptable arithmetic}
393:
394: \RRItitre{Boîte à outils pour l'arithmétique numérique \\ validée par un preuve sur ordinateur}
395:
396: \RRIdate{Janvier 2001}
397:
398: \RRIauthor{%
399: Sylvie Boldo\RRIinria{, LIP} \\
400: Marc Daumas\RRIinria{, CNRS} \\
401: Claire Moreau-Finot\RRIinria{, LIP} \\
402: Laurent Théry\RRIinria{, INRIA \\~}
403: }%
404:
405: \RRIthead{Computer validated proofs of a toolset for adaptable arithmetic}
406: \RRIahead{Sylvie Boldo, Marc Daumas, Claire Moreau-Finot and Laurent Théry}
407:
408: \RRInumber{2001-01}
409:
410: \RRIabstract{%
411: Most existing implementations of multiple precision arithmetic
412: demand that the user sets the precision {\em a priori}. Some
413: libraries are said adaptable in the sense that they dynamically
414: change the precision of each intermediate operation individually to
415: deliver the target accuracy according to the actual inputs. We
416: present in this text a new adaptable numeric core inspired both from
417: floating point expansions and from on-line arithmetic.
418:
419: The numeric core is cut down to four tools. The tool that contains
420: arithmetic operations is proved to be correct. The proofs have been
421: formally checked by the Coq assistant. Developing the proofs, we
422: have formally proved many results published in the literature and we
423: have extended a few of them. This work may let users (i) develop
424: application specific adaptable libraries based on the toolset and /
425: or (ii) write new formal proofs based on the set of validated facts.
426: }%
427:
428: \RRIresume{%
429: La plupart des codes à précision multiple existants demandent que
430: l'utilisateur spécifie la précision des calculs {\em a priori}. Une
431: solution est d'utiliser pour les calculs intermédiaires la précision
432: la plus élevée pour atteindre la précision demandée pour le
433: résultat. On dit que certaines bibliothèques sont adaptatives dans
434: le sens qu'elles changent dynamiquement la précision de chacun des
435: calculs intermédiaires en fonction des valeurs effectives des
436: entrées pour atteindre la précision demandée pour le résultat. Nous
437: présentons dans ce texte un nouveau noyau numérique adaptatif
438: inspiré à la fois des expansions de nombres flottants et de
439: l'arithmétique en-ligne.
440:
441: Le noyau numérique se décompose en cinq outils. Le premier outil
442: qui contient de nombreuses opérations arithmétiques est prouvé. Les
443: preuves ont été validées formellement par l'assistant Coq. En
444: développant les preuves, nous avons redéveloppé formellement de
445: nombreux résultats déjà publiés dans la littérature et nous en avons
446: étendu certains. Ce travail permettra aux utilisateurs de (i)
447: développer des bibliothèques adaptatives spécifiques à une
448: application basées sur la boîte à outils et / ou (ii) écrire de
449: nouvelles preuves formelles basées sur notre socle de faits validés.
450: }%
451:
452: \RRIkeywords{Multiple precision, expansion, on-line, formal proof, Coq.}
453:
454: \RRImotscles{Précision multiple, expansion, en-ligne, preuve formelle, Coq.}
455:
456: \RRItheme{2}
457:
458: \RRIprojet{Arénaire}
459:
460: \RRIinria{\RRnote{%
461: This text is also available as a research report of the
462: Laboratoire de l'Informatique du Parallélisme {\tt
463: http://www.ens-lyon.fr/LIP}.
464: }}%
465:
466: \begin{document}
467:
468: \RRIbegin
469:
470: \newtheorem{proc}{Lemma}
471: \newcommand{\ptys}[2]{{\noindent \bf{#1}}{\it{#2}}}
472:
473: \title{%
474: {\bf
475: Computer validated proofs of \\
476: a toolset for adaptable arithmetic}\thanks{%
477: Authors may be reached via e-mail at Sylvie.Boldo@ENS-Lyon.Fr,
478: Marc.Daumas@ENS-Lyon.Fr, Claire.Moreau@ENS-Lyon.Fr and
479: Laurent.Thery@INRIA.Fr. \RRIlip{This text is also available as a
480: research report of the Institut National de Recherche en
481: Informatique et en Automatique {\tt http://www.inria.fr}.}
482: }%
483: }%
484:
485: \author{
486: {\bf Sylvie Boldo, Marc Daumas, Claire Moreau-Finot} \\
487: Laboratoire de l'Informatique du Parallélisme \\
488: UMR 5668 - CNRS - ENS de Lyon - INRIA \\[24pt]
489: {\bf Laurent Théry} \\
490: INRIA Nice---Sophia-Antipolis
491: }%
492: \date{}
493:
494: \maketitle
495: \RRIpre
496:
497: \section*{Introduction}
498:
499: We will first present examples of numerical computations coming from
500: four fields of application. With each example we introduce new
501: properties, new goals and some needed references. As the subject will
502: become clear, we will present the contributions of this text and its
503: summary.
504:
505: \subsection*{Motivation}
506:
507: As an undergraduate student, we have all learnt to compute a quantity
508: with a few significant digits first as 5 miles is about 8 kilometers.
509: If the question is ``Are 4.995 miles more than 8 kilometers?'', we are
510: close to a threshold. To answer the question, we would compute more
511: (less significant) digits to find that 4.995 miles are a little over
512: 38 meters longer than 8 kilometers. We might first do a
513: multiplication of 5 by 1.6 and then the exact multiplication of 4.995
514: by 1.609344. However, we could save that 5 times 1.6 is exactly 8 and
515: afterward just multiply 5 by 9 to obtain 45 meters with a small
516: additional value that is less than 5 meters. That answers the
517: question since 0.005 miles is less than 10 meters.
518:
519: Numerical analysts have long designed algorithms that react to the
520: accuracy aimed by the user. Countless specific algorithms reduce the
521: error on the final result by iterative refinement in solving algebraic
522: equations, linear systems, ODEs, PDEs and so on \cite{Hig96}. Such
523: methods are so robust that the final solution can be very accurate
524: even if the first attempt before any refinement is not close.
525:
526: All the above-mentioned solutions assume that the intermediate
527: arithmetic operations are performed up to a fixed level of precision.
528: This level, usually the one given by machine double precision
529: arithmetic, is used for all the operations and is sufficient to store
530: the result to the accuracy aimed by the user. Recent work presented
531: in \cite{Hig96,SorTan91,Bar93} proposes new enhanced routines if two
532: levels of precision are available on the machine. The second level can
533: be obtained by a careful use of the tools presented in this article
534: \cite{BolDau01}. All these algorithms use the machine arithmetic as
535: an error prone tool that must be circumvented by mathematical
536: analysis.
537:
538: % For the solutions presented above, adaptability is defined at the {\em
539: % application level}, we will see in the remaining of this text, {\em
540: % arithmetic level} adaptability.
541:
542: \subsection*{Adaptability {\em versus} multiple precision}
543:
544: A numerical algorithm is only slightly modified to accommodate
545: adaptable arithmetic or multiple precision arithmetic. With multiple
546: precision arithmetic, each operator is fired only once and the result
547: is computed with the precision fixed at compile time. Static error
548: analysis is used to guarantee that the final result is known to the
549: accuracy specified by the user. Being static, the error analysis is
550: often too pessimistic. As it is readily available, multiple precision
551: arithmetic is often chosen by users.
552:
553: When a code has been modified to become adaptable, every single
554: operator is started with a very limited precision. The runtime system
555: freezes the operator but it is able to resume its work when necessary.
556: The program is not ended unless the final result is known to the
557: accuracy specified by the user. The evaluation continues until this is
558: the case, with the system increasing automatically the working
559: precision of each operator independently.
560:
561:
562: Coherency is key to the field of computational geometry where one
563: single tiny error may change a convex hull to a non-convex, possibly
564: non-planar, graph. Numerical routines are used to answer such
565: predicates as ``Is a point in a circle defined by 3 other points?''.
566: A Boolean answer cannot be approximated, it is either right or wrong,
567: and one single wrong answer might lead a program to an abstract state
568: that cannot occur in Euclidian geometry. Recent works have proposed
569: clever implementations of multiple precision arithmetic
570: \cite{ForVan93,KarLiPecYap99} to answer correctly such questions.
571: They reinforce the existing set of general purpose multiple precision
572: libraries \cite{Bre78a,SerVuiHer89,Bai93,Bai95,Gra2Kb}. Some marginal
573: work does implement iterative refinements of the geometric predicates
574: \cite{AvnBoiDevPreYvi97} or change the algorithm to work correctly
575: with limited predicates \cite{BoiPre2K}.
576:
577: Adaptability is used for our last example. It is the central scheme of
578: Ziv's paper \cite{Ziv91} where the author implements correctly rounded
579: elementary functions (circular and exponential). Correct rounding for
580: the floating point numbers is mathematically defined by two
581: international standards (ANSI-IEEE-ISO~754 and 854
582: \cite{Ste.81,Ste.87,Gol91,CodKar.84}). It is a monotonous projection
583: from the real set to the set of the floating point numbers. If all
584: the numbers of a given interval round to the same floating point
585: value, you can safely round this interval to the common result.
586: Otherwise, you cannot round the interval.
587:
588: An approximated algorithm such as the ones used for the evaluation of
589: the elementary functions \cite{Mul97} does not produce directly the
590: rounded result but it computes an approximated result and an error
591: bound. This pair defines an interval that contains the exact result.
592: \begin{itemize}
593: \item If the interval is narrow, it rounds to one single floating
594: point value that is the correctly rounded value of the exact
595: elementary function.
596: \item If we discover from the current approximation that the interval
597: contains a discriminating point, the approximation must be refined
598: to reduce its width. This process continues until a satisfying
599: approximation is obtained to round the interval to one single value.
600: The set of the discriminating point is the set of the representable
601: numbers for the directed roundings and the set of the midpoints when
602: rounding to the nearest.
603: \end{itemize}
604:
605: With the first iteration of Ziv's scheme, machine precision is used.
606: For the result interval to get sharper, the approximation scheme is
607: only slightly modified but the target precision is increased and the
608: arithmetic operations become increasingly slow. Cases where a high
609: precision result is needed are very infrequent and the running time
610: seen by a user will most probably be the time for the first or the
611: second iteration.
612:
613: Adaptable behaviour is sometime recreated with Mathematica or Maple.
614: These two softwares let the user change the working precision and they
615: provide some criteria to estimate the final accuracy of many numerical
616: routines \cite{ChaGedGonMonWat91}. Users set the number of digits of
617: the intermediate precision after a few educated random attempts.
618:
619: \subsection*{Prior art}
620:
621: \paragraph{Adaptability}
622:
623: Some few authors already presented a general purpose adaptable
624: arithmetic library
625: \cite{Boe87,Men94,Men95,BerDau97,MicMor97,BurFleMehSch99}, but some
626: others stopped with one or two adaptable attempts followed by an exact
627: evaluation \cite{ForVan93,She97}. These last solutions are
628: appropriate in many cases because the first few attempts are fast and
629: they succeed by far in the most frequent case for many applications
630: \cite{DevPre98}. These authors where forced to stop because they used
631: a data structure that is not appropriate for adaptability. They
632: forced one or two rounds of adaptability with great efforts.
633:
634: \paragraph{Floating point expansion}
635:
636: This data structure was introduced by Priest \cite{Pri91} based on
637: some earlier operations \cite{Knu69,Knu73}. Actually, M{\o}ller and
638: Dekker were the first in the past to propose such techniques to extend
639: precision on the floating point unit \cite{Mol65a,Mol65b,Dek71}.
640: Kahan and Pichat applied similar techniques for other purposes
641: \cite{Kah65,Pic72}.
642:
643:
644: Fast algorithms, tightly connected to the machines, are possible with
645: IEEE-754 compatible commercial floating point units. Shewchuk was the
646: first one to present a working library available on the net with an
647: actual application to computational geometry \cite{She96,She97}.
648: Assuming that the floating point unit is IEEE compatible gives us a
649: powerful set of axiomatic properties on floating point operations. We
650: have proposed in the past some algorithms and their implementation for
651: multiplication and division on expansions
652: \cite{Dau99a,DauFin98,DauFin99a}. Developments are still undergone as
653: new research teams get interested by this approach \cite{Sei2K}. Our
654: contribution forces us to define a new kind of expansions. We call
655: them pseudo-expansions.
656:
657: % This definition yields a more redundant notation
658: % \cite{Mat82,NieKor99}. Although the proof of \cite{Maz93a} does not
659: % apply here, it is sensible that more redundancy in the notation eases
660: % the implementation of most significant digit first (MSDF) arithmetic
661: % \cite{Avi61,Mul94a,Fro91}.
662:
663: \paragraph{On-line arithmetic}
664:
665: We will present in this text that we can reduce the growth of the
666: running time by setting up an appropriate data structure that will not
667: restart an adaptable evaluation from scratch as the intermediate
668: precision is modified.
669:
670: This data structure is inspired by on-line arithmetic where
671: adaptability is natural. On line arithmetic was targeted in the past
672: to hardware design with small radices (typically 2 or 4)
673: \cite{TriErc77,CorDupHocMul91,LouErc93,DauMulVui94,DauMulTis97} but it
674: has proved to be suitable for software applications
675: \cite{MazMer93,BerDau97}.
676:
677: \subsection*{Contributions}
678:
679: The first contribution of this work is a set of primitives available
680: soon on the Internet through our home web site\footnote{{\tt
681: http://www.ens-lyon.fr/LIP/Arenaire/}.} and through the {\sc
682: netlib} repository\footnote{{\tt http://www.netlib.org/} among many
683: access protocols.}. A toy example shows that high level programming
684: with object interface is too slow for the kernel of numerical
685: softwares \cite{Gog2K}. We ran the same FLOPS routines three times
686: \cite{Don92,Don99}. The first run was performed with object automatic
687: allocation and destruction. The second run used in-line function calls
688: and no object allocation. The final code used explicit code in-lining.
689: Removing object programming increased the performance of 133\% and
690: using code in-lining yielded another 56\%. Our contributed building
691: blocks are pieces of code assembled by the designer according to its
692: needs. They present a straight interface to get a good trade off
693: between simplicity and speed.
694:
695: The second contribution is a set of validated proofs. To mechanise
696: our proofs, we have been using the Coq proof
697: assistant~\cite{HueKahPau97}. Systems like Coq allow the user to
698: define new objects and to derive consequences of these definitions
699: formally. The language of Coq is based on a higher-order logic. With
700: such an expressive logic, it is possible to state properties in their
701: most general form. For example, universal quantification has been
702: used to state properties that are true for an arbitrary format or an
703: arbitrary rounding mode. Proofs are built interactively using
704: high-level tactics. At the end of each proof, Coq records a proof
705: object that contains all the details of the derivation and ensures
706: that the theorem is valid. Theorem provers have already been
707: successfully used to mechanically check the correctness of
708: floating-point algorithms~\cite{Rus98,Har97a}.
709:
710: Our formal development is freely available on the
711: Internet\footnote{{\tt http://www-sop.inria.fr/lemme/AOC/coq/}.} and
712: it will soon be available as a Coq contribution\footnote{{\tt
713: http://coq.inria.fr/}.}. A clickage map of the hierarchy makes it
714: possible to browse into the different components. At the moment, it
715: is 22000 line long and it contains 106 Definitions and 545 Theorems.
716: No proof is presented in detail in this manuscript, but we outline the
717: main results used for the proofs. Since the proofs are mechanically
718: validated, the reader is sure that the annoying special cases have
719: been checked and that all the conditions of the properties used in the
720: proof have been met. \RRIyes{All the Definitions and Theorems of this
721: development are presented in annexes A through W.} For each
722: mentioned theorem we give its name as it appears in the formal
723: development and the file where it is proved.
724:
725: We first present definitions and validated properties for the floating
726: point numbers, the expansions and the pseudo-expansions. Section 2
727: presents the addition, multiplication and division toolset. This
728: presentation ends with concluding remarks in the last Section.
729:
730: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
731:
732: \newcommand{\rr}{\mathbb{R}}
733: \newcommand{\nn}{\mathbb{N}}
734: \newcommand{\ideal}[1]{<\!\!#1\!\!>}
735: \newcommand{\lcm}[2]{#1\!\hbox{\^{}}#2}
736:
737: \newlength{\ppwd}
738: \newlength{\ppht}
739: \newcommand{\rpp}[1]{
740: \settowidth{\ppwd}{#1}
741: \settoheight{\ppht}{#1}
742: \raisebox{\ppht}[0pt]{\makebox[0pt][l]{
743: \hspace{.15\ppwd}{\tiny \tt >}}}#1}
744: \newcommand\pp{\rpp{+}}
745: \newcommand{\rppa}[1]{
746: \settowidth{\ppwd}{#1}
747: \settoheight{\ppht}{#1}
748: \raisebox{\ppht}[0pt]{\makebox[0pt][r]{
749: {\tiny \tt >}\hspace{-5pt}}}#1}
750: \newcommand\ppa{\rppa{+}}
751: %\newcommand\pp{\hbox{\raise6pt\hbox{${\hbox{\raise-5pt\hbox{\tiny \tt >}}}\atop+$}}}
752: \newcommand{\limp}{\Rightarrow}
753: \newcommand{\reduce}{\rightarrow^{\ }_S}
754: \newcommand{\reducep}{\rightarrow^+_S}
755: \newcommand{\reduces}{\rightarrow^*_S}
756: \newcommand{\mon}{M_n}
757: \newcommand{\lm}{\le_{M_n}}
758: \newlength{\hsbw}
759: \newcommand\MSpacing{13pt}
760: \newenvironment{cboxed}{\begin{flushleft}
761: \setlength{\hsbw}{\textwidth}
762: \addtolength{\hsbw}{-\arrayrulewidth}
763: \addtolength{\hsbw}{-\tabcolsep}
764: \begin{tabular}{@{}|c@{}|@{}}\hline
765: \begin{minipage}[b]{\hsbw}
766: \vspace*{.06in}
767: \begingroup\small\baselineskip\MSpacing}{\endgroup\end{minipage}\\ \hline
768: \end{tabular}
769: \end{flushleft}}
770: \newcommand{\term}{{\it term}}
771: \newcommand{\pol}{{\it poly}}
772: \newcommand{\grob}{\hbox{\it Gröbner}}
773: \newcommand{\cset}{{\it Set}}
774: \newcommand{\cprop}{{\it Prop}}
775: \newcommand{\cfun}{\rightarrow}
776: \newcommand{\zerop}[1]{{\it zeroP}(#1)}
777: \newcommand{\lpol}{({\it list}\, \pol)}
778: \newcommand{\lterm}{({\it list}\, \term)}
779: \newcommand{\cano}{\mathcal{C}}
780: \newcommand{\olist}{\mathcal{O}}
781: \newcommand{\cforall}[3]{\forall #1\!\!:#2.\, #3}
782: \newcommand{\cforalln}[2]{\forall #1\!\!:#2}
783: \newcommand{\clambda}[3]{\lambda #1\!\!:#2.\, #3}
784: \newcommand{\cexists}[3]{\exists #1\!\!:#2.\, #3}
785: \newcommand{\cin}[2]{#1\,\,{\it in}\,#2}
786: \newcommand{\cdef}[3]{\hbox{{\bf Definition}$\,\, #1{\bf :}\, #2\,{\bf :=}\, \,#3$.}}
787: \newcommand{\cdefq}[2]{\hbox{{\bf Definition}$\,\, #1\,{\bf :=}\, \,#2$.}}
788: \newcommand{\cdefqn}[2]{\hbox{{\bf Definition}$\,\, #1\,{\bf :=}\, \,#2$}}
789: \newcommand{\cdeffn}[3]{\hbox{{\bf Definition}$\,\, #1{\bf :}\, #2\,{\bf :=}\,\, #3$}}
790: \newcommand{\cdefp}[2]{\hbox{{\bf Definition}$\,\, #1\, {\bf :}\, \, #2$.}}
791: \newcommand{\cdefpn}[2]{\hbox{{\bf Definition}$\,\, #1\, {\bf :}\, \, #2$}}
792: \newcommand{\cdefrn}[2]{\hbox{{\bf Definition}$\,\, #1\, {\bf :}\, \, #2\,{\bf :=}$}}
793: \newcommand{\cthm}[2]{\hbox{{\bf Theorem}$\,\, #1 {\bf :}\, #2$.}}
794: \newcommand{\cthmn}[2]{\hbox{{\bf Theorem}$\,\, #1 {\bf :}\, #2$}}
795: \newcommand{\cthmrn}[1]{\hbox{{\bf Theorem}$\,\, #1 {\bf :}$}}
796: \newcommand{\clem}[2]{\hbox{{\bf Lemma}$\,\, #1{\bf :}\quad #2$.}}
797: \newcommand{\clemn}[2]{\hbox{{\bf Lemma}$\,\, #1{\bf :}\quad #2$}}
798: \newcommand{\cdefn}[2]{\hbox{{\bf Definition}$\,\, #1{\bf :}\quad #2\, {\bf :=}$}}
799: \newcommand{\cfix}[3]{\hbox{{\bf Fixpoint}$\,\, #1\,{\bf [}#2{\bf ]:}\quad #3\, {\bf :=}$}
800: }
801: \newcommand{\cfixn}[4]{\hbox{{\bf Fixpoint}$\,\, #1\,{\bf [}#2{\bf ]:}\quad #3\, {\bf :=} #4$}
802: }
803: \newcommand{\caxiom}[2]{\hbox{{\bf Axiom}$\,\, #1{\bf :}\quad #2$.}}
804: \newcommand{\caxiomn}[2]{\hbox{{\bf Axiom}$\,\, #1{\bf :}\quad #2$}}
805: \newcommand{\cparam}[2]{\hbox{{\bf Parameter}$\,\, #1{\bf :}\quad #2$.}}
806: \newcommand{\cparamn}[2]{\hbox{{\bf Parameter}$\,\, #1{\bf :}\quad #2$}}
807: \newcommand{\ccas}[1]{\hbox{$\quad${\bf Cases}$\,\, #1\,\,{\bf of}$}}
808: \newcommand{\cend}{\hbox{$\quad${\bf end}}}
809: \newcommand{\cp}[1]{\hbox{#1.}}
810: \newcommand{\ccasf}[2]{\hbox{$\quad \ \, #1\,\, {\bf \Longrightarrow}\,\, #2$}}
811: \newcommand{\ccasi}[2]{\hbox{$\quad |\, #1\,\, {\bf \Longrightarrow}\,\, #2$}}
812: \newcommand{\clet}[2]{\hbox{$\quad${\bf let }$\, #1=#2\, {\bf in}$}}
813: \newcommand{\clin}[1]{\hbox{$\quad #1$}}
814: \newcommand{\cvar}[2]{\hbox{{\bf Variable}$\,\, #1{\bf :}\, #2$.}}
815: \newcommand{\chyp}[2]{\hbox{{\bf Hypothesis}$\,\, #1{\bf :}\, #2$.}}
816: \newcommand{\chypn}[2]{\hbox{{\bf Hypothesis}$\,\, #1{\bf :}\, #2$}}
817: \newcommand{\cind}[2]{\hbox{{\bf Inductive}$\,\, #1{\bf :}\, #2 :=$}}
818: \newcommand{\cindu}[3]{\hbox{{\bf Inductive}$\,\, #1[#2]{\bf :}\, #3 :=$}}
819: \newcommand{\cicn}[2]{\hbox{${\bf |}\quad #1\,{\bf :}\,#2$}}
820: \newcommand{\cicnf}[2]{\hbox{$\ \quad #1\,{\bf :}\,#2$}}
821: \newcommand{\cicnl}[2]{\hbox{${\bf |}\quad #1\,{\bf :}\,#2$.}}
822: \newcommand{\cicnfl}[2]{\hbox{$\ \quad #1\,{\bf :}\,#2$}}
823: \newcommand{\cic}[1]{\hbox{${\bf |}\quad #1$}}
824: \newcommand{\cicf}[1]{\hbox{$\ \quad #1$}}
825: \newcommand{\cicl}[1]{\hbox{${\bf |}\quad #1$.}}
826: \newcommand{\cicfl}[1]{\hbox{$\ \quad #1$.}}
827:
828: \section{Definitions and validated properties}
829:
830: \subsection{Floating point numbers}
831:
832: An IEEE double precision floating point number is built from 3 binary
833: fields: the sign (1 bit), the fraction (52 bits) and the biased
834: exponent (11 bits). Its normal interpretation $x$ as a rational
835: number is given below.
836: $$
837: \begin{array}{r c l}
838: \text{mantissa} & = & 1.\text{fraction} \\
839: \text{exponent} & = & \text{biased exponent} - \text{bias} \\
840: x & = & (-1)^{\text{sign}} \times \text{mantissa} \times 2^{\text{exponent}}
841: \end{array}
842: $$
843:
844: %\figtex{fp}{Binary fields of a floating point number.}
845:
846: We will use in this text the more general notation $x= n \times
847: \beta^{e}$ to define a floating point number of radix $\beta$ with two
848: integers, the significand $n$ and the amplitude $e$. This relation
849: gives a signification to any pair $(n, e)$. We will use the name
850: floating point number or {\bf float} to represent such a pair. Two
851: floats $(n, e)$ and $(n', e')$ are equivalent if they have the same
852: value as real numbers, we write $(n, e) \equiv (n', e')$. We extend
853: this notation to the represented real value and we may write $n \times
854: \beta^e \equiv (n ,e)$. We will also use the equivalence for
855: expression whenever there is no ambiguity. No order is defined on the
856: floats and inequalities are given implicitly on their interpretation.
857:
858: Given a pair $(n_{\max}, e_{\min}) \in \N_*^2$, we say that a float
859: $(n, e)$ is {\bf bounded} if and only if it satisfies
860: \begin{equation}
861: \label{eqn/bounded}
862: |n| \le n_{\max}
863: ~~~~~ \text{and} ~~~~~
864: - e_{\min} \le e.
865: \end{equation}
866:
867: We do not use the overflow bound of $e_{\max}$. The IEEE standard
868: defines an overflow when the rounded value of the result exceeds the
869: largest representable number. It implies that the rounding is
870: defined on a data type that does not have an upper bound for the
871: exponent. All the results of this text are true provided neither
872: overflow nor exact infinity (division by 0) occurred in the
873: computation. If this is not the case, infinity and {\em not a number}
874: quantities will proliferate in the results.
875:
876: The bound for an IEEE standard inspired representation with $p$ digits
877: of mantissa and $r$ digits of exponent is given Table~\ref{tab/ieee}.
878: We will use the common fraction notation for the examples although the
879: integer notation is used for the proofs. We use only radix
880: independent generic IEEE standard inspired bounds in the rest of this
881: text. A bounded float is {\bf normal} if $|n| \times \beta >
882: n_{\max}$. It is {\bf de-normal} if $|n| \times \beta \le n_{\max}$. A
883: de-normal float such that $e = -e_{\min}$ is called {\bf subnormal}. We
884: define the {\bf ulp} value as one unit in the last place of 1 that
885: equals $\beta / (n_{\max} + 1)$.
886:
887: \begin{table}
888: \caption{IEEE standard inspired bounds}
889: \label{tab/ieee}
890: \begin{center}
891: \begin{tabular}{| c | c | c | c |}\hline
892: & Radix independent & \mc{2}{| c |}{$\beta = 2$} \\
893: & (Generic) & Single & Double \\ \hline
894: $p$ & Mantissa width & 24 & 53 \\
895: $n_{\max}$ & $\beta^p - 1$ & 16~777~215 & 9~007~199~254~740~991 \\
896: $r$ & Exponent width & 8 & 11 \\
897: bias & $\lceil \beta^r / 2 \rceil - 1$ & 127 & 1023 \\
898: $e_{\min}$ & $\lceil \beta^r / 2 \rceil + p - 3$ & 149 & 1074 \\
899: $e_{\max}$ & $\lfloor \beta^r / 2 \rfloor - p$ & 124 & 971 \\
900: ulp & $\beta^{1 - p}$ & $1.19 \times 10^{-7}$ & $2.22 \times 10^{-16}$ \\ \hline
901: \end{tabular}
902: \end{center}
903: \end{table}
904:
905: \begin{theo}[FnormalizeCanonic and FcanonicUnique in Form]
906: ~ Any bounded float has one unique equivalent float $(n, e)$ that is
907: normal or subnormal. This float is the direct transcription of the
908: number in machine with
909: $$
910: \begin{array}{r c l}
911: n & = & \text{mantissa} \times \beta^{p - 1} \\
912: e & = & (\text{biased exponent} - \text{biais}) - p + 1. \\
913: \end{array}
914: $$%
915: It is later referred in this text as the {\bf canonical}
916: representation.
917: \end{theo}
918:
919: The IEEE standard describes four rounding modes but the rounding to
920: the nearest floating point number is the rounding mode used by default
921: in most computers. A rounding is generally defined as a monotonous
922: projection onto the set of the canonical floats. Considering all the
923: floats rather than just the canonical ones, we would rather define it
924: as a projection onto subsets of bounded floats.
925:
926: For any real number $x$, we define its bounded floor $\lfloor x
927: \rfloor$ as the class of the maximum bounded floats smaller than $x$.
928: It means that a bounded float $(n, e)$ is in $\lfloor x \rfloor$ if
929: and only if, for all the bounded floats $(n', e')$
930: $$
931: (n', e') \le x ~~~~~ \Longrightarrow ~~~~~ (n', e') \le (n, e)
932: $$%
933: We define identically its bounded ceil $\lceil x \rceil$ as the class
934: of the minimum bounded floats larger than $x$. We define the
935: truncated subset $F(x)$ as follows.
936: $$
937: F(x) = \left\{\begin{array}{l c}
938: \lfloor x \rfloor & x > 0 \\
939: \lceil x \rceil & x < 0 \\
940: \{ (0, e), ~~ -e_{\min} \le e \} & x = 0 \\
941: \end{array}\right.
942: $$
943:
944: We also define the class of the bounded floats nearest to $x$ as $\Box
945: (x)$ with $(n, e)$ is in $\Box (x)$ if and only if, for all the
946: bounded floats $(n', e')$
947: $$
948: | (n, e) - x | \le | (n', e') - x|
949: $$
950:
951: When $\Box (x)$ contains two distinct equivalence classes of bounded
952: floats, we define $\circ(x)$ as the class of $\Box (x)$ of the
953: elements where the canonical representation has an even significand.
954: In other cases, $\circ(x) = \Box (x)$. The following theorem states
955: that this definition is appropriate.
956:
957: \begin{theo}[*RoundedModeP in Fround and Closest]
958: ~ The relations rounding down $\lfloor \cdot \rfloor$, up $\lceil
959: \cdot \rceil$, to zero $F (\cdot)$ and to the nearest (even) $\circ
960: (\cdot)$ define each a total monotonous projection from the reals
961: onto the bounded floats compatible with the interpretation of the
962: floats.
963: \end{theo}
964:
965: We now give a few results on the rounding to the nearest. The theorem
966: could be defined on the canonical representation but it is better to
967: prove it for any bounded float.
968:
969: \begin{theo}[ClosestErrorBound in ClosestProp]
970: \label{theo/Round/Error}%
971: Let $x$ be any real number and $(n, e)$ any float of $\Box (x)$,
972: $$
973: |x - n \beta^e| \le \beta^e / 2.
974: $$
975: That relation is independent of the rounding being even or not.
976: Even rounding provides that if both $(n, e) \in \circ(x)$ and $|x -
977: (n, e)| \equiv \beta^e / 2$ then $n$ is even.
978: \end{theo}
979:
980: The following theorem gives a relation for every representation of the
981: rounded value and the error provided that the error is non zero and it
982: does have a bounded representation. The universal quantifier is the
983: key to prove the theorems~\ref{theo/TS/Exp} and \ref{theo/M/Exp}.
984:
985: \begin{theo}[RoundedModeErrorExpStrict in FRoundProp]
986: \label{theo/Round/Error/Strict}
987: Given an arbitrary rounding mode, let $x$ be a real number and $(n,
988: e)$ a bounded float rounded from $x$, such that $x$ is not
989: represented by $(n, e)$ ({\em ie.} $x \neq n \beta^e$). For all
990: bounded float $(n', e')$ that is a representation of $x - n
991: \beta^e$,
992: $$
993: e' < e.
994: $$
995: The bound is tight as $(n', e') = (1, e - 1)$ is an acceptable
996: round-off error.
997: \end{theo}
998:
999: The result of any implemented operation, namely the addition, the
1000: multiplication, the division and the square root extraction, is the
1001: rounded result of the exact mathematical operation. For example, if
1002: $a \oplus b$ is the machine addition then $a \oplus b \in \circ (a +
1003: b)$. The machine operation could have been defined to return the
1004: canonical float in $\circ(a + b)$. We preferred to define an
1005: acceptable implementation as a function that return any bounded float
1006: in $\circ(a + b)$.
1007:
1008: When a theorem is independent of the tie breaking rule implemented, we
1009: use the boxed symbols $\boxplus$, $\boxminus$ and $\boxtimes$ for the
1010: addition, the subtraction and the multiplication in the hypothesis.
1011: Figure~\ref{fig:fpop} presents the symbol of the four standard
1012: floating point operators used in this work. All the figures use even
1013: rounding as no other implementation exists.
1014:
1015: \figtex{fpop}{Standard floating point operations rounded to the
1016: nearest (even) value.}
1017:
1018: % Given a single or double precision, we can define the {\bf ulp}
1019: % function that stands for the weight of one unit in the last place of
1020: % $x$ and that is the weight of the last bit of the mantissa of $x$. Any
1021: % non zero normalized bounded floating point number $x$ satisfies
1022: % $\omega (x) \ge ulp (x) = \alpha (x) ulp (1)$. We will later use
1023: % these equations and the fact that $\alpha$ and $ulp$ are monotonous
1024: % functions. The last equality does not hold for denormalized numbers,
1025: % yet treating these numbers is beyond our goal in this work. For any
1026: % non zero floating point number, we define $m(x)=\frac{x}{\alpha(x)}$ ;
1027: % $m(x)$ is the mantissa of $x$. We know that $|m(x)| \in [1,2-ulp]$.
1028:
1029: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1030:
1031: \subsection{Exact operations}
1032:
1033: We used for our proofs of the exact operation a result first published
1034: in \cite{Ste74}. It was later presented in \cite{Gol91} for the radix
1035: 2 notation. This fact is true for any bounded representation with any
1036: radix.
1037:
1038: \begin{theo}[Sterbenz in Fprop]
1039: Given two bounded floats $x$ and $y$ such that ${y \over 2} \le x
1040: \le 2 y$, the rational $x - y$ can be represented by a bounded
1041: float.
1042: \end{theo}
1043:
1044: We immediately prove the next result from its negation after setting
1045: $\beta$ to be 2. We use it later in our proofs.
1046:
1047: \begin{theo}[plusClosestLowerBound in Closest2Plus]
1048: Given two bounded floats radix 2, $x$ and $y$, such that $x + y \neq
1049: x \oplus y$, we know that $|x \oplus y| \ge \max (|x|, |y|) / 2$.
1050: \end{theo}
1051:
1052: M{\o}ller first, then Knuth proposed the common exact sum of
1053: Figure~\ref{fig:badd}-(a) involving 4 floating point additions and 2
1054: floating point subtractions on an IEEE compliant computer. The result
1055: is a pair of floats $(a', b')$ such that $a' = a \oplus b$ and $a' +
1056: b' \equiv a + b$.
1057:
1058: \figtex{badd}{Exact sum}
1059:
1060: We have proved this algorithm correct (Theorem {\bf Knuth in TwoSum})
1061: using the proof given by Priest in~\cite{Pri91}. Actually we will
1062: never use directly the algorithm but only the property that the sum
1063: and the rounding error can be represented. This is proved picking
1064: bounded representations that have special properties:
1065:
1066: % We did not validate this long published algorithm, but we set up its
1067: % specification and we deduced the following properties on its
1068: % outputs\footnote{In fact, the correct behavior of the algorithm is
1069: % proved for radix 2 in file {\bf TwoSum} although it is not necessary
1070: % in the course of this work \cite{DauRidThe01}.}. Since we only
1071: % prove that there exists one bounded representation, the theorem gives
1072: % some properties of this special representation.
1073:
1074: % \begin{theo}[errorBoundedPlus in ClosestPlus]
1075: % \label{theo/TS/exists}
1076: % Given two bounded floats $a = (n_a, e_a)$ and $b = (n_b, e_b)$, for
1077: % any $a' \in a \boxplus b$, there exists $n'$ such that $(n', e')$ is
1078: % a bounded float with
1079: % $$
1080: % a' + (n', e') \equiv a + b
1081: % ~~~~~ \text{and} ~~~~~
1082: % e' = \min (e_a, e_b).
1083: % $$
1084: % \end{theo}
1085:
1086: %The rounded sum $a \oplus b$ and the additional error $a + b - (a
1087: %\oplus b)$ are both integer multiple of $\beta^l$ where $l$ is the
1088: %lowest of the ulps of $a$ and $b$. No significant bit is created
1089: %right of the rightmost non zero significant bit of the inputs and the
1090: %ulp of the sum is limited as we can see in the next fact.
1091:
1092: \begin{theo}[plusExactExp in ClosestPlus]
1093: \label{theo/TS/Exp}
1094: Given two bounded floats $a = (n_a, e_a)$ and $b = (n_b, e_b)$, for
1095: all $a' \in a \boxplus b$, we can define $(n, e) \equiv a'$ and $b'
1096: = (n', e')$, bounded floats, such that
1097: $$
1098: a' + b' \equiv a + b
1099: ~~~~~ \text{and} ~~~~~
1100: \min (e_a, e_b) = e' \le e \le \max (e_a, e_b) + 1.
1101: $$
1102: \end{theo}
1103:
1104: This theorem uses the Theorem~\ref{theo/Round/Error} and Theorems {\bf
1105: errorBoundedPlus}, {\bf plusExpBound} not presented here to define
1106: an existing representation of $a'$ and $b'$. The
1107: theorem~\ref{theo/Round/Error/Strict} is used to connect both results
1108: since it is true for any representation of $a'$ and $b'$. The special
1109: case where $b' = 0$ is handled independently.
1110:
1111: % \begin{theo}[plusExact1 and plusExact1 in FroundPlus]
1112: % Given two bounded representations $(n_a, e_a)$ and $(n_b, e_b)$ of
1113: % the floating point numbers $a$ and $b$. Let $(n, e)$ be a bounded
1114: % representation of $\circ (a + b)$. The rounded sum is exact, {\em
1115: % id} $\circ (a + b) = a + b$, when any one of the following
1116: % conditions is satisfied:
1117: % \begin{itemize}
1118: % \item $e \le \min (e_a, e_b)$.
1119: % \item $(n_a, e_a)$ is canonical and $e < e_a - 1$
1120: % \end{itemize}
1121: % \end{theo}
1122:
1123: The last Theorem prompts the fact that the addition is accurate up to
1124: one unit in the last place. Such result is needed to show that the
1125: numerical tools of the next Section actually do some work.
1126:
1127: \begin{theo}[plusErrorBoundUlp in ClosestPlus]
1128: \label{theo/work}
1129: Given two bounded floats $a$ and $b$, for all $a' \in a \boxplus b$
1130: such that $a' \not\equiv 0$, we know that
1131: $$
1132: |a + b - a'| \le |a'| {\ulp \over 2}
1133: \le \max (|a|, |b|) \ulp.
1134: $$%
1135: The last inequality is true only working radix 2. It is proved in
1136: theorem plusErrorBound2 in file Closest2Plus.
1137: \end{theo}
1138:
1139: It has also been proved by M{\o}ller and presented in Knuth's second
1140: edition that one can get a correct pair $(a', b')$ by an early exit of
1141: the exact sum provided $|b| \le |a|$. The conditional sum of
1142: Figure~\ref{fig:badd}-(b) returns an exact pair $(a', b'') \equiv (a',
1143: b')$ with only 2 floating point additions and one floating point
1144: subtraction.
1145:
1146: Knuth suggested that the early exit is still valid if $a$ and $b$ are
1147: both canonical and share the same exponent. We prove here a result
1148: rephrased from \cite{Dau99a} that $a$ and $b$ just have to be bounded
1149: and the exponent be in order.
1150:
1151: \begin{theo}[ExtDekker in EFast2Sum]
1152: On a radix 2 IEEE inspired floating point unit, given two bounded
1153: floats $a = (n_a, e_a)$ and $b = (n_b, e_b)$, the conditional sum
1154: presented Figure~\ref{fig:badd}-(b) returns an exact pair $(a \oplus
1155: b, a + b - a \oplus b)$ provided
1156: $$e_b \le e_a.$$
1157: \end{theo}
1158:
1159: This theorem is proved only for radix two notations as it is not
1160: necessarily correct for other radices. An example where the early
1161: exit is not correct radix 10 is given for $n_{\max} = 99$ with both
1162: inputs being $9.9$. The theorem is correct for radix 2 and 3
1163: \cite{Knu73}. One can check one last example radix 4: $n_{\max} = 3$
1164: and both inputs are 3.
1165:
1166: An exact multiplication is also available. It computes a pair $(a',
1167: b')$ such that $a' = a \otimes b$ and $a' + b' = a \times b$ with 7
1168: floating point multiplications, 5 floating point additions and 5
1169: floating point subtractions. These operators are surveyed in
1170: \cite{Dau99a}. Here are some properties obtained from the desired
1171: specification of $a'$ and $b'$, not from the algorithm. The condition
1172: on the exponent is not necessary but it is sufficient to discard cases
1173: of underflow.
1174:
1175: \begin{theo}[multExactExpCan in ClosestMult]
1176: \label{theo/M/Exp}
1177: Given two bounded floats $a = (n_a, e_a)$ and $b = (n_b, e_b)$, with
1178: $e_a + e_b \ge -e_{\min} + p$, for all $a'$ rounded to a nearest
1179: float of $a \times b$, we can define $(n, e) \equiv a'$ canonic and
1180: $b' = (n', e')$ bounded float, such that
1181: $$
1182: a' + b' \equiv a \times b
1183: ~~~~~ \text{and} ~~~~~
1184: e' = e - p.$$
1185: \end{theo}
1186:
1187: It was proved that comparable error quantities, that always fit in a
1188: common floating point number, may be defined for the division and
1189: square root \cite{BohWalKorMat91}.
1190:
1191: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1192:
1193: \subsection{Floating point expansions}
1194:
1195: A floating point {\bf expansion} is defined as a finite sequence ${\bf
1196: x} = (x_0, x_1, \cdots , x_{n - 1})$ of floating point numbers. The
1197: value represented is the exact, not rounded, sum of its components
1198: $\sum x_i$. The length of the expansion is the number of its
1199: components.
1200:
1201: %We say that $\widehat{x}$ is a {\bf fair most significant component}
1202: %of ${\bf x}$ if $\widehat{x}$ equals to the rounded to the nearest
1203: %value of $x'$ with $|x' - \sum x_i| \leq \beta^e / n_{\max}$ and $(n,
1204: %e)$ is the canonical representation of $\widehat{x}$. This includes
1205: %the case where $\widehat{x}$ equals the rounded to the nearest value
1206: %of the sum of the component of ${\bf x}$.
1207:
1208: Any component of an expansion may be equal to zero, but the
1209: subsequence of the non-zero components $x_i$ must be non overlapping
1210: and ordered by magnitude. The non overlapping condition means that
1211: two components cannot have significant bits with the same weight that
1212: is specified for each $i > 0$ there exists a bounded float $(n, e)$
1213: such that
1214: $$
1215: x_i \equiv (n, e) ~~~~~ \text{and} ~~~~~ |x_{i + 1}| < \beta^e.
1216: $$
1217:
1218: %\figtex{superp}{Floating point numbers A and B do not overlap whereas B and C do.}
1219:
1220: %A {\bf non-adjacent expansion} is an expansion ${\bf x}$ where the
1221: %non-zero subsequence has non-adjacent components. The condition is
1222: %specified below. There is at least one bit set to 0 between the two
1223: %nearest non-zero (signed) bits of two consecutive components.
1224: %$$\exists (n, e) \in \F_{\cal R}
1225: %% ~~~ \text{such that} ~~~ (n, e) \equiv x_i
1226: % ~~~ \text{and} ~~~ |x_{i - 1}| < \beta^e / 2
1227: %$$
1228:
1229: We have already presented two examples in \cite{Dau99a} showing that
1230: any float $(n, e)$ where $e \ge -e_{\min}$ can be expressed as an
1231: expansion.
1232:
1233: As noted in the introduction, previous works by Priest, Shewchuck and
1234: ourselves have produced arithmetic operators on expansions and useful
1235: primitives for computational geometry. In this process, we recognized
1236: that the operators for the addition and the multiplication computing
1237: the least significant digits first do not produce length optimal
1238: results but tend to break the expansion in a large number of
1239: components with small significands. A compression routine is used to
1240: group together the components and reduce the length of the result
1241: expansion.
1242:
1243: % (see figure \ref{fig:lsdf}(a))
1244: % The
1245: % program has to compress the result to reduce the memory occupation and
1246: % the length of the expansion. The exact floating point addition puts
1247: % the 52 most significant bits in the first word of the result and any
1248: % remaining bit in the correcting word. Thereafter, using the
1249: % correcting word as the output of a process will definitely have a
1250: % tendency to produce long expansions of components with a small number
1251: % of significant bits.
1252:
1253: % \begin{figure}[ht]
1254: % \begin{center}
1255: % \input{lsdf.pstex_t}
1256: % \caption{(a) LSDF operators : the result is a not optimal expansion (b) MSDF operator : the result is a pseudo-expansion}
1257: % \label{fig:lsdf}
1258: % \end{center}
1259: % \end{figure}
1260:
1261: It is possible to compute the arithmetic operations most significant
1262: digits first and compute directly components with a large significand.
1263: The common, both restoring and non restoring, division operators
1264: compute the components of the quotient like this. As a drawback, some
1265: of the components may overlap slightly. In our former work
1266: \cite{DauFin99a}, the sequence of the quotient digits is cleaned by a
1267: compression routine that produces an acceptable expansion.
1268:
1269: % (see figure \ref{fig:lsdf}(b))
1270:
1271: We define {\bf pseudo-expansions} as slightly overlapping expansions.
1272: The condition on the subsequence of non zero components as that
1273: $$|x_{i + 1}| \le \epsilon |x_i|.$$%
1274: We do not give the value of $\epsilon < {1 \over 2}$ now as it will be
1275: fixed in the next section considering the numerical algorithms. We
1276: deduce that
1277: $$|\sum_{j > i} x_j| \le {\epsilon \over 1 - \epsilon} |x_i|.$$
1278: Conversely, given a sequence where
1279: $$|\sum_{j > i} x_j| \le \lambda |x_i|,$$
1280: for all $i$, we deduce that
1281: $$|x_{i + 1}| \le {\lambda \over 1 - \lambda} |x_i|.$$%
1282:
1283: % The division of pseudo-expansions is copied from the existing one
1284: % except that it does not require the final cleaning loops. The
1285: % addition and the multiplication are inspired from the on-line
1286: % operators to produce the result most significant digit first as
1287: % components are available and safe. As we do for on-line arithmetic,
1288: % we recognize that for the three operators, least significant digits
1289: % are not necessary at the beginning of the process and the operators
1290: % are transformed from most significant digit first operators that
1291: % require its inputs to be exactly known to adaptable operators that
1292: % need only a limited number of most significant components to deliver
1293: % the first components of the output.
1294:
1295: % \begin{theo}[errorBoundedMult in FroundMult]
1296: % \label{theo/seq}
1297: % Given a sequence $(x_i)_{i \in \N}$ and provided $\epsilon < 1$. We know that
1298: % \begin{itemize}
1299: % \item $
1300: % \displaystyle \text{If} ~~~~~
1301: % \forall i \in \N, ~~~~~ |x_{i + 1}| \le \epsilon |x_i|,
1302: % ~~~~~ \text{then} ~~~~~
1303: % \forall i \in \N, ~~~~~ |\sum_{j \ge i} x_j| \le \epsilon' |x_i|$.
1304: % \item $
1305: % \displaystyle \text{If} ~~~~~
1306: % \forall i \in \N, ~~~~~ |\sum_{j \ge i} x_j| \le \epsilon |x_i|,
1307: % ~~~~~ \text{then} ~~~~~
1308: % \forall i \in \N, ~~~~~ |x_{i + 1}| \le \epsilon' |x_i|$.
1309: % \end{itemize}
1310: % \end{theo}
1311:
1312: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1313:
1314: \section{Toolset for the addition, the multiplication and the division}
1315:
1316: We combine in this Section tools to create the arithmetic operations:
1317: MQ, PP, PQ and $\Sigma_3$. The tools have been implemented in C.
1318: Among them, the numerical behavior of $\Sigma_3$ alone is difficult to
1319: analyze as the three other tools only sort and produce components as
1320: they are needed.
1321:
1322: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1323:
1324: \subsection{Addition}
1325:
1326: \figtex{addition}{Representation of the addition.}
1327: \figtex{insert3}{Definition of the $\Sigma_3$ operator.}
1328:
1329: Figure~\ref{fig/addition} represents the algorithm implemented to
1330: compute the sum of pseudo-expansions $A$ and $B$. We use two
1331: primitive operators MQ (merge queue) and $\Sigma_3$ (insert and sum of
1332: Figure~\ref{fig/insert3}). Arrows represent streams of floating point
1333: number flowing most significant digits first. The components of the
1334: result are in the stream named $C$ in Figure~\ref{fig/addition}. The
1335: output is only a pseudo-expansion since the $a'$ are sorted by
1336: decreasing order of magnitude but they may partially overlap each
1337: other.
1338:
1339: MQ is an operator fired only if it has a value on both inputs or if
1340: the flow of one input is finished and it has a value on the other
1341: input. The largest input, in magnitude, is then transmitted to the
1342: output. Cases of tie are not critical. The MQ operator merges two
1343: flows sorted by magnitude and produces a single flow once again sorted
1344: by magnitude. The following theorem states that this flow of values
1345: can be represented by a flow of floats to be fed into the $\Sigma_3$
1346: operator.
1347:
1348: \begin{theo}[IsRleExpRevIsExp in FexpAdd]
1349: Given a list of floats $(x_i)_{i \in L}$ sorted by magnitude, we can
1350: define the bounded floats $(n_i, e_i)$ equivalent to the $x_i$s such
1351: that the list $(e_i)_{i \in L}$ is sorted.
1352: \end{theo}
1353:
1354: We deduce the correct behavior of the operation by induction from the
1355: following Theorem. The property on the input floating point number
1356: $c$ is inherited from the fact that the two input streams $A$ and $B$
1357: are pseudo-expansions merged by the MQ operator by order of magnitude.
1358:
1359: \begin{theo}[bound3Sum and exp3Sum in ThreeSum2]
1360: \label{theo/sum3}
1361: Let $a = (n_a, e_a)$, $b = (n_b, e_b)$ and $c = (n_c, e_c)$ be the
1362: bounded inputs of the $\Sigma_3$ operator (see
1363: Figure~\ref{fig/insert3}). Provided that $e_a \ge e_b \ge e_c$ the
1364: $\Sigma_3$ operator returns three numbers $a'$, $b'$, $c'$
1365: represented by $(n_a', e_a')$, $(n_b', e_b')$ and $(n_c', e_c')$
1366: such that $e_a' \ge e_b' \ge e_c' = e_c$ and finaly either $c' = 0$
1367: or
1368: $$|b' + c'| \le 3 \ulp |a'|,$$
1369: $$\beta^{e_c} \le {3 \ulp^2 \over 2 - \ulp} |a'|.$$
1370:
1371: If $a = 0$ or $|b| + n_{\max} \beta^{e_c} \le n_{\max} \beta^{e_a}$,
1372: all the exact additions can use the early exit.
1373: \end{theo}
1374:
1375: If $c'$ equals to zero, $a'$ and $b'$ are not relevant enough. They
1376: remain in the operator as $a$ and $b$ for the next iteration. On the
1377: opposite, if $c'$ is not equal to zero, $a'$ is relevant. It is one
1378: component of the result, $b'$ and $c'$ are kept in the operator for
1379: the next iteration.
1380:
1381: \paragraph{\em Proof sketch:}
1382:
1383: We first define $u$, $v$, $a'$, $b'$ and $c'$ as they are computed in
1384: Figure~\ref{fig/insert3}. We are interested in the case where $c' \neq
1385: 0$. It implies that $w \neq 0$. We bound
1386: $$
1387: \begin{array}{r c l}
1388: |w| & \le & {\ulp \over 2} |a'| \\
1389: |v| & \le & \ulp \max (|u|, |a|)
1390: \end{array}
1391: $$
1392: Since $w \neq 0$, we know that
1393: $$|a'| \ge {\max (|u|, |a|) \over 2}.$$
1394: The first bound follows since
1395: $$|b' + c'| = |v + w| \le |v| + |w|.$$
1396:
1397: We continue for the second bound
1398: $$
1399: 3 \ulp |a'| \ge |b' + c'|
1400: \ge |b'| - |c'|
1401: \ge \left( 1 - {\ulp \over 2} \right)|b'|.
1402: $$
1403: We conclude with the representation of Theorem~\ref{theo/TS/Exp}
1404: $$
1405: \beta^{e_c} = \beta^{e_c'}
1406: \le |c'|
1407: \le {\ulp \over 2} |b'|
1408: \le {3 \ulp^2 \over 2 - \ulp} |a'|.
1409: $$
1410:
1411: \begin{flushright}
1412: $\Box$
1413: \end{flushright}
1414:
1415: The last theorem does the induction over time. It is written from both
1416: statement and proof of the theorem in Coq.
1417:
1418: \begin{theo}[FexpAdd in FexpAdd]
1419: Given a list $(n_i, e_i)_{i \in L}$ of $\#L$ bounded floats so that
1420: $(e_i)_{i \in L}$ is sorted the addition operator (see
1421: Figure~\ref{fig/addition}) produces a list $(c_i)_{i \in L}$ of at
1422: most $(\#L + 1)$ floats with
1423: $$
1424: |c_{i + 1}| \le \epsilon |c_i|
1425: ~~~~~ \text{with} ~~~~~
1426: \epsilon \le {6 \#L + 6 \over n_{\max} - 1 - 6\#L}
1427: $$
1428: under the condition that $\epsilon < 1$.
1429: \end{theo}
1430:
1431: \paragraph{\em Proof sketch:}
1432:
1433: We have to show that the input conditions are met at each iteration
1434: using pre-conditions and post-conditions. In the same time, we
1435: establish that
1436: $$|\sum_{j > i} c_j| \le 3(1 + 2 \#L) \ulp |c_i|,$$
1437: and
1438: $$|c_{i + 1}| \le 6(1 + 2 \#L) \ulp |c_i|.$$%
1439: Being more careful, we proved the tighter bound in Coq.
1440:
1441: \begin{flushright}
1442: $\Box$
1443: \end{flushright}
1444:
1445: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1446:
1447: \subsection{Multiplication}
1448:
1449: \figtex{multiplication3}{Representation of the multiplication.}
1450:
1451: The multiplication of the expansions $A$ (size $n_A$) by the
1452: expansions $B$ (size $n_B$) generates the $n_A \times n_B$ partial
1453: products and computes their sum. The problem is to sort the partial
1454: products $a_{i_j} \times b_j$ knowing that the lists $(a_i)$ and
1455: $(b_j)$ are sorted by magnitude. It is related to $X + Y$ sorting
1456: \cite{HarPaySavStr75,SteStr94}. Figure~\ref{fig/multiplication3}
1457: represents the algorithm implemented. We use new and extended
1458: primitives: PP, a partial product generator, PQ and a priority queue
1459: extending the MQ.
1460:
1461: % \figtex{prods}{Generation of partial product and priority queue.}
1462:
1463: Working with pseudo-expansions, we have to make sure that the list of
1464: the partial products is sorted by magnitude. As $b_j$ arrives at
1465: position $j$, the PP tool initializes the $i_j$ index to 0 and
1466: computes $a_{i_j} \otimes b_j$. Its magnitude is inserted at the end
1467: of PQ that is updated in $\lfloor \log j \rfloor$ operations. When the
1468: partial product $a_{i_j} \otimes b_j$ is output by PQ, the tool
1469: increments $i_j$. The generation is frozen until both components
1470: $a_{i_j}$ and $b_{j + 1}$ are known. Then the cell containing
1471: $|a_{i_j} \otimes b_j|$ in PQ is updated in $\lfloor \log j \rfloor$
1472: operation and the tools can generate the next partial product.
1473:
1474: % \figtex{somme5}{Definition of the $\Sigma_5$ operator.}
1475:
1476: Numbers flow by pair since $a_i \times b_j$ produces two floating
1477: point numbers. The upper part $p_{ij} = a_i \otimes b_j$ is used
1478: immediately. The lower part $a_i \times b_j - p_{ij}$ is put into a
1479: waiting queue. The two streams are merged together by the MQ
1480: operators. It is fired as long as it has an input on both entries. It
1481: does not compare the values but the keys that are associated with the
1482: floats. The operator does not produce a list of floats sorted by
1483: magnitude but directly a list sorted by amplitude as we use the second
1484: part of theorem~\ref{theo/M/Exp}.
1485:
1486: As $a_i \times b_j - p_{ij}$ cannot be selected before $p_{ij}$, its
1487: insertion into the waiting queue is not in the critical path.
1488: Instruction reordering will allow these operation to be hidden in dead
1489: cycles of the $\Sigma_3$ operator. The waiting list cannot hold more
1490: than $2n_B$ elements since the bound on $\epsilon$ is a few ulps.
1491:
1492: % $(A_i \times B_j)_{High}$ that is inserted in $c$ and $(A_i \times
1493: % B_j)_{Low}$ inserted in $e$. A formal extension of
1494: % Theorem~\ref{theo/sum3} will guarantee that this operator produces the
1495: % desired output.
1496:
1497: % If $d'$ equals to zero, then $e' = 0$ and $a'$, $b'$ and $c'$ are not
1498: % relevant enough. They remain in the operator for the next iteration.
1499: % If $d' \neq 0$, $a'$ is relevant. It is one component of the result.
1500: % If $e'$ is also non zero, $b'$ is immediately used as the next
1501: % component of the result. The three remaining floating point numbers
1502: % are kept in the operator for the next iteration.
1503:
1504: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1505:
1506: \subsection{Division}
1507:
1508: % \subsubsection{Common division}
1509:
1510: \figtex{division3}{Representation of the non-restoring division.}
1511:
1512: Figure~\ref{fig/division3} represents the division of $R$ by $D$,
1513: pseudo-expansions. We want to compute $Q$ such that $Q = R / D$, {\em
1514: ie.} $R - QD = 0$. Each component $q_i$ of $Q$ is deduced from the
1515: previous ones by computing
1516: $$R_i = R - \sum_{j < i} q_j \times D,$$%
1517: before evaluation $q_i \approx R_i / D$. We will see later that $d_0$
1518: is handled separately, so our scheme computes $Q \times (d_0 - D)$.
1519:
1520: The goal is to ensure that the values $c$ used as inputs of the
1521: $\Sigma_3$ operator are sorted by amplitude. The PP-PQ-MQ chain is
1522: close to the one used for the multiplication in the
1523: Figure~\ref{fig/multiplication3}. The generation is frozen only when
1524: $d_{j + 1}$ is not known.
1525:
1526: The test on $q_{i_j}$ is replaced by a new condition: the operation
1527: stops before it may insert a value $c$ such that $c < 4 |a + b| \ulp$.
1528: A new approximate quotient digit $q_i$ is then guessed from a fair
1529: most significant component $d_0$ of the divisor $D$ and a fair most
1530: significant component $a' \oplus b'$.
1531:
1532: % of the remainder $R_i$: $q_i =
1533: % \circ(a' / d_l)$.
1534: % The less significant components of the dividend are stored unchanged
1535: % with all the components of the terms $-q_i \cdot D$ that have not been
1536: % reduced so far. A priority queue (PQ) extracts the most significant
1537: % components among this set. These numbers are accumulated until a
1538: % relevant $a'$ can be estimated again.
1539:
1540: % Since $R_i$ is at least divided by a constant factor at each
1541: % iteration, the algorithm is converging.
1542: % $$
1543: % \displaystyle \frac{R_0}{D}=\sum_{i=0}^{n-1}{q_i}+\frac{R_n}{D}
1544: % $$
1545:
1546: % We use four primitive operators. The partial product PP, and the
1547: % priority queue PQ, sort by magnitude the pairs generated by the
1548: % product of $q_i$ by $D - d_l$. The merging queue MQ, takes the most
1549: % significant numbers between the components of the numerator that have
1550: % not been used so far and the numbers given by the priority queue. The
1551: % $\Sigma_5$ operator adds the new component to the left-over
1552: % components. The number $a'$ is significant enough when $d' \neq 0$.
1553: % This process guarantees that $a'$ is a relevant approximation of
1554: % $R_i$. We will prove that $e' = 0$.
1555:
1556: The following theorem states that one division step is very robust.
1557: For a reasonable value of $\epsilon$, it guarantees that the remainder
1558: decreases almost linearly.
1559:
1560: \begin{theo}[DivConv in FexpDiv]
1561: \label{theo/Div/Converge}
1562: Let $W$ and $D$ be two non zero real numbers approximated by $w$ and
1563: $d$ with a relative error bound $\epsilon < 1$ and let $q$ be an
1564: approximation of $w / d$ with the same relative error bound.
1565: $$
1566: |W - q D | \le \epsilon {3 + \epsilon \over 1 - \epsilon} |W|
1567: $$
1568: \end{theo}
1569:
1570: To reduce $\epsilon$, we replace the initial values $d_0$ and $d_1$ by
1571: $d_0 \oplus d_1$ and $d_0 + d_1 - d_0 \oplus d_1$. As a consequence,
1572: the worst error on the Theorem~\ref{theo/Div/Converge} is the one on
1573: the remainder and it is bounded by construction by 4 ulps. This
1574: guarantees with the loop condition that any term $q_i \times d_j$ is
1575: smaller than all the terms already produced by the partial product
1576: generation chain except $q_i \times d_0$. This last value is handled
1577: separately as $a'$ and $b'$ are replaced by $a' + b' - q_i d_0$. The
1578: loop condition also ensures that $c' = 0$.
1579:
1580: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1581:
1582: % \subsubsection{Pre-scaled division}
1583:
1584: % In modern computers, the floating point division is a very slow
1585: % operator. For example, Intel Pentium processors need 39 cycles
1586: % \cite{Int95a} whereas multiplications and additions may have a
1587: % throughput of 1 cycle. Prescaled division, can be used to reduce the
1588: % number of floating point division to 1.
1589:
1590: % As a price, we have to exactly prescale the numerator $R$ involving
1591: % $n_R$ multiplications. As a matter of fact, the prescaling fits the
1592: % size of the second input of the MQ operator that is waiting for a pair
1593: % of floating point digits from the PQ and now also from the numerator.
1594:
1595: % \begin{figure}[ht]
1596: % \begin{center}
1597: % \input{division2.pstex_t}
1598: % \caption{Representation of the pre-scaled division}
1599: % \label{fig:div2}
1600: % \end{center}
1601: % \end{figure}
1602:
1603: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1604:
1605: \section*{Conclusion}
1606:
1607: We have presented a new adaptable numeric core inspired both from
1608: floating point expansions and from on-line arithmetic. Our choice was
1609: to present only the arithmetic core as we have presented in past
1610: publications how a core can be used efficiently to compute matrix
1611: determinants for example \cite{DauFin99a}. Building a general purpose
1612: runtime environment raises question in areas like parallelism (demand
1613: driven {\em vs.} data flow...) and in compiler techniques (efficiency
1614: of object oriented code generated by existing compilers...) among
1615: others.
1616:
1617: The numeric core is cut down to four tools. The $\Sigma_3$ operator
1618: that contains many arithmetic operations is proved correct. The
1619: proofs have been validated by the Coq assistant. Developing the
1620: proofs, we have formally proved many result long published in the
1621: literature and we have extended a few of them. This work may let
1622: users
1623: \begin{itemize}
1624: \item Develop application specific adaptable libraries based on the
1625: toolset.
1626: \item Easily write new formal proofs based on the growing set of
1627: sensible validated facts.
1628: \end{itemize}
1629:
1630: The set of the validated facts is now sufficient to shield the user
1631: from the difficult implementation details of floating point
1632: arithmetic. The proof of the key Theorem on the $\Sigma_3$ operator
1633: does not use any low level result. It just uses the many lower and
1634: upper bounds defined in the Theorems of Section~1.
1635:
1636: \small
1637:
1638: %\bibliography{/home/arenaire/public_html/Bibliographies/daumas_ref}
1639: %\bibliography{p:/Arenaire/Bibliographies/daumas_ref}
1640:
1641: \begin{thebibliography}{10}
1642:
1643: \bibitem{AvnBoiDevPreYvi97}
1644: Francis Avnaim, Jean-Daniel Boissonnat, Olivier Devillers, Franco Preparata,
1645: and Mariette Yvinec.
1646: \newblock Evaluating signs of determinants using single precision arithmetic.
1647: \newblock {\em Algorithmica}, 17(2):111--132, 1997.
1648:
1649: \bibitem{Bai93}
1650: David~H. Bailey.
1651: \newblock Algorithm 719, multiprecision translation and execution of fortran
1652: programs.
1653: \newblock {\em ACM Transactions on Mathematical Software}, 19(3):288--319,
1654: 1993.
1655:
1656: \bibitem{Bai95}
1657: David~H. Bailey.
1658: \newblock A {F}ortran-90 based multiprecision system.
1659: \newblock {\em ACM Transactions on Mathematical Software}, 21(4):379--387,
1660: 1995.
1661:
1662: \bibitem{Bar93}
1663: Jesse~L. Barlow.
1664: \newblock Error analysis of update methods for the symmetric eigenvalue
1665: problem.
1666: \newblock {\em SIAM Journal of Matrix Analysis and Applications},
1667: 14(2):598--618, 1993.
1668:
1669: \bibitem{BerDau97}
1670: David Berthelot and Marc Daumas.
1671: \newblock Computing on sequences of embedded intervals.
1672: \newblock {\em Reliable Computing}, 3(3):219--227, 1997.
1673:
1674: \bibitem{Boe87}
1675: Hans-Juergen Boehm.
1676: \newblock Constructive real interpretation of numerical programs.
1677: \newblock {\em ACM SIGPLAN Notices}, 22(7):214--221, 1987.
1678:
1679: \bibitem{BohWalKorMat91}
1680: Gerd Bohlender, Wolfgang Walter, Peter Kornerup, and David~W. Matula.
1681: \newblock Semantics for exact floating point operations.
1682: \newblock In Peter Kornerup and David Matula, editors, {\em Proceedings of the
1683: 10th Symposium on Computer Arithmetic}, pages 22--26, Grenoble, France, 1991.
1684:
1685: \bibitem{BoiPre2K}
1686: Jean-Daniel Boissonnat and Franco~P. Preparata.
1687: \newblock Robust plane sweep for intersecting segments.
1688: \newblock {\em SIAM Journal on Computing}, 29(5):1401--1421, 2000.
1689:
1690: \bibitem{BolDau01}
1691: Sylvie Boldo and Marc Daumas.
1692: \newblock Performances d'implantations de l'addition en pr{\'e}cision
1693: quad-double sur diff{\'e}rentes machines.
1694: \newblock In {\em 7{\`e}me Symposium sur les Architectures Nouvelles de
1695: Machines}, Paris, France, 2001.
1696:
1697: \bibitem{Bre78a}
1698: Richard~P. Brent.
1699: \newblock A fortran multiple-precision arithmetic package.
1700: \newblock {\em ACM Transactions on Mathematical Software}, 4(1):57--70, 1978.
1701:
1702: \bibitem{BurFleMehSch99}
1703: C.~Burnikel, R.~Fleischer, K.~Mehlhorn, and S.~Schirra.
1704: \newblock Efficient exact geometric computation made easy.
1705: \newblock In {\em Proceedings of the 15th Annual ACM Symposium on Computational
1706: Geometry}, pages 341--350, Miami, Florida, 1999.
1707:
1708: \bibitem{ChaGedGonMonWat91}
1709: Bruce~W. Char, Keith~O. Geddes, Gaston~H. Gonnet, Michael~B. Monagan, and
1710: Stephen~M. Watt.
1711: \newblock {\em Maple {V}: langage reference manual}.
1712: \newblock Springer Verlag, 1991.
1713:
1714: \bibitem{CodKar.84}
1715: William~J. Cody, Richard Karpinski, et~al.
1716: \newblock A proposed radix and word-length independent standard for floating
1717: point arithmetic.
1718: \newblock {\em IEEE Micro}, 4(4):86--100, 1984.
1719:
1720: \bibitem{CorDupHocMul91}
1721: G.~Corbaz, Jean Duprat, Bertrand Hochet, and Jean-Michel Muller.
1722: \newblock Implementation of a {VLSI} polynomial evaluator for real-time
1723: applications.
1724: \newblock In {\em International Conference on Application-Specific Array
1725: Processors}, 1991.
1726:
1727: \bibitem{Dau99a}
1728: Marc Daumas.
1729: \newblock Multiplications of floating point expansions.
1730: \newblock In Israel Koren and Peter Kornerup, editors, {\em Proceedings of the
1731: 14th Symposium on Computer Arithmetic}, pages 250--257, Adelaide, Australia,
1732: 1999.
1733:
1734: \bibitem{DauFin98}
1735: Marc Daumas and Claire Finot.
1736: \newblock Division of floating point expansions.
1737: \newblock In {\em IMACS-GAMMM International Symposium on Scientific Computing,
1738: Computer Arithmetic and Validated Numerics}, pages 45--46, Budapest,
1739: Hungaria, 1998.
1740:
1741: \bibitem{DauFin99a}
1742: Marc Daumas and Claire Finot.
1743: \newblock Division of floating point expansions with an application to the
1744: computation of a determinant.
1745: \newblock {\em Journal of Universal Computer Science}, 5(6):323--338, 1999.
1746:
1747: \bibitem{DauMulTis97}
1748: Marc Daumas, Jean-Michel Muller, and Arnaud Tisserand.
1749: \newblock Very high radix on-line arithmetic for accurate computations.
1750: \newblock In {\em Proceedings of the 15th IMACS World Congress on Computational
1751: and Applied Mathematics}, volume~2, pages 347--352, Berlin, Germany, 1997.
1752:
1753: \bibitem{DauMulVui94}
1754: Marc Daumas, Jean-Michel Muller, and Jean Vuillemin.
1755: \newblock Implementing on-line arithmetic on {PAM}.
1756: \newblock In Reiner~W. Hartenstein and Michael~Z. Serv{\'\i}t, editors, {\em
1757: Field-Programmable Logic: Architectures Synthesis and Applications}, pages
1758: 196--207, Prague, Czech Republic, 1994. Springer Verlag.
1759:
1760: \bibitem{Dek71}
1761: Theodorus~J. Dekker.
1762: \newblock A floating point technique for extending the available precision.
1763: \newblock {\em Numerische Mathematik}, 18(3):224--242, 1971.
1764:
1765: \bibitem{DevPre98}
1766: Olivier Devillers and Franco Preparata.
1767: \newblock A probabilistic analysis of the power of arithmetic filters.
1768: \newblock {\em Discrete and Computational Geometry}, 20(4):523--547, 1998.
1769:
1770: \bibitem{Don92}
1771: Jack~J. Dongarra.
1772: \newblock Performance of various computers using standard linear equations
1773: software.
1774: \newblock {\em Computer architecture news}, 20(3):22--44, 1992.
1775:
1776: \bibitem{Don99}
1777: Jack~J. Dongarra.
1778: \newblock Performance of various computers using standard linear equations
1779: software.
1780: \newblock Report CS-89-85, University of Tennessee, 1999.
1781:
1782: \bibitem{ForVan93}
1783: Steven Fortune and Christopher~J. {Van Wyk}.
1784: \newblock Efficient exact arithmetic for computational geometry.
1785: \newblock In {\em Proceedings of the 9th ACM Symposium on Computational
1786: Geometry}, pages 163--172, 1993.
1787:
1788: \bibitem{Gog2K}
1789: Brice Goglin.
1790: \newblock Calcul sur des expansions de taille fixe.
1791: \newblock Technical report, {\'E}cole Normale Sup{\'e}rieure de Lyon, 2000.
1792:
1793: \bibitem{Gol91}
1794: David Goldberg.
1795: \newblock What every computer scientist should know about floating point
1796: arithmetic.
1797: \newblock {\em ACM Computing Surveys}, 23(1):5--47, 1991.
1798:
1799: \bibitem{Gra2Kb}
1800: Tobj\"orn Granlund.
1801: \newblock {\em The {GNU} multiple precision arithmetic library}, 2000.
1802: \newblock Version 3.1.
1803:
1804: \bibitem{HarPaySavStr75}
1805: L.~H. Harper, Terry~H. Payne, John~E. Savage, and E.~Straus.
1806: \newblock Sorting $x + y$.
1807: \newblock {\em Communications of the ACM}, 18(6):347--349, 1975.
1808:
1809: \bibitem{Har97a}
1810: John Harrison.
1811: \newblock Floating point verification in {HOL} light: the exponential function.
1812: \newblock Technical Report 428, University of Cambridge Computer Laboratory,
1813: 1997.
1814:
1815: \bibitem{Hig96}
1816: Nicholas~J. Higham.
1817: \newblock {\em Accuracy and stability of numerical algorithms}.
1818: \newblock SIAM, 1996.
1819:
1820: \bibitem{HueKahPau97}
1821: G{\'e}rard Huet, Gilles Kahn, and Christine Paulin-Mohring.
1822: \newblock The {Coq} {P}roof {A}ssistant: {A} {T}utorial: Version 6.1.
1823: \newblock Technical Report 204, Institut National de Recherche en Informatique
1824: et en Automatique, Le Chesnay, France, 1997.
1825:
1826: \bibitem{Kah65}
1827: William Kahan.
1828: \newblock Further remarks on reducing truncation errors.
1829: \newblock {\em Communications of the ACM}, 8(1):40, 1965.
1830:
1831: \bibitem{KarLiPecYap99}
1832: V.~Karamcheti, C.~Li, I.~Pechtchanski, and Chee Yap.
1833: \newblock A core library for robust numeric and geometric computation.
1834: \newblock In {\em Proceedings of the 15th Annual ACM Symposium on Computational
1835: Geometry}, pages 351--359, Miami, Florida, 1999.
1836:
1837: \bibitem{Knu69}
1838: Donald~E. Knuth.
1839: \newblock {\em The art of computer programming: Seminumerical Algorithms}.
1840: \newblock Addison Wesley, 1969.
1841: \newblock First edition.
1842:
1843: \bibitem{Knu73}
1844: Donald~E. Knuth.
1845: \newblock {\em The art of computer programming: Seminumerical Algorithms}.
1846: \newblock Addison Wesley, 1973.
1847: \newblock Second edition.
1848:
1849: \bibitem{LouErc93}
1850: Marianne~E. Louie and Milo\v{s}~D. Ercegovac.
1851: \newblock On digit recurrence division implementation for field programmable
1852: gate arrays.
1853: \newblock In Earl Swartzlander, Mary~Jane Irwin, and Graham Jullien, editors,
1854: {\em Proceedings of the 11th Symposium on Computer Arithmetic}, pages
1855: 202--209, Windsor, Ontario, 1993.
1856:
1857: \bibitem{MazMer93}
1858: Christophe Mazenc and Xavier Merrheim.
1859: \newblock Handling numbers in high precision and controlling the numerical
1860: precision in a serial system on a parallel {MIMD} machine.
1861: \newblock In {\em 26th International Conference on System Sciences}, pages
1862: 1196--1198, Honolulu, Hawaii, 1993.
1863:
1864: \bibitem{Men94}
1865: Val{\'e}rie M{\'e}nissier.
1866: \newblock {\em Arithm{\'e}tique Exacte~: Conception, Algorithmique et
1867: Performances d'une Implantation Informatique en Pr{\'e}cision Arbitraire}.
1868: \newblock PhD thesis, Universit{\'e} Paris VI, Paris, France, 1994.
1869:
1870: \bibitem{Men95}
1871: Val\'erie M\'enissier-Morain.
1872: \newblock Conception et algorithmique d'une repr\'esentation d'arithm\'etique
1873: r\'eelle en pr\'ecision arbitraire.
1874: \newblock In {\em 1st Real Numbers and Computers Conference}, pages 47--64,
1875: St-\'Etienne, France, 1995.
1876:
1877: \bibitem{MicMor97}
1878: Dominique Michelucci and Jean-Michel Moreau.
1879: \newblock Lazy arithmetic.
1880: \newblock {\em IEEE Transactions on Computers}, 46(9):961--975, 1997.
1881:
1882: \bibitem{Mol65b}
1883: Ole M{\o}ller.
1884: \newblock Note on quasi double-precision.
1885: \newblock {\em BIT}, 5(4):251--255, 1965.
1886:
1887: \bibitem{Mol65a}
1888: Ole M{\o}ller.
1889: \newblock Quasi double-precision in floating point addition.
1890: \newblock {\em BIT}, 5(1):37--50, 1965.
1891:
1892: \bibitem{Mul97}
1893: Jean-Michel Muller.
1894: \newblock {\em Elementary functions, algorithms and implementation}.
1895: \newblock Birkhauser, 1997.
1896:
1897: \bibitem{Pic72}
1898: Mich\`ele Pichat.
1899: \newblock Correction d'une somme en arithm\'etique \`a virgule flottante.
1900: \newblock {\em Numerische Mathematik}, 19:400--406, 1972.
1901:
1902: \bibitem{Pri91}
1903: Douglas~M. Priest.
1904: \newblock Algorithms for arbitrary precision floating point arithmetic.
1905: \newblock In Peter Kornerup and David Matula, editors, {\em Proceedings of the
1906: 10th Symposium on Computer Arithmetic}, pages 132--144, Grenoble, France,
1907: 1991.
1908:
1909: \bibitem{Rus98}
1910: David~M. Russinoff.
1911: \newblock A mechanically checked proof of {IEEE} compliance of the floating
1912: point multiplication, division and square root algorithms of the {AMD-K7}
1913: processor.
1914: \newblock {\em LMS Journal of Computation and Mathematics}, 1:148--200, 1998.
1915:
1916: \bibitem{Sei2K}
1917: Peter-Michael Seidel.
1918: \newblock Exact arithmetic based on floating-point numbers.
1919: \newblock In {\em IMACS-GAMMM International Symposium on Scientific Computing,
1920: Computer Arithmetic and Validated Numerics}, page 123, Karlsruhe, Germany,
1921: 2000.
1922:
1923: \bibitem{SerVuiHer89}
1924: Bernard Serpette, Jean Vuillemin, and Jean-Claude Herv\'{e}.
1925: \newblock {BigNum}: a portable and efficient package for arbitrary-precision
1926: arithmetic.
1927: \newblock Technical Report~2, Digital Equipment Corporation, Paris Research
1928: Laboratory, 1989.
1929:
1930: \bibitem{She96}
1931: Jonathan~R. Shewchuk.
1932: \newblock Robust adaptative floating point geometric predicates.
1933: \newblock In {\em Proceedings of the 12th Annual ACM Symposium on Computational
1934: Geometry}, pages 141--150, Philadelphia, Pensylvania, 1996.
1935:
1936: \bibitem{She97}
1937: Jonathan~R. Shewchuk.
1938: \newblock Adaptive precision floating-point arithmetic and fast robust
1939: geometric predicates.
1940: \newblock In {\em Discrete and Computational Geometry}, volume~18, pages
1941: 305--363, 1997.
1942:
1943: \bibitem{SorTan91}
1944: Danny~C. S{\o}rensen and Ping Tak~Peter Tang.
1945: \newblock On the orthogonality of eigenvectors computed by divide and conquer
1946: methods techniques.
1947: \newblock {\em SIAM Journal of Numerical Analysis}, 28(6):1752--1775, 1991.
1948:
1949: \bibitem{SteStr94}
1950: William Steiger and Ileana Streinu.
1951: \newblock A pseudo-algorithmic separation of line from pseudo-lines.
1952: \newblock In {\em Proceedings of the Sixth Canadian Conference on Computational
1953: Geometry}, Saskatoon, Saskatchewan, 1994.
1954:
1955: \bibitem{Ste74}
1956: Pat~H. Sterbenz.
1957: \newblock {\em Floating point computation}.
1958: \newblock Prentice Hall, 1974.
1959:
1960: \bibitem{Ste.81}
1961: David Stevenson et~al.
1962: \newblock A proposed standard for binary floating point arithmetic.
1963: \newblock {\em IEEE Computer}, 14(3):51--62, 1981.
1964:
1965: \bibitem{Ste.87}
1966: David Stevenson et~al.
1967: \newblock An american national standard: {IEEE} standard for binary floating
1968: point arithmetic.
1969: \newblock {\em ACM SIGPLAN Notices}, 22(2):9--25, 1987.
1970:
1971: \bibitem{TriErc77}
1972: Kishor~S. Trivedi and Milo\v{s}~D. Ercegovac.
1973: \newblock On-line algorithms for division and multiplication.
1974: \newblock {\em IEEE Transactions on Computers}, 26(7):681--687, 1977.
1975:
1976: \bibitem{Ziv91}
1977: Abraham Ziv.
1978: \newblock Fast evaluation of elementary mathematical functions with correctly
1979: rounded last bit.
1980: \newblock {\em ACM Transactions on Mathematical Software}, 17(3):410--423,
1981: 1991.
1982:
1983: \end{thebibliography}
1984:
1985:
1986: \appendix
1987: \section{A brief overview of Coq}
1988:
1989: In this appendix, we give a quick overview of the Coq system. For a
1990: more complete introduction, we refer the reader to~\cite{HueKahPau97}.
1991:
1992: Coq is a generic prover based on type-theory. In this system,
1993: users can define new objects {\it and} prove properties that
1994: derive logically from these definitions. Objects in Coq are
1995: typed and functions are first-class objects using a Lisp-like
1996: notation. The system is distributed with standard libraires that
1997: define types like {\tt nat}, {\tt Z} and {\tt R} which correspond to
1998: the natural numbers, the relative numbers and the reals respectively.
1999: To give an example, the addition for natural number is represented by
2000: the function {\tt plus}
2001: whose type is {\tt nat $\rightarrow$ nat $\rightarrow$ nat}.
2002: A function {\tt plus3} that does the sum of 3 natural numbers is
2003: defined by the following command:
2004: {\small \begin{verbatim}
2005: Definition plus3 := [a,b,c:nat] (plus a (plus b c)).
2006: \end{verbatim}}
2007: \noindent
2008: Arguments between square brackets provide the parameters of the function.
2009: The type of {\tt plus3} is the expected one: {\tt nat $\rightarrow$ nat $\rightarrow$
2010: nat $\rightarrow$ nat}.
2011:
2012: In our formalization, we first define the type {\tt float} that
2013: represents the record composed of the mantissa and the exponent
2014: by the command:
2015: {\small \begin{verbatim}
2016: Record float: Set := Float {Fnum: Z; Fexp: Z}.
2017: \end{verbatim}}
2018: \noindent
2019: This command creates a new type {\tt float}, a constructor {\tt Float}
2020: of type {\tt Z $\rightarrow$ Z $\rightarrow$ float} and two
2021: destructors {\tt Fnum} of type {\tt float $\rightarrow$ Z} and
2022: {\tt Fexp} of type {\tt float $\rightarrow$ Z}. So for example,
2023: we can define the function {\tt Fzero} that takes an object of
2024: type {\tt float} and returns an object of same exponent but
2025: with the mantissa set to zero:
2026: {\small \begin{verbatim}
2027: Definition Fzero := [p:float](Float 0 (Fexp p)).
2028: \end{verbatim}}
2029: \noindent
2030: In a similar way, we define the notion of bound. It contains
2031: two integers: one for the mantissa and one for the exponent:
2032: {\small \begin{verbatim}
2033: Record Fbound: Set := Bound {vNum: nat;dExp: nat}.
2034: \end{verbatim}}
2035: \noindent
2036: In order to prove properties that are independent of a particular
2037: bound, we add the command:
2038: {\small \begin{verbatim}
2039: Variable b:Fbound.
2040: \end{verbatim}}
2041: \noindent
2042: With this arbitrary bound, the notion of being bounded is defined as:
2043: {\small \begin{verbatim}
2044: Definition Fbounded := [p:float]
2045: ((Zle (Zopp (vNum b)) (Fnum p)) /\ (Zle (Fnum p) (vNum b))) /\
2046: (Zle (Zopp (dExp b)) (Fexp p)).
2047: \end{verbatim}}
2048: \noindent
2049: A pretty-printed version of this definition could be:\\
2050: \vbox{
2051: \vskip5pt
2052: \hbox{{\tt Definition Fbounded := $\lambda$p:float.\,}}
2053: \hbox{ {\tt -(vNum b) $\le$ (Fnum p) $\le$ (vNum b) $\land$ -(dExp b) $\le$ (Fexp p).}}
2054: \vskip5pt
2055: }\\
2056: Now that we have these definitions we can prove a very simple
2057: first theorem:
2058: {\small \begin{verbatim}
2059: Theorem BoundedZero: (p:float) (Fbounded p) ->(Fbounded (Fzero p)).
2060: \end{verbatim}}
2061: \noindent
2062: A pretty-printed version of this theorem could be:\\
2063: \vbox{
2064: \vskip5pt
2065: \hbox{{\tt Theorem BoundedZero: $\forall$p:float.\, (Fbounded p) $\Rightarrow$ (Fbounded (Fzero p)).}}
2066: \vskip5pt
2067: }\\
2068: When the previous command is received by the system, it enters
2069: the proof mode. The statement of the theorem is put om the top
2070: of the goal stack. By applying a command called {\it tactic} we
2071: replace the goal at the top of the stack by a (possibly empty)
2072: list of subgoals. The proof is finished when the stack is empty.
2073: Our initial stack has only one goal. Each goal contains a list
2074: of hypothesis and a conclusion separated by a bar:
2075: {\small \begin{verbatim}
2076: 1 subgoal
2077:
2078: ============================
2079: (p:float)(Fbounded p)->(Fbounded (Fzero p))
2080: \end{verbatim}}
2081: \noindent
2082: The first step is to introduce the hypothesis. For this,
2083: we apply the tactic {\tt Intros}.
2084: {\small \begin{verbatim}
2085: Intros p H.
2086: \end{verbatim}}
2087: \noindent
2088: We are now reduce to prove {\tt (Fbounded (Fzero p))} in
2089: the contect where {\tt p} is a {\tt float} and {\tt H} is a
2090: proof that p is bounded:
2091: {\small \begin{verbatim}
2092: 1 subgoal
2093:
2094: p : float
2095: H : (Fbounded p)
2096: ============================
2097: (Fbounded (Fzero p))
2098: \end{verbatim}}
2099: \noindent
2100: Next we expand the definition of {\tt Fbounded} in the goal. For this,
2101: we use the tactic {\tt Red} and get the new goal:
2102: {\small \begin{verbatim}
2103: 1 subgoal
2104:
2105: p : float
2106: H : (Fbounded p)
2107: ============================
2108: ((Zle (Zopp (vNum b)) (Fnum (Fzero p)) /\
2109: (Zle (Fnum (Fzero p)) (vNum b))) /\
2110: (Zle (Zopp (dExp b)) (Fexp (Fzero p)))
2111: \end{verbatim}}
2112: \noindent
2113: This goal can be simplified using the definition of
2114: {\tt Fzero} by the tactic {\tt Simpl}:
2115: {\small \begin{verbatim}
2116: 1 subgoal
2117:
2118: p : float
2119: H : (Fbounded p)
2120: ============================
2121: ((Zle (Zopp (vNum b)) 0) /\ (Zle 0 (vNum b))) /\
2122: (Zle (Zopp (dExp b)) (Fexp p))
2123: \end{verbatim}}
2124: \noindent
2125: We now need to break the conjunctions into separate
2126: subgoals. This is done by the tactic {\tt Split}. As
2127: the conjunction is nested, this tactic needs to be
2128: applied repeatedly:
2129: {\small \begin{verbatim}
2130: Repeat Split.
2131: \end{verbatim}}
2132: \noindent
2133: We get the expected three subgoals.
2134: {\small \begin{verbatim}
2135: 3 subgoals
2136:
2137: p : float
2138: H : (Fbounded p)
2139: ============================
2140: (Zle (Zopp (vNum b)) 0)
2141:
2142: subgoal 2 is:
2143: (Zle 0 (vNum b))
2144:
2145: subgoal 3 is:
2146: (Zle (Zopp (dExp b)) (Fexp p))
2147: \end{verbatim}}
2148: \noindent
2149: The first goal is simple enough to be proved automatically
2150: by the following tactic {\tt Intuition}. We are left with two subgoals.
2151: {\small \begin{verbatim}
2152: 2 subgoals
2153:
2154: p : float
2155: H : (Fbounded p)
2156: ============================
2157: (Zle 0 (vNum b))
2158:
2159: subgoal 2 is:
2160: (Zle (Zopp (dExp b)) (Fexp p))
2161: \end{verbatim}}
2162: \noindent
2163: Applying the same tactic to these two subgoals ends the proof.
2164: The final proof script looks like:
2165: {\small \begin{verbatim}
2166: Theorem BoundedZero: (p:float) (Fbounded p) ->(Fbounded (Fzero p)).
2167: Intros p H.
2168: Red.
2169: Simpl.
2170: Repeat Split.
2171: Intuition.
2172: Intuition.
2173: Intuition.
2174: Qed.
2175: \end{verbatim}}
2176: \noindent
2177: Using the composition of tactics ";", it can be shorten to:
2178: {\small \begin{verbatim}
2179: Theorem BoundedZero: (p:float) (Fbounded p) ->(Fbounded (Fzero p)).
2180: Intros p H; Red; Simpl; Repeat Split; Intuition.
2181: Qed.
2182: \end{verbatim}}
2183:
2184: \end{document}
2185:
2186: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2187: