cs0504020/cs0504020
1: %PHVA.tex (2/21/05)
2: %
3: \documentclass[11pt]{article}
4: \usepackage{amstext,amssymb}
5: \usepackage{latexsym,epsfig}
6: 
7: %--------------- Various Style Declarations ----------------------------
8: 
9: \setlength{\textheight}{9.00in}      % height of the text body
10: \setlength{\textwidth}{6.30in}       % width of the text body
11: \setlength{\oddsidemargin}{0.00in}   % no extra left margin on odd pages
12: \setlength{\evensidemargin}{0.00in}  % ... nor on even pages
13: \setlength{\topmargin}{-0.75in}      % negative value pulls the page head upward
14: \setlength{\topskip}{0.50in}
15: \setlength{\footskip}{0.50in}
16: 
17: \setlength{\parskip}{4pt}            % vertical gap between paragraphs
18: \setlength{\parindent}{8pt}          % first-line paragraph indent
19: \renewcommand{\arraystretch}{1.2}    % row-height factor for array/tabular
20: 
21: %%% Print equation numbers as <section>.<equation>
22: \renewcommand{\theequation}{\thesection.\arabic{equation}} % changes the printed label only; NOTE(review): nothing in this preamble resets the equation counter at each \section -- confirm that is intended
23: 
24: %%% Theorem-like environments.  The trailing [section] numbers each one
25: %%% within its section; lemma and corollary share theorem's counter.
26: 
27:  \newtheorem{definition}{Definition}[section]
28:  \newtheorem{example}{Example}[section]
29:  \newtheorem{theorem}{Theorem}[section]
30:  \newtheorem{lemma}[theorem]{Lemma}          % numbered on theorem's counter
31:  \newtheorem{remark}{Remark}[section]
32:  \newtheorem{proposition}{Proposition}[section]
33:  \newtheorem{corollary}[theorem]{Corollary}  % numbered on theorem's counter
34:  \newtheorem{problem}{Problem}[section]
35:  \newtheorem{conversion}{Conversion}[section]
36: 
37: %% various definitions
38: 
39: \newcommand{\Z}{{\mathbb{Z}}}  % blackboard-bold letters (math mode; \mathbb comes from amssymb)
40: \newcommand{\F}{{\mathbb{F}}}
41: \newcommand{\R}{{\mathbb{R}}}
42: \newcommand{\C}{{\mathbb{C}}}  % blackboard-bold C; the argument must be the letter C, not \CC (which is the calligraphic C)
43: \newcommand{\K}{{\mathbb{K}}}
44: \newcommand{\A}{{\mathcal{A}}}  % calligraphic capitals (math mode), one macro per letter
45: \newcommand{\B}{{\mathcal{B}}}
46: \newcommand{\Bf}{{\mathfrak{B}}}  % Fraktur B
47: \newcommand{\CC}{{\mathcal{C}}}  % calligraphic C; doubled name because \C is already defined above
48: \newcommand{\D}{{\mathcal{D}}}
49: \newcommand{\E}{{\mathsf{E}}}  % sans-serif E (not calligraphic; that one is \EE)
50: \newcommand{\EE}{{\mathcal{E}}}
51: \newcommand{\FF}{{\mathcal{F}}}
52: \newcommand{\HH}{{\mathcal{H}}}
53: \newcommand{\I}{{\mathcal{I}}}
54: \newcommand{\J}{{\mathcal{J}}}
55: \newcommand{\KK}{{\mathcal{K}}}
56: \renewcommand{\L}{{\mathcal{L}}}  % \renewcommand: replaces the already-defined \L
57: \newcommand{\N}{{\mathcal{N}}}
58: \renewcommand{\P}{{\mathcal{P}}}  % \renewcommand: replaces the already-defined \P
59: \newcommand{\Q}{Q^{\sqrt{}}}  % Q with an empty radical sign as superscript
60: \newcommand{\PP}{{\mathcal{P}}}  % same expansion as \P
61: \newcommand{\RR}{{\mathcal{R}}}
62: \renewcommand{\S}{{\mathcal{S}}}  % \renewcommand: replaces the already-defined \S
63: \newcommand{\SSS}{{\mathcal{S}}}  % same expansion as \S
64: \newcommand{\T}{{\mathcal{T}}}
65: \newcommand{\U}{{\mathcal{U}}}
66: \newcommand{\V}{{\mathcal{V}}}
67: \newcommand{\W}{{\mathcal{W}}}
68: \newcommand{\X}{{\mathcal{X}}}
69: \newcommand{\Y}{{\mathcal{Y}}}
70: \newcommand{\Ah}{\A\hat{\ }}  % each macro here: a letter followed by a hat accent set over a control space (math mode)
71: \newcommand{\Gh}{G\hat{\ }}
72: \newcommand{\Ghh}{G\hat{\ }\hat{\ }}  % G followed by two such hats
73: \newcommand{\Hh}{H\hat{\ }}
74: \newcommand{\Uh}{U\hat{\ }}
75: \newcommand{\Vh}{V\hat{\ }}
76: \newcommand{\Sh}{S\hat{\ }}
77: \newcommand{\ab}{\mathbf a}  % bold upright letters (math mode): \<letter>b expands to \mathbf <letter>
78: \newcommand{\Ab}{\mathbf A}
79: \newcommand{\bb}{\mathbf b}
80: \newcommand{\cb}{\mathbf c}
81: \newcommand{\db}{\mathbf d}
82: \newcommand{\eb}{\mathbf e}
83: \newcommand{\fb}{\mathbf f}
84: \newcommand{\gb}{\mathbf g}
85: \newcommand{\hb}{\mathbf h}
86: \newcommand{\ib}{\mathbf i}
87: \newcommand{\mb}{\mathbf m}
88: \newcommand{\nb}{\mathbf n}
89: \newcommand{\pb}{\mathbf p}
90: \newcommand{\qb}{\mathbf q}
91: \newcommand{\rb}{\mathbf r}
92: \renewcommand{\sb}{\mathbf s}  % \renewcommand: replaces the already-defined \sb
93: \newcommand{\tb}{\mathbf t}
94: \newcommand{\ub}{\mathbf u}
95: \newcommand{\vb}{\mathbf v}
96: \newcommand{\wb}{\mathbf w}
97: \newcommand{\xb}{\mathbf x}
98: \newcommand{\yb}{\mathbf y}
99: \newcommand{\zb}{\mathbf z}
100: \newcommand{\zerob}{\mathbf 0}  % bold digit 0
101: \newcommand{\oneb}{\mathbf 1}  % bold digit 1
102: 
103: \newcommand{\sigmab}{\mbox{\boldmath$\sigma$}} % bold sigma; the \mbox wrapper confines \boldmath to this one symbol
104: \newcommand{\bsig}{\boldmath $\sigma$\unboldmath} % bold sigma via an explicit \boldmath ... \unboldmath switch issued around the inline math
105: \newcommand{\bomeg}{\boldmath $\omega$\unboldmath} % same construction for omega
106: 
107: \newcommand{\ie}{{\em i.e., }}  % emphasized "i.e.," -- the trailing space is part of the macro
108: \newcommand{\eg}{{\em e.g., }}  % emphasized "e.g.," -- likewise ends in a space
109: \newcommand{\cf}{\emph{cf.\ }}  % emphasized "cf." followed by a control space (no comma)
110: \newcommand{\etal}{\emph{et al.\ }}  % emphasized "et al." followed by a control space
111: 
112: \newcommand{\inner}[2]{\langle{#1},{#2}\rangle}  % #1,#2 enclosed in angle brackets
113: \newcommand{\Proof}{\hspace*{0pt}{\em Proof}}  % the word "Proof" in emphasis
114: \newcommand{\propeq}{\equiv_\alpha}  % equivalence sign with subscript alpha
115: \newcommand{\mapform}[3]{{#1}\colon~{#2} \rightarrow {#3}}  % typesets  #1: #2 -> #3
116: \newcommand{\flr}[1]{\left\lfloor{#1}\right\rfloor}  % auto-sized floor brackets
117: \newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}  % auto-sized ceiling brackets
118: \newcommand{\lra}{\leftrightarrow}
119: %\newcommand{\qed}{\hfill\rule{8pt}{8pt}} %(text mode only) -- disabled solid-box variant
120: \newcommand{\qed}{\hspace*{1cm}\hspace*{\fill}\openbox}  % end-of-proof mark: \openbox pushed right by \fill; \openbox itself is defined further down in this preamble
121: \newcommand{\half}{\frac{1}{2}}
122: \newcommand{\dint}{\int\!\!\!\int}  % two integral signs tightened by three negative thin spaces
123: 
124: \newcommand{\df}{\textbf}  % alias for \textbf (presumably meant for defined terms -- confirm)
125: \newcommand{\eqr}[1]{(\ref{#1})}  % cross-reference number wrapped in parentheses
126: 
127: \newcommand{\mod}{\mathrm{~mod~}}  %(math mode only) upright "mod" with a tie on each side
128: \newcommand{\im}{\mathrm{im~}}  %(math mode only) upright "im" followed by a tie
129: \newcommand{\rect}{\mathrm{rect}}  %(math mode only)
130: \newcommand{\sinc}{\mathrm{sinc}}  %(math mode only)
131: \newcommand{\rank}{\mathrm{rank}}  %(math mode only)
132: \newcommand{\argmin}{\mathrm{arg\,min}}  %(math mode only) thin space between "arg" and "min"
133: \newcommand{\eps}{\varepsilon} %(math mode only)
134: \newcommand{\remove}[1]{}  % swallows its argument and typesets nothing
135: 
136: \newcommand{\h}{\hat{\ }}  % hat accent set over a control space (math mode)
137: \newcommand{\cl}{^\mathrm{cl}}  % upright "cl" as a superscript (math mode)
138: 
139: %%% Proof environment and open end-of-proof box, labelled by the author as the AMS proof environment (no \usepackage line in this preamble loads amsthm)
140: \newcommand{\openbox}{\leavevmode
141:   \hbox to.77778em{% fixed-width outer box
142:   \hfil\vrule
143:   \vbox to.675em{\hrule width.6em\vfil\hrule}% top and bottom rules of the square
144:   \vrule\hfil}}
145: \newcommand{\proofname}{Proof}
146: \newenvironment{proof}[1][\proofname]{\par\normalfont
147:  \trivlist\item[\hskip\labelsep\itshape #1:]\ignorespaces
148: }{\hspace*{1cm}\hspace*{\fill}\openbox \medskip\endtrivlist}
149: %%% End of proof environment; optional argument #1 (default \proofname) is the italic heading, and the end code appends \openbox at the right
150: 
151: 
152: %************************************************************************
153: %                                                                       *
154: %            End of preamble and beginning of text.                     *
155: %                                                                       *
156: %************************************************************************
157: 
158:  
159: \begin{document}
160: \renewcommand{\textfraction}{0}
161: 
162: \title{The Viterbi Algorithm:  A Personal History}
163:  \author{\normalsize
164: G. David Forney, Jr.
165:  \\[-5pt]
166: \small MIT \\[-5pt]
167: \small Cambridge, MA 02139 USA \\[-5pt] \small
168: \texttt{forneyd@comcast.net} }
169: \date{}
170: \maketitle
171: \thispagestyle{empty}
172: \begin{abstract}
173: The story of the Viterbi algorithm (VA) is told from a personal
174: perspective. Applications both within and beyond communications are
175: discussed. 
176: In brief summary, the VA has proved to be an extremely important algorithm
177: in a surprising variety of fields.
178: \end{abstract}
179: \normalsize
180: 
181: %{\bf Index terms:} 
182: 
183: %\pagebreak
184: \section{Introduction}
185: 
186: Andrew J. Viterbi is rightly celebrated as one of the leading
187: communications engineers and theorists of the twentieth century.  He has
188: received almost every professional award possible, including election not
189: only to the National Academy of Engineering (USA) but also to the National
190: Academy of Sciences (USA), where he chairs the Computer
191: Science section.  His award citations usually cite ``invention of the
192: Viterbi algorithm" as his most notable accomplishment.
193: 
194: On the other hand, Andy would be the first to tell you that other people
195: deserve much of the credit for recognizing its theoretical properties and
196: its practical attractiveness, and for extending its domain of
197: application.  He has often told this story himself (see, \eg  \cite{V90}).
198: 
199: Nevertheless, no one doubts that Andy's awards are entirely deserved, and
200: that their focus on the Viterbi algorithm (VA) is 
201: appropriate.  This article will attempt to explain why, by briefly
202: recounting the history of the VA.  It is a ``personal history," because
203: the story of the VA is so intertwined with my own history that I can
204: recount much of it from a personal perspective.
205: 
206: %\pagebreak
207: \section{Invention of the Viterbi algorithm}
208: 
209: The Viterbi algorithm was first presented in Andy's famous 1967 paper
210: \cite{V67a} to help prove an asymptotically optimum upper bound
211: on the error probability of convolutional codes, which had previously been
212: derived by Yudkin in the context of sequential decoding \cite{Y64}.  In
213: this paper, the VA is presented just as we understand it today.  This paper
214: introduces the important concept of \emph{survivors} (a term possibly
215: borrowed from tennis elimination tournaments), and shows that
216: only $q^K$ survivors need be retained to decode a convolutional code with
217: constraint length $K$ over the
218: $q$-ary field $GF(q)$.  Compared to a block
219: code with $q^K$ codewords, such a convolutional code is shown to have a
220: much better error exponent, particularly near
221: capacity.
222: 
223: Andy recalls in a 1999 interview \cite{M99} that
224: 
225: \begin{quote}
226:  ``the Viterbi algorithm
227: for convolutional codes \ldots came out of my teaching \ldots.  I found
228: information theory difficult to teach, so I started developing some tools.
229: \ldots  I wrote the first paper in March '66, but it wasn't published
230: until April '67. \ldots  At one point I was actually discouraged from
231: publishing the algorithm details.  Fortunately, one of the reviewers, Jim
232: Massey, encouraged me to include the algorithm. \ldots  Nobody thought
233: that it had any potential for practical value \ldots"
234: \end{quote}
235: 
236: It is clear from the paper that at this point Andy had no idea that the VA
237: was actually an optimum (maximum likelihood) decoder, nor that it was
238: potentially practical.  Indeed, the paper states that ``this decoding
239: algorithm is clearly suboptimal," and concludes: ``Although this algorithm
240: is rendered impractical by the excessive storage requirements, it
241: contributes to a general understanding of convolutional codes and
242: sequential decoding through its simplicity of mechanization and analysis"
243: \cite{V67a}.
244: 
245: %A second 1967 paper \cite{V67b} discussed the application of the Viterbi
246: %algorithm to orthogonal convolutional codes, whose analysis is
247: %particularly nice.
248: 
249: %\pagebreak
250: \section{Discovery that the VA is optimum}
251: 
252: I believe that I received a copy of Andy's paper prior to publication,
253: probably via Jim Massey.  At that time I was working at Codex Corp., a
254: small start-up company aiming at practical applications of convolutional
255: codes.  Our primary focus was initially on threshold decoding, which was
256: the subject of Jim's doctoral thesis \cite{M63};  Jim was a
257: consultant.  Subsequently, we developed a sequential decoding system
258: \cite{WR61} for the Pioneer deep-space satellite program, which became the
259: first code in space \cite{CHIW98}.
260: 
261: I had been trying to understand why in practice convolutional codes were
262: generally superior to block codes, so I studied Andy's paper
263: with great interest.  I realized that the path-merging property of
264: convolutional codes could be depicted in what I called a \emph{trellis
265: diagram}, to contrast with the then-conventional tree diagram used in the
266: analysis of sequential decoding.  It was then only a small step to see that
267: the Viterbi algorithm was an exact recursive algorithm for finding the
268: shortest path through a trellis, and thus was actually an optimum trellis
269: decoder.  I believe that at that point I called Andy, and told him that he
270: had been too modest when he asserted that the VA was ``asymptotically
271: optimum."
272: 
273: These results were written up in a 1967 technical report \cite{F67}
274: for NASA Ames Research Center.  They were not published in journal form
275: until many years later, in \cite{F73} and \cite{F74}.
276: 
277: Shortly afterward, in a paper submitted in May 1968  \cite{O69},
278: Jim Omura observed that the VA was simply the standard
279: forward dynamic programming solution to maximum-likelihood decoding of a
280: discrete-time, finite-state dynamical system observed in memoryless
281: noise.  Beyond proving optimality in a different way, he thus made the
282: first connection between the VA and system and control theory.  It is
283: interesting to speculate whether the history of the VA would have been
284: different if it had simply been called ``dynamic programming" from the
285: beginning. 
286: 
287: At this point, none of us had recognized that the VA might be practical. 
288: Jim's paper concludes:  ``\ldots the decoding algorithm discussed here
289: grows exponentially in complexity with constraint length $\nu$ and is
290: therefore impractical for large $\nu$ \ldots."  More embarrassingly, in a
291: 1970
292: \textsc{IEEE Spectrum} paper \cite{F70} describing practical coding
293: schemes for the space channel, I wrote:
294: 
295: \begin{quote}
296: Sequential decoding [is] the best-performing practical technique known for
297: memoryless channels like the space channel, and will probably be the
298: general-purpose workhorse for these channels in the future \ldots.
299: 
300: [The Viterbi algorithm] is competitive in performance with sequential
301: decoding for moderate error rates, but cannot achieve very low error rates
302: efficiently.  On the other hand, it [is] capable of extremely high speeds
303: (tens of megabits), where sequential decoders become uneconomic.  It
304: therefore may find application in high-data-rate systems with modest error
305: requirements, such as digitized television.
306: \end{quote}
307: 
308: %\pagebreak
309: \section{Recognition that the VA is practical}
310: 
311: Andy has always said that Jerry Heller was the first person to realize
312: that the VA might be practical.  Jerry simulated the performance of
313: short-constraint-length codes at the Jet Propulsion Laboratory (JPL) in
314: 1968--69
315: \cite{H68, H69}, and found that with only a 64-state code he could
316: obtain a sizable coding gain, of the order of 6 dB.
317: 
318: In 1968, Andy, Irwin Jacobs, and Len Kleinrock incorporated Linkabit
319: Corp.\ in San Diego as a vehicle to pool their consulting efforts and to
320: obtain small government study contracts.  All kept their jobs as
321: professors.  In 1969, Jerry Heller was hired as Linkabit's first full-time
322: employee.  Linkabit obtained some small Navy and NASA contracts,
323: which enabled the construction of a VA prototype in 1969--70.  ``It was a
324: big monster filling a rack"
325: \cite{M99}.
326: 
327: The first IEEE Communication Theory Workshop in 1970 in St.\ Petersburg
328: became famous as the ``coding is dead" workshop, after Ned Weldon and other
329: speakers worried publicly that coding theory had come to a dead end.  But
330: what I remember best from that session is Irwin Jacobs standing up in the
331: back row, flourishing an integrated circuit (a 4-bit shift register, I
332: believe), and asserting that this represented the future of coding.  He
333: was quite right. (Unfortunately, by this time Codex had made a business
334: decision to get out of coding.)
335: 
336: By 1971, Linkabit had implemented a 2 Mb/s, 64-state Viterbi decoder.  In
337: a special issue on coding of the \textsc{IEEE Transactions on Communication
338: Technology} in October 1971, Heller and Jacobs
339: \cite{HJ71} discuss this decoder and many practical issues in careful
340: detail.  They compare the VA with sequential decoding, and conclude that
341: the VA will often be preferable because it can use quantized soft decisions
342: easily, and is less sensitive to channel and equipment variations.  In the
343: same issue, Cohen, Heller and Viterbi \cite{CHV71} describe a system using
344: orthogonal convolutional codes and the VA for asynchronous multiple-access
345: communications, and Viterbi \cite{V71} introduces generating-function
346: analysis techniques for the VA.
347: 
348: %\pagebreak
349: 
350: During the 1970s, through the leadership of Linkabit and JPL, the VA
351: became part of the coding standard for deep-space communication,
352: ultimately in a concatenated coding system with a Reed-Solomon (RS) outer
353: code.  Linkabit developed a relatively inexpensive and flexible VA chip,
354: and the VA became a nice little business for Linkabit.  It didn't hurt
355: that the inventor of the Viterbi algorithm was a Linkabit founder.  The VA
356: also began to be incorporated in many other communications applications.
357: 
358: In the early 1990s, JPL built a $2^{14}$-state ``Big Viterbi Decoder"
359: (BVD) with 8192 parallel add-compare-select (ACS) units, which operated at
360: a rate of the order of 1 Mb/s \cite{C92}.  As far as I know, the BVD
361: remains the biggest Viterbi decoder ever built.  
362: 
363: When the primary antenna
364: failed to deploy during the Galileo mission in 1992, JPL devised an
365: elaborate concatenated coding scheme involving a $2^{14}$-state rate-1/4
366: inner convolutional code and a set of variable-strength RS outer codes,
367: and reprogrammed it into the spacecraft computers.  This scheme was
368: able to operate within about 2 dB of the Shannon limit at a bit error
369: probability of less than $10^{-6}$, which was the world record prior to
370: the advent of turbo codes \cite{CHIW98}.
371: 
372: %\pagebreak
373: \section{The VA and intersymbol interference channels}
374: 
375: In the late 1960s, Codex turned its attention to the voiceband
376: modem business.  Our first-generation product
377: was a single-sideband (SSB) 9600 b/s modem with a so-called Class IV or $1-
378: D^2$ ``partial response."  About 1969, I recognized that the symbol
379: correlation that was thus introduced could be exploited by an \emph{ad
380: hoc} error correction algorithm, which was able to improve the noise
381: margin by about 2--3 dB.  This little decoder extended the commercial life
382: of this marginal-performance modem by perhaps a year or two.
383: 
384: It took me a while to understand that I had in fact invented a
385: maximum-likelihood sequence detector for this modem.  Over time, I
386: realized that this was nothing more than the Viterbi algorithm again,
387: streamlined for the $1 - D^2$ response.  This led to a 1972 paper
388: \cite{F72} that showed that the VA could be used as a maximum-likelihood
389: sequence detector for digital sequences in the presence of intersymbol
390: interference (ISI) and additive white Gaussian noise (AWGN).  
391: 
392: Meanwhile, Jim Omura had recognized independently at UCLA that the VA
393: could be used on intersymbol interference channels, because of their
394: convolutional character \cite{O71}.  Indeed, a tantalizing hint in this
395: direction appears in a book review by Andy Viterbi in 1970 \cite{V70}. 
396: After visiting UCLA, Hisashi Kobayashi further developed this idea,
397: particularly for practical applications in partial response modems  and
398: magnetic recording \cite{K71a, K71b}.
399: 
400: The VA proved to be too complicated for general use as an equalizer on ISI
401: channels.  However, it stimulated many suboptimal approximations, and
402: analysis of its performance gave bounds on the best possible
403: performance of any sequence detector.
404: 
405: However, the VA did become standard in the related application of
406: high-density magnetic recording.  In so-called PRML systems
407: (``partial-response equalization with maximum-likelihood sequence
408: detection") \cite{ISW98}, the magnetic recording channel is first equalized
409: to a simple ``partial response" such as $1 - D^2$,
410: and the resulting sequence is then detected by the VA, or by a simplified
411: version thereof, as Kobayashi had envisioned \cite{K71a}. 
412: In retrospect, it seems possible that my little SSB modem
413: decoder was the first implementation of such a PRML scheme.
414: 
415: %\pagebreak
416: \section{Trellis-coded modulation}
417: 
418: After Gottfried Ungerboeck published his invention of trellis-coded
419: modulation in 1982 \cite{U82}, the VA became the workhorse
420: decoder for the next several generations of voiceband modems.  
421: Ungerboeck extended trellis coding to multilevel constellations by
422: constructing trellis codes in which each branch of the trellis represents
423: a subset of constellation symbols, rather than a single symbol.  By clever
424: constellation partitioning and attention to distances between subsets, he
425: was able to obtain coding gains in the bandwidth-limited regime
426: comparable to those that can be obtained in the power-limited
427: regime.  
428: 
429: For example, the V.32 modem (1986) used an 8-state trellis code to
430: obtain a coding gain of about 3.5 dB, while the later V.34 modem (1994)
431:  used 16- to 64-state trellis codes to obtain coding gains of
432: 4.0 to 4.5 dB \cite{FBEM96}.
433: 
434: \section{Applications in mobile and broadcast communications}
435: 
436: The mobile communications channel is subject to fading, bursts, and
437: multiuser interference, and is a much more difficult medium than the
438: AWGN and linear Gaussian channels discussed above.  
439: The designers of second-generation (2G) cellular systems used every tool
440: available at the time (early 1990s) to provide reliable communication on
441: this difficult channel.
442: 
443: The CDMA system developed by Qualcomm uses a $2^8$-state, rate-1/3
444: convolutional code with interleaved 64-orthogonal modulation, and of course
445: a Viterbi decoder.  The TDMA system developed for GSM uses the VA
446: not only to decode a 16-state, rate-1/2 convolutional code, but
447: also for equalization.  A soft-output Viterbi algorithm (SOVA) is often
448: used in the latter application \cite{CHIW98}.
449: 
450: VA decoders are currently used in about one billion cellphones, which is
451: probably the largest number in any application.  However, the largest
452: current consumer of VA processor cycles is probably digital video
453: broadcasting.  A recent estimate at Qualcomm is that approximately
454: $10^{15}$ bits per second are now being decoded by the VA in digital TV
455: sets around the world, every second of every day \cite{P05}.
456: 
457: %\pagebreak
458: \section{General application to hidden Markov models}
459: 
460: In 1973, I wrote a tutorial paper on the Viterbi algorithm for the
461: \textsc{Proceedings of the IEEE} \cite{F73} that has turned out to be my 
462: most cited paper by far.  A recent search using Google Scholar shows 734
463: citations, far more than the 181 for my next-most-cited reference.
464: 
465: One of the main points of that paper was that the VA can be applied to any
466: problem that involves detecting the output sequence of a discrete-time,
467: finite-state machine in memoryless noise---\ie to detection and pattern
468: recognition problems involving hidden Markov models (HMMs).  Of course,
469: decoding of convolutional codes and sequence detection on ISI channels
470: were the main applications discussed in that paper.
471: 
472: During the 70s and 80s, the VA became widely used
473: in a variety of pattern recognition problems that could be described by
474: HMMs, particularly for speech recognition;  see
475: \cite{R89}.  Here the VA is often used as the E-step of an EM algorithm,
476: which also adjusts HMM parameters.
477: 
478: Indeed, a recent search of IEEE Xplore shows that most
479: current IEEE references to the VA occur in such Transactions as
480: \textsc{Pattern Analysis and Machine Intelligence} or \textsc{Systems, Man
481: and Cybernetics},
482: rather than in \textsc{Communications} or \textsc{Information Theory}. 
483: It seems that everyone in these fields knows how to ``Viterbi the data."
484: 
485: Finally, in the past decade, the VA has become widely used in much more
486: distant fields such as computational biology, \eg to locate genes in DNA
487: sequences.  See for example \cite{HSF97}, with its ``Viterbi Exon-Intron
488: Locator" (VEIL).
489: 
490: %\pagebreak
491: \section{Related algorithms}
492: 
493: In the past decade, the development of the field of ``codes on graphs" and
494: their related decoding algorithms has led to a remarkable conceptual
495: unification of a variety of detection and estimation algorithms which have
496: been introduced under various names for various applications.
497: 
498: In his 1996 dissertation, generalizing the earlier work of Gallager
499: \cite{G63} and Tanner \cite{T81}, Niclas Wiberg
500: \cite{W96, WLK95} developed the generic ``sum-product" and ``min-sum"
501: decoding algorithms for cycle-free graphs which may include
502: both symbol (observable) and state (hidden) variables.  For trellis
503: graphs, he showed that these reduce to the BCJR algorithm
504: \cite{BCJR74} and an algorithm equivalent to the Viterbi algorithm,
505: respectively.  For capacity-approaching codes such as turbo codes and
506: low-density parity-check (LDPC) codes, the sum-product algorithm with an
507: appropriate schedule becomes the standard iterative decoding algorithm
508: that is normally used with such  codes.
509: 
510: Later authors (\eg \cite{AM00, KFL01}) have shown that the sum-product
511: algorithm is equivalent to Pearl's ``belief propagation" algorithm for
512: statistical inference on Bayesian networks;  the Baum-Welch or
513: ``forward-backward" algorithm for inference with hidden Markov models; and
514: the Kalman smoother for linear Gaussian state-space models.
515: 
516: However, it is important to note that the min-sum algorithm
517: is a two-way ``backward-forward" algorithm.  The VA obtains the same
518: result with a ``forward-only" algorithm by storing a path history with
519: each survivor.  Of course, ``forward-only" is a key simplification,
520: particularly for real-time communications;  the min-sum algorithm would
521: never have been adopted in practice as widely as the VA has
522: been.\footnote{Interestingly, Ungerboeck discovered both the sum-product
523: and the min-sum algorithms for equalization applications in his thesis
524: \cite{U71};  however, he missed the forward-only version.}
525: 
526: %\pagebreak
527: \section{Conclusion}
528: 
529: The Viterbi algorithm has been tremendously important in communications. 
530: For moderately complex (not capacity-approaching) codes, it has proved to
531: yield the best tradeoff between performance and complexity both on
532: power-limited channels, such as space channels, and on bandwidth-limited
533: channels, such as voiceband telephone lines.  In practice, in these regimes
534: it has clearly outstripped its earlier rivals, such as sequential decoding
535: and algebraic decoding.  (However, it seems likely that it will be
536: superseded in many of its principal communications applications by
537: capacity-approaching codes with iterative decoding.)
538: 
539: Moreover, the VA has become a general-purpose algorithm for
540: decoding hidden Markov models in a huge variety of applications, from
541: speech recognition to computational biology. 
542: 
543: Andy Viterbi clearly did not envision the full import of the VA when
544: he first introduced it.  However, he and his colleagues at Linkabit and
545: Qualcomm were largely responsible for making it practical, and for driving
546: its widespread adoption in communications.  The history might have been
547: otherwise, but it wasn't.  In actual fact, no one deserves more credit for
548: this tremendously important invention than its actual inventor.
549: 
550: \section*{Acknowledgments}
551: I am very grateful for comments on drafts of this paper by Keith Chugg,
552: Dan Costello, Bob Gallager, Jim Massey, Jim Omura, Sergio Verd\'{u} and
553: Andy Viterbi.
554: 
555: %\pagebreak
556: {\small
557: \begin{thebibliography}{10}
558: 
559: \bibitem{AM00}
560: S. M. Aji and R. J. McEliece, ``The generalized distributive law,"
561: \emph{IEEE Trans.\ Inform.\ Theory}, vol.\ 46, pp.\ 325--343, Mar.\
562: 2000.
563: 
564: \bibitem{BCJR74}
565: L. R. Bahl, J. Cocke, F. Jelinek and J. Raviv,
566: ``Optimal decoding of linear codes for minimizing symbol error rate,"  
567: \emph{IEEE Trans.\ Inform.\ Theory}, vol.\ IT--20, pp.\ 284--287, Mar.\
568: 1974.
569: 
570: \bibitem{CHV71}
571: A. R. Cohen, J. A. Heller and A. J. Viterbi, ``A new coding technique
572: for asynchronous multiple access communication," 
573: \emph{IEEE Trans.\ Commun.\ Tech.}, vol.\  COM--19, pp.\ 849--855, Oct.\
574: 1971.
575: 
576: \bibitem{C92}
577: O. M. Collins, ``The subtleties and intricacies of building a constraint
578: length 15 convolutional decoder," 
579: \emph{IEEE Trans.\ Commun.}, vol.\  40, pp.\ 1810--1819, Dec.\ 1992.
580: 
581: %\bibitem{CCSDS87}
582: %Consultative Committee for Space Data Systems,
583: %``Recommendations for space data standard:  Telemetry channel coding,"
584: %Blue Book Issue 2, CCSDS 101.0-B2, Jan. 1987.
585: 
586: \bibitem{CHIW98}
587: D. J. Costello, Jr., J. Hagenauer, H. Imai and S. B. Wicker,
588: ``Applications of error-control coding,"  
589: \emph{IEEE Trans.\ Inform.\ Theory}, vol.\ 44, pp.\ 2531--2560, Oct.\
590: 1998.
591: 
592: \bibitem{F67}
593: G. D. Forney, Jr., ``Review of random tree codes," Appendix A, Final
594: Report, Contract NAS2-3637, NASA CR73176, NASA Ames Res.\ Ctr.,
595: Moffett Field, CA, Dec.\ 1967.
596: 
597: \bibitem{F70}
598: G. D. Forney, Jr., ``Coding and its application in space communications," 
599: \emph{IEEE Spectrum}, vol.\ 7, pp.\ 47--58, 1970.
600: 
601: \bibitem{F72}
602: G. D. Forney, Jr., ``Maximum-likelihood sequence estimation of digital
603: sequences in the presence of intersymbol interference,"  \emph{IEEE Trans.\
604: Inform.\ Theory}, vol.\ IT--18, pp.\ 363--378, May 1972.
605: 
606: \bibitem{F73}  
607: G. D. Forney, Jr.,  ``The Viterbi algorithm,"  \emph{Proc.\ IEEE}, vol.\ 
608: 61, pp.\ 268--278, March 1973.
609: 
610: \bibitem{F74}
611: G. D. Forney, Jr., ``Convolutional codes II.  Maximum-likelihood
612: decoding,"  \emph{Inform.\ and Control}, vol.\ 25, pp.\ 222--266, 1974.
613: 
614: %\bibitem{F94}
615: %G. D. Forney, Jr., ``Trellises old and new,"  in \emph{Communications and
616: %Cryptography} (R. E. Blahut et al., eds.), pp.\ 115--128.  Boston:  Kluwer,
617: %1994.
618: 
619: \bibitem{FBEM96}
620: G. D. Forney, Jr., L. Brown, M. V. Eyuboglu, and J. L. Moran III, 
621: ``The V.34 high-speed modem standard,"  \emph{IEEE Commun.\ Mag.}, vol.\ 34,
622: no.\ 12, pp.\ 28--33, Dec.\ 1996.
623: 
624: %\bibitem{F01}  
625: %G. D. Forney, Jr.,  ``Codes on graphs:  Normal realizations,"  \emph{IEEE
626: %Trans.\ Inform.\ Theory}, vol.\  IT--13, pp.\ 520--548, Feb.\ 2001.
627: 
628: \bibitem{G63}
629: R. G. Gallager, \emph{Low-Density Parity-Check Codes}.  Cambridge, MA:  MIT
630: Press, 1963.
631: 
632: \bibitem{H68}
633: J. A. Heller, ``Short constraint length convolutional codes,"  Jet Prop.\
634: Lab., Space Prog.\ Summary 37--54, vol.\ III, pp.\ 171--177, 1968.
635: 
636: \bibitem{H69}
637: J. A. Heller, ``Improved performance of short constraint length
638: convolutional codes,"  Jet Prop.\ Lab., Space Prog.\ Summary 37--56, vol.\
639: III, pp.\ 83--84, 1969.
640: 
641: \bibitem{HJ71}
642: J. A. Heller and I. M. Jacobs, ``Viterbi decoding for satellite and
643: space communication,"  \emph{IEEE Trans.\ Commun.\ Tech.}, vol.\  COM--19,
644: pp.\ 835--848, Oct.\ 1971.
645: 
646: \bibitem{HSF97}
647: J. Henderson, S. Salzberg and K. H. Fasman,
648: ``Finding genes in DNA with a hidden Markov model,"
649: \emph{J. Comput.\ Biol.}, vol.\ 4, pp.\ 127--141, 1997.
650: 
651: \bibitem{ISW98}
652: K. A. S. Immink, P. H. Siegel and J. K. Wolf,
653: ``Codes for digital recorders,"  
654: \emph{IEEE Trans.\ Inform.\ Theory}, vol.\  44, pp.\ 2260--2299, Oct.\
655: 1998.
656: 
657: \bibitem{K71a}
658: H. Kobayashi, ``Application of probabilistic decoding to digital magnetic
659: recording systems,"  \emph{IBM J. Res.\ Dev.}, vol.\ 15, pp.\ 64--74,
660: Jan.\ 1971.
661: 
662: \bibitem{K71b}
663: H. Kobayashi, ``Correlative level coding and maximum likelihood decoding,"  
664: \emph{IEEE Trans.\ Inform.\ Theory}, vol.\  IT--17, pp.\ 586--594, Sept.\
665: 1971.
666: 
667: \bibitem{KFL01}
668: F. R. Kschischang, B. J. Frey and H.-A. Loeliger, 
669: ``Factor graphs and the sum-product algorithm,"
670: \emph{IEEE Trans.\ Inform.\ Theory}, vol.\ 47, pp.\ 498--519, Feb.\ 2001.
671: 
672: \bibitem{M63}
673: J. L. Massey, \emph{Threshold Decoding}.  Cambridge, MA:  MIT Press, 1963.
674: 
675: \bibitem{M99}
676: D. Morton, ``Andrew Viterbi, electrical engineer:  An oral history,"  IEEE
677: History Center, Rutgers U., New Brunswick, NJ, Oct.\ 1999.
678: 
679: \bibitem{O69}
680: J. K. Omura,
681: ``On the Viterbi decoding algorithm,"  \emph{IEEE Trans.\
682: Inform.\ Theory}, vol.\ IT--15, pp.\ 177--179, 1969.
683: 
684: \bibitem{O71}
685: J. K. Omura,
686: ``Optimal receiver design for convolutional codes and channels with
687: memory via control theoretical concepts," 
688: \emph{Info.\ Sci.}, vol.\ 3, pp.\ 243--266, July 1971.
689: 
690: \bibitem{P05}
691: R. Padovani, ``Ten years of progress in CDMA," Viterbi Conference, Univ.\
692: So.\ Calif., Los Angeles, Mar.\ 2005.
693: 
694: \bibitem{R89}
695: L. R. Rabiner,
696: ``A tutorial on hidden Markov models and selected applications in speech
recognition," \emph{Proc.\ IEEE}, vol.\ 77, pp.\ 257--286, Feb.\ 1989.
698: 
699: \bibitem{T81}
700: R. M. Tanner, ``A recursive approach to low complexity codes,"
701: \emph{IEEE Trans.\ Inform.\ Theory}, vol.\ IT--27, pp.\ 533--547,
702: Sept.\ 1981.
703: 
704: \bibitem{U71}
705: G. Ungerboeck, ``Nonlinear equalization of binary signals in Gaussian
706: noise,"
707:   \emph{IEEE Trans.\ Commun.\ Tech.}, vol.\  COM--19, pp.\ 1128--1137,
708: Dec.\ 1971.
709: 
710: \bibitem{U82}
711: G. Ungerboeck, ``Channel coding with multilevel/phase signals,"
712:   \emph{IEEE Trans.\ Inform.\ Theory}, vol.\ IT--28, pp.\ 55--67, Jan.\
713: 1982.
714: 
715: %\bibitem{V86}
716: %S. Verd\'{u}, ``Minimum probability of error for asynchronous Gaussian
717: %multiple-access channels,"
718: %  \emph{IEEE Trans.\ Inform.\ Theory}, vol.\ IT--32,  pp.\ 85--96, Jan.\
719: %1986.
720: 
721: \bibitem{V67a}  
722: A. J. Viterbi,  ``Error bounds for convolutional codes and an
723: asymptotically optimum decoding algorithm,"  \emph{IEEE
724: Trans.\ Inform.\ Theory}, vol.\  IT--13, pp.\ 260--269, April 1967.
725: 
726: %\bibitem{V67b}  
727: %A. J. Viterbi,  ``Orthogonal tree codes for communication in the
728: %presence of additive white Gaussian noise," 
729: %\emph{IEEE Trans.\ Commun.\ Tech.}, vol.\  COM--15, pp.\ 238--242,
730: %1967.
731: 
732: \bibitem{V70}  
733: A. J. Viterbi,  ``Review of \emph{Statistical Theory of Signal
734: Detection} (2nd ed.), by Carl W. Helstrom," 
735: \emph{IEEE Trans.\ Inform.\ Theory}, vol.\ IT--16, p.\ 653, Sept.\
736: 1970.
737: 
738: \bibitem{V71}  
739: A. J. Viterbi,  ``Convolutional codes and their performance in
740: communication systems,"  
741: \emph{IEEE Trans.\ Commun.\ Tech.}, vol.\  COM--19, pp.\ 751--772,
742: Oct.\ 1971.
743: 
744: \bibitem{V90}  
745: A. J. Viterbi,  ``From proof to product," 1990 IEEE Communication Theory 
746: Workshop, Ojai, CA, April 1990.
747: 
748: %\bibitem{VO79}
749: %A. J. Viterbi and J. K. Omura,
750: %\emph{Principles of Digital Communication and Coding}.
751: %New York:  McGraw-Hill, 1979.
752: 
753: \bibitem{W96}
754: N. Wiberg, ``Codes and decoding on general graphs,"
755: Ph.D.\ dissertation, Link\"{o}ping U., Link\"{o}ping, Sweden, 1996.
756: 
757: \bibitem{WLK95}
758: N. Wiberg, H.-A.\ Loeliger and R. K\"{o}tter, ``Codes and iterative
759: decoding on general graphs," \emph{Eur.\ Trans.\ Telecomm.}, vol.\ 6, pp.\
760: 513--525, Sept./Oct.\ 1995.
761: 
762: \bibitem{WR61}
763: J. M. Wozencraft and B. Reiffen, \emph{Sequential Decoding}.
764:   Cambridge, MA:  MIT Press, 1961.
765: 
766: \bibitem{Y64}
767: H. Yudkin, ``Channel state testing in information decoding,"  Sc.D.\
768: dissertation, Dept.\ Elec.\ Engg., MIT, Cambridge, MA, 1964.
769: 
770: \end{thebibliography}
771: }
772: \end{document}
773: 
774: 
775: