0806.2159/qr.bbl
1: \begin{thebibliography}{10}
2: 
3: \bibitem{abde:71}
4: {\sc N.~N. Abdelmalek}, {\em Round off error analysis for {G}ram--{S}chmidt
5:   method and solution of linear least squares problems}, BIT, 11 (1971),
6:   pp.~345--368.
7: 
8: \bibitem{boboulin2008issues}
9: {\sc M.~Baboulin, J.~J. Dongarra, and S.~Tomov}, {\em Some issues in dense
10:   linear algebra for multicore and special purpose architectures}, Tech. Rep.
11:   UT-CS-08-615, University of Tennessee, May 2008.
12: \newblock LAWN \#200.
13: 
14: \bibitem{irbleigs}
15: {\sc J.~Baglama, D.~Calvetti, and L.~Reichel}, {\em Algorithm 827: irbleigs: A
16:   {MATLAB} program for computing a few eigenpairs of a large sparse {H}ermitian
17:   matrix}, ACM Trans. Math. Softw., 29 (2003), pp.~337--348.
18: 
19: \bibitem{templatesEigenBai}
20: {\sc Z.~Bai and D.~Day}, {\em Block {A}rnoldi method}, in Templates for the
21:   Solution of Algebraic Eigenvalue Problems: A Practical Guide, Z.~Bai, J.~W.
22:   Demmel, J.~J. Dongarra, A.~Ruhe, and H.~{van der Vorst}, eds., Society for
23:   Industrial and Applied Mathematics, Philadelphia, PA, USA, 2000,
24:   pp.~196--204.
25: 
26: \bibitem{TRILINOSwebpage}
27: {\sc C.~G. Baker, U.~L. Hetmaniuk, R.~B. Lehoucq, and H.~K. Thornquist}, {\em
28:   Anasazi webpage}.
29: \newblock \url{http://trilinos.sandia.gov/packages/anasazi/}.
30: 
31: \bibitem{bjor:67}
32: {\sc {\AA}.~Bj{\"o}rck}, {\em Solving linear least squares problems by
33:   {Gram--Schmidt} orthogonalization}, BIT, 7 (1967), pp.~1--21.
34: 
35: \bibitem{scalapackusersguide}
36: {\sc L.~S. Blackford, J.~Choi, A.~Cleary, E.~D'Azevedo, J.~W. Demmel,
37:   I.~Dhillon, J.~J. Dongarra, S.~Hammarling, G.~Henry, A.~Petitet, K.~Stanley,
38:   D.~Walker, and R.~C. Whaley}, {\em {ScaLAPACK} Users' Guide}, SIAM,
39:   Philadelphia, PA, USA, May 1997.
40: 
41: \bibitem{buttari2007class}
42: {\sc A.~Buttari, J.~Langou, J.~Kurzak, and J.~J. Dongarra}, {\em A class of
43:   parallel tiled linear algebra algorithms for multicore architectures}, Tech.
44:   Rep. UT-CS-07-600, University of Tennessee, Sept. 2007.
45: \newblock LAWN \#191.
46: 
47: \bibitem{buttari2007parallel}
48: \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em Parallel tiled {QR}
49:   factorization for multicore architectures}, Tech. Rep. UT-CS-07-598,
50:   University of Tennessee, July 2007.
51: \newblock LAWN \#190.
52: 
53: \bibitem{lawn80}
54: {\sc J.~Choi, J.~J. Dongarra, S.~Ostrouchov, A.~P. Petitet, D.~W. Walker, and
55:   R.~C. Whaley}, {\em The design and implementation of the {ScaLAPACK} {LU},
56:   {QR}, and {Cholesky} factorization routines}, Tech. Rep. UT-CS-94-246, Oak
57:   Ridge National Laboratory, Sept. 1994.
58: \newblock LAWN \#80.
59: 
60: \bibitem{coppersmith1982asymptotic}
61: {\sc D.~Coppersmith and S.~Winograd}, {\em On the asymptotic complexity of
62:   matrix multiplication}, SIAM Journal on Computing, 11 (1982), pp.~472--492.
63: 
64: \bibitem{cosnard86}
65: {\sc M.~Cosnard, J.-M. Muller, and Y.~Robert}, {\em Parallel {QR}
66:   {D}ecomposition of a {R}ectangular {M}atrix}, Numer. Math., 48 (1986),
67:   pp.~239--249.
68: 
69: \bibitem{cosnard83:_qr}
70: {\sc M.~Cosnard and Y.~Robert}, {\em Complexit{\'e} de la factorisation {QR} en
71:   parall{\`e}le}, C.R. Acad. Sci., 297 (1983), pp.~549--552.
72: 
73: \bibitem{csanky1976fast}
74: {\sc L.~Csanky}, {\em Fast parallel matrix inversion algorithms}, SIAM J.
75:   Comput., 5 (1976), pp.~618--623.
76: 
77: \bibitem{cunha2002new}
78: {\sc R.~D. {da Cunha}, D.~Becker, and J.~C. Patterson}, {\em New parallel
79:   (rank-revealing) {QR} factorization algorithms}, in Euro-Par 2002. Parallel
80:   Processing: Eighth International Euro-Par Conference, Paderborn, Germany,
81:   August 27--30, 2002, 2002.
82: 
83: \bibitem{dazevedo1997design}
84: {\sc E.~F. D'Azevedo and J.~J. Dongarra}, {\em The design and implementation of
85:   the parallel out-of-core {ScaLAPACK} {LU}, {QR}, and {Cholesky} factorization
86:   routines}, Tech. Rep. 118 CS-97-247, University of Tennessee, Knoxville, Jan.
87:   1997.
88: 
89: \bibitem{demmel1992trading}
90: {\sc J.~W. Demmel}, {\em Trading off parallelism and numerical stability},
91:   Tech. Rep. UT-CS-92-179, University of Tennessee, June 1992.
92: \newblock LAWN \#53.
93: 
94: \bibitem{FastLinearAlgebraIsStable}
95: {\sc J.~W. Demmel, I.~Dumitriu, and O.~Holtz}, {\em Fast linear algebra is
96:   stable}, Numerische Mathematik, 108 (2007), pp.~59--91.
97: 
98: \bibitem{demmel2008comm}
99: {\sc J.~W. Demmel and M.~Hoemmen}, {\em Communication-avoiding {Krylov}
100:   subspace methods}, tech. rep., University of California Berkeley, Department
101:   of Electrical Engineering and Computer Science, in preparation.
102: 
103: \bibitem{dongarra1996key}
104: {\sc J.~J. Dongarra, S.~Hammarling, and D.~W. Walker}, {\em Key concepts for
105:   parallel out-of-core {LU} factorization}, Scientific Programming, 5 (1996),
106:   pp.~173--184.
107: 
108: \bibitem{elmroth1998new}
109: {\sc E.~Elmroth and F.~Gustavson}, {\em New serial and parallel recursive {QR}
110:   factorization algorithms for {SMP} systems}, in Applied Parallel Computing.
111:   Large Scale Scientific and Industrial Problems, B.~K{\aa}gstr{\"o}m et~al., eds., vol.~1541
112:   of Lecture Notes in Computer Science, Springer, 1998, pp.~120--128.
113: 
114: \bibitem{elmroth2000applying}
115: \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em Applying recursion
116:   to serial and parallel {QR} factorization leads to better performance}, IBM
117:   Journal of Research and Development, 44 (2000), pp.~605--624.
118: 
119: \bibitem{elmroth2004recursive}
120: {\sc E.~Elmroth, F.~Gustavson, I.~Jonsson, and B.~K{\aa}gstr{\"o}m}, {\em
121:   Recursive blocked algorithms and hybrid data structures for dense matrix
122:   library software}, SIAM Review, 46 (2004), pp.~3--45.
123: 
124: \bibitem{Freund:1997:BQA}
125: {\sc R.~W. Freund and M.~Malhotra}, {\em A block {QMR} algorithm for
126:   non-{Hermitian} linear systems with multiple right-hand sides}, Linear
127:   Algebra and its Applications, 254 (1997), pp.~119--157.
128: \newblock Proceedings of the Fifth Conference of the International Linear
129:   Algebra Society (Atlanta, {GA}, 1995).
130: 
131: \bibitem{gilbert1992predicting}
132: {\sc J.~R. Gilbert and E.~G. Ng}, {\em Predicting structure in nonsymmetric
133:   sparse matrix factorization}, Tech. Rep. ORNL/TM-12205, Oak Ridge National
134:   Laboratory, 1992.
135: 
136: \bibitem{govl:96}
137: {\sc G.~H. Golub and C.~F. {Van Loan}}, {\em Matrix Computations}, The Johns
138:   Hopkins University Press, Baltimore, MD, USA, third~ed., 1996.
139: 
140: \bibitem{golub1988parallel}
141: {\sc G.~H. Golub, R.~J. Plemmons, and A.~Sameh}, {\em Parallel block schemes
142:   for large-scale least-squares computations}, in High-Speed Computing:
143:   Scientific Applications and Algorithm Design, R.~B. Wilhelmson, ed.,
144:   University of Illinois Press, Urbana and Chicago, IL, USA, 1988,
145:   pp.~171--179.
146: 
147: \bibitem{graham2005getting}
148: {\sc S.~L. Graham, M.~Snir, and C.~A. Patterson}, eds., {\em Getting Up To
149:   Speed: The Future Of Supercomputing}, National Academies Press, Washington,
150:   D.C., USA, 2005.
151: 
152: \bibitem{greenbaum1997numerical}
153: {\sc A.~Greenbaum, M.~Rozlo{\v{z}}n{\'i}k, and Z.~Strako{\v{s}}}, {\em
154:   Numerical behavior of the modified {Gram--Schmidt} {GMRES} implementation},
155:   BIT Numerical Mathematics, 37 (1997), pp.~706--719.
156: 
157: \bibitem{grigori2008calu}
158: {\sc L.~Grigori, J.~W. Demmel, and H.~Xiang}, {\em Communication avoiding
159:   {Gaussian} elimination}, Tech. Rep. inria-00277901, INRIA, 2008.
160: \newblock version 2.
161: 
162: \bibitem{gropp1999using}
163: {\sc W.~Gropp, E.~Lusk, and A.~Skjellum}, {\em Using {MPI}: Portable Parallel
164:   Programming with the Message-Passing Interface}, MIT Press, 1999.
165: 
166: \bibitem{gunter2005parallel}
167: {\sc B.~C. Gunter and R.~A. van~de Geijn}, {\em Parallel out-of-core
168:   computation and updating of the {QR} factorization}, ACM Transactions on
169:   Mathematical Software, 31 (2005), pp.~60--78.
170: 
171: \bibitem{lehoucqORTH}
172: {\sc U.~Hetmaniuk and R.~Lehoucq}, {\em Basis selection in {LOBPCG}}, Journal
173:   of Computational Physics, 218 (2006), pp.~324--332.
174: 
175: \bibitem{irony2004communication}
176: {\sc D.~Irony, S.~Toledo, and A.~Tiskin}, {\em Communication lower bounds for
177:   distributed-memory matrix multiplication}, J. Parallel Distrib. Comput., 64
178:   (2004), pp.~1017--1026.
179: 
180: \bibitem{hong1981io}
181: {\sc J.-W. Hong and H.~T. Kung}, {\em {I/O} complexity: The {R}ed-{B}lue
182:   {P}ebble {G}ame}, in STOC '81: Proceedings of the Thirteenth Annual {ACM}
183:   Symposium on Theory of Computing, New York, NY, USA, 1981, ACM, pp.~326--333.
184: 
185: \bibitem{kiel:74}
186: {\sc A.~Kie{\l}basi{\' n}ski}, {\em Analiza numeryczna algorytmu
187:   ortogonalizacji {G}rama--{S}chmidta}, Seria III: Matematyka Stosowana II,
188:   (1974), pp.~15--35.
189: 
190: \bibitem{BLOPEXwebpage}
191: {\sc A.~V. Knyazev}, {\em {BLOPEX} webpage}.
192: \newblock \url{http://www-math.cudenver.edu/~aknyazev/software/BLOPEX/}.
193: 
194: \bibitem{andrewORTH}
195: {\sc A.~V. Knyazev, M.~Argentati, I.~Lashuk, and E.~E. Ovtchinnikov}, {\em
196:   Block locally optimal preconditioned eigenvalue xolvers ({BLOPEX}) in {HYPRE}
197:   and {PETS}c}, Tech. Rep. UCDHSC-CCM-251P, University of Colorado Denver,
198:   2007.
199: 
200: \bibitem{kurzak2008qr}
201: {\sc J.~Kurzak and J.~J. Dongarra}, {\em {QR} factorization for the {CELL}
202:   processor}, Tech. Rep. UT-CS-08-616, University of Tennessee, May 2008.
203: \newblock LAWN \#201.
204: 
205: \bibitem{templatesEigenLehoucq}
206: {\sc R.~Lehoucq and K.~Maschhoff}, {\em Block {A}rnoldi method}, in Templates
207:   for the Solution of Algebraic Eigenvalue Problems: A Practical Guide, Z.~Bai,
208:   J.~W. Demmel, J.~J. Dongarra, A.~Ruhe, and H.~{van der Vorst}, eds., Society
209:   for Industrial and Applied Mathematics, Philadelphia, PA, USA, 2000,
210:   pp.~185--187.
211: 
212: \bibitem{leoncini1999parallel}
213: {\sc M.~Leoncini, G.~Manzini, and L.~Margara}, {\em Parallel complexity of
214:   numerically accurate linear system solvers}, SIAM J. Comput., 28 (1999),
215:   pp.~2030--2058.
216: 
217: \bibitem{loomis1949inequality}
218: {\sc L.~H. Loomis and H.~Whitney}, {\em An inequality related to the
219:   isoperimetric inequality}, Bull. Amer. Math. Soc., 55 (1949), pp.~961--962.
220: 
221: \bibitem{samsung2008ssd}
222: {\sc L.~Lugmayr}, {\em Samsung {256GB} {SSD} is world's fastest}.
223: \newblock \url{http://www.i4u.com/article17560.html}, 25 May 2008.
224: \newblock Accessed 30 May 2008.
225: 
226: \bibitem{BLZPACKwebpage}
227: {\sc O.~Marques}, {\em {BLZPACK} webpage}.
228: \newblock \url{http://crd.lbl.gov/~osni/}.
229: 
230: \bibitem{modi84:_given}
231: {\sc J.~J. Modi and M.~R.~B. Clarke}, {\em An alternative {Givens} ordering},
232:   Numer. Math., 43 (1984), pp.~83--90.
233: 
234: \bibitem{nishtala2008performance}
235: {\sc R.~Nishtala, G.~Alm{\'{a}}si, and C.~Ca{\c{s}}caval}, {\em Performance
236:   without pain = productivity: Data layout and collective communication in
237:   {UPC}}, in Proceedings of the {ACM} {SIGPLAN} 2008 Symposium on Principles
238:   and Practice of Parallel Programming, 2008.
239: 
240: \bibitem{oleary:80}
241: {\sc D.~P. O'Leary}, {\em The block conjugate gradient algorithm and related
242:   methods}, Linear Algebra and its Applications, 29 (1980), pp.~293--322.
243: 
244: \bibitem{parlett1998symmetric}
245: {\sc B.~N. Parlett}, {\em The Symmetric Eigenvalue Problem}, SIAM,
246:   Philadelphia, 1998.
247: 
248: \bibitem{pothen1989distributed}
249: {\sc A.~Pothen and P.~Raghavan}, {\em Distributed orthogonal factorization:
250:   {Givens} and {Householder} algorithms}, SIAM J. Sci. Stat. Comput., 10
251:   (1989), pp.~1113--1134.
252: 
253: \bibitem{quintana-orti2008design}
254: {\sc G.~Quintana-Ort\'{i}, E.~S. Quintana-Ort\'{i}, E.~Chan, R.~A. van~de
255:   Geijn, and F.~G. {Van Zee}}, {\em Design of scalable dense linear algebra
256:   libraries for multithreaded architectures: the {LU} factorization}, in
257:   Proceedings of the Workshop on Multithreaded Architectures and Applications,
258:   Miami, Florida, Apr. 2008.
259: \newblock {FLAME} Working Note \#26.
260: 
261: \bibitem{quintana-orti2008scheduling}
262: {\sc G.~Quintana-Ort\'{i}, E.~S. Quintana-Ort\'{i}, E.~Chan, F.~G. {Van Zee}, and
263:   R.~A. van~de Geijn}, {\em Scheduling of {QR} factorization algorithms on
264:   {SMP} and multi-core architectures}, in Proceedings of the 16th Euromicro
265:   International Conference on Parallel, Distributed and Network-Based
266:   Processing, Toulouse, France, Feb. 2008.
267: \newblock {FLAME} Working Note \#24.
268: 
269: \bibitem{rabani2001outcore}
270: {\sc E.~Rabani and S.~Toledo}, {\em Out-of-core {SVD} and {QR} decompositions},
271:   in Proceedings of the 10th SIAM Conference on Parallel Processing for
272:   Scientific Computing, Norfolk, Virginia, SIAM, Mar. 2001.
273: 
274: \bibitem{sagan1994space}
275: {\sc H.~Sagan}, {\em Space-Filling Curves}, Springer-Verlag, 1994.
276: 
277: \bibitem{sameh78:_stabl_solver}
278: {\sc A.~H. Sameh and D.~J. Kuck}, {\em On {S}table {P}arallel {L}inear {S}ystem
279:   {S}olvers}, Journal of the Association for Computing Machinery, 25 (1978),
280:   pp.~81--91.
281: 
282: \bibitem{schreiber1989storage}
283: {\sc R.~Schreiber and C.~{Van Loan}}, {\em A storage efficient {$WY$}
284:   representation for products of {Householder} transformations}, SIAM J. Sci.
285:   Stat. Comput., 10 (1989), pp.~53--57.
286: 
287: \bibitem{smbl:06}
288: {\sc A.~Smoktunowicz, J.~Barlow, and J.~Langou}, {\em A note on the error
289:   analysis of {C}lassical {G}ram--{S}chmidt}, Numerische Mathematik, 105 (2006),
290:   pp.~299--313.
291: 
292: \bibitem{PRIMMEwebpage}
293: {\sc A.~Stathopoulos}, {\em {PRIMME} webpage}.
294: \newblock \url{http://www.cs.wm.edu/~andreas/software/}.
295: 
296: \bibitem{stwu:02}
297: {\sc A.~Stathopoulos and K.~Wu}, {\em A block orthogonalization procedure with
298:   constant synchronization requirements}, SIAM Journal on Scientific Computing,
299:   23 (2002), pp.~2165--2182.
300: 
301: \bibitem{strassen1969gaussian}
302: {\sc V.~Strassen}, {\em {Gaussian} elimination is not optimal}, Numerische
303:   Mathematik, 13 (1969), pp.~354--356.
304: 
305: \bibitem{toledo1997locality}
306: {\sc S.~Toledo}, {\em Locality of reference in {LU} decomposition with partial
307:   pivoting}, SIAM J. Matrix Anal. Appl., 18 (1997), pp.~1065--1081.
308: 
309: \bibitem{toledo99survey}
310: \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em A survey of
311:   out-of-core algorithms in numerical linear algebra}, in External Memory
312:   Algorithms and Visualization, J.~Abello and J.~S. Vitter, eds., American
313:   Mathematical Society Press, Providence, RI, 1999, pp.~161--180.
314: 
315: \bibitem{vital:phdthesis:90}
316: {\sc B.~Vital}, {\em {\'Etude de quelques m\'ethodes de r\'esolution de
317:   probl\`emes lin\'eaires de grande taille sur multiprocesseur}}, {P}h.{D}.
318:   dissertation, {Universit\'e de Rennes I, Rennes}, Nov. 1990.
319: 
320: \bibitem{walker1985implementation}
321: {\sc H.~F. Walker}, {\em Implementation of the {GMRES} and {Arnoldi} methods
322:   using {Householder} transformations}, Tech. Rep. UCRL-93589, Lawrence
323:   Livermore National Laboratory, Oct. 1985.
324: 
325: \bibitem{TRLANwebpage}
326: {\sc K.~Wu and H.~D. Simon}, {\em {TRLAN} webpage}.
327: \newblock \url{http://crd.lbl.gov/~kewu/ps/trlan_.html}.
328: 
329: \end{thebibliography}
330: