1: \begin{thebibliography}{10}
2:
3: \bibitem{abde:71}
4: {\sc N.~N. Abdelmalek}, {\em Round off error analysis for {G}ram--{S}chmidt
5: method and solution of linear least squares problems}, BIT, 11 (1971),
6: pp.~345--368.
7:
8: \bibitem{boboulin2008issues}
9: {\sc M.~Baboulin, J.~J. Dongarra, and S.~Tomov}, {\em Some issues in dense
10: linear algebra for multicore and special purpose architectures}, Tech. Rep.
11: UT-CS-08-615, University of Tennessee, May 2008.
12: \newblock LAWN \#200.
13:
14: \bibitem{irbleigs}
15: {\sc J.~Baglama, D.~Calvetti, and L.~Reichel}, {\em Algorithm 827: irbleigs: A
16: {MATLAB} program for computing a few eigenpairs of a large sparse {H}ermitian
17: matrix}, ACM Trans. Math. Softw., 29 (2003), pp.~337--348.
18:
19: \bibitem{templatesEigenBai}
20: {\sc Z.~Bai and D.~Day}, {\em Block {A}rnoldi method}, in Templates for the
21: Solution of Algebraic Eigenvalue Problems: A Practical Guide, Z.~Bai, J.~W.
22: Demmel, J.~J. Dongarra, A.~Ruhe, and H.~{van der Vorst}, eds., Society for
23: Industrial and Applied Mathematics, Philadelphia, PA, USA, 2000,
24: pp.~196--204.
25:
26: \bibitem{TRILINOSwebpage}
27: {\sc C.~G. Baker, U.~L. Hetmaniuk, R.~B. Lehoucq, and H.~K. Thornquist}, {\em
28: Anasazi webpage}.
29: \newblock \url{http://trilinos.sandia.gov/packages/anasazi/}.
30:
31: \bibitem{bjor:67}
32: {\sc {\AA}.~Bj{\"o}rck}, {\em Solving linear least squares problems by
33: {Gram--Schmidt} orthogonalization}, BIT, 7 (1967), pp.~1--21.
34:
35: \bibitem{scalapackusersguide}
36: {\sc L.~S. Blackford, J.~Choi, A.~Cleary, E.~D'Azevedo, J.~W. Demmel,
37: I.~Dhillon, J.~J. Dongarra, S.~Hammarling, G.~Henry, A.~Petitet, K.~Stanley,
38: D.~Walker, and R.~C. Whaley}, {\em {ScaLAPACK} Users' Guide}, SIAM,
39: Philadelphia, PA, USA, May 1997.
40:
41: \bibitem{buttari2007class}
42: {\sc A.~Buttari, J.~Langou, J.~Kurzak, and J.~J. Dongarra}, {\em A class of
43: parallel tiled linear algebra algorithms for multicore architectures}, Tech.
44: Rep. UT-CS-07-600, University of Tennessee, Sept. 2007.
45: \newblock LAWN \#191.
46:
47: \bibitem{buttari2007parallel}
48: \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em Parallel tiled {QR}
49: factorization for multicore architectures}, Tech. Rep. UT-CS-07-598,
50: University of Tennessee, July 2007.
51: \newblock LAWN \#190.
52:
53: \bibitem{lawn80}
54: {\sc J.~Choi, J.~J. Dongarra, S.~Ostrouchov, A.~P. Petitet, D.~W. Walker, and
55: R.~C. Whaley}, {\em The design and implementation of the {ScaLAPACK} {LU},
56: {QR}, and {Cholesky} factorization routines}, Tech. Rep. UT-CS-94-246, Oak
57: Ridge National Laboratory, Sept. 1994.
58: \newblock LAWN \#80.
59:
60: \bibitem{coppersmith1982asymptotic}
61: {\sc D.~Coppersmith and S.~Winograd}, {\em On the asymptotic complexity of
62: matrix multiplication}, SIAM Journal on Computing, 11 (1982), pp.~472--492.
63:
64: \bibitem{cosnard86}
65: {\sc M.~Cosnard, J.-M. Muller, and Y.~Robert}, {\em Parallel {QR}
66: {D}ecomposition of a {R}ectangular {M}atrix}, Numer. Math., 48 (1986),
67: pp.~239--249.
68:
69: \bibitem{cosnard83:_qr}
70: {\sc M.~Cosnard and Y.~Robert}, {\em Complexit\'e de la factorisation {QR} en
71: parall\`ele}, C.R. Acad. Sci., 297 (1983), pp.~549--552.
72:
73: \bibitem{csanky1976fast}
74: {\sc L.~Csanky}, {\em Fast parallel matrix inversion algorithms}, SIAM J.
75: Comput., 5 (1976), pp.~618--623.
76:
77: \bibitem{cunha2002new}
78: {\sc R.~D.~D. Cunha, D.~Becker, and J.~C. Patterson}, {\em New parallel
79: (rank-revealing) {QR} factorization algorithms}, in Euro-Par 2002. Parallel
80: Processing: Eighth International Euro-Par Conference, Paderborn, Germany,
81: August 27--30, 2002.
82:
83: \bibitem{dazevedo1997design}
84: {\sc E.~F. D'Azevedo and J.~J. Dongarra}, {\em The design and implementation of
85: the parallel out-of-core {ScaLAPACK} {LU}, {QR}, and {Cholesky} factorization
86: routines}, Tech. Rep. 118 CS-97-247, University of Tennessee, Knoxville, Jan.
87: 1997.
88:
89: \bibitem{demmel1992trading}
90: {\sc J.~W. Demmel}, {\em Trading off parallelism and numerical stability},
91: Tech. Rep. UT-CS-92-179, University of Tennessee, June 1992.
92: \newblock LAWN \#53.
93:
94: \bibitem{FastLinearAlgebraIsStable}
95: {\sc J.~W. Demmel, I.~Dumitriu, and O.~Holtz}, {\em Fast linear algebra is
96: stable}, Numerische Mathematik, 108 (2007), pp.~59--91.
97:
98: \bibitem{demmel2008comm}
99: {\sc J.~W. Demmel and M.~Hoemmen}, {\em Communication-avoiding {Krylov}
100: subspace methods}, Tech. Rep., University of California Berkeley, Department
101: of Electrical Engineering and Computer Science, in preparation.
102:
103: \bibitem{dongarra1996key}
104: {\sc J.~J. Dongarra, S.~Hammarling, and D.~W. Walker}, {\em Key concepts for
105: parallel out-of-core {LU} factorization}, Scientific Programming, 5 (1996),
106: pp.~173--184.
107:
108: \bibitem{elmroth1998new}
109: {\sc E.~Elmroth and F.~Gustavson}, {\em New serial and parallel recursive {QR}
110: factorization algorithms for {SMP} systems}, in Applied Parallel Computing.
111: Large Scale Scientific and Industrial Problems, B.~K{\aa}gstr{\"o}m et~al., eds., vol.~1541
112: of Lecture Notes in Computer Science, Springer, 1998, pp.~120--128.
113:
114: \bibitem{elmroth2000applying}
115: \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em Applying recursion
116: to serial and parallel {QR} factorization leads to better performance}, IBM
117: Journal of Research and Development, 44 (2000), pp.~605--624.
118:
119: \bibitem{elmroth2004recursive}
120: {\sc E.~Elmroth, F.~Gustavson, I.~Jonsson, and B.~K{\aa}gstr{\"o}m}, {\em
121: Recursive blocked algorithms and hybrid data structures for dense matrix
122: library software}, SIAM Review, 46 (2004), pp.~3--45.
123:
124: \bibitem{Freund:1997:BQA}
125: {\sc R.~W. Freund and M.~Malhotra}, {\em A block {QMR} algorithm for
126: non-{Hermitian} linear systems with multiple right-hand sides}, Linear
127: Algebra and its Applications, 254 (1997), pp.~119--157.
128: \newblock Proceedings of the Fifth Conference of the International Linear
129: Algebra Society (Atlanta, {GA}, 1995).
130:
131: \bibitem{gilbert1992predicting}
132: {\sc J.~R. Gilbert and E.~G. Ng}, {\em Predicting structure in nonsymmetric
133: sparse matrix factorization}, Tech. Rep. ORNL/TM-12205, Oak Ridge National
134: Laboratory, 1992.
135:
136: \bibitem{govl:96}
137: {\sc G.~H. Golub and C.~F. Van~Loan}, {\em Matrix Computations}, The Johns
138: Hopkins University Press, Baltimore, MD, USA, third~ed., 1996.
139:
140: \bibitem{golub1988parallel}
141: {\sc G.~H. Golub, R.~J. Plemmons, and A.~Sameh}, {\em Parallel block schemes
142: for large-scale least-squares computations}, in High-Speed Computing:
143: Scientific Applications and Algorithm Design, R.~B. Wilhelmson, ed.,
144: University of Illinois Press, Urbana and Chicago, IL, USA, 1988,
145: pp.~171--179.
146:
147: \bibitem{graham2005getting}
148: {\sc S.~L. Graham, M.~Snir, and C.~A. Patterson}, eds., {\em Getting Up To
149: Speed: The Future Of Supercomputing}, National Academies Press, Washington,
150: D.C., USA, 2005.
151:
152: \bibitem{greenbaum1997numerical}
153: {\sc A.~Greenbaum, M.~Rozlo{\v{z}}n{\'i}k, and Z.~Strako{\v{s}}}, {\em
154: Numerical behavior of the modified {Gram--Schmidt} {GMRES} implementation},
155: BIT Numerical Mathematics, 37 (1997), pp.~706--719.
156:
157: \bibitem{grigori2008calu}
158: {\sc L.~Grigori, J.~W. Demmel, and H.~Xiang}, {\em Communication avoiding
159: {Gaussian} elimination}, Tech. Rep. inria-00277901, INRIA, 2008.
160: \newblock version 2.
161:
162: \bibitem{gropp1999using}
163: {\sc W.~Gropp, E.~Lusk, and A.~Skjellum}, {\em Using {MPI}: Portable Parallel
164: Programming with the Message-Passing Interface}, MIT Press, 1999.
165:
166: \bibitem{gunter2005parallel}
167: {\sc B.~C. Gunter and R.~A. van~de Geijn}, {\em Parallel out-of-core
168: computation and updating of the {QR} factorization}, ACM Transactions on
169: Mathematical Software, 31 (2005), pp.~60--78.
170:
171: \bibitem{lehoucqORTH}
172: {\sc U.~Hetmaniuk and R.~Lehoucq}, {\em Basis selection in {LOBPCG}}, Journal
173: of Computational Physics, 218 (2006), pp.~324--332.
174:
175: \bibitem{irony2004communication}
176: {\sc D.~Irony, S.~Toledo, and A.~Tiskin}, {\em Communication lower bounds for
177: distributed-memory matrix multiplication}, J. Parallel Distrib. Comput., 64
178: (2004), pp.~1017--1026.
179:
180: \bibitem{hong1981io}
181: {\sc J.-W. Hong and H.~T. Kung}, {\em {I/O} complexity: The {R}ed-{B}lue
182: {P}ebble {G}ame}, in STOC '81: Proceedings of the Thirteenth Annual {ACM}
183: Symposium on Theory of Computing, New York, NY, USA, 1981, ACM, pp.~326--333.
184:
185: \bibitem{kiel:74}
186: {\sc A.~Kie{\l}basi{\' n}ski}, {\em Analiza numeryczna algorytmu
187: ortogonalizacji {G}rama--{S}chmidta}, Seria III: Matematyka Stosowana II,
188: (1974), pp.~15--35.
189:
190: \bibitem{BLOPEXwebpage}
191: {\sc A.~V. Knyazev}, {\em {BLOPEX} webpage}.
192: \newblock \url{http://www-math.cudenver.edu/~aknyazev/software/BLOPEX/}.
193:
194: \bibitem{andrewORTH}
195: {\sc A.~V. Knyazev, M.~Argentati, I.~Lashuk, and E.~E. Ovtchinnikov}, {\em
196: Block locally optimal preconditioned eigenvalue xolvers ({BLOPEX}) in {HYPRE}
197: and {PETS}c}, Tech. Rep. UCDHSC-CCM-251P, University of Colorado at Denver and Health Sciences Center,
198: 2007.
199:
200: \bibitem{kurzak2008qr}
201: {\sc J.~Kurzak and J.~J. Dongarra}, {\em {QR} factorization for the {CELL}
202: processor}, Tech. Rep. UT-CS-08-616, University of Tennessee, May 2008.
203: \newblock LAWN \#201.
204:
205: \bibitem{templatesEigenLehoucq}
206: {\sc R.~Lehoucq and K.~Maschhoff}, {\em Block {A}rnoldi method}, in Templates
207: for the Solution of Algebraic Eigenvalue Problems: A Practical Guide, Z.~Bai,
208: J.~W. Demmel, J.~J. Dongarra, A.~Ruhe, and H.~{van der Vorst}, eds., Society
209: for Industrial and Applied Mathematics, Philadelphia, PA, USA, 2000,
210: pp.~185--187.
211:
212: \bibitem{leoncini1999parallel}
213: {\sc M.~Leoncini, G.~Manzini, and L.~Margara}, {\em Parallel complexity of
214: numerically accurate linear system solvers}, SIAM J. Comput., 28 (1999),
215: pp.~2030--2058.
216:
217: \bibitem{loomis1949inequality}
218: {\sc L.~H. Loomis and H.~Whitney}, {\em An inequality related to the
219: isoperimetric inequality}, Bull. Amer. Math. Soc., 55 (1949), pp.~961--962.
220:
221: \bibitem{samsung2008ssd}
222: {\sc L.~Lugmayr}, {\em Samsung {256GB} {SSD} is world's fastest}.
223: \newblock \url{http://www.i4u.com/article17560.html}, 25 May 2008.
224: \newblock Accessed 30 May 2008.
225:
226: \bibitem{BLZPACKwebpage}
227: {\sc O.~Marques}, {\em {BLZPACK} webpage}.
228: \newblock \url{http://crd.lbl.gov/~osni/}.
229:
230: \bibitem{modi84:_given}
231: {\sc J.~J. Modi and M.~R.~B. Clarke}, {\em An alternative {Givens} ordering},
232: Numer. Math., 43 (1984), pp.~83--90.
233:
234: \bibitem{nishtala2008performance}
235: {\sc R.~Nishtala, G.~Alm{\'{a}}si, and C.~Ca{\c{s}}caval}, {\em Performance
236: without pain = productivity: Data layout and collective communication in
237: {UPC}}, in Proceedings of the {ACM} {SIGPLAN} 2008 Symposium on Principles
238: and Practice of Parallel Programming, 2008.
239:
240: \bibitem{oleary:80}
241: {\sc D.~P. O'Leary}, {\em The block conjugate gradient algorithm and related
242: methods}, Linear Algebra and its Applications, 29 (1980), pp.~293--322.
243:
244: \bibitem{parlett1998symmetric}
245: {\sc B.~N. Parlett}, {\em The Symmetric Eigenvalue Problem}, SIAM,
246: Philadelphia, 1998.
247:
248: \bibitem{pothen1989distributed}
249: {\sc A.~Pothen and P.~Raghavan}, {\em Distributed orthogonal factorization:
250: {Givens} and {Householder} algorithms}, SIAM J. Sci. Stat. Comput., 10
251: (1989), pp.~1113--1134.
252:
253: \bibitem{quintana-orti2008design}
254: {\sc G.~Quintana-Ort\'{i}, E.~S. Quintana-Ort\'{i}, E.~Chan, R.~A. van~de
255: Geijn, and F.~G. Van~Zee}, {\em Design of scalable dense linear algebra
256: libraries for multithreaded architectures: the {LU} factorization}, in
257: Proceedings of the Workshop on Multithreaded Architectures and Applications,
258: Miami, Florida, Apr. 2008.
259: \newblock {FLAME} Working Note \#26.
260:
261: \bibitem{quintana-orti2008scheduling}
262: {\sc G.~Quintana-Ort\'{i}, E.~S. Quintana-Ort\'{i}, E.~Chan, F.~G. Van~Zee, and
263: R.~A. van~de Geijn}, {\em Scheduling of {QR} factorization algorithms on
264: {SMP} and multi-core architectures}, in Proceedings of the 16th Euromicro
265: International Conference on Parallel, Distributed and Network-Based
266: Processing, Toulouse, France, Feb. 2008.
267: \newblock {FLAME} Working Note \#24.
268:
269: \bibitem{rabani2001outcore}
270: {\sc E.~Rabani and S.~Toledo}, {\em Out-of-core {SVD} and {QR} decompositions},
271: in Proceedings of the 10th SIAM Conference on Parallel Processing for
272: Scientific Computing, Norfolk, Virginia, SIAM, Mar. 2001.
273:
274: \bibitem{sagan1994space}
275: {\sc H.~Sagan}, {\em Space-Filling Curves}, Springer-Verlag, 1994.
276:
277: \bibitem{sameh78:_stabl_solver}
278: {\sc A.~H. Sameh and D.~J. Kuck}, {\em On {S}table {P}arallel {L}inear {S}ystem
279: {S}olvers}, Journal of the Association for Computing Machinery, 25 (1978),
280: pp.~81--91.
281:
282: \bibitem{schreiber1989storage}
283: {\sc R.~Schreiber and C.~F. Van~Loan}, {\em A storage efficient {$WY$}
284: representation for products of {Householder} transformations}, SIAM J. Sci.
285: Stat. Comput., 10 (1989), pp.~53--57.
286:
287: \bibitem{smbl:06}
288: {\sc A.~Smoktunowicz, J.~Barlow, and J.~Langou}, {\em A note on the error
289: analysis of {C}lassical {G}ram--{S}chmidt}, Numerische Mathematik, 105 (2006),
290: pp.~299--313.
291:
292: \bibitem{PRIMMEwebpage}
293: {\sc A.~Stathopoulos}, {\em {PRIMME} webpage}.
294: \newblock \url{http://www.cs.wm.edu/~andreas/software/}.
295:
296: \bibitem{stwu:02}
297: {\sc A.~Stathopoulos and K.~Wu}, {\em A block orthogonalization procedure with
298: constant synchronization requirements}, SIAM Journal on Scientific Computing,
299: 23 (2002), pp.~2165--2182.
300:
301: \bibitem{strassen1969gaussian}
302: {\sc V.~Strassen}, {\em {Gaussian} elimination is not optimal}, Numerische
303: Mathematik, 13 (1969), pp.~354--356.
304:
305: \bibitem{toledo1997locality}
306: {\sc S.~Toledo}, {\em Locality of reference in {LU} decomposition with partial
307: pivoting}, SIAM J. Matrix Anal. Appl., 18 (1997), pp.~1065--1081.
308:
309: \bibitem{toledo99survey}
310: \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em A survey of
311: out-of-core algorithms in numerical linear algebra}, in External Memory
312: Algorithms and Visualization, J.~Abello and J.~S. Vitter, eds., American
313: Mathematical Society Press, Providence, RI, 1999, pp.~161--180.
314:
315: \bibitem{vital:phdthesis:90}
316: {\sc B.~Vital}, {\em {\'Etude de quelques m\'ethodes de r\'esolution de
317: probl\`emes lin\'eaires de grande taille sur multiprocesseur}}, {P}h.{D}.
318: dissertation, {Universit\'e de Rennes I, Rennes}, Nov. 1990.
319:
320: \bibitem{walker1985implementation}
321: {\sc H.~F. Walker}, {\em Implementation of the {GMRES} and {Arnoldi} methods
322: using {Householder} transformations}, Tech. Rep. UCRL-93589, Lawrence
323: Livermore National Laboratory, Oct. 1985.
324:
325: \bibitem{TRLANwebpage}
326: {\sc K.~Wu and H.~D. Simon}, {\em {TRLAN} webpage}.
327: \newblock \url{http://crd.lbl.gov/~kewu/ps/trlan_.html}.
328:
329: \end{thebibliography}
330: