0806:0806.2159/qr.bbl

1: \begin{thebibliography}{10}

2:

3: \bibitem{abde:71}

4: {\sc N.~N. Abdelmalek}, {\em Round off error analysis for {G}ram--{S}chmidt

5:   method and solution of linear least squares problems}, BIT, 11 (1971),

6:   pp.~345--368.

7:

8: \bibitem{boboulin2008issues}

9: {\sc M.~Baboulin, J.~J. Dongarra, and S.~Tomov}, {\em Some issues in dense

10:   linear algebra for multicore and special purpose architectures}, Tech. Rep.

11:   UT-CS-08-615, University of Tennessee, May 2008.

12: \newblock LAWN \#200.

13:

14: \bibitem{irbleigs}

15: {\sc J.~Baglama, D.~Calvetti, and L.~Reichel}, {\em Algorithm 827: irbleigs: A

16:   {MATLAB} program for computing a few eigenpairs of a large sparse {H}ermitian

17:   matrix}, ACM Trans. Math. Softw., 29 (2003), pp.~337--348.

18:

19: \bibitem{templatesEigenBai}

20: {\sc Z.~Bai and D.~Day}, {\em Block {A}rnoldi method}, in Templates for the

21:   Solution of Algebraic Eigenvalue Problems: A Practical Guide, Z.~Bai, J.~W.

22:   Demmel, J.~J. Dongarra, A.~Ruhe, and H.~{van der Vorst}, eds., Society for

23:   Industrial and Applied Mathematics, Philadelphia, PA, USA, 2000,

24:   pp.~196--204.

25:

26: \bibitem{TRILINOSwebpage}

27: {\sc C.~G. Baker, U.~L. Hetmaniuk, R.~B. Lehoucq, and H.~K. Thornquist}, {\em

28:   Anasazi webpage}.

29: \newblock \url{http://trilinos.sandia.gov/packages/anasazi/}.

30:

31: \bibitem{bjor:67}

32: {\sc {\AA}.~Bj{\"o}rck}, {\em Solving linear least squares problems by

33:   {Gram-Schmidt} orthogonalization}, BIT, 7 (1967), pp.~1--21.

34:

35: \bibitem{scalapackusersguide}

36: {\sc L.~S. Blackford, J.~Choi, A.~Cleary, E.~D'Azevedo, J.~W. Demmel,

37:   I.~Dhillon, J.~J. Dongarra, S.~Hammarling, G.~Henry, A.~Petitet, K.~Stanley,

38:   D.~Walker, and R.~C. Whaley}, {\em {ScaLAPACK} Users' Guide}, SIAM,

39:   Philadelphia, PA, USA, May 1997.

40:

41: \bibitem{buttari2007class}

42: {\sc A.~Buttari, J.~Langou, J.~Kurzak, and J.~J. Dongarra}, {\em A class of

43:   parallel tiled linear algebra algorithms for multicore architectures}, Tech.

44:   Rep. UT-CS-07-600, University of Tennessee, Sept. 2007.

45: \newblock LAWN \#191.

46:

47: \bibitem{buttari2007parallel}

48: \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em Parallel tiled {QR}

49:   factorization for multicore architectures}, Tech. Rep. UT-CS-07-598,

50:   University of Tennessee, July 2007.

51: \newblock LAWN \#190.

52:

53: \bibitem{lawn80}

54: {\sc J.~Choi, J.~J. Dongarra, S.~Ostrouchov, A.~P. Petitet, D.~W. Walker, and

55:   R.~C. Whaley}, {\em The design and implementation of the {ScaLAPACK} {LU},

56:   {QR}, and {Cholesky} factorization routines}, Tech. Rep. UT-CS-94-246, Oak

57:   Ridge National Laboratory, Sept. 1994.

58: \newblock LAWN \#80.

59:

60: \bibitem{coppersmith1982asymptotic}

61: {\sc D.~Coppersmith and S.~Winograd}, {\em On the asymptotic complexity of

62:   matrix multiplication}, SIAM Journal on Computing, 11 (1982), pp.~472--492.

63:

64: \bibitem{cosnard86}

65: {\sc M.~Cosnard, J.-M. Muller, and Y.~Robert}, {\em Parallel {QR}

66:   {D}ecomposition of a {R}ectangular {M}atrix}, Numer. Math., 48 (1986),

67:   pp.~239--249.

68:

69: \bibitem{cosnard83:_qr}

70: {\sc M.~Cosnard and Y.~Robert}, {\em Complexit\'e de la factorisation {QR} en

71:   parall\`ele}, C.R. Acad. Sci., 297 (1983), pp.~549--552.

72:

73: \bibitem{csanky1976fast}

74: {\sc L.~Csanky}, {\em Fast parallel matrix inversion algorithms}, SIAM J.

75:   Comput., 5 (1976), pp.~618--623.

76:

77: \bibitem{cunha2002new}

78: {\sc R.~D.~D. Cunha, D.~Becker, and J.~C. Patterson}, {\em New parallel

79:   (rank-revealing) {QR} factorization algorithms}, in Euro-Par 2002. Parallel

80:   Processing: Eighth International Euro-Par Conference, Paderborn, Germany,

81:   August 27--30, 2002.

82:

83: \bibitem{dazevedo1997design}

84: {\sc E.~F. D'Azevedo and J.~J. Dongarra}, {\em The design and implementation of

85:   the parallel out-of-core {ScaLAPACK} {LU}, {QR}, and {Cholesky} factorization

86:   routines}, Tech. Rep. 118 CS-97-247, University of Tennessee, Knoxville, Jan.

87:   1997.

88:

89: \bibitem{demmel1992trading}

90: {\sc J.~W. Demmel}, {\em Trading off parallelism and numerical stability},

91:   Tech. Rep. UT-CS-92-179, University of Tennessee, June 1992.

92: \newblock LAWN \#53.

93:

94: \bibitem{FastLinearAlgebraIsStable}

95: {\sc J.~W. Demmel, I.~Dumitriu, and O.~Holtz}, {\em Fast linear algebra is

96:   stable}, Numerische Mathematik, 108 (2007), pp.~59--91.

97:

98: \bibitem{demmel2008comm}

99: {\sc J.~W. Demmel and M.~Hoemmen}, {\em Communication-avoiding {Krylov}

100:   subspace methods}, tech. rep., University of California Berkeley, Department

101:   of Electrical Engineering and Computer Science, in preparation.

102:

103: \bibitem{dongarra1996key}

104: {\sc J.~J. Dongarra, S.~Hammarling, and D.~W. Walker}, {\em Key concepts for

105:   parallel out-of-core {LU} factorization}, Scientific Programming, 5 (1996),

106:   pp.~173--184.

107:

108: \bibitem{elmroth1998new}

109: {\sc E.~Elmroth and F.~Gustavson}, {\em New serial and parallel recursive {QR}

110:   factorization algorithms for {SMP} systems}, in Applied Parallel Computing.

111:   Large Scale Scientific and Industrial Problems, B.~K{\aa}gstr{\"o}m et~al., eds., vol.~1541

112:   of Lecture Notes in Computer Science, Springer, 1998, pp.~120--128.

113:

114: \bibitem{elmroth2000applying}

115: \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em Applying recursion

116:   to serial and parallel {QR} factorization leads to better performance}, IBM

117:   Journal of Research and Development, 44 (2000), pp.~605--624.

118:

119: \bibitem{elmroth2004recursive}

120: {\sc E.~Elmroth, F.~Gustavson, I.~Jonsson, and B.~K{\aa}gstr{\"o}m}, {\em

121:   Recursive blocked algorithms and hybrid data structures for dense matrix

122:   library software}, SIAM Review, 46 (2004), pp.~3--45.

123:

124: \bibitem{Freund:1997:BQA}

125: {\sc R.~W. Freund and M.~Malhotra}, {\em A block {QMR} algorithm for

126:   non-{Hermitian} linear systems with multiple right-hand sides}, Linear

127:   Algebra and its Applications, 254 (1997), pp.~119--157.

128: \newblock Proceedings of the Fifth Conference of the International Linear

129:   Algebra Society (Atlanta, {GA}, 1995).

130:

131: \bibitem{gilbert1992predicting}

132: {\sc J.~R. Gilbert and E.~G. Ng}, {\em Predicting structure in nonsymmetric

133:   sparse matrix factorization}, Tech. Rep. ORNL/TM-12205, Oak Ridge National

134:   Laboratory, 1992.

135:

136: \bibitem{govl:96}

137: {\sc G.~H. Golub and C.~F. {Van Loan}}, {\em Matrix Computations}, The Johns

138:   Hopkins University Press, Baltimore, MD, USA, third~ed., 1996.

139:

140: \bibitem{golub1988parallel}

141: {\sc G.~H. Golub, R.~J. Plemmons, and A.~Sameh}, {\em Parallel block schemes

142:   for large-scale least-squares computations}, in High-Speed Computing:

143:   Scientific Applications and Algorithm Design, R.~B. Wilhelmson, ed.,

144:   University of Illinois Press, Urbana and Chicago, IL, USA, 1988,

145:   pp.~171--179.

146:

147: \bibitem{graham2005getting}

148: {\sc S.~L. Graham, M.~Snir, and C.~A. Patterson}, eds., {\em Getting Up To

149:   Speed: The Future Of Supercomputing}, National Academies Press, Washington,

150:   D.C., USA, 2005.

151:

152: \bibitem{greenbaum1997numerical}

153: {\sc A.~Greenbaum, M.~Rozlo{\v{z}}n{\'i}k, and Z.~Strako{\v{s}}}, {\em

154:   Numerical behavior of the modified {Gram-Schmidt} {GMRES} implementation},

155:   BIT Numerical Mathematics, 37 (1997), pp.~706--719.

156:

157: \bibitem{grigori2008calu}

158: {\sc L.~Grigori, J.~W. Demmel, and H.~Xiang}, {\em Communication avoiding

159:   {Gaussian} elimination}, Tech. Rep. inria-00277901, INRIA, 2008.

160: \newblock version 2.

161:

162: \bibitem{gropp1999using}

163: {\sc W.~Gropp, E.~Lusk, and A.~Skjellum}, {\em Using {MPI}: Portable Parallel

164:   Programming with the Message-Passing Interface}, MIT Press, 1999.

165:

166: \bibitem{gunter2005parallel}

167: {\sc B.~C. Gunter and R.~A. van~de Geijn}, {\em Parallel out-of-core

168:   computation and updating of the {QR} factorization}, ACM Transactions on

169:   Mathematical Software, 31 (2005), pp.~60--78.

170:

171: \bibitem{lehoucqORTH}

172: {\sc U.~Hetmaniuk and R.~Lehoucq}, {\em Basis selection in {LOBPCG}}, Journal

173:   of Computational Physics, 218 (2006), pp.~324--332.

174:

175: \bibitem{irony2004communication}

176: {\sc D.~Irony, S.~Toledo, and A.~Tiskin}, {\em Communication lower bounds for

177:   distributed-memory matrix multiplication}, J. Parallel Distrib. Comput., 64

178:   (2004), pp.~1017--1026.

179:

180: \bibitem{hong1981io}

181: {\sc H.~Jia-Wei and H.~T. Kung}, {\em {I/O} complexity: The {R}ed-{B}lue

182:   {P}ebble {G}ame}, in STOC '81: Proceedings of the Thirteenth Annual {ACM}

183:   Symposium on Theory of Computing, New York, NY, USA, 1981, ACM, pp.~326--333.

184:

185: \bibitem{kiel:74}

186: {\sc A.~Kie{\l}basi{\' n}ski}, {\em Analiza numeryczna algorytmu

187:   ortogonalizacji {G}rama--{S}chmidta}, Seria III: Matematyka Stosowana II,

188:   (1974), pp.~15--35.

189:

190: \bibitem{BLOPEXwebpage}

191: {\sc A.~V. Knyazev}, {\em {BLOPEX} webpage}.

192: \newblock \url{http://www-math.cudenver.edu/~aknyazev/software/BLOPEX/}.

193:

194: \bibitem{andrewORTH}

195: {\sc A.~V. Knyazev, M.~Argentati, I.~Lashuk, and E.~E. Ovtchinnikov}, {\em

196:   Block locally optimal preconditioned eigenvalue xolvers ({BLOPEX}) in {HYPRE}

197:   and {PETSc}}, Tech. Rep. UCDHSC-CCM-251P, University of Colorado Denver,

198:   2007.

199:

200: \bibitem{kurzak2008qr}

201: {\sc J.~Kurzak and J.~J. Dongarra}, {\em {QR} factorization for the {CELL}

202:   processor}, Tech. Rep. UT-CS-08-616, University of Tennessee, May 2008.

203: \newblock LAWN \#201.

204:

205: \bibitem{templatesEigenLehoucq}

206: {\sc R.~Lehoucq and K.~Maschhoff}, {\em Block {A}rnoldi method}, in Templates

207:   for the Solution of Algebraic Eigenvalue Problems: A Practical Guide, Z.~Bai,

208:   J.~W. Demmel, J.~J. Dongarra, A.~Ruhe, and H.~{van der Vorst}, eds., Society

209:   for Industrial and Applied Mathematics, Philadelphia, PA, USA, 2000,

210:   pp.~185--187.

211:

212: \bibitem{leoncini1999parallel}

213: {\sc M.~Leoncini, G.~Manzini, and L.~Margara}, {\em Parallel complexity of

214:   numerically accurate linear system solvers}, SIAM J. Comput., 28 (1999),

215:   pp.~2030--2058.

216:

217: \bibitem{loomis1949inequality}

218: {\sc L.~H. Loomis and H.~Whitney}, {\em An inequality related to the

219:   isoperimetric inequality}, Bull. Amer. Math. Soc., 55 (1949), pp.~961--962.

220:

221: \bibitem{samsung2008ssd}

222: {\sc L.~Lugmayr}, {\em Samsung {256GB} {SSD} is world's fastest}.

223: \newblock \url{http://www.i4u.com/article17560.html}, 25 May 2008.

224: \newblock Accessed 30 May 2008.

225:

226: \bibitem{BLZPACKwebpage}

227: {\sc O.~Marques}, {\em {BLZPACK} webpage}.

228: \newblock \url{http://crd.lbl.gov/~osni/}.

229:

230: \bibitem{modi84:_given}

231: {\sc J.~J. Modi and M.~R.~B. Clarke}, {\em An alternative {Givens} ordering},

232:   Numer. Math., 43 (1984), pp.~83--90.

233:

234: \bibitem{nishtala2008performance}

235: {\sc R.~Nishtala, G.~Alm{\'{a}}si, and C.~Ca{\c{s}}caval}, {\em Performance

236:   without pain = productivity: Data layout and collective communication in

237:   {UPC}}, in Proceedings of the {ACM} {SIGPLAN} 2008 Symposium on Principles

238:   and Practice of Parallel Programming, 2008.

239:

240: \bibitem{oleary:80}

241: {\sc D.~P. O'Leary}, {\em The block conjugate gradient algorithm and related

242:   methods}, Linear Algebra and its Applications, 29 (1980), pp.~293--322.

243:

244: \bibitem{parlett1998symmetric}

245: {\sc B.~N. Parlett}, {\em The Symmetric Eigenvalue Problem}, SIAM,

246:   Philadelphia, 1998.

247:

248: \bibitem{pothen1989distributed}

249: {\sc A.~Pothen and P.~Raghavan}, {\em Distributed orthogonal factorization:

250:   {Givens} and {Householder} algorithms}, SIAM J. Sci. Stat. Comput., 10

251:   (1989), pp.~1113--1134.

252:

253: \bibitem{quintana-orti2008design}

254: {\sc G.~Quintana-Ort\'{i}, E.~S. Quintana-Ort\'{i}, E.~Chan, R.~A. van~de

255:   Geijn, and F.~G. {Van Zee}}, {\em Design of scalable dense linear algebra

256:   libraries for multithreaded architectures: the {LU} factorization}, in

257:   Proceedings of the Workshop on Multithreaded Architectures and Applications,

258:   Miami, Florida, Apr. 2008.

259: \newblock {FLAME} Working Note \#26.

260:

261: \bibitem{quintana-orti2008scheduling}

262: {\sc G.~Quintana-Ort\'{i}, E.~S. Quintana-Ort\'{i}, E.~Chan, F.~G. {Van Zee}, and

263:   R.~A. van~de Geijn}, {\em Scheduling of {QR} factorization algorithms on

264:   {SMP} and multi-core architectures}, in Proceedings of the 16th Euromicro

265:   International Conference on Parallel, Distributed and Network-Based

266:   Processing, Toulouse, France, Feb. 2008.

267: \newblock {FLAME} Working Note \#24.

268:

269: \bibitem{rabani2001outcore}

270: {\sc E.~Rabani and S.~Toledo}, {\em Out-of-core {SVD} and {QR} decompositions},

271:   in Proceedings of the 10th SIAM Conference on Parallel Processing for

272:   Scientific Computing, Norfolk, Virginia, SIAM, Mar. 2001.

273:

274: \bibitem{sagan1994space}

275: {\sc H.~Sagan}, {\em Space-Filling Curves}, Springer-Verlag, 1994.

276:

277: \bibitem{sameh78:_stabl_solver}

278: {\sc A.~H. Sameh and D.~J. Kuck}, {\em On {S}table {P}arallel {L}inear {S}ystem

279:   {S}olvers}, Journal of the Association for Computing Machinery, 25 (1978),

280:   pp.~81--91.

281:

282: \bibitem{schreiber1989storage}

283: {\sc R.~Schreiber and C.~{Van Loan}}, {\em A storage efficient {$WY$}

284:   representation for products of {Householder} transformations}, SIAM J. Sci.

285:   Stat. Comput., 10 (1989), pp.~53--57.

286:

287: \bibitem{smbl:06}

288: {\sc A.~Smoktunowicz, J.~Barlow, and J.~Langou}, {\em A note on the error

289:   analysis of {C}lassical {G}ram-{S}chmidt}, Numerische Mathematik, 105 (2006),

290:   pp.~299--313.

291:

292: \bibitem{PRIMMEwebpage}

293: {\sc A.~Stathopoulos}, {\em {PRIMME} webpage}.

294: \newblock \url{http://www.cs.wm.edu/~andreas/software/}.

295:

296: \bibitem{stwu:02}

297: {\sc A.~Stathopoulos and K.~Wu}, {\em A block orthogonalization procedure with

298:   constant synchronization requirements}, SIAM Journal on Scientific Computing,

299:   23 (2002), pp.~2165--2182.

300:

301: \bibitem{strassen1969gaussian}

302: {\sc V.~Strassen}, {\em {Gaussian} elimination is not optimal}, Numerische

303:   Mathematik, 13 (1969), pp.~354--356.

304:

305: \bibitem{toledo1997locality}

306: {\sc S.~Toledo}, {\em Locality of reference in {LU} decomposition with partial

307:   pivoting}, SIAM J. Matrix Anal. Appl., 18 (1997), pp.~1065--1081.

308:

309: \bibitem{toledo99survey}

310: \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em A survey of

311:   out-of-core algorithms in numerical linear algebra}, in External Memory

312:   Algorithms and Visualization, J.~Abello and J.~S. Vitter, eds., American

313:   Mathematical Society Press, Providence, RI, 1999, pp.~161--180.

314:

315: \bibitem{vital:phdthesis:90}

316: {\sc B.~Vital}, {\em {\'Etude de quelques m\'ethodes de r\'esolution de

317:   probl\`emes lin\'eaires de grande taille sur multiprocesseur}}, {P}h.{D}.

318:   dissertation, {Universit\'e de Rennes I, Rennes}, Nov. 1990.

319:

320: \bibitem{walker1985implementation}

321: {\sc H.~F. Walker}, {\em Implementation of the {GMRES} and {Arnoldi} methods

322:   using {Householder} transformations}, Tech. Rep. UCRL-93589, Lawrence

323:   Livermore National Laboratory, Oct. 1985.

324:

325: \bibitem{TRLANwebpage}

326: {\sc K.~Wu and H.~D. Simon}, {\em {TRLAN} webpage}.

327: \newblock \url{http://crd.lbl.gov/~kewu/ps/trlan_.html}.

328:

329: \end{thebibliography}

330: