0810.3286/SVT.tex
1: \documentclass[11pt]{article}
2: \usepackage{graphicx,subfigure,amsmath,amsfonts,bm,cite,epsfig,epsf,url,alg}
3: \usepackage{fullpage}
4: \usepackage[small,bf]{caption}
5: \setlength{\captionmargin}{30pt}
6: 
7: %-----------------------------
8: % for color remarks
9: \usepackage{color}
10: \newcommand{\red}[1]{\textcolor{red}{#1}}
11: \newcommand{\blue}[1]{\textcolor{blue}{#1}}
12: 
13: 
14: %--------------
15: \newtheorem{theorem}{Theorem}[section]
16: \newtheorem{lemma}[theorem]{Lemma}
17: \newtheorem{corollary}[theorem]{Corollary}
18: \newtheorem{proposition}[theorem]{Proposition}
19: \newtheorem{definition}[theorem]{Definition}
20: \newtheorem{conjecture}[theorem]{Conjecture}
21: 
22: % From Zuowei
23: %\newtheorem{algorithm}{Algorithm}[section]
24: 
25: \newtheorem{remark}[subsection]{Remark}
26: \newtheorem{remarks}[subsection]{Remarks}
27: \newtheorem{example}[subsection]{Example}
28: 
29: %--------------
30: % EJC's macros
31: \newcommand{\R}{\mathbb{R}}
32: \newcommand{\C}{\mathbb{C}}
33: \newcommand{\Z}{\mathbb{Z}}
34: \newcommand{\<}{\langle}
35: \renewcommand{\>}{\rangle}
36: \newcommand{\Var}{\textrm{Var}}
37: \newcommand{\goto}{\rightarrow}
38: \newcommand{\sgn}{\textrm{sgn}}
39: \renewcommand{\P}{\operatorname{\mathbb{P}}}
40: \newcommand{\E}{\operatorname{\mathbb{E}}}
41: \newcommand{\norm}[1]{{\left\lVert{#1}\right\rVert}}
42: \newcommand{\col}{\textrm{col}}
43: 
44: \newcommand{\TODO}[1]{{\bf TODO: #1}}
45: \newcommand{\e}{\mathrm{e}}
46: \renewcommand{\i}{\imath}
47: 
48: \newcommand{\cA}{\mathcal{A}}
49: \newcommand{\cP}{\mathcal{P}}
50: \newcommand{\cD}{\mathcal{D}}
51: \newcommand{\cF}{\mathcal{F}}
52: \newcommand{\cL}{\mathcal{L}}
53: 
54: 
55: 
56: 
57: % Linear algebra macros
58: \newcommand{\vct}[1]{\bm{#1}}
59: \newcommand{\mtx}[1]{\bm{#1}}
60: %\newcommand{\mtx}[1]{\mathsfsl{#1}}
61: 
62: \newcommand{\transp}{T}
63: \newcommand{\adj}{*}
64: \newcommand{\psinv}{\dagger}
65: 
66: \newcommand{\lspan}[1]{\operatorname{span}{#1}}
67: 
68: \newcommand{\range}{\operatorname{range}}
69: \newcommand{\colspan}{\operatorname{colspan}}
70: 
71: \newcommand{\rank}{\operatorname{rank}}
72: 
73: \newcommand{\diag}{\operatorname{diag}}
74: \newcommand{\trace}{\operatorname{trace}}
75: 
76: \newcommand{\supp}[1]{\operatorname{supp}(#1)}
77: 
78: \newcommand{\smax}{\sigma_{\max}}
79: \newcommand{\smin}{\sigma_{\min}}
80: 
81: \newcommand{\restrict}[1]{\big\vert_{#1}}
82: 
83: \newcommand{\Id}{\text{\em I}}
84: \newcommand{\OpId}{\mathcal{I}}
85: 
86: \numberwithin{equation}{section}
87: 
88: \newcommand{\mymathbf}[1]{\mbox{\boldmath$#1$}}
89: 
90: \newenvironment{proof}{\noindent\emph{Proof.}}{\hfill\fbox{}\vspace*{1mm}}
91: 
92: \title{A Singular Value Thresholding Algorithm for Matrix Completion}
93: 
94: 
95: \author{Jian-Feng Cai$^{\dagger}$ ~~Emmanuel J. Cand\`es$^{\sharp}$
96:   ~~ Zuowei Shen$^{\S}$\\
97:   \vspace{-.1cm}\\
98:   $\dagger$ Temasek Laboratories, National University of
99:   Singapore, Singapore 117543\\
100:   \vspace{-.3cm}\\
101:   $\sharp$ Applied and Computational Mathematics,
102:   Caltech, Pasadena, CA 91125\\
103:   \vspace{-.3cm}\\
104:   $\S$ Department of Mathematics, National University of Singapore,
105:   Singapore 117543}
106: 
107: \date{September 2008}
108: 
109: \begin{document}
110: \maketitle
111: 
112: \begin{abstract}
113:   This paper introduces a novel algorithm to approximate the matrix
114:   with minimum nuclear norm among all matrices obeying a set of convex
115:   constraints.  This problem may be understood as the convex
116:   relaxation of a rank minimization problem, and arises in many
117:   important applications as in the task of recovering a large matrix
118:   from a small subset of its entries (the famous Netflix problem).
119:   Off-the-shelf algorithms such as interior point methods are not
120:   directly amenable to large problems of this kind with over a million
121:   unknown entries.
122: 
123:   This paper develops a simple first-order and easy-to-implement
124:   algorithm that is extremely efficient at addressing problems in
125:   which the optimal solution has low rank.  The algorithm is iterative
126:   and produces a sequence of matrices $\{\mtx{X}^k,\mtx{Y}^k\}$ and at
127:   each step, mainly performs a soft-thresholding operation on the
128:   singular values of the matrix $\mtx{Y}^k$. There are two remarkable
129:   features making this attractive for low-rank matrix completion
130:   problems. The first is that the soft-thresholding operation is
131:   applied to a sparse matrix; the second is that the rank of the
132:   iterates $\{\mtx{X}^k\}$ is empirically nondecreasing. Both these
133:   facts allow the algorithm to make use of very minimal storage space
134:   and keep the computational cost of each iteration low.  On the
135:   theoretical side, we provide a convergence analysis showing that the
136:   sequence of iterates converges. On the practical side, we provide
137:   numerical examples in which $1{,}000 \times 1{,}000$ matrices are
138:   recovered in less than a minute on a modest desktop computer. We
139:   also demonstrate that our approach is amenable to very large scale
140:   problems by recovering matrices of rank about 10 with nearly a
141:   billion unknowns from just about 0.4\% of their sampled entries. Our
142:   methods are connected with the recent literature on linearized
143:   Bregman iterations for $\ell_1$ minimization, and we develop a
144:   framework in which one can understand these algorithms in terms of
145:   well-known Lagrange multiplier algorithms.
146: \end{abstract}
147: 
148: 
149: {\bf Keywords.} Nuclear norm minimization, matrix completion, singular
150: value thresholding, Lagrange dual function, Uzawa's algorithm.
151: 
152: \section{Introduction}
153: 
154: \subsection{Motivation}
155: 
156: There is a rapidly growing interest in the recovery of an unknown
157: low-rank or approximately low-rank matrix from very limited
158: information. This problem occurs in many areas of engineering and
159: applied science such as machine learning \cite{Abernethy06,
160:   Argyriou07, Amit07}, control \cite{Mesbahi97} and computer vision,
161: see \cite{Tomasi}. As a motivating example, consider the problem of
162: recovering a data matrix from a sampling of its entries. This
163: routinely comes up whenever one collects partially filled out surveys,
164: and one would like to infer the many missing entries. In the area of
165: recommender systems, users submit ratings on a subset of entries in a
166: database, and the vendor provides recommendations based on the user's
167: preferences. Because users only rate a few items, one would like to
168: infer their preference for unrated items; this is the famous Netflix
169: problem \cite{NetflixPrize}.  Recovering a rectangular matrix from a
170: sampling of its entries is known as the {\em matrix completion}
171: problem. The issue is of course that this problem is extraordinarily
172: ill posed since with fewer samples than entries, we have infinitely
173: many completions. Therefore, it is apparently impossible to identify
174: which of these candidate solutions is indeed the ``correct'' one
175: without some additional information.
176: 
177: In many instances, however, the matrix we wish to recover has low rank
178: or approximately low rank. For instance, the Netflix data matrix of
179: all user-ratings may be approximately low-rank because it is commonly
180: believed that only a few factors contribute to anyone's taste or
181: preference. In computer vision, inferring scene geometry and camera
182: motion from a sequence of images is a well-studied problem known as
183: the structure-from-motion problem. This is an ill-conditioned problem
184: for objects may be distant with respect to their size, or especially
185: for ``missing data'' which occur because of occlusion or tracking
186: failures. However, when properly stacked and indexed, these images
187: form a matrix which has very low rank (e.g.~rank 3 under orthography)
188: \cite{Tomasi,ChenSuter}.  Other examples of low-rank matrix fitting
189: abound; e.g.~in control (system identification), machine learning
190: (multi-class learning) and so on.  Having said this, the premise that
191: the unknown has (approximately) low  rank radically changes the problem,
192: making the search for solutions feasible since the lowest-rank
193: solution now tends to be the right one.
194: 
195: In a recent paper \cite{CR:XXX:08}, Cand\`es and Recht showed that
196: matrix completion is not as ill-posed as people thought. Indeed, they
197: proved that most low-rank matrices can be recovered {\em exactly} from
198: most sets of sampled entries even though these sets have surprisingly
199: small cardinality, and more importantly, they proved that this can
200: be done by solving a simple {\em convex} optimization problem. To
201: state their results, suppose to simplify that the unknown matrix
202: $\mtx{M} \in \R^{n \times n}$ is square, and that one has available
203: $m$ sampled entries $\{\mtx{M}_{ij} : (i, j) \in \Omega\}$ where
204: $\Omega$ is a random subset of cardinality $m$. Then \cite{CR:XXX:08}
205: proves that most matrices $\mtx{M}$ of rank $r$ can be perfectly
206: recovered by solving the optimization problem
207: \begin{equation}
208:   \label{eqn:min}
209:   \begin{array}{ll}
210:     \textrm{minimize}   & \quad \|\mtx{X}\|_*\\
211:     \textrm{subject to} & \quad X_{ij} = M_{ij}, \quad (i,j) \in \Omega,
212:  \end{array}
213: \end{equation}
214: provided that the number of samples obeys
215: \begin{equation}\label{conm}
216: m \ge C n^{6/5} r \log n
217: \end{equation}
218: for some positive numerical constant $C$.\footnote{Note that an $n
219:   \times n$ matrix of rank $r$ depends upon $r(2n-r)$ degrees of
220:   freedom.}  In \eqref{eqn:min}, the functional $\|\mtx{X}\|_*$ is the
221: nuclear norm of the matrix $\mtx{X}$, which is the sum of its singular
222: values. The optimization problem \eqref{eqn:min} is convex and can be
223: recast as a semidefinite program \cite{FazelThesis,fazelRank}. In some
224: sense, this is the tightest convex relaxation of the NP-hard rank
225: minimization problem
226: \begin{equation}
227:   \label{eq:rank}
228:   \begin{array}{ll}
229:     \textrm{minimize}   & \quad \text{rank}(\mtx{X})\\
230:     \textrm{subject to} & \quad X_{ij} = M_{ij}, \quad (i,j) \in \Omega,
231:  \end{array}
232: \end{equation}
233: since the nuclear ball $\{\mtx{X} : \|\mtx{X}\|_* \le 1\}$ is the
234: convex hull of the set of rank-one matrices with spectral norm bounded
235: by one. Another interpretation of Cand\`es and Recht's result is that
236: under suitable conditions, the rank minimization program
237: \eqref{eq:rank} and the convex program \eqref{eqn:min} are {\em
238:   formally equivalent} in the sense that they have exactly the same
239: unique solution.
240: 
241: \subsection{Algorithm outline}
242: 
243: Because minimizing the nuclear norm both provably recovers the
244: lowest-rank matrix subject to constraints (see \cite{Recht07} for
245: related results) and gives generally good empirical results in a
246: variety of situations, it is understandably of great interest to
247: develop numerical methods for solving \eqref{eqn:min}.  In
248: \cite{CR:XXX:08}, this optimization problem was solved using one of
249: the most advanced semidefinite programming solvers, namely, SDPT3
250: \cite{TTT:SDPT3}. This solver and others like
251: SeDuMi %\cite{sedumi:www}
252: are based on interior-point methods, and are problematic when the size
253: of the matrix is large because they need to solve huge systems of
254: linear equations to compute the Newton direction. In fact, SDPT3 can
255: only handle $n \times n$ matrices with $n \le 100$. Presumably, one
256: could resort to iterative solvers such as the method of conjugate
257: gradients to solve for the Newton step but this is problematic as well
258: since it is well known that the condition number of the Newton system
259: increases rapidly as one gets closer to the solution. In addition,
260: none of these general purpose solvers use the fact that the solution
261: may have low rank. We refer the reader to \cite{VandenbergheNuc} for
262: some recent progress on interior-point methods concerning some special
263: nuclear norm-minimization problems.
264: 
265: This paper develops the {\em singular value thresholding} algorithm
266: for approximately solving the nuclear norm minimization problem
267: \eqref{eqn:min} and by extension, problems of the form
268: \begin{equation}
269:   \label{eqn:nuc_norm}
270:   \begin{array}{ll}
271:     \textrm{minimize}   & \quad \|\mtx{X}\|_*\\
272:     \textrm{subject to} & \quad {\cal A}(\mtx{X})  = \vct{b},
273:  \end{array}
274: \end{equation}
275: where ${\cal A}$ is a linear operator acting on the space of $n_1
276: \times n_2$ matrices and $\vct{b} \in \R^m$.  This algorithm is a
277: simple first-order method, and is especially well suited for problems
278: of very large sizes in which the solution has low rank.  We sketch
279: this algorithm in the special matrix completion setting and let
280: $\mathcal{P}_{\Omega}$ be the orthogonal projector onto the span of
281: matrices vanishing outside of $\Omega$ so that the $(i,j)$th component
282: of $\mathcal{P}_{\Omega}(\mtx{X})$ is equal to $X_{ij}$ if $(i,j) \in
283: \Omega$ and zero otherwise. Our problem may be expressed as
284: \begin{equation}
285:   \label{eqn:nuc_norm2}
286:   \begin{array}{ll}
287:     \textrm{minimize}   & \quad \|\mtx{X}\|_*\\
288:     \textrm{subject to} & \quad {\cal P}_\Omega(\mtx{X}) = {\cal P}_\Omega(\mtx{M}),
289:  \end{array}
290: \end{equation}
291: with optimization variable $\mtx{X} \in \R^{n_1 \times n_2}$. Fix
292: $\tau > 0$ and a sequence $\{\delta_k\}_{k \ge 1}$ of scalar step
293: sizes.  Then starting with $\mtx{Y}^0 = 0 \in \R^{n_1 \times n_2}$,
294: the algorithm inductively defines
295: \begin{equation}\label{eqn:iter0}
296: \begin{cases}
297:   \mtx{X}^{k}  = \text{shrink}(\mtx{Y}^{k-1}, \tau),\cr
298:   \mtx{Y}^{k}  = \mtx{Y}^{k-1} + \delta_{k}
299:   \mathcal{P}_{\Omega}(\mtx{M}-\mtx{X}^k)
300: \end{cases}
301: \end{equation}
302: until a stopping criterion is reached. In \eqref{eqn:iter0},
303: $\text{shrink}(\mtx{Y},\tau)$ is a nonlinear function which applies a
304: soft-thresholding rule at level $\tau$ to the singular values of the
305: input matrix, see Section \ref{sec:alg} for details.  The key property
306: here is that for large values of $\tau$, the sequence $\{\mtx{X}^k\}$
307: converges to a solution which very nearly minimizes
308: \eqref{eqn:nuc_norm2}.  Hence, at each step, one only needs to compute
309: at most one singular value decomposition and perform a few elementary
310: matrix additions. Two important remarks are in order:
311: \begin{enumerate}
312: \item {\em Sparsity.} For each $k \ge 0$, $\mtx{Y}^k$ vanishes outside
313:   of $\Omega$ and is, therefore, sparse, a fact which can be used to
314:   evaluate the shrink function rapidly.
315: 
316: \item {\em Low-rank property.} The matrices $\mtx{X}^k$ turn out to
317:   have low rank, and hence the algorithm has minimum storage
318:   requirement since we only need to keep principal factors in memory.
319: \end{enumerate}
320: 
321: Our numerical experiments demonstrate that the proposed algorithm can
322: solve problems, in Matlab, involving matrices of size $30{,}000 \times
323: 30{,}000$ having close to a billion unknowns in 17 minutes on a standard
324: desktop computer with a 1.86 GHz CPU (dual core with Matlab's
325: multithreading option enabled) and 3 GB of memory.
326: % CS: done. We just used the default setting of Matlab R2008a, where
327: % multithreading is used, and the maximum number of threads is the
328: % number of cores.
329: As a consequence, the singular value thresholding algorithm may
330: become a rather powerful computational tool for large scale matrix
331: completion.
332: 
333: \subsection{General formulation}
334: 
335: The singular value thresholding algorithm can be adapted to deal with
336: other types of convex constraints. For instance, it may address
337: problems of the form
338: \begin{equation}
339:   \label{eqn:nuc_norm_gen}
340:   \begin{array}{ll}
341:     \textrm{minimize}   & \quad \|\mtx{X}\|_*\\
342:     \textrm{subject to} & \quad f_i(\mtx{X}) \le 0, \quad i = 1, \ldots, m,
343:  \end{array}
344: \end{equation}
345: where each $f_i$ is a Lipschitz convex function (note that one can
346: handle linear equality constraints by considering pairs of affine
347: functionals). In the simpler case where the $f_i$'s are affine
348: functionals, the general algorithm goes through a sequence of
349: iterations which greatly resemble \eqref{eqn:iter0}. This is useful
350: because this enables the development of numerical algorithms which are
351: effective for recovering matrices from a small subset of sampled
352: entries possibly contaminated with noise.
353: 
354: 
355: \subsection{Contents and notations}
356: 
357: 
358: The rest of the paper is organized as follows. In Section
359: \ref{sec:alg}, we derive the singular value thresholding (SVT)
360: algorithm for the matrix completion problem, and recast it in terms
361: of a well-known Lagrange multiplier algorithm. In Section
362: \ref{sec:general}, we extend the SVT algorithm and formulate a general
363: iteration which is applicable to general convex constraints. In
364: Section \ref{sec:conv}, we establish the convergence results for the
365: iterations given in Sections \ref{sec:alg} and \ref{sec:general}.
366: We demonstrate the performance and effectiveness of the algorithm
367: through numerical examples in Section \ref{sec:num}, and review
368: additional implementation details. Finally, we conclude the paper with
369: a short discussion in Section \ref{sec:discussion}.
370: 
371: Before continuing, we provide here a brief summary of the notations
372: used throughout the paper. Matrices are bold capital, vectors are bold
373: lowercase and scalars or entries are not bold. For instance, $\mtx{X}$
374: is a matrix and $X_{ij}$ its $(i,j)$th entry. Likewise, $\vct{x}$ is a
375: vector and $x_i$ its $i$th component. The nuclear norm of a matrix is
376: denoted by $\|\mtx{X}\|_*$, the Frobenius norm by $\|\mymathbf{X}\|_F$
377: and the spectral norm by $\|\mymathbf{X}\|_2$; note that these are
378: respectively the 1-norm, the 2-norm and the sup-norm of the vector of
379: singular values. The adjoint of a matrix $\mtx{X}$ is $\mtx{X}^*$ and
380: similarly for vectors. The notation $\diag(\mymathbf{x})$, where
381: $\vct{x}$ is a vector, stands for the diagonal matrix with $\{x_i\}$
382: as diagonal elements. We denote by $\langle\mtx{X}, \mtx{Y}\rangle =
383: \trace(\mtx{X}^*\mtx{Y})$ the standard inner product between two
384: matrices ($\|\mtx{X}\|_F^2 = \langle\mtx{X},\mtx{X}\rangle$).  The
385: Cauchy-Schwarz inequality gives
386: $\langle\mymathbf{X},\mymathbf{Y}\rangle\leq\|\mymathbf{X}\|_F\|\mymathbf{Y}\|_F$
387: and it is well known that we also have
388: $\langle\mymathbf{X},\mymathbf{Y}\rangle\leq\|\mymathbf{X}\|_*\|\mymathbf{Y}\|_2$
389: (the spectral and nuclear norms are dual to one another), see
390: e.g.~\cite{CR:XXX:08,Recht07}.
391: 
392: \section{The Singular Value Thresholding Algorithm }
393: \label{sec:alg}
394: 
395: This section introduces the singular value thresholding algorithm
396: and discusses some of its basic properties. We begin with the
397: definition of a key building block, namely, the singular value
398: thresholding operator.
399: 
400: \subsection{The singular value shrinkage operator}
401: 
402: Consider the singular value decomposition (SVD) of a matrix $\mtx{X}
403: \in \R^{n_1 \times n_2}$ of rank $r$
404: \begin{equation}
405:   \label{eq:svd}
406:   \mtx{X} = \mtx{U}  \mtx{\Sigma}   \mtx{V}^*,
407: \quad   \mtx{\Sigma}  = \diag(\{\sigma_i\}_{1 \le i \le r}),
408: \end{equation}
409: where $\mtx{U}$ and $\mtx{V}$ are respectively $n_1 \times r$ and $n_2
410: \times r$ matrices with orthonormal columns, and the singular values
411: $\sigma_i$ are positive (unless specified otherwise, we will always
412: assume that the SVD of a matrix is given in the reduced form
413: above). For each $\tau \ge 0$, we introduce the soft-thresholding
414: operator $\mathcal{D}_\tau$ defined as follows:
415: \begin{equation}
416: \label{eqn:DlamM2} \mathcal{D}_{\tau}(\mtx{X}):=  \mtx{U}
417: \mathcal{D}_{\tau}(\mtx{\Sigma}) \mtx{V}^*, \quad
418: \mathcal{D}_{\tau}(\mtx{\Sigma}) = \diag(\{(\sigma_i - \tau)_+\}),
419: \end{equation}
420: where $t_+$ is the positive part of $t$, namely, $t_+ = \max(0,t)$. In
421: words, this operator simply applies a soft-thresholding rule to the
422: singular values of $\mtx{X}$, effectively shrinking these towards
423: zero. This is the reason why we will also refer to this transformation
424: as the {\em singular value shrinkage} operator. Even though the SVD
425: may not be unique, it is easy to see that the singular value shrinkage
426: operator is well defined and we do not elaborate further on this
427: issue.  In some sense, this shrinkage operator is a straightforward
428: extension of the soft-thresholding rule for scalars and vectors. In
429: particular, note that if many of the singular values of $\mtx{X}$ are
430: below the threshold $\tau$, the rank of $\mathcal{D}_{\tau}(\mtx{X})$
431: may be considerably lower than that of $\mtx{X}$, just like the
432: soft-thresholding rule applied to vectors leads to sparser outputs
433: whenever some entries of the input are below threshold.
434: 
435: 
436: The singular value thresholding operator is the proximity operator
437: associated with the nuclear norm. Details about the proximity
438: operator can be found in e.g.~\cite{HL:BOOK:93}.
439: \begin{theorem}\label{thm:prox}
440:   For each $\tau \ge 0$ and $\mtx{Y} \in \R^{n_1 \times n_2}$, the
441:   singular value shrinkage operator \eqref{eqn:DlamM2} obeys
442: \begin{equation}
443: \label{eqn:DlamM}
444: \mathcal{D}_{\tau}(\mtx{Y}) = \arg\min_{\mtx{X}} \left\{
445: \frac12\|\mtx{X}-\mtx{Y}\|_F^2 + \tau\|\mymathbf{X}\|_{*}
446: \right\}.
447: \end{equation}
448: \end{theorem}
449: \begin{proof} Since the function $h_0(\mtx{X}) := \tau \|\mtx{X}\|_* +
450:   \frac{1}{2} \|\mtx{X}-\mtx{Y}\|_F^2$ is strictly convex, it is easy
451:   to see that there exists a unique minimizer, and we thus need to
452:   prove that it is equal to $\mathcal{D}_{\tau}(\mtx{Y})$.  To do
453:   this, recall the definition of a subgradient of a convex function $f
454:   : \R^{n_1 \times n_2} \goto \R$. We say that $\mtx{Z}$ is a
455:   subgradient of $f$ at $\mtx{X}_0$, denoted $\mtx{Z} \in \partial
456:   f(\mtx{X}_0)$, if
457: \begin{equation}
458:   \label{eq:subgradient}
459:   f(\mtx{X}) \ge f(\mtx{X}_0) + \<\mtx{Z}, \mtx{X} - \mtx{X}_0\>
460: \end{equation}
461: for all $\mtx{X}$.  Now $\hat{\mtx{X}}$ minimizes $h_0$ if and only if
462: $\mymathbf{0}$ is a subgradient of the functional $h_0$ at the point
463: $\hat{\mtx{X}}$, i.e.
464: \begin{equation}\label{eqn:subdiff}
465: \mymathbf{0} \in \hat{\mtx{X}}-\mtx{Y}+\tau\partial\|\hat{\mtx{X}}\|_*,
466: \end{equation}
467: where $\partial\|\hat{\mtx{X}}\|_*$ is the set of subgradients of the
468: nuclear norm. Let $\mtx{X} \in \R^{n_1 \times n_2}$ be an arbitrary
469: matrix and $\mtx{U} \mtx{\Sigma} \mtx{V}^*$ be its SVD.  It is known
470: \cite{CR:XXX:08,Lew:MP:03,Wat:LAA:92} that
471: \begin{equation}\label{eqn:subdiffNorm}
472:   \partial\|\mtx{X}\|_*=\left\{\mtx{U} \mtx{V}^* + \mtx{W} :
473:     ~\mtx{W}\in\mathbb{R}^{n_1 \times n_2},~~
474:     \mtx{U}^*\mtx{W}=0,~~
475:     \mtx{W} \mtx{V} =0,~~
476:     \|\mtx{W}\|_2\leq1\right\}.
477: \end{equation}
478: 
479: Set $\hat{\mtx{X}} := {\cal D}_\tau(\mtx{Y})$ for short. In order to
480: show that $\hat{\mtx{X}}$ obeys \eqref{eqn:subdiff}, decompose the SVD
481: of $\mtx{Y}$ as
482: \[
483: \mtx{Y} = \mtx{U}_0 \mtx{\Sigma}_0 \mtx{V}_0^* + \mtx{U}_1 \mtx{\Sigma}_1 \mtx{V}_1^*,
484: \]
485: where $\mtx{U}_0$, $\mtx{V}_0$ (resp.~$\mtx{U}_1$, $\mtx{V}_1$) are
486: the singular vectors associated with singular values greater than
487: $\tau$ (resp.~smaller than or equal to $\tau$). With these notations, we
488: have
489: \[
490: \hat{\mtx{X}} = \mtx{U}_0 (\mtx{\Sigma}_0 - \tau \mtx{I})\mtx{V}_0^*
491: \]
492: and, therefore,
493: \[
494: \mtx{Y} - \hat{\mtx{X}} = \tau (\mtx{U}_0 \mtx{V}_0^* +
495: \mtx{W}), \quad \mtx{W} = \tau^{-1} \mtx{U}_1 \mtx{\Sigma}_1
496: \mtx{V}_1^*.
497: \]
498: By definition, $\mtx{U}_0^* \mtx{W} = 0$, $\mtx{W} \mtx{V}_0 = 0$ and
499: since the diagonal elements of $\mtx{\Sigma}_1$ have magnitudes
500: bounded by $\tau$, we also have $\|\mtx{W}\|_2 \le 1$. Hence $\mtx{Y} -
501: \hat{\mtx{X}} \in \tau \partial\|\hat{\mtx{X}}\|_*$, which concludes the
502: proof.
503: \end{proof}
504: 
505: \subsection{Shrinkage iterations}
506: 
507: We are now in the position to introduce the singular value
508: thresholding algorithm. Fix $\tau > 0$ and a sequence $\{\delta_k\}$
509: of positive step sizes. Starting with $\mtx{Y}^0 = \mtx{0}$, inductively define
510: for $k = 1, 2, \ldots$,
511: \begin{equation}\label{eqn:iter}
512: \begin{cases}
513:   \mtx{X}^{k}  = {\cal D}_\tau(\mtx{Y}^{k-1}),\cr
514:   \mtx{Y}^{k}  = \mtx{Y}^{k-1} + \delta_{k}
515:   \mathcal{P}_{\Omega}(\mtx{M}-\mtx{X}^k)
516: \end{cases}
517: \end{equation}
518: until a stopping criterion is reached (we postpone the discussion of
519: this stopping criterion and of the choice of step sizes). This
520: shrinkage iteration is very simple to implement. At each step, we only
521: need to compute an SVD and perform elementary matrix operations. With
522: the help of a standard numerical linear algebra package, the whole
523: algorithm can be coded in just a few lines.
524: 
525: Before addressing further computational issues, we would like to make
526: explicit the relationship between this iteration and the original
527: problem \eqref{eqn:min}.  In Section \ref{sec:conv}, we will show that
528: the sequence $\{\mtx{X}^k\}$ converges to the unique solution of an
529: optimization problem closely related to \eqref{eqn:min}, namely,
530: \begin{equation}\label{eqn:minnuc+fro}
531:   \begin{array}{ll}
532:     \textrm{minimize}   & \quad \tau \|\mtx{X}\|_* +
533: \frac{1}{2} \|\mtx{X}\|_F^2\\
534:     \textrm{subject to} & \quad \mathcal{P}_\Omega(\mtx{X}) = \mathcal{P}_\Omega(\mtx{M}).
535:  \end{array}
536: \end{equation}
537: Furthermore, it is intuitive that the solution to this modified
538: problem converges to that of \eqref{eqn:nuc_norm2} as $\tau \to
539: \infty$ as shown in Section \ref{sec:general}. Thus by selecting a
540: large value of the parameter $\tau$, the sequence of iterates
541: converges to a matrix which nearly minimizes \eqref{eqn:min}.
542: 
543: 
544: 
545: As mentioned earlier, there are two crucial properties which make this
546: algorithm ideally suited for matrix completion.
547: \begin{itemize}
548: \item {\em Low-rank property.} A remarkable empirical fact is that the
549:   matrices in the sequence $\{\mtx{X}^k\}$ have low rank (provided, of
550:   course, that the solution to \eqref{eqn:minnuc+fro} has low
551:   rank). We use the word ``empirical'' because all of our numerical
552:   experiments have produced low-rank sequences but we cannot
553:   rigorously prove that this is true in general.  The reason for this
554:   phenomenon is, however, simple: because we are interested in large
555:   values of $\tau$ (as to better approximate the solution to
556:   \eqref{eqn:min}), the thresholding step happens to `kill' most of
557:   the small singular values and produces a low-rank output.
558:   % Furthermore, since the SVT algorithm starts from
559:   % $\mymathbf{X}^0=\mymathbf{0}$, the principle components
560:   % corresponding to large singular values come up first, and minor
561:   % components corresponding to small singular values are added in as
562:   % the iteration going on.
563:   In fact, our numerical results show that the rank of $\mtx{X}^{k}$
564:   is nondecreasing with $k$, and the maximum rank is reached in the
565:   last steps of the algorithm, see Section \ref{sec:num}.
566: 
567: 
568:   Thus, when the rank of the solution is substantially smaller than
569:   either dimension of the matrix, the storage requirement is low since
570:   we could store each $\mtx{X}^k$ in its SVD form (note that we only
571:   need to keep the current iterate and may discard earlier
572:   values).
573: 
574: \item {\em Sparsity.}  Another important property of the SVT algorithm
575:   is that the iteration matrix $\mtx{Y}^k$ is sparse. Since
576:   $\mtx{Y}^0=\mtx{0}$, we have by induction that $\mtx{Y}^{k}$
577:   vanishes outside of $\Omega$. The fewer entries available, the
578:   sparser $\mtx{Y}^k$. Because the sparsity pattern $\Omega$ is fixed
579:   throughout, one can then apply sparse matrix techniques to save
580:   storage. Also, if $|\Omega| = m$, the computational cost of updating
581:   $\mtx{Y}^k$ is of order $m$. Moreover, we can call
582:   subroutines supporting sparse matrix computations, which can further
583:   reduce computational costs.
584: 
585:   One such subroutine is the SVD. However, note that we do not need to
586:   compute the entire SVD of $\mtx{Y}^k$ to apply the singular value
587:   thresholding operator. Only the part corresponding to singular
588:   values greater than $\tau$ is needed. Hence, a good strategy is to
589:   apply the iterative Lanczos algorithm to compute the first few
590:   singular values and singular vectors. Because $\mtx{Y}^k$ is sparse,
591:   $\mtx{Y}^k$ can be applied to arbitrary vectors rapidly, and this
592:   procedure offers a considerable speedup over naive methods.
593: \end{itemize}
594: 
595: % \begin{theorem}\label{thm:largemu}
596: %   Let $\mtx{X}_{\tau}^{\star}$ be the solution to
597: %   \eqref{eqn:minnuc+fro} (or equivalently, the limit of
598: %   \eqref{eqn:iter}). Let $\mtx{X}_\infty$ be the minimum Frobenius-norm
599: %   solution to \eqref{eqn:min} defined as
600: %   \begin{equation}\label{eqn:minfro}
601: %     \mtx{X}_\infty:= \arg \min_{\mtx{X}}\{\|\mtx{X}\|_F^2~:~\mtx{X}\text{ is a solution of \eqref{eqn:min}}\}.
602: %   \end{equation} Then
603: % \begin{equation}\label{eqn:limitXmustar}
604: % \lim_{\tau\to\infty}\|\mtx{X}_{\tau}^{\star}-\mtx{X}_\infty\|_F=0.
605: % \end{equation}
606: % \end{theorem}
607: % \begin{proof}
608: % It follows from the definition of $\mtx{X}_{\tau}^{\star}$ and
609: % $\mtx{X}_\infty$ that
610: % \begin{equation}\label{eqn:XmustarX0}
611: %   \|\mtx{X}_{\tau}^{\star}\|_*+\frac{1}{2\tau}\|\mtx{X}_{\tau}^{\star}\|_F^2\leq
612: %   \|\mtx{X}_{\infty}\|_*+\frac{1}{2\tau}\|\mtx{X}_{\infty}\|_F^2,\quad \text{ and } \quad
613: %   \|\mtx{X}_{\infty}\|_*\leq\|\mtx{X}_{\tau}^{\star}\|_*.
614: % \end{equation}
615: % Summing these two inequalities gives
616: % \begin{equation}\label{eqn:boundedXmustar}
617: % \|\mtx{X}_{\tau}^{\star}\|_F^2\leq\|\mtx{X}_{\infty}\|_F^2,
618: % \end{equation}
619: % which implies that $\|\mtx{X}_{\tau}^{\star}\|_F^2$ is bounded
620: % uniformly in $\tau$.  Thus, we would prove the theorem if we could
621: % establish that any convergent subsequence
622: % $\{\mtx{X}^{\star}_{\tau_i}\}_{i \ge 1}$ must converge to
623: % $\mtx{X}_\infty$.
624: 
625: % Consider an arbitrary converging subsequence
626: % $\mtx{X}^{\star}_{\tau_i}$ and set $\mtx{X}_c := \lim_{i \goto
627: % \infty} \mtx{X}^{\star}_{\tau_i}$. Since
628: % $\mathcal{P}_{\Omega}(\mtx{X}^{\star}_{\tau_i}) =
629: % \mathcal{P}_{\Omega}(\mtx{M})$ and $\mathcal{P}_{\Omega}$ is an
630: % orthogonal projector, $\mtx{X}_c$ obeys
631: % \begin{equation}\label{eqn:constaintXc}
632: % \mathcal{P}_{\Omega}\mtx{X}_c=\mathcal{P}_{\Omega}\mtx{M}.
633: % \end{equation}
634: % Furthermore, since $\|\mtx{X}_{\tau}^{\star}\|_F^2$ is bounded,
635: % \eqref{eqn:XmustarX0} yields
636: % $$
637: % \limsup_{\tau\to\infty}\|\mtx{X}_{\tau}^{\star}\|_*\leq\|\mtx{X}_{\infty}\|_*,
638: % \quad
639: % \|\mtx{X}_{\infty}\|_*\leq\liminf_{\tau\to\infty}\|\mtx{X}_{\tau}^{\star}\|_*.
640: % $$
641: % An immediate consequence is
642: % $\lim_{\tau\to\infty}\|\mtx{X}_{\tau}^{\star}\|_*=\|\mtx{X}_{\infty}\|_*$
643: % and, therefore, $\|\mtx{X}_{c}\|_*=\|\mtx{X}_{\infty}\|_*$. This
644: % shows that $\mtx{X}_c$ is a solution to \eqref{eqn:min}. On the one
645: % hand, it follows from the definition of $\mtx{X}_\infty$ that
646: % $\|\mtx{X}_c\|_F \geq \|\mtx{X}_{\infty}\|_F$, while on the other
647: % hand, $\|\mtx{X}_c\|_F \leq \|\mtx{X}_{\infty}\|_F$ because of
648: % \eqref{eqn:boundedXmustar}.  We conclude that $\|\mtx{X}_c\|_F
649: % =\|\mtx{X}_{\infty}\|_F$ and thus $\mtx{X}_c=\mtx{X}_{\infty}$ since
650: % $\mtx{X}_{\infty}$ is unique.
651: % \end{proof}
652: 
653: 
654: \subsection{Relation with other works}
655: 
656: Our algorithm is inspired by recent work in the area of $\ell_1$
657: minimization, and especially by the work on linearized Bregman
658: iterations for compressed sensing, see
659: \cite{COS:XXX:08:3,COS:XXX:08:2,COS:XXX:08,DO:XXX:07,YOGD:SIIMS:08,ODY:XXX:08}
660: for linearized Bregman iterations and
661: \cite{CR:IP:07,CRT:TIT:06,CT:TIT:05,CT:TIT:06,Don:TIT:06} for some
662: information about the field of compressed sensing. In this line of
663: work, linearized Bregman iterations are used to find the solution to
664: an underdetermined system of linear equations with minimum $\ell_1$
665: norm.  In fact, Theorem \ref{thm:prox} asserts that the singular value
666: thresholding algorithm can be formulated as a linearized Bregman
667: iteration.  Bregman iterations were first introduced in
668: \cite{OBGXY:MMS:05} as a convenient tool for solving computational
669: problems in the imaging sciences, and a later paper
670: \cite{YOGD:SIIMS:08} showed that they were useful for solving
671: $\ell_1$-norm minimization problems in the area of compressed sensing.
672: Linearized Bregman iterations were proposed in \cite{DO:XXX:07} to
673: improve performance of plain Bregman iterations, see also
674: \cite{YOGD:SIIMS:08}. Additional details together with a technique for
675: improving the speed of convergence called {\em kicking} are described
676: in \cite{ODY:XXX:08}. On the practical side, the paper
677: \cite{COS:XXX:08:2} applied Bregman iterations to solve a deblurring
678: problem while on the theoretical side, the references
679: \cite{COS:XXX:08:3,COS:XXX:08} gave a rigorous analysis of the
680: convergence of such iterations. New developments keep on coming out at
681: a rapid pace and recently, \cite{GO:XXX:08} introduced a new
682: iteration, the {\em split Bregman iteration}, to extend Bregman-type
683: iterations (such as linearized Bregman iterations) to problems
684: involving the minimization of $\ell_1$-like functionals such as
685: total-variation norms, Besov norms, and so forth.
686: 
687: When applied to $\ell_1$-minimization problems, linearized Bregman
688: iterations are sequences of soft-thresholding rules operating on
689: vectors. Iterative soft-thresholding algorithms in connection with
690: $\ell_1$ or total-variation minimization have quite a bit of history
691: in signal and image processing and we would like to mention the
692: works \cite{TVSynthesis,Lintner} for total-variation minimization,
693: \cite{Nowak_EM,DDD:CPAM:04,DTV:IPI:07} for $\ell_1$ minimization,
694: and
695: \cite{CCS:ACHA:08,CCSS:SISC:08,CS:NM:07,CCSS:SISC:03,ESQD:ACHA:05,FSM:CJ:07,SDC:AA:03,BBAC:ECCV:04}
696: for some recent applications in the area of image inpainting and
697: image restoration. Just as iterative soft-thresholding methods are
698: designed to find sparse solutions, our iterative singular value
699: thresholding scheme is designed to find a sparse vector of singular
700: values. In classical problems arising in the areas of compressed
701: sensing, and signal or image processing, the sparsity is expressed
702: in a known transformed domain and soft-thresholding is applied to
703: transformed coefficients. In contrast, the shrinkage operator
704: $\mathcal{D}_{\tau}$ is adaptive. The SVT not only discovers a
705: sparse vector of singular values but also the bases in which we have a sparse
706: representation.  In this sense, the SVT algorithm is an extension of
707: earlier iterative soft-thresholding schemes.
708: 
709: Finally, we would like to contrast the SVT iteration \eqref{eqn:iter}
710: with the popular iterative soft-thresholding algorithm used in many
711: papers in image processing and perhaps best known under the name of
712: Proximal Forward-Backward Splitting method (PFBS), see
713: \cite{CW:MMS:05,Nowak_EM,DDD:CPAM:04,YinFPC,CCS:ACHA:08} for example.
714: The constrained minimization problem \eqref{eqn:nuc_norm2} may be
715: relaxed into
716: \begin{equation}
717: \label{eqn:min_uncon}
718:  \textrm{minimize} \quad \lambda \|\mtx{X}\|_* +
719:  \frac{1}{2} \|\cP_{\Omega}(\mtx{X})-\cP_{\Omega}(\mtx{M})\|_F^2
720: \end{equation}
721: for some $\lambda > 0$. Theorem \ref{thm:prox} asserts that
722: $\cD_{\lambda}$ is the proximity operator of $\lambda \|\mtx{X}\|_*$
723: and Proposition 3.1(iii) in \cite{CW:MMS:05} gives that the solution
724: to this unconstrained problem is characterized by the fixed point
725: equation $\mtx{X} = \cD_{\lambda \delta}(\mtx{X} + \delta
726: \cP_{\Omega}(\mtx{M}-\mtx{X}))$ for each $\delta > 0$. One can then
727: apply a simplified version of the PFBS method (see (3.6) in
728: \cite{CW:MMS:05}) to obtain iterations of the form
729: \[
730:  \mtx{X}^{k} =
731: \cD_{\lambda \delta_{k-1}}(\mtx{X}^{k-1}+\delta_{k-1} \cP_{\Omega}(\mtx{M}-\mtx{X}^{k-1})).
732: \]
733: Introducing an intermediate matrix $\mtx{Y}^{k}$, this algorithm may
734: be expressed as
735: \begin{equation}
736: \label{eqn:iter_thresh}
737: \begin{cases}
738:  \mtx{X}^k = \cD_{\lambda \delta_{k-1}}(\mtx{Y}^{k-1}),\cr
739:  \mtx{Y}^{k} = \mtx{X}^{k}+\delta_k\cP_{\Omega}(\mtx{M}-\mtx{X}^{k}).
740: \end{cases}
741: \end{equation}
742: The difference with \eqref{eqn:iter} may seem subtle at
743: first---replacing $\mtx{X}^{k}$ in \eqref{eqn:iter_thresh} with
744: $\mtx{Y}^{k-1}$ and setting $\delta_k = \delta$ gives \eqref{eqn:iter}
745: with $\tau = \lambda \delta$---but has enormous consequences as this
746: gives entirely different algorithms.  First, they have different
747: limits: while \eqref{eqn:iter} converges to the solution of the
748: constrained minimization \eqref{eqn:minnuc+fro},
749: \eqref{eqn:iter_thresh} converges to the solution of
750: \eqref{eqn:min_uncon} provided that the sequence of step sizes is
751: appropriately selected.  Second, selecting a large $\lambda$ (or a
752: large value of $\tau = \lambda \delta$) in \eqref{eqn:iter_thresh}
753: gives a low-rank sequence of iterates and a limit with small nuclear
754: norm. The limit, however, does not fit the data and this is why one
755: has to choose a small or moderate value of $\lambda$ (or of $\tau =
756: \lambda \delta$).  However, when $\lambda$ is not sufficiently large,
757: the $\mtx{X}^k$ may not have low rank even though the solution has low
758: rank (and one may need to compute many singular vectors), and
759: $\mtx{Y}^k$ is not sufficiently sparse to make the algorithm
760: computationally attractive. Moreover, the limit does not necessarily
761: have a small nuclear norm.  These are reasons why
762: \eqref{eqn:iter_thresh} is not suitable for matrix completion.
763: 
764: 
765: 
766: % To overcome this, one may chose a decreasing sequence $\tau_k$ and a
767: % proper choice of step size sequence $\delta_k$ adapted to $\tau_k$,
768: % so that the sequences generated by \eqref{eqn:iter_thresh} still
769: % keep the sparsity and low rank when $\tau_k$ decreases. These
770: % choices must adapt to the given data to make it work which is a
771: % highly nontrivial task.
772: % Then,
773: % why a similar iteration as \eqref{eqn:iter_thresh} works in imaging
774: % processing?
775: 
776: % The situation is a little different in signal or image processing
777: % since 1) the soft-thresholding rule can be applied efficiently and 2)
778: % objects of interest are not very sparse, only moderately sparse, so
779: % that $\tau$ can be chosen to be moderately small.
780: % Hence, iteration
781: % \eqref{eqn:iter_thresh} goes to the limit that balances the $\ell$
782: % norm and fidelity. This is one of the reasons why similar iterations
783: % to \eqref{eqn:iter_thresh} works in imaging processing as shown
784: % e.g. \cite{CCS:ACHA:08, Nowak_EM,DDD:CPAM:04}.
785: % For matrix completion,
786: % since the underlying solution is highly sparse in the singular value
787: % domain, $\tau$ must be chosen to be very large to start with. Hence,
788: % \eqref{eqn:iter_thresh} is not a right choice for the matrix
789: % completion.
790: 
791: \subsection{Interpretation as a Lagrange multiplier method}
792: \label{sec:uzawa}
793: 
794: In this section, we recast the SVT algorithm as a type of Lagrange
795: multiplier algorithm known as Uzawa's algorithm. An important
796: consequence is that this will allow us to extend the SVT algorithm to
797: other problems involving the minimization of the nuclear norm under
798: convex constraints, see Section \ref{sec:general}. Further, another
799: contribution of this paper is that this framework actually recasts
800: linearized Bregman iterations as a very special form of Uzawa's algorithm,
801: hence providing fresh and clear insights about
802: these %sometimes mysterious
803: iterations.
804: 
805: In what follows, we set $f_\tau(\mtx{X}) = \tau \|\mtx{X}\|_* +
806: \frac{1}{2} \|\mtx{X}\|_F^2$ for some fixed $\tau > 0$ and recall that
807: we wish to solve \eqref{eqn:minnuc+fro}
808: \[
809:   \begin{array}{ll}
810:     \textrm{minimize}   & \quad f_\tau(\mtx{X})\\
811:     \textrm{subject to} & \quad \mathcal{P}_\Omega(\mtx{X}) = \mathcal{P}_\Omega(\mtx{M}).
812:  \end{array}
813: \]
814: The Lagrangian for this problem is given by
815: \[
816: {\cal L}(\mtx{X},\mtx{Y}) = f_\tau(\mtx{X}) + \<\mtx{Y}, {\cal
817:   P}_\Omega(\mtx{M} - \mtx{X})\>,
818: \]
819: where $\mtx{Y} \in \R^{n_1 \times n_2}$.  Strong duality holds and
820: $\mtx{X}^\star$ and $\mtx{Y}^\star$ are primal-dual optimal if
821: $(\mtx{X}^\star, \mtx{Y}^\star)$ is a saddlepoint of the Lagrangian
822: ${\cal L}(\mtx{X},\mtx{Y})$, i.e.~a pair obeying
823: \begin{equation}
824: \label{eq:saddlepoint}
825: \sup_{\mtx{Y}} \inf_{\mtx{X}} {\cal L}(\mtx{X}, \mtx{Y}) = {\cal L}(\mtx{X}^\star, \mtx{Y}^\star) = \inf_{\mtx{X}} \sup_{\mtx{Y}}
826: {\cal L}(\mtx{X}, \mtx{Y}).
827: \end{equation}
828: (The function $g_0(\mtx{Y}) = \inf_{\mtx{X}} {\cal L}(\mtx{X},
829: \mtx{Y})$ is called the dual function.) Uzawa's algorithm approaches the
830: problem of finding a saddlepoint with an iterative procedure.  From
831: $\mtx{Y}^0 = \mtx{0}$, say, inductively define
832: \begin{equation}
833: \label{eq:Lag1}
834: \begin{cases}
835: {\cal L}(\mtx{X}^{k}, \mtx{Y}^{k-1})
836:  = \min_{\mtx{X}} {\cal L}(\mtx{X},\mtx{Y}^{k-1})\\  \mtx{Y}^k
837:  = \mtx{Y}^{k-1} + \delta_k \mathcal{P}_\Omega(\mtx{M} -\mtx{X}^k),
838: \end{cases}
839: \end{equation}
840: where $\{\delta_k\}_{k \ge 1}$ is a sequence of positive step
841: sizes. Uzawa's algorithm is, in fact, a subgradient method applied to
842: the dual problem, where each step moves the current iterate in the
843: direction of the gradient or of a subgradient. Indeed, observe that
844: \begin{equation}\label{eqn:partialg0}
845: \partial_{\mtx{Y}} g_0(\mtx{Y}) = \partial_{\mtx{Y}} {\cal
846:   L}(\tilde{\mtx{X}}, \mtx{Y}) =
847: \mathcal{P}_\Omega(\mtx{M}-\tilde{\mtx{X}}),
848: \end{equation}
849: where $\tilde{\mtx{X}}$ is the minimizer of the Lagrangian for that
850: value of $\mtx{Y}$ so that a gradient ascent update for $\mtx{Y}$ is
851: of the form
852: \[
853: \mtx{Y}^k = \mtx{Y}^{k-1} + \delta_k \partial_{\mtx{Y}}
854: g_0(\mtx{Y}^{k-1}) = \mtx{Y}^{k-1} + \delta_k
855: \mathcal{P}_\Omega(\mtx{M}-\mtx{X}^k).
856: \]
857: 
858: It remains to compute the minimizer of the Lagrangian \eqref{eq:Lag1},
859: and note that
860: \begin{equation}\label{eqn:argminequiv}
861: \arg \min \, f_\tau(\mtx{X}) + \<\mtx{Y}, {\cal P}_\Omega(\mtx{M} -
862: \mtx{X})\> = \arg \min \, \tau \|\mtx{X}\|_* + \frac{1}{2} \|\mtx{X} -
863: \mathcal{P}_\Omega \mtx{Y}\|^2_F.
864: \end{equation}
865: However, we know that the minimizer is given by
866: $\mathcal{D}_\tau(\mathcal{P}_\Omega(\mtx{Y}))$ and since $\mtx{Y}^{k}
867: = \mathcal{P}_\Omega(\mtx{Y}^k)$ for all $k \ge 0$, Uzawa's algorithm
868: takes the form
869: \begin{align*}
870: \begin{cases}
871:   \mtx{X}^k   = {\cal D}_{\tau} (\mtx{Y}^{k-1})\cr
872:   \mtx{Y}^k  = \mtx{Y}^{k-1} + \delta_k \mathcal{P}_\Omega(\mtx{M} - \mtx{X}^k),
873: \end{cases}
874: \end{align*}
875: which is exactly the update \eqref{eqn:iter}. This point of view
876: brings to bear many different mathematical tools for proving the
877: convergence of the singular value thresholding iterations. For an early
878: use of Uzawa's algorithm minimizing an $\ell_1$-like functional, the
879: total-variation norm, under linear inequality constraints, see
880: \cite{TVSynthesis}.
881: 
882: \section{General Formulation}
883: \label{sec:general}
884: 
885: This section presents a general formulation of the SVT algorithm for
886: approximately minimizing the nuclear norm of a matrix under convex
887: constraints.
888: 
889: \subsection{Linear equality constraints}
890: 
891: Set the objective functional $f_\tau(\mtx{X}) = \tau \|\mtx{X}\|_* +
892: \frac{1}{2}\|\mtx{X}\|_F^2$ for some fixed $\tau > 0$, and consider
893: the following optimization problem:
894: \begin{equation}
895: \label{eq:linear}
896:   \begin{array}{ll}
897:     \textrm{minimize}   & \quad f_\tau(\mtx{X})\\
898:     \textrm{subject to} & \quad \mathcal{A}(\mtx{X})  = \vct{b},
899:  \end{array}
900: \end{equation}
901: where $\mathcal{A}$ is a linear transformation mapping $n_1 \times
902: n_2$ matrices into $\R^m$ ($\mathcal{A}^*$ is the adjoint of
903: $\mathcal{A}$). This more general formulation is considered in
904: \cite{CR:XXX:08} and \cite{Recht07} as an extension of the matrix
905: completion problem.  Then the Lagrangian for this problem is of the
906: form
907: \begin{equation}
908:   \label{eq:Lagrangian1}
909:   {\cal L}(\mtx{X}, \vct{y}) = f_\tau(\mtx{X}) + \< \vct{y}, \vct{b} -
910:   \mathcal{A}(\mtx{X})\>,
911: \end{equation}
912: where $\mtx{X} \in \R^{n_1 \times n_2}$ and $\vct{y} \in \R^m$, and
913: starting with $\vct{y}^0 = \vct{0}$, Uzawa's iteration is given by
914: \begin{equation}
915: \label{eqn:itergeneral}
916: \begin{cases}
917:   \mymathbf{X}^{k} =
918:   \mathcal{D}_{\tau}(\mathcal{A}^*(\vct{y}^{k-1})),\cr \vct{y}^{k} =
919:   \vct{y}^{k-1} + \delta_k (\vct{b} - \mathcal{A}(\mtx{X}^k)).
920: \end{cases}
921: \end{equation}
922: The iteration \eqref{eqn:itergeneral} is of course the same as
923: \eqref{eqn:iter} in the case where $\mathcal{A}$ is a sampling
924: operator extracting $m$ entries with indices in $\Omega$ out of an
925: $n_1 \times n_2$ matrix. To verify this claim, observe that in this
926: situation, $\mathcal{A}^* \mathcal{A} = \mathcal{P}_\Omega$, and let
927: $\mtx{M}$ be any matrix obeying $\mathcal{A}(\mtx{M}) = \vct{b}$. Then
928: defining $\mtx{Y}^k = \mathcal{A}^*(\vct{y}^{k})$ and substituting
929: this expression in \eqref{eqn:itergeneral} gives \eqref{eqn:iter}.
930: 
931: \subsection{General convex constraints}
932: 
933: One can also adapt the algorithm to handle general convex
934: constraints. Suppose we wish to minimize $f_\tau(\mtx{X})$ defined as
935: before over a convex set $\mtx{X} \in \mathcal{C}$. To simplify, we
936: will assume that this convex set is given by
937: \[
938: \mathcal{C} = \{\mtx{X} : f_i(\mtx{X}) \le 0, \, \forall i = 1,
939: \ldots, m\},
940: \]
941: where the $f_i$'s are convex functionals (note that one can handle
942: linear equality constraints by considering pairs of affine
943: functionals).  The problem of interest is then of the form
944: \begin{equation}
945: \label{eq:convex}
946:   \begin{array}{ll}
947:     \textrm{minimize}   & \quad f_\tau(\mtx{X})\\
948:     \textrm{subject to} & \quad f_i(\mtx{X}) \le 0, \quad i = 1, \ldots, m.
949:  \end{array}
950: \end{equation}
951: Just as before, it is intuitive that as $\tau \to \infty$, the
952: solution to this problem converges to a minimizer of the nuclear norm
953: under the same constraints \eqref{eqn:nuc_norm} as shown in Theorem
954: \ref{thm:largemu2} at the end of this section.
955: 
956: Put $\mathcal{F}(\mtx{X}) := (f_1(\mtx{X}), \ldots, f_m(\mtx{X}))$
957: for short.  Then the Lagrangian for \eqref{eq:convex} is equal to
958: \[
959: {\cal L}(\mtx{X}, \vct{y}) = f_\tau(\mtx{X}) + \< \vct{y},
960: \mathcal{F}(\mtx{X})\>,
961: \]
962: where $\mtx{X} \in \R^{n_1 \times n_2}$ and $\vct{y} \in \R^m$ is now
963: a vector with nonnegative components denoted, as usual, by $\vct{y}
964: \ge \vct{0}$. One can apply Uzawa's method just as before with the
965: only modification that we will use a subgradient method with
966: projection to maximize the dual function since we need to make sure
967: that the successive updates $\vct{y}^k$ belong to the nonnegative
968: orthant. This gives
969: \begin{equation}
970: \label{eqn:itergeneral2}
971: \begin{cases}
972:   \mymathbf{X}^{k} = \arg \min \, \{f_\tau(\mtx{X}) + \<\vct{y}^{k-1},  \mathcal{F}(\mtx{X})\>\}, \cr
973:   \vct{y}^{k} =
974:   [\vct{y}^{k-1} + \delta_k \mathcal{F}(\mtx{X}^k) ]_+.
975: \end{cases}
976: \end{equation}
977: Above, $[\vct{x}]_+$ is of course the vector with entries equal to
978: $\max(x_i,0)$.  When $\mathcal{F}$ is an affine mapping of the form
979: $\vct{b} - \cA(\mtx{X})$ so that one solves
980: \[
981:   \begin{array}{ll}
982:     \textrm{minimize}   & \quad f_\tau(\mtx{X})\\
983:     \textrm{subject to} & \quad \cA(\mtx{X}) \ge \vct{b},
984:  \end{array}
985: \]
986:  this simplifies to
987: \begin{equation}
988: \label{eqn:itergeneral3}
989: \begin{cases}
990:   \mymathbf{X}^{k} = \mathcal{D}_\tau(\mathcal{A}^*(\vct{y}^{k-1})), \cr
991:   \vct{y}^{k} =
992:   [\vct{y}^{k-1} + \delta_k (\vct{b} - \mathcal{A}(\mtx{X}^k))]_+,
993: \end{cases}
994: \end{equation}
995: and thus the extension to linear inequality constraints is
996: straightforward.
997: 
998: \subsection{Example}
999: \label{sec:Dantzig}
1000: 
1001: An interesting example concerns the extension of the Dantzig selector
1002: \cite{DS} to matrix problems. Suppose we have available linear
1003: measurements about a matrix $\mtx{M}$ of interest
1004: \begin{equation}
1005:   \label{eq:noisy}
1006:   \vct{b} = \mathcal{A}(\mtx{M}) + \vct{z},
1007: \end{equation}
1008: where $\vct{z} \in \R^m$ is a noise vector. Then under these
1009: circumstances, one might want to find the matrix which minimizes the
1010: nuclear norm among all matrices which are consistent with the data
1011: $\vct{b}$. Inspired by the work on the Dantzig selector which was
1012: originally developed for estimating sparse parameter vectors from
1013: noisy data, one could approach this problem by solving
1014: \begin{equation}
1015: \label{eq:DS}
1016:   \begin{array}{ll}
1017:     \textrm{minimize}   & \quad \|\mtx{X}\|_*\\
1018:     \textrm{subject to} & \quad |\text{\bf vec}(\mathcal{A}^*(\vct{r}))| \le
1019:     \text{\bf vec}(\mtx{E}), \quad \vct{r} := \vct{b} - \mathcal{A}(\mtx{X}),
1020:  \end{array}
1021: \end{equation}
1022: where $\mtx{E}$ is an array of tolerances, which is adjusted to fit
1023: the noise statistics \cite{DS}. Above, $\text{\bf vec}(\mtx{A}) \le
1024: \text{\bf vec}(\mtx{B})$, for any two matrices $\mtx{A}$ and
1025: $\mtx{B}$, means componentwise inequalities; that is, $A_{ij} \le
1026: B_{ij}$ for all indices $i, j$. We use this notation so as not to confuse
1027: the reader with the positive semidefinite ordering.  In the case of
1028: the matrix completion problem where $\mathcal{A}$ extracts sampled
1029: entries indexed by $\Omega$, one can always see the data vector as the
1030: sampled entries of some matrix $\mtx{B}$ obeying $\cP_\Omega(\mtx{B})
1031: = \cA^*(\vct{b})$; the constraint is then natural for it may be expressed
1032: as
1033: \[
1034: |B_{ij} - X_{ij}| \le E_{ij}, \quad (i,j) \in \Omega.
1035: \]
1036: If $\vct{z}$ is white noise with standard deviation $\sigma$, one may
1037: want to use a multiple of $\sigma$ for $E_{ij}$.  In words, we are
1038: looking for a matrix with minimum nuclear norm under the constraint
1039: that all of its sampled entries do not deviate too much from what has
1040: been observed.
1041: 
1042: Let $\mtx{Y}_+ \in \R^{n_1 \times n_2}$ (resp.~$\mtx{Y}_- \in \R^{n_1
1043:   \times n_2}$) be the Lagrange multiplier associated with the
1044: componentwise linear inequality constraints $\text{\bf vec}(\mathcal{A}^*(\vct{r})) \le
1045: \text{\bf vec}(\mtx{E})$).  Then starting
1046: with $\mtx{Y}_{\pm}^0 = \mtx{0}$, the SVT iteration for this problem
1047: is of the form
1048: \begin{equation}
1049: \label{eqn:iterDS}
1050: \begin{cases}
1051:   \mymathbf{X}^{k} =
1052:   \mathcal{D}_\tau(\mathcal{A}^*\mathcal{A}(\mtx{Y}_+^{k-1}-\mtx{Y}_{-}^{k-1})),
1053:   \cr \mtx{Y}_{\pm}^k = [\mtx{Y}_{\pm}^{k-1} + \delta_k(\pm
1054:   \mathcal{A}^*(\vct{r}^k) - \mtx{E})]_+,\quad \vct{r}^k = \vct{b} -
1055:   \mathcal{A}(\mtx{X}^k),
1056: \end{cases}
1057: \end{equation}
1058: where again $[\cdot]_+$ is applied componentwise.
1059: 
1060: We conclude by noting that in the matrix completion problem where
1061: $\mathcal{A}^*\mathcal{A} = \mathcal{P}_\Omega$ and one observes $
1062: \mathcal{P}_\Omega (\mtx{B})$, one can check that this iteration
1063: simplifies to
1064: \begin{equation}
1065: \label{eqn:iterDS2}
1066: \begin{cases}
1067:   \mymathbf{X}^{k} =
1068:   \mathcal{D}_\tau(\mtx{Y}_+^{k-1}-\mtx{Y}_{-}^{k-1}), \cr
1069:   \mtx{Y}_{\pm}^k = [\mtx{Y}_{\pm}^{k-1} + \delta_k
1070:   \mathcal{P}_\Omega(\pm (\mtx{B}-\mtx{X}^k) - \mtx{E})]_+.
1071: \end{cases}
1072: \end{equation}
1073: Again, this is easy to implement and whenever the solution has low
1074: rank, the iterates $\mtx{X}^k$ have low rank as well.
1075: 
1076: \subsection{When the proximal problem gets close}
1077: 
1078: We now show that minimizing the proximal objective $f_\tau(\mtx{X}) =
1079: \tau \|\mtx{X}\|_* + \frac{1}{2} \|\mtx{X}\|_F^2$ is the same as
1080: minimizing the nuclear norm in the limit of large $\tau$'s. The
1081: theorem below is general and covers the special case of linear
1082: equality constraints as in \eqref{eqn:minnuc+fro}.
1083: 
1084: \begin{theorem}\label{thm:largemu2}
1085:   Let $\mtx{X}_{\tau}^{\star}$ be the solution to \eqref{eq:convex}
1086:   and $\mtx{X}_\infty$ be the minimum Frobenius-norm solution to
1087:   \eqref{eqn:nuc_norm} defined as
1088:   \begin{equation}\label{eqn:minfro2}
1089:     \mtx{X}_\infty:= \arg \min_{\mtx{X}}\{\|\mtx{X}\|_F^2~:~\mtx{X}\text{ is a solution of \eqref{eqn:nuc_norm}}\}.
1090:   \end{equation}
1091:   Assume that the $f_i(\mtx{X})$'s, $1 \le i \le m$, are convex and
1092:   lower semi-continuous. Then
1093: \begin{equation}\label{eqn:limitXmustar2}
1094:   \lim_{\tau\to\infty}\|\mtx{X}_{\tau}^{\star}-\mtx{X}_\infty\|_F=0.
1095: \end{equation}
1096: \end{theorem}
1097: \begin{proof}
1098: It follows from the definition of $\mtx{X}_{\tau}^{\star}$ and
1099: $\mtx{X}_\infty$ that
1100: \begin{equation}\label{eqn:XmustarX0}
1101:   \|\mtx{X}_{\tau}^{\star}\|_*+\frac{1}{2\tau}\|\mtx{X}_{\tau}^{\star}\|_F^2\leq
1102:   \|\mtx{X}_{\infty}\|_*+\frac{1}{2\tau}\|\mtx{X}_{\infty}\|_F^2,\quad \text{ and } \quad
1103:   \|\mtx{X}_{\infty}\|_*\leq\|\mtx{X}_{\tau}^{\star}\|_*.
1104: \end{equation}
1105: Summing these two inequalities gives
1106: \begin{equation}\label{eqn:boundedXmustar}
1107: \|\mtx{X}_{\tau}^{\star}\|_F^2\leq\|\mtx{X}_{\infty}\|_F^2,
1108: \end{equation}
1109: which implies that $\|\mtx{X}_{\tau}^{\star}\|_F^2$ is bounded
1110: uniformly in $\tau$.  Thus, we would prove the theorem if we could
1111: establish that any convergent subsequence
1112: $\{\mtx{X}^{\star}_{\tau_k}\}_{k \ge 1}$ must converge to
1113: $\mtx{X}_\infty$.
1114: 
1115: Consider an arbitrary converging subsequence
1116: $\{\mtx{X}^{\star}_{\tau_k}\}$ and set $\mtx{X}_c := \lim_{k \goto
1117:   \infty} \mtx{X}^{\star}_{\tau_k}$. Since for each $1 \le i \le m$,
1118: $f_i(\mtx{X}^{\star}_{\tau_k}) \le 0$ and $f_i$ is lower
1119: semi-continuous, $\mtx{X}_c$ obeys
1120: \begin{equation}\label{eqn:constaintXc2}
1121: f_i(\mtx{X}_c)\le 0, \quad i=1,\ldots,m.
1122: \end{equation}
1123: Furthermore, since $\|\mtx{X}_{\tau}^{\star}\|_F^2$ is bounded,
1124: \eqref{eqn:XmustarX0} yields
1125: $$
1126: \limsup_{\tau\to\infty}\|\mtx{X}_{\tau}^{\star}\|_*\leq\|\mtx{X}_{\infty}\|_*,
1127: \quad
1128: \|\mtx{X}_{\infty}\|_*\leq\liminf_{\tau\to\infty}\|\mtx{X}_{\tau}^{\star}\|_*.
1129: $$
1130: An immediate consequence is
1131: $\lim_{\tau\to\infty}\|\mtx{X}_{\tau}^{\star}\|_*=\|\mtx{X}_{\infty}\|_*$
1132: and, therefore, $\|\mtx{X}_{c}\|_*=\|\mtx{X}_{\infty}\|_*$. This shows
1133: that $\mtx{X}_c$ is a solution to \eqref{eqn:nuc_norm}. Now it follows from
1134: the definition of $\mtx{X}_\infty$ that $\|\mtx{X}_c\|_F \geq
1135: \|\mtx{X}_{\infty}\|_F$, while we also have $\|\mtx{X}_c\|_F \leq
1136: \|\mtx{X}_{\infty}\|_F$ because of \eqref{eqn:boundedXmustar}.  We
1137: conclude that $\|\mtx{X}_c\|_F =\|\mtx{X}_{\infty}\|_F$ and thus
1138: $\mtx{X}_c=\mtx{X}_{\infty}$ since $\mtx{X}_{\infty}$ is unique.
1139: \end{proof}
1140: 
1141: 
1142: \section{Convergence Analysis}
1143: \label{sec:conv}
1144: 
1145: This section establishes the convergence of the SVT iterations.  We
1146: begin with the simpler proof of the convergence of \eqref{eqn:iter} in
1147: the special case of the matrix completion problem, and then present
1148: the argument for the more general constraints
1149: \eqref{eqn:itergeneral2}. We hope that this progression will make the
1150: second and more general proof more transparent.
1151: 
1152: 
1153: \subsection{Convergence for matrix completion}
1154: 
1155: We begin by recording a lemma which establishes the strong convexity
1156: of the objective $f_\tau$.
1157: \begin{lemma}
1158:   \label{lem:alpha}
1159:   Let $\mtx{Z} \in \partial f_\tau(\mtx{X})$ and $\mtx{Z}' \in \partial
1160:   f_\tau(\mtx{X}')$. Then
1161: \begin{equation}
1162: \label{eq:alpha}
1163: \<\mtx{Z} - \mtx{Z}', \mtx{X} - \mtx{X}'\> \ge \|\mtx{X} - \mtx{X}'\|_F^2.
1164: \end{equation}
1165: \end{lemma}
1166: \begin{proof}
1167:   An element $\mtx{Z}$ of $\partial f_\tau(\mtx{X})$ is of the form
1168:   $\mtx{Z} = \tau \mtx{Z}_0 + \mtx{X}$, where $\mtx{Z}_0 \in \partial
1169:   \|\mtx{X}\|_*$, and similarly for $\mtx{Z}'$. This gives
1170: \[
1171: \<\mtx{Z} - \mtx{Z}', \mtx{X} - \mtx{X}'\> = \tau \, \<\mtx{Z}_0 - \mtx{Z}_0', \mtx{X} - \mtx{X}'\> + \|\mtx{X}-\mtx{X}'\|_F^2
1172: \]
1173: and it thus suffices to show that the first term of the right-hand
1174: side is nonnegative. From \eqref{eqn:subdiffNorm}, we have that any
1175: subgradient of the nuclear norm at $\mtx{X}$ obeys $\|\mtx{Z}_0\|_2
1176: \le 1$ and $\<\mtx{Z}_0, \mtx{X}\> = \|\mtx{X}\|_*$. In particular,
1177: this gives
1178: \begin{equation*}
1179: |\<\mtx{Z}_0, \mtx{X}'\>|  \le \|\mtx{Z}_0\|_2
1180: \|\mtx{X}'\|_* \le \|\mtx{X}'\|_*, \qquad
1181: |\<\mtx{Z}'_0, \mtx{X}\>|  \le \|\mtx{Z}'_0\|_2
1182: \|\mtx{X}\|_* \le \|\mtx{X}\|_*.
1183: \end{equation*}
1184: Whence,
1185: \begin{align*}
1186:  \<\mtx{Z}_0 - \mtx{Z}_0', \mtx{X} - \mtx{X}'\> & =  \<\mtx{Z}_0 , \mtx{X} \>  +  \<\mtx{Z}_0',\mtx{X}'\>  - \<\mtx{Z}_0, \mtx{X}'\>  -  \<\mtx{Z}_0', \mtx{X}\>\cr
1187: & =    \|\mtx{X}\|_*  +  \|\mtx{X}'\|_*   - \<\mtx{Z}_0, \mtx{X}'\>  -  \<\mtx{Z}_0', \mtx{X}\> \ge 0,
1188: \end{align*}
1189: which proves the lemma.
1190: \end{proof}
1191: 
1192: This lemma is key in showing that the SVT algorithm \eqref{eqn:iter} converges.
1193: \begin{theorem}
1194: \label{thm:converge} Suppose that the sequence of step sizes obeys
1195: $0 < \inf \delta_k \le \sup \delta_k < 2$. Then the sequence
1196: $\{\mtx{X}^k\}$ obtained via \eqref{eqn:iter} converges to the
1197: unique solution of \eqref{eqn:minnuc+fro}.
1198: \end{theorem}
1199: \begin{proof}
1200:   Let $(\mtx{X}^\star,\mtx{Y}^\star)$ be primal-dual optimal for the
1201:   problem \eqref{eqn:minnuc+fro}. % Then
1202:  % $\mtx{Y}^{\star}=\mathcal{P}_{\Omega}\mtx{Y}^{\star}$. Moreover,
1203:  % since $\mtx{Y}^0=\mtx{0}$, by induction, we have
1204:  % $\mtx{Y}^{k}=\mathcal{P}_{\Omega}\mtx{Y}^{k}$ for all $k$.
1205:   The optimality conditions give
1206:   \begin{align*}
1207:     \mymathbf{0} & = \mtx{Z}^k - \mathcal{P}_{\Omega}(\mtx{Y}^{k-1})\cr
1208:     \mymathbf{0} & = \mtx{Z}^\star -  \mathcal{P}_{\Omega}(\mtx{Y}^\star),
1209:   \end{align*}
1210:   for some $\mtx{Z}^k \in \partial f_\tau(\mtx{X}^k)$ and some
1211:   $\mtx{Z}^\star \in \partial f_\tau(\mtx{X}^\star)$. We then deduce that
1212: \[
1213: (\mtx{Z}^k - \mtx{Z}^\star) - \mathcal{P}_{\Omega}(\mtx{Y}^{k-1} - \mtx{Y}^\star) = \mymathbf{0}
1214: \]
1215: and, therefore, it follows from Lemma \ref{lem:alpha} that
1216: \begin{equation}
1217: \label{eq:crucial2}
1218: \<\mtx{X}^k - \mtx{X}^\star, \mathcal{P}_{\Omega}(\mtx{Y}^{k-1} - \mtx{Y}^\star)\> =
1219: \<\mtx{Z}^k - \mtx{Z}^\star, \mtx{X}^k - \mtx{X}^\star\> \ge \|\mtx{X}^k - \mtx{X}^\star\|_F^2.
1220: \end{equation}
1221: We continue and observe that because $\cP_{\Omega}\mtx{X}^\star = \cP_{\Omega}\mtx{M}$,
1222: \[
1223: \|\mathcal{P}_{\Omega}(\mtx{Y}^{k} - \mtx{Y}^\star)\|_F  =
1224: \|\mathcal{P}_{\Omega}(\mtx{Y}^{k-1} - \mtx{Y}^\star) + \delta_k \cP_{\Omega}(\mtx{X}^\star - \mtx{X}^k)\|_F.
1225: \]
1226: Therefore, setting $r_k = \|\mathcal{P}_{\Omega}(\mtx{Y}^{k} -
1227: \mtx{Y}^\star)\|_F$,
1228: \begin{align}
1229: \label{eq:stepsize}
1230:   r_k^2 & = r_{k-1}^2 - 2\delta_k \<\cP_{\Omega}(\mtx{Y}^{k-1} - \mtx{Y}^\star),\mtx{X}^k -
1231:   \mtx{X}^\star\> + \delta_k^2 \|\cP_{\Omega}(\mtx{X}^\star - \mtx{X}^k)\|_F^2\cr & \le r_{k-1}^2 -
1232:   2\delta_k \|\mtx{X}^k - \mtx{X}^\star\|_F^2 + \delta_k^2 \|\mtx{X}^k -
1233:   \mtx{X}^\star\|_F^2
1234: \end{align}
1235: since for any matrix $\mtx{X}$, $\|\cP_\Omega(\mtx{X})\|_F \le
1236: \|\mtx{X}\|_F$.  Under our assumptions about the size of $\delta_k$,
1237: we have $2\delta_k - \delta_k^2 \ge \beta$ for all $k \ge 1$ and some
1238: $\beta > 0$ and thus
1239: \begin{equation}
1240: \label{eq:crucial}
1241: r_k^2 \le r_{k-1}^2 - \beta \|\mtx{X}^k - \mtx{X}^\star\|_F^2.
1242: \end{equation}
1243: Two properties follow from this:
1244: \begin{enumerate}
1245: \item The sequence $\{\|\mathcal{P}_{\Omega}(\mtx{Y}^{k} -
1246:   \mtx{Y}^\star)\|_F\}$ is nonincreasing and, therefore, converges to
1247:   a limit.
1248: \item As a consequence, $\|\mtx{X}^k - \mtx{X}^\star\|_F^2 \goto 0$ as
1249:   $k \goto \infty$.
1250: \end{enumerate}
1251: The theorem is established.
1252: \end{proof}
1253: 
1254: \subsection{General convergence theorem}
1255: 
1256: Our second result is more general and establishes the convergence of
1257: the SVT iterations to the solution of \eqref{eq:convex} under general
1258: convex constraints. From now on, we will only assume that the
1259: function $\cF(\mtx{X})$ is Lipschitz in the sense that
1260: \begin{equation}
1261:   \label{eq:Lipschitz}
1262:   \|\cF(\mtx{X}) - \cF(\mtx{Y})\| \le L(\cF) \|\mtx{X} - \mtx{Y}\|_F,
1263: \end{equation}
1264: for some nonnegative constant $L(\cF)$.  Note that if $\cF$ is affine,
1265: $\cF(\mtx{X}) = \vct{b} - \cA(\mtx{X})$, we have $L(\cF) = \|\cA\|_2$
1266: where $\|\cA\|_2$ is the spectral norm of the linear transformation
1267: $\cA$ defined as $\|\cA\|_2:=\sup\{\|\cA(\mtx{X})\|_{\ell_2}:\|\mtx{X}\|_F=1\}$.
1268: We also recall that $\cF(\mtx{X}) =
1269: (f_1(\mtx{X}), \ldots, f_m(\mtx{X}))$ where each $f_i$ is convex, and
1270: that the Lagrangian for the problem \eqref{eq:convex} is given by
1271: \[
1272: \cL(\mtx{X},\vct{y}) = f_\tau(\mtx{X}) + \<\vct{y}, \cF(\mtx{X})\>, \quad \vct{y} \ge \vct{0}.
1273: \]
1274: We will assume to simplify that strong duality holds which is
1275: automatically true if the constraints obey constraint qualifications
1276: such as Slater's condition \cite{BoydBook}.
1277: 
1278: We first establish the following preparatory lemma.
1279: \begin{lemma}
1280:   \label{teo:proj}
1281:   Let $(\mtx{X}^\star,\vct{y}^\star)$ be a primal-dual optimal pair for
1282:   \eqref{eq:convex}. Then for each $\delta > 0$, $\vct{y}^\star$ obeys
1283:   \begin{equation}
1284:     \label{eq:proj}
1285:     \vct{y}^\star = [\vct{y}^\star + \delta \cF(\mtx{X}^\star)]_+.
1286:   \end{equation}
1287: \end{lemma}
1288: \begin{proof}
1289:   Recall that the projection $\vct{x}_0$ of a point $\vct{x}$ onto a convex
1290:   set $\mathcal{C}$ is characterized by
1291: \[
1292: \begin{cases} \vct{x}_0 \in \mathcal{C},\cr
1293: \<\vct{y}-\vct{x}_0, \vct{x} - \vct{x}_0\> \le 0, \,\, \forall \vct{y} \in \mathcal{C}.
1294: \end{cases}
1295: \]
1296: In the case where $\mathcal{C} = \R^m_+ = \{\vct{x} \in \R^m : \vct{x} \ge \vct{0}\}$, this
1297: condition becomes $\vct{x}_0 \ge \vct{0}$ and
1298: \[
1299: \<\vct{y} - \vct{x}_0, \vct{x} - \vct{x}_0\> \le 0, \,\,
1300: \forall \vct{y} \ge \vct{0}.
1301: \]
1302: 
1303: Now because $\vct{y}^\star$ is dual optimal we have
1304: \[
1305: \cL(\mtx{X}^\star,\vct{y}^\star) \ge \cL(\mtx{X}^\star,\vct{y}), \quad \forall \vct{y} \ge \vct{0}.
1306: \]
1307: Substituting the expression for the Lagrangian, this is equivalent to
1308: \[
1309: \<\vct{y} - \vct{y}^\star, \cF(\mtx{X}^\star)\> \le 0, \quad \forall \vct{y} \ge \vct{0},
1310: \]
1311: which is the same as
1312: \[
1313: \<\vct{y} - \vct{y}^\star, \vct{y}^\star + \delta \cF(\mtx{X}^\star) - \vct{y}^\star\> \le 0, \quad \forall \vct{y} \ge \vct{0}, \,\, \forall \delta > 0.
1314: \]
1315: Hence it follows that $\vct{y}^\star$ must be the projection of $\vct{y}^\star +
1316: \delta \cF(\mtx{X}^\star)$ onto the nonnegative orthant $\R^m_+$. Since
1317: the projection of an arbitrary vector $\vct{x}$ onto $\R^m_+$ is given by
1318: $\vct{x}_+$, our claim follows.
1319: \end{proof}
1320: 
1321: We are now in the position to state our general convergence result.
1322: \begin{theorem}
1323: \label{thm:converge3}
1324: Suppose that the sequence of step sizes obeys $0 < \inf \delta_k \le
1325: \sup \delta_k < 2/L(\cF)^2$, where $L(\cF)$ is the
1326: Lipschitz constant in \eqref{eq:Lipschitz}. Then assuming strong
1327: duality, the sequence $\{\mtx{X}^k\}$ obtained via
1328: \eqref{eqn:itergeneral2} converges to the unique solution of
1329: \eqref{eq:convex}.
1330: \end{theorem}
1331: \begin{proof}
1332:   Let $(\mtx{X}^\star,\vct{y}^\star)$ be primal-dual optimal for the
1333:   problem \eqref{eq:convex}. We claim that the optimality conditions
1334:   give that for all $\mtx{X}$
1335:   \begin{align}
1336:     \label{eq:subtle}
1337:     \<\mtx{Z}^k, \mtx{X} - \mtx{X}^k\>  + \<\vct{y}^{k-1}, \cF(\mtx{X}) - \cF(\mtx{X}^k)\> & \ge 0,\cr
1338:     \<\mtx{Z}^\star, \mtx{X} - \mtx{X}^\star\> +\<\vct{y}^\star, \cF(\mtx{X}) - \cF(\mtx{X}^\star)\> & \ge 0,
1339:   \end{align}
1340:   for some $\mtx{Z}^k \in \partial f_\tau(\mtx{X}^k)$ and some
1341:   $\mtx{Z}^\star \in \partial f_\tau(\mtx{X}^\star)$. We justify this
1342:   assertion by proving one of the two inequalities since the other is
1343:   exactly similar. For the first, $\mtx{X}^k$ minimizes
1344:   $\cL(\mtx{X},\vct{y}^{k-1})$ over all $\mtx{X}$ and, therefore,
1345:   there exist $\mtx{Z}^k \in \partial f_\tau(\mtx{X}^k)$ and $\mtx{Z}_i^k
1346:   \in \partial f_i(\mtx{X}^k)$, $1 \le i \le m$, such that
1347: \[
1348:   \mtx{Z}^k + \sum_{i  =  1}^m y_i^{k-1} \mtx{Z}_i^k = 0.
1349: \]
1350: Now because each $f_i$ is convex,
1351: \[
1352: f_i(\mtx{X}) - f_i(\mtx{X}^k) \ge \< \mtx{Z}_i^k, \mtx{X} - \mtx{X}^k\>
1353: \]
1354: and, therefore,
1355: \[
1356: \<\mtx{Z}^k, \mtx{X} - \mtx{X}^k\> + \sum_{i = 1}^m y_i^{k-1}
1357: (f_i(\mtx{X}) - f_i(\mtx{X}^k)) \ge \< \mtx{Z}^k + \sum_{i = 1}^m
1358: y_i^{k-1} \mtx{Z}_i^k, \mtx{X} - \mtx{X}^k\> = 0.
1359: \]
1360: This is \eqref{eq:subtle}.
1361: 
1362: Now write the first inequality in \eqref{eq:subtle} for
1363: $\mtx{X}^\star$, the second for $\mtx{X}^k$ and sum the two
1364: inequalities. This gives
1365: \[
1366: \<\mtx{Z}^k - \mtx{Z}^\star, \mtx{X}^k - \mtx{X}^\star\> + \<\vct{y}^{k-1} - \vct{y}^\star, \cF(\mtx{X}^k) -
1367: \cF(\mtx{X}^\star)\> \le 0.
1368: \]
1369: The rest of the proof is essentially the same as that of Theorem
1370: \ref{thm:converge}. It follows from Lemma \ref{lem:alpha} that
1371: \begin{equation}
1372: \label{eq:crucial3}
1373: \<\vct{y}^{k-1} - \vct{y}^\star, \cF(\mtx{X}^k) - \cF(\mtx{X}^\star)\> \le
1374: -\<\mtx{Z}^k - \mtx{Z}^\star, \mtx{X}^k - \mtx{X}^\star\> \le -\|\mtx{X}^k - \mtx{X}^\star\|_F^2.
1375: \end{equation}
1376: We continue and observe that because $\vct{y}^\star = [\vct{y}^\star + \delta_k
1377: \cF(\mtx{X}^\star)]_+$ by Lemma \ref{teo:proj}, we have
1378: \begin{align*}
1379: \|\vct{y}^{k} - \vct{y}^\star\| & = \|[\vct{y}^{k-1} + \delta_k \cF(\mtx{X}^k)]_+
1380: - [\vct{y}^\star + \delta_k \cF(\mtx{X}^\star)]_+\|\cr
1381: & \le \|\vct{y}^{k-1} - \vct{y}^\star + \delta_k (\cF(\mtx{X}^k)
1382: - \cF(\mtx{X}^\star))\|
1383: \end{align*}
1384: since the projection onto the convex set $\R^m_+$ is a contraction.
1385: Therefore,
1386: \begin{align*}
1387: \|\vct{y}^{k} - \vct{y}^\star\|^2 & \le \|\vct{y}^{k-1} -
1388: \vct{y}^\star\|^2 + 2\delta_k \, \<\vct{y}^{k-1} - \vct{y}^\star,
1389: \cF(\mtx{X}^k) - \cF(\mtx{X}^\star)\> + \delta_k^2 \|\cF(\mtx{X}^k)
1390: - \cF(\mtx{X}^\star)\|^2\cr & \le \|\vct{y}^{k-1} - \vct{y}^\star\|^2 -
1391: 2\delta_k \|\mtx{X}^k - \mtx{X}^\star\|_F^2 + \delta_k^2 L^2\,
1392: \|\mtx{X}^k - \mtx{X}^\star\|_F^2,
1393: \end{align*}
1394: where we have put $L$ instead of $L(\cF)$ for short.  Under our
1395: assumptions about the size of $\delta_k$, we have $2\delta_k -
1396: \delta_k^2 L^2\ge \beta$ for all $k \ge 1$ and some $\beta > 0$. Then
1397: \begin{equation}
1398: \label{eq:crucialno}
1399: \|\vct{y}^{k} - \vct{y}^\star\|^2 \le \|\vct{y}^{k-1} - \vct{y}^\star\|^2 - \beta \|\mtx{X}^k - \mtx{X}^\star\|_F^2,
1400: \end{equation}
1401: and the conclusion is as before.
1402: \end{proof}
1403: 
1404: The problem \eqref{eq:linear} with linear constraints can be reduced
1405: to \eqref{eq:convex} by choosing
1406: \[
1407: \cF(\mtx{X})=\left[\begin{matrix}\vct{b}\cr
1408:     -\vct{b}\end{matrix}\right]- \left[\begin{matrix}\cA\cr
1409:     -\cA\end{matrix}\right]\mtx{X},
1410: \]
1411: and we have the following corollary:
1412: \begin{corollary}
1413:   \label{thm:converge2} Suppose that the sequence of step sizes obeys
1414:   $0 < \inf \delta_k \le \sup \delta_k < 2/\|\cA\|_2^2$. Then the
1415:   sequence $\{\mtx{X}^k\}$ obtained via \eqref{eqn:itergeneral}
1416:   converges to the unique solution of \eqref{eq:linear}.
1417: \end{corollary}
1418: 
1419: Let $\|\cA\|_2 := \sup \{\|\cA(\mtx{X})\|_{\ell_2} : \|\mtx{X}\|_F = 1\}$.
1420: With $\cF(\mtx{X})$ given as above, we have $|L(\cF)|^2 =
1421: 2\|\cA\|^2_2$ and thus, Theorem \ref{thm:converge3} guarantees
1422: convergence as long as $0 < \inf \delta_k \le \sup \delta_k <
1423: 1/\|\cA\|_2^2$. However, an argument identical to the proof of Theorem
1424: \ref{thm:converge} would remove the extra factor of two. We omit the
1425: details.
1426: 
1427: \section{Implementation and Numerical Results}
1428: \label{sec:num}
1429: 
1430: This section provides implementation details of the SVT algorithm---so as
1431: to make it practically effective for matrix completion---such as the
1432: numerical evaluation of the singular value thresholding operator, the
1433: selection of the step size $\delta_k$, the selection of a stopping
1434: criterion, and so on. This section also introduces several numerical
1435: simulation results which demonstrate the performance and effectiveness
1436: of the SVT algorithm. We show that $30,000 \times 30,000$ matrices of
1437: rank 10 are recovered from just about 0.4\% of their sampled entries
1438: in a matter of a few minutes on a modest desktop computer with a 1.86
1439: GHz CPU (dual core with Matlab's multithreading option enabled) and 3
1440: GB of memory.
1441: 
1442: \subsection{Implementation details}
1443: \label{sec:implementation}
1444: 
1445: 
1446: \subsubsection{Evaluation of the singular value thresholding operator}
1447: 
1448: To apply the singular value thresholding operator at level $\tau$ to an
1449: input matrix, it suffices to know those singular values and
1450: corresponding singular vectors above the threshold $\tau$. In the
1451: matrix completion problem, the singular value thresholding operator is
1452: applied to sparse matrices $\{\mtx{Y}^k\}$ since the number of sampled
1453: entries is typically much lower than the number of entries in the
1454: unknown matrix $\mtx{M}$, and we are hence interested in numerical
1455: methods for computing the dominant singular values and singular
1456: vectors of large sparse matrices. The development of such methods is a
1457: relatively mature area in scientific computing and numerical linear
1458: algebra in particular. In fact, many high-quality packages are readily
1459: available. Our implementation uses PROPACK, see \cite{Lar:Propack} for
1460: documentation and availability. One reason for this choice is
1461: convenience: PROPACK comes in a Matlab and a Fortran version, and we
1462: find it convenient to use the well-documented Matlab version. More
1463: importantly, PROPACK uses the iterative Lanczos algorithm to compute
1464: the singular values and singular vectors directly, by using the
1465: Lanczos bidiagonalization algorithm with partial
1466: reorthogonalization. In particular, PROPACK does not compute the
1467: eigenvalues and eigenvectors of $(\mtx{Y}^k)^*\mtx{Y}^k$ and
1468: $\mtx{Y}^k(\mtx{Y}^k)^*$, or of an augmented matrix as in the Matlab
1469: built-in function `\texttt{svds}' for example. Consequently, PROPACK
1470: is an efficient---both in terms of number of flops and storage
1471: requirement---and stable package for computing the dominant singular
1472: values and singular vectors of a large sparse matrix. For information,
1473: the available documentation \cite{Lar:Propack} reports a speedup
1474: factor of about ten over Matlab's `\texttt{svds}'. Furthermore, the
1475: Fortran version of PROPACK is about 3--4 times faster than the Matlab
1476: version. Despite this significant speedup, we have only used the
1477: Matlab version but since the singular value shrinkage operator is
1478: by-and-large the dominant cost in the SVT algorithm, we expect that a
1479: Fortran implementation would run about 3 to 4 times faster.
1480: 
1481: 
1482: As for most SVD packages, though one can specify the number of
1483: singular values to compute, PROPACK cannot automatically compute only
1484: those singular values exceeding the threshold $\tau$.  One must
1485: instead specify the number $s$ of singular values ahead of time, and
1486: the software will compute the $s$ largest singular values and
1487: corresponding singular vectors. To use this package, we must then
1488: determine the number $s_k$ of singular values of $\mtx{Y}^{k-1}$ to be
1489: computed at the $k$th iteration. We use the following simple
1490: method. Let $r_{k-1}=\rank(\mtx{X}^{k-1})$ be the number of nonzero
1491: singular values of $\mtx{X}^{k-1}$ at the previous iteration. Set $s_k
1492: = r_{k-1}+1$ and compute the first $s_{k}$ singular values of
1493: $\mtx{Y}^{k-1}$. If some of the computed singular values are already
1494: smaller than $\tau$, then $s_k$ is a right choice. Otherwise,
1495: increment $s_k$ by a predefined integer $\ell$ repeatedly until some
1496: of the singular values fall below $\tau$.  In the experiments, we
1497: choose $\ell=5$.  Another rule might be to repeatedly multiply $s_k$
1498: by a positive number---e.g.~2---until our criterion is
1499: met. Incrementing $s_k$ by a fixed integer works very well in
1500: practice; in our experiments, we very rarely need more than one
1501: update.
1502: 
1503: We note that it is not necessary to rerun the Lanczos iterations for
1504: the first $s_k$ vectors since they have been already computed; only a
1505: few new singular values ($\ell$ of them) need to be numerically
1506: evaluated. This can be done by modifying the PROPACK routines. We
1507: have not yet modified PROPACK, however. Had we done so, our run times
1508: would be decreased.
1509: 
1510: 
1511: \subsubsection{Step sizes}
1512: 
1513: There is a large literature on ways of selecting a step size but for
1514: simplicity, we shall use step sizes that are independent of the
1515: iteration count; that is $\delta_k = \delta$ for $k = 1, 2, \ldots$.
1516: From Theorem \ref{thm:converge}, convergence of the iteration
1517: \eqref{eqn:iter} for the completion problem is guaranteed provided that $0 < \delta <
1518: 2$. This choice is, however, too conservative and the convergence is
1519: typically slow. In our experiments, we use instead
1520: \begin{equation}
1521:   \label{eq:heuristic}
1522:   \delta  = 1.2 \, \frac{n_1 n_2}{m},
1523: \end{equation}
1524: i.e.~$1.2$ times the undersampling ratio. We give a heuristic
1525: justification below.
1526: 
1527: Consider a fixed matrix $\mtx{A} \in \R^{n_1 \times n_2}$.  Under the
1528: assumption that the column and row spaces of $\mtx{A}$ are not well
1529: aligned with the vectors taken from the canonical basis of $\R^{n_1}$
1530: and $\R^{n_2}$ respectively---the {\em incoherence assumption} in
1531: \cite{CR:XXX:08}---then with very large probability over the choices
1532: of $\Omega$, we have
1533: \begin{equation}
1534:   \label{eq:weakRIP}
1535:   (1-\epsilon)  p \,  \|\mtx{A}\|_F^2  \le  \|\mathcal{P}_\Omega(\mtx{A})\|_F^2 \le   (1+\epsilon)  p \,  \|\mtx{A}\|_F^2, \quad p := m/(n_1n_2),
1536: \end{equation}
1537: provided that the rank of $\mtx{A}$ is not too large.  The probability
1538: model is that $\Omega$ is a set of sampled entries of cardinality $m$
1539: sampled uniformly at random so that all the choices are equally
1540: likely. In \eqref{eq:weakRIP}, we want to think of $\epsilon$ as a
1541: small constant, e.g.~smaller than 1/2. In other words, the `energy' of
1542: $\mtx{A}$ on $\Omega$ (the set of sampled entries) is just about
1543: proportional to the size of $\Omega$. The near isometry
1544: \eqref{eq:weakRIP} is a consequence of Theorem 4.1 in
1545: \cite{CR:XXX:08}, and we omit the details.
1546: 
1547: Now returning to the proof of Theorem \ref{thm:converge}, we see that a sufficient
1548: condition for the convergence of \eqref{eqn:iter} is
1549: \[
1550: \exists\beta>0,\quad -2\delta \|\mtx{X}^\star - \mtx{X}^k\|_F^2 +
1551: \delta^2 \|\cP_{\Omega}(\mtx{X}^\star -
1552: \mtx{X}^k)\|_F^2\leq-\beta\|\mtx{X}^\star - \mtx{X}^k\|_F^2,
1553: \]
1554: compare \eqref{eq:crucial}, which is equivalent to
1555: \[
1556: 0<\delta<2\frac{\|\mtx{X}^\star - \mtx{X}^k\|_F^2}
1557: {\|\cP_{\Omega}(\mtx{X}^\star - \mtx{X}^k)\|_F^2}.
1558: \]
1559: Since $\|\cP_\Omega(\mtx{X})\|_F \le \|\mtx{X}\|_F$ for any matrix
1560: $\mtx{X} \in \R^{n_1 \times n_2}$, it is safe to select $\delta <
1561: 2$. But suppose that we could apply \eqref{eq:weakRIP} to the matrix
1562: $\mtx{A} = \mtx{X}^\star - \mtx{X}^k$. Then we could take $\delta$
1563: inversely proportional to $p$; e.g.~with $\epsilon = 1/4$, we could
1564: take $\delta \le 1.6 p^{-1}$. Below, we shall use the value $\delta =
1565: 1.2 p^{-1}$ which allows us to take large steps and still provides
1566: convergence, at least empirically.
1567: 
1568: The reason why this is not a rigorous argument is that
1569: \eqref{eq:weakRIP} cannot be applied to $\mtx{A} = \mtx{X}^\star -
1570: \mtx{X}^k$ even though this matrix difference may obey the incoherence
1571: assumption.  The issue here is that $\mtx{X}^\star - \mtx{X}^k$ is not
1572: a fixed matrix, but rather depends on $\Omega$ since the iterates
1573: $\{\mtx{X}^k\}$ are computed with the knowledge of the sampled set.
1574: 
1575: \subsubsection{Initial steps}
1576: 
1577: The SVT algorithm starts with $\mtx{Y}^0=\mtx{0}$, and we want to
1578: choose a large $\tau$ to make sure that the solution of
1579: \eqref{eqn:minnuc+fro} is close enough to a solution of
1580: \eqref{eqn:min}. Define $k_0$ as that integer obeying
1581: \begin{equation}\label{eqn:k0}
1582: \frac{\tau}{\delta\|\mathcal{P}_{\Omega}(\mtx{M})\|_2} \in (k_0-1,
1583: k_0].
1584: \end{equation}
1585: Since $\mtx{Y}^0=\mtx{0}$, it is not difficult to see that
1586: \[
1587: \mtx{X}^k = \mtx{0}, \quad \mtx{Y}^k= k\delta\,
1588: \mathcal{P}_{\Omega}(\mtx{M}), \quad k = 1, \ldots, k_0.
1589: \]
1590: To save work, we may simply skip the computations of
1591: $\mtx{X}^1,\ldots,\mtx{X}^{k_0}$, and start the iteration by computing
1592: $\mtx{X}^{k_0+1}$ from $\mtx{Y}^{k_0}$.
1593: 
1594: This strategy is a special case of a {\em kicking device} introduced
1595: in \cite{ODY:XXX:08}; the main idea of such a kicking scheme is that
1596: one can `jump over' a few steps whenever possible. Just like in the
1597: aforementioned reference, we can develop similar kicking strategies
1598: here as well.  Because in our numerical experiments the kicking is
1599: rarely triggered, we forgo the description of such strategies.
1600: 
1601: 
1602: \subsubsection{Stopping criteria}
1603: 
1604: Here, we discuss stopping criteria for the sequence of SVT iterations
1605: \eqref{eqn:iter}, and present two possibilities.
1606: 
1607: The first is motivated by the first-order optimality conditions or KKT
1608: conditions tailored to the minimization problem
1609: \eqref{eqn:minnuc+fro}. By \eqref{eqn:argminequiv} and letting
1610: $\partial_{\mtx{Y}} g_0(\mtx{Y})=\mtx{0}$ in \eqref{eqn:partialg0}, we
1611: see that the solution $\mtx{X}^\star_{\tau}$ to \eqref{eqn:minnuc+fro}
1612: must also verify
1613: \begin{equation}\label{eqn:KKT}
1614: \begin{cases}
1615: \mtx{X}=\mathcal{D}_{\tau}(\mtx{Y}),\cr
1616: \mathcal{P}_{\Omega}(\mtx{X}-\mtx{M})=\mtx{0},
1617: \end{cases}
1618: \end{equation}
1619: where $\mtx{Y}$ is a matrix vanishing outside of $\Omega$.
1620: Therefore, to make sure that $\mtx{X}^k$ is close to
1621: $\mtx{X}^{\star}_\tau$, it is sufficient to check how close
1622: $(\mtx{X}^k,\mtx{Y}^{k-1})$ is to obeying \eqref{eqn:KKT}. By
1623: definition, the first equation in \eqref{eqn:KKT} is always
1624: true. Therefore, it is natural to stop \eqref{eqn:iter} when the error
1625: in the second equation is below a specified tolerance. We suggest
1626: stopping the algorithm when
1627: \begin{equation}\label{eqn:stop0}
1628: \frac{\|\mathcal{P}_{\Omega}(\mtx{X}^k-\mtx{M})\|_F}{\|\mathcal{P}_{\Omega}(\mtx{M})\|_F}\leq\epsilon,
1629: \end{equation}
1630: where $\epsilon$ is a fixed tolerance, e.g.~$10^{-4}$. We provide a
1631: short heuristic argument justifying this choice below.
1632: 
1633: In the matrix completion problem, we know that under suitable
1634: assumptions
1635: \[
1636: \|\mathcal{P}_\Omega(\mtx{M})\|_F^2 \asymp p \, \|\mtx{M}\|_F^2,
1637: \]
1638: which is just \eqref{eq:weakRIP} applied to the fixed matrix $\mtx{M}$
1639: (the symbol $\asymp$ here means that there is a constant $\epsilon$ as
1640: in \eqref{eq:weakRIP}). Suppose we could also apply \eqref{eq:weakRIP}
1641: to the matrix $\mtx{X}^k - \mtx{M}$ (which we rigorously cannot since
1642: $\mtx{X}^k$ depends on $\Omega$), then we would have
1643: \begin{equation}\label{eq:xk-m}
1644: \|\mathcal{P}_\Omega(\mtx{X}^k - \mtx{M})\|_F^2 \asymp p \,
1645: \|\mtx{X}^k - \mtx{M}\|_F^2,
1646: \end{equation}
1647: and thus
1648: \[
1649: \frac{\|\mathcal{P}_{\Omega}(\mtx{X}^k-\mtx{M})\|_F}{\|\mathcal{P}_{\Omega}(\mtx{M})\|_F}\asymp
1650: \frac{\|\mtx{X}^k - \mtx{M}\|_F}{\|\mtx{M}\|_F}.
1651: \]
1652: In words, one would control the relative reconstruction error by
1653: controlling the relative error on the set of sampled locations.
1654: 
1655: A second stopping criterion comes from duality theory. Firstly, the
1656: iterates $\mtx{X}^k$ are generally not feasible for
1657: \eqref{eqn:minnuc+fro} although they become asymptotically
1658: feasible. One can construct a feasible point from $\mtx{X}^k$ by
1659: projecting it onto the affine space $\{\mtx{X} :
1660: \mathcal{P}_{\Omega}(\mtx{X}) = \mathcal{P}_{\Omega}(\mtx{M})\}$ as
1661: follows:
1662: \[
1663: \tilde{\mtx{X}}^k = \mtx{X}^k +  \mathcal{P}_{\Omega}(\mtx{M}-\mtx{X}^k).
1664: \]
1665: As usual let $f_\tau(\mtx{X}) = \tau \|\mtx{X}\|_* +\frac{1}{2}
1666: \|\mtx{X}\|_F^2$ and denote by $p^\star$ the optimal value of
1667: \eqref{eqn:minnuc+fro}. Since $\tilde{\mtx{X}}^k$ is feasible, we
1668: have
1669: \[
1670: p^\star \le f_\tau(\tilde{\mtx{X}}^k) := b_k.
1671: \]
1672: Secondly, using the notations of Section \ref{sec:uzawa},
1673: duality theory gives that
1674: \[
1675: a_k := g_0(\mtx{Y}^{k-1}) = {\cal L}(\mtx{X}^k,\mtx{Y}^{k-1}) \le p^\star.
1676: \]
1677: Therefore, $b_k - a_k$ is an upper bound on the duality gap and one
1678: can stop the algorithm when this quantity falls below a given
1679: tolerance.
1680: 
1681: For very large problems in which one holds $\mtx{X}^k$ in reduced SVD
1682: form, one may not want to compute the projection $\tilde{\mtx{X}}^k$
1683: since this matrix would not have low rank and would require
1684: significant storage space (presumably, one would not want to spend
1685: much time computing this projection either). Hence, the second method
1686: only makes practical sense when the dimensions are not prohibitively
1687: large, or when the iterates do not have low rank.
1688: 
1689: % The above theorem also gives us a posterior criteria for the choice
1690: % of $\tau$. By Theorem \ref{thm:largemu}, we have to choose a large
1691: % $\tau$ in order that we obtain a solution of \eqref{eqn:min}.
1692: % However, larger $\tau$ means slower convergence of \eqref{eqn:iter}.
1693: % We have to balance the convergence speed and the relative error of
1694: % the recovered matrix. By Theorem \ref{thm:stop}, if we choose a
1695: % moderate large $\tau$ such that the rank of $\mtx{X}^k$ is in
1696: % control, then, with high probability, the relative error of the
1697: % recovered matrix is controled by $\epsilon$.
1698: 
1699: 
1700: % \subsubsection{Noisy data}
1701: 
1702: % The SVT algorithm can be adapted to the case when there contains
1703: % noise in the sampled entries. Though the theory for matrix
1704: % completion under noise is not established, we take this adventure by
1705: % numerical simulations. We assume that the observed data is given by
1706: % $$
1707: % \widetilde{M}_{ij}=M_{ij}+Z_{ij}, \qquad (i,j)\in\Omega,
1708: % $$
1709: % where $\mtx{Z}$ is a zero-mean Gaussian noise of variance $\sigma$.
1710: % Therefore, there exists a constant $C_3$, with high probability,
1711: % \begin{equation}\label{eqn:noisyM}
1712: % \|\mathcal{P}_{\Omega}\widetilde{\mtx{M}}-\mathcal{P}_{\Omega}\mtx{M}\|_F^2\leq
1713: % C_3m\sigma^2.
1714: % \end{equation}
1715: % We still use the SVT iteration \eqref{eqn:iter}, and replace the
1716: % sampled data $\mathcal{P}_{\Omega}\mtx{M}$ in \eqref{eqn:iter} by
1717: % the observed noisy data $\mathcal{P}_{\Omega}\widetilde{\mtx{M}}$.
1718: % We stop the iteration of the SVT algorithm early, e.g.,
1719: % \begin{equation}\label{eqn:stop1}
1720: % \|\mathcal{P}_{\Omega}\mtx{X}^{k}-\mathcal{P}_{\Omega}\widetilde{\mtx{M}}\|_F^2
1721: % \leq m\sigma^2.
1722: % \end{equation}
1723: % We set $\mtx{X}^k$ as an approximation of $\mtx{M}$. In the
1724: % following, we prove that $\mtx{X}^k$ is a good approximation of
1725: % $\mtx{M}$ in the sense that the relative error between $\mtx{X}^k$
1726: % and $\mtx{M}$ is bounded by a multiple of the noise level, i.e.,
1727: % there exists a constant $C_4$ such that
1728: % \begin{equation}\label{eqn:error}
1729: % \|\mtx{X}^{k}-\mtx{M}\|_F^2\leq C_4p^{-1}m\sigma^2
1730: % \end{equation}
1731: % under suitable assumptions. By the proof of Theorem \ref{thm:stop},
1732: % there exists a constant $C_2$ such that
1733: % $C_2p^{-1}\|\mathcal{P}_{\Omega}\mtx{M}\|_F^2\leq\|\mtx{M}\|_F^2$
1734: % with high probability. Similarly, there exists a constant $C_7$ such
1735: % that $\|\mtx{M}\|_F^2\leq
1736: % C_7p^{-1}\|\mathcal{P}_{\Omega}\mtx{M}\|_F^2$ with high probability.
1737: % Therefore, \eqref{eqn:noisyM} and \eqref{eqn:error} imply that the
1738: % relative error of between $\mtx{X}^k$ and $\mtx{M}$ is proportional
1739: % to the relative error between $\widetilde{\mtx{M}}$ and $\mtx{M}$.
1740: 
1741: 
1742: 
1743: \subsubsection{Algorithm}
1744: 
1745: We conclude this section by summarizing the implementation details and
1746: giving the SVT algorithm for matrix completion below (Algorithm
1747: \ref{alg:SVT}). Of course, one would obtain  a very similar
1748: structure for the more general problems of the form \eqref{eq:linear}
1749: and \eqref{eq:convex} with linear inequality constraints.  For
1750: convenience, define for each nonnegative integer $s \le
1751: \min\{n_1,n_2\}$,
1752: \[
1753: [\mtx{U}^k,\mtx{\Sigma}^k,\mtx{V}^k]_s, \quad k = 1, 2, \ldots,
1754: \]
1755: where $\mtx{U}^{k} = [\vct{u}_1^{k}, \ldots, \vct{u}_{s}^{k}]$ and  $\mtx{V}^{k} =
1756: [\vct{v}_1^{k}, \ldots, \vct{v}_s^{k}]$ are the first $s$ singular vectors
1757: of the matrix $\mtx{Y}^{k}$, and $\mtx{\Sigma}^{k}$ is a diagonal
1758: matrix with the first $s$ singular values $\sigma^{k}_1, \ldots,
1759: \sigma_s^{k}$ on the diagonal.
1760: 
1761: \begin{algorithm}[htb]
1762: \caption{Singular Value Thresholding (SVT) Algorithm}{}
1763: \label{alg:SVT} \centering \fbox{
1764: \begin{minipage}{.9\textwidth}
1765:   \vspace{4pt} \alginout{sampled set $\Omega$ and sampled entries
1766:     $\mathcal{P}_{\Omega}(\mtx{M})$, step size $\delta$, tolerance
1767:     $\epsilon$, parameter $\tau$, increment $\ell$, and maximum
1768:     iteration count $k_{\max}$}  {$\mtx{X}^{\mathrm{opt}}$}
1769:   \algdescript{Recover a low-rank matrix $\mtx{M}$ from a subset of
1770:     sampled entries}  \vspace{6pt}%\hrule\vspace{6pt}
1771: 
1772: \begin{algtab}
1773:   Set $\mtx{Y}^0 = k_0\delta \, \mathcal{P}_{\Omega}(\mtx{M})$ ($k_0$ is defined in \eqref{eqn:k0}) \\
1774:   Set $r_0=0$\\
1775:   \algforto{$k=1$}{$k_{\max}$}
1776:   Set $s_k=r_{k-1}+1$\\
1777:   \algrepeat
1778:   Compute $[\mtx{U}^{k-1},\mtx{\Sigma}^{k-1},\mtx{V}^{k-1}]_{s_k}$\\
1779:  %\algwhile{$\sigma_{s_k}^{k-1}>\tau$}
1780:   Set $s_k = s_k + \ell$\\
1781:   %Compute $[\mtx{U}^{k-1},\mtx{\Sigma}^{k-1},\mtx{V}^{k-1}]_{s_k}$\\
1782:  \alguntil{$\sigma_{s_k-\ell}^{k-1}\le\tau$}
1783:  %\algend {\bf end} {\em  while}\\
1784:  Set $r_{k} = \max\{j : \sigma^{k-1}_j > \tau\}$\\
1785:  Set $\mtx{X}^k = \sum_{j = 1}^{r_k}
1786:   (\sigma_j^{k-1}-\tau) \vct{u}_j^{k-1} (\vct{v}^{k-1}_j)^*$\\
1787: 
1788:  \algifthen{$\|\mathcal{P}_{\Omega}(\mtx{X}^k-\mtx{M})\|_F/\|\mathcal{P}_{\Omega}\mtx{M}\|_F
1789:            \leq\epsilon$}{{\bf break}}
1790: 
1791:   Set
1792:     $Y^{k}_{ij}=\begin{cases} 0&\mbox{if }(i,j)\not\in\Omega,\cr
1793:       Y^{k-1}_{ij}+\delta(M_{ij}-X_{ij}^k)&\mbox{if }(i,j)\in\Omega \end{cases}
1794:     $ \\
1795:  \algend {\bf end} {\em  for $k$} \\
1796:  Set $\mtx{X}^{\mathrm{opt}}= \mtx{X}^k$\\
1797:  \algend
1798: \end{algtab}
1799: \end{minipage}}
1800: \end{algorithm}
1801: 
1802: 
1803: %\begin{algorithm}[Singular Value Thresholding (SVT) Algorithm]\label{alg:SVT}{~}
1804: %\begin{enumerate}
1805: %\item Determine $\delta$, $\epsilon$, $\tau$. Compute $j$ according to
1806: %  \eqref{eqn:j}.  Set
1807: %  $\mtx{Y}^0=(j+1)\delta\mathcal{P}_{\Omega}\mtx{M}$, and $k=0$.
1808: %\item Iterate as follows.
1809: %\begin{enumerate}
1810: %\item
1811: % Set $k=k+1$.
1812: %\item\label{stepSVD} Compute the first $s_k$ singular values
1813: %  $\sigma_1^{k-1},\ldots,\sigma_{s_k}^{k-1}$ of $\mtx{Y}^{k-1}$ and
1814: %  its corresponding singular vectors
1815: %  $\vct{u}^{k-1}_1,\ldots,\vct{u}^{k-1}_{s_k}$ and
1816: %  $\vct{v}^{k-1}_1,\ldots,\vct{v}^{k-1}_{s_k}$.  Here $s_k$ is the
1817: %  minimum integer such that $\sigma_{s_k}^{k-1}<\tau$.
1818: %\item Compute explicitly the entries of $\mtx{X}^k$ on $\Omega$ by
1819: % $$X^{k}_{ij}=\sum_{\ell=1}^{s_k-1}
1820: % (\sigma_{\ell}^{k-1}-\tau)u_{\ell,i}^{k-1}v_{\ell,j}^{k-1},
1821: % \quad\forall(i,j)\in\Omega,$$ where $u_{\ell,i}^{k-1}$ is the $i$th
1822: % entry of $\vct{u}_{\ell}^{k-1}$, and similarly for
1823: % $v_{\ell,j}^{k-1}$.
1824: %\item
1825: % If
1826: % $$
1827: % \|\mathcal{P}_{\Omega}(\mtx{X}^k-\mtx{M})\|_F/\|\mathcal{P}_{\Omega}\mtx{M}\|_F\leq\epsilon,
1828: % $$
1829: % then stop.
1830: %\item
1831: % Set
1832: % $$
1833: % Y^{k}_{ij}=\begin{cases}
1834: % 0&\mbox{if }(i,j)\not\in\Omega,\cr
1835: % Y^{k-1}_{ij}+\delta(M_{ij}-X_{ij}^k)&\mbox{if }(i,j)\in\Omega.
1836: % \end{cases}
1837: % $$
1838: %\end{enumerate}
1839: %\item Output $\mtx{X}^{\mathrm{opt}}=\mtx{X}^k$ in the form of its
1840: %  reduced SVD as
1841: % $$
1842: % \mtx{X}^{\mathrm{opt}}=\sum_{\ell=1}^{s_k-1}\sigma_{\ell}^{k-1}\vct{u}_\ell^{k-1}(\vct{v}_{\ell}^{k-1})^{*}.
1843: % $$
1844: %\end{enumerate}
1845: %\end{algorithm}
1846: 
1847: 
1848: \subsection{Numerical results}
1849: \label{sec:results}
1850: 
1851: \subsubsection{Linear equality constraints} 
1852: 
1853: Our implementation is in Matlab and all the computational results we
1854: are about to report were obtained on a desktop computer with a 1.86
1855: GHz CPU (dual core with Matlab's multithreading option enabled) and 3
1856: GB of memory. In our simulations, we generate $n \times n$ matrices of
1857: rank $r$ by sampling two $n\times r$ factors $\mtx{M}_L$ and
1858: $\mtx{M}_R$ independently, each having i.i.d.~Gaussian entries, and
1859: setting $\mtx{M}=\mtx{M}_L\mtx{M}_R^*$ as suggested in
1860: \cite{CR:XXX:08}. The set of observed entries $\Omega$ is sampled
1861: uniformly at random among all sets of cardinality $m$.
1862: 
1863: The recovery is
1864: performed via the SVT algorithm (Algorithm \ref{alg:SVT}), and we use
1865: \begin{equation}\label{eqn:stop}
1866: \|\mathcal{P}_{\Omega}(\mtx{X}^k-\mtx{M})\|_F/
1867: \|\mathcal{P}_{\Omega}\mtx{M}\|_F<10^{-4}
1868: \end{equation}
1869: as a stopping criterion.  As discussed earlier, the step sizes are
1870: constant and we set $\delta=1.2 p^{-1}$.  Throughout this section, we
1871: denote the output of the SVT algorithm by $\mtx{X}^{\mathrm{opt}}$.
1872: The parameter $\tau$ is chosen empirically and set to $\tau = 5n$. A
1873: heuristic argument is as follows. Clearly, we would like the term
1874: $\tau \|\mtx{M}\|_*$ to dominate the other, namely,
1875: $\frac{1}{2}\|\mtx{M}\|_F^2$. For products of Gaussian matrices as
1876: above, standard random matrix theory asserts that the Frobenius norm
1877: of $\mtx{M}$ concentrates around $n\sqrt{r}$, and that the nuclear
1878: norm concentrates around $nr$ (this should be clear in the
1879: simple case where $r = 1$ and is generally valid). The value $\tau=5n$
1880: makes sure that on the average, the value of $\tau \|\mtx{M}\|_*$ is
1881: about $10$ times that of $\frac12\|\mtx{M}\|_F^2$ as long as the rank
1882: is bounded away from the dimension $n$.
1883: 
1884: 
1885: 
1886: \begin{table}
1887: \begin{center}
1888: \begin{tabular}{cccc|ccc}\hline
1889:   \multicolumn{4}{c|}{Unknown $\mtx{M}$}&\multicolumn{3}{c}{Computational results}\\ \hline
1890:   size ($n\times n$)& rank ($r$) & $m/d_r$ & $m/n^2$ & time(s) & \# iters & relative error\\ \hline
1891:   & 10 & 6 & 0.12 & 23 & 117 & $1.64\times10^{-4}$ \\
1892:   $1,000\times 1,000$ & 50 & 4 & 0.39 & 196 & 114 & $1.59\times10^{-4}$ \\
1893:   & 100 & 3 & 0.57 & 501 & 129 & $1.68\times10^{-4}$ \\ \hline
1894: 
1895:   & 10 & 6 & 0.024 & 147 & 123 & $1.73\times10^{-4}$ \\
1896:  $5,000\times 5,000$ & 50 & 5 & 0.10 & 950 & 108 & $1.61\times10^{-4}$ \\
1897:   & 100 & 4 & 0.158 & 3,339 & 123 & $1.72\times10^{-4}$ \\ \hline
1898: 
1899:   & 10 & 6 & 0.012 & 281 & 123 & $1.73\times10^{-4}$ \\
1900:  $10,000\times 10,000$ & 50 & 5 & 0.050 & 2,096 & 110 & $1.65\times10^{-4}$ \\
1901:   & 100 & 4 & 0.080 & 7,059 & 127 & $1.79\times10^{-4}$ \\ \hline
1902: 
1903:   & 10 & 6 & 0.006 & 588 & 124 & $1.73\times10^{-4}$ \\
1904:  \raisebox{1.5ex}[0pt]{$20,000\times 20,000$} & 50 & 5 & 0.025 & 4,581 & 111 & $1.66\times10^{-4}$ \\ \hline
1905: 
1906:  $30,000\times30,000$ & 10 & 6 & 0.004 & 1,030 & 125 & $1.73\times10^{-4}$ \\ \hline
1907: \end{tabular}
1908: \end{center}
1909: \caption{Experimental results for matrix
1910:   completion. The rank $r$ is the rank of the unknown matrix
1911:   $\mtx{M}$, $m/d_r$ is the ratio between the number of sampled entries and the number of degrees of freedom in an $n \times n$ matrix of rank $r$
1912:   (oversampling ratio), and $m/n^2$ is the fraction of observed entries.  All the computational results on the right are averaged over five runs.}
1913: \label{tab:result1}
1914: \end{table}
1915: 
1916: 
1917: Our computational results are displayed in Table
1918: \ref{tab:result1}. There, we report the run time in seconds, the
1919: number of iterations it takes to reach convergence \eqref{eqn:stop},
1920: and the relative error of the reconstruction
1921: \begin{equation}\label{eqn:relerr}
1922:   \mathrm{relative~
1923:     error}=\|\mtx{X}^{\mathrm{opt}}-\mtx{M}\|_F/\|\mtx{M}\|_F,
1924: \end{equation}
1925: where $\mtx{M}$ is the real unknown matrix.  All of these quantities
1926: are averaged over five runs. The table also gives the percentage of
1927: entries that are observed, namely, $m/n^2$ together with a quantity
1928: that we may want to think of as the information oversampling ratio.
1929: Recall that an $n \times n$ matrix of rank $r$ depends upon $d_r :=
1930: r(2n-r)$ degrees of freedom. Then $m/d_r$ is the ratio between the
1931: number of sampled entries and the `true dimensionality' of an $n \times n$
1932: matrix of rank $r$.
1933: 
1934: The first observation is that the SVT algorithm performs extremely
1935: well in these experiments. In all of our experiments, it takes fewer
1936: than 200 SVT iterations to reach convergence. As a consequence, the
1937: run times are short. As indicated in the table, we note that one
1938: recovers a $1,000\times 1,000$ matrix of rank $10$ in less than a
1939: minute. The algorithm also recovers $30,000 \times 30,000$ matrices of
1940: rank $10$ from about $0.4\%$ of their entries in just about 17
1941: minutes.  In addition, higher-rank matrices are also efficiently
1942: completed: for example, it takes between one and two hours to recover
1943: $10,000 \times 10,000$ matrices of rank $100$ and $20,000\times
1944: 20,000$ matrices of rank $50$. We would like to stress that these
1945: numbers were obtained on a modest CPU (1.86GHz). Furthermore, a
1946: Fortran implementation is likely to cut down on these numbers by a
1947: multiplicative factor typically between three and four.
1948: 
1949: We also check the validity of the stopping criterion \eqref{eqn:stop}
1950: by inspecting the relative error defined in \eqref{eqn:relerr}. The
1951: table shows that the heuristic and nonrigorous analysis of Section
1952: \ref{sec:implementation} holds in practice since the relative
1953: reconstruction error is of the same order as
1954: $\|\mathcal{P}_{\Omega}(\mtx{X}^{\mathrm{opt}}-\mtx{M})\|_F/
1955: \|\mathcal{P}_{\Omega}\mtx{M}\|_F \sim 10^{-4}$. Indeed, the overall
1956: relative errors reported in Table \ref{tab:result1} are all less than
1957: $2\times10^{-4}$.
1958: 
1959: We emphasized all along an important feature of the SVT algorithm,
1960: which is that the matrices $\mtx{X}^k$ have low rank.  We demonstrate
1961: this fact empirically in Figure \ref{fig:rank}, which plots the rank
1962: of $\mtx{X}^k$ versus the iteration count $k$, and does this for
1963: unknown matrices of size $5,000\times 5,000$ with different ranks. The
1964: plots reveal an interesting phenomenon: in our experiments, the rank
1965: of $\mtx{X}^{k}$ is nondecreasing so that the maximum rank is reached
1966: in the final steps of the algorithm. In fact, the rank of the iterates
1967: quickly reaches the value $r$ of the true rank. After these few
1968: initial steps, the SVT iterations search for that matrix with rank $r$
1969: minimizing the objective functional.  As mentioned earlier, the
1970: low-rank property is crucial for making the algorithm run fast.
1971: \begin{figure}[h]
1972:   \begin{center}
1973:     \begin{tabular}{ccc}
1974: \includegraphics[width=.32\textwidth]{rank10} &
1975: \includegraphics[width=.32\textwidth]{rank50} &
1976: \includegraphics[width=.32\textwidth]{rank100} \\
1977: $r = 10$ & $r = 50$ & $r = 100$
1978: \end{tabular}
1979: \end{center}
1980: \caption{Rank of $\mtx{X}^k$ as a function of $k$ when the unknown matrix
1981:   $\mtx{M}$ is of size $5,000 \times 5,000$ and of rank $r$.}
1982: \label{fig:rank}
1983: \end{figure}
1984: 
1985: Finally, we demonstrate the results of the SVT algorithm for matrix
1986: completion from noisy sampled entries. Suppose we observe data from
1987: the model
1988: \begin{equation}
1989: \label{eq:noisy2}
1990: {B}_{ij} = M_{ij} + Z_{ij}, \qquad (i,j) \in \Omega,
1991: \end{equation}
1992: where $\mtx{Z}$ is a zero-mean Gaussian white noise with standard
1993: deviation $\sigma$. We run the SVT algorithm but stop early, as soon
1994: as $\mtx{X}^k$ is consistent with the data and obeys
1995: \begin{equation}\label{eqn:stop1}
1996: \|\mathcal{P}_{\Omega}(\mtx{X}^{k}-\mtx{B})\|_F^2
1997:  \leq (1+\epsilon) \, m\sigma^2,
1998:  \end{equation}
1999:  where $\epsilon$ is a small parameter.  Our reconstruction
2000:  $\hat{\mtx{M}}$ is the first $\mtx{X}^k$ obeying \eqref{eqn:stop1}.
2001:  The results are shown in Table \ref{tab:noise} (the quantities are
2002:  averages of 5 runs). Define the noise ratio as
2003: \[
2004: \|\cP_\Omega(\mtx{Z})\|_F/\|\mathcal{P}_{\Omega}(\mtx{M})\|_F,
2005: \]
2006: and the relative error by \eqref{eqn:relerr}. From Table
2007: \ref{tab:noise}, we see that the SVT algorithm works well as the
2008: relative error between the recovered and the true data matrix is just
2009: about equal to the noise ratio.
2010: \begin{table}
2011: \begin{center}
2012: \begin{tabular}{c|cccc|ccc}\hline
2013:   &\multicolumn{4}{c|}{Unknown matrix $\mtx{M}$}&\multicolumn{3}{c}{Computational results}\\
2014:   \cline{2-8}
2015:   \raisebox{2ex}[0pt]{\small{noise ratio}}
2016:   & size ($n\times n$) & rank ($r$) & $m/d_r$ & $m/n^2$ & time(s)& \# iters & relative error\\\hline
2017: 
2018:  & & 10 & 6 & 0.12 & 10.8 & 51 & $0.78\times10^{-2}$ \\
2019:  $10^{-2}$&$1,000\times 1,000$ & 50 & 4 & 0.39 & 87.7 & 48 & $0.95\times10^{-2}$ \\
2020:  & & 100 & 3 & 0.57 & 216 & 50 & $1.13\times10^{-2}$ \\ \hline
2021: 
2022:  & & 10 & 6 & 0.12 & 4.0 & 19 & $0.72\times10^{-1}$ \\
2023:  $10^{-1}$&$1,000\times 1,000$ & 50 & 4 & 0.39 & 33.2 & 17 & $0.89\times10^{-1}$ \\
2024:  & & 100 & 3 & 0.57 & 85.2 & 17 & $1.01\times10^{-1}$ \\ \hline
2025: 
2026:  & & 10 & 6 & 0.12 & 0.9 & 3 & $0.52$ \\
2027:  1&$1,000\times 1,000$ & 50 & 4 & 0.39 & 7.8 & 3 & $0.63$ \\
2028:  & & 100 & 3 & 0.57 & 34.8 & 3 & $0.69$ \\ \hline
2029: \end{tabular}
2030: \end{center}
2031: \caption{Simulation results for noisy data. The computational results are averaged over five runs.}
2032: \label{tab:noise}
2033: \end{table}
2034: 
2035: \newcommand{\sol}{\hat{\mtx{M}}}
2036: \newcommand{\true}{\mtx{M}}
2037: 
2038: The theory of low-rank matrix recovery from noisy data is nonexistent
2039: at the moment, and is obviously beyond the scope of this paper. Having
2040: said this, we would like to conclude this section with an intuitive
2041: and nonrigorous discussion, which may explain why the observed
2042: recovery error is within the noise level.  Suppose again that
2043: $\sol$ obeys \eqref{eq:xk-m}, namely,
2044:  \begin{equation}\label{eqn:error}
2045:  \|\cP_\Omega(\sol - \true)\|_F^2 \asymp p
2046: \|\sol - \true\|_F^2.
2047: \end{equation}
2048: As mentioned earlier, one condition for this to happen is that $\true$
2049: and $\sol$ have low rank. This is the reason why it is important to
2050: stop the algorithm early as we hope to obtain a solution which is both
2051: consistent with the data and has low rank (the limit of the SVT
2052: iterations, $\lim_{k \goto \infty} \mtx{X}^k$, will not generally have
2053: low rank since there may be no low-rank matrix matching the noisy
2054: data). From
2055: \[
2056: \|\cP_\Omega(\sol - \true)\|_F \le \|\cP_\Omega(\sol-\mtx{B})\|_F +
2057: \|\cP_\Omega({\mtx{B}}-\mtx{M})\|_F,
2058: \]
2059: and the fact that both terms on the right-hand side are on the order
2060: of $\sqrt{m\sigma^2}$, we would have $p \|\sol - \true\|_F^2 = O(m
2061: \sigma^2)$ by \eqref{eqn:error}. In particular, this would give that
2062: the relative reconstruction error is on the order of the noise
2063: ratio since $\|\cP_\Omega(\true)\|_F^2 \asymp p \|\true\|_F^2$---as
2064: observed experimentally.
2065: 
2066: \subsubsection{Linear inequality constraints} 
2067: 
2068: We now examine the speed at which one can solve similar problems with
2069: linear inequality constraints instead of linear equality
2070: constraints. We assume the model \eqref{eq:noisy2}, where the matrix
2071: $\true$ of rank $r$ is sampled as before, and solve the problem
2072: \eqref{eq:DS} by using \eqref{eqn:iterDS2}. We formulate the
2073: inequality constraints in \eqref{eq:DS} with $E_{ij} = \sigma$ so that
2074: one searches for a solution $\sol$ with minimum nuclear norm among all
2075: those matrices whose sampled entries deviate from the observed ones by
2076: at most the noise level $\sigma$.\footnote{This may not be
2077:   conservative enough from a statistical viewpoint but this works well
2078:   in this case, and our emphasis here is on computational rather than
2079:   statistical issues.}  In this experiment, we adjust $\sigma$ to be
2080: one tenth of a typical absolute entry of $\mtx{M}$, i.e.~$\sigma = 0.1
2081: \, \sum_{ij \in \Omega} |M_{ij}|/m$, and the noise ratio as defined
2082: earlier is 0.0780. We set $n = 1,000$, $r = 10$, and the number $m$ of
2083: sampled entries is five times the number of degrees of freedom,
2084: i.e.~$m = 5 d_r$. Just as before, we set $\tau = 5n$, and choose a
2085: constant step size $\delta = 1.2p^{-1}$.
2086: 
2087: \begin{figure}[h]
2088:   \begin{center}
2089:     \begin{tabular}{cc}
2090: \includegraphics[width=.38\textwidth]{resid1} &
2091: \includegraphics[width=.38\textwidth]{resid2} \\
2092: (a) & (b)\\
2093: \includegraphics[width=.38\textwidth]{resid3} & \\
2094: (c) & 
2095: \end{tabular}
2096: \end{center}
2097: \caption{Computational results of the algorithm applied to noisy
2098:   (linear inequality constraints as in \eqref{eq:DS}) and noiseless
2099:   data (equality constraints). The blue (resp.~red) color is used for
2100:   the noisy (resp.~noiseless) experiment.  (a) Plot of the
2101:   reconstruction errors from noisy and noiseless data as a function of
2102:   the iteration count. The thin line is the residual relative error
2103:   $\|\cP_\Omega(\mtx{X}^k-\mtx{M})\|_F/\|\cP_\Omega(\mtx{M})\|_F$ and
2104:   the thick line is the overall relative error
2105:   $\|\mtx{X}^k-\mtx{M}\|_F/\|\mtx{M}\|_F$. (b) Rank of the iterates as
2106:   a function of the iteration count. (c) Time it takes to compute the
2107:   singular value thresholding operation as a function of the iteration
2108:   count. The computer here is a single-core 3.00GHz Pentium 4 running
2109:   Matlab 7.2.0.}
2110: \label{fig:DS}
2111: \end{figure}
2112: 
2113: The results, reported in Figure \ref{fig:DS}, show that the algorithm
2114: behaves just as well with linear inequality constraints. To make this
2115: point, we compare our results with those obtained from noiseless data
2116: (same unknown matrix and sampled locations). In the noiseless case, it
2117: takes about 150 iterations to reach the tolerance $\epsilon = 10^{-4}$
2118: whereas in the noisy case, convergence occurs in about 200 iterations
2119: (Figure \ref{fig:DS}(a)). In addition, just as in the noiseless
2120: problem, the rank of the iterates is nondecreasing and quickly reaches
2121: the true value $r$ of the rank of the unknown matrix $\mtx{M}$ we wish
2122: to recover (Figure \ref{fig:DS}(b)). As a consequence the SVT
2123: iterations take about the same amount of time as in the noiseless case
2124: (Figure \ref{fig:DS}(c)) so that the total running time of the
2125: algorithm does not appear to be substantially different from that in
2126: the noiseless case.
2127: 
2128: We close by pointing out that from a statistical point of view, the
2129: recovery of the matrix $\mtx{M}$ from undersampled and noisy entries
2130: by the matrix equivalent of the Dantzig selector appears to be
2131: accurate since the relative error obeys $\|\sol-\true\|_F/\|\true\|_F =
2132: 0.0769$ (recall that the noise ratio is about $0.08$).
2133: 
2134:  % \blue{Here we give a heuristic justification for the choice of
2135:  %   $\hat{\mtx{X}}$.  By abusing of notation, the constant $C$ may be
2136:  %   different in different occurrences below.  By \eqref{eq:xk-m} and
2137:  %   with choice of \eqref{eqn:stop1},
2138: %  \begin{equation}\label{eqn:error}
2139: %  \|\hat{\mtx{X}}-\mtx{M}\|_F^2 \leq C p^{-1}\|\cP_\Omega(\hat{\mtx{X}}-\mtx{M})\|_F^2 \leq 2C p^{-1}(\|\cP_\Omega(\hat{\mtx{X}}-\mtx{B})\|_F^2+\|\cP_\Omega({\mtx{B}}-\mtx{M})\|_F^2).
2140: %  \end{equation}
2141: %  Since the most right hand side has a high probability to be smaller than $C p^{-1} \|\cP_\Omega({\mtx{B}}-\mtx{M})\|_F^2$, and  since $ \|\mathcal{P}_\Omega(\mtx{M})\|_F^2 \asymp p \, \|\mtx{M}\|_F^2$, one has
2142: %  $$\frac{\|\hat{\mtx{X}}-\mtx{M}\|_F}{\|\mtx{M}\|_F}\leq C \frac{\|\cP_\Omega(\mtx{Z})\|_F}{\|\mathcal{P}_{\Omega}(\mtx{M})\|_F},$$
2143: %  where $C$ is independent of $p$.
2144: % }
2145: 
2146: \section{Discussion}
2147: \label{sec:discussion}
2148: 
2149: This paper introduced a novel algorithm, namely, the singular value
2150: thresholding algorithm for matrix completion and related nuclear norm
2151: minimization problems.  This algorithm is easy to implement and
2152: surprisingly effective both in terms of computational cost and
2153: storage requirement when the minimum nuclear-norm solution is also
2154: the lowest-rank solution.  We would like to close this paper by
2155: discussing a few open problems and research directions related to this
2156: work.
2157: 
2158: Our algorithm exploits the fact that the sequence of iterates
2159: $\{\mtx{X}^k\}$ have low rank when the minimum nuclear-norm solution has
2160: low rank.  An interesting question is whether one can prove (or
2161: disprove) that in a majority of the cases, this is indeed the case.
2162: 
2163: It would be interesting to explore other ways of computing
2164: $\cD_\tau(\mtx{Y})$---in words, the action of the singular value shrinkage
2165: operator. Our approach uses the Lanczos bidiagonalization algorithm
2166: with partial reorthogonalization which takes advantage of sparse
2167: inputs but other approaches are possible. We mention two of them.
2168: \begin{enumerate}
2169: \item A series of papers have proposed the use of randomized procedures
2170:   for the approximation of a matrix $\mtx{Y}$ with a matrix $\mtx{Z}$ of
2171:   rank $r$ \cite{RokhlinQR1,RokhlinQR2}. When this approximation
2172:   consists of the truncated SVD retaining the part of the expansion
2173:   corresponding to singular values greater than $\tau$, this can be
2174:   used to evaluate $\cD_\tau(\mtx{Y})$. Some of these algorithms are
2175:   efficient when the input $\mtx{Y}$ is sparse \cite{RokhlinQR1}, and
2176:   it would be interesting to know whether these methods are fast and
2177:   accurate enough to be used in the SVT iteration \eqref{eqn:iter}.
2178: 
2179: \item A wide range of iterative methods for computing matrix functions
2180:   of the general form $f(\mtx{Y})$ are available today, see
2181:   \cite{Higham} for a survey. A valuable research direction is to
2182:   investigate whether some of these iterative methods, or others to be
2183:   developed, would provide powerful ways for computing
2184:   $\cD_\tau(\mtx{Y})$.
2185: \end{enumerate}
2186: 
2187: In practice, one would like to solve \eqref{eqn:minnuc+fro} for large
2188: values of $\tau$. However, a larger value of $\tau$ generally means a
2189: slower rate of convergence. A good strategy might be to start with
2190: a value of $\tau$, which is large enough so that \eqref{eqn:minnuc+fro}
2191: admits a low-rank solution, and at the same time for which the
2192: algorithm converges rapidly. One could then use a continuation method
2193: as in \cite{Continuation} to increase the value of $\tau$ sequentially
2194: according to a schedule $\tau_0, \tau_1, \ldots$, and use the solution
2195: to the previous problem with $\tau = \tau_{i-1}$ as an initial guess
2196: for the solution to the current problem with $\tau = \tau_i$ (warm
2197: starting). We hope to report on this in a separate paper.
2198: 
2199: \small
2200: 
2201: \subsection*{Acknowledgments}
2202: J-F.~C.~is supported by the Wavelets and Information Processing
2203: Programme under a grant from DSTA, Singapore. E.~C.~is partially
2204: supported by the Waterman Award from the National Science Foundation
2205: and by an ONR grant N00014-08-1-0749. Z.~S.~is supported in part by
2206: Grant R-146-000-113-112 from the National University of
2207: Singapore. E.~C.~would like to thank Benjamin Recht and Joel Tropp for
2208: fruitful conversations related to this project, and Stephen Becker for
2209: his help in preparing the computational results of Section 5.2.2.
2210: 
2211: \bibliographystyle{abbrv}
2212: 
2213: \begin{thebibliography}{1}
2214: 
2215: \bibitem{Abernethy06}
2216: J.~Abernethy, F.~Bach, T.~Evgeniou, and J.-P. Vert.
2217: \newblock Low-rank matrix factorization with attributes.
2218: \newblock Technical Report N24/06/MM, Ecole des Mines de Paris, 2006.
2219: 
2220: \bibitem{NetflixPrize}
2221: ACM SIGKDD and Netflix.
2222: \newblock {\em Proceedings of KDD Cup and Workshop}, 2007.
2223: \newblock Proceedings available online at
2224:   \url{http://www.cs.uic.edu/~liub/KDD-cup-2007/proceedings.html}.
2225: 
2226: \bibitem{Amit07}
2227: Y.~Amit, M.~Fink, N.~Srebro, and S.~Ullman.
2228: \newblock Uncovering shared structures in multiclass classification.
2229: \newblock In {\em Proceedings of the Twenty-fourth International Conference on
2230:   Machine Learning}, 2007.
2231: 
2232: \bibitem{Argyriou07}
2233: A.~Argyriou, T.~Evgeniou, and M.~Pontil.
2234: \newblock Multi-task feature learning.
2235: \newblock In {\em Neural Information Processing Systems}, 2007.
2236: 
2237: 
2238: \bibitem{BBAC:ECCV:04}
2239: J.~Bect, L.~Blanc-F{\'e}raud, G.~Aubert, and A.~Chambolle,
2240: \newblock A $\ell_1$ unified variational framework for image restoration,
2241: \newblock in {\em Proc. Eighth Europ. Conf. Comput. Vision}, 2004.
2242: 
2243: 
2244: \bibitem{BoydBook} S. Boyd, and L. Vandenberghe.
2245: \newblock {\em Convex Optimization}.
2246: \newblock Cambridge University Press, 2004.
2247: 
2248: \bibitem{CCSS:SISC:08}
2249: J.-F. Cai, R.~Chan, L.~Shen, and Z.~Shen.
2250: \newblock Restoration of chopped and nodded images by framelets.
2251: \newblock {\em SIAM J. Sci. Comput.}, 30(3):1205--1227, 2008.
2252: 
2253: \bibitem{CCS:ACHA:08}
2254: J.-F. Cai, R.~H. Chan, and Z.~Shen.
2255: \newblock A framelet-based image inpainting algorithm.
2256: \newblock {\em Appl. Comput. Harmon. Anal.}, 24(2):131--149, 2008.
2257: 
2258: \bibitem{COS:XXX:08:3}
2259: J.-F. Cai, S.~Osher, and Z.~Shen.
2260: \newblock {\em Convergence of the Linearized {B}regman Iteration for
2261:   $\ell_1$-norm Minimization}, 2008.
2262: \newblock UCLA CAM Report (08-52).
2263: 
2264: \bibitem{COS:XXX:08}
2265: J.-F. Cai, S.~Osher, and Z.~Shen.
2266: \newblock {\em Linearized {B}regman Iterations for Compressed Sensing}, 2008.
2267: \newblock Math. Comp., to appear; see also UCLA CAM Report (08-06).
2268: 
2269: \bibitem{COS:XXX:08:2}
2270: J.-F. Cai, S.~Osher, and Z.~Shen.
2271: \newblock {\em Linearized {B}regman Iterations for Frame-Based Image
2272:   Deblurring}, 2008.
2273: \newblock preprint.
2274: 
2275: %\bibitem{Can:ICM:06}
2276: %E.~J. Cand{\`e}s.
2277: %\newblock Compressive sampling.
2278: %\newblock In {\em International Congress of Mathematicians. Vol. III}, pages
2279: %  1433--1452. Eur. Math. Soc., Z\"urich, 2006.
2280: 
2281: \bibitem{TVSynthesis}
2282: E.~J. Cand\`es, and F.~Guo.
2283: \newblock New multiscale transforms, minimum total variation synthesis:
2284:   Applications to edge-preserving image reconstruction.
2285: \newblock {\em Signal Processing}, 82:1519--1543, 2002.
2286: 
2287: \bibitem{CR:XXX:08}
2288: E.~J. Cand{\`e}s and B.~Recht.
2289: \newblock {\em Exact Matrix Completion via Convex Optimization}, 2008.
2290: 
2291: \bibitem{CR:IP:07}
2292: E.~J. Cand{\`e}s and J.~Romberg.
2293: \newblock Sparsity and incoherence in compressive sampling.
2294: \newblock {\em Inverse Problems}, 23(3):969--985, 2007.
2295: 
2296: \bibitem{CRT:TIT:06}
2297: E.~J. Cand{\`e}s, J.~Romberg, and T.~Tao.
2298: \newblock Robust uncertainty principles: exact signal reconstruction from
2299:   highly incomplete frequency information.
2300: \newblock {\em IEEE Trans. Inform. Theory}, 52(2):489--509, 2006.
2301: 
2302: \bibitem{CT:TIT:05}
2303: E.~J. Cand{\`e}s and T.~Tao.
2304: \newblock Decoding by linear programming.
2305: \newblock {\em IEEE Trans. Inform. Theory}, 51(12):4203--4215, 2005.
2306: 
2307: \bibitem{CT:TIT:06}
2308: E.~J. Cand{\`e}s and T.~Tao.
2309: \newblock Near-optimal signal recovery from random projections: universal
2310:   encoding strategies?
2311: \newblock {\em IEEE Trans. Inform. Theory}, 52(12):5406--5425, 2006.
2312: 
2313: \bibitem{DS} E.~J. Cand\`es and T.~Tao.
2314: \newblock The Dantzig selector: statistical estimation when $p$ is
2315:   much larger than $n$.
2316: \newblock {\em Annals of Statistics} 35:2313--2351, 2007.
2317: 
2318: \bibitem{CS:NM:07}
2319: A.~Chai and Z.~Shen.
2320: \newblock Deconvolution: A wavelet frame approach.
2321: \newblock {\em Numer. Math.}, 106(4):529--587, 2007.
2322: 
2323: \bibitem{CCSS:SISC:03}
2324: R.~H. Chan, T.~F. Chan, L.~Shen, and Z.~Shen.
2325: \newblock Wavelet algorithms for high-resolution image reconstruction.
2326: \newblock {\em SIAM J. Sci. Comput.}, 24(4):1408--1432 (electronic), 2003.
2327: 
2328: \bibitem{ChenSuter} P. Chen, and D.~Suter.
2329: \newblock Recovering the
2330:   missing components in a large noisy low-rank matrix: application to
2331:   {SFM} source.
2332: \newblock {\em IEEE Transactions on Pattern Analysis
2333:     and Machine Intelligence}, 26(8):1051--1063, 2004.
2334: 
2335: \bibitem{CW:MMS:05}
2336: P.~L. Combettes and V.~R. Wajs.
2337: \newblock Signal recovery by proximal forward-backward splitting.
2338: \newblock {\em Multiscale Model. Simul.}, 4(4):1168--1200 (electronic), 2005.
2339: 
2340: \bibitem{DO:XXX:07}
2341: J.~Darbon and S.~Osher.
2342: \newblock {\em Fast discrete optimization for sparse approximations and
2343:   deconvolutions}, 2007.
2344: \newblock preprint.
2345: 
2346: \bibitem{DDD:CPAM:04}
2347: I.~Daubechies, M.~Defrise, and C.~De~Mol.
2348: \newblock An iterative thresholding algorithm for linear inverse problems with
2349:   a sparsity constraint.
2350: \newblock {\em Comm. Pure Appl. Math.}, 57(11):1413--1457, 2004.
2351: 
2352: \bibitem{DTV:IPI:07}
2353: I.~Daubechies, G.~Teschke, and L.~Vese.
2354: \newblock Iteratively solving linear inverse problems under general convex
2355:   constraints.
2356: \newblock {\em Inverse Probl. Imaging}, 1(1):29--46, 2007.
2357: 
2358: %\bibitem{Don:TIT:95}
2359: %D.~L. Donoho.
2360: %\newblock De-noising by soft-thresholding.
2361: %\newblock {\em IEEE Trans. Inform. Theory}, 41(3):613--627, 1995.
2362: 
2363: \bibitem{Don:TIT:06}
2364: D.~L. Donoho.
2365: \newblock Compressed sensing.
2366: \newblock {\em IEEE Trans. Inform. Theory}, 52(4):1289--1306, 2006.
2367: 
2368: \bibitem{ESQD:ACHA:05}
2369: M.~Elad, J.-L. Starck, P.~Querre, and D.~L. Donoho.
2370: \newblock Simultaneous cartoon and texture image inpainting using morphological
2371:   component analysis ({MCA}).
2372: \newblock {\em Appl. Comput. Harmon. Anal.}, 19(3):340--358, 2005.
2373: 
2374: 
2375: \bibitem{FSM:CJ:07}
2376: M.~J.~Fadili, J.-L.~Starck, and F.~Murtagh.
2377: \newblock Inpainting and zooming using sparse representations.
2378: \newblock {\em The Computer Journal}, to appear.
2379: 
2380: 
2381: \bibitem{FazelThesis}
2382: M.~Fazel.
2383: \newblock {\em Matrix Rank Minimization with Applications}.
2384: \newblock PhD thesis, Stanford University, 2002.
2385: 
2386: \bibitem{fazelRank}
2387: M.~Fazel, H.~Hindi, and S.~Boyd,
2388: \newblock Log-det heuristic for matrix rank minimization with applications to
2389:   {H}ankel and {E}uclidean distance matrices.
2390: \newblock in {\em Proc. Am. Control Conf.}, June 2003.
2391: 
2392: \bibitem{Nowak_EM} M.~Figueiredo, and R.~Nowak,
2393: \newblock An {EM} algorithm for wavelet-based image restoration.
2394: \newblock {\em IEEE Transactions on Image Processing}, 12(8):906--916,
2395: 2003.
2396: 
2397: \bibitem{GO:XXX:08}
2398: T.~Goldstein and S.~Osher.
2399: \newblock {\em The Split {B}regman Algorithm for L1 Regularized Problems},
2400:   2008.
2401: \newblock UCLA CAM Reports (08-29).
2402: 
2403: %\bibitem{GV:BOOK:96}
2404: %G.~H. Golub and C.~F. Van~Loan.
2405: %\newblock {\em Matrix computations}.
2406: %\newblock Johns Hopkins Studies in the Mathematical Sciences. Johns Hopkins
2407: %  University Press, Baltimore, MD, third edition, 1996.
2408: 
2409: \bibitem{YinFPC} E.~T. Hale, W. Yin, and Y. Zhang.  \newblock {\em
2410:     Fixed-point continuation for l1-minimization: methodology and
2411:     convergence}.  \newblock 2008.  \newblock preprint.
2412: 
2413: \bibitem{Higham}
2414: N.~J.~Higham.
2415: \newblock {\em Functions of Matrices: {Theory} and Computation}.
2416: \newblock Society for Industrial and Applied Mathematics,
2417: Philadelphia, PA, USA, 2008.
2418: 
2419: \bibitem{HL:BOOK:93}
2420: J.-B. Hiriart-Urruty and C.~Lemar{\'e}chal.
2421: \newblock {\em Convex analysis and minimization algorithms. {I}}, volume 305 of
2422:   {\em Grundlehren der Mathematischen Wissenschaften [Fundamental Principles of
2423:   Mathematical Sciences]}.
2424: \newblock Springer-Verlag, Berlin, 1993.
2425: \newblock Fundamentals.
2426: 
2427: \bibitem{Lar:Propack}
2428: R.~M. Larsen, \newblock {\em PROPACK -- Software for large and
2429: sparse SVD calculations},
2430: \newblock Available from
2431: \url{http://sun.stanford.edu/~rmunk/PROPACK/}.
2432: 
2433: \bibitem{Lew:MP:03}
2434: A.~S. Lewis.
2435: \newblock The mathematics of eigenvalue optimization.
2436: \newblock {\em Math. Program.}, 97(1-2, Ser. B):155--176, 2003.
2437: \newblock ISMP, 2003 (Copenhagen).
2438: 
2439: \bibitem{RokhlinQR2}
2440: E.~Liberty, F.~Woolfe, P.-G.~Martinsson, V.~Rokhlin, and M.~Tygert.
2441: \newblock Randomized algorithms for the low-rank approximation of
2442:             matrices.
2443: \newblock {\em Proc. Natl. Acad. Sci. USA}, 104(51): 20167--20172, 2007.
2444: 
2445: \bibitem{Lintner}
2446: S.~Lintner, and F. Malgouyres.
2447: \newblock Solving a variational image restoration model
2448: which involves $\ell_\infty$ constraints.
2449: \newblock {\em Inverse Problems}, 20:815--831, 2004.
2450: 
2451: \bibitem{VandenbergheNuc}
2452: Z. Liu, and L. Vandenberghe.
2453: \newblock Interior-point method for nuclear norm approximation with
2454: application to system identification.
2455: \newblock submitted to {\em Mathematical Programming}, 2008.
2456: 
2457: \bibitem{RokhlinQR1}
2458: P.-G.~Martinsson, V.~Rokhlin, and M.~Tygert.
2459: \newblock A randomized algorithm for the approximation of matrices.
2460: \newblock Department of Computer Science, Yale University, New Haven,
2461: CT, Technical Report 1361, 2006.
2462: 
2463: \bibitem{Mesbahi97}
2464: M.~Mesbahi and G.~P. Papavassilopoulos.
2465: \newblock On the rank minimization problem over a positive semidefinite linear
2466:   matrix inequality.
2467: \newblock {\em IEEE Transactions on Automatic Control}, 42(2):239--243, 1997.
2468: 
2469: \bibitem{OBGXY:MMS:05}
2470: S.~Osher, M.~Burger, D.~Goldfarb, J.~Xu, and W.~Yin.
2471: \newblock An iterative regularization method for total variation-based image
2472:   restoration.
2473: \newblock {\em Multiscale Model. Simul.}, 4(2):460--489 (electronic), 2005.
2474: 
2475: \bibitem{ODY:XXX:08}
2476: S.~Osher, Y.~Mao, B.~Dong, and W.~Yin.
2477: \newblock {\em Fast Linearized Bregman Iteration for Compressed Sensing and
2478:   Sparse Denoising}, 2008.
2479: \newblock UCLA CAM Reports (08-37).
2480: 
2481: \bibitem{Recht07}
2482: B.~Recht, M.~Fazel, and P.~Parrilo.
2483: \newblock Guaranteed minimum rank solutions of matrix equations via nuclear
2484:   norm minimization.
2485: \newblock 2007.
2486: \newblock Submitted to {\em SIAM Review}.
2487: 
2488: 
2489: \bibitem{SDC:AA:03}
2490: J.-L.~Starck, D.~L.~Donoho, and E.~J.~Cand{\`e}s,
2491: \newblock Astronomical image representation by the curvelet
2492:    transform.
2493: \newblock {\em Astronom. and Astrophys.}, 398:785--800, 2003.
2494: 
2495: \bibitem{TTT:SDPT3}
2496: K.~C. Toh, M.~J. Todd, and R.~H. T\"{u}t\"{u}nc\"{u}.
2497: \newblock {\em SDPT3 -- a MATLAB software package for semidefinite-quadratic-linear
2498: programming},
2499: \newblock Available from
2500: \url{http://www.math.nus.edu.sg/~mattohkc/sdpt3.html}.
2501: 
2502: \bibitem{Tomasi}
2503: C. Tomasi and T. Kanade.
2504: \newblock Shape and motion from image streams under orthography: a
2505:   factorization method.
2506: \newblock {\em International Journal of Computer Vision},
2507:   9(2):137--154, 1992.
2508: 
2509: \bibitem{Wat:LAA:92}
2510: G.~A. Watson.
2511: \newblock Characterization of the subdifferential of some matrix norms.
2512: \newblock {\em Linear Algebra Appl.}, 170:33--45, 1992.
2513: 
2514: \bibitem{Continuation}
2515: S.~J.~Wright, R.~Nowak, and M.~Figueiredo.
2516: \newblock Sparse reconstruction by separable approximation.
2517: \newblock Submitted for publication, 2007.
2518: 
2519: 
2520: \bibitem{YOGD:SIIMS:08}
2521: W.~Yin, S.~Osher, D.~Goldfarb, and J.~Darbon.
2522: \newblock {B}regman iterative algorithms for $\ell_1$-minimization with
2523:   applications to compressed sensing.
2524: \newblock {\em SIAM J. Imaging Sci.}, 1(1):143--168, 2008.
2525: 
2526: 
2527: 
2528: \end{thebibliography}
2529: 
2530: \end{document}
2531: