0806:0806.3008/ssp.tex

1: %% Discounted stochastic shortest path problem, general state space.

2: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

3:

4: \documentclass[a4paper, 10pt, twoside]{amsart}

5:

6: \usepackage{yfonts, enumitem, mathtools}

7: \usepackage{amsmath}

8: \usepackage{amssymb, latexsym, mathrsfs}

9: \usepackage{fancybox, graphicx, subfigure}

10: \usepackage{endnotes}

11: %\usepackage{xr-hyper, showkeys}

12: \usepackage[usenames, dvipsnames]{color}

13: \usepackage[colorlinks=true,

14:         raiselinks=true,

15:         linkcolor=MidnightBlue,

16:         citecolor=Mahogany,

17:         urlcolor=ForestGreen,

18:         pdfauthor=Debasish Chatterjee,

19:         pdftitle={Discounted stochastic shortest path problem, general state space},

20:         pdfkeywords={discounted stochastic shortest path, Markov control processes},

21:         pdfsubject={Technical Report},

22:         plainpages=false]{hyperref}

23:

24: \usepackage[charter]{mathdesign}

25: %\usepackage[charter]{mathdesign}

26:

27: %% Various colors

28: %\definecolor{myred}{rgb}{0.6, 0, 0}

29: %\definecolor{mygreen}{rgb}{0, 0.5, 0}

30: %\definecolor{myblue}{rgb}{0, 0, 0.5}

31: %\definecolor{mycyan}{rgb}{0, 0.5, 0.5}

32:

33: \addtolength{\oddsidemargin}{-0.5cm}

34: \addtolength{\evensidemargin}{-0.5cm}

35: \addtolength{\textwidth}{1cm}

36: \addtolength{\topmargin}{-0.5cm}

37: \addtolength{\textheight}{1cm}

38:

39: \DeclareMathOperator*{\argmin}{arg\,min}

40:

41: \newcommand{\R}{\ensuremath{\mathbb{R}}}

42: \newcommand{\N}{\ensuremath{\mathbb{N}}}

43: \newcommand{\Nz}{\ensuremath{\mathbb{N}_0}}

44: \newcommand{\posR}{\ensuremath{\R_{\ge 0}}}

45:

46: \newcommand{\nn}{\ensuremath{\nonumber}}

47: \newcommand{\ra}{\ensuremath{\rightarrow}}

48: \newcommand{\Ra}{\ensuremath{\;\Longrightarrow\;}}

49: \newcommand{\lra}{\ensuremath{\longrightarrow}}

50: \newcommand{\eps}{\ensuremath{\varepsilon}}

51: \newcommand{\fa}{\ensuremath{\forall\,}}

52: \renewcommand{\le}{\ensuremath{\leqslant}}

53: \renewcommand{\ge}{\ensuremath{\geqslant}}

54: \renewcommand{\mapsto}{\ensuremath{\longmapsto}}

55: \newcommand{\eqvcond}{\ensuremath{\Longleftrightarrow}}

56: \newcommand{\therex}{\ensuremath{\exists\,}}

57: \newcommand{\setmin}{\ensuremath{\!\smallsetminus}}

58: \newcommand{\Let}{\coloneqq}

59: \newcommand{\teL}{\eqqcolon}

60:

61: \newcommand{\ClassK}{\ensuremath{\mathcal{K}}}

62: \newcommand{\ClassKinfty}{\ensuremath{\mathcal{K}_{\infty}}}

63: \newcommand{\ClassKL}{\ensuremath{\mathcal{KL}}}

64:

65: \newcommand{\cl}[1]{\ensuremath{\closure\bigl(#1\bigr)}}

66: \newcommand{\bd}[1]{\ensuremath{\boundary\bigl(#1\bigr)}}

67: \newcommand{\intr}[1]{\ensuremath{\interior\bigl(#1\bigr)}}

68: \newcommand{\epower}[1]{\ensuremath{\mathrm{e}^{#1}}}

69: \newcommand{\norm}[1]{\ensuremath{\left\lVert #1 \right\rVert}}

70: \newcommand{\Borelsigalg}[1]{\ensuremath{\mathfrak{B}\!\left(#1\right)}}

71: \newcommand{\Expec}[1]{\ensuremath{\mathsf{E}\!\left[\vphantom{\big|}#1\vphantom{\big|}\right]}}

72: \newcommand{\CExpec}[2]{\ensuremath{\mathsf{E}^{#2}_{\vphantom{T}}\!\!\left[\vphantom{\big|}#1\vphantom{\big|}\right]}}

73: \newcommand{\CProb}[2]{\ensuremath{\mathsf{P}^{#2}_{\vphantom{T}}\!\!\left(\vphantom{\big|}#1\vphantom{\big|}\right)}}

74: %\newcommand{\CExpec}[2]{\ensuremath{\mathsf{E}\!\left[\vphantom{\big|#2}#1\left|\vphantom{\big|#1}#2\right.\right]}}

75: \newcommand{\Prob}[1]{\ensuremath{\mathsf{P}\!\left(\vphantom{\big|}#1\vphantom{\big|}\right)}}

76: \newcommand{\LieD}[2]{\ensuremath{\mathrm L_{#1}{#2}}}

77: \newcommand{\indic}[1]{\ensuremath{\boldsymbol{1}_{#1}}}

78: \newcommand{\abs}[1]{\ensuremath{\left\lvert{#1}\right\rvert}}

79: \newcommand{\Ball}[1]{\ensuremath{\mathcal{B}(#1)}}

80: \newcommand{\restr}[1]{\ensuremath{|_{\!#1}}}

81: \newcommand{\pb}[2]{\ensuremath{\left(#1\right)^\star\!{#2}}}

82: %\newcommand{\pf}[2]{\ensuremath{\left(#1\right)_\star\!{#2}}}

83: \newcommand{\mrm}[1]{\ensuremath{\mathrm{#1}}}

84: \newcommand{\mc}[1]{\ensuremath{\mathcal{#1}}}

85: \newcommand{\bs}[1]{\ensuremath{\boldsymbol{#1}}}

86: \newcommand{\mscr}[1]{\ensuremath{\mathscr{#1}}}

87: \newcommand{\Lp}[1]{\ensuremath{\boldsymbol{L}_{#1}}}

88: \newcommand{\inprod}[2]{\ensuremath{\left\langle{#1}\vphantom{\big|},\vphantom{\big|}{#2}\right\rangle}}

89: \newcommand{\interject}[1]{\textcolor{red}{[#1]}}

90: \newcommand{\embf}[1]{\textit{\textbf{#1}}}

91: \newcommand{\secref}[1]{\S\ref{#1}}

92: \newcommand{\hlo}[1]{\textcolor{RawSienna}{\textbf{#1}}}

93:

94: \newcommand{\bst}{\ensuremath{\,\big|\,}}

95: \newcommand{\EE}{\ensuremath{\mathsf{E}}}

96: \newcommand{\PP}{\ensuremath{\mathsf{P}}}

97: \newcommand{\transp}{\ensuremath{^{\scriptscriptstyle{\mathrm T}}}}

98: \newcommand{\sigalg}{\ensuremath{\mathfrak{F}}}

99: \newcommand{\cardP}{\ensuremath{\mathrm{N}}}

100: \newcommand{\dist}{\ensuremath{\mathrm d}}

101: \newcommand{\tz}{\ensuremath{t_0}}

102: \newcommand{\xiz}{\ensuremath{\xi_0}}

103: \newcommand{\xz}{\ensuremath{x_0}}

104: \newcommand{\sigmaz}{\ensuremath{\sigma_0}}

105: \newcommand{\gproc}{\ensuremath{\boldsymbol x}}

106: \newcommand{\xref}{\ensuremath{x^\ast}}

107: \newcommand{\xrefz}{\ensuremath{x^\ast_0}}

108: \newcommand{\drv}{\ensuremath{\,\mathrm{d}}}

109: \newcommand{\ol}{\overline}

110: \newcommand{\ul}{\underline}

111: \newcommand{\wt}{\widetilde}

112: \newcommand{\wh}{\widehat}

113: \newcommand{\Vvec}{\ensuremath{\ol V}}

114: \newcommand{\gasas}{{\sc gas}~a.s.}

115: \newcommand{\gasm}{{\sc gas-m}}

116: \newcommand{\gas}{{\sc gas}}

117: \newcommand{\gasp}{{\sc gas-p}}

118: \newcommand{\issm}{{\sc iss-m}}

119: \newcommand{\idmat}[1]{\ensuremath{{I}_{#1\times#1}^{\vphantom{T}}}}

120: \newcommand{\jet}{\ensuremath{\boldsymbol{j}}}

121: \newcommand{\cadlag}{c{\`a}dl{\`a}g}

122: \renewcommand{\subset}{\ensuremath{\subseteq}}

123: \renewcommand{\supset}{\ensuremath{\supseteq}}

124: \newcommand{\opensubset}{\ensuremath{\subseteq_0}}

125: \newcommand{\clsubset}{\ensuremath{\sqsubseteq}}

126: \newcommand{\compsubset}{\ensuremath{\Subset}}

127: \newcommand{\finsubset}{\ensuremath{\subset\subset}}

128: \newcommand{\mx}{\ensuremath{\vee}}

129: \newcommand{\mn}{\ensuremath{\wedge}}

130: \renewcommand{\limsup}{\ensuremath{\varlimsup}}

131:

132: \newcommand{\RemarkEnd}{\hspace{\stretch{1}}{$\vartriangleleft$}}

133: \newcommand{\ExampleEnd}{\hspace{\stretch{1}}{$\triangle$}}

134: \newcommand{\DefEnd}{\hspace{\stretch{1}}{$\Diamond$}}

135: \newcommand{\AssumptionEnd}{\hspace{\stretch{1}}{$\diamondsuit$}}

136: \newcommand{\vphi}{\ensuremath{\varphi}}

137:

138:

139: %% Change the default section notation

140: \renewcommand{\sectionname}{\S\!}

141: \renewcommand{\subsectionname}{\S\!}

142: \renewcommand{\subsubsectionname}{\S\!}

143:

144: %% Theorem styles

145: \numberwithin{equation}{section}

146: \swapnumbers

147: \newtheorem{theorem}[equation]{Theorem}

148: \newtheorem{corollary}[equation]{Corollary}

149: \newtheorem{lemma}[equation]{Lemma}

150: \newtheorem{proposition}[equation]{Proposition}

151:

152: %\newtheorem{corollary}[thm]{Corollary}

153: %\newtheorem{lemma}[thm]{Lemma}

154:

155: \theoremstyle{definition}

156: \newtheorem{defn}[equation]{Definition}

157: \theoremstyle{remark}

158: \newtheorem{remark}[equation]{Remark}

159: \newtheorem{example}[equation]{Example}

160: \newtheorem{prgr}[equation]{}

161: \newtheorem{assumption}[equation]{Assumption}

162:

163: \allowdisplaybreaks

164:

165: \title[On Stochastic Control up to a Hitting Time]{Stochastic Control up to a Hitting Time: Optimality and Rolling-horizon Implementation}

166: \author[D.~Chatterjee]{Debasish Chatterjee}

167: \author[E.~Cinquemani]{Eugenio Cinquemani}

168: \author[G.~Chaloulos]{Georgios Chaloulos}

169: \author[J.~Lygeros]{John Lygeros}

170: \address{Automatic Control Laboratory, Physikstrasse 3, ETH Z\"urich, 8092 Z\"urich, Switzerland}

171: \email{\{chatterjee,cinquemani,chaloulos,lygeros\}@control.ee.ethz.ch}

172: \urladdr{\url{http://control.ee.ethz.ch}}

173:

174: \date{\today}

175: \subjclass[2000]{Primary: 90C39, 90C40; Secondary: 93E20}

176:

177: \begin{document}

178:

179: 	\begin{abstract}

180: 		We present a dynamic programming-based solution to a stochastic optimal control problem up to a hitting time for a discrete-time Markov control process. First we determine an optimal control policy to steer the process toward a compact target set while simultaneously minimizing an expected discounted cost. We then provide a rolling-horizon strategy for approximating the optimal policy, together with quantitative characterization of its sub-optimality with respect to the optimal policy. Finally we address related issues of asymptotic discount-optimality of the value-iteration policy.

181: 	\end{abstract}

182:

183: 	\maketitle

184:

185: 	\section{Introduction}

186: 	\label{s:intro}

187: 		Optimal control of Markov control processes (MCP) up to an exit time is a problem with a long and rich history. It has mostly been studied as the minimization of an expected undiscounted cost until the first time that the state enters a given target set, see e.g.,~\cite[Chapter~II]{ref:borkarTopicsControlledMC},~\cite[Chapter~8]{ref:hernandez-lerma2}, and the references therein. In particular, if a unit cost is incurred as long as the state is outside the target set, then the problem of minimizing the cost accumulated until the state enters the target is known variously as the \textsl{pursuit problem}~\cite{ref:eatonzadeh62}, \textsl{transient programming}~\cite{ref:whittleOptimization}, the \textsl{first passage problem}~\cite{ref:dermanMDP, ref:kushnerIntroStochControl}, the \textsl{stochastic shortest path problem}~\cite{ref:bertsekasDP2}, and \textsl{control up to an exit time}~\cite{ref:borkarConvexAnalyticApproach, ref:borkarTopicsControlledMC, ref:kestenMCP}. These articles deal with at most countable state and action spaces. The problem of optimally controlling a system until an exit time from a given set has gained significance in financial and insurance mathematics, see, e.g., \cite{ref:boda04, ref:schmidliInsurance}.

188:

189: 		Our interest in this problem stems from our attempts to develop a general theory of stochastic model-predictive control (MPC). In its bare essentials, deterministic MPC~\cite{ref:maciejowskibk} consists of two steps: (i) solving a finite-horizon optimal control problem with constraints on the state and the controlled inputs to get an optimal policy, and (ii) applying a controller derived from the policy obtained in step (i) in a rolling-horizon fashion. Theoretical foundation of stochastic MPC is still in its infancy, see~\cite{ref:PrimbsSung09, ref:bertsimas2007, ref:vanHessem2006, ref:kouvaritakissMPCIneqconstraints, ref:batinaPhDthesis} and the references therein for some related work. In view of its close relationship with applications, any satisfactory theory of stochastic MPC must necessarily take into account its practical aspects. In this context an examination of a standard linear system with constrained controlled inputs affected by independent and identically distributed (i.i.d.)\ unbounded (e.g., Gaussian) disturbance inputs shows that no control policy can ensure that with probability one the state stays confined to a bounded \emph{safe set} for all instants of time. This is because the noise is unbounded and the samples are independent of each other. Although disturbances are not likely to be unbounded in practice, assigning an a priori bound seems to demand considerable insight. In case a bounded-noise model is adopted, existing robust MPC techniques~\cite{ref:bemporad1999rmp, ref:blanchini1999sic} may be applied, in which the central idea is to synthesize a controller based on the bounds of the noise such that the target set becomes invariant with respect to the closed-loop dynamics. However, since the optimal policy is based on a worst-case analysis, it usually leads to rather conservative controllers and sometimes even to infeasibility. 
Moreover, complexity of the optimization problem grows rapidly (typically exponentially) with the optimization horizon. An alternative is to replace the hard constraints by probabilistic (soft) ones. The idea is to find a policy that guarantees that the state constraints are satisfied with high probability over a sufficiently long time horizon. While this approach may improve feasibility aspects of the problem, it does not address the issue of what actions should be taken once the state violates the constraints. See~\cite{ref:hokayemcdc09, ref:smpcbnddu, ref:accl08} for recent results in this direction.

190:

191: 		In view of the above considerations, developing recovery strategies appears to be a necessary step. Such a strategy is to be activated once the state violates the constraints and to be deactivated whenever the system returns to the safe set. In general, a recovery strategy must drive the system quickly to the safe set while simultaneously meeting other performance objectives. In the context of MPC, two merits are immediate: (a) once the constraints are transgressed, appropriate actions can be taken to bring the state back to the safe set quickly and optimally, and (b) if the original problem is posed with hard constraints on the state, in view of (a) they may be relaxed to probabilistic ones to improve feasibility.

192:

193: 		In this article we address the problem of synthesizing optimal recovery strategies. We formulate the problem as the minimization of an expected discounted cost until the state enters the safe set. An almost customary assumption in the literature concerned with stochastic optimal control up to an exit time (see, e.g.,~\cite{ref:hindererAbsorbingSet} and the references therein) is that the target set is absorbing. That is, there exists a control policy that makes the target set invariant with respect to the closed-loop stochastic dynamics. This is rather restrictive for MPC problems---it is invalid, for instance, in the very simple case of a linear controlled system with i.i.d.\ Gaussian noise inputs. We do not make this assumption, for, as mentioned above, our primary motivation for solving this problem is precisely to deal with the case that the target set is not absorbing. As a result of this, it turns out that the dynamic programming equations involve integration over subsets of the state-space and therefore are difficult to solve. At present there is no established method to solve such equations in uncountable state-spaces. However, in finite state-space cases tractable approximate dynamic programming methods~\cite{ref:bertsekasNDP, ref:powellADP} may be employed to arrive at suboptimal but efficient policies.

194:

195: 		This article unfolds as follows. In~\secref{s:prelims} we define the general setting of the problem, namely, Markov control processes on Polish spaces, their transition kernels and the main types of control strategies. In~\secref{s:EDC} we establish our main result, Theorem~\ref{t:EDC}, under standard mild hypotheses. This result guarantees the existence of a deterministic stationary policy that leads to the minimal cost and also provides a Bellman equation that the value function must satisfy. A contraction mapping approach to the problem is pursued in~\secref{s:contr} under the (standard) assumption that the cost-per-stage function satisfies certain growth-rate conditions. The main result (Proposition~\ref{p:Tfp}) of this section asserts both the existence and uniqueness of the optimal value function. Asymptotic discount-optimality of the value-iteration policy is investigated in~\secref{s:ado} under two different sets of hypotheses; in particular, the results of this section show that the rolling-horizon strategy approaches optimality as the length of the horizon window increases to infinity. A rolling-horizon strategy corresponding to our optimal control problem is developed in~\secref{s:rh}; in Theorem~\ref{t:rh} we establish quantitative bounds on the degree of sub-optimality of the rolling-horizon strategy with respect to the optimal policy. We conclude in~\secref{s:concl} with a discussion of future work. The state and control/action sets are assumed to be Borel subsets of Polish spaces.

196:

197: 	\section{Preliminaries}

198: 	\label{s:prelims}

199: 		We employ the following standard notations. Let $\N$ denote the natural numbers $\{1, 2, \ldots\}$, and $\Nz$ denote the nonnegative integers $\{0\}\cup\N$. Let $\indic{A}(\cdot)$ be the standard indicator function of a set $A$, i.e., $\indic{A}(\xi) = 1$ if $\xi\in A$ and $0$ otherwise. For two real numbers $a$ and $b$, let $a\mn b \Let \min\{a, b\}$.

200:

201: 		Given a nonempty Borel space $X$ (i.e., a Borel subset of a Polish space), its Borel $\sigma$-algebra is denoted by $\Borelsigalg{X}$. By convention ``measurable'' means ``Borel-measurable'' in the sequel. If $X$ and $Y$ are nonempty Borel spaces, a \emph{stochastic kernel} on $X$ given $Y$ is a mapping $Q(\cdot|\cdot)$ such that $Q(\cdot|y)$ is a probability measure on $X$ for each fixed $y\in Y$, and $Q(B|\cdot)$ is a measurable function on $Y$ for each fixed $B\in\Borelsigalg X$. We let $\mc P(X|Y)$ be the family of all stochastic kernels on $X$ given $Y$.% We say that $Q(\cdot|\cdot)$ is a \emph{sub-stochastic kernel} on $X$ given $Y$ if $Q(B|\cdot)$ is a measurable function on $Y$ for each $B\in\Borelsigalg{X}$, and $Q(\cdot|y)$ is a measure on $X$ with $Q(X|y) \le 1$ for each $y\in Y$.

202:

203: 		We briefly recall some standard definitions.

204:

205: 		\begin{defn}

206: 		\label{d:mcm}

207: 			A \emph{Markov control model} is a five-tuple

208: 			\begin{equation}

209: 				\label{e:mmodel}

210: 				\bigl(X, A, \{A(x)\mid x\in X\}, Q, c\bigr)

211: 			\end{equation}

212: 			consisting of a nonempty Borel space $X$ called the \emph{state space}, a nonempty Borel space $A$ called the \emph{control} or \emph{action set}, a family $\{A(x)\mid x\in X\}$ of nonempty measurable subsets $A(x)$ of $A$, where $A(x)$ denotes the set of \emph{feasible controls} or \emph{actions} when the system is in state $x\in X$, and with the property that the set $\mathbb K \Let \bigl\{(x, a)\big|x\in X, a\in A(x)\bigr\}$ of feasible state-action pairs is a measurable subset of $X\times A$, a stochastic kernel $Q$ on $X$ given $\mathbb K$ called the \emph{transition law}, and a measurable function $c:\mathbb K\lra \R$ called the \emph{cost-per-stage function}.\DefEnd

213: 		\end{defn}

214:

215: 		\begin{assumption}

216: 		\label{a:basic}

217: 			The set $\mathbb K$ of feasible state-action pairs contains the graph of a measurable function from $X$ to $A$.\AssumptionEnd%That is, there exists a measurable function $f:X\lra A$ such that $f(x)\in A(x)$ for all $x\in X$.

218: 		\end{assumption}

219:

220: 		We let $\Pi$, $\Pi_{RM}$, $\Pi_{DM}$ and $\Pi_{DS}$ denote the sets of all randomized and history-dependent admissible policies, randomized Markov, deterministic Markov and deterministic stationary policies, respectively. For further details and notations on policies see, e.g.,~\cite{ref:hernandez-lerma1}. Consider the Markov control model~\eqref{e:mmodel}, and for each $i=0, 1, \ldots,$ define the space $H_i$ of \emph{admissible histories} up to time $i$ as $H_0 \Let X$, and $H_i \Let \mathbb K^i\times X = \mathbb K\times H_{i-1}$ for $i\in \N$. A generic element $h_i$ of $H_i$, called an admissible $i$-history, is a vector of the form $h_i = (x_0, a_0, \ldots, x_{i-1}, a_{i-1}, x_i)$, with $(x_j, a_j)\in\mathbb K$ for $j=0, \ldots, i-1$ and $x_i\in X$. Hereafter we let the $\sigma$-algebra generated by the history $h_i$ be denoted by $\sigalg_i$, $i\in\Nz$. %A policy $\pi = (\pi_i)_{i\in\Nz}$ is a sequence $(a_i)_{i\in\Nz}$ of $A$-valued random variables, called actions or controls, such that for every $i$-history $h_i, i\in\Nz$, the law of $a_i$ is $\pi_i(\cdot|h_i)$, with support of $\pi_i(\cdot|h_i)$ contained in $A(x_i)$, the set of feasible actions in the state $x_i$.

221: 		Let $(\Omega, \sigalg)$ be the measurable space consisting of the (canonical) sample space $\Omega \Let \ol H_\infty = (X\times A)^\infty$ and the corresponding product $\sigma$-algebra $\sigalg$. %For $\omega = (x_0, a_0, x_1, a_1, \ldots)\in\Omega$, the projections $x_i$ and $a_i$ from $\Omega$ to the sets $X$ and $A$ are called \emph{state} and \emph{control} (or \emph{action}) variables, respectively.

222: 		Let $\pi = (\pi_i)_{i\in\Nz}$ be an arbitrary control policy and $\nu$ an arbitrary probability measure on $X$, referred to as the initial distribution. By a theorem of Ionescu-Tulcea~\cite[Chapter~3, \S4, Theorem~5]{ref:raoProbTheo}, there exists a unique probability measure $\mathsf P_\nu^\pi$ on $(\Omega, \sigalg)$ supported on $H_\infty \Let \mathbb K^\infty$, and such that for all $B\in\Borelsigalg X$, $C\in\Borelsigalg A$, and $h_i\in H_i$, $i\in\Nz$, $\mathsf P_\nu^\pi\bigl(\xz\in B\bigr) = \nu(B)$ and

223: 		\begin{subequations}

224: 		%\label{e:probmeasure}

225: 		\begin{align}

226: 			\label{e:actiontrans}

227: 			\mathsf P_\nu^\pi\bigl(a_i\in C\,\big|\, h_i\bigr) & = \pi_i\bigl(C\,\big|\, h_i\bigr),\\

228: 			\label{e:statetrans}

229: 			\mathsf P_\nu^\pi\bigl(x_{i+1}\in B\,\big|\, h_i, a_i\bigr) & = Q\bigl(B\,\big|\, x_i, a_i\bigr).

230: 		\end{align}

231: 		\end{subequations}

232: 		The stochastic process $\bigl(\Omega, \sigalg, \mathsf P_\nu^\pi, (x_i)_{i\in\Nz}\bigr)$ is called a discrete-time \emph{Markov control process}. Let $\Phi$ denote the set of stochastic kernels $\vphi$ in $\mathcal P(A| X)$ such that $\vphi(A(x)| x) = 1$ for all $x\in X$, and let $\mathbb F$ denote the set of all measurable functions $f:X\lra A$ satisfying $f(x)\in A(x)$ for all $x\in X$. The functions in $\mathbb F$ are called \emph{selectors} of the set-valued mapping $X\ni x\mapsto A(x)\subset A$.

233: 		%Recall that a policy $\pi = (\pi_i)_{i\in\Nz}\in\Pi$ is said to be \emph{randomized Markov} if there exists a sequence $(\vphi_i)_{i\in\Nz}$ of stochastic kernels $\vphi_i\in\Phi$ such that $\pi_i(\cdot| h_i) = \vphi_i(\cdot| x_i)\;\; \fa h_i\in H_i, \;i\in\Nz$, \emph{deterministic Markov} if there exists a sequence $(f_i)_{i\in\Nz}$ of functions $f_i\in\mathbb F$ such that $\pi_i(\cdot| h_i) = \delta_{f(x_i)}(\cdot)$, and \emph{deterministic stationary} if there exists a function $f\in\mathbb F$ such that $\pi_i(\cdot| h_i) = \delta_{f(x_i)}(\cdot)$.

234:

235: 		%We note that the process $\bigl(\Omega, \sigalg, \mathsf P_\nu^\pi, (x_i)_{i\in\Nz}\bigr)$ is not necessarily Markovian due to the dependence on the entire history $h_i$ in~\eqref{e:actiontrans}; however, if $(\pi_i)_{i\in\Nz}$ is restricted to randomized Markov policies, then $(x_i)_{i\in\Nz}$ is a Markov process, as established in~\cite[Proposition~2.3.5]{ref:hernandez-lerma1}.

236:

237: 		The transition kernel $Q$ in~\eqref{e:statetrans} under a policy $\pi \Let (\vphi_i)_{i\in\Nz}\in\Pi_{RM}$ is given by $\bigl(Q(\cdot|\cdot, \vphi_i)\bigr)_{i\in\Nz}$, defined as $\Borelsigalg{X}\times X\ni (B, x)\mapsto Q(B|x, \vphi_i(x)) \Let \int_{A(x)}\vphi_i(\mrm da|x) Q(B|x, a)$. Occasionally we suppress the dependence of $\vphi_i$ on $x$ and write $Q(B|x, \vphi_i)$ in place of $Q(B|x, \vphi_i(x))$. The cost-per-stage function at the $j$-th stage under a policy $(\vphi_i)_{i\in\Nz}$ is written as $c(x_j, \vphi_j) \Let \int_{A(x_j)} \vphi_j(\mrm da|x_j)c(x_j, a)$. We simply write $\vphi^\infty$ and $f^\infty$, respectively, for the policies $(\vphi, \vphi, \ldots)\in\Pi_{RS}$ and $(f, f, \ldots)\in \Pi_{DS}$, where $\Pi_{RS}$ denotes the class of randomized stationary policies.

238:

239: 		%\begin{proposition}[{\cite[Proposition~2.3.5]{ref:hernandez-lerma1}}]

240: 		%	\label{p:Markovprop}

241: 		%	Let $\nu$ be an arbitrary initial distribution. If $\pi = (\vphi_i)_{i\in\Nz}$ is a randomized or deterministic Markov policy, then $(x_i)_{i\in\Nz}$ is a inhomogeneous Markov process with transition kernel $(Q(\cdot| \cdot, \vphi_i)$ at the $i$-th step. In particular, if $\pi = \vphi^\infty$ and $\pi = f^\infty$ are a stationary randomized and a deterministic stationary policy, respectively, then $(x_i)_{i\in\Nz}$ is a time-homogeneous Markov process with corresponding transition kernels $Q(\cdot| \cdot, \vphi)$ and $Q(\cdot| \cdot, f)$ at each step, respectively.

242: 		%\end{proposition}

243:

244: 		Since we shall be exclusively concerned with Markov policies and their subclasses, in the sequel we use the notation $\Pi$ for the class of all randomized Markov strategies.

245:

246: 	\section{Expected Discounted Cost up to the First Exit Time}

247: 	\label{s:EDC}

248: 		Let $K\subset X$ be a measurable set, $x_0 = x\in X$ and let $\tau \Let \inf\bigl\{i\in\Nz\big| x_i\in K\bigr\}$.\footnote{As usual the infimum over an empty set is taken to be $+\infty$.} We note that $\tau$ is an $(\sigalg_i)_{i\in\Nz}$-stopping time. Let us define

249: 		\[

250: 			V(\pi, x) \Let \mathsf E_x^\pi\!\left[\sum_{i=0}^{\tau-1} \alpha^i c(x_i, a_i)\right], \qquad \alpha\in\:]0, 1[,

251: 		\]

252: 		as the \emph{$\alpha$-discounted expected cost} under policy $\pi\in\Pi$ corresponding to the Markov control process $\bigl(\Omega, \sigalg, \mathsf P_\nu^\pi, (x_i)_{i\in\Nz}\bigr)$.\footnote{We employ the standard convention that a summation from a higher to a lower index is defined to be $0$.} Our objective is to minimize $V(\pi,x)$ over a class of control policies $\Pi$, i.e., find the $\alpha$-discount value function

253: 		\begin{align}

254: 			\label{e:problem}

255: 			V^\star(x) \Let \inf_{\pi\in\Pi} V(\pi, x) = \inf_{\pi\in\Pi}\mathsf E_x^\pi\!\left[\sum_{i=0}^{\tau-1} \alpha^i c(x_i, a_i)\right], \qquad \alpha\in\:]0, 1[.

256: 		\end{align}

257: 		A policy that attains the infimum above is said to be \emph{$\alpha$-discount optimal}.

258:

259: 		\begin{remark}

260: 			As mentioned in the introduction, the optimization problem~\eqref{e:problem} with $\alpha = 1$ and the cost-per-stage function $c(x, a) = \indic{X\setmin K}(x)$ is known as the stochastic shortest path problem. The objective of this problem is to drive the state to a desired set ($K$ in our case) as soon as possible, and the expected cost $V_{\text{ssp}}(\pi, x)$ for a policy $\pi$ corresponding to the above cost-per-stage function is readily seen to be $\mathsf E^\pi_x\bigl[\tau\bigr]$. In this light we observe that the minimization problem in~\eqref{e:problem} with the cost-per-stage function $c(x, a) = \indic{X\setmin K}(x)$ can be viewed as a discounted stochastic shortest path problem. It follows immediately that the corresponding expected cost $V_{\text{dssp}}(\pi, x)$ is $\bigl(1-\mathsf E^\pi_x\bigl[\alpha^\tau\bigr]\bigr)/(1-\alpha)$. Note that the minimization of $V_{\text{dssp}}(\pi, x)$ over a class of policies is always well-defined for $\alpha < 1$. Moreover, because of the monotonic behavior of the map $]0, 1[\;\ni\alpha\mapsto \bigl(1-\mathsf E^\pi_x\bigl[\alpha^\tau\bigr]\bigr)/(1-\alpha)$, one may hope to get a good approximation of the original stochastic shortest path problem. However, pathological examples can be constructed to show that a solution to the stochastic shortest path problem may not exist, whereas minimization of $V_{\text{dssp}}(\pi, x)$ is always well defined, although in either case the state may never reach the desired set $K$ almost surely-$\PP^\pi_x$.\RemarkEnd

261: 		\end{remark}

262:

263: 		\begin{remark}

264: 			\label{r:diffc}

265: 			%If we take the particular case of $c(x, a) = \indic{X\setmin K}(x)$, then we obtain the problem of minimizing the time taken by the process $(x_i)_{i\in\Nz}$ to hit the set $K$ for the first time, which is the stochastic shortest time to hit $K$ problem. Also

266: 			Given a cost-per-stage function $c$ on $\mathbb K$, one can redefine it to be $c'(x, a) \Let c(x, a)\indic{X\setmin K}(x)$ to turn the problem~\eqref{e:problem} into the minimization of $\mathsf E_x^\pi\!\left[\sum_{i=0}^\tau \alpha^i c'(x_i, a_i)\right]$ for $\alpha\in\:]0, 1[$. This cost functional can be equivalently written as an infinite horizon cost functional, as in $\mathsf E_x^\pi\!\left[\sum_{i=0}^\infty \alpha^i c'(x_i, a_i)\indic{\{i \le \tau\}}\right]$, or as in $\mathsf E_x^\pi\!\left[\sum_{i=0}^\infty \alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\right]$. However, the absence of a policy that guarantees that $(x_i)_{i\in\Nz}$ stays inside $K$ for all time after $\tau$ necessarily means that the problem~\eqref{e:problem} corresponding to the Markov control model in Definition~\ref{d:mcm} is not equivalent to the minimization of the infinite horizon cost functional $\mathsf E_x^\pi\!\left[\sum_{i=0}^\infty \alpha^i c'(x_i, a_i)\right]$.\RemarkEnd

267: 		\end{remark}

268:

269: 		\begin{prgr}

270: 		\label{pgr:policies}

271: 			\emph{A word about admissible policies.} It is clear at once that the class of admissible policies for the problem~\eqref{e:problem} is different from the classes considered in~\secref{s:prelims}. Indeed, since the process is killed at the stopping time $\tau$, it follows that the class of admissible policies should also be truncated at the stage $\tau-1$. For a given stage $t\in\Nz$ we define the $t$-th policy element $\pi_t$ only on the set $\{t < \tau\}$. Note that with this definition $\pi_t$ becomes a $\sigalg_{t\mn\tau}$-measurable randomized control (in general). It is also immediate from the definition of $\tau$ that if the initial condition $x$ is inside $K$, then the set of admissible policies is empty; indeed, in this case $\tau = 0$, and there is no control needed. In other words, the domain of $\pi_t$ is contained in the ``spatial'' region $\bigl\{(x, a)\in\mathbb K\,\big|\,x\in X\setmin K, a\in A(x)\bigr\}$; since $\pi_t$ is not defined on $K$, this is equivalent to $\pi_t$ being well-defined on $\{t < \tau\}$.

272: 		\end{prgr}

273:

274: 		\begin{prgr}

275: 		\label{pgr:convention}

276: 			\emph{Some re-definitions.} To simplify the formulas from now on we let the cost-per-stage function be defined on $X\setmin K$. With this convention in place our problem~\eqref{e:problem} can be posed as the minimization of $\EE^\pi_x\bigl[\sum_{i=0}^{\tau-1} \alpha^i c(x_{i}, a_{i})\bigr]$ over admissible policies. Also, henceforth we redefine the set $\mathbb K$ of state-action pairs to be $\mathbb K \Let \bigl\{(x, a)\in X\times A\,\big|\,x\in X\setmin K, a\in A(x)\bigr\}$, and we note that this new set is a measurable subset of the original set of state-action pairs. Also, we let $\mathbb F$ be the set of selectors of the set-valued mapping $X\setmin K\ni x\mapsto A(x)\subset A$.

277: 		\end{prgr}

278:

279: 		%\begin{prgr}

280: 		%	\label{prgr:fndef}

281: 		Recall that a function $g:\mathbb K\lra \R$ is said to be \emph{inf-compact on $\mathbb K$} if for every $x\in X$ and $r\in \R$ the set $\bigl\{a\in A(x)\big| g(x, a) \le r\bigr\}$ is compact. A transition kernel $Q$ on a measurable space $X$ given another measurable space $Y$ is said to be \emph{strongly Feller} (or \emph{strongly continuous}) if the mapping $y\mapsto \int_X g(x) Q(\mrm dx| y)$ is continuous and bounded for every measurable and bounded function $g:X\lra\R$. A function $g:\mathbb K\lra\R$ is \emph{lower semicontinuous} (l.s.c.) if for every sequence $(x_j, a_j)_{j\in\N}\subset\mathbb K$ converging to $(x, a)\in\mathbb K$, we have $\liminf_{j\ra\infty} g(x_j, a_j) \ge g(x, a)$; or, equivalently, if for every $r\in\R$, the set $\bigl\{(x, a)\in\mathbb K\big| g(x, a) \le r\bigr\}$ is closed in $\mathbb K$.

282: 		%\end{prgr}

283:

284: 		\begin{assumption}

285: 			\label{a:key}

286: 			In addition to Assumption~\ref{a:basic}, we stipulate that

287: 			\begin{enumerate}[align=right, leftmargin=*, widest=iii, label=(\roman*)]

288: 				\item the set $A(x)$ is compact for every $x\in X$,

289: 				\item the cost-per-stage $c$ is lower semicontinuous, nonnegative, and inf-compact on $\mathbb K$, and

290: 				\item the transition kernel $Q$ is strongly Feller.\AssumptionEnd

291: 			\end{enumerate}

292: 		\end{assumption}

293:

294: 		The following is our main result on expected discounted cost up to the first time $\tau$ to hit $K$; a proof is presented later in this section.

295:

296: 		\begin{theorem}

297: 			\label{t:EDC}

298: 			Suppose that Assumption {\rm \ref{a:key}} holds. Then

299: 			\begin{enumerate}[label=\emph{(\roman*)}, align=right, leftmargin=*, widest=iii]

300: 				\item The $\alpha$-discount value function $V^\star$ is the (positive) minimal measurable solution to the $\alpha$-discounted cost optimality equation ($\alpha$-DCOE)

301: 				\begin{equation}

302: 					\label{e:alphadcoe}

303: 					\xi(x) = \min_{A(x)}\left[c(x, a) + \alpha\int_{X\setmin K} Q(\mrm dy| x, a)\:\xi(y)\right]\qquad \fa x\in X\setmin K.

304: 				\end{equation}

305: 				\item There exists a selector $f_\star\in\mathbb F$ such that $f_\star(x)\in A(x)$, $x\in X\setmin K$, attains the minimum in~\eqref{e:alphadcoe}, i.e.,

306: 				\begin{equation}

307: 					\label{e:alphado}

308: 					V^\star(x) = c(x, f_\star) + \alpha\int_{X\setmin K}Q(\mrm dy| x, f_\star)\:V^\star(y)\qquad \fa x\in X\setmin K,

309: 				\end{equation}

310: 				and the deterministic stationary policy $f_\star^\infty$ is $\alpha$-discount optimal; conversely, if $f_\star^\infty\in\Pi_{DS}$ is $\alpha$-discount optimal, then it satisfies~\eqref{e:alphado}.

311: 				%\item If an $\alpha$-discount optimal policy exists, then there exists one that is deterministic stationary.

312: 			\end{enumerate}

313: 		\end{theorem}

314:

315: 		We observe that Theorem~\ref{t:EDC} does not assert that the optimal value function $V^\star$ is unique in any sense. In~\secref{s:contr} we prove a result (Proposition~\ref{p:Tfp}) under additional hypotheses that guarantees uniqueness of $V^\star$.

316:

317: 		Since we do not assume that the cost-per-stage function $c$ is bounded, a useful approach is to consider the $\alpha$-\emph{value iteration} ($\alpha$-VI) \emph{functions} defined by

318: 		\begin{equation}

319: 		\label{e:VI}

320: 		\begin{cases}

321: 			v_0(x) = 0,\\

322: 			v_n(x) = \displaystyle{\min_{A(x)}\left[c(x, a) + \alpha \int_{X\setmin K} Q(\mrm dy|x, a)\: v_{n-1}(y)\right]},

323: 		\end{cases}

324: 		n\in\N,\;\; x\in X\setmin K.

325: 		\end{equation}

326: 		Of course we have to demonstrate that $V^\star(x) = \lim_{n\ra\infty} v_n(x)$ for all $x\in X\setmin K$.

327:

328: 		The functions $v_n$, $n\in\N$, may be identified with the optimal cost function for the minimization of the process stopped at the $(n-1)\mn(\tau-1)$-th step, i.e.,

329: 		\[

330: 			v_n(x) = \inf_{\pi\in\Pi} \mathsf E_x^\pi\!\left[\sum_{i=0}^{(n-1)\mn(\tau-1)} \alpha^i c(x_i, a_i)\right].

331: 		\]

332: 		To get an intuitive idea, fix a deterministic Markov policy $\pi = (\pi_i)_{i\in\Nz}$, and take the first iterate $v_1$. From~\eqref{e:VI} it is immediately clear that $v_1(x) = \min_{a\in A(x)} c(x, a)$ if $x\not\in K$, and not defined otherwise. For the second iterate, we have

333: 		\begin{align*}

334: 			v_2(x) & = \inf_{\pi\in\Pi} \mathsf E_x^\pi\!\left[\sum_{i=0}^{1\mn (\tau-1)} \alpha^i c(x_i, a_i)\right]\\

335: 			& = \inf_{\pi\in\Pi}\left(c(x, \pi_0(x)) + \alpha\!\int_X \! Q(\mrm d\xi_1|x, \pi_0(x))\indic{X\setmin K}(\xi_1) c(\xi_1, \pi_1)\right).

336: 		\end{align*}

337: 		Note that only those sample paths that do not enter $K$ at the first step contribute to the cost at the second stage. This property is ensured by the indicator function that appears on the right-hand side of the last equality above.

338:

339: 		\begin{example}

340: 			Let $(x_i)_{i\in\Nz}$ be a Markov chain with state-space $X = \{1, 2, \ldots, m\}$ and transition probability matrix $Q = [q_{ij}(a)]_{m\times m}$, where the argument of $q_{ij}$ depicts the dependence on the action $a\in A$ with $A$ being a compact subset of $\R$. Let $K = \{1, 2, \ldots, m'\}$ for $m' < m$, fix $\alpha\in\;]0, 1[$ and let $c(x, a) \Let \indic{X\setmin K}(x)$. Suppose further that $\inf_{a}q_{ij}(a) > 0$ for all $i, j\in X$; this means, in particular, that the target set $K$ cannot be absorbing for any deterministic stationary policy. Our objective is to find an optimal policy corresponding to the minimal cost~\eqref{e:alphado}. The optimal value function $V^\star$ is $0$ on $K$ and for every $i\in \{m'+1, \ldots, m\}$ we have $V^\star(i) = \min_{a\in A(i)}\bigl[\indic{X\setmin K}(i) + \alpha\int_{X\setmin K}Q(\mrm dy|i, a) V^\star(y)\bigr] = 1 + \alpha\min_{a\in A(i)}\sum_{j=m'+1}^m q_{ij}(a) V^\star(j)$. The most elementary case is that of $m' = m - 1$; then $V^\star(m) = 1 + \alpha\min_{a\in A(m)}q_{mm}(a)V^\star(m)$, and given a sufficiently regular function $q_{mm}(\cdot)$ this can be solved at once to get $V^\star(m)$, which characterizes the function (vector) $V^\star$ completely. The optimal policy in this case is $f(m) \in \argmin_{a\in A(m)} q_{mm}(a)V^\star(m)$; if the function $q_{mm}(\cdot)$ is convex, then the minimum is attained on $A$ and thus leads to a unique optimal policy.\ExampleEnd

341: 		\end{example}

342:

343: 		\subsection*{Proof of Theorem~\ref{t:EDC}}

344: 		%\label{s:proofsEDC}

345: 		Recall from paragraph~\ref{pgr:convention} that $c$ is defined on $X\setmin K$, $\mathbb K = \bigl\{(x, a)\in X\times A\,\big|\, x\in X\setmin K, a\in A(x)\bigr\}$ and $\mathbb F$ is the set of selectors of the set-valued map $X\setmin K\ni x\mapsto A(x)\subset A$. We begin with a sequence of Lemmas.

346:

347: 		\begin{lemma}[{\cite[Lemma~4.2.4]{ref:hernandez-lerma1}}]

348: 			\label{l:keyconvergence}

349: 			Let the functions $u:\mathbb K\lra\R$ and $u_i:\mathbb K\lra\R$, $i\in\N$, be l.s.c., inf-compact and bounded below. If $u_i\uparrow u$, then

350: 			\[

351: 				\lim_{i\ra\infty}\min_{A(x)} u_i(x, a) = \min_{A(x)} u(x, a)\qquad \fa x\in X.

352: 			\]

353: 		\end{lemma}

354:

355: 		\begin{lemma}[Adapted from \cite{ref:riederselectors}]

356: 			\label{l:basicselector}

357: 			Suppose that

358: 			\begin{itemize}[label=\textbullet, align=right, leftmargin=*]

359: 				\item $A(x)$ is compact for each $x\in X\setmin K$ and $\mathbb K$ is a measurable subset of $(X\setmin K)\times A$, and

360: 				\item $v:\mathbb K\lra\posR$ is a measurable inf-compact function, $v(x, \cdot)$ is l.s.c.\ on $A(x)$ for each $x\in X\setmin K$.

361: 			\end{itemize}

362: 			Then there exists a selector $f_\star\in\mathbb F$ such that

363: 			\[

364: 				v(x, f_\star(x)) = v^\star(x) \Let \min_{A(x)}v(x, a)\qquad \fa x\in X\setmin K,

365: 			\]

366: 			and $v^\star$ is a measurable function.

367: 		\end{lemma}

368:

369: 		\begin{defn}

370: 			Let $\Lp 0(X\setmin K)^+$ denote the convex cone of nonnegative extended real-valued measurable functions on $X\setmin K$, and for every $u\in \Lp 0(X\setmin K)^+$ let us define the map $T u$ by

371: 			\begin{equation}

372: 				\label{e:Tdef}

373: 				X\setmin K\ni x\mapsto T u(x) \Let \inf_{A(x)}\left[c(x, a) + \alpha\int_{X\setmin K}Q(\mrm dy|x, a)\:u(y)\right].

374: 			\end{equation}

375: 			The map $T$ is the \emph{dynamic programming operator} corresponding to our problem~\eqref{e:problem}.\DefEnd

376: 		\end{defn}

377:

378: 		Having defined the dynamic programming operator $T$ above, it is important to distinguish conditions under which the function $T u$ is measurable for $u\in\Lp 0(X\setmin K)^+$. We have the following lemma.

379:

380: 		\begin{lemma}

381: 			\label{l:selector}

382: 			Under Assumption {\rm \ref{a:key}}, the mapping $T$ in~\eqref{e:Tdef} takes $\Lp 0(X\setmin K)^+$ into itself. Moreover, there exists a selector $f\in \mathbb F$ such that $T u$ defined in~\eqref{e:Tdef} satisfies

383: 			\begin{equation}

384: 				\label{e:Tviaselector}

385: 				T u(x) = c(x, f) + \alpha\int_{X\setmin K} Q(\mrm dy|x, f)\:u(y)\qquad \fa x\in X\setmin K.

386: 			\end{equation}

387: 		\end{lemma}

388: 		\begin{proof}

389: 			Fix $u\in\Lp 0(X\setmin K)^+$. The strong-Feller property of $Q$ on $\mathbb K$ and lower-semicontinuity of the cost-per-stage function $c$ defined on $\mathbb K$ show that the map

390: 			\[

391: 				\mathbb K\ni (x, a)\mapsto T'u(x, a) \Let c(x, a) + \alpha\int_{X}Q(\mrm dy|x, a)\;\indic{X\setmin K}(y) u(y)

392: 			\]

393: 			is lower-semicontinuous.

394: 			%Firstly we observe that the map $\mathbb K\ni (x, a)\mapsto c(x, a)\indic{X\setmin K}(x)\in\posR$ is l.s.c. on $\mathbb K$, which follows simply by restriction of the domain of the l.s.c function $c$.

395: 			%Secondly, the map $\mathbb K\ni (x, a)\mapsto \indic{X\setmin K}(x)\int_{X\setmin K}Q(\mrm dy|x, a)u(y)\in\posR$ is l.s.c. on $\mathbb K$ for every $u\in\Lp 0(X)^+$.  Indeed, fix $u\in\Lp 0(X)^+$, and let $(u_i)_{i\in\N}\subset\Lp 0(X)^+$ be a sequence of bounded measurable functions such that $u_i \uparrow u$. By assumption $Q$ is strongly Feller, and $\indic{X\setmin K}(\cdot)u_i(\cdot)$ is a bounded measurable function on $X$; therefore, the real-valued map

396: 			%\begin{equation}

397: 			%	\label{e:Tviaselector1}

398: 			%	\mathbb K\ni (x, a)\mapsto \int_{X\setmin K} Q(\mrm dy|x, a) u_i(y) = \int_X Q(\mrm dy|x, a)\indic{X\setmin K}(y) u_i(y)

399: 			%\end{equation}

400: 			%is nonnegative and continuous. Fix a sequence $(x_j, a_j)_{j\in\N}\subset\mathbb K$ converging to $(x, a)\in\mathbb K$. Since $u_i \le u$ for every $i$, we have

401: 			%\begin{align*}

402: 			%	\liminf_{j\ra\infty} \int_X Q(\mrm dy|x_j, a_j) \indic{X\setmin K}(y)u(y) & \ge \liminf_{j\ra\infty} \int_X Q(\mrm dy|x_j, a_j)\indic{X\setmin K}(y) u_i(y)\\

403: 			%	& =  \int_X Q(\mrm dy|x, a)\indic{X\setmin K}(y) u_i(y),

404: 			%\end{align*}

405: 			%where the equality above follows from continuity of the map~\eqref{e:Tviaselector1}. Taking the limit as $i\ra\infty$, the monotone convergence theorem shows that

406: 			%\begin{align*}

407: 			%	\liminf_{j\ra\infty} \int_X Q(\mrm dy|x_j, a_j) \indic{X\setmin K}(y) u(y) & \ge \lim_{i\ra\infty}\int_X Q(\mrm dy|x, a)\indic{X\setmin K}(y) u_i(y)\\

408: 			%	& = \int_X Q(\mrm dy|x, a)\indic{X\setmin K}(y) u(y).

409: 			%\end{align*}

410: 			%This shows that $\mathbb K\ni (x, a)\mapsto \int_{X\setmin K} Q(\mrm dy|x, a) u(y)$ is l.s.c., nonnegative, and so is $\mathbb K\ni (x, a)\mapsto \indic{X\setmin K}(x)\int_{X\setmin K}Q(\mrm dy|x, a)u(y)$ by definition of $\mathbb K$.

411: 			%Thirdly, for $u\in\Lp 0(X)^+$ we define the map

412: 			%\begin{align*}

413: 			%	\mathbb K\ni (x, a) \mapsto & T'u(x, a) \Let \\

414: 			%	& \indic{X\setmin K}(x)\left(c(x, a) + \alpha\int_X Q(\mrm dy|x, a)\indic{X\setmin K}(y)u(y)\right)\in\posR,

415: 			%\end{align*}

416: 			%and claim that $T'u$ is an l.s.c.\ function on $\mathbb K$. Indeed, from the first and second claims above we have seen that both the maps $(x, a)\mapsto c(x, a)\indic{X\setmin K}(x)$ and $(x, a)\mapsto \alpha\indic{X\setmin K}(x)\int_{X\setmin K}Q(\mrm dy|x, a)u(y)$ are l.s.c.\ on $\mathbb K$; so the claim follows immediately from the elementary fact that the sum of two l.s.c.\ functions is l.s.c.

417: 			From nonnegativity of $u$ it follows that for every $x\in X\setmin K$ and $r \in \R$,

418: 			\begin{equation}

419: 				\label{e:selector1}

420: 				K' \Let \bigl\{a\in A(x) \big| T'u(x, a) \le r\bigr\}\subset\bigl\{a\in A(x)\big| c(x, a) \le r\bigr\},

421: 			\end{equation}

422: 			and the set $\bigl\{a\in A(x)\big| c(x, a) \le r\bigr\}$ is compact by inf-compactness of $c$. Since by definition $T u(x) = \inf_{A(x)} T'u(x, a)$, by Lemma \ref{l:basicselector} it would follow that a selector $f$ exists such that $T u(x) = T'u(x, f(x))\;\;\fa x\in X\setmin K$ once we verify the hypotheses of this Lemma. For this we only have to verify that $T'u$ is l.s.c.\ (which implies it is measurable) and inf-compact on $\mathbb K$. We have seen above that $T'u$ is a l.s.c.\ function on $\mathbb K$. Therefore, for each $x\in X\setmin K$ the map $T'u(x, \cdot)$ is also l.s.c.\ on $A(x)$. Thus, by definition of lower semicontinuity, the set $K'$ in~\eqref{e:selector1} is closed for every $x\in X\setmin K$ and $r\in\R$. Since a closed subset of a compact set is compact, it follows that $K'$ is compact, which in turn shows inf-compactness of $T'u$ on $\mathbb K$ and proves the assertion.

423: 		\end{proof}

424:

425: 		The following lemma shows how functions $u\in\Lp 0(X\setmin K)^+$ satisfying $u\ge T u$ relate to the optimal value function.

426:

427: 		\begin{lemma}

428: 			\label{l:Tineq}

429: 			Suppose that Assumption {\rm \ref{a:key}} holds. If $u\in \Lp 0(X\setmin K)^+$ is such that $u\ge T u$, then $u\ge V^\star$.

430: 		\end{lemma}

431: 		\begin{proof}

432: 			Suppose $u\in\Lp 0(X\setmin K)^+$ satisfies $u \ge T u$, and let $f$ be a selector (whose existence is guaranteed by Lemma \ref{l:selector}) that attains the infimum in~\eqref{e:Tdef}. Fix $x\in X\setmin K$. We have

433: 			\[

434: 				u(x) \ge T u(x) = c(x, f) + \alpha \int_X Q(\mrm d\xi_1|x, f)\:\indic{X\setmin K}(\xi_1)u(\xi_1).

435: 			\]

436: 			The operator $T$ in~\eqref{e:Tdef} is monotone, for if $u, u'\in \Lp 0(X\setmin K)^+$ are two functions with $u \le u'$, then clearly $T u \le T u'$ due to nonnegativity of $c$. Therefore, iterating the above inequality for a second time we obtain

437: 			\begin{equation*}

438: 			\begin{aligned}

439: 				u(x) & \ge c(x, f) + \alpha \int_X Q(\mrm d\xi_1|x, f)\:\indic{X\setmin K}(\xi_1) c(\xi_1, f)\\

440: 				& \qquad + \alpha^2\int_X Q(\mrm d\xi_1|x, f)\:\indic{X\setmin K}(\xi_1)\int_X Q(\mrm d\xi_2|\xi_1, f)\: \indic{X\setmin K}(\xi_2)u(\xi_2).

441: 			\end{aligned}

442: 			\end{equation*}

443: 			After $n$ such iterations we arrive at

444: 			\[

445: 				u(x) \ge \mathsf E^{f^\infty}_x\!\left[\sum_{i=0}^{(n-1)\mn(\tau-1)} \alpha^i c(x_i, f)\right] + \mathsf E^{f^\infty}_x\bigl[\alpha^{n} u(x_{n})\indic{\{n < \tau\}}\bigr].

446: 			\]

447: 			Since $u\ge 0$, letting $n\ra\infty$ we get

448: 			\[

449: 				u(x) \ge V(f^\infty, x) \ge V^\star(x).

450: 			\]

451: 			Since $x\in X\setmin K$ is arbitrary, the assertion follows.

452: 		\end{proof}

453:

454: 		The next lemma deals with convergence of the value iterations to the optimal value function.

455:

456: 		\begin{lemma}

457: 			\label{l:VIconv}

458: 			Suppose that Assumption {\rm \ref{a:key}} holds. Then $v_n\uparrow V^\star$ on $X\setmin K$, and the function $V^\star$ satisfies the $\alpha$-DCOE \eqref{e:alphadcoe}.

459: 		\end{lemma}

460: 		\begin{proof}

461: 			Note that since $v_n(x) = \inf_{\pi\in\Pi} \mathsf E_x^\pi\left[\sum_{i=0}^{(n-1)\mn(\tau-1)} \alpha^i c(x_i, a_i)\right]$ for $x\in X\setmin K$, it follows that

462: 			\[

463: 				v_n(x) \le \mathsf E_x^\pi\!\left[\sum_{i=0}^{(n-1)\mn(\tau-1)} \alpha^i c(x_i, a_i)\right] \le \mathsf E_x^\pi\!\left[\sum_{i=0}^{\tau-1} \alpha^i c(x_i, a_i)\right],

464: 			\]

465: 			and therefore, taking the infimum over all policies $\pi\in\Pi$ on the right hand side, we get

466: 			\begin{equation}

467: 				\label{e:vnbound}

468: 				v_n(x) \le V^\star(x)\qquad \fa x\in X\setmin K.

469: 			\end{equation}

470: 			Since the cost-per-stage function is nonnegative, $T$ is a monotone operator. Therefore, since $v_0 \Let 0$ and $v_n = T v_{n-1}$ for $n\in\N$, it follows that the $\alpha$-VI functions form a nondecreasing sequence in $\Lp 0(X\setmin K)^+$, which implies that $v_n\uparrow v^\star$ for some function $v^\star \in \Lp 0(X\setmin K)^+$. For $n\in\N$ we define

471: 			\begin{align*}

472: 				\mathbb K\ni (x, a)& \mapsto T'v_n(x, a) \Let c(x, a) + \alpha \int_{X} Q(\mrm dy|x, a)\indic{X\setmin K}(y)v_n(y)\in\R,\\

473: 				\mathbb K\ni (x, a)& \mapsto T'v^\star(x, a) \Let c(x, a) + \alpha \int_{X} Q(\mrm dy|x, a)\indic{X\setmin K}(y) v^\star(y)\in\R.

474: 			\end{align*}

475: 			The monotone convergence theorem guarantees that $T'v_n\uparrow T'v^\star$ pointwise on $\mathbb K$. As in the proof of Lemma~\ref{l:selector} one can establish inf-compactness and lower semicontinuity of $T'v_n$, and $T'v^\star$ on $\mathbb K$. From Lemma~\ref{l:keyconvergence} it now follows that for every $x\in X\setmin K$ we have

476: 			\begin{align*}

477: 				v^\star(x) & = \lim_{n\ra\infty}v_n(x) = \lim_{n\ra\infty} T v_{n-1}(x)\\

478: 				& = \lim_{n\ra\infty} \min_{A(x)}T'v_{n-1}(x, a) = \min_{A(x)} T'v^\star(x, a)\\

479: 				& = T v^\star(x).

480: 			\end{align*}

481: 			This shows that $v^\star$ satisfies the $\alpha$-DCOE, $v^\star = T v^\star$.

482:

483: 			It remains to show that $v^\star = V^\star$. But by Lemma \ref{l:Tineq}, $v^\star = T v^\star$ implies that $v^\star \ge V^\star$ and the reverse inequality follows from~\eqref{e:vnbound} by taking limits as $v^\star = \lim_{n\ra\infty} v_n \le V^\star$.

484: 		\end{proof}

485:

486: 		\begin{lemma}

487: 			\label{l:adcoestat}

488: 			For every deterministic stationary policy $f^\infty$ we have

489: 			\begin{equation}

490: 				\label{e:adcoepolicy}

491: 				V(f^\infty, x) = c(x, f) + \alpha\int_X Q(\mrm dy|x, f)\:\indic{X\setmin K}(y)V(f^\infty, y)\qquad \fa x\in X\setmin K.

492: 			\end{equation}

493: 		\end{lemma}

494: 		\begin{proof}

495: 			Fix a deterministic stationary policy $f^\infty$ and $x\in X\setmin K$. The $\alpha$-discounted cost $V(f^\infty, x)$ corresponding to this policy satisfies, in view of the definition of $\tau$ and the fact that $x\in X\setmin K$,

496: 			\begin{align}

497: 				V(f^\infty, x) & \Let \mathsf E_x^{f^\infty}\!\left[\sum_{i=0}^{\tau-1} \alpha^i c(x_i, f)\right] = \mathsf E_x^{f^\infty}\!\left[ c(x, f) + \sum_{i=1}^{\tau-1} \alpha^i c(x_i, f)\right]\nonumber\\

498: 				& = c(x, f) + \alpha\mathsf E_x^{f^\infty}\!\left[\sum_{i=1}^{\tau-1} \alpha^{i-1} c(x_i, f)\right].\label{e:spolicy1}

499: 			\end{align}

500: 			But then by the Markov property,

501: 			\begin{align*}

502: 				\mathsf E_x^{f^\infty}\!\left[\sum_{i=1}^{\tau-1} \alpha^{i-1} c(x_i, f)\right] & = \mathsf E^{f^\infty}\!\left[\mathsf E^{f^\infty}\!\left[\sum_{i=1}^{\tau-1} \alpha^{i-1} c(x_i, f)\left.\left.\vphantom{\sum_i^\tau}\right|x_{1\mn(\tau-1)}\right]\right|x_0 = x\right]\\

503: 				& = \int_X \indic{X\setmin K}(y) Q(\mrm dy|x, f)\; \mathsf E^{f^\infty}\!\left[\sum_{i=1}^{\tau-1} \alpha^{i-1} c(x_i, f)\left.\vphantom{\sum_i^\tau}\right|x_1 = y\right]\\

504: 				& = \int_X \indic{X\setmin K}(y) Q(\mrm dy|x, f)\; V(f^\infty, y).

505: 			\end{align*}

506: 			This substituted back in~\eqref{e:spolicy1} gives~\eqref{e:adcoepolicy}.

507: 		\end{proof}

508:

509: 		\begin{proof}[Proof of Theorem {\rm \ref{t:EDC}}]

510: 			(i) That $V^\star$ is a solution of the $\alpha$-DCOE follows from Lemma \ref{l:VIconv}, and that $V^\star$ is the minimal solution follows from Lemma~\ref{l:Tineq}, since $u = T u$ implies $u \ge V^\star$.

511:

512: 			(ii) Lemma~\ref{l:selector} guarantees the existence of a selector $f_\star\in\mathbb F$ such that~\eqref{e:alphado} holds. Fix $n\in\N$ and $x\in X\setmin K$. As in the proof of Lemma~\ref{l:Tineq}, iterating equation~\eqref{e:alphado} $n$-times we arrive at

513: 			\begin{align*}

514: 				V^\star(x) & = \mathsf E_x^{f_\star^\infty}\!\left[\sum_{i=0}^{(n-1)\mn(\tau-1)} \alpha^i c(x_i, f_\star)\right] + \mathsf E^{f_\star^\infty}_x\bigl[\alpha^{n} V^\star(x_{n})\indic{\{n < \tau\}}\bigr] \ge \mathsf E_x^{f_\star^\infty}\!\left[\sum_{i=0}^{(n-1)\mn(\tau-1)} \alpha^i c(x_i, f_\star)\right].

515: 			\end{align*}

516: 			By the monotone convergence theorem we have

517: 			\[

518: 				V^\star(x) \ge \lim_{n\ra\infty} \mathsf E_x^{f_\star^\infty}\!\left[\sum_{i=0}^{(n-1)\mn(\tau-1)} \alpha^i c(x_i, f_\star)\right] = \mathsf E_x^{f_\star^\infty}\!\left[\sum_{i=0}^{\tau-1} \alpha^i c(x_i, f_\star)\right],

519: 			\]

520: 			which shows that $V^\star(x) \ge V(f_\star^\infty, x)$, and since $x\in X\setmin K$ is arbitrary, it follows that $V^\star(\cdot) \ge V(f_\star^\infty, \cdot)$. The reverse inequality follows from the definition of $V^\star$ in~\eqref{e:problem}. We conclude that $V^\star(\cdot) = V(f_\star^\infty, \cdot)$, and that $f_\star^\infty$ is an optimal policy.

521:

522: 			For the converse, if $f_\star^\infty$ is an optimal deterministic stationary policy, then by Lemma \ref{l:adcoestat}, equation~\eqref{e:adcoepolicy} becomes

523: 			\[

524: 				V^\star(x) = V(f_\star^\infty, x) = c(x, f_\star) + \alpha \int_X Q(\mrm dy|x, f_\star)\:\indic{X\setmin K}(y)V(f_\star^\infty, y)

525: 			\]

526: 			for $x\in X\setmin K$, which is identical to~\eqref{e:alphado}.

527: 		\end{proof}

528:

529: 	\section{A Contraction Mapping Approach}

530: 	\label{s:contr}

531: 		For the purposes of this section we let $\Lp 0(X\setmin K)$ denote the real vector space of real-valued measurable functions on $X\setmin K$, and $\Lp 0(X\setmin K)^+$ be the convex cone of nonnegative elements of $\Lp 0(X\setmin K)$. (Note that according to paragraph~\ref{pgr:convention} we let the elements of $\Lp 0(X\setmin K)^+$ take the value $+\infty$.) Given a measurable \emph{weight function} $w:X\setmin K\lra[1, \infty[$ in $\Lp 0(X\setmin K)^+$, we define the weighted norm $\norm{u}_w \Let \sup_{x\in X\setmin K} \abs{u(x)}/w(x)$. It is well-known that $\bigl(\Lp 0(X\setmin K), \norm{\cdot}_w\bigr)$ is a Banach space.

532:

533: 		\begin{assumption}

534: 			\label{a:further}

535: 			In addition to Assumption~\ref{a:key}, we require that there exist $\ol c > 0$, $\beta\in[1, 1/\alpha[$, and a measurable weight function $w:X\setmin K\lra[1, \infty[$ such that for every $x\in X\setmin K$

536: 			\begin{enumerate}[label=(\roman*), leftmargin=*, align=right, widest=iii]

537: 				\item $\displaystyle{\sup_{A(x)} c(x, a) \le \ol c w(x)}$;

538: 				\item $\displaystyle{\sup_{A(x)} \int_{X\setmin K} Q(\mrm dy|x, a) w(y) \le \beta w(x)}$.\AssumptionEnd

539: 			\end{enumerate}

540: 		\end{assumption}

541:

542: 		\begin{remark}

543: 			If $c$ is bounded, the weight function $w$ may be taken to be $\indic{X\setmin K}$. Also, if $x$ and $x^+$ are the current and the next states of the Markov control process, respectively, then Assumption~\ref{a:further}(ii) implies that

544: 			\[

545: 				\sup_{A(x)}\mathsf E\bigl[w(x^+)\indic{\{x^+\in X\setmin K\}}\big|(x, a)\bigr] \le \beta w(x)\qquad \fa x\in X\setmin K.

546: 			\]

547: 			We observe that this bears a resemblance with classical Lyapunov-like stability criteria, more specifically, the Foster-Lyapunov conditions~\cite[Chapter~8]{ref:meynCTCN}, \cite{ref:foss04}. However, the condition in Assumption~\ref{a:further}(ii) is uniform over the set of actions $A(x)$ pointwise in $x$. It connects the growth of the cost-per-stage function $c$ with a contraction induced by the discount factor $\alpha$.\RemarkEnd

548: 		\end{remark}

549:

550: 		Recall that a mapping $f:Y\lra Y$ on a nonempty complete metric space $(Y, \rho)$ is a \emph{contraction} if there exists a constant $\gamma\in[0, 1[$ such that $\rho(f(x_1), f(x_2)) \le \gamma\rho(x_1, x_2)$ for all $x_1, x_2\in Y$. The constant $\gamma$ is said to be the \emph{modulus} of the map $f$. A contraction has a unique fixed point $x^\star\in Y$ satisfying $f(x^\star) = x^\star$.

551:

552: 		\begin{proposition}[{\cite[Proposition~7.2.9]{ref:hernandez-lerma2}}]

553: 			\label{p:contr}

554: 			Let $T$ be a monotone map from the Banach space $\bigl(\Lp 0(X\setmin K), \norm{\cdot}_w\bigr)$ into itself. If there exists a $\gamma\in[0, 1[$ such that

555: 			\begin{equation}

556: 				\label{e:contr}

557: 				T(u+rw) \le T(u) + \gamma rw\qquad\text{whenever}\quad u\in\bigl(\Lp 0(X\setmin K), \norm{\cdot}_w\bigr),\quad r\in\R,

558: 			\end{equation}

559: 			then $T$ is a contraction with modulus $\gamma$.

560: 		\end{proposition}

561:

562: 		We have the following lemma.

563: 		\begin{lemma}

564: 			\label{l:Tcontr}

565: 			Under Assumption {\rm \ref{a:further}}, the map $T$ in~\eqref{e:Tdef} is a contraction on $\bigl(\Lp 0(X\setmin K)^+, \norm{\cdot}_w\bigr)$ with modulus $\gamma = \alpha\beta < 1$.

566: 		\end{lemma}

567: 		\begin{proof}

568: 			Fix $u\in\Lp 0(X\setmin K)^+$ with $\norm{u}_w < \infty$. As in the proof of Lemma~\ref{l:selector}, the mapping

569: 			\[

570: 				\mathbb K\ni(x, a)\mapsto T'u(x, a) = c(x, a) + \alpha\int_{X\setmin K} Q(\mrm dy|x, a) u(y)\in\posR

571: 			\]

572: 			is well-defined and l.s.c.\ in $a\in A(x)$ for all $x\in X\setmin K$. By the same Lemma we also know that $T$ maps $\Lp 0(X\setmin K)^+$ into $\Lp 0(X\setmin K)^+$. For every $(x, a)\in\mathbb K$, by Assumption~\ref{a:further},

573: 			\begin{align*}

574: 				\abs{T'u(x, a)} & \le c(x, a) + \alpha\int_{X\setmin K}Q(\mrm dy|x, a) \frac{u(y)}{w(y)} w(y) \le \ol cw(x) + \alpha \norm{u}_w\int_{X\setmin K} Q(\mrm dy|x, a) w(y)\\

575: 				& \le \bigl(\ol c + \alpha\beta\norm{u}_w\bigr) w(x),

576: 			\end{align*}

577: 			which shows that $\norm{T'u}_w \le \ol c + \alpha\beta\norm{u}_w$. Therefore, $T$ maps $\bigl(\Lp 0(X\setmin K)^+, \norm{\cdot}_w\bigr)$ into itself. Since $c\ge 0$, it is clear that $T$ is a monotone map on $\bigl(\Lp 0(X\setmin K)^+, \norm{\cdot}_w\bigr)$. By Assumption~\ref{a:further}(ii), for $r\in\R$ and $x\in X\setmin K$ we have

578: 			\begin{align*}

579: 				T(u+rw)(x) & = \min_{A(x)}\left(c(x, a) + \alpha\int_{X\setmin K} Q(\mrm dy|x, a)\bigl(u(y) + rw(y)\bigr)\right)\\

580: 					& \le \min_{A(x)}\left(c(x, a) + \alpha\int_{X\setmin K} Q(\mrm dy|x, a)u(y)\right) + r\alpha\beta w(x)\\

581: 					%& \qquad\qquad + \alpha r\left.\indic{X\setmin K}(x)\int_{X\setmin K} Q(\mrm dy|x, a) w(y)\right)\\

582: 					& \le Tu(x) + r\alpha\beta w(x).

583: 			\end{align*}

584: 			This shows that~\eqref{e:contr} holds with $\gamma = \alpha\beta$, and Proposition~\ref{p:contr} implies that $T$ is a contraction on $\bigl(\Lp 0(X\setmin K)^+, \norm{\cdot}_w\bigr)$.

585: 		\end{proof}

586:

587: 		The following proposition establishes bounds for the distance between the optimal value function $V^\star$ and the $\alpha$-VI functions $(v_n)_{n\in\Nz}$ by employing the contraction mapping $T$ of Lemma~\ref{l:Tcontr}.

588:

589: 		\begin{proposition}

590: 			\label{p:Tfp}

591: 			Suppose that Assumption {\rm \ref{a:further}} holds, and let $\gamma \Let \alpha\beta$. Then:

592: 			\begin{enumerate}[label=\emph{(\roman*)}, align=right, leftmargin=*, widest=iii]

593: 				\item The $\alpha$-discount value function $V^\star$ satisfies $\norm{V^\star}_w \le \ol c/(1-\gamma)$.

594: 				\item The $\alpha$-VI functions $(v_n)_{n\in\Nz}$ satisfy

595: 				\[

596: 					V^\star(x) - v_n(x) \le \ol c w(x)\left(\frac{\gamma^n}{1-\gamma}\right)\qquad \fa x\in X\setmin K, \quad \fa n\in\N.

597: 				\]

598: 				In particular, $\norm{v_n - V^\star}_w \le \ol c\gamma^n/(1-\gamma)\;\;\; \fa n\in\Nz$.

599: 				\item The optimal value function $V^\star$ is the unique function in $\bigl(\Lp 0(X\setmin K)^+, \norm{\cdot}_w\bigr)$ that solves the $\alpha$-DCOE~\eqref{e:alphadcoe}.

600: 			\end{enumerate}

601: 		\end{proposition}

602: 		\begin{proof}

603: 			(i) Let $\pi$ be an arbitrary Markov policy. Trivially we have $\mathsf E^\pi_x\bigl[w(x_0)\bigr] \le w(x)$. Fix $i\in\N$, and a history $h_i\in\sigalg_{i\mn\tau}$. In view of Assumption~\ref{a:further}(ii), on the event $\{i < \tau\}$ we have

604: 			\begin{align*}

605: 				\mathsf E^\pi_x\bigl[w(x_i)\big|h_{i-1}, a_{i-1}\bigr] = \int_{X\setmin K} Q(\mrm dy|x_{i-1}, a_{i-1}) w(y) \le \beta w(x_{i-1})\quad \fa a_{i-1}\in A(x_{i-1}),

606: 			\end{align*}

607: 			which shows that $\mathsf E^\pi_x\bigl[w(x_i)\indic{\{i < \tau\}}\bigr] \le \beta \mathsf E^\pi_x\bigl[w(x_{i-1})\indic{\{i-1 < \tau\}}\bigr]$. Iterating this inequality we arrive at $\mathsf E^\pi_x\bigl[w(x_i)\indic{\{i < \tau\}}\bigr] \le \beta^i w(x)$.  Also, by Assumption~\ref{a:further}(i) we have $c(x_i, a_i) \le \ol c w(x_i)$ for all $i\in\Nz$ such that $i < \tau$, which in conjunction with the above inequality gives

608: 			\begin{equation}

609: 			\label{e:Tfp1}

610: 				\mathsf E^\pi_x\bigl[c(x_i, a_i)\indic{\{i < \tau\}}\bigr] \le \ol c\beta^i w(x).

611: 			\end{equation}

612: 			By the monotone convergence theorem and~\eqref{e:Tfp1} we have

613: 			\begin{equation}

614: 			\label{e:Tfp2}

615: 			\begin{aligned}

616: 				V(\pi, x) & = \mathsf E^\pi_x\!\left[\sum_{i=0}^\infty \alpha^i c(x_i, a_i) \indic{\{i < \tau\}}\right] \le \sum_{i=0}^\infty \alpha^i \mathsf E^\pi_x\bigl[c(x_i, a_i)\indic{\{i < \tau\}}\bigr]\\

617: 				& \le \ol c\sum_{i=0}^\infty (\alpha\beta)^i w(x) \le w(x)\cdot\frac{\ol c}{1-\gamma}.

618: 			\end{aligned}

619: 			\end{equation}

620: 			It follows immediately that $\norm{V^\star}_w = \norm{\inf_{\Pi} V(\pi, x)}_w \le \ol c/(1-\gamma)$.

621:

622: 			(ii) By definition, the $\alpha$-VI functions $(v_n)_{n\in\Nz}$ satisfy $v_n = T v_{n-1} = T^n v_0$, with $v_0 \Let 0$. Since $T$ is a contraction on $\bigl(\Lp 0(X\setmin K)^+, \norm{\cdot}_w\bigr)$ by Lemma \ref{l:Tcontr}, it follows that $T$ has a unique fixed point, which, by definition is $V^\star$, since $\norm{V^\star}_w < \infty$ by (i). A standard property of contraction maps implies that

623: 			\[

624: 				\norm{T^n u - V^\star}_w \le \gamma^n\norm{u - V^\star}_w \qquad\fa u\in\Lp 0(X\setmin K)^+, \norm{u}_w < \infty,\quad \fa n\in\Nz.

625: 			\]

626: 			With the bound on $\norm{V^\star}_w$ obtained in (i), we get $\norm{v_n - V^\star}_w \le \ol c\cdot\gamma^n/(1-\gamma)$. Since $T$ is also a contraction on $\bigl(\Lp 0(X\setmin K)^+, \norm{\cdot}_w\bigr)$, $v_n|_{K} = 0$, and $v_n\uparrow V^\star$, the last inequality yields $V^\star(x) - v_n(x) \le \ol cw(x)\gamma^n/(1-\gamma)$ for every $x\in X\setmin K$.

627:

628: 			(iii) Of course $V^\star$ solves the $\alpha$-DCOE~\eqref{e:alphadcoe}. Uniqueness follows from the facts that the operator $T$ in~\eqref{e:Tdef} is a contraction by Lemma \ref{l:Tcontr}, and that the fixed point of a contraction mapping in a Banach space (or more generally, in a complete metric space) is unique.

629: 		\end{proof}

630:

631: 		Note that the conditions in Assumption~\ref{a:further} are automatic if $c$ is bounded. This gives the following straightforward result.

632: 		\begin{corollary}

633: 			\label{c:Tfp}

634: 			Suppose that Assumption {\rm \ref{a:key}} holds, and $\wt c \Let \sup_{\mathbb K}c(x, a) < \infty$. Then:

635: 			\begin{enumerate}[label=\emph{(\roman*)}, align=right, leftmargin=*, widest=iii]

636: 				\item The $\alpha$-discount value function $V^\star$ satisfies $\norm{V^\star} \le \wt c/(1-\alpha)$.

637: 				\item The $\alpha$-VI functions $(v_n)_{n\in\Nz}$ satisfy

638: 				\[

639: 					V^\star(x) - v_n(x) \le \wt c \left(\frac{\alpha^n}{1-\alpha}\right)\qquad \fa x\in X\setmin K,\quad \fa n\in\N.

640: 				\]

641: 				In particular, $\norm{v_n - V^\star} \le \wt c\:\alpha^n/(1-\alpha)\;\;\; \fa n\in\Nz$.

642: 				\item The optimal value function $V^\star$ is the unique function in $\bigl(\Lp 0(X\setmin K)^+, \norm{\cdot}_w\bigr)$ that solves the $\alpha$-DCOE~\eqref{e:alphadcoe}.

643: 			\end{enumerate}

644: 		\end{corollary}

645:

646: 	\section{Asymptotic Discount Optimality of the $\alpha$-VI Policy}

647: 	\label{s:ado}

648: 		We have seen that the $\alpha$-value iteration functions $(v_n)_{n\in\Nz}$ defined in~\eqref{e:VI} converge to $V^\star$ by Lemma~\ref{l:VIconv}. In this section we address the question whether the $\alpha$-VI policies converge in some sense to a policy $f_\star^\infty$ as $n\ra\infty$.

649:

650: 		\begin{defn}

651: 			\label{d:avip}

652: 			Let $(v_n)_{n\in\Nz}$ be the sequence of $\alpha$-VI functions in~\eqref{e:VI}, and let $\wh\pi = (\wh f_n)_{n\in\Nz}\in\Pi_{DM}$ be a deterministic Markov policy such that $\wh f_0\in\mathbb F$ is arbitrary, and for $n\in\N$,

653: 			\[

654: 				v_n(x) = c\bigl(x, \wh f_n\bigr) + \alpha \int_{X\setmin K}Q\bigl(\mrm dy\big|x, \wh f_n\bigr) v_{n-1}(y)\qquad \fa x\in X\setmin K.

655: 			\]

656: 			Then $\wh\pi$ is called an \emph{$\alpha$-VI policy}.\DefEnd

657: 		\end{defn}

658:

659: 		Under Assumption~\ref{a:key} we get the following basic existential result.

660:

661: 		\begin{proposition}

662: 			\label{p:ado}

663: 			Suppose that Assumption {\rm \ref{a:key}} holds, the action space $A$ is locally compact, and let $\wh\pi = \bigl(\wh f_n\bigr)_{n\in\Nz}\in\Pi_{DM}$ be an $\alpha$-VI policy as defined in Definition~\ref{d:avip}. Then there exists a selector $\wh f\in\mathbb F$ such that for every $x\in X\setmin K$, $\wh f(x)\in A(x)$ is an accumulation point of $\bigl(\wh f_n(x)\bigr)_{n\in\Nz}$, and the corresponding deterministic stationary policy $\wh f^\infty\in\Pi_{DS}$ is $\alpha$-discount optimal.

664: 		\end{proposition}

665:

666: 		The proof is based on the following immediate adaptation of \cite[Lemma 4.6.6]{ref:hernandez-lerma1}.

667: 		\begin{lemma}

668: 			\label{l:ado}

669: 			Let $u$ and $u_n$, $n\in\N$, be l.s.c.\ functions, bounded below, and inf-compact on $\mathbb K$. For every $n\in\N$ let $u_n^\star(x) \Let \min_{A(x)} u_n(x, a)$ and $u^\star(x) \Let \min_{A(x)} u(x, a)$, let $\wh f_n\in\mathbb F$ be a selector such that $u_n^\star(x) = u_n\bigl(x, \wh f_n(x)\bigr)$ for all $x\in X\setmin K$. If $A$ is locally compact and $u_n\uparrow u$, then there exists a selector $\wh f\in\mathbb F$ such that $\wh f(x)\in A(x)$ is an accumulation point of the sequence $\bigl(\wh f_n(x)\bigr)_{n\in\N}$ for every $x\in X\setmin K$, and $u^\star(x) = u\bigl(x, \wh f(x)\bigr)$.

670: 		\end{lemma}

671:

672: 		\begin{proof}[Proof of Proposition {\rm \ref{p:ado}}]

673: 			For $(x, a)\in\mathbb K$ we define $u(x, a) \Let c(x, a) + \alpha \int_{X\setmin K} Q(\mrm dy|x, a) V^\star(y)$, and

674: 			\begin{equation}

675: 				\label{e:unconv}

676: 				u_n(x, a) \Let c(x, a) + \alpha \int_{X\setmin K} Q(\mrm dy|x, a) v_{n-1}(y).

677: 			\end{equation}

678: 			Since $c\ge 0$, the functions $u_n$ and $u$ are nonnegative. Since $v_n\uparrow V^\star$ by Lemma~\ref{l:VIconv}, the monotone convergence theorem implies that

679: 			\[

680: 				\int_{X\setmin K}Q(\mrm dy|x, a) v_n(y) \lra \int_{X\setmin K} Q(\mrm dy|x, a) V^\star(y)

681: 			\]

682: 			pointwise on $\mathbb K$. It is clear that $u_n\uparrow u$, and the assertion follows at once from Lemma~\ref{l:ado}.

683: 		\end{proof}

684:

685: 		Under the stronger Assumption~\ref{a:further} we get quantitative estimates of the rate at which the $\alpha$-VI policy defined in Definition~\ref{d:avip} converges to an optimal one.

686:

687: 		\begin{defn}

688: 			The function $D:\mathbb K\lra\posR$ defined by

689: 			\[

690: 				\mathbb K\ni (x, a) \mapsto D(x, a) \Let c(x, a) + \alpha\int_{X\setmin K} Q(\mrm dy|x, a) V^\star(y) - V^\star(x)

691: 			\]

692: 			is called the \emph{$\alpha$-discount discrepancy function}. The $\alpha$-VI policy $\wh\pi = \bigl(\wh f_n\bigr)_{n\in\Nz}$ defined in Definition~\ref{d:avip} is called \emph{pointwise asymptotically discount optimal} if for every $x\in X\setmin K$ we have $\lim_{n\ra\infty} D\bigl(x, \wh f_n\bigr) = 0$.\DefEnd

693: 		\end{defn}

694:

695: 		It is clear that for a selector $f\in\mathbb F$ (see paragraph~\ref{pgr:convention}), the $\alpha$-discount discrepancy function satisfies $D(x, f(x)) = 0$ for every $x\in X\setmin K$ if and only if $f^\infty$ is an optimal policy. The function $D$ measures closeness to an optimal selector in a weak sense.

696:

697: 		\begin{proposition}

698: 			Suppose that Assumption {\rm \ref{a:further}} holds, and let $\gamma \Let \alpha\beta$. Then the $\alpha$-VI policy $\wh\pi = \bigl(\wh f_n\bigr)_{n\in\Nz}$ is pointwise asymptotically discount optimal, and for every $x\in X\setmin K$ and $n\in\N$,

699: 			\[

700: 				0 \le D\bigl(x, \wh f_n\bigr) \le 2\ol c\left(\frac{\gamma^{n+1}}{1-\gamma}\right) w(x).

701: 			\]

702: 		\end{proposition}

703: 		\begin{proof}

704: 			The first inequality follows directly from the definition of $V^\star$. To prove the second inequality fix $x\in X\setmin K$. We see that by the definition of the discrepancy function,

705: 			\begin{equation}

706: 			\label{e:adiscop1}

707: 			\begin{aligned}

708: 				D\bigl(x, \wh f_n\bigr) & = c\bigl(x, \wh f_n\bigr) + \alpha \int_{X\setmin K} Q\bigl(\mrm dy\big|x, \wh f_n\bigr) V^\star(y) - V^\star(x)\\

709: 				& = \bigl(v_{n+1}(x) - V^\star(x)\bigr) + \alpha\int_{X\setmin K} Q\bigl(\mrm dy\big|x, \wh f_n\bigr)\bigl(V^\star(y) - v_{n}(y)\bigr).

710: 			\end{aligned}

711: 			\end{equation}

712: 			By Proposition~\ref{p:Tfp}(ii) we have

713: 			\begin{equation}

714: 			\label{e:adiscop2}

715: 				\abs{v_{n+1}(x) - V^\star(x)} \le \ol cw(x) \frac{\gamma^{n+1}}{1-\gamma},

716: 			\end{equation}

717: 			and in the light of Assumption~\ref{a:further}(ii) we arrive at

718: 			\begin{equation}

719: 			\label{e:adiscop3}

720: 			\begin{aligned}

721: 				\int_{X\setmin K}Q\bigl(\mrm dy\big|x, \wh f_n\bigr)\bigl(V^\star(y) - v_{n}(y)\bigr) & \le \ol c\,\frac{\gamma^{n}}{1-\gamma}\int_{X\setmin K}Q\bigl(\mrm dy\big|x, \wh f_n\bigr) w(y)\\

722: 				& \le \ol c\,\frac{\gamma^{n}}{1-\gamma}\beta w(x).

723: 			\end{aligned}

724: 			\end{equation}

725: 			The assertion follows immediately after substituting~\eqref{e:adiscop2} and~\eqref{e:adiscop3} in~\eqref{e:adiscop1}.

726: 		\end{proof}

727:

728: 		For bounded costs we have the following straightforward conclusion.

729:

730: 		\begin{corollary}

731: 			Suppose that Assumption {\rm \ref{a:key}} holds, and $\wt c \Let \sup_{\mathbb K}c(x, a) < \infty$. Then the $\alpha$-VI policy $\wh\pi = \bigl(\wh f_n\bigr)_{n\in\Nz}$ is pointwise asymptotically discount optimal, and for every $x\in X\setmin K$ and $n\in\N$,

732: 			\[

733: 				0 \le D\bigl(x, \wh f_n\bigr) \le 2\wt c\:\left(\frac{\alpha^{n+1}}{1-\alpha}\right).

734: 			\]

735: 		\end{corollary}

736:

737: 	\section{Average cost of recovery}

738: 		As mentioned in \secref{s:intro}, a motivation for this work was to come up with a suitable recovery strategy for MPC. Tracing our development of the MPC methodology in \secref{s:intro}, one sees that in the presence of state and/or action constraints, one seeks a deterministic stationary policy $g_\star^\infty$ that is active whenever the state is inside the safe set $K$, and a recovery strategy outside $K$. Let us assume that for a given problem we have determined such a policy, and we have also determined a deterministic stationary policy $f_\star^\infty$ corresponding to the recovery strategy corresponding to a cost-per-stage function defined on $X\setmin K$ for the same problem as described in the preceding sections. One of the natural questions at this stage is whether one can find estimates of the average cost of recovery.%To wit, what is the average cost of the excursions of the state of the process outside $K$ that is incurred when the policies $g_\star^\infty$ and $f_\star^\infty$ are applied inside and outside $K$, respectively?

739:

740: 		To this end let us define two constants:

741: 		\begin{equation}

742: 		\label{e:betadef}

743: 			\beta_1 \Let \inf_{x\in K}\int_{X\setmin K} Q(\mrm dy|x, g_{\star}) V^\star(y),\quad  \beta_2 \Let \sup_{x\in K}\int_{X\setmin K} Q(\mrm dy|x, g_{\star}) V^\star(y),

744: 		\end{equation}

745: 		where $V^\star$ is as defined in \eqref{e:problem}. Let $\wt f^\infty$ be the deterministic stationary policy defined by

746: 		\begin{equation}

747: 		\label{e:overallpolicy}

748: 			\wt f(x) \Let f_\star(x)\indic{X\setmin K}(x) + g_\star(x)\indic{K}(x);

749: 		\end{equation}

750: 		to wit, $\wt f^\infty$ consists of the concatenation of $f_\star^\infty$ and $g_\star^\infty$ between exit and entry times to $K$. We have the following result:

751:

752: 		\begin{proposition}

753: 			Let $g_\star^\infty$ be a deterministic stationary policy that is active whenever the state is inside the set $K$, and let $f_\star^\infty$ be a recovery strategy corresponding to the problem \eqref{e:problem}. Let the initial condition $x$ be in $X\setmin K$. We define the average cost of recovery

754: 			\[

755: 				\wt W(x) \Let \lim_{n\to\infty} \frac{1}{n+1} \EE^{\wt f^\infty}_x\Biggl[\sum_{i=0}^{n}\sum_{t=\tau_{2i}}^{\tau_{2i+1}-1} \alpha^{t-\tau_{2i}} c(x_t, a_t)\Biggr],

756: 			\]

757: 			where $\wt f^\infty$ is as defined in \eqref{e:overallpolicy}, $\tau_0 \Let 0$, $\tau_1$ is the first entry time to $K$, $\tau_2$ is the first exit time from $K$ after $\tau_1$, and so on. Suppose that from any initial condition in $X\setmin K$ the first hitting time of $K$ is finite almost surely under $f_\star^\infty$, and from any initial condition in $K$ the first hitting time of $X\setmin K$ is finite almost surely under $g_\star^\infty$. Then we have $\beta_1 \le \wt W(x) \le \beta_2$, where $\beta_1, \beta_2$ are as defined in \eqref{e:betadef}.

758: 		\end{proposition}

759: 		Note that an identical bound holds if the initial condition $x\in K$, with an obvious relabelling of the stopping times $(\tau_i)_{i\in\Nz}$.

760: 		\begin{proof}

761: 			First of all, note that the policy $\wt f^\infty$ is deterministic stationary, and under this policy the controlled process is stationary Markov. Now we have for a fixed $n\in \N$:

762: 			\begin{equation}

763: 			\label{e:costofrecovery}

764: 			\begin{aligned}

765: 				\EE^{\wt f^\infty}_x & \Biggl[\sum_{i=0}^n \sum_{t=\tau_{2i}}^{\tau_{2i+1}-1} \alpha^{t-\tau_{2i}} c(x_t, a_t)\Biggr] = \sum_{i=0}^n \EE^{\wt f^\infty}_x\Biggl[\sum_{t=\tau_{2i}}^{\tau_{2i+1}-1} \alpha^{t-\tau_{2i}} c(x_t, a_t)\Biggr]\\

766: 				& = \sum_{i=0}^n \EE^{\wt f^\infty}_x\Biggl[\EE^{f_\star^\infty}\Biggl[\sum_{t=\tau_{2i}}^{\tau_{2i+1}-1} \alpha^{t-\tau_{2i}} c(x_t, a_t)\Bigg|\sigalg_{\tau_{2i}}\Biggr]\Biggr] = \sum_{i=0}^n \EE^{\wt f^\infty}_x\Bigl[\EE^{f_\star^\infty}\bigl[V^\star(x_{\tau_{2i}})\big|\sigalg_{\tau_{2i}}\bigr]\Bigr],

767: 			\end{aligned}

768: 			\end{equation}

769: 			where the first equality follows from monotone convergence and the last equality from the strong Markov property. Appealing to the strong Markov property once again we see that $\EE^{f_\star^\infty}\bigl[V^\star(x_{\tau_{2i}})\big|\sigalg_{\tau_{2i}}\bigr] = \EE^{f_\star^\infty}\bigl[V^\star(x_{\tau_{2i}})\big|x_{\tau_{2i}}\bigr]$. Finally, from the definition of $\tau_{2i}$ it follows that

770: 			\[

771: 				\EE^{f_\star^\infty}\bigl[V^\star(x_{\tau_{2i}})\big|x_{\tau_{2i}}\bigr] \le \sup_{\xi\in K}\int_X Q(\mrm dy|\xi, g_\star) V^\star(y)\indic{X\setmin K}(y) = \beta_2.

772: 			\]

773: 			It is not difficult to arrive at the lower bound $\EE^{f_\star^\infty}\bigl[V^\star(x_{\tau_{2i}})\big|x_{\tau_{2i}}\bigr]\ge \beta_1$ by following the same steps as above. Substituting in~\eqref{e:costofrecovery} and taking limits we arrive at the assertion.

774: 		\end{proof}

775:

776: 	\section{A Rolling Horizon Implementation}

777: 	\label{s:rh}

778: 		The \emph{rolling-horizon} procedure can be briefly described as follows. Fix a horizon $N\in\N$ and set $n = 0$. Then

779: 		\begin{enumerate}[label=(\alph*), leftmargin=*, align=right]

780: 			\item we determine an optimal control policy, say $\pi^\star_{n:n+N}$, for the $(N+1)$-period cost function starting from time $n$, given the (perfectly observed) initial condition $x_n$; standard arguments lead to a realization of this policy as a sequence of $(N+1)$ selectors $\bigl\{\wh f_{n, n+N-j}\big|j=n, n+1, \ldots, n+N\bigr\}$;% and we define $\wh f_N \Let f_{n, N}$;

781: 			\item we increase $n$ to $n+1$, and go back to step (a).

782: 		\end{enumerate}

783: 		Accordingly, the $n$-th step of this procedure consists of minimizing the stopped $(N+1)$-period cost function starting at time $n$, namely, the objective is to find a control policy that attains

784: 		\begin{equation}

785: 		\label{e:rhcf}

786: 		\begin{aligned}

787: 			\inf_{\pi\in\Pi} V_{n, n+N}(\pi, x) \Let \;\inf_{\pi\in\Pi}\mathsf E^\pi\!\left[\left.\sum_{i=n\mn(\tau-1)}^{(n+N)\mn(\tau-1)} \alpha^{i-n\mn(\tau-1)} c(x_i, a_i)\right|x_{n\mn(\tau-1)} = x\right]

788: 		\end{aligned}

789: 		\end{equation}

790: 		for $x\in X\setmin K$. By stationarity and Markovian nature of the control model, it is enough to consider the control problem of minimizing the cost for $n = 0$, i.e., the problem of minimizing $V_{0, N}(\pi, x)$ over $\pi\in\Pi$. The corresponding policy $\pi$ is given by the policy that minimizes the $(N+1)$-stage $\alpha$-VI function $v_{N+1}$ in~\eqref{e:VI}. This particular policy is realized as a sequence of $(N+1)$ selectors $\bigl(\wh f_N, \ldots, \wh f_0\bigr)$. Thus, in the light of the above discussion, the rolling-horizon procedure yields the stationary suboptimal control policy $\wh\pi \Let \wh f_N^\infty$ for the original problem~\eqref{e:problem}.

791:

792: 		Let $V\bigl(\wh f_N^\infty, x\bigr)$ be the value function corresponding to the deterministic stationary policy $\wh f_N^\infty \Let \bigl(\wh f_N, \wh f_N, \ldots\bigr)$, $x\in X\setmin K$. Observe that $\norm{V\bigl(\wh f_N^\infty, x\bigr)}_w < \infty$, which follows from the more general estimate in~\eqref{e:Tfp2}. Our objective in this section is to give quantitative estimates of the extent of sub-optimality of the rolling-horizon policy $\wh\pi$, compared to the optimal policy $\pi^\star$ that attains the infimum in~\eqref{e:problem}. We shall follow the notations of~\secref{s:contr} above.

793:

794: 		\begin{theorem}

795: 			\label{t:rh}

796: 			Suppose that Assumption {\rm \ref{a:further}} holds, and let $\gamma \Let \alpha\beta$. For every $N\in\Nz$ and $x\in X\setmin K$ we have

797: 			\begin{equation}

798: 			\label{e:keyrh}

799: 				0 \le V\bigl(\wh f_N^\infty, x\bigr) - v_{N+1}(x) \le \ol c w(x) \left(\frac{\gamma^{N+1}}{1-\gamma}\right),

800: 			\end{equation}

801: 			where $v_{N+1}$ is the $(N+1)$-th $\alpha$-VI function defined in~\eqref{e:VI}. In particular,

802: 			\begin{equation}

803: 			\label{e:sloppyrh}

804: 				V\bigl(\wh f_N^\infty, x\bigr) - V^\star(x) \le \ol c w(x) \left(\frac{\gamma^{N+1}}{1-\gamma}\right).

805: 			\end{equation}

806: 		\end{theorem}

807:

808: 		A proof of Theorem~\ref{t:rh} is given in the Appendix; it follows the arguments in~\cite[Theorem~1]{ref:aldenRH} for finite state-space Markov decision processes and bounded costs. It is of interest to note that the bound in~\eqref{e:keyrh} is identical to the bound between $V^\star(\cdot)$ and $v_{N+1}(\cdot)$ that appears in Proposition~\ref{p:Tfp}.

809:

810: 		If the cost-per-stage function $c$ is bounded on $\mathbb K$, we have the following immediate corollary:

811: 		\begin{corollary}

812: 			Suppose the Markov control process satisfies Assumption {\rm \ref{a:key}}. Let the cost-per-stage function $c:\mathbb K\lra\posR$ be bounded, with $\wt c \Let \sup_{\mathbb K} c(x, a) < \infty$. Then $V\bigl(\wh f_N^\infty, x\bigr) \ge V^\star(x)$ for every $x\in X\setmin K$, and

813: 			\[

814: 				\sup_{x\in X\setmin K}\left(V\bigl(\wh f_N^\infty, x\bigr) - V^\star(x)\right) \le \frac{\wt c\cdot\alpha^{N+1}}{1-\alpha}.

815: 			\]

816: 		\end{corollary}

817:

818: 	\section{Application}

819: 	\label{s:appl}

820: 		In this section we give a numerical example concerning fishery management. The example is motivated by~\cite[Chapter~7]{hastings1989introduction}. The example considers a fishery modeled in discrete-time with the time period representing a fishing season. The state of the controlled Markov chain is the population of the fish species of interest. Fishermen might on the one hand want to harvest all that they can manage in order to increase their short-run profit, but on the other hand this might lead to very low levels of the population.  Our goal is to design a recovery strategy for the case that the population gets over-fished and goes below a critical level.

821:

822:         For doing so, we consider a simple model, with four possible fish population levels, 1 (almost extinct), 2, 3, and 4 (the target set). We assume that we can accurately measure the population size at the beginning of each season $k$, $X_k$. During a season the following set of actions are available: {Harvest (1), Harvest less (2), Do nothing (3), Import fish (4), Import less (5)}. We also take as given the following transition probabilities between the Markov States, where $T_a(i,j)$ denotes the probability that the population level at the beginning of the next season will be $j$, given that the current population is $i$ and action $a$ is applied during this season.

823:

824:         \begin{alignat*}{2}

825:         T_1 & = \begin{bmatrix} 1 & 0 & 0 &0\\

826:         0.7 & 0.3 & 0 & 0\\

827:         0.1 & 0.6 & 0.3 & 0\\

828:         - & - & - & - \end{bmatrix} & \quad &

829:         T_2 = \begin{bmatrix} 1 & 0 & 0 & 0\\

830:         0.35 & 0.65 & 0 & 0\\

831:         0.04 & 0.5 & 0.46 & 0\\

832:         - & - & - & - \end{bmatrix}\\

833:         T_3 & = \begin{bmatrix} 0.99 & 0.01 & 0 &0\\

834:         0.01 & 0.7 & 0.28 & 0.01\\

835:         0 & 0.03 & 0.65 & 0.32\\

836:         - & - & - & - \end{bmatrix} & \quad &

837:         T_4 = \begin{bmatrix} 0.4 & 0.6 & 0 & 0\\

838:         0 & 0.3 & 0.65 & 0.05\\

839:         0 & 0 & 0.25 & 0.75\\

840:         - & - & - & - \end{bmatrix}\\

841:         T_5 & = \begin{bmatrix} 0.6 & 0.4 & 0 & 0\\

842:         0 & 0.45 & 0.54 & 0.01\\

843:         0 & 0 & 0.45 & 0.55\\

844:         - & - & - & - \end{bmatrix} & &

845:         \end{alignat*}

846:

847:         The costs incurred at each state are $c(x_i,a_i) = C(x_i) + A(x_i,a_i)$, where \begin{equation*}C(x_i)=\begin{bmatrix} 300 & 150 & 100 & - \end{bmatrix}^{\textrm{T}} \end{equation*} represents a cost incurred for being at the current state and \begin{equation*}A(x_i,a_i) = \begin{bmatrix} -20 & -10 & 0 & 150 & 75\\ -40 & -20 & 0 & 150 & 75\\ -80 & -40 & 0 & 150 & 75\\ - & - & - & - & - \end{bmatrix}\end{equation*} the action cost associated with each action and state. We assume a discount factor $\alpha = 0.9$.

848:

849:         Using this setting, one can compute the policy that attains the $\alpha$-discount value function~\eqref{e:problem}. This turns out to be to import fish when in state $(1)$, to import fewer fish in state $(2)$, and do nothing at state $(3)$. Next, we search for the optimum policy, while using a rolling horizon control scheme, i.e., finding the policy that attains~\eqref{e:rhcf}. We solve the problem for horizon lengths between $1$ and $10$, in order to compare the results with the infinite horizon optimal policy.

850:

851:         \begin{figure}[h]

852:           \centering

853:           \includegraphics[width=0.49\textwidth]{avg_cost.pdf}

854:           \includegraphics[width=0.49\textwidth]{std_cost.pdf}

855:           \caption{Accumulated cost average and standard-deviation}

856:           \label{f:cost}

857:         \end{figure}

858:

859:         \begin{figure}[h]

860:           \centering

861:           \includegraphics[width=0.49\textwidth]{avg_hitting_time.pdf}

862:           \includegraphics[width=0.49\textwidth]{std_hitting_time.pdf}

863:           \caption{Hitting time average and standard-deviation}

864:           \label{f:time}

865:         \end{figure}

866:

867:         Figure~\ref{f:cost} shows the average and the standard-deviation of the accumulated costs over $2\times 10^5$ Monte Carlo runs, with the initial population level at state $1$. Similarly, Figure~\ref{f:time} shows the average and the standard-deviation of the time steps needed for the recovery into the target state $4$. The results suggest that for the rolling horizon policy to match the optimal infinite horizon one, a horizon length of at least $8$ should be used. Smaller horizons provide sub-optimal policies (with respect to the infinite horizon one), with the sub-optimality gap reducing as the horizon length increases. Note that the case of $N = 1$ is not included in the data; this is because for horizon length of $1$ the optimal policy is to harvest while the system is at state $1$, leading to an $\infty$ cost and recovery time, which does not allow the system to ever recover to state $4$.

868:

869:

870: 	\section{Future Work}

871: 	\label{s:concl}

872: 		We established in~\secref{s:EDC} that the optimal value function $V^\star$ is the minimal solution of the $\alpha$-discounted cost optimality equation~\eqref{e:alphadcoe}. However, obtaining an analytical expression of the optimal value function $V^\star$ is difficult, particularly due to the integration over a subset $X\setmin K$ of the state space. Obtaining good approximations of $V^\star$ is of vital importance, and will be reported in subsequent articles.

873:

874: 		It is interesting to note that our basic framework of stochastic model-predictive control (described in~\secref{s:intro}) naturally leads to a partitioning of the state-space with different dynamics in each partition; thus, the controlled system may be viewed as a stochastic hybrid system. One of the basic questions in this context is that of stability of the controlled system, and in view of the fact that in general there will be infinitely many excursions of the state outside the safe set, establishing any stability property is a challenging task. Classical Lyapunov-based methods are difficult to apply directly precisely because of the infinitely many state-dependent switches between multiple regimes, each with different dynamics. However, excursion-theory of Markov processes~\cite{ref:blumenthal1992} enables us to establish certain stability properties of quite general stochastic hybrid systems with state-dependent switching; some of these results are reported in~\cite{ref:palExcur}.

875:

876: 	\section*{Acknowledgments}

877: 		The authors are grateful to Vivek S. Borkar, On\'esimo Hern\'andez-Lerma, and Sean P. Meyn for illuminating discussions and pointers to relevant literature. They also thank the anonymous reviewers for their helpful comments.

878:

879: %\bibliographystyle{siam}

880: %\bibliography{../references}

881:

882: 	\begin{appendix}

883: 	\section{Proof of Theorem \ref{t:rh}}

884: 		\begin{proof}[Proof of Theorem {\rm \ref{t:rh}}]

885: 			For brevity of notation in this proof, we let $\wh\pi \Let \wh f_N^\infty$, and let $\wh\pi_{i:j}$ denote the (ordered) elements of the policy $\wh\pi$ from stage $i$ through $j$ for $j > i$. The first inequality in~\eqref{e:keyrh} is trivial because $v_{N+1}(x) \le V^\star(x) \le V\bigl(\wh f_N^\infty, x\bigr)$ for all $x\in X\setmin K$. Before the proof of the second inequality in~\eqref{e:keyrh}, let us fix some notation. Pick $N\in\Nz$. For $n\in\Nz$, a policy $\pi_{n:n+N}$ for stages $n$ through $n+N$, and $i\in\{n, \ldots, n+N\}$, let $\pi_{n:n+N}(i)$ denote the $i$-th element of the policy $\pi_{n:n+N}$. Also, let $Q\bigl(\cdot\big| x, \pi_{n:n+N}\bigr)$ denote the sub-stochastic kernel\footnote{Recall that $Q(\cdot|\cdot)$ is a \emph{sub-stochastic kernel} on $X$ given $Y$ if $Q(B|\cdot)$ is a measurable function on $Y$ for each $B\in\Borelsigalg{X}$, and $Q(\cdot|y)$ is a measure on $X$ with $Q(X|y) \le 1$ for each $y\in Y$.} defined for $x\in X\setmin K$ by

886: 			\begin{align*}

887: 				Q\bigl(B\big|x, \pi_{n:n+N}\bigr) \Let &{} \int_{X\setmin K}Q\bigl(\mrm d\xi_0\big|x, \pi_{n:n+N}(n)\bigr)\cdots\int_{X\setmin K}Q\bigl(\mrm d\xi_N\big|\xi_{N-1}, \pi_{n:n+N}(n+N)\bigr)\indic{B}(\xi_N)

888: 			\end{align*}

889: 			for $B\in\Borelsigalg{X\setmin K}$.

890:

891: 			Let $\pi^\star_{n:n+N}$ be an optimal policy for stages $n$ through $n+N$, i.e., let $\pi^\star_{n:n+N}$ attain the infimum in~\eqref{e:rhcf}. Fix $x\in X\setmin K$. Let $\zeta_{n+1:n+N+1}$ be an $(N+1)$-period policy starting from stage $n+1$, such that its first $N$ elements are identical to the last $N$ elements of $\pi^\star_{n:n+N}$, i.e., $\zeta_{n+1:n+N+1}(j) = \pi^\star_{n:n+N}(j)$ for $j=n+1, \ldots, n+N$. By optimality of $\pi^\star_{n+1:n+N+1}$ we have

892: 			\begin{multline*}

893: 				\mathsf E^{\zeta_{n+1:n+N+1}}_x\!\left[\sum_{i=n+1}^{n+N+1} \alpha^i c(x_i, a_i)\indic{\{i< \tau\}}\left.\vphantom{\sum_{i=n+1}^{n+N+1}}\right|x_{(n+1)\mn(\tau-1)}\right]\\

894: 				\ge \mathsf E^{\pi^\star_{n+1:n+N+1}}_x\!\left[\sum_{i=n+1}^{n+N+1} \alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_{i=n+1}^{n+N+1}}\right|x_{(n+1)\mn(\tau-1)}\right].

895: 			\end{multline*}

896: 			Since $\wh\pi_{n:n+N}(n) = \pi^\star_{n:n+N}(n)$ by construction, conditional on $x_{n\mn(\tau-1)} = x'\in X\setmin K$,

897: 			\begin{multline}

898: 				\label{e:keyineq}

899: 				\int_{X\setmin K}Q\bigl(\mrm dy\big|x', \pi^\star_{n:n+N}(n)\bigr)\mathsf E^{\zeta_{n+1:n+N+1}}\!\left[\sum_{i=n+1}^{n+N+1} \alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_{i=n+1}^{n+N+1}}\right|x_{(n+1)\mn(\tau-1)} = y\right]\ge\\

900: 				\int_{X\setmin K}Q\bigl(\mrm dy\big|x', \wh\pi_{n:n+N}(n)\bigr)\mathsf E^{\pi^\star_{n+1:n+N+1}}_x\!\left[\sum_{i=n+1}^{n+N+1} \alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_{i=n+1}^{n+N+1}}\right|x_{(n+1)\mn(\tau-1)} = y\right].

901: 			\end{multline}

902: 			By definition of $\zeta$ we have

903: 			\begin{align*}

904: 				\mathsf E^{\zeta_{n+1:n+N+1}}\!& \left[\sum_{i=n+1}^{n+N+1} \alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_{i=n+1}^{n+N+1}}\right|x_{(n+1)\mn(\tau-1)}\right]\\

905: 				& = \mathsf E^{\zeta_{n+1:n+N+1}}\!\left[\sum_{i=n+1}^{n+N} \alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_{i=n+1}^{n+N+1}}\right|x_{(n+1)\mn(\tau-1)}\right] \\

906: 				& \qquad+ \mathsf E^{\zeta_{n+1:n+N+1}}\!\left[\alpha^{n+N+1} c(x_{n+N+1}, a_{n+N+1})\indic{\{n+N+1 < \tau\}}\big|x_{(n+1)\mn(\tau-1)}\right],

907: 			\end{align*}

908: 			and the right-hand side equals%letting $\theta_{n+1:n+N}$ be an $N$-period policy starting from stage $n+1$ defined as $\theta_{n+1:n+N}(j) \Let \pi^\star_{n:n+N}(j)$ for $j=n+1, \ldots, n+N$, we rewrite the right-hand side as

909: 			\begin{multline*}

910: 				\mathsf E^{\pi^\star_{n:n+N}}\!\left[\sum_{i=n+1}^{n+N} \alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_{i=n+1}^{n+N+1}}\right|x_{(n+1)\mn(\tau-1)}\right] \\

911: 				+ \mathsf E^{\zeta_{n+1:n+N+1}}\!\left[\alpha^{n+N+1} c(x_{n+N+1}, a_{n+N+1})\indic{\{n+N+1 < \tau\}}\big|x_{(n+1)\mn(\tau-1)}\right].

912: 			\end{multline*}

913: 			In conjunction with~\eqref{e:keyineq} and conditional on $x_{n\mn(\tau-1)} = x'\in X\setmin K$, we have

914: 			\begin{multline*}

915: 				\int_{X\setmin K}Q\bigl(\mrm dy\big|x', \pi^\star_{n:n+N}(n)\bigr)\mathsf E^{\pi^\star_{n:n+N}}\!\left[\sum_{i=n+1}^{n+N} \alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_{i}^N}\right|x_{(n+1)\mn(\tau-1)}=y\right]\\

916: 					+ \int_{X\setmin K} Q\bigl(\mrm dy\big|x', \pi^\star_{n:n+N}(n)\bigr)\mathsf E^{\zeta_{n+1:n+N+1}}\bigl[\alpha^{n+N+1} c(x_{n+N+1}, a_{n+N+1})\indic{\{n+N+1 < \tau\}}\big|x_{(n+1)\mn(\tau-1)}=y\bigr]\\

917: 					\ge \int_{X\setmin K} Q\bigl(\mrm dy\big|x', \wh\pi_{n:n+N}(n)\bigr)\mathsf E^{\pi^\star_{n+1:n+N+1}}\!\left[\sum_{i=n+1}^{n+N+1}\alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_i^N}\right|x_{(n+1)\mn(\tau-1)}=y\right].

918: 			\end{multline*}

919: 			To wit, conditional on $x_{n\mn(\tau-1)} = x'\in X\setmin K$,

920: 			\begin{multline*}

921: 				\mathsf E^{\pi^\star_{n:n+N}}\!\left[\sum_{i=n}^{n+N} \alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_{i}^N}\right|x_{n\mn(\tau-1)} = x'\right] - \mathsf E^{\pi^\star_{n:n+N}}\!\left[\alpha^n c(x_n, a_n)\indic{\{n < \tau\}}\big|x_{n\mn(\tau-1)} = x'\right]\\

922: 					+ \alpha^{n+N+1} \int_{X\setmin K} \!\!Q\bigl(\mrm dy\big|x', \pi^\star_{n:n+N}(n)\bigr)\mathsf E^{\zeta_{n+1:n+N+1}}\bigl[c(x_{n+N+1}, a_{n+N+1})\indic{\{n+N+1 < \tau\}}\big|x_{(n+1)\mn(\tau-1)}=y\bigr]\\

923: 					\ge \int_{X\setmin K} Q\bigl(\mrm dy\big|x', \wh\pi_{n:n+N}(n)\bigr)\mathsf E^{\pi^\star_{n+1:n+N+1}}\!\left[\sum_{i=n+1}^{n+N+1}\alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_i^N}\right|x_{(n+1)\mn(\tau-1)}=y\right].

924: 			\end{multline*}

925: 			Let $\zeta_{n+1:n+N+1}(n+N+1)(\cdot)$ be a selector that attains the minimal value of

926: 			\[

927: 				\alpha^{n+N+1}\int_{X\setmin K}Q\bigl(\mrm dy\big|x', \pi^\star_{n:n+N}(n)\bigr)\mathsf E^{\zeta_{n+1:n+N+1}}\bigl[c(x_{n+N+1}, a_{n+N+1})\indic{\{n+N+1 < \tau\}}\big|x_{(n+1)\mn(\tau-1)}=y\bigr]

928: 			\]

929: 			whenever $x'\in X\setmin K$, and let the corresponding minimal value be denoted by $e_n(x')$; clearly $e_n$ is well-defined on $X\setmin K$, and is a measurable function of $x'$. With this notation, the last inequality becomes

930: 			\begin{multline*}

931: 				\mathsf E^{\pi^\star_{n:n+N}}\!\left[\sum_{i=n}^{n+N}\alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_i^N}\right|x_{n\mn(\tau-1)} = x'\right] - \mathsf E^{\pi^\star_{n:n+N}}\bigl[\alpha^n c(x_n, a_n)\indic{\{n < \tau\}}\big|x_{n\mn(\tau-1)} = x'\bigr]\\

932: 				+ e_n(x') \ge \int_{X\setmin K} Q\bigl(\mrm dy\big|x', \wh\pi_{n:n+N}(n)\bigr) \mathsf E^{\pi^\star_{n+1:n+N+1}}\!\left[\sum_{i=n+1}^{n+N+1}\alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_i^N}\right|x_{(n+1)\mn(\tau-1)}=y\right]

933: 			\end{multline*}

934: 			whenever $x'\in X\setmin K$. Therefore,

935: 			\begin{multline*}

936: 				\int_{X\setmin K}Q\bigl(\mrm dy\big|x, \wh\pi_{0:n-1}\bigr) \mathsf E^{\pi^\star_{n:n+N}}\!\left[\sum_{i=n}^{n+N}\alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_i^N}\right|x_{n\mn(\tau-1)}=y\right]\\

937: 					- \int_{X\setmin K}Q\bigl(\mrm dy\big|x, \wh\pi_{0:n-1}\bigr) \mathsf E^{\pi^\star_{n:n+N}}\bigl[\alpha^n c(x_n, a_n)\indic{\{n < \tau\}}\big|x_{n\mn(\tau-1)}\bigr] + \int_{X\setmin K}Q\bigl(\mrm dy\big|x, \wh\pi_{0:n-1}\bigr) e_n(y)\\

938: 					\ge \int_{X\setmin K}Q\bigl(\mrm dy\big|x, \wh\pi_{0:n}\bigr) \mathsf E^{\pi^\star_{n+1:n+N+1}}\!\left[\sum_{i=n+1}^{n+N+1}\alpha^i c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_i^N}\right|x_{(n+1)\mn(\tau-1)}=y\right].

939: 			\end{multline*}

940: 			Rearranging and summing over $n$ we arrive at

941: 			\begin{multline}

942: 			\label{e:ineq1}

943: 				\sum_{n=0}^\infty \int_{X\setmin K}Q\bigl(\mrm dy\big|x, \wh\pi_{0:n-1}\bigr) \mathsf E^{\pi^\star_{n:n+N}}\!\left[\alpha^n c(x_n, a_n)\indic{\{n < \tau\}}\left.\vphantom{\sum}\right|x_{n\mn(\tau-1)}=y\right]\\

944: 				\le \sum_{n=0}^\infty\left(\alpha^n\int_{X\setmin K}Q\bigl(\mrm dy\big|x, \wh\pi_{0:n-1}\bigr)\mathsf E^{\pi^\star_{n:n+N}}\!\left[\sum_{i=n}^{n+N}\alpha^{i-n} c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_i^N}\right| x_{n\mn(\tau-1)}=y\right]\right.\\

945: 				- \left.\alpha^{n+1}\int_{X\setmin K} Q\bigl(\mrm dy\big|x, \wh\pi_{0:n}\bigr)\mathsf E^{\pi^\star_{n+1:n+N+1}}\!\left[\sum_{i=n+1}^{n+N+1}\alpha^{i-n-1} c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_i^N}\right| x_{(n+1)\mn(\tau-1)}=y\right]\right)\\

946: 				+ \sum_{n=0}^\infty \int_{X\setmin K} Q\bigl(\mrm dy\big|x, \wh\pi_{0:n-1}\bigr) e_n(y).

947: 			\end{multline}

948: 			In~\eqref{e:ineq1} we have employed the notation $\int_{X\setmin K}Q\bigl(\mrm dy|x, \pi_{0:-1}\bigr) g(y) \Let g(x)$ for any policy $\pi$. We observe that the left-hand side of~\eqref{e:ineq1} is just $\mathsf E^{\wh\pi}_x\!\left[\sum_{i=0}^{\tau-1} \alpha^i c(x_i, a_i)\right]$. By Assumption~\ref{a:further}(i),

949: 			\[

950: 				\mathsf E^{\pi^\star_{n:n+N}}\!\left[\sum_{i=n}^{n+N}\alpha^{i-n} c(x_i, a_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_i^N}\right| x_{n\mn(\tau-1)}=y\right] \le \ol c\mathsf E^{\pi^\star_{n:n+N}}\!\left[\sum_{i=n}^{n+N}\alpha^{i-n} w(x_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_i^N}\right| x_{n\mn(\tau-1)}=y\right],

951: 			\]

952: 			and by Assumption~\ref{a:further}(ii),

953: 			\[

954: 				\mathsf E^{\pi^\star_{n:n+N}}\!\left[\sum_{i=n}^{n+N}\alpha^{i-n} w(x_i)\indic{\{i < \tau\}}\left.\vphantom{\sum_i^N}\right| x_{n\mn(\tau-1)}=y\right] \le w(y)\sum_{i=n}^{n+N}\gamma^{i-n}.

955: 			\]

956: 			We notice that since $c\ge 0$, the first series on the right-hand side of~\eqref{e:ineq1} is at most

957: 			\begin{equation}

958: 			\label{e:ineq2}

959: 			\begin{aligned}

960: 				\ol c \sum_{n=0}^\infty \alpha^n\sum_{i=n}^{n+N}\gamma^{i-n} \int_{X\setmin K}Q\bigl(\mrm dy\big|x, \wh\pi_{0:n-1}\bigr) w(y).

961: 			\end{aligned}

962: 			\end{equation}

963: 			For a fixed $n\in\Nz$, the quantity $\int_{X\setmin K}Q\bigl(\mrm dy|x, \wh\pi_{0:n}\bigr)w(y)$ is at most $\beta^{n+1} w(x)$ in view of Assumption~\ref{a:further}(ii) and the definition of the sub-stochastic kernel $Q\bigl(\cdot\big|x, \pi_{n:n+N}\bigr)$ at the beginning of this proof. Therefore,

964: 			\begin{align*}

965: 				\sum_{n=0}^\infty & \alpha^n\sum_{i=n}^{n+N}\gamma^{i-n} \int_{X\setmin K}Q\bigl(\mrm dy\big|x, \wh\pi_{0:n-1}\bigr) w(y) \le \sum_{n=0}^\infty\alpha^n\sum_{i=n}^{n+N}\gamma^{i-n}\beta^n w(x)\\

966: 				& = w(x) \left(\frac{1-\gamma^{N+1}}{1-\gamma}\right)\sum_{n=0}^\infty (\alpha\beta)^n < \infty.

967: 			\end{align*}

968: 			This shows that the series in~\eqref{e:ineq2} is summable. Hence, the cancellations of the telescoping terms in the first series on the right-hand side of~\eqref{e:ineq1} are justified. The inequality in~\eqref{e:ineq1} now simplifies to

969: 			\begin{multline}

970: 			\label{e:ineq3}

971: 				\mathsf E^{\wh\pi}_x\!\left[\sum_{i=0}^{\tau-1} \alpha^i c(x_i, a_i)\right] \le \mathsf E^{\pi^\star_{0:N}}_x\!\left[\sum_{i=0}^{(N+1)\mn(\tau-1)}\!\!\alpha^{i} c(x_i, a_i)\right] + \sum_{n=0}^\infty \int_{X\setmin K} Q\bigl(\mrm dy\big|x, \wh\pi_{0:n-1}\bigr) e_n(y).

972: 			\end{multline}

973: 			By Assumption~\ref{a:further}(ii) and the definition of $e_n$, conditional on $x_{n\mn\tau} = x'\in X\setmin K$,

974: 			\begin{align*}

975: 				& e_n(x')\\

976: 				& \le \alpha^{n+N+1}\int_{X\setmin K} \!\!Q\bigl(\mrm dy\big| x', \pi^\star_{n:n+N}(n)\bigr)\mathsf E^{\zeta_{n+1:n+N+1}}\bigl[c(x_{n+N+1}, a_{n+N+1})\indic{\{n+N+1 < \tau\}}\big|x_{(n+1)\mn(\tau-1)} = y\bigr]\\

977: 				& \le \ol c w(x')\alpha^n \gamma^{N+1}.

978: 			\end{align*}

979: 			Substituting the last inequality in~\eqref{e:ineq3} we arrive at

980: 			\[

981: 				\mathsf E^{\wh\pi}_x\!\left[\sum_{i=0}^{\tau-1} \alpha^i c(x_i, a_i)\right] \le \mathsf E^{\pi^\star_{0:N}}_x\!\left[\sum_{i=0}^{(N+1)\mn(\tau-1)}\!\!\alpha^{i} c(x_i, a_i)\right] + \frac{\ol c\gamma^{N+1}}{1-\gamma}w(x),

982: 			\]

983: 			which is the second bound in~\eqref{e:keyrh}. The inequality~\eqref{e:sloppyrh} follows immediately from the fact that $V^\star \ge v_n$ for every $n\in\N$.

984: 		\end{proof}

985: 	\end{appendix}

986:

987: \def\cprime{$'$}

988: \begin{thebibliography}{10}

989:

990: \bibitem{ref:accl08}

991: {\sc M.~Agarwal, E.~Cinquemani, D.~Chatterjee, and J.~Lygeros}, {\em On

992:   convexity of stochastic optimization problems with constraints}.

993: \newblock To be presented at the ECC 2009, 2008.

994:

995: \bibitem{ref:batinaPhDthesis}

996: {\sc I.~Batina}, {\em Model predictive control for stochastic systems by

997:   randomized algorithms}, PhD thesis, Technische Universiteit Eindhoven, 2004.

998:

999: \bibitem{ref:aldenRH}

1000: {\sc J.~M. Alden and R.~L. Smith}, {\em Rolling horizon procedures in

1001:   nonhomogeneous {M}arkov decision processes}, Operations Research, 40 (1992),

1002:   pp.~S183--S194.

1003:

1004: \bibitem{ref:bemporad1999rmp}

1005: {\sc A.~Bemporad and M.~Morari}, {\em {Robust model predictive control: a

1006:   survey}}, Robustness in Identification and Control, 245 (1999), pp.~207--226.

1007:

1008: \bibitem{ref:bertsekasNDP}

1009: {\sc D.~Bertsekas and J.~Tsitsiklis}, {\em Neuro-{D}ynamic {P}rogramming},

1010:   Athena Scientific, 1996.

1011:

1012: \bibitem{ref:bertsekasDP2}

1013: {\sc D.~P. Bertsekas}, {\em Dynamic {P}rogramming and {O}ptimal {C}ontrol},

1014:   vol.~2, Athena Scientific, 3~ed., 2007.

1015:

1016: \bibitem{ref:bertsimas2007}

1017: {\sc D.~Bertsimas and D.~B. Brown}, {\em Constrained stochastic {LQC}: a

1018:   tractable approach}, IEEE Transactions on Automatic Control, 52 (2007),

1019:   pp.~1826--1841.

1020:

1021: \bibitem{ref:blanchini1999sic}

1022: {\sc F.~Blanchini}, {\em {Set invariance in control}}, Automatica, 35 (1999),

1023:   pp.~1747--1767.

1024:

1025: \bibitem{ref:blumenthal1992}

1026: {\sc R.~M. Blumenthal}, {\em Excursions of {M}arkov processes}, Probability and

1027:   its Applications, Birkh\"auser Boston Inc., Boston, MA, 1992.

1028:

1029: \bibitem{ref:boda04}

1030: {\sc K.~Boda, J.~A. Filar, Y.~Lin, and L.~Spanjers}, {\em Stochastic target

1031:   hitting time and the problem of early retirement}, IEEE Transactions on

1032:   Automatic Control, 49 (2004), pp.~409--419.

1033:

1034: \bibitem{ref:borkarConvexAnalyticApproach}

1035: {\sc V.~S. Borkar}, {\em A convex analytic approach to {M}arkov decision

1036:   processes}, Probability Theory and Related Fields, 78 (1988), pp.~583--602.

1037:

1038: \bibitem{ref:borkarTopicsControlledMC}

1039: \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em Topics in

1040:   {C}ontrolled {M}arkov {C}hains}, vol.~240 of Pitman Research Notes in

1041:   Mathematics Series, Longman Scientific \& Technical, Harlow, 1991.

1042:

1043: \bibitem{ref:palExcur}

1044: {\sc D.~Chatterjee and S.~Pal}, {\em An excursion-theoretic approach to

1045:   stability of stochastic hybrid systems}.

1046: \newblock \url{http://arxiv.org/abs/0901.2269}, 2008.

1047:

1048: \bibitem{ref:kouvaritakissMPCIneqconstraints}

1049: {\sc P.~D. Couchman, M.~Cannon, and B.~Kouvaritakis}, {\em Stochastic {MPC}

1050:   with inequality stability constraints}, Automatica J. IFAC, 42 (2006),

1051:   pp.~2169--2174.

1052:

1053: \bibitem{ref:dermanMDP}

1054: {\sc C.~Derman}, {\em Finite {S}tate {M}arkovian {D}ecision {P}rocesses},

1055:   vol.~67 of Mathematics in Science and Engineering, Academic Press, New York,

1056:   1970.

1057:

1058: \bibitem{ref:eatonzadeh62}

1059: {\sc J.~H. Eaton and L.~A. Zadeh}, {\em Optimal pursuit strategies in

1060:   discrete-state probabilistic systems}, Transactions of the ASME Ser. D. J.

1061:   Basic Engineering, 84 (1962), pp.~23--29.

1062:

1063: \bibitem{ref:foss04}

1064: {\sc S.~Foss and T.~Konstantopoulos}, {\em An overview of some stochastic

1065:   stability methods}, Journal of the Operations Research Society of Japan, 47

1066:   (2004), pp.~275--303.

1067:

1068: \bibitem{hastings1989introduction}

1069: {\sc K.~Hastings}, {\em Introduction to the {M}athematics of

1070: {O}perations {R}esearch}, Pure and Applied Mathematics, 128, 1989.

1071:

1072: \bibitem{ref:hernandez-lerma1}

1073: {\sc O.~Hern{\'a}ndez-Lerma and J.~B. Lasserre}, {\em Discrete-{T}ime {M}arkov

1074:   {C}ontrol {P}rocesses: {B}asic {O}ptimality {C}riteria}, vol.~30 of

1075:   Applications of Mathematics, Springer-Verlag, New York, 1996.

1076:

1077: \bibitem{ref:hernandez-lerma2}

1078: \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em Further {T}opics on

1079:   {D}iscrete-{T}ime {M}arkov {C}ontrol {P}rocesses}, vol.~42 of Applications of

1080:   Mathematics, Springer-Verlag, New York, 1999.

1081:

1082: \bibitem{ref:hindererAbsorbingSet}

1083: {\sc K.~Hinderer and K.-H. Waldmann}, {\em Algorithms for countable state

1084:   {M}arkov decision models with an absorbing set}, SIAM Journal on Control and

1085:   Optimization, 43 (2005), pp.~2109--2131 (electronic).

1086:

1087: \bibitem{ref:smpcbnddu}

1088: {\sc D.~Chatterjee, P.~Hokayem, and J.~Lygeros}, {\em Stochastic model predictive control

1089:   with bounded control inputs: a vector space approach}.

1090: \newblock \url{http://arxiv.org/abs/0903.5444}, 2009.

1091:

1092: \bibitem{ref:hokayemcdc09}

1093: {\sc P.~Hokayem, D.~Chatterjee, and J.~Lygeros}, {\em On stochastic model

1094:   predictive control with bounded control inputs}.

1095: \newblock \url{http://arxiv.org/abs/0902.3944}, 2009.

1096:

1097: \bibitem{ref:kestenMCP}

1098: {\sc H.~Kesten and F.~Spitzer}, {\em Controlled {M}arkov chains}, Annals of

1099:   Probability, 3 (1975), pp.~32--40.

1100:

1101: \bibitem{ref:kushnerIntroStochControl}

1102: {\sc H.~Kushner}, {\em Introduction to {S}tochastic {C}ontrol}, Holt, Rinehart

1103:   and Winston, Inc., New York, 1971.

1104:

1105: \bibitem{ref:maciejowskibk}

1106: {\sc J.~M. Maciejowski}, {\em Predictive Control with Constraints}, Prentice

1107:   Hall, 2001.

1108:

1109: \bibitem{ref:meynCTCN}

1110: {\sc S.~P. Meyn}, {\em Control {T}echniques for {C}omplex {N}etworks},

1111:   Cambridge University Press, Cambridge, 2008.

1112:

1113: \bibitem{ref:powellADP}

1114: {\sc W.~B. Powell}, {\em Approximate {D}ynamic {P}rogramming}, Wiley Series in

1115:   Probability and Statistics, Wiley-Interscience [John Wiley \& Sons], Hoboken,

1116:   NJ, 2007.

1117:

1118: \bibitem{ref:PrimbsSung09}

1119: {\sc J.~A. Primbs and C.~H. Sung}, {\em Stochastic receding horizon control of

1120:   constrained linear systems with state and control multiplicative noise}, IEEE

1121:   Transactions on Automatic Control, 54 (2009), pp.~221--230.

1122:

1123: \bibitem{ref:raoProbTheo}

1124: {\sc M.~M. Rao and R.~J. Swift}, {\em Probability {T}heory with

1125:   {A}pplications}, vol.~582 of Mathematics and Its Applications,

1126:   Springer-Verlag, 2~ed., 2006.

1127:

1128: \bibitem{ref:riederselectors}

1129: {\sc U.~Rieder}, {\em Measurable selection theorems for optimization problems},

1130:   Manuscripta Mathematica, 24 (1978), pp.~115--131.

1131:

1132: \bibitem{ref:schmidliInsurance}

1133: {\sc H.~Schmidli}, {\em Stochastic {C}ontrol in {I}nsurance}, Probability and

1134:   its Applications, Springer-Verlag London Ltd., London, 2008.

1135:

1136: \bibitem{ref:vanHessem2006}

1137: {\sc D.~H. van Hessem and O.~H. Bosgra}, {\em Stochastic closed-loop model

1138:   predictive control of continuous nonlinear chemical processes}, Journal of

1139:   Process Control, 16 (2006), pp.~225--241.

1140:

1141: \bibitem{ref:whittleOptimization}

1142: {\sc P.~Whittle}, {\em Optimization {O}ver {T}ime. {V}ol. {II}}, vol.~2 of

1143:   Wiley Series in Probability and Mathematical Statistics: Applied Probability

1144:   and Statistics, John Wiley \& Sons Ltd., Chichester, 1983.

1145:

1146: \end{thebibliography}

1147:

1148: %\bibliographystyle{siam}

1149: %\bibliography{../references}

1150:

1151: \end{document}

1152:

1153: