diff --git a/tex/biblio.bib b/tex/biblio.bib index 8058fb4..624230e 100644 --- a/tex/biblio.bib +++ b/tex/biblio.bib @@ -196,7 +196,7 @@ @article{gould2001solution publisher={SIAM} } -@software{jax2018github, +@misc{jax2018github, author = {James Bradbury and Roy Frostig and Peter Hawkins and Matthew James Johnson and Chris Leary and Dougal Maclaurin and George Necula and Adam Paszke and Jake Vander{P}las and Skye Wanderman-{M}ilne and Qiao Zhang}, title = {{JAX}: composable transformations of {P}ython+{N}um{P}y programs}, url = {http://github.com/google/jax}, @@ -311,7 +311,7 @@ @article{shin2021graph } @article{montoison2023krylov, - title={Krylov. jl: A Julia basket of hand-picked Krylov methods}, + title={Krylov. jl: A Julia basket of hand-picked {K}rylov methods}, author={Montoison, Alexis and Orban, Dominique}, journal={Journal of Open Source Software}, volume={8}, @@ -378,7 +378,7 @@ @inproceedings{bischof1991exploiting year={1991} } -@software{Montoison_CUDSS, +@misc{Montoison_CUDSS, author = {Montoison, Alexis}, license = {MIT}, title = {{CUDSS.jl: Julia interface for NVIDIA cuDSS}}, @@ -397,17 +397,6 @@ @article{hestenes-stiefel-1952 doi = {10.6028/jres.049.044} } -@article{cao-seth-laird-2016, - title = {An augmented Lagrangian interior-point approach for large-scale NLP problems on graphics processing units}, - author = {Cao, Yankai and Seth, Arpan and Laird, Carl D}, - journal = {Computers \& Chemical Engineering}, - volume = {85}, - pages = {76--83}, - year = {2016}, - publisher = {Elsevier}, - doi = {10.1016/j.compchemeng.2015.10.010} -} - @article{gondzio-2012, title = {Interior point methods 25 years later}, author = {Gondzio, Jacek}, @@ -420,14 +409,6 @@ @article{gondzio-2012 doi = {10.1016/j.ejor.2011.09.017} } -@inproceedings{openblas, - title = {{AUGEM: automatically generate high performance dense linear algebra kernels on x86 CPUs}}, - author = {Wang, Qian and Zhang, Xianyi and Zhang, Yunquan and Yi, Qing}, - booktitle = {Proceedings of the international conference on high performance computing, networking, storage and analysis}, - pages = {1--12}, - year = {2013} -} - @misc{montoison-orban-hsl-2021, author = {A. Montoison and D. Orban and {contributors}}, title = {{HSL.jl}: A {J}ulia interface to the {HSL} Mathematical Software Library}, @@ -438,7 +419,7 @@ @misc{montoison-orban-hsl-2021 } @article{chen-davis-hager-rajamanickam-2008, - title = {Algorithm 887: CHOLMOD, supernodal sparse Cholesky factorization and update/downdate}, + title = {Algorithm 887: {CHOLMOD}, supernodal sparse Cholesky factorization and update/downdate}, author = {Chen, Yanqing and Davis, Timothy A and Hager, William W and Rajamanickam, Sivasankaran}, journal = {ACM Transactions on Mathematical Software (TOMS)}, volume = {35}, @@ -547,3 +528,10 @@ @techreport{montoison-orban-saunders-2023 type = {{Les Cahiers du GERAD}}, institution = {Groupe d’études et de recherche en analyse des décisions} } + +@article{stewart1990matrix, + title={Matrix perturbation theory}, + author={Stewart, Gilbert W and Sun, Ji-guang}, + journal={(No Title)}, + year={1990} +} diff --git a/tex/main.tex b/tex/main.tex index b57ad17..d1cb783 100644 --- a/tex/main.tex +++ b/tex/main.tex @@ -35,7 +35,7 @@ \DeclareMathOperator{\inertia}{inertia} \newcommand{\ldlt}{$\mathrm{LDL^T}$\xspace} \newcommand{\lblt}{$\mathrm{LBL^T}$\xspace} -\newcommand{\llt}{$\mathrm{LL^T}$\xspace} +\newcommand{\llt}{\text{Cholesky}\xspace} \newcommand{\lu}{$\mathrm{LU}$\xspace} \newcommand{\CG}{\textsc{Cg}\xspace} \newcommand{\CR}{\textsc{Cr}\xspace} @@ -74,7 +74,6 @@ % \tableofcontents - \input{sections/introduction.tex} \input{sections/ipm.tex} \input{sections/kkt_systems.tex} @@ -94,15 +93,14 @@ \section{Conclusion} Looking ahead, our future plans involve enhancing the robustness of the two condensed KKT methods, particularly focusing on stabilizing convergence for small tolerances (below $10^{-8}$). It is worth noting that the sparse Cholesky solver can be further customized to meet the specific requirements of the interior-point method~\cite{wright1999modified}. % Add a sentence about NCL on GPU? -Enhancing the two methods on the GPU would enable the resolution of large-scale problems that are currently intractable on classical CPU architectures. -Examples include multiperiod and security-constrained OPF problems, particularly when combined with a Schur-complement approach. +Enhancing the two methods on the GPU would enable the resolution of large-scale problems that are currently intractable on classical CPU architectures such as multiperiod and security-constrained OPF problems. \small \bibliographystyle{spmpsci} \bibliography{biblio.bib} \normalsize \section{Acknowledgements} -This research used resources of the Argonne Leadership Computing Facility, a U.S. Department of Energy (DOE) Office of Science user facility at Argonne National Laboratory and is based on research supported by the U.S. DOE Office of Science-Advanced Scientific Computing Research Program, under Contract No. DE-AC02-06CH11357. +This research used resources of the Argonne Leadership Computing Facility, a U.S. Department of Energy (DOE) Office of Science user facility at Argonne National Laboratory and is based on research supported by the U.S. DOE Office of Science-Advanced Scientific Computing Research Program, under Contract No. DE-AC02-06CH11357. \end{document} %%% Local Variables: diff --git a/tex/sections/conditioning.tex b/tex/sections/conditioning.tex index 1c7104c..b15fcdc 100644 --- a/tex/sections/conditioning.tex +++ b/tex/sections/conditioning.tex @@ -1,7 +1,7 @@ \section{Conditioning of the condensed KKT system} -The condensed matrix $K$ appearing in \eqref{eq:kkt:condensed} is known to be increasingly ill-conditioned as the primal-dual iterates approach to the local solution with active inequalities. +The condensed matrix $K$ appearing in \eqref{eq:kkt:condensed} is known to be increasingly ill-conditioned as the primal-dual iterates approach to a local solution with active inequalities. This behavior is amplified for the matrices $K_\gamma$ and $K_\tau$, -as the implementation of these methods requires the use of +as HyKKT and LiftedKKT require the use of respectively large $\gamma$ and small $\tau$. In this section, we analyze the numerical error associated with the solution of the condensed KKT system and discuss how the structured ill-conditioning makes the @@ -43,9 +43,11 @@ \section{Conditioning of the condensed KKT system} \subsection{Centrality conditions} We start the discussion by recalling important results porting on the iterates of the interior-point algorithm. -Let $p = (x, s, y, z)$ the current primal-dual iterate (the multiplier $v$ is considered apart), +Let $p = (x, s, y, z)$ the current primal-dual iterate (the bound multiplier $v$ is considered apart), and $p^\star$ a solution of the KKT conditions~\eqref{eq:kktconditions}. We suppose that the solution $p^\star$ is regular and satisfies the following assumptions. +We denote $\cactive = \cactive(x^\star)$ the active-set at the optimal solution $x^\star$, +and $\cinactive = \cinactive(x^\star)$ the inactive set. % Sungho: not sure if this should be introduced here. \begin{assumption} @@ -55,9 +57,9 @@ \subsection{Centrality conditions} \begin{itemize} \item Continuity: The Hessian $\nabla^2_{x x} L(\cdot)$ is Lipschitz continuous near $p^\star$; - \item Linear Independence Constraint Qualification: the active Jacobian $A(x^\star)$ is full row-rank; - \item Strict Complementarity: for every $i \in \mathcal{B}(x^\star)$, $z_i^\star > 0$. - \item Second-order sufficiency: for every $h \in \nullspace\!\big(A(x^\star)\big)$, + \item Linear Independence Constraint Qualification (LICQ): the active Jacobian $A(x^\star)$ is full row-rank; + \item Strict Complementarity (SCS): for every $i \in \mathcal{B}(x^\star)$, $z_i^\star > 0$. + \item Second-order sufficiency (SOSC): for every $h \in \nullspace\!\big(A(x^\star)\big)$, $h^\top \nabla_{x x}^2 L(p^\star)h > 0$. \end{itemize} \end{assumption} @@ -79,7 +81,7 @@ \subsection{Centrality conditions} where $m_i$ is the number of inequality constraints. The duality measure encodes the current satisfaction of the complementarity constraints. For a solution $(p^\star, v^\star)$, we have $\Xi(s^\star, v^\star) = 0$. -The duality measure is sometimes used to define the barrier parameter in IPM. +The duality measure can be used to define the barrier parameter in IPM. We suppose the iterates $(p, v)$ satisfy the \emph{centrality conditions} \begin{subequations} @@ -90,12 +92,11 @@ \subsection{Centrality conditions} & (s, v) > 0 \;,\quad s_i v_i \geq \alpha \, \Xi(s, v) \quad \forall i =1, \cdots, m_i \; , \end{align} \end{subequations} -for some constant $C > 0$ and $\alpha \in (0, 1)$. +for some constants $C > 0$ and $\alpha \in (0, 1)$. Conditions~\eqref{eq:centralitycond:complement} ensures that the products $s_i v_i$ are not too disparate in the diagonal term $D_s$. -This condition is satisfied in the solver Ipopt (see \cite[Equation (16)]{wachter2006implementation}). -We denote $\cactive = \cactive(x^\star)$ the active-set at the optimal solution $x^\star$, -and $\cinactive = \cinactive(x^\star)$ the inactive set. +This condition is satisfied (despite rather loosely) +in the solver Ipopt (see \cite[Equation (16)]{wachter2006implementation}). \begin{proposition}[\cite{wright2001effects}, Lemma 3.2 and Theorem 3.3] \label{prop:cond:boundslack} @@ -107,7 +108,7 @@ \subsection{Centrality conditions} i \in \mathcal{B} \implies s_i = \Theta(\Xi) \, , \quad v_i = \Theta(1) \;, \\ i \in \mathcal{N} \implies s_i = \Theta(1) \, , \quad v_i = \Theta(\Xi) \; . \end{align} - and the distance to the solution $\delta(p, v)$ is bounded by the duality measure $\Xi$. + and the distance to the solution $\delta(p, v)$ is bounded by the duality measure $\Xi$: \begin{equation} \delta(p, v) = O(\Xi) \; . \end{equation} @@ -116,32 +117,33 @@ \subsection{Centrality conditions} \subsection{Structured ill-conditioning} The following subsection looks at the structure -of the condensed matrix $K_\gamma$. All the results -apply directly to the matrix $K_\tau$, by setting the number +of the condensed matrix $K_\gamma$ in HyKKT. All the results +apply directly to the matrix $K_\tau$ in LiftedKKT, by setting the number of equality constraints to $m_e = 0$. -First, We show that if the iterates $(p, v)$ satisfy +First, we show that if the iterates $(p, v)$ satisfy the centrality conditions~\eqref{eq:centralitycond}, then the condensed matrix $K_\gamma$ exhibits a structured ill-conditioning. \subsubsection{Invariant subspaces in $K_\gamma$} -Witout regularization we have that $K_\gamma = W + H^\top D_s H + \gamma G^\top G$, with +Without regularization we have that $K_\gamma = W + H^\top D_s H + \gamma G^\top G$, with the diagonal $D_s = S^{-1} V$. -We note by $m_a$ the cardinality of $\mathcal{B}$, +We note by $m_a$ the cardinality of the active set $\mathcal{B}$, $H_{\cactive}$ the Jacobian of active inequality constraints, $H_{\cinactive}$ the Jacobian of inactive inequality constraints and by -$A := \begin{bmatrix} G^\top & H_{\cactive}^\top \end{bmatrix}^\top$ the active Jacobian. +$A := \begin{bmatrix} H_{\cactive}^\top & G^\top \end{bmatrix}^\top$ the active Jacobian. We define the minimum and maximum active slack values as \begin{equation} s_{min} = \min_{i \in \cactive} s_i \; , \quad s_{max} = \max_{i \in \cactive} s_i \; . \end{equation} -We remind that $m_e$ is the total number of equality constraints, +We remind that $m_e$ is the number of equality constraints, and define $\ell := m_e + m_a$. -To establish the structured ill-conditioning of $K_\gamma$ by +We explicit the structured ill-conditioning of $K_\gamma$ by modifying the approach outlined in \cite[Theorem 3.2]{wright1998ill} to account for the additional term $\gamma G^\top G$ arising from the equality constraints. -We show that the matrix $K_\gamma$ has two invariant subspaces, +We show that the matrix $K_\gamma$ has two invariant subspaces +(in the sense defined in \cite[Chapter 5]{stewart1990matrix}), associated respectively to the range of the transposed active Jacobian (\emph{large space}) and to the null space of the active Jacobian (\emph{small space}). @@ -154,8 +156,8 @@ \subsubsection{Invariant subspaces in $K_\gamma$} $K_\gamma$, ordered as $|\lambda_1| \geq \cdots \geq |\lambda_n|$. Let $\begin{bmatrix} Y & Z \end{bmatrix}$ be an orthogonal matrix, where $Z$ encodes the basis of the null-space of - $A$. Let $\underline{\sigma} =\min\left(\frac{1}{\Xi}, \gamma\right)$ - and $\overline{\sigma} = \max\left(\frac{1}{s_{min}}, \gamma\right)$. + $A$. Let $\underline{\sigma} :=\min\left(\frac{1}{\Xi}, \gamma\right)$ + and $\overline{\sigma} := \max\left(\frac{1}{s_{min}}, \gamma\right)$. Then, \begin{enumerate} \item[(i)] The $\ell$ largest-magnitude eigenvalues of $K_\gamma$ are positive, @@ -164,7 +166,7 @@ \subsubsection{Invariant subspaces in $K_\gamma$} are $\Theta(1)$. \item[(iii)] If $0 < \ell < n$, then $\cond(K_\gamma) = \Theta(\overline{\sigma})$. \item[(iv)] There are orthonormal matrices $\widetilde{Y}$ and $\widetilde{Z}$ for - simple invariant subspaces of $K_\gamma$, such that $Y - \widetilde{Y} = O(\underline{\sigma}^{-1})$ + simple invariant subspaces of $K_\gamma$ such that $Y - \widetilde{Y} = O(\underline{\sigma}^{-1})$ and $Z - \widetilde{Z} = O(\underline{\sigma}^{-1})$. \end{enumerate} \end{theorem} @@ -206,7 +208,7 @@ \subsubsection{Invariant subspaces in $K_\gamma$} Using \cite[Lemma 3.1]{wright1998ill}, we deduce $\lambda_1 = \Theta(\overline{\sigma})$ and $\lambda_\ell = \Omega(\underline{\sigma})$, proving the first result (i). - Let $L_\gamma = A^\top D_\gamma A$. + Let $L_\gamma := A^\top D_\gamma A$. We have that \begin{equation} \begin{bmatrix} @@ -294,13 +296,19 @@ \subsubsection{Invariant subspaces in $K_\gamma$} and the ratio $\frac{s_{max}}{s_{min}} = O(\Xi \overline{\sigma})$. The condition of $\Sigma_S$ reflects the condition of the reduced Hessian $Z^\top W Z$. -\begin{remark} - Theorem~\ref{thm:cond} (iii) tells us that $\cond(K_\gamma) = \Theta(\overline{\sigma})$, - meaning that if $\gamma$ is large-enough to satisfy $\gamma \geq \frac{1}{s_{min}}$, then - the conditioning $\cond(K_\gamma)$ increases linearly with $\gamma$. In - other words, the value $\frac{1}{s_{min}}$ gives us a lower-bound for $\gamma_{max}$ - in Proposition~\ref{prop:kkt:hykkt:cond}. -\end{remark} +Three observations are due: +\begin{enumerate} + \item Theorem~\ref{thm:cond} (iii) tells us that $\cond(K_\gamma) = \Theta(\overline{\sigma})$, + meaning that if $\gamma \geq \frac{1}{s_{min}}$, then + the conditioning $\cond(K_\gamma)$ increases linearly with $\gamma$, hence + recovering a known result \cite{regev2023hykkt}. + \item In early IPM iterations, the slacks are pushed away from the boundary + and the number of active inequality constraints is $m_a = 0$. The ill-conditioning + in $K_\gamma$ is caused only by $\gamma G^\top G$ and $\underline{\sigma} = \overline{\sigma} = \gamma$. + \item In late IPM iterations, the active slacks are converging to $0$. We observe + that if $\frac{1}{\Xi} \leq \gamma \leq \frac{1}{s_{min}}$ the parameter $\gamma$ + does not increase the ill-conditioning of the condensed matrix $K_\gamma$. +\end{enumerate} \subsubsection{Numerical accuracy of the condensed matrix $K_\gamma$} In floating-point arithmetic, the condensed matrix $K_\gamma$ is evaluated as @@ -315,7 +323,7 @@ \subsubsection{Numerical accuracy of the condensed matrix $K_\gamma$} \begin{proposition} \label{prop:cond:boundcondensedmatrix} In floating-point arithmetic, the perturbation - of the condensed matrix $\Delta K_\gamma$ satisfies + of the condensed matrix $K_\gamma$ satisfies $\Delta K_\gamma := \widehat{K_\gamma} - K_\gamma = O(\epstol \overline{\sigma})$. \end{proposition} \begin{proof} @@ -338,10 +346,10 @@ \subsubsection{Numerical accuracy of the condensed matrix $K_\gamma$} all bounded by $O(\overline{\sigma} \epstol)$, hence concluding the proof. \end{proof} -The perturbation $\Delta K_\gamma$ displays no specific structure. -Hence, we should determine under which -conditions the perturbed matrix $\widehat{K}_\gamma$ -keeps the structured ill-conditioning highlighted in \eqref{eq:cond:svd}. +If it is large enough, +the unstructured perturbation $\Delta K_\gamma$ +can impact the structured ill-conditioning in the perturbed +matrix $\widehat{K}_\gamma$. We know that the smallest eigenvalue $\eta_\ell$ of $A^\top D_\gamma A$ is $\Omega(\underline{\sigma})$. As mentioned in \cite[Section 3.4.2]{wright1998ill}, the perturbed matrix @@ -353,12 +361,12 @@ \subsubsection{Numerical accuracy of the condensed matrix $K_\gamma$} \| \Delta K_\gamma \| \ll \eta_\ell = \Omega(\underline{\sigma}) \; . \end{equation} However, the bound given in Proposition~\ref{prop:cond:boundcondensedmatrix} is too loose -for \eqref{eq:cond:perturbationbound} to hold without any further assumption, -as we have only $\underline{\sigma} \leq \overline{\sigma}$. +for \eqref{eq:cond:perturbationbound} to hold without any further assumption +(we have only $\underline{\sigma} \leq \overline{\sigma}$). We note that for some constant $C > 0$, $\Delta K_\gamma \leq C \epstol \overline{\sigma}$, implying $\Delta K_\gamma / \underline{\sigma} \leq C \epstol \overline{\sigma} / \underline{\sigma}$. Hence, if we suppose in addition the ratio $\overline{\sigma}/\underline{\sigma}$ is close to $1$, -then $\|\Delta K_\gamma\| = O(\epstol \overline{\sigma})$ can be replaced instead by +then $\|\Delta K_\gamma\| = O(\epstol \overline{\sigma})$ can instead be replaced by $\| \Delta K_\gamma\|= O(\epstol \underline{\sigma})$, ensuring \eqref{eq:cond:perturbationbound} holds. \subsubsection{Numerical solution of the condensed system} @@ -385,7 +393,7 @@ \subsubsection{Numerical solution of the condensed system} \item[(b)] The parameter $\gamma$ satisfies $\gamma = \Theta(\Xi^{-1})$. \item[(c)] The duality measure is large enough relative to the precision $\epstol$: $\epstol \ll \Xi$. \item[(d)] The primal step $\widehat{x}$ is computed using a backward - stable method satisfying \eqref{eq:cond:backwardstable} for a small constant + stable method satisfying~\eqref{eq:cond:backwardstable} for a small constant $\varepsilon_n$. \end{itemize} \label{hyp:cond:wellcond} @@ -396,9 +404,9 @@ \subsubsection{Numerical solution of the condensed system} the matrix $\Sigma_L$ well-conditioned with $\underline{\sigma} = \Theta(\Xi^{-1})$, $\overline{\sigma} = \Theta(\Xi^{-1})$ and $\overline{\sigma}/\underline{\sigma} = \Theta(1)$. -Condition (c) ensures that the conditioning of $\cond(K_\gamma) = \Theta(\overline{\sigma})$ -satisfies $\cond(K_\gamma) \epstol \ll 1$, -ensuring the Cholesky factorization runs to completion. +Condition (c) ensures that $\cond(K_\gamma) = \Theta(\overline{\sigma})$ +satisfies $\cond(K_\gamma) \epstol \ll 1$ +(implying the Cholesky factorization runs to completion). Condition (d) tells us that the perturbation caused by the Cholesky factorization is $\Delta_s K_\gamma = O(\epstol \| K_\gamma\|)$. As \eqref{eq:cond:boundinvariantsubspace} implies $\|K_\gamma \| = \Theta(\Xi^{-1})$, @@ -467,21 +475,26 @@ \subsubsection{Numerical solution of the condensed system} \begin{equation} \label{eq:cond:invperturbed} \begin{aligned} - (K_\gamma + \Delta K_\gamma)^{-1} &= (I + K_\gamma^{-1} \Delta K_\gamma)^{-1} K_\gamma^{-1} \\ + (K_\gamma + \Delta K_\gamma)^{-1} &= (I + K_\gamma^{-1} \Delta K_\gamma)^{-1} K_\gamma^{-1} \; , \\ &= K_\gamma^{-1} - K_\gamma^{-1}\Delta K_\gamma K_\gamma^{-1} + O(\|\Delta K_\gamma\|^2) \; . \end{aligned} \end{equation} -Using \eqref{eq:cond:inversecondensed} and noting $\Delta K_\gamma = \begin{bmatrix} +We decompose $\Delta K_\gamma$ in two matrices +$\Gamma_L \in \mathbb{R}^{\ell \times n}$ and $\Gamma_S \in \mathbb{R}^{(n-\ell) \times n}$ such that +$\Delta K_\gamma = \begin{bmatrix} \Gamma_L \\ \Gamma _S -\end{bmatrix}$, the first-order error is given by +\end{bmatrix}$. +Using \eqref{eq:cond:inversecondensed} the first-order error is given by \begin{equation} \label{eq:cond:inversecondensederror} K_\gamma^{-1}\Delta K_\gamma K_\gamma^{-1} = U_L \Sigma_L^{-1} \Gamma_L \Sigma_L^{-1}U_L^\top + U_S \Sigma_S^{-1} \Gamma_S \Sigma_S^{-1}U_S^\top \;. \end{equation} -Using \eqref{eq:cond:perturbationbound} and $(\Gamma_L, \Gamma_S)= O( \Xi^{-1}\epstol)$, -we deduce that the error made in the large space is $O(\Xi\epstol)$ whereas +Using \eqref{eq:cond:boundinvariantsubspace} and $(\Gamma_L, \Gamma_S)= O( \Xi^{-1}\epstol)$, +we obtain $\Sigma_L^{-1} \Gamma_L \Sigma_L^{-1} = O(\Xi \epstol)$ +and $\Sigma_S^{-1} \Gamma_S \Sigma_S^{-1} = O(\Xi^{-1} \epstol)$. +We deduce that the error made in the large space is $O(\Xi\epstol)$ whereas the error in the small space is $O(\Xi^{-1}\epstol )$. \subsection{Solution of the condensed KKT system} @@ -524,10 +537,24 @@ \subsubsection{Solution with HyKKT} the regularization parameter $\gamma$. \paragraph{Initial right-hand-side.} -In floating-point arithmetic, the initial right-hand side in \eqref{eq:kkt:schurcomplhykkt} +Let $\widehat{s}_\gamma := \bar{r}_1 + \gamma \widehat{G}^\top \bar{r}_2$. +The initial right-hand side in~\eqref{eq:kkt:schurcomplhykkt} is evaluated as -$\widehat{r}_\gamma :=\widehat{G} \widehat{K}_\gamma^{-1} (\bar{r}_1 + \gamma \widehat{G}^\top \bar{r}_2) - \bar{r}_2$. Using \eqref{eq:cond:condensedrhs}, we have +$\widehat{r}_\gamma :=\widehat{G} \widehat{K}_\gamma^{-1} \widehat{s}_\gamma - \bar{r}_2$. +The following proposition shows that despite an expression involving the inverse +of the ill-conditioned condensed matrix $K_\gamma$, the error made in $r_\gamma$ +is bounded only by the machine precision $\epstol$. + +\begin{proposition} +In floating point arithmetic, the error in the right-hand-side $\Delta \widehat{r}_\gamma$ satisfies: \begin{equation} + \label{eq:cond:errorrgamma} + \Delta \widehat{r}_\gamma = -\Delta \bar{r}_2 + \widehat{G} \widehat{K}_\gamma^{-1} \Delta s_\gamma = O(\epstol) \;. +\end{equation} +\end{proposition} +\begin{proof} +Using \eqref{eq:cond:condensedrhs}, we have +\begin{equation*} \label{eq:cond:boundderivationhykkt} \begin{aligned} \bar{r}_1 + \gamma \widehat{G}^\top \bar{r}_2 &= @@ -541,18 +568,15 @@ \subsubsection{Solution with HyKKT} \gamma \widehat{r}_3 \end{bmatrix}}_{O(\Xi^{-1}\epstol)} \; . \end{aligned} -\end{equation} -We denote $\widehat{s}_\gamma = \bar{r}_1 + \gamma \widehat{G}^\top \bar{r}_2 = -Y \widehat{s}_Y + Z \widehat{s}_Z$. +\end{equation*} The error decomposes as $\Delta s_\gamma = Y \Delta s_Y + Z \Delta s_Z = U_L \Delta s_L + U_S \Delta s_S$. -Using \eqref{eq:cond:boundderivationhykkt}, -we have $\Delta s_Y = O(\Xi^{-1} \epstol)$ and $\Delta s_Z = O(\epstol)$. +We have $\Delta s_Y = O(\Xi^{-1} \epstol)$ and $\Delta s_Z = O(\epstol)$. Using \eqref{eq:cond:invariantsubpsace}, we deduce $\Delta s_L = U_L^\top \Delta s_\gamma = O(\Xi^{-1} \epstol)$ and $\Delta s_S = U_S^\top \Delta s_\gamma = O(\epstol)$. -We obtain using \eqref{eq:cond:boundinvariantsubspace} and \eqref{eq:cond:inversecondensed} -that the error in the large space $\Delta s_L$ annihilates in the backsolve: +Using \eqref{eq:cond:boundinvariantsubspace} and \eqref{eq:cond:inversecondensed}, +the error in the large space $\Delta s_L$ annihilates in the backsolve: \begin{equation} \label{eq:cond:boundhykkt1} K_\gamma^{-1} \Delta s_\gamma = U_L \Sigma_L^{-1} \Delta s_L + U_S \Sigma_S^{-1} \Delta s_S = O(\epstol) @@ -570,19 +594,13 @@ \subsubsection{Solution with HyKKT} \big[ G U_L \Sigma_L^{-1} \Gamma_L + G U_S \Sigma_S^{-1} \Gamma_S \big] (K_\gamma^{-1} \Delta s_\gamma) \; . \end{equation} Using again \eqref{eq:cond:invariantsubpsace}: -$G U_L = G Y + O(\Xi)$ and $G U_S = GZ + O(\Xi) = O(\Xi)$. Hence +$G U_L = G Y + O(\Xi)$ and $G U_S = O(\Xi)$. Hence $G U_L \Sigma_L^{-1} \Gamma_L = O(1)$ and $G U_S \Sigma_S^{-1} \Gamma_S = O(1)$. Using \eqref{eq:cond:rhserrorfull}, we have $K_\gamma^{-1} \Delta G^\top = O(\epstol)$, implying $\Delta G K_\gamma^{-1} \Delta K_\gamma (K_\gamma^{-1} \Delta s_\gamma) = O(\Xi^{-1} \epstol^{2})$. -Assumption~\ref{hyp:cond:wellcond} implies that $\Xi^{-1} \epstol^2 \ll \epstol$. -All in all, the error in the right-hand-side $\Delta \widehat{r}_\gamma$ satisfies: -\begin{equation} - \label{eq:cond:errorrgamma} - \Delta \widehat{r}_\gamma = -\Delta \bar{r}_2 + \widehat{G} \widehat{K}_\gamma^{-1} \Delta s_\gamma = O(\epstol) \;. -\end{equation} -The result is remarkable: despite an expression involving the inverse -of the ill-conditioned condensed matrix $K_\gamma$, the error made in $r_\gamma$ -is bounded only by the machine precision $\epstol$. +Assumption~\ref{hyp:cond:wellcond} implies that $\Xi^{-1} \epstol^2 \ll \epstol$, +proving \eqref{eq:cond:errorrgamma}. +\end{proof} \paragraph{Schur-complement operator.} The solution of the system~\eqref{eq:kkt:schurcomplhykkt} @@ -609,7 +627,7 @@ \subsubsection{Solution with HyKKT} \begin{equation} G U_L \Sigma_L^{-1} U_L^\top G^\top = G Y \Sigma_L^{-1} Y^\top G^\top + O(\Xi^2) \; . \end{equation} - Using again \eqref{eq:cond:invariantsubpsace}, we have $G U_S = GZ + O(\Xi) = O(\Xi)$. + Using again \eqref{eq:cond:invariantsubpsace}, we have $G U_S = O(\Xi)$. Hence, $G U_S \Sigma_S^{-1} U_S^\top G^\top = O(\Xi^2)$, concluding the proof. \end{proof} @@ -624,7 +642,7 @@ \subsubsection{Solution with HyKKT} \end{equation} \end{proposition} \begin{proof} - We denote $\widehat{G} = G + \Delta G$ (with $G = O(\epstol)$). Then + We denote $\widehat{G} = G + \Delta G$ (with $\Delta G = O(\epstol)$). Then \begin{equation} \begin{aligned} \widehat{S}_\gamma &= \widehat{G} \widehat{K}_\gamma^{-1} \widehat{G}^\top \; , \\ @@ -663,13 +681,13 @@ \subsubsection{Solution with HyKKT} arithmetic. \subsubsection{Solution with Lifted KKT system} -The sparse-condensed KKT system has removed the equality -constraints from the optimization problems, simplifying +The equality relaxation strategy used in LiftedKKT +removes the equality constraints from the optimization problems, simplifying the solution of the condensed KKT system to \eqref{eq:liftedkkt}. -The analysis is similar than in \cite{wright1998ill}, -and the active Jacobian $A$ reduces to the active inequalities $A = H_{\cactive}$. -Using the similar arguments than in \eqref{eq:cond:boundderivationhykkt}, -the error in the right-hand-side is $O(\Xi^{-1} \epstol)$ and is in the +The active Jacobian $A$ reduces to the active inequalities $A = H_{\cactive}$, +and we recover the original setting presented in \cite{wright1998ill}. +Using the same arguments than in \eqref{eq:cond:boundderivationhykkt}, +the error in the right-hand-side is bounded by $O(\Xi^{-1} \epstol)$ and is in the range space of the active Jacobian $A$. Using \eqref{eq:cond:inversecondensed}, we can show that the absolute error on $\widehat{d}_x$ is bounded by $O(\Xi \epstol)$. That implies the descent direction $\widehat{d}_x$ retains @@ -682,7 +700,7 @@ \subsubsection{Summary} is computed only with an (absolute) precision $\varepsilon_{K}$, greater than the machine precision $\epstol$ (for HyKKT, $\varepsilon_K$ is the absolute tolerance of the \CG algorithm, for LiftedKKT the -absolute tolerance of the iterative refinement algorithm $\varepsilon_{K}$). +absolute tolerance of the iterative refinement algorithm). The errors $\widehat{d}_x - d_x = O(\varepsilon_K)$ and $\widehat{d}_y - d_y = O(\varepsilon_K)$ propagate further in $(\widehat{d}_s, \widehat{d}_z)$. @@ -695,19 +713,16 @@ \subsubsection{Summary} giving the following bounds for the errors in the inactive and active components: \begin{equation} \begin{aligned} - & \widehat{d}_{z,\cactive} &= -\widehat{r}_{2,\cactive} - \widehat{D}_{s,\cactive} \widehat{d}_{s,\cactive} - &= d_{z,\cactive} + O(\max\{\epstol, \varepsilon_K \Xi^{-1}\}) \;,\\ - & \widehat{d}_{z,\cinactive} &= -\widehat{r}_{2,\cinactive} - \widehat{D}_{s,\cinactive} \widehat{d}_{s,\cinactive} - &= d_{z,\cinactive} + O(\varepsilon_K \Xi) \; . + \widehat{d}_{z,\cactive} &= -\widehat{r}_{2,\cactive} - \widehat{D}_{s,\cactive} \widehat{d}_{s,\cactive} + = d_{z,\cactive} + O(\varepsilon_K \Xi^{-1}) \;,\\ + \widehat{d}_{z,\cinactive} &= -\widehat{r}_{2,\cinactive} - \widehat{D}_{s,\cinactive} \widehat{d}_{s,\cinactive} + = d_{z,\cinactive} + O(\varepsilon_K \Xi) \; . \end{aligned} \end{equation} -The error arises mostly in the active components $\widehat{d}_{z,\cactive}$. -We want to set $\varepsilon_K \ll \Xi$ to limit the loss of accuracy, but doing -so is not trivial as we are approaching the optimum. For example, -solving the condensed system with a very good tolerance $\varepsilon_K = 10^{-12}$ -would only ensure that the absolute error is bounded by $10^{-4}$ if $\Xi = 10^{-8}$ -(as it is typical in primal-dual IPM). The impact remains limited if we -have only few active inequality constraints. +Most of the error arises in the active components $\widehat{d}_{z,\cactive}$. +To limit the loss of accuracy, the algorithm has to decrease the absolute precision $\varepsilon_K$ +as we are approaching to a local optimum. +The impact remains limited if we have only few active inequality constraints. %%% Local Variables: diff --git a/tex/sections/introduction.tex b/tex/sections/introduction.tex index 9cf9d13..85e5b7f 100644 --- a/tex/sections/introduction.tex +++ b/tex/sections/introduction.tex @@ -1,6 +1,6 @@ \section{Introduction} Graphics processing units (GPUs) are driving the advancement of scientific computing, their most remarkable success being the capabilities to train and utilize large artificial intelligence (AI) models. -GPUs offer several practical advantages: (1) massive parallel computing capability for applications that can exploit coarse-grain parallelism and high-memory bandwidth and (2) power efficiency due to requiring fewer transistors to process multiple tasks in parallel utilizing ``single instruction, multiple data'' (SIMD) parallelism. +GPUs offer two practical advantages: (1) massive parallel computing capability for applications that can exploit coarse-grain parallelism and high-memory bandwidth and (2) power efficiency due to requiring fewer transistors to process multiple tasks in parallel utilizing ``single instruction, multiple data'' (SIMD) parallelism. While GPUs have made significant strides in enhancing machine learning applications, their adoption in the mathematical programming community has been relatively limited. This limitation stems primarily from the fact that most optimization solvers were developed in the 1990s and are heavily optimized for CPU architectures. @@ -9,7 +9,7 @@ \section{Introduction} \item \textbf{Improved sparse matrix operations}: The performance of sparse matrix operations has seen substantial improvements in the CUDA library, largely attributed to the integration of novel tensor cores in recent GPUs~\cite{markidis2018nvidia}. \item \textbf{Interest in batch optimization}: There is a growing interest in solving parametric optimization problems in batch mode, for problems sharing the same structure but with different parameters~\cite{amos2017optnet,pineda2022theseus}. \item \textbf{Advancements in automatic differentiation}: GPUs offer unparalleled performance for automatic differentiation, benefiting both machine learning~\cite{jax2018github} and scientific computing applications \cite{enzyme2021}. Engineering problems often exhibit recurring patterns throughout the model. Once these patterns are identified, they can be evaluated in parallel within a SIMD framework, enabling near speed-of-light performance~\cite{shin2023accelerating}. - \item \textbf{Role in exascale computing}: With the emergence of new exascale supercomputers (e.g., Frontier and Aurora), the capabilities to run on GPUs have become even more important for supercomputing. + \item \textbf{Role in exascale computing}: With the emergence of new exascale supercomputers (e.g., Frontier and Aurora), the capabilities to run on GPUs have become central for supercomputing. \end{enumerate} \subsection{Solving optimization problems on GPU: current state-of-the-art} @@ -20,7 +20,7 @@ \subsection{Solving optimization problems on GPU: current state-of-the-art} The factorization of sparse matrices encountered within second-order optimization algorithms has been considered to be challenging on GPUs. For this reason, practitioners often has resorted to using first-order methods on GPUs, leveraging level-1 and level-2 BLAS operations that -are more amenable to parallel computation. +are more amenable to parallel computations. First-order algorithms depend mostly on (sparse) matrix-vector operations that run very efficiently on modern GPUs. Hence, we can counterbalance the relative inaccuracy of the first-order method by running more @@ -52,23 +52,23 @@ \subsection{Solving optimization problems on GPU: current state-of-the-art} in nonlinear programming: Most engineering problems encode complex physical equations that are likely to break any convex structure in the problem. Previous experiments on the alternating current (AC) optimal power flow (OPF) problem have shown that even a simple -algorithm as an alternating direction method of multipliers (ADMM) has trouble converging as soon as the convergence +algorithm as the alternating direction method of multipliers (ADMM) has trouble converging as soon as the convergence tolerance is set below $10^{-3}$~\cite{kimLeveragingGPUBatching2021}. Thus, second-order methods remain a competitive option, particularly -for scenarios that require higher levels of accuracy and robust convergence. -Second-order algorithms require solving a Newton step at each +for scenarios that demand higher levels of accuracy and robust convergence. +Second-order algorithms solve a Newton step at each iteration, an operation relying on non-trivial sparse linear algebra operations. The previous generation of GPU-accelerated sparse linear solvers were lagging behind their CPU equivalents, as illustrated in subsequent surveys~\cite{swirydowicz2021linear,tasseff2019exploring}. Fortunately, sparse solvers on GPUs are becoming increasingly better: NVIDIA has released in November 2023 the {\tt cuDSS} sparse direct solver that implements different sparse factorization routines with remarkably improved performance. -Our benchmark results incidate that {\tt cuDSS} is significantly faster than the previous sparse solvers using NVIDIA GPUs (e.g., published in \cite{shin2023accelerating}). +Our benchmark results indicate that {\tt cuDSS} is significantly faster than the previous sparse solvers using NVIDIA GPUs (e.g., published in \cite{shin2023accelerating}). % SS: removed due to irrelevance. % The new NVIDIA Grace CPU architecture could also be a game changer in the future of sparse linear solvers, thanks to fast communication between the CPU and GPU. Furthermore, variants of interior point methods have been proposed -that does not require the use of numerical pivoting in the linear solves, +that does not depend on numerical pivoting in the linear solves, opening the door to parallelized sparse solvers. Coupled with a GPU-accelerated automatic differentiation library and a sparse Cholesky solver, these nonlinear programming solvers can solve @@ -76,14 +76,14 @@ \subsection{Solving optimization problems on GPU: current state-of-the-art} methods~\cite{shin2023accelerating}. There exist a few alternatives to sparse linear solvers for solving the KKT systems on the GPU. -On the one hand, iterative and Krylov methods rely only on matrix-vector products to solve linear systems. +On the one hand, iterative and Krylov methods depend only on matrix-vector products to solve linear systems. They often require non-trivial reformulation or specialized preconditioning of the KKT systems to mitigate the inherent ill-conditioning of the KKT matrices, which has limited their use within the interior-point methods \cite{curtisNoteImplementationInteriorpoint2012,rodriguezScalablePreconditioningBlockstructured2020}. New results are giving promising outlooks for convex problems~\cite{ghannad2022linear}, -but nonconvex problems often require an Augmented Lagrangian reformulation +but nonconvex problems often need an Augmented Lagrangian reformulation to be tractable~\cite{cao2016augmented,regev2023hykkt}. In particular, \cite{regev2023hykkt} presents an interesting use of the Golub and Greif hybrid method~\cite{golub2003solving} to solve the KKT systems arising in @@ -91,7 +91,7 @@ \subsection{Solving optimization problems on GPU: current state-of-the-art} On the other hand, null-space methods (also known as reduced Hessian methods) reduce the KKT system down to a dense matrix, a setting also favorable for GPUs. Our previous research has shown that the method plays nicely with the interior-point -methods if the number of degrees of freedom in the problem is relatively small~\cite{pacaud2022condensed}. +methods if the number of degrees of freedom in the problem remains relatively small~\cite{pacaud2022condensed}. \subsection{Contributions} @@ -116,7 +116,7 @@ \subsection{Contributions} Extending beyond the classical OPF instances examined in our previous work, we incorporate large-scale problems sourced from the COPS nonlinear benchmark~\cite{dolan2004benchmarking}. Our assessment involves comparing the performance achieved on the GPU with that of a state-of-the-art method executed on the CPU. -The findings reveal that the condensed-space IPM enables a remarkable 10x acceleration in solving large-scale OPF instances when utilizing the GPU. +The findings reveal that the condensed-space IPM enables a remarkable ten-time acceleration in solving large-scale OPF instances when utilizing the GPU. However, performance outcomes on the COPS benchmark exhibit more variability. \subsection{Notations} diff --git a/tex/sections/ipm.tex b/tex/sections/ipm.tex index 01c8118..70833f4 100644 --- a/tex/sections/ipm.tex +++ b/tex/sections/ipm.tex @@ -48,12 +48,12 @@ \subsection{Problem formulation and KKT conditions} In~\eqref{eq:slack_problem}, the inequality constraints are encoded inside the variable bounds. -We denote by $y \in \mathbb{R}^{m_e}$ the multipliers associated -to the equality constraints and $z \in \mathbb{R}^{m_i}$ the multipliers -associated to the inequality constraints. Similarly, we denote +We denote by $y \in \mathbb{R}^{m_e}$ and $z \in \mathbb{R}^{m_i}$ the multipliers associated +resp. to the equality constraints and the inequality constraints. +Similarly, we denote by $(u, v) \in \mathbb{R}^{n + m_i}$ the multipliers associated -respectively to $x \geq 0$ and $s \geq 0$. -Using the multipliers $(y, z, u, v)$, we define the Lagrangian of \eqref{eq:slack_problem} as +respectively to the bounds $x \geq 0$ and $s \geq 0$. +Using the dual variable $(y, z, u, v)$, we define the Lagrangian of \eqref{eq:slack_problem} as \begin{equation} \label{eq:lagrangian} L(x, s, y, z, u, v) = f(x) + y^\top g(x) + z^\top \big(h(x) +s\big) @@ -221,11 +221,11 @@ \subsection{Solving the KKT conditions with the interior-point method} if and only if the Jacobian $J = \begin{bmatrix} G \; &\; 0 \\ H \;&\; I \end{bmatrix}$ is full row-rank and the matrix $\begin{bmatrix} W + D_x & 0 \\ 0 & D_s \end{bmatrix}$ projected onto the null-space of the Jacobian $J$ is positive-definite~\cite{benzi2005numerical}. -The condition is satisfied if the inertia of the matrix is $(n + m_i, m_i + m_e, 0)$. -The inertia is defined as the respective numbers of positive, negative and zero eigenvalues. +The condition is satisfied if the inertia (defined as the respective numbers +of positive, negative and zero eigenvalues) of the matrix~\eqref{eq:kkt:augmented} is $(n + m_i, m_i + m_e, 0)$. We use the inertia-controlling method introduced in \cite{wachter2006implementation} to regularize the augmented matrix by adding multiple of the identity -on the diagonal of \eqref{eq:kkt:augmented} if the inertia is not $(n+m_i, m_e+m_i, 0)$. +on the diagonal of \eqref{eq:kkt:augmented} if the inertia is not equal to $(n+m_i, m_e+m_i, 0)$. As a consequence, the system \eqref{eq:kkt:augmented} is usually factorized using an inertia-revealing \lblt factorization~\cite{duff1983multifrontal}. @@ -233,7 +233,7 @@ \subsection{Solving the KKT conditions with the interior-point method} as the block diagonal terms $D_x$ and $D_s$ are getting increasingly ill-conditioned near the solution. Their use in IPM has been limited in linear and convex quadratic programming \cite{gondzio-2012} (when paired -with a suitable preconditioner). We also refer to \cite{cao-seth-laird-2016} +with a suitable preconditioner). We also refer to \cite{cao2016augmented} for an efficient implementation of a preconditioned conjugate gradient on GPU, for solving the Newton step arising in an augmented Lagrangian interior-point approach. @@ -308,7 +308,7 @@ \subsection{Discussion} Additionally, condensation may result in increased fill-in within the condensed system \eqref{eq:kkt:condensed}~\cite[Section 19.3, p.571]{nocedal_numerical_2006}. In the worst cases \eqref{eq:kkt:condensed} itself may become fully dense if an inequality row is completely dense (fortunately, a case rarer than in the normal equations used in linear programming). Consequently, condensation methods are not commonly utilized in practical optimization settings. -To the best our knowledge, Knitro~\cite{waltz2006interior} is the only solver that supports computing the descent direction with \eqref{eq:kkt:condensed}. +To the best of our knowledge, Artelys Knitro~\cite{waltz2006interior} is the only solver that supports computing the descent direction with \eqref{eq:kkt:condensed}. %%% Local Variables: %%% mode: latex diff --git a/tex/sections/kkt_systems.tex b/tex/sections/kkt_systems.tex index c014d05..c8479d5 100644 --- a/tex/sections/kkt_systems.tex +++ b/tex/sections/kkt_systems.tex @@ -2,7 +2,7 @@ \section{Solving KKT systems on the GPU} The GPU has emerged as a new prominent computing hardware not only for graphics-related but also for general-purpose computing. GPUs employ a SIMD formalism that yields excellent throughput for parallelizing small-scale operations. However, their utility remains limited when computational algorithms require global communication. -Sparse factorization algorithms, which heavily rely on numerical pivoting, pose significant challenges for implementation on GPUs. Previous research has demonstrated that GPU-based linear solvers significantly lag behind their CPU counterparts \cite{tasseff2019exploring,swirydowicz2021linear}. +Sparse factorization algorithms, which heavily rely on numerical pivoting, pose significant challenges for implementation on GPUs. Previous research has demonstrated that GPU-based linear solvers significantly lag behind their CPU counterparts \cite{swirydowicz2021linear,tasseff2019exploring}. One emerging strategy is to utilize sparse factorization techniques that do not necessitate numerical pivoting \cite{regev2023hykkt,shin2023accelerating} by leveraging the structure of the condensed KKT system \eqref{eq:kkt:condensed}. We present two alternative methods to solve \eqref{eq:kkt:condensed}. @@ -11,7 +11,7 @@ \section{Solving KKT systems on the GPU} On the other hand, LiftedKKT~\cite{shin2023accelerating} uses an equality relaxation strategy and is presented in \S\ref{sec:kkt:sckkt}. -\subsection{Golub \& Greif strategy} +\subsection{Golub \& Greif strategy: HyKKT} \label{sec:kkt:golubgreif} The Golub \& Greif~\cite{golub2003solving} strategy reformulates the KKT system using an Augmented Lagrangian formulation. @@ -42,17 +42,8 @@ \subsection{Golub \& Greif strategy} for all $\gamma > \underline{\gamma}$, the reduced Hessian $Z^\top K Z$ is positive definite if and only if $K_\gamma$ is positive definite. Using the Sylvester's law of inertia stated in \eqref{eq:ipm:inertia}, we deduce -that for $\gamma > \underline{\gamma}$, $\inertia(K_2) = (n + m_i, m_e +m_i, 0)$ -implies that $K_\gamma$ is positive definite. - -If $\gamma$ is large enough, we can prove that the conditioning -of $K_\gamma$ increases linearly with $\gamma$. -\begin{proposition}[\cite{regev2023hykkt}, Theorem 2, p.7] - \label{prop:kkt:hykkt:cond} - We suppose $G$ is full row rank. Then there exists $\overline{\gamma} - \geq \underline{\gamma}$ such that for all $\gamma \geq \overline{\gamma}$, - $\cond(K_\gamma)$ increases linearly with $\gamma$. -\end{proposition} +that for $\gamma > \underline{\gamma}$, if $\inertia(K_2) = (n + m_i, m_e +m_i, 0)$ +then $K_\gamma$ is positive definite. The linear solver HyKKT~\cite{regev2023hykkt} leverages the positive definiteness of $K_\gamma$ to solve @@ -73,7 +64,7 @@ \subsection{Golub \& Greif strategy} The sparse Cholesky factorization has the advantage of being stable without numerical pivoting, rendering the algorithm tractable on a GPU. Each \CG iteration requires the application of sparse triangular solves with the -factors of $K_\gamma$, operations that can be numerically demanding. For that reason, +factors of $K_\gamma$. For that reason, HyKKT is efficient only if the \CG solver converges in a small number of iterations. Fortunately, the eigenvalues of the Schur-complement $S_\gamma := G K_\gamma^{-1} G^\top$ all converge to $\frac{1}{\gamma}$ as we increase the regularization parameter @@ -84,29 +75,31 @@ \subsection{Golub \& Greif strategy} \CR \cite{hestenes-stiefel-1952} and \CAR \cite{montoison-orban-saunders-2023} can also be used as an alternative to \CG. Although we observe similar performance, these methods ensure a monotonic decrease in the residual norm of \eqref{eq:kkt:schurcomplhykkt} at each iteration. -\subsection{Equality relaxation strategy} +\subsection{Equality relaxation strategy: LiftedKKT} \label{sec:kkt:sckkt} -If the initial problem~\eqref{eq:problem} has only inequality constraints, -we keep only the $(1,1)$ block in the system \eqref{eq:kkt:condensed}. -Using the relation~\eqref{eq:ipm:inertia}, the system -is guaranteed to be positive definite if the primal regularization parameter $\delta_x$ is adequately large. -Furthermore, the parameter $\delta_x$ is chosen dynamically using the inertia information of the system in \eqref{eq:kkt:condensed}. This motivates the Lifted KKT System strategy: we approximate the equalities with lifted inequalities. For a small relaxation parameter $\tau > 0$ (chosen based on the numerical tolerance of the optimization solver -$\varepsilon_{tol}$), we solve the relaxed problem +$\varepsilon_{tol}$), the equality relaxation strategy~\cite{shin2023accelerating} approximates +the equalities with lifted inequalities: \begin{equation} \label{eq:problemrelaxation} \min_{x \in \mathbb{R}^n} \; f(x) \quad \text{subject to}\quad - \tau \leq g(x) \leq \tau \;,~ h(x) \leq 0 \; . \end{equation} -The problem~\eqref{eq:problemrelaxation} has only inequality constraints. After introducing slack variables, the condensed KKT system -\eqref{eq:kkt:condensed} reduces to +The problem~\eqref{eq:problemrelaxation} has only inequality constraints. +After introducing slack variables, the condensed KKT system \eqref{eq:kkt:condensed} reduces to \begin{equation} \label{eq:liftedkkt} - K_\tau \,d_x = - r_1 - H_\tau^\top(D_H r_4 - C r_2) \; . + K_\tau \,d_x = - r_1 - H_\tau^\top(D_H r_4 - C r_2) \; , \end{equation} -Using the standard inertia correction procedure, the parameter $\delta_x$ is set to a value high enough to ensure that $K_\tau$ is positive definite. Therefore, $K_\tau$ can be factorized with a Cholesky decomposition, satisfying the key requirement of stable pivoting for the implementation on the GPU. The relaxation causes error in the final solution, but the error is in the same order of the solver tolerance, and thus, do not significantly deteriorates the solution quality for small $\varepsilon_{tol}$. +with $H_\tau = \big(G^\top ~ H^\top \big)^\top$ and +$K_\tau := W + D_x + \delta_w I + H_\tau^\top D_H H_\tau$. +Using the relation~\eqref{eq:ipm:inertia}, the matrix $K_\tau$ +is guaranteed to be positive definite if the primal regularization parameter $\delta_w$ is adequately large. +As such, the parameter $\delta_w$ is chosen dynamically using the inertia information of the system in \eqref{eq:kkt:condensed}. +Therefore, $K_\tau$ can be factorized with a Cholesky decomposition, satisfying the key requirement of stable pivoting for the implementation on the GPU. The relaxation causes error in the final solution. +Fortunately, the error is in the same order of the solver tolerance, thus it does not significantly deteriorate the solution quality for small $\varepsilon_{tol}$. While this method can be implemented with small modification in the optimization solver, the presence of tight inequality in \eqref{eq:problemrelaxation} causes severe ill-conditioning throughout the IPM iterations. Thus, using an accurate iterative refinement algorithm is necessary to get a reliable convergence behavior. diff --git a/tex/sections/numerics.tex b/tex/sections/numerics.tex index 7820671..84bce20 100644 --- a/tex/sections/numerics.tex +++ b/tex/sections/numerics.tex @@ -1,12 +1,11 @@ \section{Numerical results} -We implement LiftedKKT and HyKKT on the GPU. For comparison, -we use HSL MA27 and MA57 as a baseline. -First, we detail in \S\ref{sec:num:pprof} the respective performance of the two hybrid solvers -on a large-scale OPF instance. Then, we present in \S\ref{sec:num:opf} -the results reported on the PGLIB OPF benchmark, complemented in \S\ref{sec:num:cops} by -the COPS benchmark. +We have implemented LiftedKKT and HyKKT on the GPU. +First, we detail in \S\ref{sec:num:implementation} +our implementation and assess in \S\ref{sec:num:pprof} the performance on the GPU of the two hybrid solvers +Then, we present in \S\ref{sec:num:opf} the results reported on the PGLIB OPF benchmark, complemented in \S\ref{sec:num:cops} by the COPS benchmark. \subsection{Implementation} +\label{sec:num:implementation} All our implementation uses the Julia language \cite{bezanson-edelman-karpinski-shah-2017}. We have used our local workstation to generate the results on the CPU, here equipped with an AMD EPYC 7443 processor (3.2GHz). @@ -46,7 +45,7 @@ \subsection{Implementation} \end{itemize} CHOLMOD \cite{chen-davis-hager-rajamanickam-2008} is shipped with Julia. For the HSL linear solvers, we utilize libHSL \cite{fowkes-lister-montoison-orban-2024} with the Julia interface HSL.jl \cite{montoison-orban-hsl-2021}. -HSL MA57 and CHOLMOD are both compiled with OpenBLAS \cite{openblas}, a multithreaded version of BLAS and LAPACK. +HSL MA57 and CHOLMOD are both compiled with OpenBLAS, a multithreaded version of BLAS and LAPACK. The Julia package Krylov.jl~\cite{montoison2023krylov} contains a collection of Krylov methods with a polymorphic implementation that can be used on both CPU and GPU architectures. % Should we provide the version of the linear solvers? @@ -67,7 +66,7 @@ \subsubsection{Individual performance of the linear solvers} the factorization and the triangular solves with those reported in CHOLMOD. The results are displayed in Table~\ref{tab:linsol:time}. -We benchmark the three decompositions implemented in cuDSS (\llt, \ldlt, \lu), and assess the accuracy of the solution by computing the infinity norm of the residual. +We benchmark the three decompositions implemented in cuDSS (\llt, \ldlt, \lu), and assess the absolve accuracy of the solution by computing the infinity norm of the residual $\|K_\gamma x - b \|_\infty$. % The accuracy of the solution depends of on more factors than the backsolve. % The main error comes from the factorization. We observe that the analysis phase is four times slower for cuDSS compared to CHOLMOD. @@ -105,7 +104,7 @@ \subsubsection{Individual performance of the linear solvers} \resizebox{\textwidth}{!}{ \begin{tabular}{|lrrrr|} \hline - linear solver & analysis (s) & factorization (s) & backsolve (s) & accuracy \\ + linear solver & analysis (s) & factorization (s) & backsolve (s) & abs. accuracy \\ \hline CHOLMOD & 1.18 & $8.57 \times 10^{-1}$ & $1.27 \times 10^{-1}$ & $3.60\times 10^{-13}$\\ cuDSS-\llt & 4.52 & $3.75 \times 10^{-2}$ & $1.32 \times 10^{-2}$ & $2.64\times 10^{-13}$\\ % Sungho: not Cholesky? it would be better to use consistent term. I'd recommend Cholesky instead of LL^T. @@ -118,7 +117,6 @@ \subsubsection{Individual performance of the linear solvers} The matrix $K_\gamma$ is symmetric positive definite, with a size $n = 674,562$. The matrix is extremely sparse, with only $7,342,680$ non-zero entries ($0.002$\%). \label{tab:linsol:time} - (A100 GPU) } \end{table} @@ -132,8 +130,8 @@ \subsubsection{Tuning the Golub \& Greif strategy} the faster the \CG algorithm: we decrease the total number of iterations spent in \CG by a factor of 10. However, we have to pay a price in term of accuracy: for $\gamma > 10^8$ the solution returned by the linear solver -is not accurate enough and the IPM algorithm has to proceed to more -primal-dual regularization, leading to an increase in the total number of iterations. +is not accurate enough and the IPM algorithm has to proceed to additional +primal-dual regularizations. On the numerical side, the table in Figure~\ref{fig:hybrid:gamma} compares the time spent in the IPM solver on the CPU (using CHOLMOD) and on the GPU @@ -147,7 +145,7 @@ \subsubsection{Tuning the Golub \& Greif strategy} \resizebox{\textwidth}{!}{ \begin{tabular}{|r|rrrr >{\bfseries}r|rrrr >{\bfseries}r|} \hline - & \multicolumn{5}{c|}{\bf CHOLMOD (CPU)} & \multicolumn{5}{c|}{\bf cuDSS-\ldlt (CUDA)} \\ + & \multicolumn{5}{c|}{\bf CHOLMOD-\ldlt (CPU)} & \multicolumn{5}{c|}{\bf cuDSS-\ldlt (CUDA)} \\ \hline $\gamma$ & \# it & cond. (s) & \CG (s) & linsol (s) & IPM (s) & \# it & cond. (s) & \CG (s) & linsol (s) & IPM (s) \\ \hline @@ -188,10 +186,10 @@ \subsubsection{Tuning the equality relaxation strategy} as we decrease the IPM tolerance $\varepsilon_{tol}$. We display both the runtimes on the CPU (using CHOLMOD-\ldlt) and on the GPU (using {\tt cuDSS}-\ldlt). The slacks associated with the relaxed equality constraints are converging to a value below $2 \tau$, -leading to highly ill-conditioned terms in the diagonal matrices $\Sigma_s$. +leading to highly ill-conditioned terms in the diagonal matrices $D_s$. As a consequence, the conditioning of the matrix $K_\tau$ in \eqref{eq:liftedkkt} can increase above $10^{18}$, leading to a nearly singular linear system. -We observe {\tt cuDSS}-\ldlt is more stable: the factorization +We observe {\tt cuDSS}-\ldlt is more stable than CHOLMOD: the factorization succeeds, and the loss of accuracy caused by the ill-conditioning is tamed by the multiple Richardson iterations that reduces the relative accuracy in the residual down to an acceptable level. As a result, {\tt cuDSS} can solve @@ -213,7 +211,7 @@ \subsubsection{Tuning the equality relaxation strategy} % $10^{-8}$ & - & - &- & - & 105 & 20.3&$1.2 \times 10^{-6}$ \\ % \hline % \end{tabular} - \begin{tabular}{|l|rr|rr|rr|} + \begin{tabular}{|l|rr|rr|r|} \hline & \multicolumn{2}{c|}{\bf CHOLMOD-\ldlt (CPU)} & \multicolumn{2}{c|}{\bf cuDSS-\ldlt (CUDA)}& \\ \hline @@ -237,7 +235,13 @@ \subsubsection{Tuning the equality relaxation strategy} \subsubsection{Breakdown of the time spent in one IPM iteration} We decompose the time spent in a single -IPM iteration for all the available KKT solvers (HSL MA27, LiftedKKT, and HyKKT). +IPM iteration for LiftedKKT and HyKKT. +As a reference running on the CPU, we show the time spent in the solver HSL MA27. +We observe that HSL MA57 is slower +than HSL MA27, as the OPF instances are super-sparse. +Hence, the block elimination algorithm implemented in HSL MA57 is not beneficial there +\footnote{Personal communication with Iain Duff.}. + When solving the KKT system, the time can be decomposed into: (1) assembling the KKT system, (2) factorizing the KKT system, and (3) computing the descent direction with triangular solves. As depicted in Figure~\ref{fig:timebreakdown}, we observe @@ -246,7 +250,8 @@ \subsubsection{Breakdown of the time spent in one IPM iteration} 30x and 15x in the factorization compared to MA27 and CHOLMOD running on the CPU. Once the KKT system is factorized, computing the descent direction with LiftedKKT is faster than with HyKKT (0.04s compared to 0.13s) as HyKKT has to run a \CG algorithm to solve the Schur complement -system~\eqref{eq:kkt:schurcomplhykkt}. +system~\eqref{eq:kkt:schurcomplhykkt}, leading to additional backsolves +in the linear solver. \begin{figure}[!ht] \centering @@ -275,20 +280,16 @@ \subsection{Benchmark on OPF instances} \label{sec:num:opf} We run a benchmark on difficult OPF instances taken from the PGLIB benchmark~\cite{babaeinejadsarookolaee2019power}. -We compare our two condensed-space methods with HSL MA27 running -on the CPU. The results are displayed in Table~\ref{tab:opf:benchmark}, +We compare LiftedKKT and HyKKT with HSL MA27. +The results are displayed in Table~\ref{tab:opf:benchmark}, for an IPM tolerance set to $10^{-6}$. Regarding HyKKT, we set $\gamma = 10^7$ following the analysis in \S\ref{sec:num:tuninghykkt}. The table displays the time spent in the initialization, the time spent in the linear solver and the total solving time. We complement the table with a Dolan \& Moré performance profile~\cite{dolan2002benchmarking} displayed in Figure~\ref{fig:opf:pprof}. - Overall, the performance of HSL MA27 on the CPU is consistent with what was reported -in \cite{babaeinejadsarookolaee2019power}: We observe that HSL MA57 is slower -than HSL MA27, as the OPF instances are super-sparse. -Hence, the block elimination algorithm implemented in HSL MA57 is not beneficial there -\footnote{Personal communication with Iain Duff.}. +in \cite{babaeinejadsarookolaee2019power}. On the GPU, LiftedKKT+cuDSS is faster than HyKKT+cuDSS on small and medium instances: indeed, the algorithm does not have to run a \CG algorithm at each IPM iteration, limiting the number @@ -302,13 +303,13 @@ \subsection{Benchmark on OPF instances} The benchmark presented in Table~\ref{tab:opf:benchmark} has been generated using a NVIDIA A100 GPUs (current selling price: \$10k). We have also compared the performance -with cheaper GPUs: a NVIDIA A1000 (a laptop-based GPU, 4GB memory, price: \$150) and a NVIDIA A30 +with cheaper GPUs: a NVIDIA A1000 (a laptop-based GPU, 4GB memory) and a NVIDIA A30 (24GB memory, price: \$5k). As a comparison, the selling price of the AMD EPYC 7443 processor we used for the benchmark on the CPU is \$1.2k. The results are displayed in Figure~\ref{fig:gpubench}. We observe that the performance of the A30 and the A100 are relatively similar. The cheaper A1000 GPU is already faster -than the GPU, but is not able to solve the largest instance as it is running out of memory. +than HSL MA27 running on the CPU, but is not able to solve the largest instance as it is running out of memory. \begin{table}[!ht] \centering @@ -384,11 +385,9 @@ \subsection{Benchmark on COPS instances} and HyKKT outperforms HSL MA27 when running on the GPU. However, the OPF instances are specific nonlinear instances. For that reason, we complement our analysis by looking -at the performance of LiftedKKT and HyKKT on the COPS benchmark, -which gathers generic nonlinear programs~\cite{dolan2004benchmarking}. -We look at the performance we get on the particular COPS instances used in -the Mittelmann benchmark, used to benchmark nonlinear optimization -solvers~\cite{mittelmann2002benchmark}. +at the performance of LiftedKKT and HyKKT on large-scale COPS instances~\cite{dolan2004benchmarking}. +We look at the performance we get on the COPS instances used in +the Mittelmann benchmark~\cite{mittelmann2002benchmark}. To illustrate the heterogeneity of the COPS instances, we display in Figure~\ref{fig:cops:nnz} the sparsity pattern of the condensed matrices $K_\gamma$ \eqref{eq:kkt:hykkt} for one OPF instance and for multiple @@ -399,7 +398,7 @@ \subsection{Benchmark on COPS instances} The results of the COPS benchmark are displayed in Table~\ref{tab:cops:benchmark}. HSL MA57 gives better results than HSL MA27 for the COPS benchmark, and -for that reason we have decided to use HSL MA57 as the reference. As expected, +for that reason we have decided to replace HSL MA27 by HSL MA57. As expected, the results are different than on the OPF benchmark. We observe that LiftedKKT+cuDSS and HyKKT+cuDSS outperform HSL MA57 on the dense instance {\tt elec} (20x speed-up) and {\tt bearing} --- an instance whose sparsity pattern @@ -413,7 +412,7 @@ \subsection{Benchmark on COPS instances} \caption{Sparsity patterns for one OPF instance and for various COPS problems. The first row displays the sparsity pattern of $K_\gamma$, after AMD reordering. The second row displays - the sparsity pattern of the triangular factor computed by CHOLMOD. + the sparsity pattern of $K_\gamma$ after Metis reordering. \label{fig:cops:nnz} } \end{figure} @@ -453,7 +452,7 @@ \subsection{Benchmark on COPS instances} \hline \end{tabular} } - \caption{COPS benchmark , solved with a tolerance {\tt tol=1e-6}\label{tab:cops:benchmark} (A100 GPU)} + \caption{COPS benchmark , solved with a tolerance {\tt tol=1e-6}\label{tab:cops:benchmark}} \end{table}