\documentclass[]{llncs}      % it must be
\pagestyle{myheadings}       % it must be
\usepackage[dvips]{graphicx} % if needed
\begin{document}             % it must be

\title{A recursive formulation of the Inversion \\
       of Symmetric Positive Definite Matrices \\
       in Packed Storage Data Format}

\author{Bjarne S.~Andersen\inst{1}, John A.~Gunnels\inst{2},
        Fred Gustavson\inst{2} \\ and Jerzy Wa{\'s}niewski\inst{1}}
\institute{
         UNI$\bullet$C Danish IT Center for Education and Research \\
        Technical University of Denmark, Bldg.~304 \\
        DK-2800 Lyngby, Denmark \\
        \{bjarne.stig.andersen, jerzy.wasniewski\}@uni-c.dk
\and
        IBM T.J. Watson Research Center \\
        P.O. Box 218, Yorktown Heights, NY 10598, USA \\
        \{gunnels@us, gustav@watson\}.ibm.com}

\maketitle
\index{Andersen, Bjarne S.}
\index{Gunnels, John A.}
\index{Gustavson, Fred}
\index{Wa{\'s}niewski, Jerzy}

\markboth{B.S.~Andersen, J.A.~Gunnels, F.~Gustavson and J.~Wa{\'s}niewski}
{Recursive Inversion, Symmetric Positive Definite Matrices in Packed Storage}

\begin{abstract}
  A new Recursive Packed Inverse Calculation Algorithm for symmetric
positive definite matrices has been developed. The new Recursive
Inverse Calculation algorithm uses minimal storage, $n(n+1)/2$, and
has nearly the same performance as the LAPACK full storage algorithm
using $n^2$ memory words. New recursive packed BLAS needed for this
algorithm have been developed too. Two transformation routines, from
the LAPACK packed storage data format to the recursive storage data
format were added to the package too.

  We present performance measurements on several current architectures
that demonstrate improvements over the traditional packed routines.
\end{abstract}

\section{Introduction}

  LAPACK~\cite{laug} has two different kinds of algorithms for
symmetric matrices. There are two ways of storing data, consisting of
the full storage data format and the packed storage data format. The
full storage data format algorithms require $n^2$ memory locations
even though these algorithms only use $n(n+1)/2$ matrix elements.
The LAPACK full storage algorithms perform well but they require
a lot of memory. The LAPACK packed storage data format algorithms
require minimal storage (of size $n(n+1)/2$) but their speed is several
times slower than that of the LAPACK full storage data format algorithms.
The LAPACK packed storage data format algorithms are not able to use
level~3 BLAS~\cite{blas,atlas98}. The size of available computer memory
is still critical for many large scale problems. Thus, the user's program
should perform quickly and require minimum memory.
%\resizebox*{0.50\textwidth}{!}{\includegraphics{intel_fac.eps}} &

\begin{figure}[]
{\centering \begin{tabular}{ccc}
\hspace{-0.3 cm}
\resizebox*{0.50\textwidth}{0.39\textheight}{\includegraphics{intel_fac.eps}} &
~~&
\hspace{-0.3 cm}
\resizebox*{0.50\textwidth}{0.39\textheight}{\includegraphics{intel_inv.eps}} \\
{\bf A} & & {\bf B}
\end{tabular}\par}\vspace{-0.25 cm}
\caption{Intel Pentium III, @ 500 MHz, ATLAS BLAS Library}\label{fig:intel}
%\caption{Performance of the recursive and LAPACK algorithms, of Cholesky
%factorization and inverse calculation of positive definite symmetric
%matrix algorithms on INTEL Pentium III, @ 500 MHz. All routines call
%the optimized ATLAS BLAS. The curves with + (DRPPTRF+ and DRPPTRI+)
%include data transformation from LAPACK packed storage data format to the
%recursive packed and vice versa.}\label{fig:intel}
\end{figure}

  There are already several papers on Numerical Linear Algebra with
Recursive Algorithms (for example~\cite{ElmrothGustavson00ibmjour,gus1,tol1,eupar99,FredGustavson98,JerzyPoland99}).
A new Recursive Packed Storage Data Format has been discovered by 
applying recursion to the Cholesky factorization with packed storage
data format. This new Recursive Packed Cholesky Factorization algorithm
performs well. It runs with almost the same speed as the LAPACK
full storage data format algorithm and only uses $n(n+1)/2$ memory
locations (see~\cite{bfj:rfc:27:2001} and graph {\bf A} on
Fig.~\ref{fig:intel}).

  Our research into packed data format algorithms went further. We will
present the features of our Recursive Packed Algorithm for calculating
the inverse of a symmetric positive definite matrix. The graph {\bf B} on
Fig.~\ref{fig:intel} compares performance of the new Inverse Calculation
algorithm with LAPACK's full storage algorithm DPOTRI. The new Recursive
Inverse Calculation algorithm uses minimal storage and has nearly the same
performance.

\section{Recursive Formulation of the Inverse Calculation Algorithm
and Its Auxiliary Routines}
\label{sec:algor}
  The xRPPTRI\footnote{x can be S for a single precision, D for double
precision, C for a complex, and Z for a
double complex.} computes the inverse of a real symmetric positive definite
matrix, $A$, using the Cholesky factorization $A = U^TU$ or $A = LL^T$
computed by xRPPTRF.  Matrix $A$ is given either in the LAPACK or recursive
packed storage data format. The transformation routine must
be called if $A$ is given in LAPACK storage data format.

  The subroutine xRPPTRI calls two recursive auxiliary routines, xRPTRTRI
and xRPTRRK. xRPTRTRI computes the inverse of an upper or a lower triangular
matrix, $A$, stored in recursive packed format. xRPTRRK computes one of the
product $LL^T$, $L^TL$, $UU^T$, or $U^TU$, where $L$ and $U$ are in
recursive packed format.

  The subroutine xRPTRTRI calls two recursive BLAS routines, xRPTRMM
and xRPTRSM. xRPTRMM performs one of the following matrix-matrix operations:
$$ B := \alpha A B,\;\; \alpha A^T B,\;\; \alpha B A,\;\; \mbox{or}\;\; \alpha B A^T, $$
where $\alpha$ is a scalar,  $B$ is an $m \times n$ matrix, $A$ is a
unit, or non-unit, upper or lower triangular matrix. Both matrices
$A$ and $B$ are in recursive packed format. xRPTRSM solves one of
the following matrix equations:
$$A X = \alpha B,\;\; A^T B = \alpha B,\;\; X A = \alpha B,\;\;
  \mbox{or}\;\; X A^T = \alpha B, $$
where $\alpha$ is a scalar, $X$ and $B$ are $m \times n$ matrices, $A$
is a unit, or non-unit,  upper or lower triangular matrix. $A$, $B$
and $X$ are in recursive packed format. Matrix $B$ is overwritten by
the solution matrix, $X$.

  The subroutine xRPTRRK also calls two recursive BLAS routines
xRPPSYRK and xRPTRMM. xRPPSYRK performs one of the symmetric rank $k$
operations
 $$ C := \alpha A A^T + \beta C,\;\; \mbox{or}\;\; C := \alpha A^TA +
      \beta C, $$
where $\alpha$ and $\beta$ are scalars, $C$ is an $n \times n$ symmetric matrix
and $A$ is an $n \times k$ matrix in the first case, and a $k \times n$
matrix in the second case. Matrices $A$ and $C$ are in recursive packed
format. Operation xRPTRMM was defined above.

  The recursive BLAS operations xRPTRMM, xRPTRSM and xRPPSYRK call the
xGEMM routine. The routine xGEMM does the vast majority of the computational
work. Carefully choosing the xGEMM operation is very important for the
speed of the numerical linear algebra calculations,
see~\cite{atlas98,AtlasBerkeley97,kagstrom95b,FredGustavson98a,BoKagstrom97}.

\section{Performance results}
\label{sec:performance}
  The new recursive packed BLAS (xRPTRMM, xRPTRSM and xRPSYRK), and
the new recursive packed Cholesky factorization (xRPPTRF)
and inverse calculation (xRPPTRI) routines were compared to the
traditional LAPACK subroutines both in terms of use numerical results
and the performance achieved.
\vspace{-0.5 cm}
\begin{table}[]
\renewcommand{\arraystretch}{1.25}
\begin{center}
   \begin{tabular}{|llr|c|} \hline
\multicolumn{3}{|c}{Processor} & \multicolumn{1}{|c|}{Peak performance} \\ \hline
    IBM    &  Power3 NH2       & @ 375 MHz & 1500 Mflops \\
    SUN    &  UltraSparc II    & @ 400 MHz &  800 Mflops \\
    SGI    &  R12000           & @ 300 MHz &  390 Mflops \\
    COMPAQ &  Alpha EV6        & @ 500 MHz & 1000 Mflops \\
    INTEL  &  Pentium III      & @ 500 MHz &  500 Mflops \\ \hline
   \end{tabular} \vspace{0.25 cm}
   \caption{Computers used \label{computer:names}}
\end{center}
\end{table}

%=====================================================================

\begin{figure}[]
{\centering \begin{tabular}{ccc}
\hspace{-.3 cm}
\resizebox*{0.50\textwidth}{0.39\textheight}{\includegraphics{sgi_fac.eps}} &
~~&
\hspace{-.3 cm}
\resizebox*{0.50\textwidth}{0.39\textheight}{\includegraphics{sgi_inv.eps}} \\
{\bf A} & & {\bf B}
\end{tabular}\par}
\caption{SGI Origin 2000 R12000, @ 300 MHz, Sgi Math BLAS Library.}\label{fig:sgi}
\end{figure}

\begin{figure}[]
{\centering \begin{tabular}{ccc}
\hspace{-.3 cm}
\resizebox*{0.50\textwidth}{0.39\textheight}{\includegraphics{compaq_fac.eps}} &
~~&
\hspace{-.3 cm}
\resizebox*{0.50\textwidth}{0.39\textheight}{\includegraphics{compaq_inv.eps}} \\
{\bf A} & & {\bf B}
\end{tabular}\par}
\caption{COMPAQ Alpha EV6, @ 500 MHz, CXML BLAS Library}\label{fig:compaq}
\end{figure}

\begin{figure}[]
{\centering \begin{tabular}{ccc}
\hspace{-.3 cm}
\resizebox*{0.50\textwidth}{0.39\textheight}{\includegraphics{ibm_fac.eps}} &
~~&
\hspace{-.3 cm}
\resizebox*{0.50\textwidth}{0.39\textheight}{\includegraphics{ibm_inv.eps}} \\
{\bf A} & & {\bf B}
\end{tabular}\par}
\caption{IBM Power3 NH2, @ 375 MHz, ESSL BLAS Library}\label{fig:ibm}
\end{figure}

\vspace{-0.5 cm}
The comparisons were made on five
different architectures, listed in
Table~\ref{computer:names}. The result, in color\footnote{DPOTRF and
DPOTRI in magenta, DPPTRF and DPPTRI in red, DRPPTRF and DRPPTRI
in blue, and DRPPTRF+ and DRPPTRI+ in green.} graphs, are in
Figures~\ref{fig:intel},~\ref{fig:sun},~\ref{fig:sgi},~\ref{fig:compaq}
and~\ref{fig:ibm}.
Every figure has two graphs, {\bf A} and {\bf B}. Every graph
has four curves: the LAPACK full and packed (PO and PP) storage data
format comparison curves, and two recursive packed storage comparison
curves. The recursive curves, one contains a timing results with
no-conversion time,
while the second curve (DRPPTRF+ and DRPPTRI+) reflects the
time required by the conversion from and to the LAPACK packed data
format. Graph {\bf A} compares the factorization algorithms, while
graph {\bf B} contrasts the inverse calculation algorithms. The
UPLO = 'L' case were used in all experiments.

\begin{table}[]
\begin{center}
\renewcommand{\arraystretch}{1.25}
   \begin{tabular}{|lll|} \hline
    IBM     & Essl 3.2                           & -lessl     \\
    SUN     & Sun Performance Library 2.0        & -lsunperf  \\
    SGI     & Standard Execution Environment 7.3 & -lblas     \\
    COMPAQ  & CXML V3.5                          & -lcxml\_ev6 \\
    INTEL   & Atlas 3.0 Beta                     & -latlas    \\ \hline
   \end{tabular} \vspace{0.25 cm}
   \caption{Computer library versions \label{computer:libraries}}
\end{center}
\end{table}

  Double precision arithmetic in Fortran~95~\cite{fortran95} was used
in all cases. On each machine, the recursive and the traditional routines
were compiled with the same compiler and compiler flags and they call the
same optimized BLAS. The BLAS library versions can be seen in
Table~\ref{computer:libraries}. All routines received the same input
and produced the same output for each time measurement. The CPU time is
measured by the standard Fortran~95 timing routine. The operation
counts for Cholesky factorization and the inverse calculation are
approximately $$ \mathit{NFP_{fac}} = n^3/3\;\; \mbox{ and }
\;\; \mathit{NFP_{inv}} = 2\,n^3/3, $$ where $n$ indicates the size
of the matrix. These counts were used to convert run times to FLOP
rates.


\begin{thebibliography}{10}

\bibitem{AgawalGustavson94}
R.C. Agawal, F.G. Gustavson, and M.~Zubair.
\newblock Exploiting functional parallelism on power2 to design
  high-performance numerical algorithms.
\newblock {\em IBM Journal of Research and Development}, 38(5):563--576,
  September 1994.

\bibitem{bfj:rfc:27:2001}
Bjarne~S. Andersen, Fred~G. Gustavson, and Jerzy Wa\'{s}niewski.
\newblock A recursive formulation of {Cholesky} facorization of a matrix in
  packed storage.
\newblock {\em {ACM} Transactions on Mathematical Software}, 27(2):214--244,
  June 2001.

\bibitem{JerzyPoland99}
B.S. Andersen, F.~Gustavson, A.~Karaivanov, J.~Wa{\'s}niewski, and P.Y.
  Yalamov.
\newblock {LAWRA} -- {L}inear {A}lgebra with {R}ecursive {A}lgorithms.
\newblock In R.~Wyrzykowski, B.~Mochnacki, H.~Piech, and J.~Szopa, editors,
  {\em Proceedings of the $3^{th}$ International Conference on Parallel
  Processing and Applied Mathematics, PPAM'99}, pages 63--76, Kazimierz Dolny,
  Poland, 1999. Technical University of Cz{\c{e}}stochowa.

\bibitem{laug}
E.~Anderson, Z.~Bai, C.~Bischof, L.~S. Blackford, J.~Demmel, J.~Dongarra,
  J.~Du~Croz, A.~Greenbaum, S.~Hammarling, A.~McKenney, and D.~Sorensen.
\newblock {\em {LAPACK} Users' Guide}.
\newblock Society for Industrial and Applied Mathematics, Philadelphia, PA,
  third edition, 1999.

\bibitem{laug95}
Vincent~A. Barker, L.~Susan Blackford, Jack~J. Dongarra, Jeremy~Du Croz, Sven
  Hammarling, Minka Marinova, Jerzy Wa{\'{s}}niewski, and Plamen Yalamov.
\newblock {\em {LAPACK95} Users' Guide}.
\newblock Society for Industrial and Applied Mathematics, Philadelphia, PA,
  first edition, 2001.

\bibitem{AtlasBerkeley97}
J.~Bilmes, K.~Asanovi{\'c}, C.W. Chin, and J.~Demmel.
\newblock {O}ptimizing {M}atrix {M}ultiply {U}sing {PHIPAC}: a {P}ortable,
  {H}igh-{P}erformance, {ANSI C} {C}oding {M}ethodology.
\newblock In {\em Proceedings of the International Conference on
  Supercomputing}, Vienna, Austria, Jul 1997. {ACM} {SIGARC}.

\bibitem{demmel97}
J.W. Demmel.
\newblock {\em Applied Numerical Linear Algebra}.
\newblock SIAM, Philadelphia, 1997.

\bibitem{blas3}
J.~Dongarra, J.~Du~Croz, I.~S. Duff, and S.~Hammarling.
\newblock A {S}et of {L}evel 3 {B}asic {L}inear {A}lgebra {S}ubprograms.
\newblock {\em {ACM} Trans. Math. Soft.}, 16(1):1--17, March 1990.

\bibitem{blas2}
J.~Dongarra, J.~Du~Croz, S.~Hammarling, and Richard~J. Hanson.
\newblock {A}n {E}xtended {S}et of {F}ortran {B}asic {L}inear {A}lgebra
  {S}ubroutines.
\newblock {\em {ACM} Trans. Math. Soft.}, 14(1):1--17, March 1988.

\bibitem{blas}
J.~Dongarra et~al.
\newblock {BLAS} ({B}asic {L}inear {A}lgebra {S}ubprograms).
\newblock http://www.netlib.org/blas/.
\newblock Ongoing Projects at the Innovative Computing Laboratory, Computer
  Science Department, University of Tennessee at Knoxville, USA.

\bibitem{ElmrothGustavson00ibmjour}
E.~Elmroth and F.~Gustavson.
\newblock {A}pplying {R}ecursion to {S}erial and {P}arallel {QR}
  {F}actorization {L}eads to {B}etter {P}erformance.
\newblock {\em IBM Journal of Research and Development}, 44(4):605--624, 2000.

\bibitem{GVL2}
G.~Golub and C.~F. Van~Loan.
\newblock {\em Matrix Computations}.
\newblock Johns {H}opkins {U}niversity {P}ress, Baltimore, MD, third edition,
  1996.

\bibitem{eupar99}
A.~Gupta, F.~Gustavson, A.~Karaivanov, J.~Wa{\'s}niewski, and P.~Yalamov.
\newblock {E}xperience with a {R}ecursive {P}erturbation {B}ased {A}lgorithm
  for {S}ymmetric {I}ndefinite {L}inear {S}ystems.
\newblock In I.~Duff, editor, {\em EuroPar'99 Conference Proceedings},
  Toulouse, France, September 1999.

\bibitem{FredGustavson98}
F.~Gustavson, A.~Henriksson, I.~Jonsson, B.~K{\aa}gstr{\"o}m, and P.~Ling.
\newblock Recursive {B}locked {D}ata {F}ormats and {BLAS}' for {D}ense {L}inear
  {A}lgebra {A}lgorithms.
\newblock In B.~K{\aa}gstr{\"o}m, J.~Dongarra, E.~Elmroth, and
  J.~Wa{\'s}niewski, editors, {\em Proceedings of the $4^{th}$ International
  Workshop, Applied Parallel Computing, Large Scale Scientific and Industrial
  Problems, PARA'98}, number 1541 in Lecture Notes in Computer Science Number,
  pages 195--206, Ume{\aa}, Sweden, June 1998. Springer.

\bibitem{FredGustavson98a}
F.~Gustavson, A.~Henriksson, I.~Jonsson, B.~K{\aa}gstr{\"o}m, and P.~Ling.
\newblock Superscalar {GEMM}-based {L}evel~3 {BLAS} -- {T}he {O}n-going
  {E}volution of {P}ortable and {H}igh-{P}erformance {L}ibrary.
\newblock In B.~K{\aa}gstr{\"o}m, J.~Dongarra, E.~Elmroth, and
  J.~Wa{\'s}niewski, editors, {\em Proceedings of the $4^{th}$ International
  Workshop, Applied Parallel Computing, Large Scale Scientific and Industrial
  Problems, PARA'98}, number 1541 in Lecture Notes in Computer Science Number,
  pages 207--215, Ume{\aa}, Sweden, June 1998. Springer.

\bibitem{gus1}
F.G. Gustavson.
\newblock {R}ecursion {L}eads to {A}utomatic {V}ariable {B}locking for {D}ense
  {L}inear-{A}lgebra {A}lgorithms.
\newblock {\em IBM Journal of Research and Development}, 41(6), November 1997.

\bibitem{Higham96}
N.~J. Higham.
\newblock {\em Accuracy and Stability of Numerical Algorithms}.
\newblock SIAM, 1996.

\bibitem{ibmessl97}
IBM.
\newblock {\em {E}ngineering and {S}cientific {S}ubroutine {L}ibrary for
  {AIX}}, {V}ersion 3, {V}olume 1 edition, December 1997.
\newblock Pub. number {SA}22--7272--0.

\bibitem{kagstrom95b}
B.~K{\aa}gstr\"{o}m, P.~Ling, and C.~Van Loan.
\newblock {GEMM}-based level 3 {BLAS}: High-performance model implementations
  and performance evaluation benchmark.
\newblock Technical Report UMINF 95-18, Department of Computing Science,
  Ume{\aa} University, 1995.
\newblock Submitted to ACM Trans. Math. Softw.

\bibitem{BoKagstrom97}
B.~K{\aa}gstr{\"o}m, P.~Ling, and C.~Van Loan.
\newblock {GEMM}-based level 3 {BLAS}: {H}igh-{P}erformance {M}odel
  {I}mplementations and {P}erformance {E}valuation {B}enchmark.
\newblock {\em ACM Trans. Math. Software}, 24(3):268--302, 1998.

\bibitem{blas1}
C.~L. Lawson, R.~J. Hanson, D.~Kincaid, and F.~T. Krogh.
\newblock {B}asic {L}inear {A}lgebra {S}ubprograms for {F}ortran {U}sage.
\newblock {\em {ACM} Trans. Math. Soft.}, 5:308--323, 1979.

\bibitem{fortran95}
M.~Metcalf and J.~Reid.
\newblock {\em {Fortran~90/95} Explained}.
\newblock Oxford University Press, Oxford, UK, second edition, 1999.

\bibitem{tol1}
S.~Toledo.
\newblock {L}ocality of {R}eference in {LU} {D}ecomposition with {P}artial
  {P}ivoting.
\newblock {\em SIAM Journal of Matrix Analysis and Applications},
  18(4):1065--1081, 1997.

\bibitem{Trefethen97}
L.N. Trefethen and D.~Bau.
\newblock {\em Numerical Linear Algebra}.
\newblock SIAM, Philadelphia, 1997.

\bibitem{atlas98}
R.C. Whaley, A.~Petitet, and J.~Dongarra.
\newblock {ATLAS}: {A}utomatically {T}uned {L}inear {A}lgebra {S}oftware.
\newblock http://www.netlib.org/atlas/, 2000.
\newblock University of Tennessee at Knoxville, Tennessee, USA.

\end{thebibliography}

\end{document}

