From cda8a83b76d1908376d3f16b1f98dadac21a493a Mon Sep 17 00:00:00 2001 From: Angelika Schwarz Date: Sat, 19 Feb 2022 13:32:31 +0000 Subject: [PATCH] Add level-3 BLAS triangular Sylvester equation solver Force compatibility with [ds]trsyl. Use two floating-point scaling factors (rather than integer scaling factors). This does not eliminate the problem that scalings can be flushed, making any result useless. That problem could be eliminated by replacing the floating-point scale factor with an integer scale factor. --- SRC/CMakeLists.txt | 10 +- SRC/Makefile | 10 +- SRC/dlarmm.f | 99 ++++ SRC/dtrsyl3.f | 1242 ++++++++++++++++++++++++++++++++++++++++++++ SRC/ilaenv.f | 8 + SRC/slarmm.f | 99 ++++ SRC/strsyl3.f | 1241 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 2699 insertions(+), 10 deletions(-) create mode 100644 SRC/dlarmm.f create mode 100644 SRC/dtrsyl3.f create mode 100644 SRC/slarmm.f create mode 100644 SRC/strsyl3.f diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 79e79f06eb..b5fa0168f5 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -105,8 +105,8 @@ set(SLASRC slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f - slarrv.f slartv.f + slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f + slargv.f slarmm.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f @@ -141,7 +141,7 @@ set(SLASRC stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f stptrs.f strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f - strti2.f strtri.f strtrs.f stzrzf.f sstemr.f + strsyl3.f strti2.f strtri.f strtrs.f stzrzf.f sstemr.f slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f stfttr.f stpttf.f stpttr.f strttf.f strttp.f sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f @@ -306,7 +306,7 @@ set(DLASRC dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f - dlargv.f dlarrv.f dlartv.f + dlargv.f dlarmm.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlaswp.f dlasy2.f dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlauu2.f @@ -342,7 +342,7 @@ set(DLASRC dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f dtptrs.f dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f - dtrti2.f dtrtri.f dtrtrs.f dtzrzf.f dstemr.f + dtrsyl3.f dtrti2.f dtrtri.f dtrtrs.f dtzrzf.f dstemr.f dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f diff --git a/SRC/Makefile b/SRC/Makefile index b05c81fddd..3f8f6f4695 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -137,8 +137,8 @@ SLASRC = \ slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ - slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \ - slarrv.o slartv.o \ + slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \ + slargv.o slarmm.o slarrv.o slartv.o \ slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \ slasyf_rk.o \ slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o \ @@ -174,7 +174,7 @@ SLASRC = \ stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ stptrs.o \ strcon.o strevc.o strevc3.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ - strti2.o strtri.o strtrs.o stzrzf.o sstemr.o \ + strsyl3.o strti2.o strtri.o strtrs.o stzrzf.o sstemr.o \ slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ @@ -340,7 +340,7 @@ DLASRC = \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ - dlargv.o dlarrv.o dlartv.o \ + dlargv.o dlarmm.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \ dlasyf.o dlasyf_rook.o dlasyf_rk.o \ dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlauu2.o \ @@ -376,7 +376,7 @@ DLASRC = \ dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ dtptrs.o \ dtrcon.o dtrevc.o dtrevc3.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ - dtrti2.o dtrtri.o dtrtrs.o dtzrzf.o dstemr.o \ + dtrsyl3.o dtrti2.o dtrtri.o dtrtrs.o dtzrzf.o dstemr.o \ dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ diff --git a/SRC/dlarmm.f b/SRC/dlarmm.f new file mode 100644 index 0000000000..c360420092 --- /dev/null +++ b/SRC/dlarmm.f @@ -0,0 +1,99 @@ +*> \brief \b DLARMM +* +* Definition: +* =========== +* +* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* DOUBLE PRECISION ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> DLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is DOUBLE PRECISION +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is DOUBLE PRECISION +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + DOUBLE PRECISION ANORM, BNORM, CNORM +* .. Parameters .. + DOUBLE PRECISION ONE, HALF, FOUR + PARAMETER ( ONE = 1.0D0, HALF = 0.5D+0, FOUR = 4.0D0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION BIGNUM, SMLNUM +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + EXTERNAL DLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + DLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + DLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + DLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of DLARMM ==== +* + END diff --git a/SRC/dtrsyl3.f b/SRC/dtrsyl3.f new file mode 100644 index 0000000000..dd5f2f48f5 --- /dev/null +++ b/SRC/dtrsyl3.f @@ -0,0 +1,1242 @@ +*> \brief \b DTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> DTRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. +*> +*> A and B must be in Schur canonical form (as returned by DHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE DTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + DOUBLE PRECISION SCALE +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + DOUBLE PRECISION ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLANGE, DLAMCH, DLARMM + EXTERNAL DLANGE, DLAMCH, DLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DLASCL, DSCAL, DTRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, EXPONENT, DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'DTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + ELSE IF( .NOT.LQUERY .AND. LIWORK.LT.IWORK(1) ) THEN + INFO = -14 + ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + INFO = -16 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems +* + IF( NBA.EQ.1 .OR. NBB.EQ.1 ) THEN + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + SWORK( K, L ) = SCALOC * SWORK( K, L ) + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of DTRSYL3 +* + END diff --git a/SRC/ilaenv.f b/SRC/ilaenv.f index af28503986..0e3fd38159 100644 --- a/SRC/ilaenv.f +++ b/SRC/ilaenv.f @@ -469,6 +469,14 @@ INTEGER FUNCTION ILAENV( ISPEC, NAME, OPTS, N1, N2, N3, N4 ) ELSE NB = 64 END IF + ELSE IF( C3.EQ.'SYL' ) THEN +* The upper bound is to prevent overly aggressive scaling. + IF( SNAME ) THEN + NB = MIN( MAX( 48, INT( ( MIN( N1, N2 ) * 16 ) / 100) ), + $ 240 ) + ELSE + NB = -1 + END IF END IF ELSE IF( C2.EQ.'LA' ) THEN IF( C3.EQ.'UUM' ) THEN diff --git a/SRC/slarmm.f b/SRC/slarmm.f new file mode 100644 index 0000000000..643dd67487 --- /dev/null +++ b/SRC/slarmm.f @@ -0,0 +1,99 @@ +*> \brief \b SLARMM +* +* Definition: +* =========== +* +* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* REAL ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> SLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is REAL +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is REAL +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is REAL +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + REAL ANORM, BNORM, CNORM +* .. Parameters .. + REAL ONE, HALF, FOUR + PARAMETER ( ONE = 1.0E0, HALF = 0.5E+0, FOUR = 4.0E+0 ) +* .. +* .. Local Scalars .. + REAL BIGNUM, SMLNUM +* .. +* .. External Functions .. + REAL SLAMCH + EXTERNAL SLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + SLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + SLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + SLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of SLARMM ==== +* + END diff --git a/SRC/strsyl3.f b/SRC/strsyl3.f new file mode 100644 index 0000000000..8f9766837a --- /dev/null +++ b/SRC/strsyl3.f @@ -0,0 +1,1241 @@ +*> \brief \b STRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> STRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. +*> +*> A and B must be in Schur canonical form (as returned by SHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is REAL array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE STRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + REAL SCALE +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLANGE, SLAMCH, SLARMM + EXTERNAL SLANGE, SLAMCH, SLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SLASCL, SSCAL, STRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, EXPONENT, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'STRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + ELSE IF( .NOT.LQUERY .AND. LIWORK.LT.IWORK(1) ) THEN + INFO = -14 + ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + INFO = -16 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'STRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems +* + IF( NBA.EQ.1 .OR. NBB.EQ.1 ) THEN + CALL STRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of STRSYL3 +* + END