diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 79e79f06eb..b5fa0168f5 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -105,8 +105,8 @@ set(SLASRC slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f - slarrv.f slartv.f + slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f + slargv.f slarmm.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f @@ -141,7 +141,7 @@ set(SLASRC stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f stptrs.f strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f - strti2.f strtri.f strtrs.f stzrzf.f sstemr.f + strsyl3.f strti2.f strtri.f strtrs.f stzrzf.f sstemr.f slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f stfttr.f stpttf.f stpttr.f strttf.f strttp.f sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f @@ -306,7 +306,7 @@ set(DLASRC dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f - dlargv.f dlarrv.f dlartv.f + dlargv.f dlarmm.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlaswp.f dlasy2.f dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlauu2.f @@ -342,7 +342,7 @@ set(DLASRC dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f dtptrs.f dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f - dtrti2.f dtrtri.f dtrtrs.f dtzrzf.f dstemr.f + dtrsyl3.f dtrti2.f dtrtri.f dtrtrs.f dtzrzf.f dstemr.f dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f diff --git a/SRC/Makefile b/SRC/Makefile index b05c81fddd..3f8f6f4695 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -137,8 +137,8 @@ SLASRC = \ slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ - slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \ - slarrv.o slartv.o \ + slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \ + slargv.o slarmm.o slarrv.o slartv.o \ slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \ slasyf_rk.o \ slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o \ @@ -174,7 +174,7 @@ SLASRC = \ stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ stptrs.o \ strcon.o strevc.o strevc3.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ - strti2.o strtri.o strtrs.o stzrzf.o sstemr.o \ + strsyl3.o strti2.o strtri.o strtrs.o stzrzf.o sstemr.o \ slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ @@ -340,7 +340,7 @@ DLASRC = \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ - dlargv.o dlarrv.o dlartv.o \ + dlargv.o dlarmm.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \ dlasyf.o dlasyf_rook.o dlasyf_rk.o \ dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlauu2.o \ @@ -376,7 +376,7 @@ DLASRC = \ dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ dtptrs.o \ dtrcon.o dtrevc.o dtrevc3.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ - dtrti2.o dtrtri.o dtrtrs.o dtzrzf.o dstemr.o \ + dtrsyl3.o dtrti2.o dtrtri.o dtrtrs.o dtzrzf.o dstemr.o \ dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ diff --git a/SRC/dlarmm.f b/SRC/dlarmm.f new file mode 100644 index 0000000000..c360420092 --- /dev/null +++ b/SRC/dlarmm.f @@ -0,0 +1,99 @@ +*> \brief \b DLARMM +* +* Definition: +* =========== +* +* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* DOUBLE PRECISION ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> DLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is DOUBLE PRECISION +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is DOUBLE PRECISION +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + DOUBLE PRECISION ANORM, BNORM, CNORM +* .. Parameters .. + DOUBLE PRECISION ONE, HALF, FOUR + PARAMETER ( ONE = 1.0D0, HALF = 0.5D+0, FOUR = 4.0D0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION BIGNUM, SMLNUM +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + EXTERNAL DLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + DLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + DLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + DLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of DLARMM ==== +* + END diff --git a/SRC/dtrsyl3.f b/SRC/dtrsyl3.f new file mode 100644 index 0000000000..dd5f2f48f5 --- /dev/null +++ b/SRC/dtrsyl3.f @@ -0,0 +1,1242 @@ +*> \brief \b DTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> DTRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. +*> +*> A and B must be in Schur canonical form (as returned by DHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE DTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + DOUBLE PRECISION SCALE +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + DOUBLE PRECISION ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLANGE, DLAMCH, DLARMM + EXTERNAL DLANGE, DLAMCH, DLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DLASCL, DSCAL, DTRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, EXPONENT, DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'DTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + ELSE IF( .NOT.LQUERY .AND. LIWORK.LT.IWORK(1) ) THEN + INFO = -14 + ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + INFO = -16 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems +* + IF( NBA.EQ.1 .OR. NBB.EQ.1 ) THEN + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + SWORK( K, L ) = SCALOC * SWORK( K, L ) + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of DTRSYL3 +* + END diff --git a/SRC/ilaenv.f b/SRC/ilaenv.f index af28503986..0e3fd38159 100644 --- a/SRC/ilaenv.f +++ b/SRC/ilaenv.f @@ -469,6 +469,14 @@ INTEGER FUNCTION ILAENV( ISPEC, NAME, OPTS, N1, N2, N3, N4 ) ELSE NB = 64 END IF + ELSE IF( C3.EQ.'SYL' ) THEN +* The upper bound is to prevent overly aggressive scaling. + IF( SNAME ) THEN + NB = MIN( MAX( 48, INT( ( MIN( N1, N2 ) * 16 ) / 100) ), + $ 240 ) + ELSE + NB = -1 + END IF END IF ELSE IF( C2.EQ.'LA' ) THEN IF( C3.EQ.'UUM' ) THEN diff --git a/SRC/slarmm.f b/SRC/slarmm.f new file mode 100644 index 0000000000..643dd67487 --- /dev/null +++ b/SRC/slarmm.f @@ -0,0 +1,99 @@ +*> \brief \b SLARMM +* +* Definition: +* =========== +* +* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* REAL ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> SLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is REAL +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is REAL +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is REAL +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + REAL ANORM, BNORM, CNORM +* .. Parameters .. + REAL ONE, HALF, FOUR + PARAMETER ( ONE = 1.0E0, HALF = 0.5E+0, FOUR = 4.0E+0 ) +* .. +* .. Local Scalars .. + REAL BIGNUM, SMLNUM +* .. +* .. External Functions .. + REAL SLAMCH + EXTERNAL SLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + SLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + SLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + SLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of SLARMM ==== +* + END diff --git a/SRC/strsyl3.f b/SRC/strsyl3.f new file mode 100644 index 0000000000..8f9766837a --- /dev/null +++ b/SRC/strsyl3.f @@ -0,0 +1,1241 @@ +*> \brief \b STRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> STRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. +*> +*> A and B must be in Schur canonical form (as returned by SHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is REAL array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE STRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + REAL SCALE +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLANGE, SLAMCH, SLARMM + EXTERNAL SLANGE, SLAMCH, SLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SLASCL, SSCAL, STRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, EXPONENT, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'STRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + ELSE IF( .NOT.LQUERY .AND. LIWORK.LT.IWORK(1) ) THEN + INFO = -14 + ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + INFO = -16 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'STRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems +* + IF( NBA.EQ.1 .OR. NBB.EQ.1 ) THEN + CALL STRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of STRSYL3 +* + END