From 2495f1ced2d2d080bb523685502e1fc7b9d62713 Mon Sep 17 00:00:00 2001 From: "Weslley S. Pereira" Date: Fri, 15 Oct 2021 14:11:16 -0600 Subject: [PATCH 1/9] Solves a precision bug in clartg --- SRC/clartg.f90 | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/SRC/clartg.f90 b/SRC/clartg.f90 index f63a0f8d20..08c1b5e12c 100644 --- a/SRC/clartg.f90 +++ b/SRC/clartg.f90 @@ -187,7 +187,11 @@ subroutine CLARTG( f, g, c, s, r ) d = sqrt( f2 )*sqrt( h2 ) end if p = 1 / d - c = f2*p + if( f2 > safmin * g2 ) then + c = 1 / sqrt( one + g2/f2 ) + else + c = f2*p + end if s = conjg( g )*( f*p ) r = f*( h2*p ) else @@ -224,6 +228,11 @@ subroutine CLARTG( f, g, c, s, r ) d = sqrt( f2 )*sqrt( h2 ) end if p = 1 / d + if( f2 > safmin * g2 ) then + c = (1 / sqrt( one + g2/f2 )) * w + else + c = ( f2*p )*w + end if c = ( f2*p )*w s = conjg( gs )*( fs*p ) r = ( fs*( h2*p ) )*u From b89b15b2f408717812c44214c0b9a09589c8ed06 Mon Sep 17 00:00:00 2001 From: "Weslley S. Pereira" Date: Mon, 18 Oct 2021 09:49:56 -0600 Subject: [PATCH 2/9] Removes one line from clartg --- SRC/clartg.f90 | 1 - 1 file changed, 1 deletion(-) diff --git a/SRC/clartg.f90 b/SRC/clartg.f90 index 08c1b5e12c..f0327830df 100644 --- a/SRC/clartg.f90 +++ b/SRC/clartg.f90 @@ -233,7 +233,6 @@ subroutine CLARTG( f, g, c, s, r ) else c = ( f2*p )*w end if - c = ( f2*p )*w s = conjg( gs )*( fs*p ) r = ( fs*( h2*p ) )*u end if From ac11f62708b0ef10bbd28fa33d8d29e0e0e34c86 Mon Sep 17 00:00:00 2001 From: "Weslley S. Pereira" Date: Tue, 26 Oct 2021 17:50:27 -0600 Subject: [PATCH 3/9] Several changes to reduce the computation error --- SRC/clartg.f90 | 76 ++++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/SRC/clartg.f90 b/SRC/clartg.f90 index f0327830df..4e07b29d57 100644 --- a/SRC/clartg.f90 +++ b/SRC/clartg.f90 @@ -129,7 +129,7 @@ subroutine CLARTG( f, g, c, s, r ) complex(wp) f, g, r, s ! .. ! .. Local Scalars .. 
- real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w complex(wp) :: fs, gs, t ! .. ! .. Intrinsic Functions .. @@ -154,8 +154,7 @@ subroutine CLARTG( f, g, c, s, r ) ! ! Use unscaled algorithm ! - g2 = ABSSQ( g ) - d = sqrt( g2 ) + d = abs( g ) s = conjg( g ) / d r = d else @@ -163,10 +162,8 @@ subroutine CLARTG( f, g, c, s, r ) ! Use scaled algorithm ! u = min( safmax, max( safmin, g1 ) ) - uu = one / u - gs = g*uu - g2 = ABSSQ( gs ) - d = sqrt( g2 ) + gs = g / u + d = abs( gs ) s = conjg( gs ) / d r = d*u end if @@ -181,36 +178,40 @@ subroutine CLARTG( f, g, c, s, r ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) - else - d = sqrt( f2 )*sqrt( h2 ) - end if - p = 1 / d if( f2 > safmin * g2 ) then - c = 1 / sqrt( one + g2/f2 ) + d = sqrt( one + g2/f2 ) + c = one / d + if( f2 > rtmin .and. h2 < rtmax ) then + s = conjg( g )*( f / sqrt( f2*h2 ) ) + else + s = conjg( g )*( f /( f2*d ) ) + end if + r = f * d else - c = f2*p + if( f2 > rtmin .and. h2 < rtmax ) then + d = sqrt( f2*h2 ) + else + d = sqrt( f2 )*sqrt( h2 ) + end if + c = f2 / d + s = conjg( g )*( f / d ) + r = f*( h2 / d ) end if - s = conjg( g )*( f*p ) - r = f*( h2*p ) else ! ! Use scaled algorithm ! u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - gs = g*uu + gs = g / u g2 = ABSSQ( gs ) - if( f1*uu < rtmin ) then + if( f1 < rtmin * u ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. ! v = min( safmax, max( safmin, f1 ) ) - vv = one / v - w = v * uu - fs = f*vv + w = v / u + fs = f / v f2 = ABSSQ( fs ) h2 = f2*w**2 + g2 else @@ -218,23 +219,30 @@ subroutine CLARTG( f, g, c, s, r ) ! Otherwise use the same scaling for f and g. ! w = one - fs = f*uu + fs = f / u f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > rtmin .and. 
h2 < rtmax ) then - d = sqrt( f2*h2 ) - else - d = sqrt( f2 )*sqrt( h2 ) - end if - p = 1 / d if( f2 > safmin * g2 ) then - c = (1 / sqrt( one + g2/f2 )) * w + ! Use a precise algorithm + d = sqrt( w**2 + g2/f2 ) + c = w / d + if( f2 > rtmin .and. h2 < rtmax ) then + s = conjg( gs )*( fs / sqrt( f2*h2 ) ) + else + s = conjg( gs )*( fs / ( f2*d ) ) + end if + r = ( fs * d ) * u else - c = ( f2*p )*w + if( f2 > rtmin .and. h2 < rtmax ) then + d = sqrt( f2*h2 ) + else + d = sqrt( f2 )*sqrt( h2 ) + end if + c = ( f2 / d )*w + s = conjg( gs )*( fs / d ) + r = ( fs*( h2 / d ) )*u end if - s = conjg( gs )*( fs*p ) - r = ( fs*( h2*p ) )*u end if end if return From 43208822cba931cabc9c34582c964174ea8cfd06 Mon Sep 17 00:00:00 2001 From: "Weslley S. Pereira" Date: Thu, 28 Oct 2021 10:27:14 -0600 Subject: [PATCH 4/9] Starting to modify zlartg --- SRC/clartg.f90 | 1 - SRC/zlartg.f90 | 37 +++++++++++++++---------------------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/SRC/clartg.f90 b/SRC/clartg.f90 index 4e07b29d57..7dde0f1d34 100644 --- a/SRC/clartg.f90 +++ b/SRC/clartg.f90 @@ -224,7 +224,6 @@ subroutine CLARTG( f, g, c, s, r ) h2 = f2 + g2 end if if( f2 > safmin * g2 ) then - ! Use a precise algorithm d = sqrt( w**2 + g2/f2 ) c = w / d if( f2 > rtmin .and. h2 < rtmax ) then diff --git a/SRC/zlartg.f90 b/SRC/zlartg.f90 index e509898a1c..6d320217b6 100644 --- a/SRC/zlartg.f90 +++ b/SRC/zlartg.f90 @@ -129,7 +129,7 @@ subroutine ZLARTG( f, g, c, s, r ) complex(wp) f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w complex(wp) :: fs, gs, t ! .. ! .. Intrinsic Functions .. @@ -154,8 +154,7 @@ subroutine ZLARTG( f, g, c, s, r ) ! ! Use unscaled algorithm ! - g2 = ABSSQ( g ) - d = sqrt( g2 ) + d = abs( g ) s = conjg( g ) / d r = d else @@ -163,10 +162,8 @@ subroutine ZLARTG( f, g, c, s, r ) ! Use scaled algorithm ! 
u = min( safmax, max( safmin, g1 ) ) - uu = one / u - gs = g*uu - g2 = ABSSQ( gs ) - d = sqrt( g2 ) + gs = g / u + d = abs( gs ) s = conjg( gs ) / d r = d*u end if @@ -186,27 +183,24 @@ subroutine ZLARTG( f, g, c, s, r ) else d = sqrt( f2 )*sqrt( h2 ) end if - p = 1 / d - c = f2*p - s = conjg( g )*( f*p ) - r = f*( h2*p ) + c = f2 / d + s = conjg( g )*( f / d ) + r = f*( h2 / d ) else ! ! Use scaled algorithm ! u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - gs = g*uu + gs = g / u g2 = ABSSQ( gs ) - if( f1*uu < rtmin ) then + if( f1 < rtmin*u ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. ! v = min( safmax, max( safmin, f1 ) ) - vv = one / v - w = v * uu - fs = f*vv + w = v / u + fs = f / v f2 = ABSSQ( fs ) h2 = f2*w**2 + g2 else @@ -214,7 +208,7 @@ subroutine ZLARTG( f, g, c, s, r ) ! Otherwise use the same scaling for f and g. ! w = one - fs = f*uu + fs = f / u f2 = ABSSQ( fs ) h2 = f2 + g2 end if @@ -223,10 +217,9 @@ subroutine ZLARTG( f, g, c, s, r ) else d = sqrt( f2 )*sqrt( h2 ) end if - p = 1 / d - c = ( f2*p )*w - s = conjg( gs )*( fs*p ) - r = ( fs*( h2*p ) )*u + c = ( f2 / d )*w + s = conjg( gs )*( fs / d ) + r = ( fs*( h2 / d ) )*u end if end if return From 37a1a1e6896a4037fdb1f287ee643f8a57ce679e Mon Sep 17 00:00:00 2001 From: "weslley.spereira" Date: Tue, 2 Nov 2021 18:49:08 -0600 Subject: [PATCH 5/9] Fix all other Givens rotation routines --- BLAS/SRC/crotg.f90 | 37 ++++++++++++++-------------------- BLAS/SRC/zrotg.f90 | 37 ++++++++++++++-------------------- SRC/clartg.f90 | 49 ++++++++++++---------------------------------- SRC/dlartg.f90 | 20 +++++++++---------- SRC/slartg.f90 | 20 +++++++++---------- SRC/zlartg.f90 | 37 ++++++++++++++-------------------- 6 files changed, 76 insertions(+), 124 deletions(-) diff --git a/BLAS/SRC/crotg.f90 b/BLAS/SRC/crotg.f90 index 7806140668..24f461ef02 100644 --- a/BLAS/SRC/crotg.f90 +++ b/BLAS/SRC/crotg.f90 @@ -122,7 +122,7 @@ subroutine CROTG( a, b, c, s ) complex(wp) 
:: a, b, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w complex(wp) :: f, fs, g, gs, r, t ! .. ! .. Intrinsic Functions .. @@ -149,8 +149,7 @@ subroutine CROTG( a, b, c, s ) ! ! Use unscaled algorithm ! - g2 = ABSSQ( g ) - d = sqrt( g2 ) + d = abs( g ) s = conjg( g ) / d r = d else @@ -158,10 +157,8 @@ subroutine CROTG( a, b, c, s ) ! Use scaled algorithm ! u = min( safmax, max( safmin, g1 ) ) - uu = one / u - gs = g*uu - g2 = ABSSQ( gs ) - d = sqrt( g2 ) + gs = g / u + d = abs( gs ) s = conjg( gs ) / d r = d*u end if @@ -181,27 +178,24 @@ subroutine CROTG( a, b, c, s ) else d = sqrt( f2 )*sqrt( h2 ) end if - p = 1 / d - c = f2*p - s = conjg( g )*( f*p ) - r = f*( h2*p ) + c = f2 / d + s = conjg( g )*( f / d ) + r = f*( h2 / d ) else ! ! Use scaled algorithm ! u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - gs = g*uu + gs = g / u g2 = ABSSQ( gs ) - if( f1*uu < rtmin ) then + if( f1 / u < rtmin ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. ! v = min( safmax, max( safmin, f1 ) ) - vv = one / v - w = v * uu - fs = f*vv + w = v / u + fs = f / v f2 = ABSSQ( fs ) h2 = f2*w**2 + g2 else @@ -209,7 +203,7 @@ subroutine CROTG( a, b, c, s ) ! Otherwise use the same scaling for f and g. ! w = one - fs = f*uu + fs = f / u f2 = ABSSQ( fs ) h2 = f2 + g2 end if @@ -218,10 +212,9 @@ subroutine CROTG( a, b, c, s ) else d = sqrt( f2 )*sqrt( h2 ) end if - p = 1 / d - c = ( f2*p )*w - s = conjg( gs )*( fs*p ) - r = ( fs*( h2*p ) )*u + c = ( f2 / d )*w + s = conjg( gs )*( fs / d ) + r = ( fs*( h2 / d ) )*u end if end if a = r diff --git a/BLAS/SRC/zrotg.f90 b/BLAS/SRC/zrotg.f90 index 288e5c7ef5..2bd6fba2e5 100644 --- a/BLAS/SRC/zrotg.f90 +++ b/BLAS/SRC/zrotg.f90 @@ -122,7 +122,7 @@ subroutine ZROTG( a, b, c, s ) complex(wp) :: a, b, s ! .. ! .. Local Scalars .. 
- real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w complex(wp) :: f, fs, g, gs, r, t ! .. ! .. Intrinsic Functions .. @@ -149,8 +149,7 @@ subroutine ZROTG( a, b, c, s ) ! ! Use unscaled algorithm ! - g2 = ABSSQ( g ) - d = sqrt( g2 ) + d = abs( g ) s = conjg( g ) / d r = d else @@ -158,10 +157,8 @@ subroutine ZROTG( a, b, c, s ) ! Use scaled algorithm ! u = min( safmax, max( safmin, g1 ) ) - uu = one / u - gs = g*uu - g2 = ABSSQ( gs ) - d = sqrt( g2 ) + gs = g / u + d = abs( gs ) s = conjg( gs ) / d r = d*u end if @@ -181,27 +178,24 @@ subroutine ZROTG( a, b, c, s ) else d = sqrt( f2 )*sqrt( h2 ) end if - p = 1 / d - c = f2*p - s = conjg( g )*( f*p ) - r = f*( h2*p ) + c = f2 / d + s = conjg( g )*( f / d ) + r = f*( h2 / d ) else ! ! Use scaled algorithm ! u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - gs = g*uu + gs = g / u g2 = ABSSQ( gs ) - if( f1*uu < rtmin ) then + if( f1 / u < rtmin ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. ! v = min( safmax, max( safmin, f1 ) ) - vv = one / v - w = v * uu - fs = f*vv + w = v / u + fs = f / v f2 = ABSSQ( fs ) h2 = f2*w**2 + g2 else @@ -209,7 +203,7 @@ subroutine ZROTG( a, b, c, s ) ! Otherwise use the same scaling for f and g. ! w = one - fs = f*uu + fs = f / u f2 = ABSSQ( fs ) h2 = f2 + g2 end if @@ -218,10 +212,9 @@ subroutine ZROTG( a, b, c, s ) else d = sqrt( f2 )*sqrt( h2 ) end if - p = 1 / d - c = ( f2*p )*w - s = conjg( gs )*( fs*p ) - r = ( fs*( h2*p ) )*u + c = ( f2 / d )*w + s = conjg( gs )*( fs / d ) + r = ( fs*( h2 / d ) )*u end if end if a = r diff --git a/SRC/clartg.f90 b/SRC/clartg.f90 index 4e07b29d57..4392e70e9e 100644 --- a/SRC/clartg.f90 +++ b/SRC/clartg.f90 @@ -178,25 +178,14 @@ subroutine CLARTG( f, g, c, s, r ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > safmin * g2 ) then - d = sqrt( one + g2/f2 ) - c = one / d - if( f2 > rtmin .and. 
h2 < rtmax ) then - s = conjg( g )*( f / sqrt( f2*h2 ) ) - else - s = conjg( g )*( f /( f2*d ) ) - end if - r = f * d + if( f2 > rtmin .and. h2 < rtmax ) then + d = sqrt( f2*h2 ) else - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) - else - d = sqrt( f2 )*sqrt( h2 ) - end if - c = f2 / d - s = conjg( g )*( f / d ) - r = f*( h2 / d ) + d = sqrt( f2 )*sqrt( h2 ) end if + c = f2 / d + s = conjg( g )*( f / d ) + r = f*( h2 / d ) else ! ! Use scaled algorithm @@ -204,7 +193,7 @@ subroutine CLARTG( f, g, c, s, r ) u = min( safmax, max( safmin, f1, g1 ) ) gs = g / u g2 = ABSSQ( gs ) - if( f1 < rtmin * u ) then + if( f1 / u < rtmin ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. @@ -223,26 +212,14 @@ subroutine CLARTG( f, g, c, s, r ) f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > safmin * g2 ) then - ! Use a precise algorithm - d = sqrt( w**2 + g2/f2 ) - c = w / d - if( f2 > rtmin .and. h2 < rtmax ) then - s = conjg( gs )*( fs / sqrt( f2*h2 ) ) - else - s = conjg( gs )*( fs / ( f2*d ) ) - end if - r = ( fs * d ) * u + if( f2 > rtmin .and. h2 < rtmax ) then + d = sqrt( f2*h2 ) else - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) - else - d = sqrt( f2 )*sqrt( h2 ) - end if - c = ( f2 / d )*w - s = conjg( gs )*( fs / d ) - r = ( fs*( h2 / d ) )*u + d = sqrt( f2 )*sqrt( h2 ) end if + c = ( f2 / d )*w + s = conjg( gs )*( fs / d ) + r = ( fs*( h2 / d ) )*u end if end if return diff --git a/SRC/dlartg.f90 b/SRC/dlartg.f90 index 03a708f863..365fa7207d 100644 --- a/SRC/dlartg.f90 +++ b/SRC/dlartg.f90 @@ -123,7 +123,7 @@ subroutine DLARTG( f, g, c, s, r ) real(wp) :: c, f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, fs, g1, gs, p, u, uu + real(wp) :: d, f1, fs, g1, gs, u ! .. ! .. Intrinsic Functions .. intrinsic :: abs, sign, sqrt @@ -143,20 +143,18 @@ subroutine DLARTG( f, g, c, s, r ) else if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. 
g1 < rtmax ) then d = sqrt( f*f + g*g ) - p = one / d - c = f1*p - s = g*sign( p, f ) + c = f1 / d r = sign( d, f ) + s = g / r else u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - fs = f*uu - gs = g*uu + fs = f / u + gs = g / u d = sqrt( fs*fs + gs*gs ) - p = one / d - c = abs( fs )*p - s = gs*sign( p, f ) - r = sign( d, f )*u + c = abs( fs ) / d + r = sign( d, f ) + s = gs / r + r = r * u end if return end subroutine diff --git a/SRC/slartg.f90 b/SRC/slartg.f90 index 2a936a919f..0ef73ddc2c 100644 --- a/SRC/slartg.f90 +++ b/SRC/slartg.f90 @@ -123,7 +123,7 @@ subroutine SLARTG( f, g, c, s, r ) real(wp) :: c, f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, fs, g1, gs, p, u, uu + real(wp) :: d, f1, fs, g1, gs, u ! .. ! .. Intrinsic Functions .. intrinsic :: abs, sign, sqrt @@ -143,20 +143,18 @@ subroutine SLARTG( f, g, c, s, r ) else if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then d = sqrt( f*f + g*g ) - p = one / d - c = f1*p - s = g*sign( p, f ) + c = f1 / d r = sign( d, f ) + s = g / r else u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - fs = f*uu - gs = g*uu + fs = f / u + gs = g / u d = sqrt( fs*fs + gs*gs ) - p = one / d - c = abs( fs )*p - s = gs*sign( p, f ) - r = sign( d, f )*u + c = abs( fs ) / d + r = sign( d, f ) + s = gs / r + r = r*u end if return end subroutine diff --git a/SRC/zlartg.f90 b/SRC/zlartg.f90 index e509898a1c..2bdb8fcc85 100644 --- a/SRC/zlartg.f90 +++ b/SRC/zlartg.f90 @@ -129,7 +129,7 @@ subroutine ZLARTG( f, g, c, s, r ) complex(wp) f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w complex(wp) :: fs, gs, t ! .. ! .. Intrinsic Functions .. @@ -154,8 +154,7 @@ subroutine ZLARTG( f, g, c, s, r ) ! ! Use unscaled algorithm ! - g2 = ABSSQ( g ) - d = sqrt( g2 ) + d = abs( g ) s = conjg( g ) / d r = d else @@ -163,10 +162,8 @@ subroutine ZLARTG( f, g, c, s, r ) ! Use scaled algorithm ! 
u = min( safmax, max( safmin, g1 ) ) - uu = one / u - gs = g*uu - g2 = ABSSQ( gs ) - d = sqrt( g2 ) + gs = g / u + d = abs( gs ) s = conjg( gs ) / d r = d*u end if @@ -186,27 +183,24 @@ subroutine ZLARTG( f, g, c, s, r ) else d = sqrt( f2 )*sqrt( h2 ) end if - p = 1 / d - c = f2*p - s = conjg( g )*( f*p ) - r = f*( h2*p ) + c = f2 / d + s = conjg( g )*( f / d ) + r = f*( h2 / d ) else ! ! Use scaled algorithm ! u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - gs = g*uu + gs = g / u g2 = ABSSQ( gs ) - if( f1*uu < rtmin ) then + if( f1 / u < rtmin ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. ! v = min( safmax, max( safmin, f1 ) ) - vv = one / v - w = v * uu - fs = f*vv + w = v / u + fs = f / v f2 = ABSSQ( fs ) h2 = f2*w**2 + g2 else @@ -214,7 +208,7 @@ subroutine ZLARTG( f, g, c, s, r ) ! Otherwise use the same scaling for f and g. ! w = one - fs = f*uu + fs = f / u f2 = ABSSQ( fs ) h2 = f2 + g2 end if @@ -223,10 +217,9 @@ subroutine ZLARTG( f, g, c, s, r ) else d = sqrt( f2 )*sqrt( h2 ) end if - p = 1 / d - c = ( f2*p )*w - s = conjg( gs )*( fs*p ) - r = ( fs*( h2*p ) )*u + c = ( f2 / d )*w + s = conjg( gs )*( fs / d ) + r = ( fs*( h2 / d ) )*u end if end if return From 2904d8763e83ea4c7ca8bc4b16740a8ccf5527e2 Mon Sep 17 00:00:00 2001 From: "Weslley S. Pereira" Date: Mon, 22 Nov 2021 18:11:04 -0700 Subject: [PATCH 6/9] Algorithm precise and with no bias in the error --- SRC/clartg.f90 | 80 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 66 insertions(+), 14 deletions(-) diff --git a/SRC/clartg.f90 b/SRC/clartg.f90 index 4392e70e9e..741e86af97 100644 --- a/SRC/clartg.f90 +++ b/SRC/clartg.f90 @@ -117,7 +117,7 @@ subroutine CLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, one=>sone, two=>stwo, czero, & - rtmin=>srtmin, rtmax=>srtmax, safmin=>ssafmin, safmax=>ssafmax + safmin=>ssafmin, safmax=>ssafmax ! ! -- LAPACK auxiliary routine (version 3.10.0) -- ! 
-- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -129,7 +129,7 @@ subroutine CLARTG( f, g, c, s, r ) complex(wp) f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, u, v, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmin, rtmax complex(wp) :: fs, gs, t ! .. ! .. Intrinsic Functions .. @@ -141,6 +141,9 @@ subroutine CLARTG( f, g, c, s, r ) ! .. Statement Function definitions .. ABSSQ( t ) = real( t )**2 + aimag( t )**2 ! .. +! .. Constants .. + rtmin = sqrt( safmin ) +! .. ! .. Executable Statements .. ! if( g == czero ) then @@ -150,6 +153,7 @@ subroutine CLARTG( f, g, c, s, r ) else if( f == czero ) then c = zero g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm @@ -170,6 +174,7 @@ subroutine CLARTG( f, g, c, s, r ) else f1 = max( abs(real(f)), abs(aimag(f)) ) g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/4 ) if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then ! @@ -178,14 +183,36 @@ subroutine CLARTG( f, g, c, s, r ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = f / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( g ) * ( f / sqrt( f2*h2 ) ) + else + s = conjg( g ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = f / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! 
h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = f * ( h2 / d ) + end if + s = conjg( g ) * ( f / d ) end if - c = f2 / d - s = conjg( g )*( f / d ) - r = f*( h2 / d ) else ! ! Use scaled algorithm @@ -212,14 +239,39 @@ subroutine CLARTG( f, g, c, s, r ) f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = fs / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( gs ) * ( fs / sqrt( f2*h2 ) ) + else + s = conjg( gs ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = fs / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = fs * ( h2 / d ) + end if + s = conjg( gs ) * ( fs / d ) end if - c = ( f2 / d )*w - s = conjg( gs )*( fs / d ) - r = ( fs*( h2 / d ) )*u + ! Rescale c and r + c = c * w + r = r * u end if end if return From 95b6e84b03a95618cb12939118b934a7ebcc1ded Mon Sep 17 00:00:00 2001 From: "Weslley S. 
Pereira" Date: Fri, 10 Dec 2021 15:43:43 -0700 Subject: [PATCH 7/9] Updates Givens rotations with preciser algorithms --- BLAS/SRC/crotg.f90 | 103 +++++++++++++++++++++++++++++------------- BLAS/SRC/zrotg.f90 | 103 +++++++++++++++++++++++++++++------------- SRC/clartg.f90 | 26 +++++------ SRC/dlartg.f90 | 16 ++++--- SRC/slartg.f90 | 12 ++--- SRC/zlartg.f90 | 108 +++++++++++++++++++++++++++++++++------------ 6 files changed, 251 insertions(+), 117 deletions(-) diff --git a/BLAS/SRC/crotg.f90 b/BLAS/SRC/crotg.f90 index 24f461ef02..c7e49e18b5 100644 --- a/BLAS/SRC/crotg.f90 +++ b/BLAS/SRC/crotg.f90 @@ -1,4 +1,4 @@ -!> \brief \b CROTG +!> \brief \b CROTG generates a Givens rotation with real cosine and complex sine. ! ! =========== DOCUMENTATION =========== ! @@ -24,12 +24,12 @@ !> = 1 if x = 0 !> c = |a| / sqrt(|a|**2 + |b|**2) !> s = sgn(a) * conjg(b) / sqrt(|a|**2 + |b|**2) -!> When a and b are real and r /= 0, the formulas simplify to !> r = sgn(a)*sqrt(|a|**2 + |b|**2) +!> When a and b are real and r /= 0, the formulas simplify to !> c = a / r !> s = b / r -!> the same as in CROTG when |a| > |b|. When |b| >= |a|, the -!> sign of c and s will be different from those computed by CROTG +!> the same as in SROTG when |a| > |b|. When |b| >= |a|, the +!> sign of c and s will be different from those computed by SROTG !> if the signs of a and b are not the same. !> !> \endverbatim @@ -65,20 +65,19 @@ ! Authors: ! ======== ! -!> \author Edward Anderson, Lockheed Martin +!> \author Weslley Pereira, University of Colorado Denver, USA ! -!> \par Contributors: -! ================== -!> -!> Weslley Pereira, University of Colorado Denver, USA +!> \date December 2021 ! -!> \ingroup single_blas_level1 +!> \ingroup OTHERauxiliary ! !> \par Further Details: ! ===================== !> !> \verbatim !> +!> Based on the algorithm from +!> !> Anderson E. 
(2017) !> Algorithm 978: Safe Scaling in the Level 1 BLAS !> ACM Trans Math Softw 44:1--28 @@ -108,21 +107,14 @@ subroutine CROTG( a, b, c, s ) 1-minexponent(0._wp), & maxexponent(0._wp)-1 & ) - real(wp), parameter :: rtmin = sqrt( real(radix(0._wp),wp)**max( & - minexponent(0._wp)-1, & - 1-maxexponent(0._wp) & - ) / epsilon(0._wp) ) - real(wp), parameter :: rtmax = sqrt( real(radix(0._wp),wp)**max( & - 1-minexponent(0._wp), & - maxexponent(0._wp)-1 & - ) * epsilon(0._wp) ) + real(wp), parameter :: rtmin = sqrt( safmin ) ! .. ! .. Scalar Arguments .. real(wp) :: c complex(wp) :: a, b, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, u, v, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmax complex(wp) :: f, fs, g, gs, r, t ! .. ! .. Intrinsic Functions .. @@ -145,6 +137,7 @@ subroutine CROTG( a, b, c, s ) else if( f == czero ) then c = zero g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm @@ -165,6 +158,7 @@ subroutine CROTG( a, b, c, s ) else f1 = max( abs(real(f)), abs(aimag(f)) ) g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/4 ) if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then ! @@ -173,14 +167,36 @@ subroutine CROTG( a, b, c, s ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = f / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( g ) * ( f / sqrt( f2*h2 ) ) + else + s = conjg( g ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! 
g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = f / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = f * ( h2 / d ) + end if + s = conjg( g ) * ( f / d ) end if - c = f2 / d - s = conjg( g )*( f / d ) - r = f*( h2 / d ) else ! ! Use scaled algorithm @@ -207,14 +223,39 @@ subroutine CROTG( a, b, c, s ) f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = fs / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( gs ) * ( fs / sqrt( f2*h2 ) ) + else + s = conjg( gs ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = fs / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = fs * ( h2 / d ) + end if + s = conjg( gs ) * ( fs / d ) end if - c = ( f2 / d )*w - s = conjg( gs )*( fs / d ) - r = ( fs*( h2 / d ) )*u + ! Rescale c and r + c = c * w + r = r * u end if end if a = r diff --git a/BLAS/SRC/zrotg.f90 b/BLAS/SRC/zrotg.f90 index 2bd6fba2e5..37aca1e757 100644 --- a/BLAS/SRC/zrotg.f90 +++ b/BLAS/SRC/zrotg.f90 @@ -1,4 +1,4 @@ -!> \brief \b ZROTG +!> \brief \b ZROTG generates a Givens rotation with real cosine and complex sine. ! ! =========== DOCUMENTATION =========== ! 
@@ -24,12 +24,12 @@ !> = 1 if x = 0 !> c = |a| / sqrt(|a|**2 + |b|**2) !> s = sgn(a) * conjg(b) / sqrt(|a|**2 + |b|**2) -!> When a and b are real and r /= 0, the formulas simplify to !> r = sgn(a)*sqrt(|a|**2 + |b|**2) +!> When a and b are real and r /= 0, the formulas simplify to !> c = a / r !> s = b / r -!> the same as in ZROTG when |a| > |b|. When |b| >= |a|, the -!> sign of c and s will be different from those computed by ZROTG +!> the same as in DROTG when |a| > |b|. When |b| >= |a|, the +!> sign of c and s will be different from those computed by DROTG !> if the signs of a and b are not the same. !> !> \endverbatim @@ -65,20 +65,19 @@ ! Authors: ! ======== ! -!> \author Edward Anderson, Lockheed Martin +!> \author Weslley Pereira, University of Colorado Denver, USA ! -!> \par Contributors: -! ================== -!> -!> Weslley Pereira, University of Colorado Denver, USA +!> \date December 2021 ! -!> \ingroup single_blas_level1 +!> \ingroup OTHERauxiliary ! !> \par Further Details: ! ===================== !> !> \verbatim !> +!> Based on the algorithm from +!> !> Anderson E. (2017) !> Algorithm 978: Safe Scaling in the Level 1 BLAS !> ACM Trans Math Softw 44:1--28 @@ -108,21 +107,14 @@ subroutine ZROTG( a, b, c, s ) 1-minexponent(0._wp), & maxexponent(0._wp)-1 & ) - real(wp), parameter :: rtmin = sqrt( real(radix(0._wp),wp)**max( & - minexponent(0._wp)-1, & - 1-maxexponent(0._wp) & - ) / epsilon(0._wp) ) - real(wp), parameter :: rtmax = sqrt( real(radix(0._wp),wp)**max( & - 1-minexponent(0._wp), & - maxexponent(0._wp)-1 & - ) * epsilon(0._wp) ) + real(wp), parameter :: rtmin = sqrt( safmin ) ! .. ! .. Scalar Arguments .. real(wp) :: c complex(wp) :: a, b, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, u, v, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmax complex(wp) :: f, fs, g, gs, r, t ! .. ! .. Intrinsic Functions .. 
@@ -145,6 +137,7 @@ subroutine ZROTG( a, b, c, s ) else if( f == czero ) then c = zero g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm @@ -165,6 +158,7 @@ subroutine ZROTG( a, b, c, s ) else f1 = max( abs(real(f)), abs(aimag(f)) ) g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/4 ) if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then ! @@ -173,14 +167,36 @@ subroutine ZROTG( a, b, c, s ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = f / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( g ) * ( f / sqrt( f2*h2 ) ) + else + s = conjg( g ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = f / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = f * ( h2 / d ) + end if + s = conjg( g ) * ( f / d ) end if - c = f2 / d - s = conjg( g )*( f / d ) - r = f*( h2 / d ) else ! ! Use scaled algorithm @@ -207,14 +223,39 @@ subroutine ZROTG( a, b, c, s ) f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = fs / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! 
safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( gs ) * ( fs / sqrt( f2*h2 ) ) + else + s = conjg( gs ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = fs / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = fs * ( h2 / d ) + end if + s = conjg( gs ) * ( fs / d ) end if - c = ( f2 / d )*w - s = conjg( gs )*( fs / d ) - r = ( fs*( h2 / d ) )*u + ! Rescale c and r + c = c * w + r = r * u end if end if a = r diff --git a/SRC/clartg.f90 b/SRC/clartg.f90 index 741e86af97..b1ecf5755a 100644 --- a/SRC/clartg.f90 +++ b/SRC/clartg.f90 @@ -30,7 +30,7 @@ !> The mathematical formulas used for C and S are !> !> sgn(x) = { x / |x|, x != 0 -!> { 1, x = 0 +!> { 1, x = 0 !> !> R = sgn(F) * sqrt(|F|**2 + |G|**2) !> @@ -38,19 +38,20 @@ !> !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2) !> +!> Special conditions: +!> If G=0, then C=1 and S=0. +!> If F=0, then C=0 and S is chosen so that R is real. +!> !> When F and G are real, the formulas simplify to C = F/R and !> S = G/R, and the returned values of C, S, and R should be -!> identical to those returned by CLARTG. +!> identical to those returned by SLARTG. !> !> The algorithm used to compute these quantities incorporates scaling !> to avoid overflow or underflow in computing the square root of the !> sum of squares. !> -!> This is a faster version of the BLAS1 routine CROTG, except for -!> the following differences: -!> F and G are unchanged on return. -!> If G=0, then C=1 and S=0. -!> If F=0, then C=0 and S is chosen so that R is real. +!> This is the same routine CROTG from BLAS1, except that +!> F and G are unchanged on return. 
!> !> Below, wp=>sp stands for single precision from LA_CONSTANTS module. !> \endverbatim @@ -91,22 +92,19 @@ ! Authors: ! ======== ! -!> \author Edward Anderson, Lockheed Martin +!> \author Weslley Pereira, University of Colorado Denver, USA ! -!> \date August 2016 +!> \date December 2021 ! !> \ingroup OTHERauxiliary ! -!> \par Contributors: -! ================== -!> -!> Weslley Pereira, University of Colorado Denver, USA -! !> \par Further Details: ! ===================== !> !> \verbatim !> +!> Based on the algorithm from +!> !> Anderson E. (2017) !> Algorithm 978: Safe Scaling in the Level 1 BLAS !> ACM Trans Math Softw 44:1--28 diff --git a/SRC/dlartg.f90 b/SRC/dlartg.f90 index 365fa7207d..ea05ad3c6e 100644 --- a/SRC/dlartg.f90 +++ b/SRC/dlartg.f90 @@ -11,7 +11,7 @@ ! SUBROUTINE DLARTG( F, G, C, S, R ) ! ! .. Scalar Arguments .. -! REAL(wp) C, F, G, R, S +! REAL(wp) C, F, G, R, S ! .. ! !> \par Purpose: @@ -37,7 +37,7 @@ !> This version is discontinuous in R at F = 0 but it returns the same !> C and S as ZLARTG for complex inputs (F,0) and (G,0). !> -!> This is a more accurate version of the BLAS1 routine DROTG, +!> This is a more accurate version of the BLAS1 routine SROTG, !> with the following other differences: !> F and G are unchanged on return. !> If G=0, then C=1 and S=0. @@ -45,8 +45,6 @@ !> floating point operations (saves work in DBDSQR when !> there are zeros on the diagonal). !> -!> If F exceeds G in magnitude, C will be positive. -!> !> Below, wp=>dp stands for double precision from LA_CONSTANTS module. !> \endverbatim ! @@ -112,7 +110,7 @@ subroutine DLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, half=>dhalf, one=>done, & - rtmin=>drtmin, rtmax=>drtmax, safmin=>dsafmin, safmax=>dsafmax + safmin=>dsafmin, safmax=>dsafmax ! ! -- LAPACK auxiliary routine (version 3.10.0) -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -123,11 +121,15 @@ subroutine DLARTG( f, g, c, s, r ) real(wp) :: c, f, g, r, s ! 
.. ! .. Local Scalars .. - real(wp) :: d, f1, fs, g1, gs, u + real(wp) :: d, f1, fs, g1, gs, u, rtmin, rtmax ! .. ! .. Intrinsic Functions .. intrinsic :: abs, sign, sqrt ! .. +! .. Constants .. + rtmin = sqrt( safmin ) + rtmax = sqrt( safmax/2 ) +! .. ! .. Executable Statements .. ! f1 = abs( f ) @@ -154,7 +156,7 @@ subroutine DLARTG( f, g, c, s, r ) c = abs( fs ) / d r = sign( d, f ) s = gs / r - r = r * u + r = r*u end if return end subroutine diff --git a/SRC/slartg.f90 b/SRC/slartg.f90 index 0ef73ddc2c..c445e951c9 100644 --- a/SRC/slartg.f90 +++ b/SRC/slartg.f90 @@ -35,7 +35,7 @@ !> square root of the sum of squares. !> !> This version is discontinuous in R at F = 0 but it returns the same -!> C and S as SLARTG for complex inputs (F,0) and (G,0). +!> C and S as CLARTG for complex inputs (F,0) and (G,0). !> !> This is a more accurate version of the BLAS1 routine SROTG, !> with the following other differences: @@ -45,8 +45,6 @@ !> floating point operations (saves work in SBDSQR when !> there are zeros on the diagonal). !> -!> If F exceeds G in magnitude, C will be positive. -!> !> Below, wp=>sp stands for single precision from LA_CONSTANTS module. !> \endverbatim ! @@ -112,7 +110,7 @@ subroutine SLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, half=>shalf, one=>sone, & - rtmin=>srtmin, rtmax=>srtmax, safmin=>ssafmin, safmax=>ssafmax + safmin=>ssafmin, safmax=>ssafmax ! ! -- LAPACK auxiliary routine (version 3.10.0) -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -123,11 +121,15 @@ subroutine SLARTG( f, g, c, s, r ) real(wp) :: c, f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, fs, g1, gs, u + real(wp) :: d, f1, fs, g1, gs, u, rtmin, rtmax ! .. ! .. Intrinsic Functions .. intrinsic :: abs, sign, sqrt ! .. +! .. Constants .. + rtmin = sqrt( safmin ) + rtmax = sqrt( safmax/2 ) +! .. ! .. Executable Statements .. ! 
f1 = abs( f ) diff --git a/SRC/zlartg.f90 b/SRC/zlartg.f90 index 2bdb8fcc85..047929747b 100644 --- a/SRC/zlartg.f90 +++ b/SRC/zlartg.f90 @@ -11,8 +11,8 @@ ! SUBROUTINE ZLARTG( F, G, C, S, R ) ! ! .. Scalar Arguments .. -! REAL(wp) C -! COMPLEX(wp) F, G, R, S +! REAL(wp) C +! COMPLEX(wp) F, G, R, S ! .. ! !> \par Purpose: @@ -30,7 +30,7 @@ !> The mathematical formulas used for C and S are !> !> sgn(x) = { x / |x|, x != 0 -!> { 1, x = 0 +!> { 1, x = 0 !> !> R = sgn(F) * sqrt(|F|**2 + |G|**2) !> @@ -38,6 +38,10 @@ !> !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2) !> +!> Special conditions: +!> If G=0, then C=1 and S=0. +!> If F=0, then C=0 and S is chosen so that R is real. +!> !> When F and G are real, the formulas simplify to C = F/R and !> S = G/R, and the returned values of C, S, and R should be !> identical to those returned by DLARTG. @@ -46,11 +50,8 @@ !> to avoid overflow or underflow in computing the square root of the !> sum of squares. !> -!> This is a faster version of the BLAS1 routine ZROTG, except for -!> the following differences: -!> F and G are unchanged on return. -!> If G=0, then C=1 and S=0. -!> If F=0, then C=0 and S is chosen so that R is real. +!> This is the same routine CROTG fom BLAS1, except that +!> F and G are unchanged on return. !> !> Below, wp=>dp stands for double precision from LA_CONSTANTS module. !> \endverbatim @@ -91,22 +92,19 @@ ! Authors: ! ======== ! -!> \author Edward Anderson, Lockheed Martin +!> \author Weslley Pereira, University of Colorado Denver, USA ! -!> \date August 2016 +!> \date December 2021 ! !> \ingroup OTHERauxiliary ! -!> \par Contributors: -! ================== -!> -!> Weslley Pereira, University of Colorado Denver, USA -! !> \par Further Details: ! ===================== !> !> \verbatim !> +!> Based on the algorithm from +!> !> Anderson E. 
(2017) !> Algorithm 978: Safe Scaling in the Level 1 BLAS !> ACM Trans Math Softw 44:1--28 @@ -117,7 +115,7 @@ subroutine ZLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, one=>done, two=>dtwo, czero=>zzero, & - rtmin=>drtmin, rtmax=>drtmax, safmin=>dsafmin, safmax=>dsafmax + safmin=>dsafmin, safmax=>dsafmax ! ! -- LAPACK auxiliary routine (version 3.10.0) -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -129,7 +127,7 @@ subroutine ZLARTG( f, g, c, s, r ) complex(wp) f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, u, v, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmin, rtmax complex(wp) :: fs, gs, t ! .. ! .. Intrinsic Functions .. @@ -141,6 +139,9 @@ subroutine ZLARTG( f, g, c, s, r ) ! .. Statement Function definitions .. ABSSQ( t ) = real( t )**2 + aimag( t )**2 ! .. +! .. Constants .. + rtmin = sqrt( safmin ) +! .. ! .. Executable Statements .. ! if( g == czero ) then @@ -150,6 +151,7 @@ subroutine ZLARTG( f, g, c, s, r ) else if( f == czero ) then c = zero g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm @@ -170,6 +172,7 @@ subroutine ZLARTG( f, g, c, s, r ) else f1 = max( abs(real(f)), abs(aimag(f)) ) g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/4 ) if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then ! @@ -178,14 +181,36 @@ subroutine ZLARTG( f, g, c, s, r ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = f / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( g ) * ( f / sqrt( f2*h2 ) ) + else + s = conjg( g ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! 
f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = f / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = f * ( h2 / d ) + end if + s = conjg( g ) * ( f / d ) end if - c = f2 / d - s = conjg( g )*( f / d ) - r = f*( h2 / d ) else ! ! Use scaled algorithm @@ -212,14 +237,39 @@ subroutine ZLARTG( f, g, c, s, r ) f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = fs / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( gs ) * ( fs / sqrt( f2*h2 ) ) + else + s = conjg( gs ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = fs / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = fs * ( h2 / d ) + end if + s = conjg( gs ) * ( fs / d ) end if - c = ( f2 / d )*w - s = conjg( gs )*( fs / d ) - r = ( fs*( h2 / d ) )*u + ! Rescale c and r + c = c * w + r = r * u end if end if return From cdc8f33194e2e52b5bb54959da81893ea2bc1754 Mon Sep 17 00:00:00 2001 From: "Weslley S. 
Pereira" Date: Mon, 13 Dec 2021 09:06:04 -0700 Subject: [PATCH 8/9] Fix documentation thanks to @vladimir-ch --- SRC/dlartg.f90 | 2 +- SRC/zlartg.f90 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/SRC/dlartg.f90 b/SRC/dlartg.f90 index ea05ad3c6e..0c5f1e1b95 100644 --- a/SRC/dlartg.f90 +++ b/SRC/dlartg.f90 @@ -37,7 +37,7 @@ !> This version is discontinuous in R at F = 0 but it returns the same !> C and S as ZLARTG for complex inputs (F,0) and (G,0). !> -!> This is a more accurate version of the BLAS1 routine SROTG, +!> This is a more accurate version of the BLAS1 routine DROTG, !> with the following other differences: !> F and G are unchanged on return. !> If G=0, then C=1 and S=0. diff --git a/SRC/zlartg.f90 b/SRC/zlartg.f90 index 047929747b..d1333e9265 100644 --- a/SRC/zlartg.f90 +++ b/SRC/zlartg.f90 @@ -50,7 +50,7 @@ !> to avoid overflow or underflow in computing the square root of the !> sum of squares. !> -!> This is the same routine CROTG fom BLAS1, except that +!> This is the same routine ZROTG from BLAS1, except that !> F and G are unchanged on return. !> !> Below, wp=>dp stands for double precision from LA_CONSTANTS module. From c362fff1eee80fdd88b6fd60be3fb9d045cb08bb Mon Sep 17 00:00:00 2001 From: "Weslley S. Pereira" Date: Mon, 13 Dec 2021 13:23:06 -0700 Subject: [PATCH 9/9] Minor changes --- BLAS/SRC/crotg.f90 | 44 +++++++++++++++++++++++++++++--------------- BLAS/SRC/zrotg.f90 | 44 +++++++++++++++++++++++++++++--------------- SRC/clartg.f90 | 42 ++++++++++++++++++++++++++++-------------- SRC/zlartg.f90 | 42 ++++++++++++++++++++++++++++-------------- 4 files changed, 114 insertions(+), 58 deletions(-) diff --git a/BLAS/SRC/crotg.f90 b/BLAS/SRC/crotg.f90 index c7e49e18b5..0fb7bb09f8 100644 --- a/BLAS/SRC/crotg.f90 +++ b/BLAS/SRC/crotg.f90 @@ -69,7 +69,7 @@ ! !> \date December 2021 ! -!> \ingroup OTHERauxiliary +!> \ingroup single_blas_level1 ! !> \par Further Details: ! 
===================== @@ -136,24 +136,38 @@ subroutine CROTG( a, b, c, s ) r = f else if( f == czero ) then c = zero - g1 = max( abs(real(g)), abs(aimag(g)) ) - rtmax = sqrt( safmax/2 ) - if( g1 > rtmin .and. g1 < rtmax ) then + if( real(g) == zero ) then + r = abs(aimag(g)) + s = conjg( g ) / r + elseif( aimag(g) == zero ) then + r = abs(real(g)) + s = conjg( g ) / r + else + g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) + if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm ! - d = abs( g ) - s = conjg( g ) / d - r = d - else +! The following two lines can be replaced by `d = abs( g )`. +! This algorithm does not use the intrinsic complex abs. + g2 = ABSSQ( g ) + d = sqrt( g2 ) + s = conjg( g ) / d + r = d + else ! ! Use scaled algorithm ! - u = min( safmax, max( safmin, g1 ) ) - gs = g / u - d = abs( gs ) - s = conjg( gs ) / d - r = d*u + u = min( safmax, max( safmin, g1 ) ) + gs = g / u +! The following two lines can be replaced by `d = abs( gs )`. +! This algorithm does not use the intrinsic complex abs. + g2 = ABSSQ( gs ) + d = sqrt( g2 ) + s = conjg( gs ) / d + r = d*u + end if end if else f1 = max( abs(real(f)), abs(aimag(f)) ) @@ -192,7 +206,7 @@ subroutine CROTG( a, b, c, s ) r = f / c else ! f2 / sqrt(f2 * h2) < safmin, then - ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax r = f * ( h2 / d ) end if s = conjg( g ) * ( f / d ) @@ -248,7 +262,7 @@ subroutine CROTG( a, b, c, s ) r = fs / c else ! f2 / sqrt(f2 * h2) < safmin, then - ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax r = fs * ( h2 / d ) end if s = conjg( gs ) * ( fs / d ) diff --git a/BLAS/SRC/zrotg.f90 b/BLAS/SRC/zrotg.f90 index 37aca1e757..bea4c278ff 100644 --- a/BLAS/SRC/zrotg.f90 +++ b/BLAS/SRC/zrotg.f90 @@ -69,7 +69,7 @@ ! !> \date December 2021 ! 
-!> \ingroup OTHERauxiliary +!> \ingroup single_blas_level1 ! !> \par Further Details: ! ===================== @@ -136,24 +136,38 @@ subroutine ZROTG( a, b, c, s ) r = f else if( f == czero ) then c = zero - g1 = max( abs(real(g)), abs(aimag(g)) ) - rtmax = sqrt( safmax/2 ) - if( g1 > rtmin .and. g1 < rtmax ) then + if( real(g) == zero ) then + r = abs(aimag(g)) + s = conjg( g ) / r + elseif( aimag(g) == zero ) then + r = abs(real(g)) + s = conjg( g ) / r + else + g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) + if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm ! - d = abs( g ) - s = conjg( g ) / d - r = d - else +! The following two lines can be replaced by `d = abs( g )`. +! This algorithm does not use the intrinsic complex abs. + g2 = ABSSQ( g ) + d = sqrt( g2 ) + s = conjg( g ) / d + r = d + else ! ! Use scaled algorithm ! - u = min( safmax, max( safmin, g1 ) ) - gs = g / u - d = abs( gs ) - s = conjg( gs ) / d - r = d*u + u = min( safmax, max( safmin, g1 ) ) + gs = g / u +! The following two lines can be replaced by `d = abs( gs )`. +! This algorithm does not use the intrinsic complex abs. + g2 = ABSSQ( gs ) + d = sqrt( g2 ) + s = conjg( gs ) / d + r = d*u + end if end if else f1 = max( abs(real(f)), abs(aimag(f)) ) @@ -192,7 +206,7 @@ subroutine ZROTG( a, b, c, s ) r = f / c else ! f2 / sqrt(f2 * h2) < safmin, then - ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax r = f * ( h2 / d ) end if s = conjg( g ) * ( f / d ) @@ -248,7 +262,7 @@ subroutine ZROTG( a, b, c, s ) r = fs / c else ! f2 / sqrt(f2 * h2) < safmin, then - ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + ! 
sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax r = fs * ( h2 / d ) end if s = conjg( gs ) * ( fs / d ) diff --git a/SRC/clartg.f90 b/SRC/clartg.f90 index b1ecf5755a..0ab8b8e089 100644 --- a/SRC/clartg.f90 +++ b/SRC/clartg.f90 @@ -150,24 +150,38 @@ subroutine CLARTG( f, g, c, s, r ) r = f else if( f == czero ) then c = zero - g1 = max( abs(real(g)), abs(aimag(g)) ) - rtmax = sqrt( safmax/2 ) - if( g1 > rtmin .and. g1 < rtmax ) then + if( real(g) == zero ) then + r = abs(aimag(g)) + s = conjg( g ) / r + elseif( aimag(g) == zero ) then + r = abs(real(g)) + s = conjg( g ) / r + else + g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) + if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm ! - d = abs( g ) - s = conjg( g ) / d - r = d - else +! The following two lines can be replaced by `d = abs( g )`. +! This algorithm does not use the intrinsic complex abs. + g2 = ABSSQ( g ) + d = sqrt( g2 ) + s = conjg( g ) / d + r = d + else ! ! Use scaled algorithm ! - u = min( safmax, max( safmin, g1 ) ) - gs = g / u - d = abs( gs ) - s = conjg( gs ) / d - r = d*u + u = min( safmax, max( safmin, g1 ) ) + gs = g / u +! The following two lines can be replaced by `d = abs( gs )`. +! This algorithm does not use the intrinsic complex abs. + g2 = ABSSQ( gs ) + d = sqrt( g2 ) + s = conjg( gs ) / d + r = d*u + end if end if else f1 = max( abs(real(f)), abs(aimag(f)) ) @@ -206,7 +220,7 @@ subroutine CLARTG( f, g, c, s, r ) r = f / c else ! f2 / sqrt(f2 * h2) < safmin, then - ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax r = f * ( h2 / d ) end if s = conjg( g ) * ( f / d ) @@ -262,7 +276,7 @@ subroutine CLARTG( f, g, c, s, r ) r = fs / c else ! f2 / sqrt(f2 * h2) < safmin, then - ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + ! 
sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax r = fs * ( h2 / d ) end if s = conjg( gs ) * ( fs / d ) diff --git a/SRC/zlartg.f90 b/SRC/zlartg.f90 index d1333e9265..289e2cf1ae 100644 --- a/SRC/zlartg.f90 +++ b/SRC/zlartg.f90 @@ -150,24 +150,38 @@ subroutine ZLARTG( f, g, c, s, r ) r = f else if( f == czero ) then c = zero - g1 = max( abs(real(g)), abs(aimag(g)) ) - rtmax = sqrt( safmax/2 ) - if( g1 > rtmin .and. g1 < rtmax ) then + if( real(g) == zero ) then + r = abs(aimag(g)) + s = conjg( g ) / r + elseif( aimag(g) == zero ) then + r = abs(real(g)) + s = conjg( g ) / r + else + g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) + if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm ! - d = abs( g ) - s = conjg( g ) / d - r = d - else +! The following two lines can be replaced by `d = abs( g )`. +! This algorithm does not use the intrinsic complex abs. + g2 = ABSSQ( g ) + d = sqrt( g2 ) + s = conjg( g ) / d + r = d + else ! ! Use scaled algorithm ! - u = min( safmax, max( safmin, g1 ) ) - gs = g / u - d = abs( gs ) - s = conjg( gs ) / d - r = d*u + u = min( safmax, max( safmin, g1 ) ) + gs = g / u +! The following two lines can be replaced by `d = abs( gs )`. +! This algorithm does not use the intrinsic complex abs. + g2 = ABSSQ( gs ) + d = sqrt( g2 ) + s = conjg( gs ) / d + r = d*u + end if end if else f1 = max( abs(real(f)), abs(aimag(f)) ) @@ -206,7 +220,7 @@ subroutine ZLARTG( f, g, c, s, r ) r = f / c else ! f2 / sqrt(f2 * h2) < safmin, then - ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax r = f * ( h2 / d ) end if s = conjg( g ) * ( f / d ) @@ -262,7 +276,7 @@ subroutine ZLARTG( f, g, c, s, r ) r = fs / c else ! f2 / sqrt(f2 * h2) < safmin, then - ! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + ! 
sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax r = fs * ( h2 / d ) end if s = conjg( gs ) * ( fs / d )