Merge branch 'Reference-LAPACK:master' into master

chrwarm · Mar 21, 2023 · bd1204d · bd1204d
2 parents 3b08c51 + cfaa5ae
commit bd1204d
Show file tree

Hide file tree

Showing 144 changed files with 7,525 additions and 2,791 deletions.
diff --git a/.appveyor.yml b/.appveyor.yml
@@ -11,18 +11,14 @@ skip_commits:
 # Add [av skip] to commit messages
   message: /\[av skip\]/
 
-cache:
-  - '%APPVEYOR_BUILD_FOLDER%\build'
-
 environment:
   global:
-    CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
+    CONDA_INSTALL_LOCN: C:\\Miniconda37-x64
 
 install:
   - call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
-  - conda config --set auto_update_conda false
-  - conda config --add channels conda-forge --force
-  - conda install --yes --quiet flang jom
+#  - conda config --set auto_update_conda false
+  - conda install -c conda-forge --yes --quiet flang jom
   - call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
   - set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%"
   - set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%"

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
@@ -33,6 +33,9 @@ on:
     - '!**Makefile'
     - '!**md'
 
+permissions:
+  contents: read
+
 env:
   CFLAGS: "-Wall -pedantic"
   # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
@@ -75,6 +78,7 @@ jobs:
         cmake -B build
         -D CMAKE_C_COMPILER="gcc-11"
         -D CMAKE_Fortran_COMPILER="gfortran-11"
+        -D USE_FLAT_NAMESPACE:BOOL=ON
 
     # - name: Use Unix Makefiles on Windows
     #   if: ${{ matrix.os == 'windows-latest' }}

diff --git a/.github/workflows/makefile.yml b/.github/workflows/makefile.yml
@@ -31,6 +31,9 @@ on:
     - '!**CMakeLists.txt'
     - '!**md'
 
+permissions:
+  contents: read
+
 env:
   CC: "gcc"
   FC: "gfortran"

diff --git a/BLAS/SRC/crotg.f90 b/BLAS/SRC/crotg.f90
@@ -1,4 +1,4 @@
-!> \brief \b CROTG
+!> \brief \b CROTG  generates a Givens rotation with real cosine and complex sine.
 !
 !  =========== DOCUMENTATION ===========
 !
@@ -24,8 +24,8 @@
 !>           = 1        if x  = 0
 !>    c = |a| / sqrt(|a|**2 + |b|**2)
 !>    s = sgn(a) * conjg(b) / sqrt(|a|**2 + |b|**2)
-!> When a and b are real and r /= 0, the formulas simplify to
 !>    r = sgn(a)*sqrt(|a|**2 + |b|**2)
+!> When a and b are real and r /= 0, the formulas simplify to
 !>    c = a / r
 !>    s = b / r
 !> the same as in SROTG when |a| > |b|.  When |b| >= |a|, the
@@ -65,12 +65,9 @@
 !  Authors:
 !  ========
 !
-!> \author Edward Anderson, Lockheed Martin
+!> \author Weslley Pereira, University of Colorado Denver, USA
 !
-!> \par Contributors:
-!  ==================
-!>
-!> Weslley Pereira, University of Colorado Denver, USA
+!> \date December 2021
 !
 !> \ingroup single_blas_level1
 !
@@ -79,6 +76,8 @@
 !>
 !> \verbatim
 !>
+!> Based on the algorithm from
+!>
 !>  Anderson E. (2017)
 !>  Algorithm 978: Safe Scaling in the Level 1 BLAS
 !>  ACM Trans Math Softw 44:1--28
@@ -108,21 +107,14 @@ subroutine CROTG( a, b, c, s )
       1-minexponent(0._wp), &
       maxexponent(0._wp)-1 &
    )
-   real(wp), parameter :: rtmin = sqrt( real(radix(0._wp),wp)**max( &
-      minexponent(0._wp)-1, &
-      1-maxexponent(0._wp) &
-   ) / epsilon(0._wp) )
-   real(wp), parameter :: rtmax = sqrt( real(radix(0._wp),wp)**max( &
-      1-minexponent(0._wp), &
-      maxexponent(0._wp)-1 &
-   ) * epsilon(0._wp) )
+   real(wp), parameter :: rtmin = sqrt( safmin )
 !  ..
 !  .. Scalar Arguments ..
    real(wp) :: c
    complex(wp) :: a, b, s
 !  ..
 !  .. Local Scalars ..
-   real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w
+   real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmax
    complex(wp) :: f, fs, g, gs, r, t
 !  ..
 !  .. Intrinsic Functions ..
@@ -144,30 +136,43 @@ subroutine CROTG( a, b, c, s )
       r = f
    else if( f == czero ) then
       c = zero
-      g1 = max( abs(real(g)), abs(aimag(g)) )
-      if( g1 > rtmin .and. g1 < rtmax ) then
+      if( real(g) == zero ) then
+         r = abs(aimag(g))
+         s = conjg( g ) / r
+      elseif( aimag(g) == zero ) then
+         r = abs(real(g))
+         s = conjg( g ) / r
+      else
+         g1 = max( abs(real(g)), abs(aimag(g)) )
+         rtmax = sqrt( safmax/2 )
+         if( g1 > rtmin .and. g1 < rtmax ) then
 !
 !        Use unscaled algorithm
 !
-         g2 = ABSSQ( g )
-         d = sqrt( g2 )
-         s = conjg( g ) / d
-         r = d
-      else
+!           The following two lines can be replaced by `d = abs( g )`.
+!           This algorithm does not use the intrinsic complex abs.
+            g2 = ABSSQ( g )
+            d = sqrt( g2 )
+            s = conjg( g ) / d
+            r = d
+         else
 !
 !        Use scaled algorithm
 !
-         u = min( safmax, max( safmin, g1 ) )
-         uu = one / u
-         gs = g*uu
-         g2 = ABSSQ( gs )
-         d = sqrt( g2 )
-         s = conjg( gs ) / d
-         r = d*u
+            u = min( safmax, max( safmin, g1 ) )
+            gs = g / u
+!           The following two lines can be replaced by `d = abs( gs )`.
+!           This algorithm does not use the intrinsic complex abs.
+            g2 = ABSSQ( gs )
+            d = sqrt( g2 )
+            s = conjg( gs ) / d
+            r = d*u
+         end if
       end if
    else
       f1 = max( abs(real(f)), abs(aimag(f)) )
       g1 = max( abs(real(g)), abs(aimag(g)) )
+      rtmax = sqrt( safmax/4 )
       if( f1 > rtmin .and. f1 < rtmax .and. &
           g1 > rtmin .and. g1 < rtmax ) then
 !
@@ -176,52 +181,95 @@ subroutine CROTG( a, b, c, s )
          f2 = ABSSQ( f )
          g2 = ABSSQ( g )
          h2 = f2 + g2
-         if( f2 > rtmin .and. h2 < rtmax ) then
-            d = sqrt( f2*h2 )
+         ! safmin <= f2 <= h2 <= safmax 
+         if( f2 >= h2 * safmin ) then
+            ! safmin <= f2/h2 <= 1, and h2/f2 is finite
+            c = sqrt( f2 / h2 )
+            r = f / c
+            rtmax = rtmax * 2
+            if( f2 > rtmin .and. h2 < rtmax ) then
+               ! safmin <= sqrt( f2*h2 ) <= safmax
+               s = conjg( g ) * ( f / sqrt( f2*h2 ) )
+            else
+               s = conjg( g ) * ( r / h2 )
+            end if
          else
-            d = sqrt( f2 )*sqrt( h2 )
+            ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow.
+            ! Moreover,
+            !  safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax,
+            !  sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax).
+            ! Also,
+            !  g2 >> f2, which means that h2 = g2.
+            d = sqrt( f2 * h2 )
+            c = f2 / d
+            if( c >= safmin ) then
+               r = f / c
+            else
+               ! f2 / sqrt(f2 * h2) < safmin, then
+               !  sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax
+               r = f * ( h2 / d )
+            end if
+            s = conjg( g ) * ( f / d )
          end if
-         p = 1 / d
-         c = f2*p
-         s = conjg( g )*( f*p )
-         r = f*( h2*p )
       else
 !
 !        Use scaled algorithm
 !
          u = min( safmax, max( safmin, f1, g1 ) )
-         uu = one / u
-         gs = g*uu
+         gs = g / u
          g2 = ABSSQ( gs )
-         if( f1*uu < rtmin ) then
+         if( f1 / u < rtmin ) then
 !
 !           f is not well-scaled when scaled by g1.
 !           Use a different scaling for f.
 !
             v = min( safmax, max( safmin, f1 ) )
-            vv = one / v
-            w = v * uu
-            fs = f*vv
+            w = v / u
+            fs = f / v
             f2 = ABSSQ( fs )
             h2 = f2*w**2 + g2
          else
 !
 !           Otherwise use the same scaling for f and g.
 !
             w = one
-            fs = f*uu
+            fs = f / u
             f2 = ABSSQ( fs )
             h2 = f2 + g2
          end if
-         if( f2 > rtmin .and. h2 < rtmax ) then
-            d = sqrt( f2*h2 )
+         ! safmin <= f2 <= h2 <= safmax 
+         if( f2 >= h2 * safmin ) then
+            ! safmin <= f2/h2 <= 1, and h2/f2 is finite
+            c = sqrt( f2 / h2 )
+            r = fs / c
+            rtmax = rtmax * 2
+            if( f2 > rtmin .and. h2 < rtmax ) then
+               ! safmin <= sqrt( f2*h2 ) <= safmax
+               s = conjg( gs ) * ( fs / sqrt( f2*h2 ) )
+            else
+               s = conjg( gs ) * ( r / h2 )
+            end if
          else
-            d = sqrt( f2 )*sqrt( h2 )
+            ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow.
+            ! Moreover,
+            !  safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax,
+            !  sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax).
+            ! Also,
+            !  g2 >> f2, which means that h2 = g2.
+            d = sqrt( f2 * h2 )
+            c = f2 / d
+            if( c >= safmin ) then
+               r = fs / c
+            else
+               ! f2 / sqrt(f2 * h2) < safmin, then
+               !  sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax
+               r = fs * ( h2 / d )
+            end if
+            s = conjg( gs ) * ( fs / d )
          end if
-         p = 1 / d
-         c = ( f2*p )*w
-         s = conjg( gs )*( fs*p )
-         r = ( fs*( h2*p ) )*u
+         ! Rescale c and r
+         c = c * w
+         r = r * u
       end if
    end if
    a = r