[libclc] Move mad to the CLC library (#123607)

All targets build `__clc_mad` -- even SPIR-V targets -- since it compiles to the optimal `llvm.fmuladd` intrinsic. There is no change to the bytecode generated for non-SPIR-V targets. The `mix` builtin, which is implemented as a wrapper around `mad`, is left as an OpenCL-layer wrapper of `__clc_mad`; I don't know if it's worth having a specific CLC version of `mix`. The changes to the other CLC files/functions move uses of `mad` to `__clc_mad` and reformat the code. Additionally, one use of `trunc` that was missed before now becomes `__clc_trunc`.
intel · Jan 20, 2025 · c8eb865 · c8eb865
1 parent 8368018
commit c8eb865
Show file tree

Hide file tree

Showing 22 changed files with 2,014 additions and 1,840 deletions.
diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h
@@ -184,6 +184,33 @@
     return BUILTIN(x);                                                         \
   }
 
+#define _CLC_DEFINE_TERNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE,    \
+                                    ARG2_TYPE, ARG3_TYPE)                      \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y,           \
+                                           ARG3_TYPE z) {                      \
+    return BUILTIN(x, y, z);                                                   \
+  } /* scalar overload */                                                      \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y,  \
+                                              ARG3_TYPE##2 z) {                \
+    return BUILTIN(x, y, z);                                                   \
+  } /* TYPE##2 overload */                                                     \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y,  \
+                                              ARG3_TYPE##3 z) {                \
+    return BUILTIN(x, y, z);                                                   \
+  } /* TYPE##3 overload */                                                     \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y,  \
+                                              ARG3_TYPE##4 z) {                \
+    return BUILTIN(x, y, z);                                                   \
+  } /* TYPE##4 overload */                                                     \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y,  \
+                                              ARG3_TYPE##8 z) {                \
+    return BUILTIN(x, y, z);                                                   \
+  } /* TYPE##8 overload */                                                     \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##16 FUNCTION(                                \
+      ARG1_TYPE##16 x, ARG2_TYPE##16 y, ARG3_TYPE##16 z) {                     \
+    return BUILTIN(x, y, z);                                                   \
+  } /* TYPE##16 overload; every overload forwards x, y, z to BUILTIN */
+
 #ifdef cl_khr_fp16
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

diff --git a/libclc/clc/include/clc/math/clc_mad.h b/libclc/clc/include/clc/math/clc_mad.h
@@ -0,0 +1,12 @@
+#ifndef __CLC_MATH_CLC_MAD_H__
+#define __CLC_MATH_CLC_MAD_H__
+
+#define __CLC_BODY <clc/math/ternary_decl.inc> // three-operand prototype
+#define __CLC_FUNCTION __clc_mad // name substituted into that prototype
+
+#include <clc/math/gentype.inc> // TODO confirm: expands __CLC_BODY per type
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_MAD_H__
diff --git a/libclc/clc/include/clc/math/ternary_decl.inc b/libclc/clc/include/clc/math/ternary_decl.inc
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
+                                                     __CLC_GENTYPE b,
+                                                     __CLC_GENTYPE c); // prototype only
diff --git a/libclc/clc/lib/clspv/SOURCES b/libclc/clc/lib/clspv/SOURCES
@@ -1,6 +1,7 @@
 ../generic/math/clc_ceil.cl
 ../generic/math/clc_fabs.cl
 ../generic/math/clc_floor.cl
+../generic/math/clc_mad.cl
 ../generic/math/clc_rint.cl
 ../generic/math/clc_trunc.cl
 ../generic/shared/clc_clamp.cl
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
@@ -7,6 +7,7 @@ integer/clc_abs_diff.cl
 math/clc_ceil.cl
 math/clc_fabs.cl
 math/clc_floor.cl
+math/clc_mad.cl
 math/clc_rint.cl
 math/clc_trunc.cl
 relational/clc_all.cl

diff --git a/libclc/clc/lib/generic/math/clc_mad.cl b/libclc/clc/lib/generic/math/clc_mad.cl
@@ -0,0 +1,4 @@
+#include <clc/internal/clc.h>
+
+#define __CLC_BODY <clc_mad.inc> // body that defines __clc_mad
+#include <clc/math/gentype.inc> // TODO confirm: expands __CLC_BODY per type
diff --git a/libclc/clc/lib/generic/math/clc_mad.inc b/libclc/clc/lib/generic/math/clc_mad.inc
@@ -0,0 +1,5 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_mad(__CLC_GENTYPE a, __CLC_GENTYPE b,
+                                               __CLC_GENTYPE c) {
+#pragma OPENCL FP_CONTRACT ON
+  return a * b + c; // contraction enabled above; meant to become llvm.fmuladd
+}
diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES
@@ -5,6 +5,7 @@
 ../generic/math/clc_ceil.cl
 ../generic/math/clc_fabs.cl
 ../generic/math/clc_floor.cl
+../generic/math/clc_mad.cl
 ../generic/math/clc_rint.cl
 ../generic/math/clc_trunc.cl
 ../generic/shared/clc_clamp.cl
diff --git a/libclc/clc/lib/spirv64/SOURCES b/libclc/clc/lib/spirv64/SOURCES
@@ -5,6 +5,7 @@
 ../generic/math/clc_ceil.cl
 ../generic/math/clc_fabs.cl
 ../generic/math/clc_floor.cl
+../generic/math/clc_mad.cl
 ../generic/math/clc_rint.cl
 ../generic/math/clc_trunc.cl
 ../generic/shared/clc_clamp.cl
diff --git a/libclc/generic/include/clc/math/ternary_decl.inc b/libclc/generic/include/clc/math/ternary_decl.inc
diff --git a/libclc/generic/lib/common/mix.cl b/libclc/generic/lib/common/mix.cl
@@ -1,4 +1,5 @@
 #include <clc/clc.h>
+#include <clc/math/clc_mad.h>
 
 #define __CLC_BODY <mix.inc>
 #include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/common/mix.inc b/libclc/generic/lib/common/mix.inc
@@ -1,9 +1,11 @@
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE a) {
-  return mad( y - x, a, x );
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y,
+                                         __CLC_GENTYPE a) {
+  return __clc_mad(y - x, a, x); // (y - x) * a + x
 }
 
 #ifndef __CLC_SCALAR
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_SCALAR_GENTYPE a) {
-    return mix(x, y, (__CLC_GENTYPE)a);
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y,
+                                         __CLC_SCALAR_GENTYPE a) {
+  return mix(x, y, (__CLC_GENTYPE)a); // cast a, then reuse the overload above
 }
 #endif
diff --git a/libclc/generic/lib/math/clc_exp10.cl b/libclc/generic/lib/math/clc_exp10.cl
@@ -22,6 +22,7 @@
 
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
+#include <clc/math/clc_mad.h>
 #include <clc/relational/clc_isnan.h>
 
 #include "config.h"
@@ -53,98 +54,109 @@
 //
 //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
 
-_CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x)
-{
-    const float X_MAX =  0x1.344134p+5f; // 128*log2/log10 : 38.53183944498959
-    const float X_MIN = -0x1.66d3e8p+5f; // -149*log2/log10 : -44.8534693539332
-
-    const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f; // 64*log10/log2 : 212.6033980727912
-    const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f; // log2/(64 * log10) lead : 0.004699707
-    const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f; // log2/(64 * log10) tail : 0.00000388665057
-    const float R_LN10 = 0x1.26bb1cp+1f;
-
-    int return_nan = __clc_isnan(x);
-    int return_inf = x > X_MAX;
-    int return_zero = x < X_MIN;
-
-    int n = convert_int(x * R_64_BY_LOG10_2);
-
-    float fn = (float)n;
-    int j = n & 0x3f;
-    int m = n >> 6;
-    int m2 = m << EXPSHIFTBITS_SP32;
-    float r;
-
-    r = R_LN10 * mad(fn, -R_LOG10_2_BY_64_TL, mad(fn, -R_LOG10_2_BY_64_LD, x));
-
-    // Truncated Taylor series for e^r
-    float z2 = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
-
-    float two_to_jby64 = USE_TABLE(exp_tbl, j);
-    z2 = mad(two_to_jby64, z2, two_to_jby64);
-
-    float z2s = z2 * as_float(0x1 << (m + 149));
-    float z2n = as_float(as_int(z2) + m2);
-    z2 = m <= -126 ? z2s : z2n;
-
-
-    z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2;
-    z2 = return_zero ? 0.0f : z2;
-    z2 = return_nan ? x : z2;
-    return z2;
+_CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x) { // float overload
+  // 128*log2/log10 : 38.53183944498959
+  const float X_MAX = 0x1.344134p+5f;
+  // -149*log2/log10 : -44.8534693539332
+  const float X_MIN = -0x1.66d3e8p+5f;
+  // 64*log10/log2 : 212.6033980727912
+  const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f;
+  // log2/(64 * log10) lead : 0.004699707
+  const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f;
+  // log2/(64 * log10) tail : 0.00000388665057
+  const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f;
+  const float R_LN10 = 0x1.26bb1cp+1f; // ln(10)
+
+  int return_nan = __clc_isnan(x);
+  int return_inf = x > X_MAX;
+  int return_zero = x < X_MIN;
+
+  int n = convert_int(x * R_64_BY_LOG10_2);
+
+  float fn = (float)n;
+  int j = n & 0x3f; // low 6 bits of n: index into exp_tbl
+  int m = n >> 6; // remaining high bits of n
+  int m2 = m << EXPSHIFTBITS_SP32; // m shifted for the integer add in z2n
+  float r;
+
+  r = R_LN10 *
+      __clc_mad(fn, -R_LOG10_2_BY_64_TL, __clc_mad(fn, -R_LOG10_2_BY_64_LD, x));
+
+  // Truncated Taylor series for e^r
+  float z2 = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f),
+                                 r, 0x1.000000p-1f),
+                       r * r, r);
+
+  float two_to_jby64 = USE_TABLE(exp_tbl, j);
+  z2 = __clc_mad(two_to_jby64, z2, two_to_jby64);
+
+  float z2s = z2 * as_float(0x1 << (m + 149)); // scale by a multiply
+  float z2n = as_float(as_int(z2) + m2); // scale by adding m2 to the bits
+  z2 = m <= -126 ? z2s : z2n; // multiply path only when m <= -126
+
+  z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2; // x > X_MAX
+  z2 = return_zero ? 0.0f : z2; // x < X_MIN
+  z2 = return_nan ? x : z2; // NaN input is returned as-is
+  return z2;
 }
 _CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_exp10, float)
 
 #ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x)
-{
-    const double X_MAX = 0x1.34413509f79ffp+8; // 1024*ln(2)/ln(10)
-    const double X_MIN = -0x1.434e6420f4374p+8; // -1074*ln(2)/ln(10)
-
-    const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7; // 64*ln(10)/ln(2)
-    const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8; // head ln(2)/(64*ln(10))
-    const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37; // tail ln(2)/(64*ln(10))
-    const double R_LN10 = 0x1.26bb1bbb55516p+1; // ln(10)
-
-    int n = convert_int(x * R_64_BY_LOG10_2);
-
-    double dn = (double)n;
-
-    int j = n & 0x3f;
-    int m = n >> 6;
-
-    double r = R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));
-
-    // 6 term tail of Taylor expansion of e^r
-    double z2 = r * fma(r,
-	                fma(r,
-		            fma(r,
-			        fma(r,
-			            fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
-			            0x1.5555555555555p-5),
-			        0x1.5555555555555p-3),
-		            0x1.0000000000000p-1),
-		        1.0);
-
-    double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
-    z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;
-
-    int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));
-
-	int n1 = m >> 2;
-	int n2 = m-n1;
-	double z3= z2 * as_double(((long)n1 + 1023) << 52);
-	z3 *= as_double(((long)n2 + 1023) << 52);
-
-    z2 = ldexp(z2, m);
-    z2 = small_value ? z3: z2;
-
-    z2 = __clc_isnan(x) ? x : z2;
-
-    z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2;
-    z2 = x < X_MIN ? 0.0 : z2;
-
-    return z2;
+_CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x) { // double overload
+  // 1024*ln(2)/ln(10)
+  const double X_MAX = 0x1.34413509f79ffp+8;
+  // -1074*ln(2)/ln(10)
+  const double X_MIN = -0x1.434e6420f4374p+8;
+  // 64*ln(10)/ln(2)
+  const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7;
+  // head ln(2)/(64*ln(10))
+  const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8;
+  // tail ln(2)/(64*ln(10))
+  const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37;
+  // ln(10)
+  const double R_LN10 = 0x1.26bb1bbb55516p+1;
+
+  int n = convert_int(x * R_64_BY_LOG10_2);
+
+  double dn = (double)n;
+
+  int j = n & 0x3f; // low 6 bits of n: index into two_to_jby64_ep_tbl
+  int m = n >> 6; // remaining high bits of n; exponent passed to ldexp below
+
+  double r =
+      R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));
+
+  // 6 term tail of Taylor expansion of e^r
+  double z2 =
+      r *
+      fma(r,
+          fma(r,
+              fma(r,
+                  fma(r, fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
+                      0x1.5555555555555p-5),
+                  0x1.5555555555555p-3),
+              0x1.0000000000000p-1),
+          1.0);
+
+  double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+  z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;
+
+  int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));
+
+  int n1 = m >> 2;
+  int n2 = m - n1; // n1 + n2 == m: scaling is split across two multiplies
+  double z3 = z2 * as_double(((long)n1 + 1023) << 52);
+  z3 *= as_double(((long)n2 + 1023) << 52);
+
+  z2 = ldexp(z2, m); // direct scaling, kept unless small_value
+  z2 = small_value ? z3 : z2;
+
+  z2 = __clc_isnan(x) ? x : z2; // NaN input is returned as-is
+
+  z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2;
+  z2 = x < X_MIN ? 0.0 : z2;
+
+  return z2;
 }
 _CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_exp10, double)
 #endif
diff --git a/libclc/generic/lib/math/clc_hypot.cl b/libclc/generic/lib/math/clc_hypot.cl
@@ -23,6 +23,7 @@
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
 #include <clc/integer/clc_abs.h>
+#include <clc/math/clc_mad.h>
 #include <clc/relational/clc_isnan.h>
 #include <clc/shared/clc_clamp.h>
 #include <math/clc_hypot.h>
@@ -48,7 +49,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) {
   float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
   float fx = as_float(ux) * fi_exp;
   float fy = as_float(uy) * fi_exp;
-  retval = sqrt(mad(fx, fx, fy * fy)) * fx_exp;
+  retval = sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;
 
   retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval;
   retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32