Skip to content

Commit

Permalink
[libclc] Move mad to the CLC library (#123607)
Browse files Browse the repository at this point in the history
All targets build `__clc_mad` -- even SPIR-V targets -- since it
compiles to the optimal `llvm.fmuladd` intrinsic. There is no change to
the bytecode generated for non-SPIR-V targets.

The `mix` builtin, which is implemented as a wrapper around `mad`, is
left as an OpenCL-layer wrapper of `__clc_mad`. I don't know if it's
worth having a specific CLC version of `mix`.

The changes to the other CLC files/functions move uses of `mad` to
`__clc_mad` and reformat the affected code. There is also one additional
instance of `trunc` becoming `__clc_trunc`, which was missed before.
  • Loading branch information
frasercrmck authored Jan 20, 2025
1 parent 8368018 commit c8eb865
Show file tree
Hide file tree
Showing 22 changed files with 2,014 additions and 1,840 deletions.
27 changes: 27 additions & 0 deletions libclc/clc/include/clc/clcmacro.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,33 @@
return BUILTIN(x); \
}

// Defines overloads of FUNCTION that forward their three arguments to BUILTIN:
// one overload for the scalar types as given, plus one for each of the vector
// widths 2, 3, 4, 8 and 16. The vector type names are formed by pasting the
// width onto RET_TYPE, ARG1_TYPE, ARG2_TYPE and ARG3_TYPE (e.g. RET_TYPE##4).
//
// RET_TYPE  - scalar return type name
// FUNCTION  - name of the overload set being defined
// BUILTIN   - callee invoked as BUILTIN(x, y, z) in every overload
// ARGn_TYPE - scalar type name of the n-th parameter
#define _CLC_DEFINE_TERNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, \
ARG2_TYPE, ARG3_TYPE) \
_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y, \
ARG3_TYPE z) { \
return BUILTIN(x, y, z); \
} \
_CLC_DEF _CLC_OVERLOAD RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \
ARG3_TYPE##2 z) { \
return BUILTIN(x, y, z); \
} \
_CLC_DEF _CLC_OVERLOAD RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, \
ARG3_TYPE##3 z) { \
return BUILTIN(x, y, z); \
} \
_CLC_DEF _CLC_OVERLOAD RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, \
ARG3_TYPE##4 z) { \
return BUILTIN(x, y, z); \
} \
_CLC_DEF _CLC_OVERLOAD RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, \
ARG3_TYPE##8 z) { \
return BUILTIN(x, y, z); \
} \
_CLC_DEF _CLC_OVERLOAD RET_TYPE##16 FUNCTION( \
ARG1_TYPE##16 x, ARG2_TYPE##16 y, ARG3_TYPE##16 z) { \
return BUILTIN(x, y, z); \
}

#ifdef cl_khr_fp16

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
Expand Down
12 changes: 12 additions & 0 deletions libclc/clc/include/clc/math/clc_mad.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#ifndef __CLC_MATH_CLC_MAD_H__
#define __CLC_MATH_CLC_MAD_H__

// Declares the __clc_mad overloads. __CLC_BODY names the three-operand
// declaration template and __CLC_FUNCTION the function it declares; both are
// consumed by gentype.inc (presumably instantiating the body once per
// supported floating-point type -- confirm in gentype.inc).
#define __CLC_BODY <clc/math/ternary_decl.inc>
#define __CLC_FUNCTION __clc_mad

#include <clc/math/gentype.inc>

// Clean up so the helper macros do not leak into the including file.
#undef __CLC_BODY
#undef __CLC_FUNCTION

#endif // __CLC_MATH_CLC_MAD_H__
3 changes: 3 additions & 0 deletions libclc/clc/include/clc/math/ternary_decl.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Declaration template for a three-operand function: declares an overload of
// __CLC_FUNCTION that takes three __CLC_GENTYPE values and returns a
// __CLC_GENTYPE. Neither macro is defined here; the including file must
// provide __CLC_FUNCTION and __CLC_GENTYPE before this file is included.
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
__CLC_GENTYPE b,
__CLC_GENTYPE c);
1 change: 1 addition & 0 deletions libclc/clc/lib/clspv/SOURCES
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
../generic/math/clc_ceil.cl
../generic/math/clc_fabs.cl
../generic/math/clc_floor.cl
../generic/math/clc_mad.cl
../generic/math/clc_rint.cl
../generic/math/clc_trunc.cl
../generic/shared/clc_clamp.cl
1 change: 1 addition & 0 deletions libclc/clc/lib/generic/SOURCES
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ integer/clc_abs_diff.cl
math/clc_ceil.cl
math/clc_fabs.cl
math/clc_floor.cl
math/clc_mad.cl
math/clc_rint.cl
math/clc_trunc.cl
relational/clc_all.cl
Expand Down
4 changes: 4 additions & 0 deletions libclc/clc/lib/generic/math/clc_mad.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#include <clc/internal/clc.h>

#define __CLC_BODY <clc_mad.inc>
#include <clc/math/gentype.inc>
5 changes: 5 additions & 0 deletions libclc/clc/lib/generic/math/clc_mad.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Multiply-add for the current __CLC_GENTYPE: returns a * b + c.
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_mad(__CLC_GENTYPE a, __CLC_GENTYPE b,
__CLC_GENTYPE c) {
// Allow the compiler to contract the multiply and the add into one fused
// operation. Contraction only applies within a single expression, so the
// return statement below must stay as one expression.
#pragma OPENCL FP_CONTRACT ON
return a * b + c;
}
1 change: 1 addition & 0 deletions libclc/clc/lib/spirv/SOURCES
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
../generic/math/clc_ceil.cl
../generic/math/clc_fabs.cl
../generic/math/clc_floor.cl
../generic/math/clc_mad.cl
../generic/math/clc_rint.cl
../generic/math/clc_trunc.cl
../generic/shared/clc_clamp.cl
1 change: 1 addition & 0 deletions libclc/clc/lib/spirv64/SOURCES
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
../generic/math/clc_ceil.cl
../generic/math/clc_fabs.cl
../generic/math/clc_floor.cl
../generic/math/clc_mad.cl
../generic/math/clc_rint.cl
../generic/math/clc_trunc.cl
../generic/shared/clc_clamp.cl
1 change: 0 additions & 1 deletion libclc/generic/include/clc/math/ternary_decl.inc

This file was deleted.

1 change: 1 addition & 0 deletions libclc/generic/lib/common/mix.cl
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <clc/clc.h>
#include <clc/math/clc_mad.h>

#define __CLC_BODY <mix.inc>
#include <clc/math/gentype.inc>
10 changes: 6 additions & 4 deletions libclc/generic/lib/common/mix.inc
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE a) {
return mad( y - x, a, x );
// Linear blend of x and y by a: evaluates (y - x) * a + x through __clc_mad,
// so a == 0 selects x and a == 1 selects y.
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y,
__CLC_GENTYPE a) {
return __clc_mad(y - x, a, x);
}

#ifndef __CLC_SCALAR
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_SCALAR_GENTYPE a) {
return mix(x, y, (__CLC_GENTYPE)a);
// Overload taking a scalar blend factor: converts a to __CLC_GENTYPE with a
// cast and forwards to the all-__CLC_GENTYPE overload of mix. Only compiled
// when __CLC_SCALAR is not defined (see the enclosing #ifndef).
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y,
__CLC_SCALAR_GENTYPE a) {
return mix(x, y, (__CLC_GENTYPE)a);
}
#endif
188 changes: 100 additions & 88 deletions libclc/generic/lib/math/clc_exp10.cl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#include <clc/clc.h>
#include <clc/clcmacro.h>
#include <clc/math/clc_mad.h>
#include <clc/relational/clc_isnan.h>

#include "config.h"
Expand Down Expand Up @@ -53,98 +54,109 @@
//
// e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )

_CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x)
{
const float X_MAX = 0x1.344134p+5f; // 128*log2/log10 : 38.53183944498959
const float X_MIN = -0x1.66d3e8p+5f; // -149*log2/log10 : -44.8534693539332

const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f; // 64*log10/log2 : 212.6033980727912
const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f; // log2/(64 * log10) lead : 0.004699707
const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f; // log2/(64 * log10) tail : 0.00000388665057
const float R_LN10 = 0x1.26bb1cp+1f;

int return_nan = __clc_isnan(x);
int return_inf = x > X_MAX;
int return_zero = x < X_MIN;

int n = convert_int(x * R_64_BY_LOG10_2);

float fn = (float)n;
int j = n & 0x3f;
int m = n >> 6;
int m2 = m << EXPSHIFTBITS_SP32;
float r;

r = R_LN10 * mad(fn, -R_LOG10_2_BY_64_TL, mad(fn, -R_LOG10_2_BY_64_LD, x));

// Truncated Taylor series for e^r
float z2 = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);

float two_to_jby64 = USE_TABLE(exp_tbl, j);
z2 = mad(two_to_jby64, z2, two_to_jby64);

float z2s = z2 * as_float(0x1 << (m + 149));
float z2n = as_float(as_int(z2) + m2);
z2 = m <= -126 ? z2s : z2n;


z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2;
z2 = return_zero ? 0.0f : z2;
z2 = return_nan ? x : z2;
return z2;
// Single-precision 10^x.
//
// Outline, as implemented below:
//   n = convert_int(x * R_64_BY_LOG10_2);  j = n & 0x3f;  m = n >> 6
//   r = R_LN10 * (x - n * R_LOG10_2_BY_64_LD - n * R_LOG10_2_BY_64_TL)
//   result = exp_tbl[j] * (1 + poly(r)), then rescaled using m
// Inputs greater than X_MAX yield the PINFBITPATT_SP32 bit pattern, inputs
// less than X_MIN yield 0.0f, and a NaN input is returned unchanged.
_CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x) {
// 128*log2/log10 : 38.53183944498959
const float X_MAX = 0x1.344134p+5f;
// -149*log2/log10 : -44.8534693539332
const float X_MIN = -0x1.66d3e8p+5f;
// 64*log10/log2 : 212.6033980727912
const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f;
// log2/(64 * log10) lead : 0.004699707
const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f;
// log2/(64 * log10) tail : 0.00000388665057
const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f;
// NOTE(review): value is approximately 2.3026, i.e. presumably ln(10).
const float R_LN10 = 0x1.26bb1cp+1f;

// Special-case flags, evaluated up front and applied after the main
// computation.
int return_nan = __clc_isnan(x);
int return_inf = x > X_MAX;
int return_zero = x < X_MIN;

int n = convert_int(x * R_64_BY_LOG10_2);

float fn = (float)n;
// Low 6 bits of n index the table; the remaining bits form the exponent m.
int j = n & 0x3f;
int m = n >> 6;
// m shifted into position for the integer add on the bit pattern below
// (EXPSHIFTBITS_SP32 is defined elsewhere; presumably the float exponent
// field offset).
int m2 = m << EXPSHIFTBITS_SP32;
float r;

// Reduced argument: subtract n times the lead part first, then n times the
// tail part, and scale the remainder by R_LN10.
r = R_LN10 *
__clc_mad(fn, -R_LOG10_2_BY_64_TL, __clc_mad(fn, -R_LOG10_2_BY_64_LD, x));

// Truncated Taylor series for e^r
// Evaluates r + r*r*(1/2 + r*(c2 + r*c1)) with c2 = 0x1.555556p-3f and
// c1 = 0x1.555556p-5f, i.e. the series without its leading 1.
float z2 = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f),
r, 0x1.000000p-1f),
r * r, r);

// Table entry for index j (presumably 2^(j/64), going by the names).
// t * z2 + t == t * (1 + z2) restores the leading 1 dropped above.
float two_to_jby64 = USE_TABLE(exp_tbl, j);
z2 = __clc_mad(two_to_jby64, z2, two_to_jby64);

// Two ways of applying the exponent m:
//  z2s multiplies by the float whose bit pattern is 1 << (m + 149);
//  z2n adds m2 directly to the bit pattern of z2.
// The multiply form is selected when m <= -126.
float z2s = z2 * as_float(0x1 << (m + 149));
float z2n = as_float(as_int(z2) + m2);
z2 = m <= -126 ? z2s : z2n;

// Apply the special cases; the NaN select comes last so it overrides the
// other two.
z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2;
z2 = return_zero ? 0.0f : z2;
z2 = return_nan ? x : z2;
return z2;
}
// NOTE(review): presumably generates the float vector overloads from the
// scalar one above -- the macro is defined outside this view.
_CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_exp10, float)

#ifdef cl_khr_fp64
_CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x)
{
const double X_MAX = 0x1.34413509f79ffp+8; // 1024*ln(2)/ln(10)
const double X_MIN = -0x1.434e6420f4374p+8; // -1074*ln(2)/ln(10)

const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7; // 64*ln(10)/ln(2)
const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8; // head ln(2)/(64*ln(10))
const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37; // tail ln(2)/(64*ln(10))
const double R_LN10 = 0x1.26bb1bbb55516p+1; // ln(10)

int n = convert_int(x * R_64_BY_LOG10_2);

double dn = (double)n;

int j = n & 0x3f;
int m = n >> 6;

double r = R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));

// 6 term tail of Taylor expansion of e^r
double z2 = r * fma(r,
fma(r,
fma(r,
fma(r,
fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
0x1.5555555555555p-5),
0x1.5555555555555p-3),
0x1.0000000000000p-1),
1.0);

double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;

int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));

int n1 = m >> 2;
int n2 = m-n1;
double z3= z2 * as_double(((long)n1 + 1023) << 52);
z3 *= as_double(((long)n2 + 1023) << 52);

z2 = ldexp(z2, m);
z2 = small_value ? z3: z2;

z2 = __clc_isnan(x) ? x : z2;

z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2;
z2 = x < X_MIN ? 0.0 : z2;

return z2;
// Double-precision 10^x.
//
// Outline, as implemented below:
//   n = convert_int(x * R_64_BY_LOG10_2);  j = n & 0x3f;  m = n >> 6
//   r = R_LN10 * (x - n * R_LOG10_2_BY_64_LD - n * R_LOG10_2_BY_64_TL)
//   result = table[j] * (1 + poly(r)), then scaled by m
// Inputs greater than X_MAX yield the PINFBITPATT_DP64 bit pattern, inputs
// less than X_MIN yield 0.0, and a NaN input is returned unchanged.
_CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x) {
// 1024*ln(2)/ln(10)
const double X_MAX = 0x1.34413509f79ffp+8;
// -1074*ln(2)/ln(10)
const double X_MIN = -0x1.434e6420f4374p+8;
// 64*ln(10)/ln(2)
const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7;
// head ln(2)/(64*ln(10))
const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8;
// tail ln(2)/(64*ln(10))
const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37;
// ln(10)
const double R_LN10 = 0x1.26bb1bbb55516p+1;

int n = convert_int(x * R_64_BY_LOG10_2);

double dn = (double)n;

// Low 6 bits of n index the table; the remaining bits form the exponent m.
int j = n & 0x3f;
int m = n >> 6;

// Reduced argument: subtract n times the head part first, then n times the
// tail part, and scale the remainder by R_LN10.
double r =
R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));

// 6 term tail of Taylor expansion of e^r
// Nested (Horner-style) evaluation; the outer multiply by r means the
// result omits the series' leading 1.
double z2 =
r *
fma(r,
fma(r,
fma(r,
fma(r, fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
0x1.5555555555555p-5),
0x1.5555555555555p-3),
0x1.0000000000000p-1),
1.0);

// Table entry for index j as a pair of doubles (presumably the head s0 and
// tail s1 of 2^(j/64) -- confirm against the table definition).
// (s0 + s1) * z2 + s1 + s0 == (s0 + s1) * (1 + z2).
double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;

// Results with m below -1022, or at -1022 with z2 < 1.0, take the two-step
// scaling path (z3) instead of ldexp.
int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));

// Split m into n1 + n2 and multiply by two doubles whose bit patterns hold
// only a biased exponent (n + 1023) shifted to bit 52.
int n1 = m >> 2;
int n2 = m - n1;
double z3 = z2 * as_double(((long)n1 + 1023) << 52);
z3 *= as_double(((long)n2 + 1023) << 52);

z2 = ldexp(z2, m);
z2 = small_value ? z3 : z2;

// NaN input is passed through; the range comparisons that follow are false
// for NaN, so they leave that result in place.
z2 = __clc_isnan(x) ? x : z2;

z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2;
z2 = x < X_MIN ? 0.0 : z2;

return z2;
}
// NOTE(review): presumably generates the double vector overloads from the
// scalar one above -- the macro is defined outside this view.
_CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_exp10, double)
#endif
3 changes: 2 additions & 1 deletion libclc/generic/lib/math/clc_hypot.cl
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <clc/clc.h>
#include <clc/clcmacro.h>
#include <clc/integer/clc_abs.h>
#include <clc/math/clc_mad.h>
#include <clc/relational/clc_isnan.h>
#include <clc/shared/clc_clamp.h>
#include <math/clc_hypot.h>
Expand All @@ -48,7 +49,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) {
float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
float fx = as_float(ux) * fi_exp;
float fy = as_float(uy) * fi_exp;
retval = sqrt(mad(fx, fx, fy * fy)) * fx_exp;
retval = sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;

retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval;
retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32
Expand Down
Loading

0 comments on commit c8eb865

Please sign in to comment.