
Commit

DoubangoTelecom committed Nov 15, 2019
1 parent 315e586 commit 9d711c3
Showing 11 changed files with 383 additions and 14 deletions.
@@ -0,0 +1,63 @@
/* Copyright (C) 2016-2019 Doubango Telecom <https://www.doubango.org>
* File author: Mamadou DIOP (Doubango Telecom, France).
* License: GPLv3. For commercial license please contact us.
* Source code: https://github.com/DoubangoTelecom/compv
* WebSite: http://compv.org
*/
#if !defined(_COMPV_BASE_MATH_ACTIVATION_FUNCTIONS_INTRIN_NEON64_H_)
#define _COMPV_BASE_MATH_ACTIVATION_FUNCTIONS_INTRIN_NEON64_H_

#include "compv/base/compv_config.h"
#include "compv/base/compv_common.h"

#if defined(_COMPV_API_H_)
#error("This is a private file and must not be part of the API")
#endif

#if COMPV_ARCH_ARM64 && COMPV_INTRINSIC

COMPV_NAMESPACE_BEGIN()

void CompVMathActivationFunctionsTanh_64f64f_Intrin_NEON64(
const compv_float64_t* lut_ptr,
const compv_uscalar_t& lut_length_minus1,
const compv_float64_t* scale1,
const compv_uscalar_t& in_out_length,
const compv_float64_t* in_ptr,
compv_float64_t* out_ptr
);

void CompVMathActivationFunctionsTanhMul_64f64f_Intrin_NEON64(
const compv_float64_t* lut_ptr,
const compv_uscalar_t& lut_length_minus1,
const compv_float64_t* scale1,
const compv_uscalar_t& in_out_length,
const compv_float64_t* in_ptr,
const compv_float64_t* mul_ptr,
compv_float64_t* out_ptr
);

void CompVMathActivationFunctionsLogistic_64f64f_Intrin_NEON64(
const compv_float64_t* lut_ptr,
const compv_uscalar_t& lut_length_minus1,
const compv_float64_t* scale1,
const compv_uscalar_t& in_out_length,
const compv_float64_t* in_ptr,
compv_float64_t* out_ptr
);

void CompVMathActivationFunctionsLogisticMul_64f64f_Intrin_NEON64(
const compv_float64_t* lut_ptr,
const compv_uscalar_t& lut_length_minus1,
const compv_float64_t* scale1,
const compv_uscalar_t& in_out_length,
const compv_float64_t* in_ptr,
const compv_float64_t* mul_ptr,
compv_float64_t* out_ptr
);

COMPV_NAMESPACE_END()

#endif /* COMPV_ARCH_ARM64 && COMPV_INTRINSIC */

#endif /* _COMPV_BASE_MATH_ACTIVATION_FUNCTIONS_INTRIN_NEON64_H_ */
17 changes: 13 additions & 4 deletions base/math/compv_math_activation_functions.cxx
@@ -10,6 +10,7 @@
#include "compv/base/math/intrin/x86/compv_math_activation_functions_intrin_sse41.h"
#include "compv/base/math/intrin/x86/compv_math_activation_functions_intrin_avx.h"
#include "compv/base/math/intrin/x86/compv_math_activation_functions_intrin_avx2.h"
#include "compv/base/math/intrin/arm/compv_math_activation_functions_intrin_neon64.h"

// More information about activation functions: https://towardsdatascience.com/activation-functions-neural-networks-1cbd9f8d91d6
// Some of these functions come from Tesseract and were adapted to make them SIMD-friendly and branchless
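For reference, the scheme the kernels in this commit implement can be written as a short scalar routine: scale |x| into LUT coordinates, truncate to get an index, linearly interpolate between two adjacent LUT entries, then restore the sign (saturating to +/-1 once the index passes the end of the table). The sketch below is illustrative only and not part of the commit; the parameter names mirror the kernel signatures, but the helper itself is an assumption.

static compv_float64_t ScalarTanhLUT_Sketch(
    const compv_float64_t* lut_ptr, const compv_uscalar_t lut_length_minus1,
    const compv_float64_t scale, const compv_float64_t x)
{
    // Illustrative scalar rendering of the LUT + linear-interpolation scheme (not CompV API)
    const compv_float64_t sign = (x < 0.0) ? -1.0 : 1.0;            // done branchlessly in the SIMD versions
    const compv_float64_t xs = (x * sign) * scale;                  // map |x| into LUT coordinates
    const compv_uscalar_t index = static_cast<compv_uscalar_t>(xs); // truncate, like vcvtq_u64_f64
    if (index >= lut_length_minus1) {
        return sign;                                                // saturated: tanh(x) ~ +/-1 for large |x|
    }
    const compv_float64_t frac = xs - static_cast<compv_float64_t>(index); // fractional part
    const compv_float64_t t0 = lut_ptr[index];
    const compv_float64_t t1 = lut_ptr[index + 1];                  // relies on the LUT being padded
    return (t0 + (t1 - t0) * frac) * sign;                          // interpolate, then restore sign
}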
@@ -45,7 +46,9 @@ COMPV_ERROR_CODE CompVMathActivationFunctions::tanh(const compv_float64_t* lut_p
COMPV_EXEC_IFDEF_INTRIN_X86((CompVMathActivationFunctionsTanh = CompVMathActivationFunctionsTanh_64f64f_Intrin_SSE41));
}
#elif COMPV_ARCH_ARM

if (COMPV_IS_ALIGNED(in_out_length, 2) && CompVCpu::isEnabled(kCpuFlagARM_NEON)) {
COMPV_EXEC_IFDEF_INTRIN_ARM64((CompVMathActivationFunctionsTanh = CompVMathActivationFunctionsTanh_64f64f_Intrin_NEON64));
}
#endif

CompVMathActivationFunctionsTanh(
@@ -73,7 +76,9 @@ COMPV_ERROR_CODE CompVMathActivationFunctions::tanhMul(const compv_float64_t* lu
COMPV_EXEC_IFDEF_INTRIN_X86((CompVMathActivationFunctionsTanhMul = CompVMathActivationFunctionsTanhMul_64f64f_Intrin_SSE41));
}
#elif COMPV_ARCH_ARM

if (COMPV_IS_ALIGNED(in_out_length, 2) && CompVCpu::isEnabled(kCpuFlagARM_NEON)) {
COMPV_EXEC_IFDEF_INTRIN_ARM64((CompVMathActivationFunctionsTanhMul = CompVMathActivationFunctionsTanhMul_64f64f_Intrin_NEON64));
}
#endif

CompVMathActivationFunctionsTanhMul(
@@ -101,7 +106,9 @@ COMPV_ERROR_CODE CompVMathActivationFunctions::logistic(const compv_float64_t* l
COMPV_EXEC_IFDEF_INTRIN_X86((CompVMathActivationFunctionsLogistic = CompVMathActivationFunctionsLogistic_64f64f_Intrin_SSE41));
}
#elif COMPV_ARCH_ARM

if (COMPV_IS_ALIGNED(in_out_length, 2) && CompVCpu::isEnabled(kCpuFlagARM_NEON)) {
COMPV_EXEC_IFDEF_INTRIN_ARM64((CompVMathActivationFunctionsLogistic = CompVMathActivationFunctionsLogistic_64f64f_Intrin_NEON64));
}
#endif

CompVMathActivationFunctionsLogistic(
@@ -129,7 +136,9 @@ COMPV_ERROR_CODE CompVMathActivationFunctions::logisticMul(const compv_float64_t
COMPV_EXEC_IFDEF_INTRIN_X86((CompVMathActivationFunctionsLogisticMul = CompVMathActivationFunctionsLogisticMul_64f64f_Intrin_SSE41));
}
#elif COMPV_ARCH_ARM

if (COMPV_IS_ALIGNED(in_out_length, 2) && CompVCpu::isEnabled(kCpuFlagARM_NEON)) {
COMPV_EXEC_IFDEF_INTRIN_ARM64((CompVMathActivationFunctionsLogisticMul = CompVMathActivationFunctionsLogisticMul_64f64f_Intrin_NEON64));
}
#endif

CompVMathActivationFunctionsLogisticMul(
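All four hunks above follow the same dispatch pattern: a function pointer starts out pointing at the portable implementation and is overridden with the NEON64 kernel only when the CPU reports NEON and in_out_length satisfies the kernel's two-doubles-per-iteration contract. A minimal, self-contained sketch of that pattern with generic placeholder names (not CompV API):

#include <cstddef>

typedef void (*ActivationFunc)(const double* in, double* out, std::size_t n);

static void activation_portable(const double* in, double* out, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) out[i] = in[i]; // placeholder body
}
static void activation_simd(const double* in, double* out, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) out[i] = in[i]; // placeholder body (a real kernel processes 2 doubles per iteration)
}

static void activation_dispatch(const double* in, double* out, std::size_t n, bool neon_enabled) {
    ActivationFunc fn = activation_portable;  // always-valid fallback
    if (neon_enabled && (n % 2) == 0) {       // mirrors COMPV_IS_ALIGNED(in_out_length, 2) && CompVCpu::isEnabled(kCpuFlagARM_NEON)
        fn = activation_simd;                 // override only when the SIMD preconditions hold
    }
    fn(in, out, n);
}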
241 changes: 241 additions & 0 deletions base/math/intrin/arm/compv_math_activation_functions_intrin_neon64.cxx
@@ -0,0 +1,241 @@
/* Copyright (C) 2016-2019 Doubango Telecom <https://www.doubango.org>
* File author: Mamadou DIOP (Doubango Telecom, France).
* License: GPLv3. For commercial license please contact us.
* Source code: https://github.com/DoubangoTelecom/compv
* WebSite: http://compv.org
*/
#include "compv/base/math/intrin/arm/compv_math_activation_functions_intrin_neon64.h"

#if COMPV_ARCH_ARM64 && COMPV_INTRINSIC
#include "compv/base/compv_debug.h"
#include "compv/base/intrin/arm/compv_intrin_neon.h"

COMPV_NAMESPACE_BEGIN()

// This is an internal function
// - Up to the caller to make sure LUT is padded with at least #1 double to allow reading beyond valid data
// - Make sure in_out_length is a multiple of 2
void CompVMathActivationFunctionsTanh_64f64f_Intrin_NEON64(
const compv_float64_t* lut_ptr,
const compv_uscalar_t& lut_length_minus1,
const compv_float64_t* scale1,
const compv_uscalar_t& in_out_length,
const compv_float64_t* in_ptr,
compv_float64_t* out_ptr
)
{
COMPV_DEBUG_INFO_CHECK_NEON();
const float64x2_t vecMinus1 = vdupq_n_f64(-1.0);
const float64x2_t vecPlus1 = vdupq_n_f64(1.0);
const float64x2_t vecScale = vdupq_n_f64(*scale1);
const float64x2_t vecZero = vdupq_n_f64(0.0);
const uint32x2_t vecLut_length_minus1 = vdup_n_u32(static_cast<uint32_t>(lut_length_minus1));
for (compv_uscalar_t i = 0; i < in_out_length; i += 2) {
float64x2_t vecX = vld1q_f64(&in_ptr[i]);
float64x2_t vecSign = vcltq_f64(vecX, vecZero);
vecSign = vorrq_s64(vandq_s64(vecSign, vecMinus1), vbicq_s64(vecPlus1, vecSign));
vecX = vmulq_f64(vmulq_f64(vecX, vecSign), vecScale);
uint32x2_t vecIndex = vqmovn_u64(vcvtq_u64_f64(vecX));
uint32x2_t vecIndexMask = vclt_u32(vecIndex, vecLut_length_minus1);
if (COMPV_ARM_NEON_NEQ_ZEROD(vecIndexMask)) {
const float64x2_t vecIndexRounded = vrndmq_f64(vecX);
vecIndex = vmin_u32(vecIndex, vecLut_length_minus1); // Clip indices to avoid reading beyond valid data. LUT should be padded with at least #1 double
const uint32x2x2_t vecIndexMask32x2x2 = vzip_u32(vecIndexMask, vecIndexMask); // Duplicate the two 32-bit mask lanes (first step of widening them to 64-bit masks)
const uint64x2_t vecIndexMask64x2 = vcombine_s32(vecIndexMask32x2x2.val[0], vecIndexMask32x2x2.val[1]); // _mm_unpacklo_epi32(vecIndexMask, vecIndexMask)
const float64x1x2_t vecLUT0 = vld2_f64(lut_ptr + vget_lane_u32(vecIndex, 0));
const float64x1x2_t vecLUT1 = vld2_f64(lut_ptr + vget_lane_u32(vecIndex, 1));
const float64x2_t vecTanh_i0 = vcombine_f64(vecLUT0.val[0], vecLUT1.val[0]);
const float64x2_t vecTanh_i1 = vcombine_f64(vecLUT0.val[1], vecLUT1.val[1]);
const float64x2_t vecResult = vmulq_f64(
vfmaq_f64(vecTanh_i0, vsubq_f64(vecTanh_i1, vecTanh_i0), vsubq_f64(vecX, vecIndexRounded)),
vecSign
);
vst1q_f64(&out_ptr[i],
vorrq_s64(vandq_s64(vecResult, vecIndexMask64x2), vbicq_s64(vecSign, vecIndexMask64x2))
);
}
else {
vst1q_f64(&out_ptr[i], vecSign);
}
}
}
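A hedged caller-side sketch of the two preconditions stated above this kernel: the LUT is built with one extra (padding) double so that lut_ptr[index + 1] stays readable even for clipped indices, and in_out_length is kept even. The LUT resolution, the scale mapping and the helper name are illustrative assumptions, not values taken from CompV:

#include <cmath>
#include <vector>

static void CallTanhNEON64_Sketch(const compv_float64_t* in_ptr, compv_float64_t* out_ptr,
    const compv_uscalar_t in_out_length /* caller guarantees this is a multiple of 2 */)
{
    const compv_uscalar_t lut_length = 4096;                                        // illustrative resolution
    const compv_float64_t scale = static_cast<compv_float64_t>(lut_length) / 8.0;   // illustrative: |x| in [0, 8) maps onto the table
    std::vector<compv_float64_t> lut(lut_length + 1);                               // +1 padding entry so lut[index + 1] is always readable
    for (compv_uscalar_t i = 0; i <= lut_length; ++i) {
        lut[i] = std::tanh(static_cast<compv_float64_t>(i) / scale);
    }
    CompVMathActivationFunctionsTanh_64f64f_Intrin_NEON64(
        lut.data(), lut_length - 1, &scale, in_out_length, in_ptr, out_ptr);
}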

// This is an internal function
// - Up to the caller to make sure LUT is padded with at least #1 double to allow reading beyond valid data
// - Make sure in_out_length is a multiple of 2
void CompVMathActivationFunctionsTanhMul_64f64f_Intrin_NEON64(
const compv_float64_t* lut_ptr,
const compv_uscalar_t& lut_length_minus1,
const compv_float64_t* scale1,
const compv_uscalar_t& in_out_length,
const compv_float64_t* in_ptr,
const compv_float64_t* mul_ptr,
compv_float64_t* out_ptr
)
{
COMPV_DEBUG_INFO_CHECK_NEON();
const float64x2_t vecMinus1 = vdupq_n_f64(-1.0);
const float64x2_t vecPlus1 = vdupq_n_f64(1.0);
const float64x2_t vecScale = vdupq_n_f64(*scale1);
const float64x2_t vecZero = vdupq_n_f64(0.0);
const uint32x2_t vecLut_length_minus1 = vdup_n_u32(static_cast<uint32_t>(lut_length_minus1));
for (compv_uscalar_t i = 0; i < in_out_length; i += 2) {
float64x2_t vecX = vld1q_f64(&in_ptr[i]);
float64x2_t vecSign = vcltq_f64(vecX, vecZero);
vecSign = vorrq_s64(vandq_s64(vecSign, vecMinus1), vbicq_s64(vecPlus1, vecSign));
vecX = vmulq_f64(vmulq_f64(vecX, vecSign), vecScale);
uint32x2_t vecIndex = vqmovn_u64(vcvtq_u64_f64(vecX));
uint32x2_t vecIndexMask = vclt_u32(vecIndex, vecLut_length_minus1);
if (COMPV_ARM_NEON_NEQ_ZEROD(vecIndexMask)) {
const float64x2_t vecIndexRounded = vrndmq_f64(vecX);
vecIndex = vmin_u32(vecIndex, vecLut_length_minus1); // Clip indices to avoid reading beyond valid data. LUT should be padded with at least #1 double
const uint32x2x2_t vecIndexMask32x2x2 = vzip_u32(vecIndexMask, vecIndexMask); // Duplicate the two 32-bit mask lanes (first step of widening them to 64-bit masks)
const uint64x2_t vecIndexMask64x2 = vcombine_s32(vecIndexMask32x2x2.val[0], vecIndexMask32x2x2.val[1]); // _mm_unpacklo_epi32(vecIndexMask, vecIndexMask)
const float64x1x2_t vecLUT0 = vld2_f64(lut_ptr + vget_lane_u32(vecIndex, 0));
const float64x1x2_t vecLUT1 = vld2_f64(lut_ptr + vget_lane_u32(vecIndex, 1));
const float64x2_t vecTanh_i0 = vcombine_f64(vecLUT0.val[0], vecLUT1.val[0]);
const float64x2_t vecTanh_i1 = vcombine_f64(vecLUT0.val[1], vecLUT1.val[1]);
const float64x2_t vecResult = vmulq_f64(
vfmaq_f64(vecTanh_i0, vsubq_f64(vecTanh_i1, vecTanh_i0), vsubq_f64(vecX, vecIndexRounded)),
vecSign
);
vst1q_f64(&out_ptr[i],
vmulq_f64(
vorrq_s64(vandq_s64(vecResult, vecIndexMask64x2), vbicq_s64(vecSign, vecIndexMask64x2)),
vld1q_f64(&mul_ptr[i])
)
);
}
else {
vst1q_f64(&out_ptr[i],
vmulq_f64(
vecSign,
vld1q_f64(&mul_ptr[i])
)
);
}
}
}

// This is an internal function
// - Up to the caller to make sure LUT is padded with at least #1 double to allow reading beyond valid data
// - Make sure in_out_length is a multiple of 2
void CompVMathActivationFunctionsLogistic_64f64f_Intrin_NEON64(
const compv_float64_t* lut_ptr,
const compv_uscalar_t& lut_length_minus1,
const compv_float64_t* scale1,
const compv_uscalar_t& in_out_length,
const compv_float64_t* in_ptr,
compv_float64_t* out_ptr
)
{
COMPV_DEBUG_INFO_CHECK_NEON();
const float64x2_t vecMinus1 = vdupq_n_f64(-1.0);
const float64x2_t vecPlus1 = vdupq_n_f64(1.0);
const float64x2_t vecScale = vdupq_n_f64(*scale1);
const float64x2_t vecZero = vdupq_n_f64(0.0);
const uint32x2_t vecLut_length_minus1 = vdup_n_u32(static_cast<uint32_t>(lut_length_minus1));
for (compv_uscalar_t i = 0; i < in_out_length; i += 2) {
float64x2_t vecX = vld1q_f64(&in_ptr[i]);
const float64x2_t vecSignMask = vcltq_f64(vecX, vecZero);
const float64x2_t vecSign = vorrq_s64(vandq_s64(vecSignMask, vecMinus1), vbicq_s64(vecPlus1, vecSignMask));
vecX = vmulq_f64(vmulq_f64(vecX, vecSign), vecScale);
uint32x2_t vecIndex = vqmovn_u64(vcvtq_u64_f64(vecX));
uint32x2_t vecIndexMask = vclt_u32(vecIndex, vecLut_length_minus1);
if (COMPV_ARM_NEON_NEQ_ZEROD(vecIndexMask)) {
const float64x2_t vecIndexRounded = vrndmq_f64(vecX);
vecIndex = vmin_u32(vecIndex, vecLut_length_minus1); // Clip indices to avoid reading beyond valid data. LUT should be padded with at least #1 double
const uint32x2x2_t vecIndexMask32x2x2 = vzip_u32(vecIndexMask, vecIndexMask); // Duplicate the two 32-bit mask lanes (first step of widening them to 64-bit masks)
const uint64x2_t vecIndexMask64x2 = vcombine_s32(vecIndexMask32x2x2.val[0], vecIndexMask32x2x2.val[1]); // _mm_unpacklo_epi32(vecIndexMask, vecIndexMask)
const float64x1x2_t vecLUT0 = vld2_f64(lut_ptr + vget_lane_u32(vecIndex, 0));
const float64x1x2_t vecLUT1 = vld2_f64(lut_ptr + vget_lane_u32(vecIndex, 1));
const float64x2_t vec_l0 = vcombine_f64(vecLUT0.val[0], vecLUT1.val[0]);
const float64x2_t vec_l1 = vcombine_f64(vecLUT0.val[1], vecLUT1.val[1]);
const float64x2_t vecResult = vmulq_f64(
vfmaq_f64(vec_l0, vsubq_f64(vec_l1, vec_l0), vsubq_f64(vecX, vecIndexRounded)),
vecSign
);
vst1q_f64(&out_ptr[i],
vaddq_f64(
vorrq_s64(vandq_s64(vecResult, vecIndexMask64x2), vbicq_s64(vecSign, vecIndexMask64x2)),
vandq_s64(vecSignMask, vecPlus1)
)
);
}
else {
vst1q_f64(&out_ptr[i],
vaddq_f64(
vecSign,
vandq_s64(vecSignMask, vecPlus1)
)
);
}
}
}
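The logistic kernels handle negative inputs through the identity logistic(x) = 1 - logistic(-x): the LUT is evaluated on |x| only, the interpolated value is multiplied by the sign, and 1.0 is added back for the negative lanes (the masked vandq_s64(vecSignMask, vecPlus1) term). An illustrative scalar rendering of that sign handling; the logistic_abs callback stands in for the LUT interpolation and is an assumption, not CompV API:

static compv_float64_t ScalarLogisticSign_Sketch(const compv_float64_t x,
    compv_float64_t (*logistic_abs)(compv_float64_t) /* LUT-based approximation, valid for x >= 0 */)
{
    const bool negative = (x < 0.0);
    const compv_float64_t sign = negative ? -1.0 : 1.0;
    const compv_float64_t l = logistic_abs(x * sign);  // interpolate on |x| only
    return l * sign + (negative ? 1.0 : 0.0);          // logistic(x) = 1 - logistic(|x|) when x < 0
}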

// This is an internal function
// - Up to the caller to make sure LUT is padded with at least #1 double to allow reading beyond valid data
// - Make sure in_out_length is a multiple of 2
void CompVMathActivationFunctionsLogisticMul_64f64f_Intrin_NEON64(
const compv_float64_t* lut_ptr,
const compv_uscalar_t& lut_length_minus1,
const compv_float64_t* scale1,
const compv_uscalar_t& in_out_length,
const compv_float64_t* in_ptr,
const compv_float64_t* mul_ptr,
compv_float64_t* out_ptr
)
{
COMPV_DEBUG_INFO_CHECK_NEON();
const float64x2_t vecMinus1 = vdupq_n_f64(-1.0);
const float64x2_t vecPlus1 = vdupq_n_f64(1.0);
const float64x2_t vecScale = vdupq_n_f64(*scale1);
const float64x2_t vecZero = vdupq_n_f64(0.0);
const uint32x2_t vecLut_length_minus1 = vdup_n_u32(static_cast<uint32_t>(lut_length_minus1));
for (compv_uscalar_t i = 0; i < in_out_length; i += 2) {
float64x2_t vecX = vld1q_f64(&in_ptr[i]);
const float64x2_t vecSignMask = vcltq_f64(vecX, vecZero);
const float64x2_t vecSign = vorrq_s64(vandq_s64(vecSignMask, vecMinus1), vbicq_s64(vecPlus1, vecSignMask));
vecX = vmulq_f64(vmulq_f64(vecX, vecSign), vecScale);
uint32x2_t vecIndex = vqmovn_u64(vcvtq_u64_f64(vecX));
uint32x2_t vecIndexMask = vclt_u32(vecIndex, vecLut_length_minus1);
if (COMPV_ARM_NEON_NEQ_ZEROD(vecIndexMask)) {
const float64x2_t vecIndexRounded = vrndmq_f64(vecX);
vecIndex = vmin_u32(vecIndex, vecLut_length_minus1); // Clip indices to avoid reading beyond valid data. LUT should be padded with at least #1 double
const uint32x2x2_t vecIndexMask32x2x2 = vzip_u32(vecIndexMask, vecIndexMask); // Duplicate the two 32-bit mask lanes (first step of widening them to 64-bit masks)
const uint64x2_t vecIndexMask64x2 = vcombine_s32(vecIndexMask32x2x2.val[0], vecIndexMask32x2x2.val[1]); // _mm_unpacklo_epi32(vecIndexMask, vecIndexMask)
const float64x1x2_t vecLUT0 = vld2_f64(lut_ptr + vget_lane_u32(vecIndex, 0));
const float64x1x2_t vecLUT1 = vld2_f64(lut_ptr + vget_lane_u32(vecIndex, 1));
const float64x2_t vec_l0 = vcombine_f64(vecLUT0.val[0], vecLUT1.val[0]);
const float64x2_t vec_l1 = vcombine_f64(vecLUT0.val[1], vecLUT1.val[1]);
const float64x2_t vecResult = vmulq_f64(
vfmaq_f64(vec_l0, vsubq_f64(vec_l1, vec_l0), vsubq_f64(vecX, vecIndexRounded)),
vecSign
);
vst1q_f64(&out_ptr[i],
vmulq_f64(
vaddq_f64(
vorrq_s64(vandq_s64(vecResult, vecIndexMask64x2), vbicq_s64(vecSign, vecIndexMask64x2)),
vandq_s64(vecSignMask, vecPlus1)
),
vld1q_f64(&mul_ptr[i])
)
);
}
else {
vst1q_f64(&out_ptr[i],
vmulq_f64(
vaddq_f64(
vecSign,
vandq_s64(vecSignMask, vecPlus1)
),
vld1q_f64(&mul_ptr[i])
)
);
}
}
}

COMPV_NAMESPACE_END()

#endif /* COMPV_ARCH_ARM64 && COMPV_INTRINSIC */
@@ -34,7 +34,7 @@ void CompVMathActivationFunctionsTanh_64f64f_Intrin_AVX(
const __m256d vecScale = _mm256_set1_pd(*scale1);
const __m128i vecLut_length_minus1 = _mm_set1_epi32(static_cast<int32_t>(lut_length_minus1));
for (compv_uscalar_t i = 0; i < in_out_length; i += 4) {
__m256d vecX = _mm256_load_pd(&in_ptr[i]);
__m256d vecX = _mm256_loadu_pd(&in_ptr[i]);
const __m256d vecSign = _mm256_blendv_pd(vecPlus1, vecMinus1, _mm256_cmp_pd(vecX, vecZero, _CMP_LT_OQ));
vecX = _mm256_mul_pd(_mm256_mul_pd(vecX, vecSign), vecScale);
__m128i vecIndex128 = _mm256_cvttpd_epi32(vecX);
@@ -99,7 +99,7 @@ void CompVMathActivationFunctionsTanhMul_64f64f_Intrin_AVX(
const __m256d vecScale = _mm256_set1_pd(*scale1);
const __m128i vecLut_length_minus1 = _mm_set1_epi32(static_cast<int32_t>(lut_length_minus1));
for (compv_uscalar_t i = 0; i < in_out_length; i += 4) {
__m256d vecX = _mm256_load_pd(&in_ptr[i]);
__m256d vecX = _mm256_loadu_pd(&in_ptr[i]);
const __m256d vecSign = _mm256_blendv_pd(vecPlus1, vecMinus1, _mm256_cmp_pd(vecX, vecZero, _CMP_LT_OQ));
vecX = _mm256_mul_pd(_mm256_mul_pd(vecX, vecSign), vecScale);
__m128i vecIndex128 = _mm256_cvttpd_epi32(vecX);
@@ -171,7 +171,7 @@ void CompVMathActivationFunctionsLogistic_64f64f_Intrin_AVX(
const __m256d vecScale = _mm256_set1_pd(*scale1);
const __m128i vecLut_length_minus1 = _mm_set1_epi32(static_cast<int32_t>(lut_length_minus1));
for (compv_uscalar_t i = 0; i < in_out_length; i += 4) {
__m256d vecX = _mm256_load_pd(&in_ptr[i]);
__m256d vecX = _mm256_loadu_pd(&in_ptr[i]);
const __m256d vecSignMask = _mm256_cmp_pd(vecX, vecZero, _CMP_LT_OQ);
const __m256d vecSign = _mm256_blendv_pd(vecPlus1, vecMinus1, vecSignMask);
vecX = _mm256_mul_pd(_mm256_mul_pd(vecX, vecSign), vecScale);
@@ -245,7 +245,7 @@ void CompVMathActivationFunctionsLogisticMul_64f64f_Intrin_AVX(
const __m256d vecScale = _mm256_set1_pd(*scale1);
const __m128i vecLut_length_minus1 = _mm_set1_epi32(static_cast<int32_t>(lut_length_minus1));
for (compv_uscalar_t i = 0; i < in_out_length; i += 4) {
__m256d vecX = _mm256_load_pd(&in_ptr[i]);
__m256d vecX = _mm256_loadu_pd(&in_ptr[i]);
const __m256d vecSignMask = _mm256_cmp_pd(vecX, vecZero, _CMP_LT_OQ);
const __m256d vecSign = _mm256_blendv_pd(vecPlus1, vecMinus1, vecSignMask);
vecX = _mm256_mul_pd(_mm256_mul_pd(vecX, vecSign), vecScale);