Skip to content

Commit

Permalink
-
Browse files Browse the repository at this point in the history
  • Loading branch information
DoubangoTelecom committed Nov 19, 2019
1 parent 1b8cf3c commit 84b0d73
Show file tree
Hide file tree
Showing 103 changed files with 298 additions and 29,113 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ build
.idea/workspace.xml
workspace.xml
local.properties
.vs
.gradle
gradle
gradlew
Expand Down
116 changes: 116 additions & 0 deletions CMakeSettings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
{
"configurations": [
{
"name": "Linux-Debug",
"generator": "Unix Makefiles",
"remoteMachineName": "${defaultRemoteMachineName}",
"configurationType": "Debug",
"remoteCMakeListsRoot": "/var/tmp/src/${workspaceHash}/${name}",
"cmakeExecutable": "/usr/bin/cmake",
"buildRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\build\\${name}",
"installRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\install\\${name}",
"remoteBuildRoot": "/var/tmp/build/${workspaceHash}/build/${name}",
"remoteInstallRoot": "/var/tmp/build/${workspaceHash}/install/${name}",
"remoteCopySources": true,
"remoteCopySourcesOutputVerbosity": "Normal",
"remoteCopySourcesConcurrentCopies": "10",
"remoteCopySourcesMethod": "rsync",
"remoteCopySourcesExclusionList": [
".vs",
".git"
],
"rsyncCommandArgs": "-t --delete --delete-excluded",
"remoteCopyBuildOutput": false,
"cmakeCommandArgs": "",
"buildCommandArgs": "",
"ctestCommandArgs": "",
"inheritEnvironments": [
"linux_x64"
]
},
{
"name": "Linux-Release",
"generator": "Unix Makefiles",
"remoteMachineName": "${defaultRemoteMachineName}",
"configurationType": "Release",
"remoteCMakeListsRoot": "/var/tmp/src/${workspaceHash}/${name}",
"cmakeExecutable": "/usr/bin/cmake",
"buildRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\build\\${name}",
"installRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\install\\${name}",
"remoteBuildRoot": "/var/tmp/build/${workspaceHash}/build/${name}",
"remoteInstallRoot": "/var/tmp/build/${workspaceHash}/install/${name}",
"remoteCopySources": true,
"remoteCopySourcesOutputVerbosity": "Normal",
"remoteCopySourcesConcurrentCopies": "10",
"remoteCopySourcesMethod": "rsync",
"remoteCopySourcesExclusionList": [
".vs",
".git"
],
"rsyncCommandArgs": "-t --delete --delete-excluded",
"remoteCopyBuildOutput": false,
"cmakeCommandArgs": "",
"buildCommandArgs": "",
"ctestCommandArgs": "",
"inheritEnvironments": [
"linux_x64"
]
},
{
"name": "Raspberry-Debug",
"generator": "Unix Makefiles",
"remoteMachineName": "${defaultRemoteMachineName}",
"configurationType": "Debug",
"remoteCMakeListsRoot": "/var/tmp/src/${workspaceHash}/${name}",
"cmakeExecutable": "/usr/bin/cmake",
"buildRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\build\\${name}",
"installRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\install\\${name}",
"remoteBuildRoot": "/var/tmp/build/${workspaceHash}/build/${name}",
"remoteInstallRoot": "/var/tmp/build/${workspaceHash}/install/${name}",
"remoteCopySources": true,
"remoteCopySourcesOutputVerbosity": "Normal",
"remoteCopySourcesConcurrentCopies": "10",
"remoteCopySourcesMethod": "rsync",
"remoteCopySourcesExclusionList": [
".vs",
".git"
],
"rsyncCommandArgs": "-t --delete --delete-excluded",
"remoteCopyBuildOutput": false,
"cmakeCommandArgs": "-DCMAKE_VERBOSE_MAKEFILE=on -DLIB_BUILD_TYPE=STATIC -DTOOLCHAIN_RPI_TRIPLET=\"arm-linux-gnueabihf\" -DCMAKE_TOOLCHAIN_FILE:PATH=\"rpi.toolchain.cmake\"",
"buildCommandArgs": "",
"ctestCommandArgs": "",
"inheritEnvironments": [
"rpi"
]
},
{
"name": "Raspberry-Release",
"generator": "Unix Makefiles",
"remoteMachineName": "${defaultRemoteMachineName}",
"configurationType": "Release",
"remoteCMakeListsRoot": "/var/tmp/src/${workspaceHash}/${name}",
"cmakeExecutable": "/usr/bin/cmake",
"buildRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\build\\${name}",
"installRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\install\\${name}",
"remoteBuildRoot": "/var/tmp/build/${workspaceHash}/build/${name}",
"remoteInstallRoot": "/var/tmp/build/${workspaceHash}/install/${name}",
"remoteCopySources": true,
"remoteCopySourcesOutputVerbosity": "Normal",
"remoteCopySourcesConcurrentCopies": "10",
"remoteCopySourcesMethod": "rsync",
"remoteCopySourcesExclusionList": [
".vs",
".git"
],
"rsyncCommandArgs": "-t --delete --delete-excluded",
"remoteCopyBuildOutput": false,
"cmakeCommandArgs": "-DCMAKE_VERBOSE_MAKEFILE=on -DLIB_BUILD_TYPE=STATIC -DTOOLCHAIN_RPI_TRIPLET=\"arm-linux-gnueabihf\" -DCMAKE_TOOLCHAIN_FILE:PATH=\"rpi.toolchain.cmake\"",
"buildCommandArgs": "",
"ctestCommandArgs": "",
"inheritEnvironments": [
"rpi"
]
}
]
}
108 changes: 54 additions & 54 deletions base/image/intrin/arm/compv_image_conv_hsv_intrin_neon.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -64,28 +64,28 @@ static const float32x4_t vecHalf = vdupq_n_f32(0.5f);
vec1 = vmovl_u16(vget_high_u16(vec1)); \
vec2 = vmovl_u16(vget_low_u16(vec3)); \
vec3 = vmovl_u16(vget_high_u16(vec3)); \
vec0 = vcvtq_f32_u32(vec0); \
vec1 = vcvtq_f32_u32(vec1); \
vec2 = vcvtq_f32_u32(vec2); \
vec3 = vcvtq_f32_u32(vec3); \
vec0 = (int32x4_t)vcvtq_f32_u32(vec0); \
vec1 = (int32x4_t)vcvtq_f32_u32(vec1); \
vec2 = (int32x4_t)vcvtq_f32_u32(vec2); \
vec3 = (int32x4_t)vcvtq_f32_u32(vec3); \
\
/* maxVal = ToFloat32(ToUInt32(ToUInt16(ToUInt8(vec4)))) */ \
vec1f = vmovl_u8(vget_low_u8(vec4)); \
vec3f = vmovl_u8(vget_high_u8(vec4)); \
vec0f = vmovl_u16(vget_low_u16(vec1f)); \
vec1f = vmovl_u16(vget_high_u16(vec1f)); \
vec2f = vmovl_u16(vget_low_u16(vec3f)); \
vec3f = vmovl_u16(vget_high_u16(vec3f)); \
vec0f = vcvtq_f32_u32(vec0f); \
vec1f = vcvtq_f32_u32(vec1f); \
vec2f = vcvtq_f32_u32(vec2f); \
vec3f = vcvtq_f32_u32(vec3f); \
vec1f = (float32x4_t)vmovl_u8(vget_low_u8(vec4)); \
vec3f = (float32x4_t)vmovl_u8(vget_high_u8(vec4)); \
vec0f = (float32x4_t)vmovl_u16(vget_low_u16((uint16x8_t)vec1f)); \
vec1f = (float32x4_t)vmovl_u16(vget_high_u16((uint16x8_t)vec1f)); \
vec2f = (float32x4_t)vmovl_u16(vget_low_u16((uint16x8_t)vec3f)); \
vec3f = (float32x4_t)vmovl_u16(vget_high_u16((uint16x8_t)vec3f)); \
vec0f = vcvtq_f32_u32((uint32x4_t)vec0f); \
vec1f = vcvtq_f32_u32((uint32x4_t)vec1f); \
vec2f = vcvtq_f32_u32((uint32x4_t)vec2f); \
vec3f = vcvtq_f32_u32((uint32x4_t)vec3f); \
\
/* scale = maxVal ? (1.f / maxVal) : 0.f */ \
vec0f = vbicq_u32(COMPV_ARM_NEON_RECIPROCAL(vec0f), vceqq_s32(vec0f, vecZero)); \
vec1f = vbicq_u32(COMPV_ARM_NEON_RECIPROCAL(vec1f), vceqq_s32(vec1f, vecZero)); \
vec2f = vbicq_u32(COMPV_ARM_NEON_RECIPROCAL(vec2f), vceqq_s32(vec2f, vecZero)); \
vec3f = vbicq_u32(COMPV_ARM_NEON_RECIPROCAL(vec3f), vceqq_s32(vec3f, vecZero)); \
vec0f = (float32x4_t)vbicq_u32((uint32x4_t)COMPV_ARM_NEON_RECIPROCAL(vec0f), vceqq_s32((int32x4_t)vec0f, vecZero)); \
vec1f = (float32x4_t)vbicq_u32((uint32x4_t)COMPV_ARM_NEON_RECIPROCAL(vec1f), vceqq_s32((int32x4_t)vec1f, vecZero)); \
vec2f = (float32x4_t)vbicq_u32((uint32x4_t)COMPV_ARM_NEON_RECIPROCAL(vec2f), vceqq_s32((int32x4_t)vec2f, vecZero)); \
vec3f = (float32x4_t)vbicq_u32((uint32x4_t)COMPV_ARM_NEON_RECIPROCAL(vec3f), vceqq_s32((int32x4_t)vec3f, vecZero)); \
\
/* scales255 = (255 * scale) */ \
vec0f = vmulq_f32(vec0f, vec255f); \
Expand All @@ -94,48 +94,48 @@ static const float32x4_t vecHalf = vdupq_n_f32(0.5f);
vec3f = vmulq_f32(vec3f, vec255f); \
\
/* hsv[1].float = static_cast<uint8_t>((scales255 * minus)) - unsigned */ \
vec0f = vmulq_f32(vec0f, vec0); \
vec1f = vmulq_f32(vec1f, vec1); \
vec2f = vmulq_f32(vec2f, vec2); \
vec3f = vmulq_f32(vec3f, vec3); \
vec0f = vcvtq_u32_f32(vec0f); \
vec1f = vcvtq_u32_f32(vec1f); \
vec2f = vcvtq_u32_f32(vec2f); \
vec3f = vcvtq_u32_f32(vec3f); \
vec0f = vcombine_u16(vmovn_s32(vec0f), vmovn_s32(vec1f)); \
vec2f = vcombine_u16(vmovn_s32(vec2f), vmovn_s32(vec3f)); \
vec8 = vcombine_u8(vqmovun_s16(vec0f), vqmovun_s16(vec2f)); /* vec8 = hsv[1].u8 */ \
vec0f = vmulq_f32(vec0f, (float32x4_t)vec0); \
vec1f = vmulq_f32(vec1f, (float32x4_t)vec1); \
vec2f = vmulq_f32(vec2f, (float32x4_t)vec2); \
vec3f = vmulq_f32(vec3f, (float32x4_t)vec3); \
vec0f = (float32x4_t)vcvtq_u32_f32(vec0f); \
vec1f = (float32x4_t)vcvtq_u32_f32(vec1f); \
vec2f = (float32x4_t)vcvtq_u32_f32(vec2f); \
vec3f = (float32x4_t)vcvtq_u32_f32(vec3f); \
vec0f = (float32x4_t)vcombine_u16(vmovn_s32((int32x4_t)vec0f), vmovn_s32((int32x4_t)vec1f)); \
vec2f = (float32x4_t)vcombine_u16(vmovn_s32((int32x4_t)vec2f), vmovn_s32((int32x4_t)vec3f)); \
vec8 = vcombine_u8(vqmovun_s16((int16x8_t)vec0f), vqmovun_s16((int16x8_t)vec2f)); /* vec8 = hsv[1].u8 */ \
\
/* B = ToFloat32(ToInt32(ToInt16(diff * 43))) */ \
vec1f = vmull_s8(vget_low_s8(vec5), vec43n); \
vec3f = vmull_s8(vget_high_s8(vec5), vec43n); \
vec0f = vmovl_s16(vget_low_s16(vec1f)); \
vec1f = vmovl_s16(vget_high_s16(vec1f)); \
vec2f = vmovl_s16(vget_low_s16(vec3f)); \
vec3f = vmovl_s16(vget_high_s16(vec3f)); \
vec0f = vcvtq_f32_s32(vec0f); \
vec1f = vcvtq_f32_s32(vec1f); \
vec2f = vcvtq_f32_s32(vec2f); \
vec3f = vcvtq_f32_s32(vec3f); \
vec1f = (float32x4_t)vmull_s8(vget_low_s8(vec5), vec43n); \
vec3f = (float32x4_t)vmull_s8(vget_high_s8(vec5), vec43n); \
vec0f = (float32x4_t)vmovl_s16(vget_low_s16((int16x8_t)vec1f)); \
vec1f = (float32x4_t)vmovl_s16(vget_high_s16((int16x8_t)vec1f)); \
vec2f = (float32x4_t)vmovl_s16(vget_low_s16((int16x8_t)vec3f)); \
vec3f = (float32x4_t)vmovl_s16(vget_high_s16((int16x8_t)vec3f)); \
vec0f = (float32x4_t)vcvtq_f32_s32((int32x4_t)vec0f); \
vec1f = (float32x4_t)vcvtq_f32_s32((int32x4_t)vec1f); \
vec2f = (float32x4_t)vcvtq_f32_s32((int32x4_t)vec2f); \
vec3f = (float32x4_t)vcvtq_f32_s32((int32x4_t)vec3f); \
\
/* scale = minus ? (1.f / minus) : 0.f */ \
vec0 = vbicq_u32(COMPV_ARM_NEON_RECIPROCAL(vec0), vceqq_s32(vec0, vecZero)); \
vec1 = vbicq_u32(COMPV_ARM_NEON_RECIPROCAL(vec1), vceqq_s32(vec1, vecZero)); \
vec2 = vbicq_u32(COMPV_ARM_NEON_RECIPROCAL(vec2), vceqq_s32(vec2, vecZero)); \
vec3 = vbicq_u32(COMPV_ARM_NEON_RECIPROCAL(vec3), vceqq_s32(vec3, vecZero)); \
vec0 = vbicq_u32((uint32x4_t)COMPV_ARM_NEON_RECIPROCAL(vec0), vceqq_s32(vec0, vecZero)); \
vec1 = vbicq_u32((uint32x4_t)COMPV_ARM_NEON_RECIPROCAL(vec1), vceqq_s32(vec1, vecZero)); \
vec2 = vbicq_u32((uint32x4_t)COMPV_ARM_NEON_RECIPROCAL(vec2), vceqq_s32(vec2, vecZero)); \
vec3 = vbicq_u32((uint32x4_t)COMPV_ARM_NEON_RECIPROCAL(vec3), vceqq_s32(vec3, vecZero)); \
\
/* compute static_cast<uint8_t>(round(B * scale) + ((85 & m1) | (171 & m2)) */ \
vec0f = vmulq_f32(vec0f, vec0); \
vec1f = vmulq_f32(vec1f, vec1); \
vec2f = vmulq_f32(vec2f, vec2); \
vec3f = vmulq_f32(vec3f, vec3); \
vec0f = COMPV_ARM_NEON_MATH_ROUNDF_2_NEAREST_INT(vec0f); /*!\\ **MUST NOT*** use vcvtq_s32_f32 */ \
vec1f = COMPV_ARM_NEON_MATH_ROUNDF_2_NEAREST_INT(vec1f); \
vec2f = COMPV_ARM_NEON_MATH_ROUNDF_2_NEAREST_INT(vec2f); \
vec3f = COMPV_ARM_NEON_MATH_ROUNDF_2_NEAREST_INT(vec3f); \
vec0f = vcombine_s16(vmovn_s32(vec0f), vmovn_s32(vec1f)); \
vec2f = vcombine_s16(vmovn_s32(vec2f), vmovn_s32(vec3f)); \
vec9 = vcombine_s8(vqmovn_s16(vec0f), vqmovn_s16(vec2f)); /*!\\ 'vqmovn_s16' instead of 'vqmovun_s16' because the values are signed */ \
vec0f = vmulq_f32(vec0f, (float32x4_t)vec0); \
vec1f = vmulq_f32(vec1f, (float32x4_t)vec1); \
vec2f = vmulq_f32(vec2f, (float32x4_t)vec2); \
vec3f = vmulq_f32(vec3f, (float32x4_t)vec3); \
vec0f = (float32x4_t)COMPV_ARM_NEON_MATH_ROUNDF_2_NEAREST_INT(vec0f); /*!\\ **MUST NOT*** use vcvtq_s32_f32 */ \
vec1f = (float32x4_t)COMPV_ARM_NEON_MATH_ROUNDF_2_NEAREST_INT(vec1f); \
vec2f = (float32x4_t)COMPV_ARM_NEON_MATH_ROUNDF_2_NEAREST_INT(vec2f); \
vec3f = (float32x4_t)COMPV_ARM_NEON_MATH_ROUNDF_2_NEAREST_INT(vec3f); \
vec0f = (float32x4_t)vcombine_s16(vmovn_s32((int32x4_t)vec0f), vmovn_s32((int32x4_t)vec1f)); \
vec2f = (float32x4_t)vcombine_s16(vmovn_s32((int32x4_t)vec2f), vmovn_s32((int32x4_t)vec3f)); \
vec9 = vcombine_s8(vqmovn_s16((int16x8_t)vec0f), vqmovn_s16((int16x8_t)vec2f)); /*!\\ 'vqmovn_s16' instead of 'vqmovun_s16' because the values are signed */ \
vec6 = vandq_s8(vec6, vec85); /* (85 & m1) */ \
vec7 = vandq_s8(vec7, vec171); /* (171 & m2) */ \
vec6 = vorrq_s8(vec6, vec7); /* (85 & m1) | (171 & m2) */ \
Expand Down
8 changes: 4 additions & 4 deletions base/image/intrin/arm/compv_image_remap_intrin_neon.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ void CompVImageRemapBilinear_8u32f_Intrin_NEON(
const float32x4_t x = vld1q_f32(&mapXPtr[i]);
const float32x4_t y = vld1q_f32(&mapYPtr[i]);

const float32x4_t cmp = vandq_s32(
vandq_s32(vcgeq_f32(x, roi_left), vcleq_f32(x, roi_right)),
vandq_s32(vcgeq_f32(y, roi_top), vcleq_f32(y, roi_bottom))
const float32x4_t cmp = (float32x4_t)vandq_u32(
vandq_u32(vcgeq_f32(x, roi_left), vcleq_f32(x, roi_right)),
vandq_u32(vcgeq_f32(y, roi_top), vcleq_f32(y, roi_bottom))
);

if (COMPV_ARM_NEON_NEQ_ZEROQ(cmp)) {
Expand Down Expand Up @@ -101,7 +101,7 @@ void CompVImageRemapBilinear_8u32f_Intrin_NEON(
const float32x4_t y2x2_vec = vcvtq_f32_s32(vld1q_s32(y2x2_mem));

float32x4_t pixel = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_f32(y1x1_vec, A), vmulq_f32(y1x2_vec, B)), vmulq_f32(y2x1_vec, C)), vmulq_f32(y2x2_vec, xyfractpart));
pixel = vorrq_s32(vandq_s32(pixel, cmp), vbicq_s32(defaultPixelValue, cmp));
pixel = (float32x4_t)vorrq_s32(vandq_s32((int32x4_t)pixel, (int32x4_t)cmp), vbicq_s32((int32x4_t)defaultPixelValue, (int32x4_t)cmp));

vst1q_f32(&outputPtr[i], pixel);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@ void CompVImageScaleBicubicPreprocess_32s32f_Intrin_NEON(
const float32x4_t vecIntegralf = COMPV_ARM_NEON_FLOOR_F32(vecFract); // SSE2: _mm_round_ps(vecFract, _MM_FROUND_FLOOR);
const int32x4_t vecIntegrali = vcvtq_s32_f32(vecIntegralf);

int32x4_t vecIntegrali0 = vaddq_s32(vdupq_lane_s32(vget_low_f32(vecIntegrali), 0), vecIntegralOffset);
int32x4_t vecIntegrali1 = vaddq_s32(vdupq_lane_s32(vget_low_f32(vecIntegrali), 1), vecIntegralOffset);
int32x4_t vecIntegrali2 = vaddq_s32(vdupq_lane_s32(vget_high_f32(vecIntegrali), 0), vecIntegralOffset);
int32x4_t vecIntegrali3 = vaddq_s32(vdupq_lane_s32(vget_high_f32(vecIntegrali), 1), vecIntegralOffset);
int32x4_t vecIntegrali0 = vaddq_s32(vdupq_lane_s32(vget_low_s32(vecIntegrali), 0), vecIntegralOffset);
int32x4_t vecIntegrali1 = vaddq_s32(vdupq_lane_s32(vget_low_s32(vecIntegrali), 1), vecIntegralOffset);
int32x4_t vecIntegrali2 = vaddq_s32(vdupq_lane_s32(vget_high_s32(vecIntegrali), 0), vecIntegralOffset);
int32x4_t vecIntegrali3 = vaddq_s32(vdupq_lane_s32(vget_high_s32(vecIntegrali), 1), vecIntegralOffset);
vecIntegrali0 = vmaxq_s32(vecZero, vminq_s32(vecIntegrali0, vecIntergralMax));
vecIntegrali1 = vmaxq_s32(vecZero, vminq_s32(vecIntegrali1, vecIntergralMax));
vecIntegrali2 = vmaxq_s32(vecZero, vminq_s32(vecIntegrali2, vecIntergralMax));
Expand Down
16 changes: 8 additions & 8 deletions base/include/compv/base/intrin/arm/compv_intrin_neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ COMPV_NAMESPACE_BEGIN()
// (s/u)qxtn v30.2s, v21.2d
// fmov x27, d30
// cbz r27, AllZeros
# define COMPV_ARM_NEON_NEQ_ZEROQ(vec) (vgetq_lane_u64(vec, 0) || vgetq_lane_u64(vec, 1))
# define COMPV_ARM_NEON_NEQ_ZEROQ(vec) (vgetq_lane_u64((uint64x2_t)(vec), 0) || vgetq_lane_u64((uint64x2_t)(vec), 1))
# define COMPV_ARM_NEON_EQ_ZEROQ(vec) !COMPV_ARM_NEON_NEQ_ZEROQ(vec)
# define COMPV_ARM_NEON_NEQ_ZEROD(vec) vget_lane_u64(vec, 0)
// Non-zero test for a 64-bit (D-register) vector: reinterpret the argument as a
// single 64-bit lane and read it; the result is non-zero iff any bit of 'vec' is set.
// vget_lane_u64() takes a uint64x1_t, so the cast must target that 64-bit type
// (casting a 64-bit vector to the 128-bit uint64x2_t is a size-mismatched vector cast).
# define COMPV_ARM_NEON_NEQ_ZEROD(vec) vget_lane_u64((uint64x1_t)(vec), 0)
# define COMPV_ARM_NEON_EQ_ZEROD(vec) !COMPV_ARM_NEON_NEQ_ZEROD(vec)

#else
Expand All @@ -49,15 +49,15 @@ COMPV_NAMESPACE_BEGIN()
//vmrs APSR_nzcv, fpscr
# define COMPV_ARM_NEON_NEQ_ZEROQ(vec) ({ \
bool __ret; \
uint8x8_t __vec = vorr_u8(vget_high_u8(vec), vget_low_u8(vec)); \
__ret = vget_lane_u32(__vec, 0) || vget_lane_u32(__vec, 1); \
uint8x8_t __vec = vorr_u8(vget_high_u8((uint8x16_t)(vec)), vget_low_u8((uint8x16_t)(vec))); \
__ret = vget_lane_u32((uint32x2_t)(__vec), 0) || vget_lane_u32((uint32x2_t)(__vec), 1); \
__ret; \
})
# define COMPV_ARM_NEON_EQ_ZEROQ(vec) !COMPV_ARM_NEON_NEQ_ZEROQ(vec)

# define COMPV_ARM_NEON_NEQ_ZEROD(vec) ({ \
bool __ret; \
__ret = vget_lane_u32(vec, 0) || vget_lane_u32(vec, 1); \
__ret = vget_lane_u32((uint32x2_t)(vec), 0) || vget_lane_u32((uint32x2_t)(vec), 1); \
__ret; \
})
# define COMPV_ARM_NEON_EQ_ZEROD(vec) !COMPV_ARM_NEON_NEQ_ZEROD(vec)
Expand Down Expand Up @@ -92,8 +92,8 @@ COMPV_NAMESPACE_BEGIN()
# define COMPV_ARM_NEON_MATH_ROUNDF_2_NEAREST_INT(vec) vcvtaq_s32_f32(vec) /* single-intrinsic form of the two-instruction sequence vcvtq_s32_f32(vrndaq_f32(v)) -> fcvtas Vd.4S, Vn.4S */
#else
# define COMPV_ARM_NEON_MATH_ROUNDF_2_NEAREST_INT(vec)({ \
const float32x4_t vecSign = vcvtq_f32_u32((vshrq_n_u32(vec, 31))); \
int32x4_t __ret = vcvtq_s32_f32(vsubq_f32(vaddq_f32(vec, vecHalf), vecSign)); \
const float32x4_t vecSign = vcvtq_f32_u32((vshrq_n_u32((uint32x4_t)(vec), 31))); \
int32x4_t __ret = vcvtq_s32_f32(vsubq_f32(vaddq_f32((float32x4_t)(vec), (float32x4_t)(vecHalf)), vecSign)); \
__ret; \
})
#endif
Expand All @@ -107,7 +107,7 @@ COMPV_NAMESPACE_BEGIN()
})

// Reciprocal, less accurate than 'COMPV_ARM_NEON_RECIPROCAL_NEWTON_RAPHSON'
#define COMPV_ARM_NEON_RECIPROCAL(vec) vrecpeq_f32(vec) /* AArch64 -> frecpe */
#define COMPV_ARM_NEON_RECIPROCAL(vec) vrecpeq_f32((float32x4_t)(vec)) /* AArch64 -> frecpe */


// TODO(dmi): ASM is faster
Expand Down
Loading

0 comments on commit 84b0d73

Please sign in to comment.