From 310992b745223379a1628e96d7fb117584f97137 Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 29 Nov 2024 07:30:56 +0000 Subject: [PATCH] comp++ --- src/layer/x86/convolution_3x3_winograd_int8.h | 268 ++---- src/layer/x86/convolution_im2col_gemm_int8.h | 319 ++----- src/layer/x86/convolution_packed_int8.h | 793 ++++-------------- src/layer/x86/lstm_int8.h | 92 +- src/layer/x86/x86_usability.h | 51 +- 5 files changed, 374 insertions(+), 1149 deletions(-) diff --git a/src/layer/x86/convolution_3x3_winograd_int8.h b/src/layer/x86/convolution_3x3_winograd_int8.h index e742cfa80e7..470cc42dfa2 100644 --- a/src/layer/x86/convolution_3x3_winograd_int8.h +++ b/src/layer/x86/convolution_3x3_winograd_int8.h @@ -705,41 +705,22 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m512i _pB2 = _mm512_shuffle_epi32(_pB0, _MM_PERM_BADC); __m512i _pB3 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CBAD); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1); - _sum2 = _mm512_dpwssd_epi32(_sum2, _pA0, _pB2); - _sum3 = _mm512_dpwssd_epi32(_sum3, _pA0, _pB3); - _sum4 = _mm512_dpwssd_epi32(_sum4, _pA1, _pB0); - _sum5 = _mm512_dpwssd_epi32(_sum5, _pA1, _pB1); - _sum6 = _mm512_dpwssd_epi32(_sum6, _pA1, _pB2); - _sum7 = _mm512_dpwssd_epi32(_sum7, _pA1, _pB3); - _sum8 = _mm512_dpwssd_epi32(_sum8, _pA2, _pB0); - _sum9 = _mm512_dpwssd_epi32(_sum9, _pA2, _pB1); - _suma = _mm512_dpwssd_epi32(_suma, _pA2, _pB2); - _sumb = _mm512_dpwssd_epi32(_sumb, _pA2, _pB3); - _sumc = _mm512_dpwssd_epi32(_sumc, _pA3, _pB0); - _sumd = _mm512_dpwssd_epi32(_sumd, _pA3, _pB1); - _sume = _mm512_dpwssd_epi32(_sume, _pA3, _pB2); - _sumf = _mm512_dpwssd_epi32(_sumf, _pA3, _pB3); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA0, _pB2)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA0, _pB3)); - _sum4 = _mm512_add_epi32(_sum4, _mm512_madd_epi16(_pA1, _pB0)); - _sum5 = _mm512_add_epi32(_sum5, _mm512_madd_epi16(_pA1, _pB1)); - _sum6 = _mm512_add_epi32(_sum6, _mm512_madd_epi16(_pA1, _pB2)); - _sum7 = _mm512_add_epi32(_sum7, _mm512_madd_epi16(_pA1, _pB3)); - _sum8 = _mm512_add_epi32(_sum8, _mm512_madd_epi16(_pA2, _pB0)); - _sum9 = _mm512_add_epi32(_sum9, _mm512_madd_epi16(_pA2, _pB1)); - _suma = _mm512_add_epi32(_suma, _mm512_madd_epi16(_pA2, _pB2)); - _sumb = _mm512_add_epi32(_sumb, _mm512_madd_epi16(_pA2, _pB3)); - _sumc = _mm512_add_epi32(_sumc, _mm512_madd_epi16(_pA3, _pB0)); - _sumd = _mm512_add_epi32(_sumd, _mm512_madd_epi16(_pA3, _pB1)); - _sume = _mm512_add_epi32(_sume, _mm512_madd_epi16(_pA3, _pB2)); - _sumf = _mm512_add_epi32(_sumf, _mm512_madd_epi16(_pA3, _pB3)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _pA0, _pB2); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _pA0, _pB3); + _sum4 = _mm512_comp_dpwssd_epi32(_sum4, _pA1, _pB0); + _sum5 = _mm512_comp_dpwssd_epi32(_sum5, _pA1, _pB1); + _sum6 = _mm512_comp_dpwssd_epi32(_sum6, _pA1, _pB2); + _sum7 = _mm512_comp_dpwssd_epi32(_sum7, _pA1, _pB3); + _sum8 = _mm512_comp_dpwssd_epi32(_sum8, _pA2, _pB0); + _sum9 = _mm512_comp_dpwssd_epi32(_sum9, _pA2, _pB1); + _suma = _mm512_comp_dpwssd_epi32(_suma, _pA2, _pB2); + _sumb = _mm512_comp_dpwssd_epi32(_sumb, _pA2, _pB3); + _sumc = _mm512_comp_dpwssd_epi32(_sumc, _pA3, _pB0); + _sumd = 
_mm512_comp_dpwssd_epi32(_sumd, _pA3, _pB1); + _sume = _mm512_comp_dpwssd_epi32(_sume, _pA3, _pB2); + _sumf = _mm512_comp_dpwssd_epi32(_sumf, _pA3, _pB3); pA += 32; pB += 32; @@ -984,25 +965,14 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m512i _pB2 = _mm512_shuffle_epi32(_pB0, _MM_PERM_BADC); __m512i _pB3 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CBAD); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1); - _sum2 = _mm512_dpwssd_epi32(_sum2, _pA0, _pB2); - _sum3 = _mm512_dpwssd_epi32(_sum3, _pA0, _pB3); - _sum4 = _mm512_dpwssd_epi32(_sum4, _pA1, _pB0); - _sum5 = _mm512_dpwssd_epi32(_sum5, _pA1, _pB1); - _sum6 = _mm512_dpwssd_epi32(_sum6, _pA1, _pB2); - _sum7 = _mm512_dpwssd_epi32(_sum7, _pA1, _pB3); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA0, _pB2)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA0, _pB3)); - _sum4 = _mm512_add_epi32(_sum4, _mm512_madd_epi16(_pA1, _pB0)); - _sum5 = _mm512_add_epi32(_sum5, _mm512_madd_epi16(_pA1, _pB1)); - _sum6 = _mm512_add_epi32(_sum6, _mm512_madd_epi16(_pA1, _pB2)); - _sum7 = _mm512_add_epi32(_sum7, _mm512_madd_epi16(_pA1, _pB3)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _pA0, _pB2); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _pA0, _pB3); + _sum4 = _mm512_comp_dpwssd_epi32(_sum4, _pA1, _pB0); + _sum5 = _mm512_comp_dpwssd_epi32(_sum5, _pA1, _pB1); + _sum6 = _mm512_comp_dpwssd_epi32(_sum6, _pA1, _pB2); + _sum7 = _mm512_comp_dpwssd_epi32(_sum7, _pA1, _pB3); pA += 32; pB += 16; @@ -1150,17 +1120,10 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m512i _pA1 = _mm512_shuffle_epi32(_pA0, _MM_PERM_BADC); __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1); - _sum2 = _mm512_dpwssd_epi32(_sum2, _pA1, _pB0); - _sum3 = _mm512_dpwssd_epi32(_sum3, _pA1, _pB1); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA1, _pB0)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA1, _pB1)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _pA1, _pB0); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _pA1, _pB1); pA += 32; pB += 8; @@ -1244,13 +1207,8 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m512i _pB0 = _mm512_castpd_si512(_mm512_set1_pd(((const double*)pB)[0])); __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CDAB); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA, _pB1); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA, _pB1)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA, _pB0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA, _pB1); pA += 32; pB += 4; @@ -1312,11 +1270,7 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m512i _pA = _mm512_loadu_si512((const 
__m512i*)pA); __m512i _pB = _mm512_set1_epi32(((const int*)pB)[0]); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA, _pB); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA, _pB)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA, _pB); pA += 32; pB += 2; @@ -1396,25 +1350,14 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m512i _pB2 = _mm512_shuffle_epi32(_pB0, _MM_PERM_BADC); __m512i _pB3 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CBAD); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA00, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA00, _pB1); - _sum2 = _mm512_dpwssd_epi32(_sum2, _pA00, _pB2); - _sum3 = _mm512_dpwssd_epi32(_sum3, _pA00, _pB3); - _sum4 = _mm512_dpwssd_epi32(_sum4, _pA11, _pB0); - _sum5 = _mm512_dpwssd_epi32(_sum5, _pA11, _pB1); - _sum6 = _mm512_dpwssd_epi32(_sum6, _pA11, _pB2); - _sum7 = _mm512_dpwssd_epi32(_sum7, _pA11, _pB3); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA00, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA00, _pB1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA00, _pB2)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA00, _pB3)); - _sum4 = _mm512_add_epi32(_sum4, _mm512_madd_epi16(_pA11, _pB0)); - _sum5 = _mm512_add_epi32(_sum5, _mm512_madd_epi16(_pA11, _pB1)); - _sum6 = _mm512_add_epi32(_sum6, _mm512_madd_epi16(_pA11, _pB2)); - _sum7 = _mm512_add_epi32(_sum7, _mm512_madd_epi16(_pA11, _pB3)); -#endif // __AVX512VNNI__ + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA00, _pB0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA00, _pB1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _pA00, _pB2); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _pA00, _pB3); + _sum4 = _mm512_comp_dpwssd_epi32(_sum4, _pA11, _pB0); + _sum5 = _mm512_comp_dpwssd_epi32(_sum5, _pA11, _pB1); + _sum6 = _mm512_comp_dpwssd_epi32(_sum6, _pA11, _pB2); + _sum7 = _mm512_comp_dpwssd_epi32(_sum7, _pA11, _pB3); pA += 16; pB += 32; @@ -1602,24 +1545,16 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m512i _pB01 = _mm512_inserti32x8(_mm512_castsi256_si512(_pB0), _pB1, 1); __m512i _pB23 = _mm512_shuffle_epi32(_pB01, _MM_PERM_BADC); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA00, _pB01); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA00, _pB23); - _sum2 = _mm512_dpwssd_epi32(_sum2, _pA11, _pB01); - _sum3 = _mm512_dpwssd_epi32(_sum3, _pA11, _pB23); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA00, _pB01)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA00, _pB23)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA11, _pB01)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA11, _pB23)); -#endif // __AVX512VNNI__ + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA00, _pB01); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA00, _pB23); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _pA11, _pB01); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _pA11, _pB23); #else // __AVX512F__ __m256i _pA1 = _mm256_permute4x64_epi64(_pA0, _MM_SHUFFLE(1, 0, 3, 2)); __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); __m256i _pB2 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(1, 0, 3, 2)); __m256i _pB3 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(2, 1, 0, 3)); -#if __AVXVNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1); _sum2 = _mm256_comp_dpwssd_epi32(_sum2, _pA0, _pB2); @@ -1628,16 +1563,6 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, 
_sum5 = _mm256_comp_dpwssd_epi32(_sum5, _pA1, _pB1); _sum6 = _mm256_comp_dpwssd_epi32(_sum6, _pA1, _pB2); _sum7 = _mm256_comp_dpwssd_epi32(_sum7, _pA1, _pB3); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_pA0, _pB2)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_pA0, _pB3)); - _sum4 = _mm256_add_epi32(_sum4, _mm256_madd_epi16(_pA1, _pB0)); - _sum5 = _mm256_add_epi32(_sum5, _mm256_madd_epi16(_pA1, _pB1)); - _sum6 = _mm256_add_epi32(_sum6, _mm256_madd_epi16(_pA1, _pB2)); - _sum7 = _mm256_add_epi32(_sum7, _mm256_madd_epi16(_pA1, _pB3)); -#endif // __AVXVNNI__ #endif // __AVX512F__ pA += 16; @@ -1854,17 +1779,10 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m256i _pB0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_pB), _pB, 1); __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1); _sum2 = _mm256_comp_dpwssd_epi32(_sum2, _pA1, _pB0); _sum3 = _mm256_comp_dpwssd_epi32(_sum3, _pA1, _pB1); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_pA1, _pB0)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_pA1, _pB1)); -#endif pA += 16; pB += 8; @@ -1948,13 +1866,8 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m256i _pB0 = _mm256_castpd_si256(_mm256_broadcast_sd((const double*)pB)); __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 1, 0, 1)); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA, _pB0); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA, _pB1); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA, _pB0)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA, _pB1)); -#endif pA += 16; pB += 4; @@ -2081,17 +1994,10 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m512i _pA1 = _mm512_shuffle_epi32(_pA0, _MM_PERM_BADC); __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1); - _sum2 = _mm512_dpwssd_epi32(_sum2, _pA1, _pB0); - _sum3 = _mm512_dpwssd_epi32(_sum3, _pA1, _pB1); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA1, _pB0)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA1, _pB1)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _pA1, _pB0); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _pA1, _pB1); pA += 8; pB += 32; @@ -2231,17 +2137,10 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m256i _pA1 = _mm256_shuffle_epi32(_pA0, _MM_SHUFFLE(1, 0, 3, 2)); __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1); _sum2 = _mm256_comp_dpwssd_epi32(_sum2, _pA1, _pB0); _sum3 = _mm256_comp_dpwssd_epi32(_sum3, _pA1, _pB1); -#else - 
_sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_pA1, _pB0)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_pA1, _pB1)); -#endif #else // __AVX2__ __m128i _pA0 = _mm_loadu_si128((const __m128i*)pA); __m128i _pB0 = _mm_loadu_si128((const __m128i*)pB); @@ -2250,25 +2149,14 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m128i _pB2 = _mm_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); __m128i _pB3 = _mm_shuffle_epi32(_pB1, _MM_SHUFFLE(0, 3, 2, 1)); -#if __XOP__ - _sum0 = _mm_maddd_epi16(_pA0, _pB0, _sum0); - _sum1 = _mm_maddd_epi16(_pA0, _pB1, _sum1); - _sum2 = _mm_maddd_epi16(_pA0, _pB2, _sum2); - _sum3 = _mm_maddd_epi16(_pA0, _pB3, _sum3); - _sum4 = _mm_maddd_epi16(_pA1, _pB0, _sum4); - _sum5 = _mm_maddd_epi16(_pA1, _pB1, _sum5); - _sum6 = _mm_maddd_epi16(_pA1, _pB2, _sum6); - _sum7 = _mm_maddd_epi16(_pA1, _pB3, _sum7); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA0, _pB0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA0, _pB1)); - _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_pA0, _pB2)); - _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_pA0, _pB3)); - _sum4 = _mm_add_epi32(_sum4, _mm_madd_epi16(_pA1, _pB0)); - _sum5 = _mm_add_epi32(_sum5, _mm_madd_epi16(_pA1, _pB1)); - _sum6 = _mm_add_epi32(_sum6, _mm_madd_epi16(_pA1, _pB2)); - _sum7 = _mm_add_epi32(_sum7, _mm_madd_epi16(_pA1, _pB3)); -#endif + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm_comp_dpwssd_epi32(_sum2, _pA0, _pB2); + _sum3 = _mm_comp_dpwssd_epi32(_sum3, _pA0, _pB3); + _sum4 = _mm_comp_dpwssd_epi32(_sum4, _pA1, _pB0); + _sum5 = _mm_comp_dpwssd_epi32(_sum5, _pA1, _pB1); + _sum6 = _mm_comp_dpwssd_epi32(_sum6, _pA1, _pB2); + _sum7 = _mm_comp_dpwssd_epi32(_sum7, _pA1, _pB3); #endif // __AVX2__ pA += 8; @@ -2473,17 +2361,10 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m128i _pA1 = _mm_shuffle_epi32(_pA0, _MM_SHUFFLE(1, 0, 3, 2)); __m128i _pB1 = _mm_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); -#if __XOP__ - _sum0 = _mm_maddd_epi16(_pA0, _pB0, _sum0); - _sum1 = _mm_maddd_epi16(_pA0, _pB1, _sum1); - _sum2 = _mm_maddd_epi16(_pA1, _pB0, _sum2); - _sum3 = _mm_maddd_epi16(_pA1, _pB1, _sum3); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA0, _pB0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA0, _pB1)); - _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_pA1, _pB0)); - _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_pA1, _pB1)); -#endif + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm_comp_dpwssd_epi32(_sum2, _pA1, _pB0); + _sum3 = _mm_comp_dpwssd_epi32(_sum3, _pA1, _pB1); pA += 8; pB += 8; @@ -2582,13 +2463,8 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m128i _pB0 = _mm_castpd_si128(_mm_load1_pd((const double*)pB)); __m128i _pB1 = _mm_shuffle_epi32(_pB0, _MM_SHUFFLE(2, 3, 0, 1)); -#if __XOP__ - _sum0 = _mm_maddd_epi16(_pA, _pB0, _sum0); - _sum1 = _mm_maddd_epi16(_pA, _pB1, _sum1); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA, _pB0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA, _pB1)); -#endif + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _pA, _pB0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _pA, _pB1); pA += 8; pB += 4; @@ -2660,11 +2536,7 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, 
__m128i _pA = _mm_loadu_si128((const __m128i*)pA); __m128i _pB = _mm_castps_si128(_mm_load1_ps((const float*)pB)); -#if __XOP__ - _sum0 = _mm_maddd_epi16(_pA, _pB, _sum0); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA, _pB)); -#endif + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _pA, _pB); pA += 8; pB += 2; @@ -2729,13 +2601,9 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m512i _pA0 = _mm512_set1_epi32(((const int*)pA)[0]); __m512i _pA1 = _mm512_set1_epi32(((const int*)pA)[1]); __m512i _pB0 = _mm512_loadu_si512((const __m512i*)pB); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA1, _pB0); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA1, _pB0)); -#endif // __AVX512VNNI__ + + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA1, _pB0); pA += 4; pB += 32; @@ -3091,11 +2959,7 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, __m512i _pA0 = _mm512_set1_epi32(((const int*)pA)[0]); __m512i _pB0 = _mm512_loadu_si512((const __m512i*)pB); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); pA += 2; pB += 32; diff --git a/src/layer/x86/convolution_im2col_gemm_int8.h b/src/layer/x86/convolution_im2col_gemm_int8.h index bda8b96a172..0b211c2b3b8 100644 --- a/src/layer/x86/convolution_im2col_gemm_int8.h +++ b/src/layer/x86/convolution_im2col_gemm_int8.h @@ -738,41 +738,22 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M __m512i _pB2 = _mm512_shuffle_epi32(_pB0, _MM_PERM_BADC); __m512i _pB3 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CBAD); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1); - _sum2 = _mm512_dpwssd_epi32(_sum2, _pA0, _pB2); - _sum3 = _mm512_dpwssd_epi32(_sum3, _pA0, _pB3); - _sum4 = _mm512_dpwssd_epi32(_sum4, _pA1, _pB0); - _sum5 = _mm512_dpwssd_epi32(_sum5, _pA1, _pB1); - _sum6 = _mm512_dpwssd_epi32(_sum6, _pA1, _pB2); - _sum7 = _mm512_dpwssd_epi32(_sum7, _pA1, _pB3); - _sum8 = _mm512_dpwssd_epi32(_sum8, _pA2, _pB0); - _sum9 = _mm512_dpwssd_epi32(_sum9, _pA2, _pB1); - _suma = _mm512_dpwssd_epi32(_suma, _pA2, _pB2); - _sumb = _mm512_dpwssd_epi32(_sumb, _pA2, _pB3); - _sumc = _mm512_dpwssd_epi32(_sumc, _pA3, _pB0); - _sumd = _mm512_dpwssd_epi32(_sumd, _pA3, _pB1); - _sume = _mm512_dpwssd_epi32(_sume, _pA3, _pB2); - _sumf = _mm512_dpwssd_epi32(_sumf, _pA3, _pB3); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA0, _pB2)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA0, _pB3)); - _sum4 = _mm512_add_epi32(_sum4, _mm512_madd_epi16(_pA1, _pB0)); - _sum5 = _mm512_add_epi32(_sum5, _mm512_madd_epi16(_pA1, _pB1)); - _sum6 = _mm512_add_epi32(_sum6, _mm512_madd_epi16(_pA1, _pB2)); - _sum7 = _mm512_add_epi32(_sum7, _mm512_madd_epi16(_pA1, _pB3)); - _sum8 = _mm512_add_epi32(_sum8, _mm512_madd_epi16(_pA2, _pB0)); - _sum9 = _mm512_add_epi32(_sum9, _mm512_madd_epi16(_pA2, _pB1)); - _suma = _mm512_add_epi32(_suma, _mm512_madd_epi16(_pA2, _pB2)); - _sumb = _mm512_add_epi32(_sumb, _mm512_madd_epi16(_pA2, _pB3)); - _sumc = _mm512_add_epi32(_sumc, 
_mm512_madd_epi16(_pA3, _pB0)); - _sumd = _mm512_add_epi32(_sumd, _mm512_madd_epi16(_pA3, _pB1)); - _sume = _mm512_add_epi32(_sume, _mm512_madd_epi16(_pA3, _pB2)); - _sumf = _mm512_add_epi32(_sumf, _mm512_madd_epi16(_pA3, _pB3)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _pA0, _pB2); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _pA0, _pB3); + _sum4 = _mm512_comp_dpwssd_epi32(_sum4, _pA1, _pB0); + _sum5 = _mm512_comp_dpwssd_epi32(_sum5, _pA1, _pB1); + _sum6 = _mm512_comp_dpwssd_epi32(_sum6, _pA1, _pB2); + _sum7 = _mm512_comp_dpwssd_epi32(_sum7, _pA1, _pB3); + _sum8 = _mm512_comp_dpwssd_epi32(_sum8, _pA2, _pB0); + _sum9 = _mm512_comp_dpwssd_epi32(_sum9, _pA2, _pB1); + _suma = _mm512_comp_dpwssd_epi32(_suma, _pA2, _pB2); + _sumb = _mm512_comp_dpwssd_epi32(_sumb, _pA2, _pB3); + _sumc = _mm512_comp_dpwssd_epi32(_sumc, _pA3, _pB0); + _sumd = _mm512_comp_dpwssd_epi32(_sumd, _pA3, _pB1); + _sume = _mm512_comp_dpwssd_epi32(_sume, _pA3, _pB2); + _sumf = _mm512_comp_dpwssd_epi32(_sumf, _pA3, _pB3); pA += 32; pB += 32; @@ -1480,25 +1461,14 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M __m512i _pB2 = _mm512_shuffle_epi32(_pB0, _MM_PERM_BADC); __m512i _pB3 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CBAD); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1); - _sum2 = _mm512_dpwssd_epi32(_sum2, _pA0, _pB2); - _sum3 = _mm512_dpwssd_epi32(_sum3, _pA0, _pB3); - _sum4 = _mm512_dpwssd_epi32(_sum4, _pA1, _pB0); - _sum5 = _mm512_dpwssd_epi32(_sum5, _pA1, _pB1); - _sum6 = _mm512_dpwssd_epi32(_sum6, _pA1, _pB2); - _sum7 = _mm512_dpwssd_epi32(_sum7, _pA1, _pB3); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA0, _pB2)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA0, _pB3)); - _sum4 = _mm512_add_epi32(_sum4, _mm512_madd_epi16(_pA1, _pB0)); - _sum5 = _mm512_add_epi32(_sum5, _mm512_madd_epi16(_pA1, _pB1)); - _sum6 = _mm512_add_epi32(_sum6, _mm512_madd_epi16(_pA1, _pB2)); - _sum7 = _mm512_add_epi32(_sum7, _mm512_madd_epi16(_pA1, _pB3)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _pA0, _pB2); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _pA0, _pB3); + _sum4 = _mm512_comp_dpwssd_epi32(_sum4, _pA1, _pB0); + _sum5 = _mm512_comp_dpwssd_epi32(_sum5, _pA1, _pB1); + _sum6 = _mm512_comp_dpwssd_epi32(_sum6, _pA1, _pB2); + _sum7 = _mm512_comp_dpwssd_epi32(_sum7, _pA1, _pB3); pA += 32; pB += 16; @@ -1914,17 +1884,10 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M // 1230 1230 1230 1230 __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1); - _sum2 = _mm512_dpwssd_epi32(_sum2, _pA1, _pB0); - _sum3 = _mm512_dpwssd_epi32(_sum3, _pA1, _pB1); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA1, _pB0)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA1, _pB1)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = 
_mm512_comp_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _pA1, _pB0); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _pA1, _pB1); pA += 32; pB += 8; @@ -2164,13 +2127,8 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M // 1010 1010 1010 1010 __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CDAB); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA0, _pB1); pA += 32; pB += 4; @@ -2286,11 +2244,7 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M // 0xxx0xxx0xxx0xxx -> 00000000... __m512i _pB0 = _mm512_shuffle_epi32(_pBBBB, _MM_PERM_AAAA); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); pA += 32; pB += 2; @@ -2417,25 +2371,14 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M __m512i _pB2 = _mm512_shuffle_epi32(_pB0, _MM_PERM_BADC); __m512i _pB3 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CBAD); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA00, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA00, _pB1); - _sum2 = _mm512_dpwssd_epi32(_sum2, _pA00, _pB2); - _sum3 = _mm512_dpwssd_epi32(_sum3, _pA00, _pB3); - _sum4 = _mm512_dpwssd_epi32(_sum4, _pA11, _pB0); - _sum5 = _mm512_dpwssd_epi32(_sum5, _pA11, _pB1); - _sum6 = _mm512_dpwssd_epi32(_sum6, _pA11, _pB2); - _sum7 = _mm512_dpwssd_epi32(_sum7, _pA11, _pB3); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA00, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA00, _pB1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA00, _pB2)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA00, _pB3)); - _sum4 = _mm512_add_epi32(_sum4, _mm512_madd_epi16(_pA11, _pB0)); - _sum5 = _mm512_add_epi32(_sum5, _mm512_madd_epi16(_pA11, _pB1)); - _sum6 = _mm512_add_epi32(_sum6, _mm512_madd_epi16(_pA11, _pB2)); - _sum7 = _mm512_add_epi32(_sum7, _mm512_madd_epi16(_pA11, _pB3)); -#endif // __AVX512VNNI__ + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA00, _pB0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA00, _pB1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _pA00, _pB2); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _pA00, _pB3); + _sum4 = _mm512_comp_dpwssd_epi32(_sum4, _pA11, _pB0); + _sum5 = _mm512_comp_dpwssd_epi32(_sum5, _pA11, _pB1); + _sum6 = _mm512_comp_dpwssd_epi32(_sum6, _pA11, _pB2); + _sum7 = _mm512_comp_dpwssd_epi32(_sum7, _pA11, _pB3); pA += 16; pB += 32; @@ -2802,17 +2745,10 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M __m512i _pB01 = _mm512_inserti32x8(_mm512_castsi256_si512(_pB0), _pB1, 1); __m512i _pB23 = _mm512_shuffle_epi32(_pB01, _MM_PERM_BADC); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA00, _pB01); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA00, _pB23); - _sum2 = _mm512_dpwssd_epi32(_sum2, _pA11, _pB01); - _sum3 = _mm512_dpwssd_epi32(_sum3, _pA11, _pB23); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA00, _pB01)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA00, _pB23)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA11, _pB01)); - _sum3 = 
_mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA11, _pB23)); -#endif // __AVX512VNNI__ + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA00, _pB01); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA00, _pB23); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _pA11, _pB01); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _pA11, _pB23); #else // __AVX512F__ // 0123 4567 @@ -2827,7 +2763,6 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M __m256i _pB2 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(1, 0, 3, 2)); __m256i _pB3 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(2, 1, 0, 3)); -#if __AVXVNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1); _sum2 = _mm256_comp_dpwssd_epi32(_sum2, _pA0, _pB2); @@ -2836,16 +2771,6 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M _sum5 = _mm256_comp_dpwssd_epi32(_sum5, _pA1, _pB1); _sum6 = _mm256_comp_dpwssd_epi32(_sum6, _pA1, _pB2); _sum7 = _mm256_comp_dpwssd_epi32(_sum7, _pA1, _pB3); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_pA0, _pB2)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_pA0, _pB3)); - _sum4 = _mm256_add_epi32(_sum4, _mm256_madd_epi16(_pA1, _pB0)); - _sum5 = _mm256_add_epi32(_sum5, _mm256_madd_epi16(_pA1, _pB1)); - _sum6 = _mm256_add_epi32(_sum6, _mm256_madd_epi16(_pA1, _pB2)); - _sum7 = _mm256_add_epi32(_sum7, _mm256_madd_epi16(_pA1, _pB3)); -#endif // __AVXVNNI__ #endif // __AVX512F__ pA += 16; @@ -3315,17 +3240,10 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M // 1230 1230 __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1); _sum2 = _mm256_comp_dpwssd_epi32(_sum2, _pA1, _pB0); _sum3 = _mm256_comp_dpwssd_epi32(_sum3, _pA1, _pB1); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_pA1, _pB0)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_pA1, _pB1)); -#endif pA += 16; pB += 8; @@ -3517,13 +3435,8 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M // 1010 1010 __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 1, 0, 1)); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1)); -#endif pA += 16; pB += 4; @@ -3653,11 +3566,7 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M // 0xxx0xxx -> 00000000 11111111 __m256i _pB0 = _mm256_shuffle_epi32(_pBB, _MM_SHUFFLE(0, 0, 0, 0)); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); -#endif pA += 16; pB += 2; @@ -3773,17 +3682,10 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M // 1230 5674 9ab8 defc __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1); - _sum2 = 
_mm512_dpwssd_epi32(_sum2, _pA1, _pB0); - _sum3 = _mm512_dpwssd_epi32(_sum3, _pA1, _pB1); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA1, _pB0)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA1, _pB1)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _pA1, _pB0); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _pA1, _pB1); pA += 8; pB += 32; @@ -3983,17 +3885,10 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M // 1230 5674 __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1); _sum2 = _mm256_comp_dpwssd_epi32(_sum2, _pA1, _pB0); _sum3 = _mm256_comp_dpwssd_epi32(_sum3, _pA1, _pB1); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_pA1, _pB0)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_pA1, _pB1)); -#endif #else // __AVX2__ #if __SSE4_1__ _pA = _mm_cvtepi8_epi16(_pA); @@ -4018,25 +3913,14 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M __m128i _pB2 = _mm_shuffle_epi32(_pBl, _MM_SHUFFLE(0, 3, 2, 1)); __m128i _pB3 = _mm_shuffle_epi32(_pBh, _MM_SHUFFLE(0, 3, 2, 1)); -#if __XOP__ - _sum0 = _mm_maddd_epi16(_pA0, _pB0, _sum0); - _sum1 = _mm_maddd_epi16(_pA0, _pB1, _sum1); - _sum2 = _mm_maddd_epi16(_pA0, _pB2, _sum2); - _sum3 = _mm_maddd_epi16(_pA0, _pB3, _sum3); - _sum4 = _mm_maddd_epi16(_pA1, _pB0, _sum4); - _sum5 = _mm_maddd_epi16(_pA1, _pB1, _sum5); - _sum6 = _mm_maddd_epi16(_pA1, _pB2, _sum6); - _sum7 = _mm_maddd_epi16(_pA1, _pB3, _sum7); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA0, _pB0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA0, _pB1)); - _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_pA0, _pB2)); - _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_pA0, _pB3)); - _sum4 = _mm_add_epi32(_sum4, _mm_madd_epi16(_pA1, _pB0)); - _sum5 = _mm_add_epi32(_sum5, _mm_madd_epi16(_pA1, _pB1)); - _sum6 = _mm_add_epi32(_sum6, _mm_madd_epi16(_pA1, _pB2)); - _sum7 = _mm_add_epi32(_sum7, _mm_madd_epi16(_pA1, _pB3)); -#endif + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm_comp_dpwssd_epi32(_sum2, _pA0, _pB2); + _sum3 = _mm_comp_dpwssd_epi32(_sum3, _pA0, _pB3); + _sum4 = _mm_comp_dpwssd_epi32(_sum4, _pA1, _pB0); + _sum5 = _mm_comp_dpwssd_epi32(_sum5, _pA1, _pB1); + _sum6 = _mm_comp_dpwssd_epi32(_sum6, _pA1, _pB2); + _sum7 = _mm_comp_dpwssd_epi32(_sum7, _pA1, _pB3); #endif // __AVX2__ pA += 8; @@ -4381,17 +4265,10 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M __m128i _pB0 = _pB; __m128i _pB1 = _mm_shuffle_epi32(_pB, _MM_SHUFFLE(0, 3, 2, 1)); -#if __XOP__ - _sum0 = _mm_maddd_epi16(_pA0, _pB0, _sum0); - _sum1 = _mm_maddd_epi16(_pA0, _pB1, _sum1); - _sum2 = _mm_maddd_epi16(_pA1, _pB0, _sum2); - _sum3 = _mm_maddd_epi16(_pA1, _pB1, _sum3); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA0, _pB0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA0, _pB1)); - _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_pA1, _pB0)); - _sum3 = _mm_add_epi32(_sum3, 
_mm_madd_epi16(_pA1, _pB1)); -#endif + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm_comp_dpwssd_epi32(_sum2, _pA1, _pB0); + _sum3 = _mm_comp_dpwssd_epi32(_sum3, _pA1, _pB1); pA += 8; pB += 8; @@ -4570,13 +4447,8 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M __m128i _pB0 = _pB; __m128i _pB1 = _mm_shuffle_epi32(_pB, _MM_SHUFFLE(2, 3, 0, 1)); -#if __XOP__ - _sum0 = _mm_maddd_epi16(_pA, _pB0, _sum0); - _sum1 = _mm_maddd_epi16(_pA, _pB1, _sum1); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA, _pB0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA, _pB1)); -#endif + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _pA, _pB0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _pA, _pB1); pA += 8; pB += 4; @@ -4707,11 +4579,7 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M _pB = _mm_unpacklo_epi8(_pB, _mm_cmpgt_epi8(_mm_setzero_si128(), _pB)); #endif -#if __XOP__ - _sum0 = _mm_maddd_epi16(_pA, _pB, _sum0); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA, _pB)); -#endif + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _pA, _pB); pA += 8; pB += 2; @@ -4821,13 +4689,8 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M // 1230 5674 9ab8 defc __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _pA0, _pB1); pA += 4; pB += 32; @@ -4942,13 +4805,8 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M // 1230 5674 __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); -#if __AVX512VNNI__ || __AVXVNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1)); -#endif // __AVX512VNNI__ || __AVXVNNI__ #else // __AVX2__ #if __SSE4_1__ _pA = _mm_cvtepi8_epi16(_pA); @@ -4968,17 +4826,10 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M // 0123 // 4567 -#if __XOP__ - _sum0 = _mm_maddd_epi16(_pA0, _pB0, _sum0); - _sum1 = _mm_maddd_epi16(_pA0, _pB1, _sum1); - _sum2 = _mm_maddd_epi16(_pA1, _pB0, _sum2); - _sum3 = _mm_maddd_epi16(_pA1, _pB1, _sum3); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA0, _pB0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA0, _pB1)); - _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_pA1, _pB0)); - _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_pA1, _pB1)); -#endif + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm_comp_dpwssd_epi32(_sum2, _pA1, _pB0); + _sum3 = _mm_comp_dpwssd_epi32(_sum3, _pA1, _pB1); #endif // __AVX2__ pA += 4; @@ -5158,13 +5009,8 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M __m128i _pB0 = _pB; __m128i _pB1 = _mm_shuffle_epi32(_pB, _MM_SHUFFLE(0, 3, 2, 1)); -#if __XOP__ - _sum0 = _mm_maddd_epi16(_pA, _pB0, _sum0); - _sum1 = _mm_maddd_epi16(_pA, _pB1, _sum1); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA, _pB0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA, 
_pB1)); -#endif + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _pA, _pB0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _pA, _pB1); pA += 4; pB += 8; @@ -5387,11 +5233,7 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M __m512i _pA0 = _mm512_cvtepi8_epi16(_pA); __m512i _pB0 = _mm512_cvtepi8_epi16(_pB); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _pA0, _pB0); pA += 2; pB += 32; @@ -5466,11 +5308,7 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M __m256i _pA0 = _mm256_cvtepi8_epi16(_pA); __m256i _pB0 = _mm256_cvtepi8_epi16(_pB); -#if __AVX512VNNI__ || __AVXVNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); -#endif // __AVX512VNNI__ || __AVXVNNI__ #else // __AVX2__ #if __SSE4_1__ _pA = _mm_cvtepi8_epi16(_pA); @@ -5482,13 +5320,8 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M __m128i _pB0 = _mm_unpacklo_epi8(_pB, _extpB); __m128i _pB1 = _mm_unpackhi_epi8(_pB, _extpB); -#if __XOP__ - _sum0 = _mm_maddd_epi16(_pA, _pB0, _sum0); - _sum1 = _mm_maddd_epi16(_pA, _pB1, _sum1); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA, _pB0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA, _pB1)); -#endif + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _pA, _pB0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _pA, _pB1); #endif // __AVX2__ pA += 2; @@ -5580,11 +5413,7 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M // 0xxx -> 0000 __m128i _pA0 = _mm_shuffle_epi32(_pA, _MM_SHUFFLE(0, 0, 0, 0)); -#if __XOP__ - _sum0 = _mm_maddd_epi16(_pA0, _pB, _sum0); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA0, _pB)); -#endif + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _pA0, _pB); pA += 2; pB += 8; diff --git a/src/layer/x86/convolution_packed_int8.h b/src/layer/x86/convolution_packed_int8.h index 3c854bbcbc3..21166fd191b 100644 --- a/src/layer/x86/convolution_packed_int8.h +++ b/src/layer/x86/convolution_packed_int8.h @@ -1056,73 +1056,38 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _rrrr30 = _mm512_broadcast_i32x4(_mm256_extracti128_si256(_rr3, 0)); __m512i _rrrr31 = _mm512_broadcast_i32x4(_mm256_extracti128_si256(_rr3, 1)); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_AAAA), _w0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_AAAA), _w0); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr20, _MM_PERM_AAAA), _w0); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr30, _MM_PERM_AAAA), _w0); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_BBBB), _w1); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_BBBB), _w1); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr20, _MM_PERM_BBBB), _w1); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr30, _MM_PERM_BBBB), _w1); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_CCCC), _w2); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_CCCC), _w2); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr20, _MM_PERM_CCCC), _w2); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr30, _MM_PERM_CCCC), _w2); - _sum0 = 
_mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_DDDD), _w3); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_DDDD), _w3); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr20, _MM_PERM_DDDD), _w3); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr30, _MM_PERM_DDDD), _w3); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_AAAA), _w4); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_AAAA), _w4); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr21, _MM_PERM_AAAA), _w4); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr31, _MM_PERM_AAAA), _w4); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_BBBB), _w5); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_BBBB), _w5); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr21, _MM_PERM_BBBB), _w5); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr31, _MM_PERM_BBBB), _w5); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_CCCC), _w6); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_CCCC), _w6); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr21, _MM_PERM_CCCC), _w6); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr31, _MM_PERM_CCCC), _w6); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_DDDD), _w7); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_DDDD), _w7); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr21, _MM_PERM_DDDD), _w7); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr31, _MM_PERM_DDDD), _w7); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr00, _MM_PERM_AAAA), _w0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr10, _MM_PERM_AAAA), _w0)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr20, _MM_PERM_AAAA), _w0)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr30, _MM_PERM_AAAA), _w0)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr00, _MM_PERM_BBBB), _w1)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr10, _MM_PERM_BBBB), _w1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr20, _MM_PERM_BBBB), _w1)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr30, _MM_PERM_BBBB), _w1)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr00, _MM_PERM_CCCC), _w2)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr10, _MM_PERM_CCCC), _w2)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr20, _MM_PERM_CCCC), _w2)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr30, _MM_PERM_CCCC), _w2)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr00, _MM_PERM_DDDD), _w3)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr10, _MM_PERM_DDDD), _w3)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr20, _MM_PERM_DDDD), _w3)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr30, _MM_PERM_DDDD), _w3)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr01, _MM_PERM_AAAA), 
_w4)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr11, _MM_PERM_AAAA), _w4)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr21, _MM_PERM_AAAA), _w4)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr31, _MM_PERM_AAAA), _w4)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr01, _MM_PERM_BBBB), _w5)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr11, _MM_PERM_BBBB), _w5)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr21, _MM_PERM_BBBB), _w5)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr31, _MM_PERM_BBBB), _w5)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr01, _MM_PERM_CCCC), _w6)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr11, _MM_PERM_CCCC), _w6)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr21, _MM_PERM_CCCC), _w6)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr31, _MM_PERM_CCCC), _w6)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr01, _MM_PERM_DDDD), _w7)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr11, _MM_PERM_DDDD), _w7)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr21, _MM_PERM_DDDD), _w7)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr31, _MM_PERM_DDDD), _w7)); -#endif // __AVX512VNNI__ + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_AAAA), _w0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_AAAA), _w0); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr20, _MM_PERM_AAAA), _w0); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr30, _MM_PERM_AAAA), _w0); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_BBBB), _w1); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_BBBB), _w1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr20, _MM_PERM_BBBB), _w1); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr30, _MM_PERM_BBBB), _w1); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_CCCC), _w2); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_CCCC), _w2); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr20, _MM_PERM_CCCC), _w2); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr30, _MM_PERM_CCCC), _w2); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_DDDD), _w3); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_DDDD), _w3); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr20, _MM_PERM_DDDD), _w3); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr30, _MM_PERM_DDDD), _w3); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_AAAA), _w4); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_AAAA), _w4); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr21, _MM_PERM_AAAA), _w4); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr31, _MM_PERM_AAAA), _w4); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, 
_mm512_shuffle_epi32(_rrrr01, _MM_PERM_BBBB), _w5); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_BBBB), _w5); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr21, _MM_PERM_BBBB), _w5); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr31, _MM_PERM_BBBB), _w5); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_CCCC), _w6); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_CCCC), _w6); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr21, _MM_PERM_CCCC), _w6); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr31, _MM_PERM_CCCC), _w6); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_DDDD), _w7); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_DDDD), _w7); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr21, _MM_PERM_DDDD), _w7); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr31, _MM_PERM_DDDD), _w7); kptr += 256; } @@ -1180,41 +1145,22 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _rrrr2 = _mm512_broadcast_i32x4(_r2); __m512i _rrrr3 = _mm512_broadcast_i32x4(_r3); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_AAAA), _w0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_AAAA), _w0); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr2, _MM_PERM_AAAA), _w0); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr3, _MM_PERM_AAAA), _w0); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_BBBB), _w1); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_BBBB), _w1); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr2, _MM_PERM_BBBB), _w1); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr3, _MM_PERM_BBBB), _w1); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_CCCC), _w2); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_CCCC), _w2); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr2, _MM_PERM_CCCC), _w2); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr3, _MM_PERM_CCCC), _w2); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_DDDD), _w3); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_DDDD), _w3); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr2, _MM_PERM_DDDD), _w3); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr3, _MM_PERM_DDDD), _w3); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr0, _MM_PERM_AAAA), _w0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr1, _MM_PERM_AAAA), _w0)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr2, _MM_PERM_AAAA), _w0)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr3, _MM_PERM_AAAA), _w0)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr0, _MM_PERM_BBBB), _w1)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr1, _MM_PERM_BBBB), _w1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr2, _MM_PERM_BBBB), _w1)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr3, 
_MM_PERM_BBBB), _w1)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr0, _MM_PERM_CCCC), _w2)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr1, _MM_PERM_CCCC), _w2)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr2, _MM_PERM_CCCC), _w2)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr3, _MM_PERM_CCCC), _w2)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr0, _MM_PERM_DDDD), _w3)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr1, _MM_PERM_DDDD), _w3)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr2, _MM_PERM_DDDD), _w3)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr3, _MM_PERM_DDDD), _w3)); -#endif // __AVX512VNNI__ + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_AAAA), _w0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_AAAA), _w0); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr2, _MM_PERM_AAAA), _w0); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr3, _MM_PERM_AAAA), _w0); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_BBBB), _w1); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_BBBB), _w1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr2, _MM_PERM_BBBB), _w1); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr3, _MM_PERM_BBBB), _w1); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_CCCC), _w2); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_CCCC), _w2); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr2, _MM_PERM_CCCC), _w2); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr3, _MM_PERM_CCCC), _w2); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_DDDD), _w3); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_DDDD), _w3); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr2, _MM_PERM_DDDD), _w3); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr3, _MM_PERM_DDDD), _w3); kptr += 128; } @@ -1242,17 +1188,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _w = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*)kptr)); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _r0, _w); - _sum1 = _mm512_dpwssd_epi32(_sum1, _r1, _w); - _sum2 = _mm512_dpwssd_epi32(_sum2, _r2, _w); - _sum3 = _mm512_dpwssd_epi32(_sum3, _r3, _w); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_r0, _w)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_r1, _w)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_r2, _w)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_r3, _w)); -#endif // __AVX512VNNI__ + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _r0, _w); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _r1, _w); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _r2, _w); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _r3, _w); kptr += 32; } @@ -1413,41 +1352,22 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _rrrr10 = _mm512_broadcast_i32x4(_mm256_extracti128_si256(_rr1, 0)); __m512i _rrrr11 = _mm512_broadcast_i32x4(_mm256_extracti128_si256(_rr1, 1)); -#if 
__AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_AAAA), _w0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_AAAA), _w0); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_BBBB), _w1); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_BBBB), _w1); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_CCCC), _w2); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_CCCC), _w2); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_DDDD), _w3); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_DDDD), _w3); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_AAAA), _w4); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_AAAA), _w4); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_BBBB), _w5); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_BBBB), _w5); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_CCCC), _w6); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_CCCC), _w6); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_DDDD), _w7); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_DDDD), _w7); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr00, _MM_PERM_AAAA), _w0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr10, _MM_PERM_AAAA), _w0)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr00, _MM_PERM_BBBB), _w1)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr10, _MM_PERM_BBBB), _w1)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr00, _MM_PERM_CCCC), _w2)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr10, _MM_PERM_CCCC), _w2)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr00, _MM_PERM_DDDD), _w3)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr10, _MM_PERM_DDDD), _w3)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr01, _MM_PERM_AAAA), _w4)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr11, _MM_PERM_AAAA), _w4)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr01, _MM_PERM_BBBB), _w5)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr11, _MM_PERM_BBBB), _w5)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr01, _MM_PERM_CCCC), _w6)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr11, _MM_PERM_CCCC), _w6)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr01, _MM_PERM_DDDD), _w7)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr11, _MM_PERM_DDDD), _w7)); -#endif // __AVX512VNNI__ + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_AAAA), _w0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_AAAA), _w0); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_BBBB), _w1); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_BBBB), _w1); + _sum0 = 
_mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_CCCC), _w2); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_CCCC), _w2); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_DDDD), _w3); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr10, _MM_PERM_DDDD), _w3); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_AAAA), _w4); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_AAAA), _w4); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_BBBB), _w5); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_BBBB), _w5); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_CCCC), _w6); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_CCCC), _w6); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_DDDD), _w7); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr11, _MM_PERM_DDDD), _w7); kptr += 256; } @@ -1491,25 +1411,14 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _rrrr0 = _mm512_broadcast_i32x4(_r0); __m512i _rrrr1 = _mm512_broadcast_i32x4(_r1); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_AAAA), _w0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_AAAA), _w0); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_BBBB), _w1); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_BBBB), _w1); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_CCCC), _w2); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_CCCC), _w2); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_DDDD), _w3); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_DDDD), _w3); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr0, _MM_PERM_AAAA), _w0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr1, _MM_PERM_AAAA), _w0)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr0, _MM_PERM_BBBB), _w1)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr1, _MM_PERM_BBBB), _w1)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr0, _MM_PERM_CCCC), _w2)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr1, _MM_PERM_CCCC), _w2)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr0, _MM_PERM_DDDD), _w3)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr1, _MM_PERM_DDDD), _w3)); -#endif // __AVX512VNNI__ + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_AAAA), _w0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_AAAA), _w0); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_BBBB), _w1); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_BBBB), _w1); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_CCCC), _w2); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_CCCC), _w2); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr0, 
_MM_PERM_DDDD), _w3); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr1, _MM_PERM_DDDD), _w3); kptr += 128; } @@ -1531,13 +1440,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _w = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*)kptr)); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _r0, _w); - _sum1 = _mm512_dpwssd_epi32(_sum1, _r1, _w); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_r0, _w)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_r1, _w)); -#endif // __AVX512VNNI__ + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _r0, _w); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _r1, _w); kptr += 32; } @@ -1664,25 +1568,14 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _rrrr00 = _mm512_broadcast_i32x4(_mm256_extracti128_si256(_rr0, 0)); __m512i _rrrr01 = _mm512_broadcast_i32x4(_mm256_extracti128_si256(_rr0, 1)); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_AAAA), _w0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_BBBB), _w1); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_CCCC), _w2); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_DDDD), _w3); - _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_AAAA), _w4); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_BBBB), _w5); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_CCCC), _w6); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_DDDD), _w7); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr00, _MM_PERM_AAAA), _w0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr00, _MM_PERM_BBBB), _w1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr00, _MM_PERM_CCCC), _w2)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr00, _MM_PERM_DDDD), _w3)); - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr01, _MM_PERM_AAAA), _w4)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr01, _MM_PERM_BBBB), _w5)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr01, _MM_PERM_CCCC), _w6)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr01, _MM_PERM_DDDD), _w7)); -#endif // __AVX512VNNI__ + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_AAAA), _w0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_BBBB), _w1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_CCCC), _w2); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr00, _MM_PERM_DDDD), _w3); + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_AAAA), _w4); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_BBBB), _w5); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_CCCC), _w6); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr01, _MM_PERM_DDDD), _w7); kptr += 256; } @@ -1719,17 +1612,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // 01234567 -> 01010101 01010101 01010101 01010101 __m512i _rrrr0 = _mm512_broadcast_i32x4(_r0); -#if __AVX512VNNI__ 
- _sum0 = _mm512_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_AAAA), _w0); - _sum1 = _mm512_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_BBBB), _w1); - _sum2 = _mm512_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_CCCC), _w2); - _sum3 = _mm512_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_DDDD), _w3); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr0, _MM_PERM_AAAA), _w0)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr0, _MM_PERM_BBBB), _w1)); - _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr0, _MM_PERM_CCCC), _w2)); - _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_mm512_shuffle_epi32(_rrrr0, _MM_PERM_DDDD), _w3)); -#endif // __AVX512VNNI__ + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_AAAA), _w0); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_BBBB), _w1); + _sum2 = _mm512_comp_dpwssd_epi32(_sum2, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_CCCC), _w2); + _sum3 = _mm512_comp_dpwssd_epi32(_sum3, _mm512_shuffle_epi32(_rrrr0, _MM_PERM_DDDD), _w3); kptr += 128; } @@ -1748,11 +1634,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _w = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*)kptr)); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _val, _w); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_val, _w)); -#endif + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _val, _w); kptr += 32; } @@ -1941,41 +1823,22 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _rrr3l = _mm512_unpacklo_epi64(_rrr3, _rrr3); __m512i _rrr3h = _mm512_unpackhi_epi64(_rrr3, _rrr3); -#if __AVX512VNNI__ - _sum00 = _mm512_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); - _sum11 = _mm512_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); - _sum22 = _mm512_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); - _sum33 = _mm512_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); - _sum00 = _mm512_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); - _sum11 = _mm512_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); - _sum22 = _mm512_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); - _sum33 = _mm512_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); - _sum00 = _mm512_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); - _sum11 = _mm512_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); - _sum22 = _mm512_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); - _sum33 = _mm512_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); - _sum00 = _mm512_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); - _sum11 = _mm512_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); - _sum22 = _mm512_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); - _sum33 = _mm512_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); -#else - 
_sum00 = _mm512_add_epi32(_sum00, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum11 = _mm512_add_epi32(_sum11, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum22 = _mm512_add_epi32(_sum22, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum33 = _mm512_add_epi32(_sum33, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum00 = _mm512_add_epi32(_sum00, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); - _sum11 = _mm512_add_epi32(_sum11, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); - _sum22 = _mm512_add_epi32(_sum22, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); - _sum33 = _mm512_add_epi32(_sum33, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); - _sum00 = _mm512_add_epi32(_sum00, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(1, 1, 1, 1)), _w2)); - _sum11 = _mm512_add_epi32(_sum11, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(1, 1, 1, 1)), _w2)); - _sum22 = _mm512_add_epi32(_sum22, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(1, 1, 1, 1)), _w2)); - _sum33 = _mm512_add_epi32(_sum33, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(1, 1, 1, 1)), _w2)); - _sum00 = _mm512_add_epi32(_sum00, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); - _sum11 = _mm512_add_epi32(_sum11, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); - _sum22 = _mm512_add_epi32(_sum22, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); - _sum33 = _mm512_add_epi32(_sum33, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); -#endif // __AVX512VNNI__ + _sum00 = _mm512_comp_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum11 = _mm512_comp_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum22 = _mm512_comp_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum33 = _mm512_comp_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum00 = _mm512_comp_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); + _sum11 = _mm512_comp_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); + _sum22 = _mm512_comp_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); + _sum33 = _mm512_comp_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); + _sum00 = _mm512_comp_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); + _sum11 = _mm512_comp_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); + _sum22 = _mm512_comp_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); + _sum33 = _mm512_comp_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); + _sum00 = _mm512_comp_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); + _sum11 = _mm512_comp_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, 
_rrr1h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); + _sum22 = _mm512_comp_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); + _sum33 = _mm512_comp_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); kptr += 128; } @@ -2063,7 +1926,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m256i _rr2 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r2), _r2, 1); __m256i _rr3 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r3), _r3, 1); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(0, 0, 0, 0)), _w0); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(0, 0, 0, 0)), _w0); _sum2 = _mm256_comp_dpwssd_epi32(_sum2, _mm256_shuffle_epi32(_rr2, _MM_SHUFFLE(0, 0, 0, 0)), _w0); @@ -2080,24 +1942,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(3, 3, 3, 3)), _w3); _sum2 = _mm256_comp_dpwssd_epi32(_sum2, _mm256_shuffle_epi32(_rr2, _MM_SHUFFLE(3, 3, 3, 3)), _w3); _sum3 = _mm256_comp_dpwssd_epi32(_sum3, _mm256_shuffle_epi32(_rr3, _MM_SHUFFLE(3, 3, 3, 3)), _w3); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr2, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr3, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr2, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr3, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr2, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr3, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr2, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr3, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); -#endif kptr += 64; } @@ -2129,17 +1973,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m256i _w = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)kptr)); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _rr0, _w); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _rr1, _w); _sum2 = _mm256_comp_dpwssd_epi32(_sum2, _rr2, _w); _sum3 = _mm256_comp_dpwssd_epi32(_sum3, _rr3, _w); -#else - _sum0 = _mm256_add_epi32(_sum0, 
_mm256_madd_epi16(_rr0, _w)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_rr1, _w)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_rr2, _w)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_rr3, _w)); -#endif kptr += 16; } @@ -2336,25 +2173,14 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _rrr1l = _mm512_unpacklo_epi64(_rrr1, _rrr1); __m512i _rrr1h = _mm512_unpackhi_epi64(_rrr1, _rrr1); -#if __AVX512VNNI__ - _sum00 = _mm512_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); - _sum11 = _mm512_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); - _sum22 = _mm512_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); - _sum33 = _mm512_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); - _sum00 = _mm512_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); - _sum11 = _mm512_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); - _sum22 = _mm512_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); - _sum33 = _mm512_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); -#else - _sum00 = _mm512_add_epi32(_sum00, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum11 = _mm512_add_epi32(_sum11, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum22 = _mm512_add_epi32(_sum22, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); - _sum33 = _mm512_add_epi32(_sum33, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); - _sum00 = _mm512_add_epi32(_sum00, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(1, 1, 1, 1)), _w2)); - _sum11 = _mm512_add_epi32(_sum11, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(1, 1, 1, 1)), _w2)); - _sum22 = _mm512_add_epi32(_sum22, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); - _sum33 = _mm512_add_epi32(_sum33, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); -#endif // __AVX512VNNI__ + _sum00 = _mm512_comp_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum11 = _mm512_comp_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum22 = _mm512_comp_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); + _sum33 = _mm512_comp_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); + _sum00 = _mm512_comp_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); + _sum11 = _mm512_comp_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); + _sum22 = _mm512_comp_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); + _sum33 = _mm512_comp_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); kptr += 128; } @@ -2422,7 +2248,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m256i _rr0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r0), _r0, 1); __m256i _rr1 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r1), _r1, 1); -#if 
__AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(0, 0, 0, 0)), _w0); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(0, 0, 0, 0)), _w0); _sum2 = _mm256_comp_dpwssd_epi32(_sum2, _mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(1, 1, 1, 1)), _w1); @@ -2431,16 +2256,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(2, 2, 2, 2)), _w2); _sum2 = _mm256_comp_dpwssd_epi32(_sum2, _mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(3, 3, 3, 3)), _w3); _sum3 = _mm256_comp_dpwssd_epi32(_sum3, _mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(3, 3, 3, 3)), _w3); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); -#endif kptr += 64; } @@ -2464,13 +2279,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m256i _w = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)kptr)); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _rr0, _w); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _rr1, _w); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_rr0, _w)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_rr1, _w)); -#endif kptr += 16; } @@ -2614,17 +2424,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _rrr0l = _mm512_unpacklo_epi64(_rrr0, _rrr0); __m512i _rrr0h = _mm512_unpackhi_epi64(_rrr0, _rrr0); -#if __AVX512VNNI__ - _sum00 = _mm512_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); - _sum11 = _mm512_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); - _sum22 = _mm512_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); - _sum33 = _mm512_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); -#else - _sum00 = _mm512_add_epi32(_sum00, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum11 = _mm512_add_epi32(_sum11, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); - _sum22 = _mm512_add_epi32(_sum22, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(1, 1, 1, 1)), _w2)); - _sum33 = _mm512_add_epi32(_sum33, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); -#endif // __AVX512VNNI__ + _sum00 = _mm512_comp_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum11 = _mm512_comp_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 2, 2, 2)), _w1); + _sum22 = 
_mm512_comp_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(1, 1, 1, 1)), _w2); + _sum33 = _mm512_comp_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 3, 3, 3)), _w3); kptr += 128; } @@ -2683,17 +2486,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // 01234567 -> 01010101 01010101 __m256i _rr0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r0), _r0, 1); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(0, 0, 0, 0)), _w0); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(1, 1, 1, 1)), _w1); _sum2 = _mm256_comp_dpwssd_epi32(_sum2, _mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(2, 2, 2, 2)), _w2); _sum3 = _mm256_comp_dpwssd_epi32(_sum3, _mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(3, 3, 3, 3)), _w3); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); -#endif kptr += 64; } @@ -2717,11 +2513,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m256i _w = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)kptr)); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _val, _w); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_val, _w)); -#endif kptr += 16; } @@ -2915,25 +2707,14 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _rrr3l = _mm512_unpacklo_epi64(_rrr3, _rrr3); __m512i _rrr3h = _mm512_unpackhi_epi64(_rrr3, _rrr3); -#if __AVX512VNNI__ - _sum00 = _mm512_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); - _sum11 = _mm512_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); - _sum22 = _mm512_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); - _sum33 = _mm512_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); - _sum00 = _mm512_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); - _sum11 = _mm512_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); - _sum22 = _mm512_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); - _sum33 = _mm512_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); -#else - _sum00 = _mm512_add_epi32(_sum00, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 0, 2, 0)), _w0)); - _sum11 = _mm512_add_epi32(_sum11, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(2, 0, 2, 0)), _w0)); - _sum22 = _mm512_add_epi32(_sum22, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(2, 0, 2, 0)), _w0)); - _sum33 = _mm512_add_epi32(_sum33, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(2, 0, 2, 0)), _w0)); - _sum00 = _mm512_add_epi32(_sum00, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 1, 3, 1)), _w1)); - _sum11 = _mm512_add_epi32(_sum11, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(3, 1, 3, 1)), 
_w1)); - _sum22 = _mm512_add_epi32(_sum22, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(3, 1, 3, 1)), _w1)); - _sum33 = _mm512_add_epi32(_sum33, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(3, 1, 3, 1)), _w1)); -#endif // __AVX512VNNI__ + _sum00 = _mm512_comp_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); + _sum11 = _mm512_comp_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); + _sum22 = _mm512_comp_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); + _sum33 = _mm512_comp_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); + _sum00 = _mm512_comp_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); + _sum11 = _mm512_comp_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); + _sum22 = _mm512_comp_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr2l, _rrr2h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); + _sum33 = _mm512_comp_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr3l, _rrr3h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); kptr += 64; } @@ -3037,7 +2818,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m256i _rr2 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r2), _mm_shuffle_epi32(_r2, _MM_SHUFFLE(2, 3, 0, 1)), 1); __m256i _rr3 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r3), _mm_shuffle_epi32(_r3, _MM_SHUFFLE(2, 3, 0, 1)), 1); -#if __AVXVNNI__ || __AVX512VNNI__ _sum00 = _mm256_comp_dpwssd_epi32(_sum00, _mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(0, 0, 0, 0)), _w0); _sum11 = _mm256_comp_dpwssd_epi32(_sum11, _mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(0, 0, 0, 0)), _w0); _sum22 = _mm256_comp_dpwssd_epi32(_sum22, _mm256_shuffle_epi32(_rr2, _MM_SHUFFLE(0, 0, 0, 0)), _w0); @@ -3046,16 +2826,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const _sum11 = _mm256_comp_dpwssd_epi32(_sum11, _mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(2, 2, 2, 2)), _w1); _sum22 = _mm256_comp_dpwssd_epi32(_sum22, _mm256_shuffle_epi32(_rr2, _MM_SHUFFLE(2, 2, 2, 2)), _w1); _sum33 = _mm256_comp_dpwssd_epi32(_sum33, _mm256_shuffle_epi32(_rr3, _MM_SHUFFLE(2, 2, 2, 2)), _w1); -#else - _sum00 = _mm256_add_epi32(_sum00, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum11 = _mm256_add_epi32(_sum11, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum22 = _mm256_add_epi32(_sum22, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr2, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum33 = _mm256_add_epi32(_sum33, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr3, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum00 = _mm256_add_epi32(_sum00, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); - _sum11 = _mm256_add_epi32(_sum11, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); - _sum22 = _mm256_add_epi32(_sum22, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr2, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); - _sum33 = _mm256_add_epi32(_sum33, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr3, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); -#endif #else // __AVX2__ __m128i _w01 = _mm_load_si128((const __m128i*)kptr); __m128i _w23 = _mm_load_si128((const __m128i*)(kptr + 16)); @@ -3067,41 +2837,22 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m128i _w3 = _mm_unpackhi_epi8(_w23, _extw23); // 01234567 -> 01010101 -#if __XOP__ - _sum0 = 
_mm_maddd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(0, 0, 0, 0)), _w0, _sum0); - _sum1 = _mm_maddd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(0, 0, 0, 0)), _w0, _sum1); - _sum2 = _mm_maddd_epi16(_mm_shuffle_epi32(_r2, _MM_SHUFFLE(0, 0, 0, 0)), _w0, _sum2); - _sum3 = _mm_maddd_epi16(_mm_shuffle_epi32(_r3, _MM_SHUFFLE(0, 0, 0, 0)), _w0, _sum3); - _sum0 = _mm_maddd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(1, 1, 1, 1)), _w1, _sum0); - _sum1 = _mm_maddd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(1, 1, 1, 1)), _w1, _sum1); - _sum2 = _mm_maddd_epi16(_mm_shuffle_epi32(_r2, _MM_SHUFFLE(1, 1, 1, 1)), _w1, _sum2); - _sum3 = _mm_maddd_epi16(_mm_shuffle_epi32(_r3, _MM_SHUFFLE(1, 1, 1, 1)), _w1, _sum3); - _sum0 = _mm_maddd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(2, 2, 2, 2)), _w2, _sum0); - _sum1 = _mm_maddd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(2, 2, 2, 2)), _w2, _sum1); - _sum2 = _mm_maddd_epi16(_mm_shuffle_epi32(_r2, _MM_SHUFFLE(2, 2, 2, 2)), _w2, _sum2); - _sum3 = _mm_maddd_epi16(_mm_shuffle_epi32(_r3, _MM_SHUFFLE(2, 2, 2, 2)), _w2, _sum3); - _sum0 = _mm_maddd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(3, 3, 3, 3)), _w3, _sum0); - _sum1 = _mm_maddd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(3, 3, 3, 3)), _w3, _sum1); - _sum2 = _mm_maddd_epi16(_mm_shuffle_epi32(_r2, _MM_SHUFFLE(3, 3, 3, 3)), _w3, _sum2); - _sum3 = _mm_maddd_epi16(_mm_shuffle_epi32(_r3, _MM_SHUFFLE(3, 3, 3, 3)), _w3, _sum3); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_mm_shuffle_epi32(_r2, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_mm_shuffle_epi32(_r3, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_mm_shuffle_epi32(_r2, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_mm_shuffle_epi32(_r3, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_mm_shuffle_epi32(_r2, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_mm_shuffle_epi32(_r3, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); - _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_mm_shuffle_epi32(_r2, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); - _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_mm_shuffle_epi32(_r3, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); -#endif // __XOP__ + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _mm_shuffle_epi32(_r0, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _mm_shuffle_epi32(_r1, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum2 = _mm_comp_dpwssd_epi32(_sum2, _mm_shuffle_epi32(_r2, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum3 = _mm_comp_dpwssd_epi32(_sum3, _mm_shuffle_epi32(_r3, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _mm_shuffle_epi32(_r0, _MM_SHUFFLE(1, 1, 1, 1)), _w1); + _sum1 = 
_mm_comp_dpwssd_epi32(_sum1, _mm_shuffle_epi32(_r1, _MM_SHUFFLE(1, 1, 1, 1)), _w1); + _sum2 = _mm_comp_dpwssd_epi32(_sum2, _mm_shuffle_epi32(_r2, _MM_SHUFFLE(1, 1, 1, 1)), _w1); + _sum3 = _mm_comp_dpwssd_epi32(_sum3, _mm_shuffle_epi32(_r3, _MM_SHUFFLE(1, 1, 1, 1)), _w1); + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _mm_shuffle_epi32(_r0, _MM_SHUFFLE(2, 2, 2, 2)), _w2); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _mm_shuffle_epi32(_r1, _MM_SHUFFLE(2, 2, 2, 2)), _w2); + _sum2 = _mm_comp_dpwssd_epi32(_sum2, _mm_shuffle_epi32(_r2, _MM_SHUFFLE(2, 2, 2, 2)), _w2); + _sum3 = _mm_comp_dpwssd_epi32(_sum3, _mm_shuffle_epi32(_r3, _MM_SHUFFLE(2, 2, 2, 2)), _w2); + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _mm_shuffle_epi32(_r0, _MM_SHUFFLE(3, 3, 3, 3)), _w3); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _mm_shuffle_epi32(_r1, _MM_SHUFFLE(3, 3, 3, 3)), _w3); + _sum2 = _mm_comp_dpwssd_epi32(_sum2, _mm_shuffle_epi32(_r2, _MM_SHUFFLE(3, 3, 3, 3)), _w3); + _sum3 = _mm_comp_dpwssd_epi32(_sum3, _mm_shuffle_epi32(_r3, _MM_SHUFFLE(3, 3, 3, 3)), _w3); #endif // __AVX2__ kptr += 32; @@ -3146,22 +2897,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w)); #endif -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm_comp_dpwssd_epi32(_sum0, _r0, _w); _sum1 = _mm_comp_dpwssd_epi32(_sum1, _r1, _w); _sum2 = _mm_comp_dpwssd_epi32(_sum2, _r2, _w); _sum3 = _mm_comp_dpwssd_epi32(_sum3, _r3, _w); -#elif __XOP__ - _sum0 = _mm_maddd_epi16(_r0, _w, _sum0); - _sum1 = _mm_maddd_epi16(_r1, _w, _sum1); - _sum2 = _mm_maddd_epi16(_r2, _w, _sum2); - _sum3 = _mm_maddd_epi16(_r3, _w, _sum3); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_r0, _w)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_r1, _w)); - _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_r2, _w)); - _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_r3, _w)); -#endif kptr += 8; } @@ -3356,17 +3095,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _rrr1l = _mm512_unpacklo_epi64(_rrr1, _rrr1); __m512i _rrr1h = _mm512_unpackhi_epi64(_rrr1, _rrr1); -#if __AVX512VNNI__ - _sum00 = _mm512_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); - _sum11 = _mm512_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); - _sum22 = _mm512_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); - _sum33 = _mm512_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); -#else - _sum00 = _mm512_add_epi32(_sum00, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 0, 2, 0)), _w0)); - _sum11 = _mm512_add_epi32(_sum11, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(2, 0, 2, 0)), _w0)); - _sum22 = _mm512_add_epi32(_sum22, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 1, 3, 1)), _w1)); - _sum33 = _mm512_add_epi32(_sum33, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(3, 1, 3, 1)), _w1)); -#endif // __AVX512VNNI__ + _sum00 = _mm512_comp_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); + _sum11 = _mm512_comp_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); + _sum22 = _mm512_comp_dpwssd_epi32(_sum22, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); + _sum33 = _mm512_comp_dpwssd_epi32(_sum33, _mm512_shuffle_i32x4(_rrr1l, _rrr1h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); kptr 
+= 64; } @@ -3448,17 +3180,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m256i _rr0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r0), _mm_shuffle_epi32(_r0, _MM_SHUFFLE(2, 3, 0, 1)), 1); __m256i _rr1 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r1), _mm_shuffle_epi32(_r1, _MM_SHUFFLE(2, 3, 0, 1)), 1); -#if __AVXVNNI__ || __AVX512VNNI__ _sum00 = _mm256_comp_dpwssd_epi32(_sum00, _mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(0, 0, 0, 0)), _w0); _sum11 = _mm256_comp_dpwssd_epi32(_sum11, _mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(0, 0, 0, 0)), _w0); _sum22 = _mm256_comp_dpwssd_epi32(_sum22, _mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(2, 2, 2, 2)), _w1); _sum33 = _mm256_comp_dpwssd_epi32(_sum33, _mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(2, 2, 2, 2)), _w1); -#else - _sum00 = _mm256_add_epi32(_sum00, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum11 = _mm256_add_epi32(_sum11, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum22 = _mm256_add_epi32(_sum22, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); - _sum33 = _mm256_add_epi32(_sum33, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr1, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); -#endif #else // __AVX2__ __m128i _w01 = _mm_load_si128((const __m128i*)kptr); __m128i _w23 = _mm_load_si128((const __m128i*)(kptr + 16)); @@ -3470,25 +3195,14 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m128i _w3 = _mm_unpackhi_epi8(_w23, _extw23); // 01234567 -> 01010101 -#if __XOP__ - _sum0 = _mm_maddd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(0, 0, 0, 0)), _w0, _sum0); - _sum1 = _mm_maddd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(0, 0, 0, 0)), _w0, _sum1); - _sum2 = _mm_maddd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(1, 1, 1, 1)), _w1, _sum2); - _sum3 = _mm_maddd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(1, 1, 1, 1)), _w1, _sum3); - _sum0 = _mm_maddd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(2, 2, 2, 2)), _w2, _sum0); - _sum1 = _mm_maddd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(2, 2, 2, 2)), _w2, _sum1); - _sum2 = _mm_maddd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(3, 3, 3, 3)), _w3, _sum2); - _sum3 = _mm_maddd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(3, 3, 3, 3)), _w3, _sum3); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); - _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_mm_shuffle_epi32(_r1, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); -#endif // __XOP__ + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _mm_shuffle_epi32(_r0, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _mm_shuffle_epi32(_r1, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum2 = _mm_comp_dpwssd_epi32(_sum2, _mm_shuffle_epi32(_r0, _MM_SHUFFLE(1, 1, 1, 1)), _w1); + _sum3 = _mm_comp_dpwssd_epi32(_sum3, _mm_shuffle_epi32(_r1, _MM_SHUFFLE(1, 1, 1, 1)), _w1); + _sum0 = _mm_comp_dpwssd_epi32(_sum0, 
_mm_shuffle_epi32(_r0, _MM_SHUFFLE(2, 2, 2, 2)), _w2); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _mm_shuffle_epi32(_r1, _MM_SHUFFLE(2, 2, 2, 2)), _w2); + _sum2 = _mm_comp_dpwssd_epi32(_sum2, _mm_shuffle_epi32(_r0, _MM_SHUFFLE(3, 3, 3, 3)), _w3); + _sum3 = _mm_comp_dpwssd_epi32(_sum3, _mm_shuffle_epi32(_r1, _MM_SHUFFLE(3, 3, 3, 3)), _w3); #endif // __AVX2__ kptr += 32; @@ -3527,16 +3241,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w)); #endif -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm_comp_dpwssd_epi32(_sum0, _r0, _w); _sum1 = _mm_comp_dpwssd_epi32(_sum1, _r1, _w); -#elif __XOP__ - _sum0 = _mm_maddd_epi16(_r0, _w, _sum0); - _sum1 = _mm_maddd_epi16(_r1, _w, _sum1); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_r0, _w)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_r1, _w)); -#endif kptr += 8; } @@ -3681,13 +3387,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _rrr0l = _mm512_unpacklo_epi64(_rrr0, _rrr0); __m512i _rrr0h = _mm512_unpackhi_epi64(_rrr0, _rrr0); -#if __AVX512VNNI__ - _sum00 = _mm512_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); - _sum11 = _mm512_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); -#else - _sum00 = _mm512_add_epi32(_sum00, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 0, 2, 0)), _w0)); - _sum11 = _mm512_add_epi32(_sum11, _mm512_madd_epi16(_mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 1, 3, 1)), _w1)); -#endif // __AVX512VNNI__ + _sum00 = _mm512_comp_dpwssd_epi32(_sum00, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(2, 0, 2, 0)), _w0); + _sum11 = _mm512_comp_dpwssd_epi32(_sum11, _mm512_shuffle_i32x4(_rrr0l, _rrr0h, _MM_SHUFFLE(3, 1, 3, 1)), _w1); kptr += 64; } @@ -3752,13 +3453,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // 01234567 -> 01010101 23232323 __m256i _rr0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r0), _mm_shuffle_epi32(_r0, _MM_SHUFFLE(2, 3, 0, 1)), 1); -#if __AVXVNNI__ || __AVX512VNNI__ _sum00 = _mm256_comp_dpwssd_epi32(_sum00, _mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(0, 0, 0, 0)), _w0); _sum11 = _mm256_comp_dpwssd_epi32(_sum11, _mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(2, 2, 2, 2)), _w1); -#else - _sum00 = _mm256_add_epi32(_sum00, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum11 = _mm256_add_epi32(_sum11, _mm256_madd_epi16(_mm256_shuffle_epi32(_rr0, _MM_SHUFFLE(2, 2, 2, 2)), _w1)); -#endif #else // __AVX2__ __m128i _w01 = _mm_load_si128((const __m128i*)kptr); __m128i _w23 = _mm_load_si128((const __m128i*)(kptr + 16)); @@ -3770,17 +3466,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m128i _w3 = _mm_unpackhi_epi8(_w23, _extw23); // 01234567 -> 01010101 -#if __XOP__ - _sum0 = _mm_maddd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(0, 0, 0, 0)), _w0, _sum0); - _sum1 = _mm_maddd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(1, 1, 1, 1)), _w1, _sum1); - _sum2 = _mm_maddd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(2, 2, 2, 2)), _w2, _sum2); - _sum3 = _mm_maddd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(3, 3, 3, 3)), _w3, _sum3); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(0, 0, 0, 0)), _w0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(1, 1, 1, 1)), _w1)); - _sum2 = _mm_add_epi32(_sum2, 
_mm_madd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(2, 2, 2, 2)), _w2)); - _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_mm_shuffle_epi32(_r0, _MM_SHUFFLE(3, 3, 3, 3)), _w3)); -#endif // __XOP__ + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _mm_shuffle_epi32(_r0, _MM_SHUFFLE(0, 0, 0, 0)), _w0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _mm_shuffle_epi32(_r0, _MM_SHUFFLE(1, 1, 1, 1)), _w1); + _sum2 = _mm_comp_dpwssd_epi32(_sum2, _mm_shuffle_epi32(_r0, _MM_SHUFFLE(2, 2, 2, 2)), _w2); + _sum3 = _mm_comp_dpwssd_epi32(_sum3, _mm_shuffle_epi32(_r0, _MM_SHUFFLE(3, 3, 3, 3)), _w3); #endif // __AVX2__ kptr += 32; @@ -3815,13 +3504,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w)); #endif -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm_comp_dpwssd_epi32(_sum0, _r0, _w); -#elif __XOP__ - _sum0 = _mm_maddd_epi16(_r0, _w, _sum0); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_r0, _w)); -#endif kptr += 8; } @@ -3996,17 +3679,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _w = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*)kptr)); -#if __AVX512VNNI__ - _sum00 = _mm512_dpwssd_epi32(_sum00, _valval0, _w); - _sum11 = _mm512_dpwssd_epi32(_sum11, _valval1, _w); - _sum22 = _mm512_dpwssd_epi32(_sum22, _valval2, _w); - _sum33 = _mm512_dpwssd_epi32(_sum33, _valval3, _w); -#else - _sum00 = _mm512_add_epi32(_sum00, _mm512_madd_epi16(_valval0, _w)); - _sum11 = _mm512_add_epi32(_sum11, _mm512_madd_epi16(_valval1, _w)); - _sum22 = _mm512_add_epi32(_sum22, _mm512_madd_epi16(_valval2, _w)); - _sum33 = _mm512_add_epi32(_sum33, _mm512_madd_epi16(_valval3, _w)); -#endif // __AVX512VNNI__ + _sum00 = _mm512_comp_dpwssd_epi32(_sum00, _valval0, _w); + _sum11 = _mm512_comp_dpwssd_epi32(_sum11, _valval1, _w); + _sum22 = _mm512_comp_dpwssd_epi32(_sum22, _valval2, _w); + _sum33 = _mm512_comp_dpwssd_epi32(_sum33, _valval3, _w); kptr += 32; } @@ -4115,41 +3791,23 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m256i _w = _mm256_cvtepi8_epi16(_w01); -#if __AVXVNNI__ || __AVX512VNNI__ _sum00 = _mm256_comp_dpwssd_epi32(_sum00, _valval0, _w); _sum11 = _mm256_comp_dpwssd_epi32(_sum11, _valval1, _w); _sum22 = _mm256_comp_dpwssd_epi32(_sum22, _valval2, _w); _sum33 = _mm256_comp_dpwssd_epi32(_sum33, _valval3, _w); -#else - _sum00 = _mm256_add_epi32(_sum00, _mm256_madd_epi16(_valval0, _w)); - _sum11 = _mm256_add_epi32(_sum11, _mm256_madd_epi16(_valval1, _w)); - _sum22 = _mm256_add_epi32(_sum22, _mm256_madd_epi16(_valval2, _w)); - _sum33 = _mm256_add_epi32(_sum33, _mm256_madd_epi16(_valval3, _w)); -#endif #else // __AVX2__ __m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01); __m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01); __m128i _w1 = _mm_unpackhi_epi8(_w01, _extw01); -#if __XOP__ - _sum00 = _mm_maddd_epi16(_r0, _w0, _sum00); - _sum10 = _mm_maddd_epi16(_r0, _w1, _sum10); - _sum01 = _mm_maddd_epi16(_r1, _w0, _sum01); - _sum11 = _mm_maddd_epi16(_r1, _w1, _sum11); - _sum02 = _mm_maddd_epi16(_r2, _w0, _sum02); - _sum12 = _mm_maddd_epi16(_r2, _w1, _sum12); - _sum03 = _mm_maddd_epi16(_r3, _w0, _sum03); - _sum13 = _mm_maddd_epi16(_r3, _w1, _sum13); -#else - _sum00 = _mm_add_epi32(_sum00, _mm_madd_epi16(_r0, _w0)); - _sum10 = _mm_add_epi32(_sum10, _mm_madd_epi16(_r0, _w1)); - _sum01 = _mm_add_epi32(_sum01, _mm_madd_epi16(_r1, _w0)); - _sum11 = _mm_add_epi32(_sum11, _mm_madd_epi16(_r1, _w1)); - _sum02 = _mm_add_epi32(_sum02, _mm_madd_epi16(_r2, _w0)); - 
_sum12 = _mm_add_epi32(_sum12, _mm_madd_epi16(_r2, _w1)); - _sum03 = _mm_add_epi32(_sum03, _mm_madd_epi16(_r3, _w0)); - _sum13 = _mm_add_epi32(_sum13, _mm_madd_epi16(_r3, _w1)); -#endif // __XOP__ + _sum00 = _mm_comp_dpwssd_epi32(_sum00, _r0, _w0); + _sum10 = _mm_comp_dpwssd_epi32(_sum10, _r0, _w1); + _sum01 = _mm_comp_dpwssd_epi32(_sum01, _r1, _w0); + _sum11 = _mm_comp_dpwssd_epi32(_sum11, _r1, _w1); + _sum02 = _mm_comp_dpwssd_epi32(_sum02, _r2, _w0); + _sum12 = _mm_comp_dpwssd_epi32(_sum12, _r2, _w1); + _sum03 = _mm_comp_dpwssd_epi32(_sum03, _r3, _w0); + _sum13 = _mm_comp_dpwssd_epi32(_sum13, _r3, _w1); #endif // __AVX2__ kptr += 16; @@ -4209,13 +3867,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m128i _w0 = _mm_setr_epi16(kptr[0], kptr[2], kptr[0], kptr[2], kptr[0], kptr[2], kptr[0], kptr[2]); __m128i _w1 = _mm_setr_epi16(kptr[1], kptr[3], kptr[1], kptr[3], kptr[1], kptr[3], kptr[1], kptr[3]); -#if __XOP__ - _sum0 = _mm_maddd_epi16(_r, _w0, _sum0); - _sum1 = _mm_maddd_epi16(_r, _w1, _sum1); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_r, _w0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_r, _w1)); -#endif // __XOP__ + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _r, _w0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _r, _w1); kptr += 4; } @@ -4328,13 +3981,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _w = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*)kptr)); -#if __AVX512VNNI__ - _sum0 = _mm512_dpwssd_epi32(_sum0, _valval0, _w); - _sum1 = _mm512_dpwssd_epi32(_sum1, _valval1, _w); -#else - _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_valval0, _w)); - _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_valval1, _w)); -#endif // __AVX512VNNI__ + _sum0 = _mm512_comp_dpwssd_epi32(_sum0, _valval0, _w); + _sum1 = _mm512_comp_dpwssd_epi32(_sum1, _valval1, _w); kptr += 32; } @@ -4414,29 +4062,17 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m256i _valval0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r0), _r0, 1); __m256i _valval1 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r1), _r1, 1); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _valval0, _w); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _valval1, _w); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_valval0, _w)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_valval1, _w)); -#endif #else // __AVX2__ __m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01); __m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01); __m128i _w1 = _mm_unpackhi_epi8(_w01, _extw01); -#if __XOP__ - _sum00 = _mm_maddd_epi16(_r0, _w0, _sum00); - _sum10 = _mm_maddd_epi16(_r0, _w1, _sum10); - _sum01 = _mm_maddd_epi16(_r1, _w0, _sum01); - _sum11 = _mm_maddd_epi16(_r1, _w1, _sum11); -#else - _sum00 = _mm_add_epi32(_sum00, _mm_madd_epi16(_r0, _w0)); - _sum10 = _mm_add_epi32(_sum10, _mm_madd_epi16(_r0, _w1)); - _sum01 = _mm_add_epi32(_sum01, _mm_madd_epi16(_r1, _w0)); - _sum11 = _mm_add_epi32(_sum11, _mm_madd_epi16(_r1, _w1)); -#endif // __XOP__ + _sum00 = _mm_comp_dpwssd_epi32(_sum00, _r0, _w0); + _sum10 = _mm_comp_dpwssd_epi32(_sum10, _r0, _w1); + _sum01 = _mm_comp_dpwssd_epi32(_sum01, _r1, _w0); + _sum11 = _mm_comp_dpwssd_epi32(_sum11, _r1, _w1); #endif // __AVX2__ kptr += 16; @@ -4563,11 +4199,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m512i _w = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*)kptr)); -#if __AVX512VNNI__ - _sum01 = 
_mm512_dpwssd_epi32(_sum01, _valval, _w); -#else - _sum01 = _mm512_add_epi32(_sum01, _mm512_madd_epi16(_valval, _w)); -#endif + _sum01 = _mm512_comp_dpwssd_epi32(_sum01, _valval, _w); kptr += 32; } @@ -4629,23 +4261,14 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m256i _rr0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r0), _r0, 1); -#if __AVXVNNI__ || __AVX512VNNI__ _sum = _mm256_comp_dpwssd_epi32(_sum, _rr0, _w); -#else - _sum = _mm256_add_epi32(_sum, _mm256_madd_epi16(_rr0, _w)); -#endif #else // __AVX2__ __m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01); __m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01); __m128i _w1 = _mm_unpackhi_epi8(_w01, _extw01); -#if __XOP__ - _sum0 = _mm_maddd_epi16(_r0, _w0, _sum0); - _sum1 = _mm_maddd_epi16(_r0, _w1, _sum1); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_r0, _w0)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_r0, _w1)); -#endif // __XOP__ + _sum0 = _mm_comp_dpwssd_epi32(_sum0, _r0, _w0); + _sum1 = _mm_comp_dpwssd_epi32(_sum1, _r0, _w1); #endif // __AVX2__ kptr += 16; @@ -4796,17 +4419,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m256i _w = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)kptr)); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _val0, _w); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _val1, _w); _sum2 = _mm256_comp_dpwssd_epi32(_sum2, _val2, _w); _sum3 = _mm256_comp_dpwssd_epi32(_sum3, _val3, _w); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_val0, _w)); - _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_val1, _w)); - _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_val2, _w)); - _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_val3, _w)); -#endif kptr += 16; } @@ -4898,22 +4514,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w)); #endif -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm_comp_dpwssd_epi32(_sum0, _r0, _w); _sum1 = _mm_comp_dpwssd_epi32(_sum1, _r1, _w); _sum2 = _mm_comp_dpwssd_epi32(_sum2, _r2, _w); _sum3 = _mm_comp_dpwssd_epi32(_sum3, _r3, _w); -#elif __XOP__ - _sum0 = _mm_maddd_epi16(_r0, _w, _sum0); - _sum1 = _mm_maddd_epi16(_r1, _w, _sum1); - _sum2 = _mm_maddd_epi16(_r2, _w, _sum2); - _sum3 = _mm_maddd_epi16(_r3, _w, _sum3); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_r0, _w)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_r1, _w)); - _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_r2, _w)); - _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_r3, _w)); -#endif kptr += 8; } @@ -4956,11 +4560,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m128i _r = _mm_setr_epi16(r0s[0], r0s[N], r1s[0], r1s[N], r2s[0], r2s[N], r3s[0], r3s[N]); __m128i _w = _mm_setr_epi16(kptr[0], kptr[1], kptr[0], kptr[1], kptr[0], kptr[1], kptr[0], kptr[1]); -#if __XOP__ - _sum = _mm_maddd_epi16(_r, _w, _sum); -#else - _sum = _mm_add_epi32(_sum, _mm_madd_epi16(_r, _w)); -#endif // __XOP__ + _sum = _mm_comp_dpwssd_epi32(_sum, _r, _w); kptr += 2; } @@ -5073,13 +4673,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m256i _w = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)kptr)); -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm256_comp_dpwssd_epi32(_sum0, _val0, _w); _sum1 = _mm256_comp_dpwssd_epi32(_sum1, _val1, _w); -#else - _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_val0, _w)); - _sum1 = 
_mm256_add_epi32(_sum1, _mm256_madd_epi16(_val1, _w)); -#endif kptr += 16; } @@ -5149,16 +4744,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w)); #endif -#if __AVXVNNI__ || __AVX512VNNI__ _sum0 = _mm_comp_dpwssd_epi32(_sum0, _r0, _w); _sum1 = _mm_comp_dpwssd_epi32(_sum1, _r1, _w); -#elif __XOP__ - _sum0 = _mm_maddd_epi16(_r0, _w, _sum0); - _sum1 = _mm_maddd_epi16(_r1, _w, _sum1); -#else - _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_r0, _w)); - _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_r1, _w)); -#endif kptr += 8; } @@ -5264,11 +4851,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const __m256i _w = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)kptr)); -#if __AVXVNNI__ || __AVX512VNNI__ _sum = _mm256_comp_dpwssd_epi32(_sum, _val, _w); -#else - _sum = _mm256_add_epi32(_sum, _mm256_madd_epi16(_val, _w)); -#endif kptr += 16; } @@ -5324,13 +4907,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w)); #endif -#if __AVXVNNI__ || __AVX512VNNI__ _sum = _mm_comp_dpwssd_epi32(_sum, _r0, _w); -#elif __XOP__ - _sum = _mm_maddd_epi16(_r0, _w, _sum); -#else - _sum = _mm_add_epi32(_sum, _mm_madd_epi16(_r0, _w)); -#endif kptr += 8; } diff --git a/src/layer/x86/lstm_int8.h b/src/layer/x86/lstm_int8.h index a6655611e18..0358028cabc 100644 --- a/src/layer/x86/lstm_int8.h +++ b/src/layer/x86/lstm_int8.h @@ -2004,11 +2004,7 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d __m512i _xixi0 = _mm512_shuffle_epi32(_xixi, _MM_PERM_AAAA); -#if __AVX512VNNI__ - _lstm_IFOGx0 = _mm512_dpwssd_epi32(_lstm_IFOGx0, _ww, _xixi0); -#else - _lstm_IFOGx0 = _mm512_add_epi32(_lstm_IFOGx0, _mm512_madd_epi16(_ww, _xixi0)); -#endif // __AVX512VNNI__ + _lstm_IFOGx0 = _mm512_comp_dpwssd_epi32(_lstm_IFOGx0, _ww, _xixi0); kptr += 32; } @@ -2191,11 +2187,7 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d __m512i _hh_cont0 = _mm512_shuffle_epi32(_hh_cont, _MM_PERM_AAAA); -#if __AVX512VNNI__ - _lstm_IFOGh0 = _mm512_dpwssd_epi32(_lstm_IFOGh0, _ww, _hh_cont0); -#else - _lstm_IFOGh0 = _mm512_add_epi32(_lstm_IFOGh0, _mm512_madd_epi16(_ww, _hh_cont0)); -#endif // __AVX512VNNI__ + _lstm_IFOGh0 = _mm512_comp_dpwssd_epi32(_lstm_IFOGh0, _ww, _hh_cont0); kptr += 32; } @@ -2394,11 +2386,7 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d __m256i _xixi0 = _mm256_shuffle_epi32(_xixi, _MM_SHUFFLE(0, 0, 0, 0)); -#if __AVXVNNI__ || __AVX512VNNI__ _lstm_IFOGx0 = _mm256_comp_dpwssd_epi32(_lstm_IFOGx0, _ww, _xixi0); -#else - _lstm_IFOGx0 = _mm256_add_epi32(_lstm_IFOGx0, _mm256_madd_epi16(_ww, _xixi0)); -#endif // __AVXVNNI__ || __AVX512VNNI__ kptr += 16; } @@ -2555,11 +2543,7 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d __m256i _hh_cont0 = _mm256_shuffle_epi32(_hh_cont, _MM_SHUFFLE(0, 0, 0, 0)); -#if __AVXVNNI__ || __AVX512VNNI__ _lstm_IFOGh0 = _mm256_comp_dpwssd_epi32(_lstm_IFOGh0, _ww, _hh_cont0); -#else - _lstm_IFOGh0 = _mm256_add_epi32(_lstm_IFOGh0, _mm256_madd_epi16(_ww, _hh_cont0)); -#endif // __AVXVNNI__ || __AVX512VNNI__ kptr += 16; } @@ -2712,21 +2696,10 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d _w3 = _mm_unpacklo_epi8(_w3, _mm_cmpgt_epi8(_mm_setzero_si128(), _w3)); #endif -#if __XOP__ - _sum0 = _mm_maddd_epi16(_w0, _xi, _sum0); 
diff --git a/src/layer/x86/lstm_int8.h b/src/layer/x86/lstm_int8.h
index a6655611e18..0358028cabc 100644
--- a/src/layer/x86/lstm_int8.h
+++ b/src/layer/x86/lstm_int8.h
@@ -2004,11 +2004,7 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d
 
                 __m512i _xixi0 = _mm512_shuffle_epi32(_xixi, _MM_PERM_AAAA);
 
-#if __AVX512VNNI__
-                _lstm_IFOGx0 = _mm512_dpwssd_epi32(_lstm_IFOGx0, _ww, _xixi0);
-#else
-                _lstm_IFOGx0 = _mm512_add_epi32(_lstm_IFOGx0, _mm512_madd_epi16(_ww, _xixi0));
-#endif // __AVX512VNNI__
+                _lstm_IFOGx0 = _mm512_comp_dpwssd_epi32(_lstm_IFOGx0, _ww, _xixi0);
 
                 kptr += 32;
             }
@@ -2191,11 +2187,7 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d
 
                 __m512i _hh_cont0 = _mm512_shuffle_epi32(_hh_cont, _MM_PERM_AAAA);
 
-#if __AVX512VNNI__
-                _lstm_IFOGh0 = _mm512_dpwssd_epi32(_lstm_IFOGh0, _ww, _hh_cont0);
-#else
-                _lstm_IFOGh0 = _mm512_add_epi32(_lstm_IFOGh0, _mm512_madd_epi16(_ww, _hh_cont0));
-#endif // __AVX512VNNI__
+                _lstm_IFOGh0 = _mm512_comp_dpwssd_epi32(_lstm_IFOGh0, _ww, _hh_cont0);
 
                 kptr += 32;
             }
@@ -2394,11 +2386,7 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d
 
                 __m256i _xixi0 = _mm256_shuffle_epi32(_xixi, _MM_SHUFFLE(0, 0, 0, 0));
 
-#if __AVXVNNI__ || __AVX512VNNI__
                 _lstm_IFOGx0 = _mm256_comp_dpwssd_epi32(_lstm_IFOGx0, _ww, _xixi0);
-#else
-                _lstm_IFOGx0 = _mm256_add_epi32(_lstm_IFOGx0, _mm256_madd_epi16(_ww, _xixi0));
-#endif // __AVXVNNI__ || __AVX512VNNI__
 
                 kptr += 16;
             }
@@ -2555,11 +2543,7 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d
 
                 __m256i _hh_cont0 = _mm256_shuffle_epi32(_hh_cont, _MM_SHUFFLE(0, 0, 0, 0));
 
-#if __AVXVNNI__ || __AVX512VNNI__
                 _lstm_IFOGh0 = _mm256_comp_dpwssd_epi32(_lstm_IFOGh0, _ww, _hh_cont0);
-#else
-                _lstm_IFOGh0 = _mm256_add_epi32(_lstm_IFOGh0, _mm256_madd_epi16(_ww, _hh_cont0));
-#endif // __AVXVNNI__ || __AVX512VNNI__
 
                 kptr += 16;
             }
@@ -2712,21 +2696,10 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d
                 _w3 = _mm_unpacklo_epi8(_w3, _mm_cmpgt_epi8(_mm_setzero_si128(), _w3));
 #endif
 
-#if __XOP__
-                _sum0 = _mm_maddd_epi16(_w0, _xi, _sum0);
-                _sum1 = _mm_maddd_epi16(_w1, _xi, _sum1);
-                _sum2 = _mm_maddd_epi16(_w2, _xi, _sum2);
-                _sum3 = _mm_maddd_epi16(_w3, _xi, _sum3);
-#else
-                __m128i _s0 = _mm_madd_epi16(_w0, _xi);
-                __m128i _s1 = _mm_madd_epi16(_w1, _xi);
-                __m128i _s2 = _mm_madd_epi16(_w2, _xi);
-                __m128i _s3 = _mm_madd_epi16(_w3, _xi);
-                _sum0 = _mm_add_epi32(_sum0, _s0);
-                _sum1 = _mm_add_epi32(_sum1, _s1);
-                _sum2 = _mm_add_epi32(_sum2, _s2);
-                _sum3 = _mm_add_epi32(_sum3, _s3);
-#endif
+                _sum0 = _mm_comp_dpwssd_epi32(_sum0, _w0, _xi);
+                _sum1 = _mm_comp_dpwssd_epi32(_sum1, _w1, _xi);
+                _sum2 = _mm_comp_dpwssd_epi32(_sum2, _w2, _xi);
+                _sum3 = _mm_comp_dpwssd_epi32(_sum3, _w3, _xi);
 
                 kptr += 32;
             }
@@ -2757,15 +2730,8 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d
                 _w1 = _mm_unpacklo_epi8(_w1, _mm_cmpgt_epi8(_mm_setzero_si128(), _w1));
 #endif
 
-#if __XOP__
-                _sum0 = _mm_maddd_epi16(_w0, _xi, _sum0);
-                _sum1 = _mm_maddd_epi16(_w1, _xi, _sum1);
-#else
-                __m128i _s0 = _mm_madd_epi16(_w0, _xi);
-                __m128i _s1 = _mm_madd_epi16(_w1, _xi);
-                _sum0 = _mm_add_epi32(_sum0, _s0);
-                _sum1 = _mm_add_epi32(_sum1, _s1);
-#endif
+                _sum0 = _mm_comp_dpwssd_epi32(_sum0, _w0, _xi);
+                _sum1 = _mm_comp_dpwssd_epi32(_sum1, _w1, _xi);
 
                 kptr += 16;
             }
@@ -2794,11 +2760,7 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d
                 _xi = _mm_unpacklo_epi8(_xi, _mm_cmpgt_epi8(_mm_setzero_si128(), _xi));
 #endif
 
-#if __XOP__
-                _lstm_IFOGx0 = _mm_maddd_epi16(_w, _xi, _lstm_IFOGx0);
-#else
-                _lstm_IFOGx0 = _mm_add_epi32(_lstm_IFOGx0, _mm_madd_epi16(_w, _xi));
-#endif
+                _lstm_IFOGx0 = _mm_comp_dpwssd_epi32(_lstm_IFOGx0, _w, _xi);
 
                 kptr += 8;
             }
@@ -2921,21 +2883,10 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d
                 _w3 = _mm_unpacklo_epi8(_w3, _mm_cmpgt_epi8(_mm_setzero_si128(), _w3));
 #endif
 
-#if __XOP__
-                _sum0 = _mm_maddd_epi16(_w0, _h_cont, _sum0);
-                _sum1 = _mm_maddd_epi16(_w1, _h_cont, _sum1);
-                _sum2 = _mm_maddd_epi16(_w2, _h_cont, _sum2);
-                _sum3 = _mm_maddd_epi16(_w3, _h_cont, _sum3);
-#else
-                __m128i _s0 = _mm_madd_epi16(_w0, _h_cont);
-                __m128i _s1 = _mm_madd_epi16(_w1, _h_cont);
-                __m128i _s2 = _mm_madd_epi16(_w2, _h_cont);
-                __m128i _s3 = _mm_madd_epi16(_w3, _h_cont);
-                _sum0 = _mm_add_epi32(_sum0, _s0);
-                _sum1 = _mm_add_epi32(_sum1, _s1);
-                _sum2 = _mm_add_epi32(_sum2, _s2);
-                _sum3 = _mm_add_epi32(_sum3, _s3);
-#endif
+                _sum0 = _mm_comp_dpwssd_epi32(_sum0, _w0, _h_cont);
+                _sum1 = _mm_comp_dpwssd_epi32(_sum1, _w1, _h_cont);
+                _sum2 = _mm_comp_dpwssd_epi32(_sum2, _w2, _h_cont);
+                _sum3 = _mm_comp_dpwssd_epi32(_sum3, _w3, _h_cont);
 
                 kptr += 32;
             }
@@ -2966,15 +2917,8 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d
                 _w1 = _mm_unpacklo_epi8(_w1, _mm_cmpgt_epi8(_mm_setzero_si128(), _w1));
 #endif
 
-#if __XOP__
-                _sum0 = _mm_maddd_epi16(_w0, _h_cont, _sum0);
-                _sum1 = _mm_maddd_epi16(_w1, _h_cont, _sum1);
-#else
-                __m128i _s0 = _mm_madd_epi16(_w0, _h_cont);
-                __m128i _s1 = _mm_madd_epi16(_w1, _h_cont);
-                _sum0 = _mm_add_epi32(_sum0, _s0);
-                _sum1 = _mm_add_epi32(_sum1, _s1);
-#endif
+                _sum0 = _mm_comp_dpwssd_epi32(_sum0, _w0, _h_cont);
+                _sum1 = _mm_comp_dpwssd_epi32(_sum1, _w1, _h_cont);
 
                 kptr += 16;
             }
@@ -3003,11 +2947,7 @@ static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_d
                 _h_cont = _mm_unpacklo_epi8(_h_cont, _mm_cmpgt_epi8(_mm_setzero_si128(), _h_cont));
 #endif
 
-#if __XOP__
-                _lstm_IFOGh0 = _mm_maddd_epi16(_w, _h_cont, _lstm_IFOGh0);
-#else
-                _lstm_IFOGh0 = _mm_add_epi32(_lstm_IFOGh0, _mm_madd_epi16(_w, _h_cont));
-#endif
+                _lstm_IFOGh0 = _mm_comp_dpwssd_epi32(_lstm_IFOGh0, _w, _h_cont);
 
                 kptr += 8;
             }
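One detail visible in the unchanged context lines of the lstm_int8.h hunks: on paths without SSE4.1's _mm_cvtepi8_epi16, the kernels widen int8 weights to int16 with the unpacklo/cmpgt idiom before the dot-product accumulation. A self-contained SSE2 sketch of that idiom (the function name is illustrative, not from the patch):

#include <emmintrin.h> // SSE2

/* Sign-extend the low eight int8 lanes of v to int16.
   _mm_cmpgt_epi8(0, v) yields 0xff for negative bytes and 0x00 otherwise,
   which is exactly the high byte of each sign-extended 16-bit lane. */
static __m128i sign_extend_epi8_epi16(__m128i v)
{
    return _mm_unpacklo_epi8(v, _mm_cmpgt_epi8(_mm_setzero_si128(), v));
}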
diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h
index 4dbef6b089b..10673884266 100644
--- a/src/layer/x86/x86_usability.h
+++ b/src/layer/x86/x86_usability.h
@@ -304,6 +304,19 @@ static NCNN_FORCEINLINE __m128 _mm_comp_fnmsub_ps(const __m128& _a, const __m128
 #endif
 }
 
+static NCNN_FORCEINLINE __m128i _mm_comp_dpwssd_epi32(__m128i src, __m128i a, __m128i b)
+{
+#if __AVX512VNNI__
+    return _mm_dpwssd_epi32(src, a, b);
+#elif __AVXVNNI__
+    return _mm_dpwssd_avx_epi32(src, a, b);
+#elif __XOP__
+    return _mm_maddd_epi16(a, b, src);
+#else
+    return _mm_add_epi32(src, _mm_madd_epi16(a, b));
+#endif
+}
+
 #if __AVX__
 static NCNN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(const __m256& _a, const __m256& _b, const __m256& _c)
 {
@@ -841,6 +854,17 @@ static NCNN_FORCEINLINE __m256i float2bfloat_avx(const __m256& v0, const __m256&
 }
 
 #if __AVX2__
+static NCNN_FORCEINLINE __m256i _mm256_comp_dpwssd_epi32(__m256i src, __m256i a, __m256i b)
+{
+#if __AVX512VNNI__
+    return _mm256_dpwssd_epi32(src, a, b);
+#elif __AVXVNNI__
+    return _mm256_dpwssd_avx_epi32(src, a, b);
+#else
+    return _mm256_add_epi32(src, _mm256_madd_epi16(a, b));
+#endif
+}
+
 #if __AVX512VNNI__ || __AVXVNNI__
 static NCNN_FORCEINLINE __m128i _mm_comp_dpbusd_epi32(__m128i src, __m128i a, __m128i b)
 {
@@ -859,24 +883,6 @@ static NCNN_FORCEINLINE __m256i _mm256_comp_dpbusd_epi32(__m256i src, __m256i a,
     return _mm256_dpbusd_avx_epi32(src, a, b);
 #endif
 }
-
-static NCNN_FORCEINLINE __m128i _mm_comp_dpwssd_epi32(__m128i src, __m128i a, __m128i b)
-{
-#if __AVX512VNNI__
-    return _mm_dpwssd_epi32(src, a, b);
-#else
-    return _mm_dpwssd_avx_epi32(src, a, b);
-#endif
-}
-
-static NCNN_FORCEINLINE __m256i _mm256_comp_dpwssd_epi32(__m256i src, __m256i a, __m256i b)
-{
-#if __AVX512VNNI__
-    return _mm256_dpwssd_epi32(src, a, b);
-#else
-    return _mm256_dpwssd_avx_epi32(src, a, b);
-#endif
-}
 #endif // __AVX512VNNI__ || __AVXVNNI__
 
 static NCNN_FORCEINLINE void transpose8x2_epi32(__m256i& _r0, __m256i& _r1)
@@ -928,6 +934,15 @@ static NCNN_FORCEINLINE void transpose16x8_epi16(__m256i& _r0, __m256i& _r1, __m
 }
 
 #if __AVX512F__
+static NCNN_FORCEINLINE __m512i _mm512_comp_dpwssd_epi32(__m512i src, __m512i a, __m512i b)
+{
+#if __AVX512VNNI__
+    return _mm512_dpwssd_epi32(src, a, b);
+#else
+    return _mm512_add_epi32(src, _mm512_madd_epi16(a, b));
+#endif
+}
+
 static NCNN_FORCEINLINE void transpose16x16_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3, __m512& _r4, __m512& _r5, __m512& _r6, __m512& _r7, __m512& _r8, __m512& _r9, __m512& _ra, __m512& _rb, __m512& _rc, __m512& _rd, __m512& _re, __m512& _rf)
 {
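For reference, all three comp_dpwssd helpers compute the same per-lane result at their respective widths (4, 8, or 16 lanes); only the instruction selection differs. A scalar sketch of one 32-bit lane (not part of the patch, the function name is illustrative):

#include <stdint.h>

/* lane = src + a[0]*b[0] + a[1]*b[1], with the int16 products widened to
   int32 before the additions; this is what vpdpwssd fuses into one
   instruction and what pmaddwd + paddd compute in two. */
static int32_t dpwssd_lane(int32_t src, const int16_t a[2], const int16_t b[2])
{
    return src + (int32_t)a[0] * (int32_t)b[0] + (int32_t)a[1] * (int32_t)b[1];
}

Note also that XOP's _mm_maddd_epi16 takes the accumulator as its last operand, which is why the __XOP__ branch of _mm_comp_dpwssd_epi32 passes (a, b, src) rather than (src, a, b).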