diff --git a/src/layer/x86/convolution_im2col_gemm_int8.h b/src/layer/x86/convolution_im2col_gemm_int8.h index e3196f49ffa..cc8e22cd9e7 100644 --- a/src/layer/x86/convolution_im2col_gemm_int8.h +++ b/src/layer/x86/convolution_im2col_gemm_int8.h @@ -76,7 +76,7 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M #endif } -static void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) +static NCNN_FORCEINLINE void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) { // resolve optimal tile size from cache size const size_t l2_cache_size_int8 = (int)(get_cpu_level2_cache_size() / sizeof(signed char)); @@ -205,11 +205,13 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int } } -static void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk) +static NCNN_FORCEINLINE void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk) { const int elempack = bottom_blob.elempack; const int cstep = (int)bottom_blob.cstep; + // NCNN_LOGE("convolution_im2col_input_tile_conv1x1s1d1_int8 %d %d %d %d @%d", j, max_jj, k, max_kk, elempack); + signed char* pp = B; int jj = 0; @@ -820,7 +822,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat& __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk))); __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv); __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w))); - _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep)); + _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h)); _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w)); @@ -1031,7 +1033,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat& __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk))); __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv); __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w))); - _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep)); + _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h)); _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w)); @@ -1316,7 +1318,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat& __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk))); __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv); __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w))); - _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep)); + _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h)); _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w)); @@ -1523,7 +1525,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat& __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk))); __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv); __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w))); - _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep)); + _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h)); _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w)); @@ -1835,7 +1837,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat& __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk))); __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv); __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w))); - _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep)); + _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h)); _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w)); @@ -1974,7 +1976,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat& __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk))); __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv); __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w))); - _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep)); + _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h)); _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w)); @@ -2197,7 +2199,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat& __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk))); __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv); __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w))); - _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep)); + _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h)); _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w)); @@ -2321,7 +2323,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat& __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk))); __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv); __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w))); - _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep)); + _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h)); _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w)); @@ -2481,7 +2483,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat& __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk))); __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv); __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w))); - _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep)); + _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h)); _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w)); _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w)); diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index c25facf79f5..778934557f7 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -42,7 +42,7 @@ class FastDivider_epu32 { public: - FastDivider_epu32(unsigned int d) + NCNN_FORCEINLINE FastDivider_epu32(unsigned int d) { unsigned int m, sh1, sh2; if (d == 1) @@ -54,13 +54,7 @@ class FastDivider_epu32 else { // sh = ceil(log2(d)) -#ifdef _MSC_VER - unsigned long index; - _BitScanReverse(&index, d - 1); - uint32_t sh = index + 1; -#else - uint32_t sh = 32 - __builtin_clz(d - 1); -#endif + uint32_t sh = portable_ceil_log2(d); uint32_t m0 = sh == 32 ? 0 : 1 << sh; m = 1 + uint32_t((uint64_t(m0 - d) << 32) / d); @@ -81,7 +75,7 @@ class FastDivider_epu32 #if __AVX2__ #if __AVX512F__ - __m512i _mm512_comp_div_epu32(__m512i x) const + NCNN_FORCEINLINE __m512i _mm512_comp_div_epu32(__m512i x) const { // xm = (x * multiplier) >> 32 __m512i xm_low = _mm512_srli_epi64(_mm512_mul_epu32(x, _multiplier), 32); @@ -93,13 +87,13 @@ class FastDivider_epu32 } #endif // __AVX512F__ - __m256i _mm256_comp_div_epu32(__m256i x) const + NCNN_FORCEINLINE __m256i _mm256_comp_div_epu32(__m256i x) const { // xm = (x * multiplier) >> 32 #if __AVX512F__ __m256i xm_low = _mm256_srli_epi64(_mm256_mul_epu32(x, _mm512_castsi512_si256(_multiplier)), 32); __m256i xm_high = _mm256_mul_epu32(_mm256_srli_epi64(x, 32), _mm512_castsi512_si256(_multiplier)); -#elif __AVX2__ +#else __m256i xm_low = _mm256_srli_epi64(_mm256_mul_epu32(x, _multiplier), 32); __m256i xm_high = _mm256_mul_epu32(_mm256_srli_epi64(x, 32), _multiplier); #endif @@ -109,7 +103,7 @@ class FastDivider_epu32 } #endif // __AVX2__ - __m128i _mm_comp_div_epu32(__m128i x) const + NCNN_FORCEINLINE __m128i _mm_comp_div_epu32(__m128i x) const { // xm = (x * multiplier) >> 32 #if __AVX512F__