diff --git a/src/layer/x86/convolution_im2col_gemm_int8.h b/src/layer/x86/convolution_im2col_gemm_int8.h
index e3196f49ffa..cc8e22cd9e7 100644
--- a/src/layer/x86/convolution_im2col_gemm_int8.h
+++ b/src/layer/x86/convolution_im2col_gemm_int8.h
@@ -76,7 +76,7 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M
 #endif
 }
 
-static void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
+static NCNN_FORCEINLINE void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
 {
     // resolve optimal tile size from cache size
     const size_t l2_cache_size_int8 = (int)(get_cpu_level2_cache_size() / sizeof(signed char));
@@ -205,11 +205,13 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int
     }
 }
 
-static void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
+static NCNN_FORCEINLINE void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
 {
     const int elempack = bottom_blob.elempack;
     const int cstep = (int)bottom_blob.cstep;
 
+    // NCNN_LOGE("convolution_im2col_input_tile_conv1x1s1d1_int8  %d %d %d %d  @%d", j, max_jj, k, max_kk, elempack);
+
     signed char* pp = B;
 
     int jj = 0;
@@ -820,7 +822,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
                         __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
                         __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
                         __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
-                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
+                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
                         _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
@@ -1031,7 +1033,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
                         __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
                         __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
                         __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
-                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
+                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
                         _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
@@ -1316,7 +1318,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
                         __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
                         __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
                         __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
-                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
+                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
                         _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
@@ -1523,7 +1525,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
                         __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
                         __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
                         __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
-                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
+                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
                         _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
@@ -1835,7 +1837,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
                         __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
                         __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
                         __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
-                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
+                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
                         _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
@@ -1974,7 +1976,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
                         __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
                         __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
                         __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
-                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
+                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
                         _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
@@ -2197,7 +2199,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
                         __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
                         __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
                         __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
-                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
+                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
                         _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
@@ -2321,7 +2323,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
                         __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
                         __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
                         __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
-                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
+                        _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
                         _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
                         _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
@@ -2481,7 +2483,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
                     __m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
                     __m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
                     __m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
-                    _p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
+                    _p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
                     _u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
                     _v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
                     _u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h
index c25facf79f5..778934557f7 100644
--- a/src/layer/x86/x86_usability.h
+++ b/src/layer/x86/x86_usability.h
@@ -42,7 +42,7 @@
 class FastDivider_epu32
 {
 public:
-    FastDivider_epu32(unsigned int d)
+    NCNN_FORCEINLINE FastDivider_epu32(unsigned int d)
     {
         unsigned int m, sh1, sh2;
         if (d == 1)
@@ -54,13 +54,7 @@ class FastDivider_epu32
         else
         {
             // sh = ceil(log2(d))
-#ifdef _MSC_VER
-            unsigned long index;
-            _BitScanReverse(&index, d - 1);
-            uint32_t sh = index + 1;
-#else
-            uint32_t sh = 32 - __builtin_clz(d - 1);
-#endif
+            uint32_t sh = portable_ceil_log2(d);
             uint32_t m0 = sh == 32 ? 0 : 1 << sh;
 
             m = 1 + uint32_t((uint64_t(m0 - d) << 32) / d);
@@ -81,7 +75,7 @@ class FastDivider_epu32
 
 #if __AVX2__
 #if __AVX512F__
-    __m512i _mm512_comp_div_epu32(__m512i x) const
+    NCNN_FORCEINLINE __m512i _mm512_comp_div_epu32(__m512i x) const
     {
         // xm = (x * multiplier) >> 32
         __m512i xm_low = _mm512_srli_epi64(_mm512_mul_epu32(x, _multiplier), 32);
@@ -93,13 +87,13 @@ class FastDivider_epu32
     }
 #endif // __AVX512F__
 
-    __m256i _mm256_comp_div_epu32(__m256i x) const
+    NCNN_FORCEINLINE __m256i _mm256_comp_div_epu32(__m256i x) const
     {
         // xm = (x * multiplier) >> 32
 #if __AVX512F__
         __m256i xm_low = _mm256_srli_epi64(_mm256_mul_epu32(x, _mm512_castsi512_si256(_multiplier)), 32);
         __m256i xm_high = _mm256_mul_epu32(_mm256_srli_epi64(x, 32), _mm512_castsi512_si256(_multiplier));
-#elif __AVX2__
+#else
         __m256i xm_low = _mm256_srli_epi64(_mm256_mul_epu32(x, _multiplier), 32);
         __m256i xm_high = _mm256_mul_epu32(_mm256_srli_epi64(x, 32), _multiplier);
 #endif
@@ -109,7 +103,7 @@ class FastDivider_epu32
     }
 #endif // __AVX2__
 
-    __m128i _mm_comp_div_epu32(__m128i x) const
+    NCNN_FORCEINLINE __m128i _mm_comp_div_epu32(__m128i x) const
     {
         // xm = (x * multiplier) >> 32
 #if __AVX512F__