Merge branch 'master' into x86-conv-gemm-0
nihui authored Feb 7, 2025
2 parents b4e99e1 + 4a70be4 commit 3d682fa
Showing 8 changed files with 400 additions and 125 deletions.
17 changes: 12 additions & 5 deletions src/layer/arm/convolution_arm.cpp
@@ -1376,15 +1376,22 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con
 #if __ARM_NEON
     if (opt.use_packing_layout)
     {
-#if NCNN_ARM82
-        if (ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic)
+        if (use_int8_requantize)
         {
-            out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+            out_elempack_int32 = num_output % 8 == 0 ? 8 : 1;
         }
         else
-#endif // NCNN_ARM82
         {
-            out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
+#if NCNN_ARM82
+            if (ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic)
+            {
+                out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+            }
+            else
+#endif // NCNN_ARM82
+            {
+                out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
+            }
         }
     }
 #endif // __ARM_NEON
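
Taken together, the hunk above gives the int8-requantize path its own packing rule: the int32 accumulator blob is packed to 8 whenever num_output allows it, and the fp16-aware selection now applies only to the non-requantize path. A condensed sketch of the rule (a hypothetical free function, not ncnn API; has_arm82_fp16 stands for ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic):

    // Sketch of the NEON out_elempack_int32 selection above (hypothetical helper).
    static int pick_out_elempack_int32(int num_output, bool use_int8_requantize, bool has_arm82_fp16)
    {
        if (use_int8_requantize)
            return num_output % 8 == 0 ? 8 : 1; // feed the pack8 requantize path
        if (has_arm82_fp16)
            return num_output % 8 == 0 ? 8 : (num_output % 4 == 0 ? 4 : 1);
        return num_output % 4 == 0 ? 4 : 1;
    }

For example, num_output = 12 now yields 1 on the requantize path but 4 on both dequantize paths.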
8 changes: 4 additions & 4 deletions src/layer/arm/requantize_arm.cpp
@@ -120,8 +120,8 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
     for (; i < size; i++)
     {
         float v = *intptr * scale;
-        if (v < 0) v = 0;
         *ptr = float2int8(v);
+        if (*ptr < 0) *ptr = 0;
         intptr++;
         ptr++;
     }
@@ -190,8 +190,8 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
     for (; i < size; i++)
     {
         float v = *intptr * scale + bias;
-        if (v < 0) v = 0;
         *ptr = float2int8(v);
+        if (*ptr < 0) *ptr = 0;
         intptr++;
         ptr++;
     }
@@ -288,8 +288,8 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
     for (; i < size; i++)
     {
         float v = *intptr * scale;
-        if (v < 0) v *= slope;
         *ptr = float2int8(v);
+        if (*ptr < 0) *ptr *= slope;
         intptr++;
         ptr++;
     }
@@ -358,8 +358,8 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
     for (; i < size; i++)
     {
         float v = *intptr * scale + bias;
-        if (v < 0) v *= slope;
         *ptr = float2int8(v);
+        if (*ptr < 0) *ptr *= slope;
         intptr++;
         ptr++;
     }
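
All four hunks above make the same change to the scalar tail loops: the activation clamp moves after quantization, operating on the stored int8 value rather than the pre-rounding float (the identical edit is applied to the loongarch and mips ports below). A minimal standalone sketch of the new ReLU tail (this float2int8 is a stand-in that only approximates ncnn's rounding/saturating helper):

    #include <math.h>

    static signed char float2int8(float v) // stand-in: round to nearest, saturate
    {
        int i = (int)roundf(v);
        return (signed char)(i > 127 ? 127 : (i < -128 ? -128 : i));
    }

    static void requantize_relu_tail(const int* intptr, signed char* ptr, float scale, int size)
    {
        for (int i = 0; i < size; i++)
        {
            float v = *intptr * scale;
            *ptr = float2int8(v);   // quantize first ...
            if (*ptr < 0) *ptr = 0; // ... then clamp the int8 result (was: if (v < 0) v = 0;)
            intptr++;
            ptr++;
        }
    }

In the leaky variant, *ptr *= slope multiplies the stored int8 value by a float slope and converts the product back to signed char, which truncates toward zero.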
8 changes: 4 additions & 4 deletions src/layer/loongarch/requantize_loongarch.cpp
@@ -120,8 +120,8 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
     for (; i < size; i++)
     {
         float v = *intptr * scale;
-        if (v < 0) v = 0;
         *ptr = float2int8(v);
+        if (*ptr < 0) *ptr = 0;
         intptr++;
         ptr++;
     }
@@ -182,8 +182,8 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
     for (; i < size; i++)
     {
         float v = *intptr * scale + bias;
-        if (v < 0) v = 0;
         *ptr = float2int8(v);
+        if (*ptr < 0) *ptr = 0;
         intptr++;
         ptr++;
     }
@@ -281,8 +281,8 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
     for (; i < size; i++)
     {
         float v = *intptr * scale;
-        if (v < 0) v *= slope;
         *ptr = float2int8(v);
+        if (*ptr < 0) *ptr *= slope;
         intptr++;
         ptr++;
     }
@@ -343,8 +343,8 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
     for (; i < size; i++)
     {
         float v = *intptr * scale + bias;
-        if (v < 0) v *= slope;
         *ptr = float2int8(v);
+        if (*ptr < 0) *ptr *= slope;
         intptr++;
         ptr++;
     }
8 changes: 4 additions & 4 deletions src/layer/mips/requantize_mips.cpp
@@ -120,8 +120,8 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
     for (; i < size; i++)
     {
         float v = *intptr * scale;
-        if (v < 0) v = 0;
         *ptr = float2int8(v);
+        if (*ptr < 0) *ptr = 0;
         intptr++;
         ptr++;
     }
@@ -182,8 +182,8 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
     for (; i < size; i++)
     {
         float v = *intptr * scale + bias;
-        if (v < 0) v = 0;
         *ptr = float2int8(v);
+        if (*ptr < 0) *ptr = 0;
         intptr++;
         ptr++;
     }
@@ -281,8 +281,8 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
     for (; i < size; i++)
     {
         float v = *intptr * scale;
-        if (v < 0) v *= slope;
         *ptr = float2int8(v);
+        if (*ptr < 0) *ptr *= slope;
         intptr++;
         ptr++;
     }
@@ -343,8 +343,8 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
     for (; i < size; i++)
     {
         float v = *intptr * scale + bias;
-        if (v < 0) v *= slope;
         *ptr = float2int8(v);
+        if (*ptr < 0) *ptr *= slope;
         intptr++;
         ptr++;
     }
4 changes: 4 additions & 0 deletions src/layer/x86/convolution_x86.cpp
@@ -995,7 +995,11 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
     {
         if (use_int8_requantize)
         {
+#if __AVX__
             out_elempack_int32 = num_output % 8 == 0 ? 8 : 1;
+#else
+            out_elempack_int32 = num_output % 8 == 0 ? 4 : 1;
+#endif
         }
         else
         {
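
The effect: on SSE2-only builds the int32 blob now stays at pack4 even when num_output is a multiple of 8, because the reworked Requantize_x86 below fuses the 4-to-8 repack into the requantize kernel; only AVX builds pick pack8 directly. Condensed as a sketch (hypothetical helper, covering only the requantize branch of this hunk):

    // Sketch of the x86 int8-requantize packing rule above (hypothetical helper).
    static int pick_out_elempack_int32_x86(int num_output, bool has_avx)
    {
        if (has_avx)
            return num_output % 8 == 0 ? 8 : 1;
        // SSE2-only: keep pack4; Requantize_x86 repacks 4 -> 8 itself
        return num_output % 8 == 0 ? 4 : 1;
    }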
191 changes: 171 additions & 20 deletions src/layer/x86/requantize_x86.cpp
@@ -330,18 +330,103 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_
     }
 }
 
+#if __SSE2__
+#if !__AVX__
+static void requantize_pack4to8(const int* intptr0, const int* intptr1, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount)
+{
+    const int scale_in_data_size = scale_in_data.w;
+    const int bias_data_size = bias_data.w;
+    const int scale_out_data_size = scale_out_data.w;
+
+    // NCNN_LOGE("requantize_pack4to8 %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount);
+
+    __m128 _scale_in0 = _mm_set1_ps(scale_in_data[0]);
+    __m128 _scale_in1 = _scale_in0;
+    if (scale_in_data_size > 1)
+    {
+        _scale_in0 = _mm_loadu_ps((const float*)scale_in_data);
+        _scale_in1 = _mm_loadu_ps((const float*)scale_in_data + 4);
+    }
+
+    __m128 _scale_out0 = _mm_set1_ps(scale_out_data[0]);
+    __m128 _scale_out1 = _scale_out0;
+    if (scale_out_data_size > 1)
+    {
+        _scale_out0 = _mm_loadu_ps((const float*)scale_out_data);
+        _scale_out1 = _mm_loadu_ps((const float*)scale_out_data + 4);
+    }
+
+    if (bias_data_size == 0)
+    {
+        int i = 0;
+        for (; i < elemcount; i++)
+        {
+            __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
+            __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
+            _v0 = _mm_mul_ps(_v0, _scale_in0);
+            _v1 = _mm_mul_ps(_v1, _scale_in1);
+            _v0 = activation_sse(_v0, activation_type, activation_params);
+            _v1 = activation_sse(_v1, activation_type, activation_params);
+            _v0 = _mm_mul_ps(_v0, _scale_out0);
+            _v1 = _mm_mul_ps(_v1, _scale_out1);
+            *(int64_t*)ptr = float2int8_sse(_v0, _v1);
+            intptr0 += 4;
+            intptr1 += 4;
+            ptr += 8;
+        }
+    }
+    else
+    {
+        __m128 _bias0 = _mm_set1_ps(bias_data[0]);
+        __m128 _bias1 = _bias0;
+        if (bias_data_size > 1)
+        {
+            _bias0 = _mm_loadu_ps((const float*)bias_data);
+            _bias1 = _mm_loadu_ps((const float*)bias_data + 4);
+        }
+
+        int i = 0;
+        for (; i < elemcount; i++)
+        {
+            __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
+            __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
+            _v0 = _mm_comp_fmadd_ps(_v0, _scale_in0, _bias0);
+            _v1 = _mm_comp_fmadd_ps(_v1, _scale_in1, _bias1);
+            _v0 = activation_sse(_v0, activation_type, activation_params);
+            _v1 = activation_sse(_v1, activation_type, activation_params);
+            _v0 = _mm_mul_ps(_v0, _scale_out0);
+            _v1 = _mm_mul_ps(_v1, _scale_out1);
+            *(int64_t*)ptr = float2int8_sse(_v0, _v1);
+            intptr0 += 4;
+            intptr1 += 4;
+            ptr += 8;
+        }
+    }
+}
+#endif // !__AVX__
+#endif // __SSE2__
+
 int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     const int dims = bottom_blob.dims;
     const int w = bottom_blob.w;
     const int h = bottom_blob.h;
     const int channels = bottom_blob.c;
     const int elempack = bottom_blob.elempack;
-    const size_t out_elemsize = elempack * 1u;
 
     if (dims == 1)
     {
-        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+#if __SSE2__
+        if (opt.use_packing_layout)
+        {
+            out_elempack = w * elempack % 8 == 0 ? 8 : 1;
+        }
+#endif
+        const int outw = w * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
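The new requantize_pack4to8 above consumes two pack4 int32 streams and emits one pack8 int8 stream per iteration: each 4-lane half is converted to float, scaled in (with optional bias via _mm_comp_fmadd_ps), activated, scaled out, and the two halves are quantized into 8 bytes at once by float2int8_sse. A scalar model of the same dataflow (a sketch only: the activation is shown as identity and quantize_to_int8 merely approximates the SSE helper's rounding):

    #include <math.h>

    static signed char quantize_to_int8(float v) // stand-in for float2int8_sse lanes
    {
        int i = (int)roundf(v);
        return (signed char)(i > 127 ? 127 : (i < -128 ? -128 : i));
    }

    // scale_in/scale_out hold 8 per-channel values; bias may be NULL
    static void requantize_pack4to8_scalar(const int* intptr0, const int* intptr1,
                                           signed char* ptr, const float* scale_in,
                                           const float* bias, const float* scale_out,
                                           int elemcount)
    {
        for (int i = 0; i < elemcount; i++)
        {
            for (int k = 0; k < 4; k++)
            {
                float v0 = intptr0[k] * scale_in[k] + (bias ? bias[k] : 0.f);
                float v1 = intptr1[k] * scale_in[k + 4] + (bias ? bias[k + 4] : 0.f);
                ptr[k] = quantize_to_int8(v0 * scale_out[k]);         // low pack4 half
                ptr[k + 4] = quantize_to_int8(v1 * scale_out[k + 4]); // high pack4 half
            }
            intptr0 += 4;
            intptr1 += 4;
            ptr += 8;
        }
    }

In the reworked dims == 1 branch, packing is decided from the flattened element count: for example, w = 32 with elempack = 4 gives w * elempack = 128, so out_elempack = 8 and outw = 16.
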
@@ -368,41 +453,107 @@ int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
 
     if (dims == 2)
     {
-        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+#if __SSE2__
+        if (opt.use_packing_layout)
+        {
+            out_elempack = h * elempack % 8 == 0 ? 8 : 1;
+        }
+#endif
+        const int outh = h * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int i = 0; i < h; i++)
+#if __SSE2__
+#if !__AVX__
+        if (elempack == 4 && out_elempack == 8)
         {
-            const int* intptr = bottom_blob.row<const int>(i);
-            signed char* ptr = top_blob.row<signed char>(i);
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < outh; i++)
+            {
+                const int* intptr0 = bottom_blob.row<const int>(i * 2);
+                const int* intptr1 = bottom_blob.row<const int>(i * 2 + 1);
+                signed char* ptr = top_blob.row<signed char>(i);
 
-            const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data;
-            const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
-            const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data;
+                const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * out_elempack, out_elempack) : scale_in_data;
+                const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * out_elempack, out_elempack) : bias_data;
+                const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * out_elempack, out_elempack) : scale_out_data;
 
-            requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack);
+                requantize_pack4to8(intptr0, intptr1, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w);
+            }
+        }
+#endif // !__AVX__
+#endif // __SSE2__
+        if (elempack == out_elempack)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                const int* intptr = bottom_blob.row<const int>(i);
+                signed char* ptr = top_blob.row<signed char>(i);
+
+                const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data;
+                const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
+                const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data;
+
+                requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack);
+            }
         }
     }
 
     if (dims == 3)
     {
-        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+#if __SSE2__
+        if (opt.use_packing_layout)
+        {
+            out_elempack = channels * elempack % 8 == 0 ? 8 : 1;
+        }
+#endif
+        const int outc = channels * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < channels; q++)
+#if __SSE2__
+#if !__AVX__
+        if (elempack == 4 && out_elempack == 8)
         {
-            const int* intptr = bottom_blob.channel(q);
-            signed char* ptr = top_blob.channel(q);
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < outc; q++)
+            {
+                const int* intptr0 = bottom_blob.channel(q * 2);
+                const int* intptr1 = bottom_blob.channel(q * 2 + 1);
+                signed char* ptr = top_blob.channel(q);
 
-            const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data;
-            const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
-            const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data;
+                const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * out_elempack, out_elempack) : scale_in_data;
+                const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * out_elempack, out_elempack) : bias_data;
+                const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * out_elempack, out_elempack) : scale_out_data;
 
-            requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack);
+                requantize_pack4to8(intptr0, intptr1, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h);
+            }
+        }
+#endif // !__AVX__
+#endif // __SSE2__
+        if (elempack == out_elempack)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const int* intptr = bottom_blob.channel(q);
+                signed char* ptr = top_blob.channel(q);
+
+                const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data;
+                const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
+                const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data;
+
+                requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack);
+            }
         }
     }
 
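The dims == 2 and dims == 3 dispatches pair consecutive pack4 rows or channels: output row (or channel) i is assembled from inputs 2i and 2i + 1, and the per-output scale/bias windows advance by out_elempack. A worked example of the index arithmetic under assumed sizes (SSE2 without AVX, use_packing_layout enabled):

    #include <cstdio>

    int main()
    {
        const int h = 8, elempack = 4;                            // 8 pack4 input rows
        const int out_elempack = (h * elempack % 8 == 0) ? 8 : 1; // -> 8
        const int outh = h * elempack / out_elempack;             // -> 4 pack8 output rows
        for (int i = 0; i < outh; i++)
        {
            // output row i interleaves input rows 2i (low half) and 2i+1 (high half);
            // its scales come from scale_in_data.range(i * out_elempack, out_elempack)
            printf("out row %d <- in rows %d and %d, scale window [%d, %d)\n",
                   i, i * 2, i * 2 + 1, i * out_elempack, (i + 1) * out_elempack);
        }
        return 0;
    }
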
