diff --git a/src/layer/arm/requantize_arm.cpp b/src/layer/arm/requantize_arm.cpp index d5fe92428b3..a1111a77d6b 100644 --- a/src/layer/arm/requantize_arm.cpp +++ b/src/layer/arm/requantize_arm.cpp @@ -540,6 +540,511 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_ } } +#if __ARM_NEON +static void requantize_relu_pack4to8(const int* intptr0, const int* intptr1, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int elemcount) +{ + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + + // NCNN_LOGE("requantize_relu_pack4to8 %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount); + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + float32x4_t _scale_in0 = vdupq_n_f32(scale_in_data[0]); + float32x4_t _scale_in1 = _scale_in0; + if (scale_in_data_size > 1) + { + _scale_in0 = vld1q_f32((const float*)scale_in_data); + _scale_in1 = vld1q_f32((const float*)scale_in_data + 4); + } + + float32x4_t _scale_out0 = vdupq_n_f32(scale_out_data[0]); + float32x4_t _scale_out1 = _scale_out0; + if (scale_out_data_size > 1) + { + _scale_out0 = vld1q_f32((const float*)scale_out_data); + _scale_out1 = vld1q_f32((const float*)scale_out_data + 4); + } + + float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0); + float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1); + + if (bias_data_size == 0) + { + int i = 0; + for (; i < elemcount; i++) + { + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); + _v0 = vmulq_f32(_v0, _scale0); + _v1 = vmulq_f32(_v1, _scale1); + vst1_s8(ptr, float2int8relu(_v0, _v1)); + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + else + { + float32x4_t _bias0 = vdupq_n_f32(bias_data[0]); + float32x4_t _bias1 = _bias0; + if (bias_data_size > 1) + { + _bias0 = vld1q_f32((const float*)bias_data); + _bias1 = vld1q_f32((const float*)bias_data + 4); + } + + _bias0 = vmulq_f32(_bias0, _scale_out0); + _bias1 = vmulq_f32(_bias1, _scale_out1); + + int i = 0; + for (; i < elemcount; i++) + { + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); +#if __aarch64__ + _v0 = vfmaq_f32(_bias0, _v0, _scale0); + _v1 = vfmaq_f32(_bias1, _v1, _scale1); +#else // __aarch64__ + _v0 = vmlaq_f32(_bias0, _v0, _scale0); + _v1 = vmlaq_f32(_bias1, _v1, _scale1); +#endif // __aarch64__ + vst1_s8(ptr, float2int8relu(_v0, _v1)); + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } +} + +static void requantize_leakyrelu_pack4to8(const int* intptr0, const int* intptr1, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, float slope, int elemcount) +{ + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + + // NCNN_LOGE("requantize_leakyrelu_pack4to8 %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount); + + // int8(leakyrelu(v * scale_in, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out), slope) + + // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) + + float32x4_t _scale_in0 = vdupq_n_f32(scale_in_data[0]); + 
float32x4_t _scale_in1 = _scale_in0; + if (scale_in_data_size > 1) + { + _scale_in0 = vld1q_f32((const float*)scale_in_data); + _scale_in1 = vld1q_f32((const float*)scale_in_data + 4); + } + + float32x4_t _scale_out0 = vdupq_n_f32(scale_out_data[0]); + float32x4_t _scale_out1 = _scale_out0; + if (scale_out_data_size > 1) + { + _scale_out0 = vld1q_f32((const float*)scale_out_data); + _scale_out1 = vld1q_f32((const float*)scale_out_data + 4); + } + + float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0); + float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1); + + float32x4_t _slope = vdupq_n_f32(slope); + + if (bias_data_size == 0) + { + int i = 0; + for (; i < elemcount; i++) + { + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); + _v0 = vmulq_f32(_v0, _scale0); + _v1 = vmulq_f32(_v1, _scale1); + vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope)); + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + else + { + float32x4_t _bias0 = vdupq_n_f32(bias_data[0]); + float32x4_t _bias1 = _bias0; + if (bias_data_size > 1) + { + _bias0 = vld1q_f32((const float*)bias_data); + _bias1 = vld1q_f32((const float*)bias_data + 4); + } + + _bias0 = vmulq_f32(_bias0, _scale_out0); + _bias1 = vmulq_f32(_bias1, _scale_out1); + + int i = 0; + for (; i < elemcount; i++) + { + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); +#if __aarch64__ + _v0 = vfmaq_f32(_bias0, _v0, _scale0); + _v1 = vfmaq_f32(_bias1, _v1, _scale1); +#else // __aarch64__ + _v0 = vmlaq_f32(_bias0, _v0, _scale0); + _v1 = vmlaq_f32(_bias1, _v1, _scale1); +#endif // __aarch64__ + vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope)); + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } +} + +static void requantize_pack4to8(const int* intptr0, const int* intptr1, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount) +{ + if (activation_type == 1) + { + requantize_relu_pack4to8(intptr0, intptr1, ptr, scale_in_data, bias_data, scale_out_data, elemcount); + return; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + const float slope = activation_params[0]; + requantize_leakyrelu_pack4to8(intptr0, intptr1, ptr, scale_in_data, bias_data, scale_out_data, slope, elemcount); + return; + } + + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + + // NCNN_LOGE("requantize_pack4to8 %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount); + + float32x4_t _scale_in0 = vdupq_n_f32(scale_in_data[0]); + float32x4_t _scale_in1 = _scale_in0; + if (scale_in_data_size > 1) + { + _scale_in0 = vld1q_f32((const float*)scale_in_data); + _scale_in1 = vld1q_f32((const float*)scale_in_data + 4); + } + + float32x4_t _scale_out0 = vdupq_n_f32(scale_out_data[0]); + float32x4_t _scale_out1 = _scale_out0; + if (scale_out_data_size > 1) + { + _scale_out0 = vld1q_f32((const float*)scale_out_data); + _scale_out1 = vld1q_f32((const float*)scale_out_data + 4); + } + + if (bias_data_size == 0) + { + int i = 0; + for (; i < elemcount; i++) + { + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); + _v0 = vmulq_f32(_v0, _scale_in0); + _v1 = vmulq_f32(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, 
activation_type, activation_params); + _v0 = vmulq_f32(_v0, _scale_out0); + _v1 = vmulq_f32(_v1, _scale_out1); + vst1_s8(ptr, float2int8(_v0, _v1)); + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + else + { + float32x4_t _bias0 = vdupq_n_f32(bias_data[0]); + float32x4_t _bias1 = _bias0; + if (bias_data_size > 1) + { + _bias0 = vld1q_f32((const float*)bias_data); + _bias1 = vld1q_f32((const float*)bias_data + 4); + } + + int i = 0; + for (; i < elemcount; i++) + { + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); +#if __aarch64__ + _v0 = vfmaq_f32(_bias0, _v0, _scale_in0); + _v1 = vfmaq_f32(_bias1, _v1, _scale_in1); +#else // __aarch64__ + _v0 = vmlaq_f32(_bias0, _v0, _scale_in0); + _v1 = vmlaq_f32(_bias1, _v1, _scale_in1); +#endif // __aarch64__ + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = vmulq_f32(_v0, _scale_out0); + _v1 = vmulq_f32(_v1, _scale_out1); + vst1_s8(ptr, float2int8(_v0, _v1)); + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } +} + +static void requantize_relu_pack4to1(const int* intptr, signed char* ptr0, signed char* ptr1, signed char* ptr2, signed char* ptr3, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int elemcount) +{ + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + + // NCNN_LOGE("requantize_relu_pack4to1 %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount); + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]); + if (scale_in_data_size > 1) + { + _scale_in = vld1q_f32((const float*)scale_in_data); + } + + float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]); + if (scale_out_data_size > 1) + { + _scale_out = vld1q_f32((const float*)scale_out_data); + } + + float32x4_t _scale = vmulq_f32(_scale_in, _scale_out); + + if (bias_data_size == 0) + { + int i = 0; + for (; i < elemcount; i++) + { + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); + _v = vmulq_f32(_v, _scale); + int8x8_t v = float2int8relu(_v, _v); + ptr0[0] = vget_lane_s8(v, 0); + ptr1[0] = vget_lane_s8(v, 1); + ptr2[0] = vget_lane_s8(v, 2); + ptr3[0] = vget_lane_s8(v, 3); + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + else + { + float32x4_t _bias = vdupq_n_f32(bias_data[0]); + if (bias_data_size > 1) + { + _bias = vld1q_f32((const float*)bias_data); + } + + _bias = vmulq_f32(_bias, _scale_out); + + int i = 0; + for (; i < elemcount; i++) + { + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); +#if __aarch64__ + _v = vfmaq_f32(_bias, _v, _scale); +#else // __aarch64__ + _v = vmlaq_f32(_bias, _v, _scale); +#endif // __aarch64__ + int8x8_t v = float2int8relu(_v, _v); + ptr0[0] = vget_lane_s8(v, 0); + ptr1[0] = vget_lane_s8(v, 1); + ptr2[0] = vget_lane_s8(v, 2); + ptr3[0] = vget_lane_s8(v, 3); + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } +} + +static void requantize_leakyrelu_pack4to1(const int* intptr, signed char* ptr0, signed char* ptr1, signed char* ptr2, signed char* ptr3, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, float slope, int elemcount) +{ + const int scale_in_data_size = scale_in_data.w; + const 
int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + + // NCNN_LOGE("requantize_leakyrelu_pack4to1 %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount); + + // int8(leakyrelu(v * scale_in, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out), slope) + + // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) + + float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]); + if (scale_in_data_size > 1) + { + _scale_in = vld1q_f32((const float*)scale_in_data); + } + + float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]); + if (scale_out_data_size > 1) + { + _scale_out = vld1q_f32((const float*)scale_out_data); + } + + float32x4_t _scale = vmulq_f32(_scale_in, _scale_out); + + float32x4_t _slope = vdupq_n_f32(slope); + + if (bias_data_size == 0) + { + int i = 0; + for (; i < elemcount; i++) + { + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); + _v = vmulq_f32(_v, _scale); + int8x8_t v = float2int8leakyrelu(_v, _v, _slope); + ptr0[0] = vget_lane_s8(v, 0); + ptr1[0] = vget_lane_s8(v, 1); + ptr2[0] = vget_lane_s8(v, 2); + ptr3[0] = vget_lane_s8(v, 3); + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + else + { + float32x4_t _bias = vdupq_n_f32(bias_data[0]); + if (bias_data_size > 1) + { + _bias = vld1q_f32((const float*)bias_data); + } + + _bias = vmulq_f32(_bias, _scale_out); + + int i = 0; + for (; i < elemcount; i++) + { + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); +#if __aarch64__ + _v = vfmaq_f32(_bias, _v, _scale); +#else // __aarch64__ + _v = vmlaq_f32(_bias, _v, _scale); +#endif // __aarch64__ + int8x8_t v = float2int8leakyrelu(_v, _v, _slope); + ptr0[0] = vget_lane_s8(v, 0); + ptr1[0] = vget_lane_s8(v, 1); + ptr2[0] = vget_lane_s8(v, 2); + ptr3[0] = vget_lane_s8(v, 3); + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } +} + +static void requantize_pack4to1(const int* intptr, signed char* ptr0, signed char* ptr1, signed char* ptr2, signed char* ptr3, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount) +{ + if (activation_type == 1) + { + requantize_relu_pack4to1(intptr, ptr0, ptr1, ptr2, ptr3, scale_in_data, bias_data, scale_out_data, elemcount); + return; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + const float slope = activation_params[0]; + requantize_leakyrelu_pack4to1(intptr, ptr0, ptr1, ptr2, ptr3, scale_in_data, bias_data, scale_out_data, slope, elemcount); + return; + } + + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + + // NCNN_LOGE("requantize_pack4to1 %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount); + + float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]); + if (scale_in_data_size > 1) + { + _scale_in = vld1q_f32((const float*)scale_in_data); + } + + float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]); + if (scale_out_data_size > 1) + { + _scale_out = vld1q_f32((const float*)scale_out_data); + } + + if (bias_data_size == 0) + { + int i = 0; + for (; i < elemcount; i++) + { + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); + _v = vmulq_f32(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = vmulq_f32(_v, _scale_out); + int8x8_t v = float2int8(_v, _v); + ptr0[0] = vget_lane_s8(v, 0); 
+ ptr1[0] = vget_lane_s8(v, 1); + ptr2[0] = vget_lane_s8(v, 2); + ptr3[0] = vget_lane_s8(v, 3); + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + else + { + float32x4_t _bias = vdupq_n_f32(bias_data[0]); + if (bias_data_size > 1) + { + _bias = vld1q_f32((const float*)bias_data); + } + + int i = 0; + for (; i < elemcount; i++) + { + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); +#if __aarch64__ + _v = vfmaq_f32(_bias, _v, _scale_in); +#else // __aarch64__ + _v = vmlaq_f32(_bias, _v, _scale_in); +#endif // __aarch64__ + _v = activation_ps(_v, activation_type, activation_params); + _v = vmulq_f32(_v, _scale_out); + int8x8_t v = float2int8(_v, _v); + ptr0[0] = vget_lane_s8(v, 0); + ptr1[0] = vget_lane_s8(v, 1); + ptr2[0] = vget_lane_s8(v, 2); + ptr3[0] = vget_lane_s8(v, 3); + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } +} +#endif // __ARM_NEON + int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int dims = bottom_blob.dims; @@ -547,11 +1052,20 @@ int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& const int h = bottom_blob.h; const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; - const size_t out_elemsize = elempack * 1u; if (dims == 1) { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __ARM_NEON + if (opt.use_packing_layout) + { + out_elempack = w * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outw = w * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -578,41 +1092,139 @@ int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (dims == 2) { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __ARM_NEON + if (opt.use_packing_layout) + { + out_elempack = h * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outh = h * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) +#if __ARM_NEON + if (elempack == 4 && out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * out_elempack, out_elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * out_elempack, out_elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? 
scale_out_data.range(i * out_elempack, out_elempack) : scale_out_data; + + requantize_pack4to8(intptr0, intptr1, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w); + } + } + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; + + requantize_pack4to1(intptr, ptr0, ptr1, ptr2, ptr3, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w); + } + } +#endif // __ARM_NEON + if (elempack == out_elempack) { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); - const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; - const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; - const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; - requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack); + requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack); + } } } if (dims == 3) { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __ARM_NEON + if (opt.use_packing_layout) + { + out_elempack = channels * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outc = channels * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __ARM_NEON + if (elempack == 4 && out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * out_elempack, out_elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * out_elempack, out_elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? 
scale_out_data.range(q * out_elempack, out_elempack) : scale_out_data; + + requantize_pack4to8(intptr0, intptr1, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h); + } + } + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data; + + requantize_pack4to1(intptr, ptr0, ptr1, ptr2, ptr3, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h); + } + } +#endif // __ARM_NEON + if (elempack == out_elempack) { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); - const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; - const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; - const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data; + const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? 
scale_out_data.range(q * elempack, elempack) : scale_out_data; - requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack); + requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack); + } } } diff --git a/src/layer/x86/requantize_x86.cpp b/src/layer/x86/requantize_x86.cpp index 6b64f86967d..69d730bdc2b 100644 --- a/src/layer/x86/requantize_x86.cpp +++ b/src/layer/x86/requantize_x86.cpp @@ -330,6 +330,216 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_ } } +#if __SSE2__ +#if __AVX512F__ +static void requantize_pack16to8(const int* intptr, signed char* ptr0, signed char* ptr1, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount) +{ + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + + // NCNN_LOGE("requantize_pack16to8 %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount); + + __m512 _scale_in = _mm512_set1_ps(scale_in_data[0]); + if (scale_in_data_size > 1) + { + _scale_in = _mm512_loadu_ps((const float*)scale_in_data); + } + + __m512 _scale_out = _mm512_set1_ps(scale_out_data[0]); + if (scale_out_data_size > 1) + { + _scale_out = _mm512_loadu_ps((const float*)scale_out_data); + } + + if (bias_data_size == 0) + { + int i = 0; + for (; i < elemcount; i++) + { + __m512 _v = _mm512_cvtepi32_ps(_mm512_loadu_si512((const __m512i*)intptr)); + _v = _mm512_mul_ps(_v, _scale_in); + _v = activation_avx512(_v, activation_type, activation_params); + _v = _mm512_mul_ps(_v, _scale_out); + __m128i v = float2int8_avx512(_v); + _mm_storel_pd((double*)ptr0, _mm_castsi128_pd(v)); + _mm_storeh_pd((double*)ptr1, _mm_castsi128_pd(v)); + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + } + else + { + __m512 _bias = _mm512_set1_ps(bias_data[0]); + if (bias_data_size > 1) + { + _bias = _mm512_loadu_ps((const float*)bias_data); + } + + int i = 0; + for (; i < elemcount; i++) + { + __m512 _v = _mm512_cvtepi32_ps(_mm512_loadu_si512((const __m512i*)intptr)); + _v = _mm512_fmadd_ps(_v, _scale_in, _bias); + _v = activation_avx512(_v, activation_type, activation_params); + _v = _mm512_mul_ps(_v, _scale_out); + __m128i v = float2int8_avx512(_v); + _mm_storel_pd((double*)ptr0, _mm_castsi128_pd(v)); + _mm_storeh_pd((double*)ptr1, _mm_castsi128_pd(v)); + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + } +} +#endif // __AVX512F__ + +#if !__AVX__ +static void requantize_pack4to8(const int* intptr0, const int* intptr1, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount) +{ + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + + // NCNN_LOGE("requantize_pack4to8 %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount); + + __m128 _scale_in0 = _mm_set1_ps(scale_in_data[0]); + __m128 _scale_in1 = _scale_in0; + if (scale_in_data_size > 1) + { + _scale_in0 = _mm_loadu_ps((const float*)scale_in_data); + _scale_in1 = _mm_loadu_ps((const float*)scale_in_data + 4); + } + + __m128 _scale_out0 = _mm_set1_ps(scale_out_data[0]); + __m128 _scale_out1 = _scale_out0; + if (scale_out_data_size > 1) + { + _scale_out0 = _mm_loadu_ps((const 
float*)scale_out_data); + _scale_out1 = _mm_loadu_ps((const float*)scale_out_data + 4); + } + + if (bias_data_size == 0) + { + int i = 0; + for (; i < elemcount; i++) + { + __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0)); + __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1)); + _v0 = _mm_mul_ps(_v0, _scale_in0); + _v1 = _mm_mul_ps(_v1, _scale_in1); + _v0 = activation_sse(_v0, activation_type, activation_params); + _v1 = activation_sse(_v1, activation_type, activation_params); + _v0 = _mm_mul_ps(_v0, _scale_out0); + _v1 = _mm_mul_ps(_v1, _scale_out1); + *(int64_t*)ptr = float2int8_sse(_v0, _v1); + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + else + { + __m128 _bias0 = _mm_set1_ps(bias_data[0]); + __m128 _bias1 = _bias0; + if (bias_data_size > 1) + { + _bias0 = _mm_loadu_ps((const float*)bias_data); + _bias1 = _mm_loadu_ps((const float*)bias_data + 4); + } + + int i = 0; + for (; i < elemcount; i++) + { + __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0)); + __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1)); + _v0 = _mm_comp_fmadd_ps(_v0, _scale_in0, _bias0); + _v1 = _mm_comp_fmadd_ps(_v1, _scale_in1, _bias1); + _v0 = activation_sse(_v0, activation_type, activation_params); + _v1 = activation_sse(_v1, activation_type, activation_params); + _v0 = _mm_mul_ps(_v0, _scale_out0); + _v1 = _mm_mul_ps(_v1, _scale_out1); + *(int64_t*)ptr = float2int8_sse(_v0, _v1); + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } +} +#endif // !__AVX__ + +static void requantize_pack4to1(const int* intptr, signed char* ptr0, signed char* ptr1, signed char* ptr2, signed char* ptr3, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount) +{ + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + + // NCNN_LOGE("requantize_pack4to1 %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount); + + __m128 _scale_in = _mm_set1_ps(scale_in_data[0]); + if (scale_in_data_size > 1) + { + _scale_in = _mm_loadu_ps((const float*)scale_in_data); + } + + __m128 _scale_out = _mm_set1_ps(scale_out_data[0]); + if (scale_out_data_size > 1) + { + _scale_out = _mm_loadu_ps((const float*)scale_out_data); + } + + if (bias_data_size == 0) + { + int i = 0; + for (; i < elemcount; i++) + { + __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); + _v = _mm_mul_ps(_v, _scale_in); + _v = activation_sse(_v, activation_type, activation_params); + _v = _mm_mul_ps(_v, _scale_out); + int32_t v = float2int8_sse(_v); + ptr0[0] = (v >> 0) & 0xff; + ptr1[0] = (v >> 8) & 0xff; + ptr2[0] = (v >> 16) & 0xff; + ptr3[0] = (v >> 24) & 0xff; + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + else + { + __m128 _bias = _mm_set1_ps(bias_data[0]); + if (bias_data_size > 1) + { + _bias = _mm_loadu_ps((const float*)bias_data); + } + + int i = 0; + for (; i < elemcount; i++) + { + __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); + _v = _mm_comp_fmadd_ps(_v, _scale_in, _bias); + _v = activation_sse(_v, activation_type, activation_params); + _v = _mm_mul_ps(_v, _scale_out); + int32_t v = float2int8_sse(_v); + ptr0[0] = (v >> 0) & 0xff; + ptr1[0] = (v >> 8) & 0xff; + ptr2[0] = (v >> 16) & 0xff; + ptr3[0] = (v >> 24) & 0xff; + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } +} +#endif // __SSE2__ + int 
Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int dims = bottom_blob.dims; @@ -337,11 +547,20 @@ int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& const int h = bottom_blob.h; const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; - const size_t out_elemsize = elempack * 1u; if (dims == 1) { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __SSE2__ + if (opt.use_packing_layout) + { + out_elempack = w * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outw = w * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -368,41 +587,179 @@ int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (dims == 2) { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __SSE2__ + if (opt.use_packing_layout) + { + out_elempack = h * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outh = h * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) +#if __SSE2__ +#if __AVX512F__ + if (elempack == 16 && out_elempack == 8) { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 2); + signed char* ptr1 = top_blob.row(i * 2 + 1); - const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; - const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; - const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; - requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack); + requantize_pack16to8(intptr, ptr0, ptr1, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w); + } + } +#endif // __AVX512F__ +#if !__AVX__ + if (elempack == 4 && out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * out_elempack, out_elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * out_elempack, out_elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? 
scale_out_data.range(i * out_elempack, out_elempack) : scale_out_data; + + requantize_pack4to8(intptr0, intptr1, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w); + } + } +#endif // !__AVX__ + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; + + requantize_pack4to1(intptr, ptr0, ptr1, ptr2, ptr3, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w); + } + } +#endif // __SSE2__ + if (elempack == out_elempack) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; + + requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack); + } } } if (dims == 3) { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __SSE2__ + if (opt.use_packing_layout) + { + out_elempack = channels * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outc = channels * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __SSE2__ +#if __AVX512F__ + if (elempack == 16 && out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 2); + signed char* ptr1 = top_blob.channel(q * 2 + 1); + + const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data; + + requantize_pack16to8(intptr, ptr0, ptr1, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h); + } + } +#endif // __AVX512F__ +#if !__AVX__ + if (elempack == 4 && out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + const Mat scale_in_data_q = scale_in_data_size > 1 ? 
scale_in_data.range(q * out_elempack, out_elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * out_elempack, out_elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * out_elempack, out_elempack) : scale_out_data; + + requantize_pack4to8(intptr0, intptr1, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h); + } + } +#endif // !__AVX__ + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data; + + requantize_pack4to1(intptr, ptr0, ptr1, ptr2, ptr3, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h); + } + } +#endif // __SSE2__ + if (elempack == out_elempack) { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); - const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; - const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; - const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data; + const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data; - requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack); + requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack); + } } }
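
The relu/leakyrelu fast paths above rest on the folding identity spelled out in their comments: because scale_out is positive, applying the activation before or after the output scaling picks the same branch, so int8(relu(v * scale_in + bias) * scale_out) equals int8_relu(v * (scale_in * scale_out) + bias * scale_out), and the inner loop collapses to one fused multiply-add plus one clamp. Below is a minimal scalar sketch of that identity, not part of the patch; float2int8 here is a hypothetical round-and-saturate stand-in for ncnn's NEON/SSE helpers, and the tolerance of one quantization step only allows for float reassociation at rounding boundaries.

// Scalar sketch of the scale/bias folding used by the relu/leakyrelu kernels.
// float2int8 is a stand-in helper, not ncnn's intrinsic wrapper.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>

static signed char float2int8(float v)
{
    int i = (int)std::nearbyintf(v);
    return (signed char)std::min(std::max(i, -127), 127);
}

int main()
{
    const float scale_in = 0.013f;
    const float scale_out = 57.3f;
    const float bias = -1.25f;
    const float slope = 0.1f;

    for (int v = -40000; v <= 40000; v += 7)
    {
        // reference path: dequantize, activate, then requantize
        float x = v * scale_in + bias;
        signed char relu_ref = float2int8((x > 0.f ? x : 0.f) * scale_out);
        signed char leaky_ref = float2int8((x > 0.f ? x : x * slope) * scale_out);

        // folded path: one multiply-add with pre-merged scale and bias,
        // activation applied to the already-scaled value (valid since scale_out > 0)
        float y = v * (scale_in * scale_out) + bias * scale_out;
        signed char relu_fold = float2int8(y > 0.f ? y : 0.f);
        signed char leaky_fold = float2int8(y > 0.f ? y : y * slope);

        // reassociating the multiplies can shift a value across a rounding
        // boundary, so allow a difference of at most one quantization step
        if (std::abs(relu_ref - relu_fold) > 1 || std::abs(leaky_ref - leaky_fold) > 1)
        {
            printf("mismatch at v=%d\n", v);
            return 1;
        }
    }
    printf("folded requantize matches the reference path\n");
    return 0;
}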
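
requantize_pack4to8 changes the packing layout while it quantizes: two consecutive pack4 rows or channels (channels 4q..4q+3 and 4q+4..4q+7) are merged into one pack8 output row, requantize_pack4to1 does the inverse by scattering one pack4 row into four pack1 rows, and the AVX512 pack16to8 path is the mirror image, splitting one pack16 channel into two pack8 channels. The following plain scalar sketch of the index mapping for the 4-to-8 case uses hypothetical flat buffers in place of ncnn::Mat and only shows the data movement, without the int32-to-int8 conversion.

// Scalar sketch of the pack4 -> pack8 index mapping done by requantize_pack4to8.
// Hypothetical flat buffers stand in for ncnn::Mat rows/channels.
#include <cstdio>
#include <vector>

int main()
{
    const int elemcount = 3; // w (dims == 2) or w * h (dims == 3)

    // two consecutive pack4 rows of the bottom blob
    std::vector<int> row0(elemcount * 4), row1(elemcount * 4);
    for (int i = 0; i < elemcount * 4; i++)
    {
        row0[i] = 100 + i; // channels 0..3, interleaved per element
        row1[i] = 200 + i; // channels 4..7, interleaved per element
    }

    // one pack8 row of the top blob
    std::vector<int> out(elemcount * 8);

    const int* intptr0 = row0.data();
    const int* intptr1 = row1.data();
    int* ptr = out.data();
    for (int i = 0; i < elemcount; i++)
    {
        for (int k = 0; k < 4; k++) ptr[k] = intptr0[k];     // low half: channels 0..3
        for (int k = 0; k < 4; k++) ptr[4 + k] = intptr1[k]; // high half: channels 4..7
        intptr0 += 4;
        intptr1 += 4;
        ptr += 8;
    }

    for (int i = 0; i < elemcount; i++)
    {
        printf("element %d:", i);
        for (int k = 0; k < 8; k++) printf(" %d", out[i * 8 + k]);
        printf("\n");
    }
    return 0;
}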
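
The forward() changes pick the output packing from the total channel count rather than keeping the input elempack: when use_packing_layout is enabled and dim * elempack is divisible by 8, the int8 output is stored as pack8, otherwise it falls back to pack1, and outw/outh/outc shrink or grow accordingly. A small standalone sketch of that decision follows; pick_packing is a hypothetical helper, not part of the patch.

// Sketch of the out_elempack selection in Requantize_arm/x86::forward.
#include <cstdio>

static void pick_packing(int dim, int elempack, bool use_packing_layout)
{
    int out_elempack = 1;
    if (use_packing_layout)
        out_elempack = (dim * elempack) % 8 == 0 ? 8 : 1;

    const int outdim = dim * elempack / out_elempack;
    printf("dim=%d elempack=%d -> out_elempack=%d outdim=%d\n",
           dim, elempack, out_elempack, outdim);
}

int main()
{
    pick_packing(8, 4, true);  // 32 channels: pack4 input, pack8 int8 output (4to8 path)
    pick_packing(3, 4, true);  // 12 channels: not divisible by 8, pack1 output (4to1 path)
    pick_packing(4, 16, true); // AVX512 pack16 input: pack8 output (16to8 path)
    pick_packing(8, 4, false); // packing layout disabled: pack1 output
    return 0;
}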