From 4936e808e99426d3cbb55b542a1de5246b8b1c5e Mon Sep 17 00:00:00 2001
From: Molly Sophia <mollysophia379@gmail.com>
Date: Thu, 15 Feb 2024 18:45:45 +0800
Subject: [PATCH] fix vfmacc operand order in riscv requantize/dequantize packing, drop quantize debug logs

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
---
 src/layer/riscv/dequantize_riscv.cpp | 24 ++++++++++++------------
 src/layer/riscv/quantize_riscv.cpp   |  2 --
 src/layer/riscv/requantize_riscv.cpp | 20 ++++++++++----------
 3 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/src/layer/riscv/dequantize_riscv.cpp b/src/layer/riscv/dequantize_riscv.cpp
index 9a172a41405..27662b4f98e 100644
--- a/src/layer/riscv/dequantize_riscv.cpp
+++ b/src/layer/riscv/dequantize_riscv.cpp
@@ -83,7 +83,7 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
                         float* ptr = (float*)top_blob + i * packn;
 
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = vfmacc_vf_f32m4(_v, scale, _bias, vl);
+                        _v = vfmacc_vf_f32m4(_bias, scale, _v, vl);
                         vse32_v_f32m4(ptr, _v, vl);
                     }
                 }
@@ -97,7 +97,7 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
 
                         vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl);
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = vfmacc_vf_f32m4(_v, scale, _bias, vl);
+                        _v = vfmacc_vf_f32m4(_bias, scale, _v, vl);
                         vse32_v_f32m4(ptr, _v, vl);
                     }
                 }
@@ -130,7 +130,7 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
 
                         vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl);
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = vfmacc_vv_f32m4(_v, _scale, _bias, vl);       
+                        _v = vfmacc_vv_f32m4(_bias, _scale, _v, vl);       
                         vse32_v_f32m4(ptr, _v, vl);                 
                     }
                 }
@@ -145,7 +145,7 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
                         vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl);
                         vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl);
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = vfmacc_vv_f32m4(_v, _scale, _bias, vl);
+                        _v = vfmacc_vv_f32m4(_bias, _scale, _v, vl);
                         vse32_v_f32m4(ptr, _v, vl);
                     }
                 }
@@ -227,10 +227,10 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
                         vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl);
                         vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl);
                         vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl);
-                        _v0 = vfmacc_vv_f32m4(_v0, _scale0, _bias0, vl);
-                        _v1 = vfmacc_vv_f32m4(_v1, _scale1, _bias1, vl);
-                        _v2 = vfmacc_vv_f32m4(_v2, _scale2, _bias2, vl);
-                        _v3 = vfmacc_vv_f32m4(_v3, _scale3, _bias3, vl);
+                        _v0 = vfmacc_vv_f32m4(_bias0, _scale0, _v0, vl);
+                        _v1 = vfmacc_vv_f32m4(_bias1, _scale1, _v1, vl);
+                        _v2 = vfmacc_vv_f32m4(_bias2, _scale2, _v2, vl);
+                        _v3 = vfmacc_vv_f32m4(_bias3, _scale3, _v3, vl);
                         vse32_v_f32m4(ptr0, _v0, vl);
                         vse32_v_f32m4(ptr1, _v1, vl);
                         vse32_v_f32m4(ptr2, _v2, vl);
@@ -323,10 +323,10 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
                         vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl);
                         vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl);
                         vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl);
-                        _v0 = vfmacc_vv_f32m4(_v0, _scale0, _bias0, vl);
-                        _v1 = vfmacc_vv_f32m4(_v1, _scale1, _bias1, vl);
-                        _v2 = vfmacc_vv_f32m4(_v2, _scale2, _bias2, vl);
-                        _v3 = vfmacc_vv_f32m4(_v3, _scale3, _bias3, vl);
+                        _v0 = vfmacc_vv_f32m4(_bias0, _scale0, _v0, vl);
+                        _v1 = vfmacc_vv_f32m4(_bias1, _scale1, _v1, vl);
+                        _v2 = vfmacc_vv_f32m4(_bias2, _scale2, _v2, vl);
+                        _v3 = vfmacc_vv_f32m4(_bias3, _scale3, _v3, vl);
                         vse32_v_f32m4(ptr0, _v0, vl);
                         vse32_v_f32m4(ptr1, _v1, vl);
                         vse32_v_f32m4(ptr2, _v2, vl);
diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp
index 172c7d45ab3..ed2a331dfd9 100644
--- a/src/layer/riscv/quantize_riscv.cpp
+++ b/src/layer/riscv/quantize_riscv.cpp
@@ -231,8 +231,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
             int size = w * h;
             int out_elempack = opt.use_packing_layout && channels * elempack % out_packn == 0 ? out_packn : 1;
             int outc = channels * elempack / out_elempack;
-            NCNN_LOGE("out_elempack:%d", out_elempack);
-            NCNN_LOGE("outc:%d", outc);
 
             top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
             if (top_blob.empty())
diff --git a/src/layer/riscv/requantize_riscv.cpp b/src/layer/riscv/requantize_riscv.cpp
index 220087691d5..f2d3db2d1ff 100644
--- a/src/layer/riscv/requantize_riscv.cpp
+++ b/src/layer/riscv/requantize_riscv.cpp
@@ -83,7 +83,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
                         signed char* ptr = (signed char*)top_blob + i * packn;
 
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl);
+                        _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl);
                         vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl);
                         vse8_v_i8m1(ptr, _out, vl);
                     }
@@ -98,7 +98,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
 
                         vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl);
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl);
+                        _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl);
                         vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl);
                         vse8_v_i8m1(ptr, _out, vl);
                     }
@@ -137,7 +137,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
                         vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl);
 
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl);
+                        _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl);
                         vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl);
                         vse8_v_i8m1(ptr, _out, vl);
                     }
@@ -154,7 +154,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
                         vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl);
 
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl);
+                        _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl);
                         vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl);
                         vse8_v_i8m1(ptr, _out, vl);
                     }
@@ -193,7 +193,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
                         vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl);
 
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl);
+                        _v = activation_ps(vfmacc_vv_f32m4(_bias, _scale_in, _v, vl), activation_type, activation_params, vl);
                         vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl);
                         vse8_v_i8m1(ptr, _out, vl);
                     }
@@ -210,7 +210,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
                         vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl);
 
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl);
+                        _v = activation_ps(vfmacc_vv_f32m4(_bias, _scale_in, _v, vl), activation_type, activation_params, vl);
                         vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl);
                         vse8_v_i8m1(ptr, _out, vl);
                     }
@@ -249,7 +249,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
                         vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl);
 
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl);
+                        _v = activation_ps(vfmacc_vv_f32m4(_bias, _scale_in, _v, vl), activation_type, activation_params, vl);
                         vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl);
                         vse8_v_i8m1(ptr, _out, vl);
                     }
@@ -267,7 +267,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
                         vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl);
 
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl);
+                        _v = activation_ps(vfmacc_vv_f32m4(_bias, _scale_in, _v, vl), activation_type, activation_params, vl);
                         vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl);
                         vse8_v_i8m1(ptr, _out, vl);
                     }
@@ -322,7 +322,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
                     for (int j = 0; j < w; j++)
                     {
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = activation_ps(vfmacc_vv_f32m4(_v, scale_in, bias, vl), activation_type, activation_params, vl);
+                        _v = activation_ps(vfmacc_vv_f32m4(bias, scale_in, _v, vl), activation_type, activation_params, vl);
                         vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl);
                         vse8_v_i8m1(ptr, _out, vl);
 
@@ -394,7 +394,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
                     for (int i = 0; i < size; i++)
                     {
                         vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl);
-                        _v = activation_ps(vfmacc_vv_f32m4(_v, scale_in, bias, vl), activation_type, activation_params, vl);
+                        _v = activation_ps(vfmacc_vv_f32m4(bias, scale_in, _v, vl), activation_type, activation_params, vl);
                         vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl);
                         vse8_v_i8m1(ptr, _out, vl);