diff --git a/src/layer/riscv/gemm_riscv.cpp b/src/layer/riscv/gemm_riscv.cpp
index 33f8913bd1b..9b4b58ac651 100644
--- a/src/layer/riscv/gemm_riscv.cpp
+++ b/src/layer/riscv/gemm_riscv.cpp
@@ -99,23 +99,10 @@ static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max
                 vfloat32m1_t _r6h = vle32_v_f32m1(p6 + 4, vl);
                 vfloat32m1_t _r7l = vle32_v_f32m1(p7, vl);
                 vfloat32m1_t _r7h = vle32_v_f32m1(p7 + 4, vl);
-                transpose8x8_ps(_r0l, _r0h, _r1l, _r1h, _r2l, _r2h, _r3l, _r3h, _r4l, _r4h, _r5l, _r5h, _r6l, _r6h, _r7l, _r7h, vl);
-                vse32_v_f32m1(pp, _r0l, vl);
-                vse32_v_f32m1(pp + 4, _r0h, vl);
-                vse32_v_f32m1(pp + 8, _r1l, vl);
-                vse32_v_f32m1(pp + 12, _r1h, vl);
-                vse32_v_f32m1(pp + 8 * 2, _r2l, vl);
-                vse32_v_f32m1(pp + 8 * 2 + 4, _r2h, vl);
-                vse32_v_f32m1(pp + 8 * 3, _r3l, vl);
-                vse32_v_f32m1(pp + 8 * 3 + 4, _r3h, vl);
-                vse32_v_f32m1(pp + 8 * 4, _r4l, vl);
-                vse32_v_f32m1(pp + 8 * 4 + 4, _r4h, vl);
-                vse32_v_f32m1(pp + 8 * 5, _r5l, vl);
-                vse32_v_f32m1(pp + 8 * 5 + 4, _r5h, vl);
-                vse32_v_f32m1(pp + 8 * 6, _r6l, vl);
-                vse32_v_f32m1(pp + 8 * 6 + 4, _r6h, vl);
-                vse32_v_f32m1(pp + 8 * 7, _r7l, vl);
-                vse32_v_f32m1(pp + 8 * 7 + 4, _r7h, vl);
+
+                vsseg8e32_v_f32m1(pp, _r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l, vl);
+                vsseg8e32_v_f32m1(pp + 32, _r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h, vl);
+
                 pp += 64;
                 p0 += 8;
                 p1 += 8;
@@ -175,7 +162,7 @@ static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max
                 vfloat32m1_t v1 = vle32_v_f32m1(p1, vl);
                 vfloat32m1_t v2 = vle32_v_f32m1(p2, vl);
                 vfloat32m1_t v3 = vle32_v_f32m1(p3, vl);
-                store_float_v4(v0, v1, v2, v3, pp, vl);
+                vsseg4e32_v_f32m1(pp, v0, v1, v2, v3, vl);
                 pp += 16;
                 p0 += 4;
                 p1 += 4;
@@ -210,7 +197,7 @@ static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max
             {
                 vfloat32m1_t v0 = vle32_v_f32m1(p0, vl);
                 vfloat32m1_t v1 = vle32_v_f32m1(p1, vl);
-                store_float_v2(v0, v1, pp, vl);
+                vsseg2e32_v_f32m1(pp, v0, v1, vl);
                 pp += 8;
                 p0 += 4;
                 p1 += 4;
@@ -353,7 +340,7 @@ static void transpose_pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int
             {
                 vfloat32m1_t v0 = vle32_v_f32m1(p0, vl);
                 vfloat32m1_t v1 = vle32_v_f32m1(p0 + 4, vl);
-                store_float_v2(v0, v1, pp, vl);
+                vsseg2e32_v_f32m1(pp, v0, v1, vl);
                 pp += 8;
                 p0 += A_hstep * 4;
             }
@@ -562,17 +549,8 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max
                 vfloat32m1_t _r6 = vle32_v_f32m1(p6, vl);
                 vfloat32m1_t _r7 = vle32_v_f32m1(p7, vl);
 
-                transpose4x4_ps(_r0, _r1, _r2, _r3, vl);
-                transpose4x4_ps(_r4, _r5, _r6, _r7, vl);
+                vsseg8e32_v_f32m1(pp, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, vl);
 
-                vse32_v_f32m1(pp, _r0, vl);
-                vse32_v_f32m1(pp + 4, _r4, vl);
-                vse32_v_f32m1(pp + 4 * 2, _r1, vl);
-                vse32_v_f32m1(pp + 4 * 3, _r5, vl);
-                vse32_v_f32m1(pp + 4 * 4, _r2, vl);
-                vse32_v_f32m1(pp + 4 * 5, _r6, vl);
-                vse32_v_f32m1(pp + 4 * 6, _r3, vl);
-                vse32_v_f32m1(pp + 4 * 7, _r7, vl);
                 pp += 32;
                 p0 += 4;
                 p1 += 4;
@@ -632,7 +610,7 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max
                 vfloat32m1_t v1 = vle32_v_f32m1(p1, vl);
                 vfloat32m1_t v2 = vle32_v_f32m1(p2, vl);
                 vfloat32m1_t v3 = vle32_v_f32m1(p3, vl);
-                store_float_v4(v0, v1, v2, v3, pp, vl);
+                vsseg4e32_v_f32m1(pp, v0, v1, v2, v3, vl);
                 pp += 16;
                 p0 += 4;
                 p1 += 4;
@@ -667,7 +645,7 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max
             {
                 vfloat32m1_t v0 = vle32_v_f32m1(p0, vl);
                 vfloat32m1_t v1 = vle32_v_f32m1(p1, vl);
-                store_float_v2(v0, v1, pp, vl);
+                vsseg2e32_v_f32m1(pp, v0, v1, vl);
                 pp += 8;
                 p0 += 4;
                 p1 += 4;
@@ -865,7 +843,7 @@ static void transpose_pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int
             {
                 vfloat32m1_t v0 = vle32_v_f32m1(p0, vl);
                 vfloat32m1_t v1 = vle32_v_f32m1(p0 + 4, vl);
-                store_float_v2(v0, v1, pp, vl);
+                vsseg2e32_v_f32m1(pp, v0, v1, vl);
                 pp += 8;
                 p0 += B_hstep * 4;
             }
@@ -937,12 +915,12 @@ static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i,
                 vfloat32m1_t v1 = vle32_v_f32m1(pp + 8, vl);
                 vfloat32m1_t v2 = vle32_v_f32m1(pp + 16, vl);
                 vfloat32m1_t v3 = vle32_v_f32m1(pp + 24, vl);
-                store_float_v4(v0, v1, v2, v3, p0, vl);
+                vsseg4e32_v_f32m1(p0, v0, v1, v2, v3, vl);
                 v0 = vle32_v_f32m1(pp + 4, vl);
                 v1 = vle32_v_f32m1(pp + 12, vl);
                 v2 = vle32_v_f32m1(pp + 20, vl);
                 v3 = vle32_v_f32m1(pp + 28, vl);
-                store_float_v4(v0, v1, v2, v3, p0 + 16, vl);
+                vsseg4e32_v_f32m1(p0 + 16, v0, v1, v2, v3, vl);
                 pp += 32;
                 p0 += out_hstep * 4;
             }
@@ -974,7 +952,7 @@ static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i,
                 vfloat32m1_t v1 = vle32_v_f32m1(pp + 4, vl);
                 vfloat32m1_t v2 = vle32_v_f32m1(pp + 8, vl);
                 vfloat32m1_t v3 = vle32_v_f32m1(pp + 12, vl);
-                store_float_v4(v0, v1, v2, v3, p0, vl);
+                vsseg4e32_v_f32m1(p0, v0, v1, v2, v3, vl);
                 pp += 16;
                 p0 += out_hstep * 4;
             }
@@ -2887,9 +2865,9 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
             }
             else
             {
-                store_float_v2(_sum00, _sum10, outptr, vl);
-                store_float_v2(_sum01, _sum11, outptr + 8, vl);
-                store_float_v2(_sum02, _sum12, outptr + 16, vl);
+                vsseg2e32_v_f32m1(outptr, _sum00, _sum10, vl);
+                vsseg2e32_v_f32m1(outptr + 8, _sum01, _sum11, vl);
+                vsseg2e32_v_f32m1(outptr + 16, _sum02, _sum12, vl);
             }
 
             outptr += 24;
@@ -2974,8 +2952,8 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
             }
             else
             {
-                store_float_v2(_sum00, _sum10, outptr, vl);
-                store_float_v2(_sum01, _sum11, outptr + 8, vl);
+                vsseg2e32_v_f32m1(outptr, _sum00, _sum10, vl);
+                vsseg2e32_v_f32m1(outptr + 8, _sum01, _sum11, vl);
             }
 
             outptr += 16;
@@ -3048,7 +3026,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
             }
             else
             {
-                store_float_v2(_sum0, _sum1, outptr, vl);
+                vsseg2e32_v_f32m1(outptr, _sum0, _sum1, vl);
             }
 
             outptr += 8;
diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h
index 938d3ce3998..e2824646f87 100644
--- a/src/layer/riscv/riscv_usability.h
+++ b/src/layer/riscv/riscv_usability.h
@@ -86,282 +86,6 @@ static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr)
     return vloxei32_v_f32m8(ptr, bindex, vl);
 }
 
-static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
-                                   vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
-                                   vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
-                                   vfloat32m1_t& _r3l, vfloat32m1_t& _r3h,
-                                   vfloat32m1_t& _r4l, vfloat32m1_t& _r4h,
-                                   vfloat32m1_t& _r5l, vfloat32m1_t& _r5h,
-                                   vfloat32m1_t& _r6l, vfloat32m1_t& _r6h,
-                                   vfloat32m1_t& _r7l, vfloat32m1_t& _r7h, size_t vl)
-{
-    float tmp[8][8];
-    vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 8, _r0l, vl);
-    vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 8, _r0h, vl);
-    vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 8, _r1l, vl);
-    vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 8, _r1h, vl);
-    vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 8, _r2l, vl);
-    vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 8, _r2h, vl);
-    vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 8, _r3l, vl);
-    vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 8, _r3h, vl);
-    vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 8, _r4l, vl);
-    vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 8, _r4h, vl);
-    vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 8, _r5l, vl);
-    vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 8, _r5h, vl);
-    vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 8, _r6l, vl);
-    vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 8, _r6h, vl);
-    vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 8, _r7l, vl);
-    vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 8, _r7h, vl);
-    float* ptr = (float*)tmp;
-    _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
-    _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
-    _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
-    _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
-    _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
-    _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
-    _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
-    _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
-    _r4l = vle32_v_f32m1(ptr + 8 * 4, vl);
-    _r4h = vle32_v_f32m1(ptr + 9 * 4, vl);
-    _r5l = vle32_v_f32m1(ptr + 10 * 4, vl);
-    _r5h = vle32_v_f32m1(ptr + 11 * 4, vl);
-    _r6l = vle32_v_f32m1(ptr + 12 * 4, vl);
-    _r6h = vle32_v_f32m1(ptr + 13 * 4, vl);
-    _r7l = vle32_v_f32m1(ptr + 14 * 4, vl);
-    _r7h = vle32_v_f32m1(ptr + 15 * 4, vl);
-}
-
-static inline void transpose4x4_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, size_t vl)
-{
-    float tmp[4][4];
-    vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 4, _r0, vl);
-    vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 4, _r1, vl);
-    vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 4, _r2, vl);
-    vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 4, _r3, vl);
-    float* ptr = (float*)tmp;
-    _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
-    _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
-    _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
-    _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
-}
-
-static inline void transpose8x12_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
-                                    vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
-                                    vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
-                                    vfloat32m1_t& _r3l, vfloat32m1_t& _r3h,
-                                    vfloat32m1_t& _r4l, vfloat32m1_t& _r4h,
-                                    vfloat32m1_t& _r5l, vfloat32m1_t& _r5h,
-                                    vfloat32m1_t& _r6l, vfloat32m1_t& _r6h,
-                                    vfloat32m1_t& _r7l, vfloat32m1_t& _r7h,
-                                    vfloat32m1_t& _r8l, vfloat32m1_t& _r8h,
-                                    vfloat32m1_t& _r9l, vfloat32m1_t& _r9h,
-                                    vfloat32m1_t& _ral, vfloat32m1_t& _rah,
-                                    vfloat32m1_t& _rbl, vfloat32m1_t& _rbh, size_t vl)
-{
-    float tmp[8][12];
-    vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0l, vl);
-    vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 12, _r0h, vl);
-    vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1l, vl);
-    vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 12, _r1h, vl);
-    vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2l, vl);
-    vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 12, _r2h, vl);
-    vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3l, vl);
-    vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 12, _r3h, vl);
-    vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4l, vl);
-    vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 12, _r4h, vl);
-    vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5l, vl);
-    vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 12, _r5h, vl);
-    vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6l, vl);
-    vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 12, _r6h, vl);
-    vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7l, vl);
-    vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 12, _r7h, vl);
-    vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8l, vl);
-    vsse32_v_f32m1(&tmp[4][8], sizeof(float) * 12, _r8h, vl);
-    vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9l, vl);
-    vsse32_v_f32m1(&tmp[4][9], sizeof(float) * 12, _r9h, vl);
-    vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ral, vl);
-    vsse32_v_f32m1(&tmp[4][10], sizeof(float) * 12, _rah, vl);
-    vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rbl, vl);
-    vsse32_v_f32m1(&tmp[4][11], sizeof(float) * 12, _rbh, vl);
-    float* ptr = (float*)tmp;
-    _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
-    _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
-    _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
-    _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
-    _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
-    _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
-    _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
-    _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
-    _r4l = vle32_v_f32m1(ptr + 8 * 4, vl);
-    _r4h = vle32_v_f32m1(ptr + 9 * 4, vl);
-    _r5l = vle32_v_f32m1(ptr + 10 * 4, vl);
-    _r5h = vle32_v_f32m1(ptr + 11 * 4, vl);
-    _r6l = vle32_v_f32m1(ptr + 12 * 4, vl);
-    _r6h = vle32_v_f32m1(ptr + 13 * 4, vl);
-    _r7l = vle32_v_f32m1(ptr + 14 * 4, vl);
-    _r7h = vle32_v_f32m1(ptr + 15 * 4, vl);
-    _r8l = vle32_v_f32m1(ptr + 16 * 4, vl);
-    _r8h = vle32_v_f32m1(ptr + 17 * 4, vl);
-    _r9l = vle32_v_f32m1(ptr + 18 * 4, vl);
-    _r9h = vle32_v_f32m1(ptr + 19 * 4, vl);
-    _ral = vle32_v_f32m1(ptr + 20 * 4, vl);
-    _rah = vle32_v_f32m1(ptr + 21 * 4, vl);
-    _rbl = vle32_v_f32m1(ptr + 22 * 4, vl);
-    _rbh = vle32_v_f32m1(ptr + 23 * 4, vl);
-}
-
-static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vfloat32m1_t& _r0h,
-                                    vfloat32m1_t& _r1l, vfloat32m1_t& _r1m, vfloat32m1_t& _r1h,
-                                    vfloat32m1_t& _r2l, vfloat32m1_t& _r2m, vfloat32m1_t& _r2h,
-                                    vfloat32m1_t& _r3l, vfloat32m1_t& _r3m, vfloat32m1_t& _r3h,
-                                    vfloat32m1_t& _r4l, vfloat32m1_t& _r4m, vfloat32m1_t& _r4h,
-                                    vfloat32m1_t& _r5l, vfloat32m1_t& _r5m, vfloat32m1_t& _r5h,
-                                    vfloat32m1_t& _r6l, vfloat32m1_t& _r6m, vfloat32m1_t& _r6h,
-                                    vfloat32m1_t& _r7l, vfloat32m1_t& _r7m, vfloat32m1_t& _r7h, size_t vl)
-{
-    float tmp[12][8];
-    vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 8, _r0l, vl);
-    vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 8, _r0m, vl);
-    vsse32_v_f32m1(&tmp[8][0], sizeof(float) * 8, _r0h, vl);
-    vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 8, _r1l, vl);
-    vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 8, _r1m, vl);
-    vsse32_v_f32m1(&tmp[8][0], sizeof(float) * 8, _r1h, vl);
-    vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 8, _r2l, vl);
-    vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 8, _r2m, vl);
-    vsse32_v_f32m1(&tmp[8][2], sizeof(float) * 8, _r2h, vl);
-    vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 8, _r3l, vl);
-    vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 8, _r3m, vl);
-    vsse32_v_f32m1(&tmp[8][3], sizeof(float) * 8, _r3h, vl);
-    vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 8, _r4l, vl);
-    vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 8, _r4m, vl);
-    vsse32_v_f32m1(&tmp[8][4], sizeof(float) * 8, _r4h, vl);
-    vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 8, _r5l, vl);
-    vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 8, _r5m, vl);
-    vsse32_v_f32m1(&tmp[8][5], sizeof(float) * 8, _r5h, vl);
-    vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 8, _r6l, vl);
-    vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 8, _r6m, vl);
-    vsse32_v_f32m1(&tmp[8][6], sizeof(float) * 8, _r6h, vl);
-    vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 8, _r7l, vl);
-    vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 8, _r7m, vl);
-    vsse32_v_f32m1(&tmp[8][7], sizeof(float) * 8, _r7h, vl);
-    float* ptr = (float*)tmp;
-    _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
-    _r0m = vle32_v_f32m1(ptr + 1 * 4, vl);
-    _r0h = vle32_v_f32m1(ptr + 2 * 4, vl);
-    _r1l = vle32_v_f32m1(ptr + 3 * 4, vl);
-    _r1m = vle32_v_f32m1(ptr + 4 * 4, vl);
-    _r1h = vle32_v_f32m1(ptr + 5 * 4, vl);
-    _r2l = vle32_v_f32m1(ptr + 6 * 4, vl);
-    _r2m = vle32_v_f32m1(ptr + 7 * 4, vl);
-    _r2h = vle32_v_f32m1(ptr + 8 * 4, vl);
-    _r3l = vle32_v_f32m1(ptr + 9 * 4, vl);
-    _r3m = vle32_v_f32m1(ptr + 10 * 4, vl);
-    _r3h = vle32_v_f32m1(ptr + 11 * 4, vl);
-    _r4l = vle32_v_f32m1(ptr + 12 * 4, vl);
-    _r4m = vle32_v_f32m1(ptr + 13 * 4, vl);
-    _r4h = vle32_v_f32m1(ptr + 14 * 4, vl);
-    _r5l = vle32_v_f32m1(ptr + 15 * 4, vl);
-    _r5m = vle32_v_f32m1(ptr + 16 * 4, vl);
-    _r5h = vle32_v_f32m1(ptr + 17 * 4, vl);
-    _r6l = vle32_v_f32m1(ptr + 18 * 4, vl);
-    _r6m = vle32_v_f32m1(ptr + 19 * 4, vl);
-    _r6h = vle32_v_f32m1(ptr + 20 * 4, vl);
-    _r7l = vle32_v_f32m1(ptr + 21 * 4, vl);
-    _r7m = vle32_v_f32m1(ptr + 22 * 4, vl);
-    _r7h = vle32_v_f32m1(ptr + 23 * 4, vl);
-}
-
-static inline void transpose4x8_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, size_t vl)
-{
-    float tmp[4][8];
-    vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 8, _r0, vl);
-    vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 8, _r1, vl);
-    vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 8, _r2, vl);
-    vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 8, _r3, vl);
-    vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 8, _r4, vl);
-    vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 8, _r5, vl);
-    vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 8, _r6, vl);
-    vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 8, _r7, vl);
-    float* ptr = (float*)tmp;
-    _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
-    _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
-    _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
-    _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
-    _r4 = vle32_v_f32m1(ptr + 4 * 4, vl);
-    _r5 = vle32_v_f32m1(ptr + 5 * 4, vl);
-    _r6 = vle32_v_f32m1(ptr + 6 * 4, vl);
-    _r7 = vle32_v_f32m1(ptr + 7 * 4, vl);
-}
-
-static inline void transpose4x12_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, vfloat32m1_t& _r8, vfloat32m1_t& _r9, vfloat32m1_t& _ra, vfloat32m1_t& _rb, size_t vl)
-{
-    float tmp[4][12];
-    vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0, vl);
-    vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1, vl);
-    vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2, vl);
-    vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3, vl);
-    vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4, vl);
-    vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5, vl);
-    vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6, vl);
-    vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7, vl);
-    vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8, vl);
-    vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9, vl);
-    vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ra, vl);
-    vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rb, vl);
-    float* ptr = (float*)tmp;
-    _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
-    _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
-    _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
-    _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
-    _r4 = vle32_v_f32m1(ptr + 4 * 4, vl);
-    _r5 = vle32_v_f32m1(ptr + 5 * 4, vl);
-    _r6 = vle32_v_f32m1(ptr + 6 * 4, vl);
-    _r7 = vle32_v_f32m1(ptr + 7 * 4, vl);
-    _r8 = vle32_v_f32m1(ptr + 8 * 4, vl);
-    _r9 = vle32_v_f32m1(ptr + 9 * 4, vl);
-    _ra = vle32_v_f32m1(ptr + 10 * 4, vl);
-    _rb = vle32_v_f32m1(ptr + 11 * 4, vl);
-}
-
-static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
-                                   vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
-                                   vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
-                                   vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, size_t vl)
-{
-    float tmp[8][4];
-    vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 4, _r0l, vl);
-    vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 4, _r0h, vl);
-    vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 4, _r1l, vl);
-    vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 4, _r1h, vl);
-    vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 4, _r2l, vl);
-    vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 4, _r2h, vl);
-    vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 4, _r3l, vl);
-    vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 4, _r3h, vl);
-    float* ptr = (float*)tmp;
-    _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
-    _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
-    _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
-    _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
-    _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
-    _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
-    _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
-    _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
-}
-
-static inline void store_float_v2(vfloat32m1_t& vector1, vfloat32m1_t& vector2, float* buf, size_t vl)
-{
-    vsse32_v_f32m1(buf + 0, sizeof(float) * 2, vector1, vl);
-    vsse32_v_f32m1(buf + 1, sizeof(float) * 2, vector2, vl);
-}
-
-static inline void store_float_v4(vfloat32m1_t& vector1, vfloat32m1_t& vector2, vfloat32m1_t& vector3, vfloat32m1_t& vector4, float* buf, size_t vl)
-{
-    vsse32_v_f32m1(buf + 0, sizeof(float) * 4, vector1, vl);
-    vsse32_v_f32m1(buf + 1, sizeof(float) * 4, vector2, vl);
-    vsse32_v_f32m1(buf + 2, sizeof(float) * 4, vector3, vl);
-    vsse32_v_f32m1(buf + 3, sizeof(float) * 4, vector4, vl);
-}
-
 #if __riscv_zfh
 static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr)
 {
@@ -675,4 +399,221 @@ static inline void vlseg2e16_v_f16m4(vfloat16m4_t* v0, vfloat16m4_t* v1, const f
 #endif // __riscv_zfh
 #endif // __riscv_vector
 
+#ifdef __riscv_vector
+
+static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
+                                   vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
+                                   vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
+                                   vfloat32m1_t& _r3l, vfloat32m1_t& _r3h,
+                                   vfloat32m1_t& _r4l, vfloat32m1_t& _r4h,
+                                   vfloat32m1_t& _r5l, vfloat32m1_t& _r5h,
+                                   vfloat32m1_t& _r6l, vfloat32m1_t& _r6h,
+                                   vfloat32m1_t& _r7l, vfloat32m1_t& _r7h, size_t vl)
+{
+    float tmp[64];
+    vsseg8e32_v_f32m1(&tmp[0], _r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l, vl);
+    vsseg8e32_v_f32m1(&tmp[32], _r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h, vl);
+    float* ptr = (float*)tmp;
+    _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
+    _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
+    _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
+    _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
+    _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
+    _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
+    _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
+    _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
+    _r4l = vle32_v_f32m1(ptr + 8 * 4, vl);
+    _r4h = vle32_v_f32m1(ptr + 9 * 4, vl);
+    _r5l = vle32_v_f32m1(ptr + 10 * 4, vl);
+    _r5h = vle32_v_f32m1(ptr + 11 * 4, vl);
+    _r6l = vle32_v_f32m1(ptr + 12 * 4, vl);
+    _r6h = vle32_v_f32m1(ptr + 13 * 4, vl);
+    _r7l = vle32_v_f32m1(ptr + 14 * 4, vl);
+    _r7h = vle32_v_f32m1(ptr + 15 * 4, vl);
+}
+
+static inline void transpose4x4_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, size_t vl)
+{
+    float tmp[16];
+    vsseg4e32_v_f32m1(&tmp[0], _r0, _r1, _r2, _r3, vl);
+    float* ptr = (float*)tmp;
+    _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
+    _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
+    _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
+    _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
+}
+
+static inline void transpose8x12_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
+                                    vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
+                                    vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
+                                    vfloat32m1_t& _r3l, vfloat32m1_t& _r3h,
+                                    vfloat32m1_t& _r4l, vfloat32m1_t& _r4h,
+                                    vfloat32m1_t& _r5l, vfloat32m1_t& _r5h,
+                                    vfloat32m1_t& _r6l, vfloat32m1_t& _r6h,
+                                    vfloat32m1_t& _r7l, vfloat32m1_t& _r7h,
+                                    vfloat32m1_t& _r8l, vfloat32m1_t& _r8h,
+                                    vfloat32m1_t& _r9l, vfloat32m1_t& _r9h,
+                                    vfloat32m1_t& _ral, vfloat32m1_t& _rah,
+                                    vfloat32m1_t& _rbl, vfloat32m1_t& _rbh, size_t vl)
+{
+    float tmp[8][12];
+
+    vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0l, vl);
+    vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 12, _r0h, vl);
+    vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1l, vl);
+    vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 12, _r1h, vl);
+    vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2l, vl);
+    vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 12, _r2h, vl);
+    vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3l, vl);
+    vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 12, _r3h, vl);
+    vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4l, vl);
+    vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 12, _r4h, vl);
+    vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5l, vl);
+    vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 12, _r5h, vl);
+    vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6l, vl);
+    vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 12, _r6h, vl);
+    vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7l, vl);
+    vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 12, _r7h, vl);
+    vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8l, vl);
+    vsse32_v_f32m1(&tmp[4][8], sizeof(float) * 12, _r8h, vl);
+    vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9l, vl);
+    vsse32_v_f32m1(&tmp[4][9], sizeof(float) * 12, _r9h, vl);
+    vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ral, vl);
+    vsse32_v_f32m1(&tmp[4][10], sizeof(float) * 12, _rah, vl);
+    vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rbl, vl);
+    vsse32_v_f32m1(&tmp[4][11], sizeof(float) * 12, _rbh, vl);
+    float* ptr = (float*)tmp;
+    _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
+    _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
+    _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
+    _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
+    _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
+    _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
+    _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
+    _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
+    _r4l = vle32_v_f32m1(ptr + 8 * 4, vl);
+    _r4h = vle32_v_f32m1(ptr + 9 * 4, vl);
+    _r5l = vle32_v_f32m1(ptr + 10 * 4, vl);
+    _r5h = vle32_v_f32m1(ptr + 11 * 4, vl);
+    _r6l = vle32_v_f32m1(ptr + 12 * 4, vl);
+    _r6h = vle32_v_f32m1(ptr + 13 * 4, vl);
+    _r7l = vle32_v_f32m1(ptr + 14 * 4, vl);
+    _r7h = vle32_v_f32m1(ptr + 15 * 4, vl);
+    _r8l = vle32_v_f32m1(ptr + 16 * 4, vl);
+    _r8h = vle32_v_f32m1(ptr + 17 * 4, vl);
+    _r9l = vle32_v_f32m1(ptr + 18 * 4, vl);
+    _r9h = vle32_v_f32m1(ptr + 19 * 4, vl);
+    _ral = vle32_v_f32m1(ptr + 20 * 4, vl);
+    _rah = vle32_v_f32m1(ptr + 21 * 4, vl);
+    _rbl = vle32_v_f32m1(ptr + 22 * 4, vl);
+    _rbh = vle32_v_f32m1(ptr + 23 * 4, vl);
+}
+
+static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vfloat32m1_t& _r0h,
+                                    vfloat32m1_t& _r1l, vfloat32m1_t& _r1m, vfloat32m1_t& _r1h,
+                                    vfloat32m1_t& _r2l, vfloat32m1_t& _r2m, vfloat32m1_t& _r2h,
+                                    vfloat32m1_t& _r3l, vfloat32m1_t& _r3m, vfloat32m1_t& _r3h,
+                                    vfloat32m1_t& _r4l, vfloat32m1_t& _r4m, vfloat32m1_t& _r4h,
+                                    vfloat32m1_t& _r5l, vfloat32m1_t& _r5m, vfloat32m1_t& _r5h,
+                                    vfloat32m1_t& _r6l, vfloat32m1_t& _r6m, vfloat32m1_t& _r6h,
+                                    vfloat32m1_t& _r7l, vfloat32m1_t& _r7m, vfloat32m1_t& _r7h, size_t vl)
+{
+    float tmp[96];
+    vsseg8e32_v_f32m1(&tmp[0], _r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l, vl);
+    vsseg8e32_v_f32m1(&tmp[32], _r0m, _r1m, _r2m, _r3m, _r4m, _r5m, _r6m, _r7m, vl);
+    vsseg8e32_v_f32m1(&tmp[64], _r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h, vl);
+
+    float* ptr = (float*)tmp;
+    _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
+    _r0m = vle32_v_f32m1(ptr + 1 * 4, vl);
+    _r0h = vle32_v_f32m1(ptr + 2 * 4, vl);
+    _r1l = vle32_v_f32m1(ptr + 3 * 4, vl);
+    _r1m = vle32_v_f32m1(ptr + 4 * 4, vl);
+    _r1h = vle32_v_f32m1(ptr + 5 * 4, vl);
+    _r2l = vle32_v_f32m1(ptr + 6 * 4, vl);
+    _r2m = vle32_v_f32m1(ptr + 7 * 4, vl);
+    _r2h = vle32_v_f32m1(ptr + 8 * 4, vl);
+    _r3l = vle32_v_f32m1(ptr + 9 * 4, vl);
+    _r3m = vle32_v_f32m1(ptr + 10 * 4, vl);
+    _r3h = vle32_v_f32m1(ptr + 11 * 4, vl);
+    _r4l = vle32_v_f32m1(ptr + 12 * 4, vl);
+    _r4m = vle32_v_f32m1(ptr + 13 * 4, vl);
+    _r4h = vle32_v_f32m1(ptr + 14 * 4, vl);
+    _r5l = vle32_v_f32m1(ptr + 15 * 4, vl);
+    _r5m = vle32_v_f32m1(ptr + 16 * 4, vl);
+    _r5h = vle32_v_f32m1(ptr + 17 * 4, vl);
+    _r6l = vle32_v_f32m1(ptr + 18 * 4, vl);
+    _r6m = vle32_v_f32m1(ptr + 19 * 4, vl);
+    _r6h = vle32_v_f32m1(ptr + 20 * 4, vl);
+    _r7l = vle32_v_f32m1(ptr + 21 * 4, vl);
+    _r7m = vle32_v_f32m1(ptr + 22 * 4, vl);
+    _r7h = vle32_v_f32m1(ptr + 23 * 4, vl);
+}
+
+static inline void transpose4x8_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, size_t vl)
+{
+    float tmp[32];
+    vsseg8e32_v_f32m1(&tmp[0], _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, vl);
+
+    float* ptr = (float*)tmp;
+    _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
+    _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
+    _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
+    _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
+    _r4 = vle32_v_f32m1(ptr + 4 * 4, vl);
+    _r5 = vle32_v_f32m1(ptr + 5 * 4, vl);
+    _r6 = vle32_v_f32m1(ptr + 6 * 4, vl);
+    _r7 = vle32_v_f32m1(ptr + 7 * 4, vl);
+}
+
+static inline void transpose4x12_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, vfloat32m1_t& _r8, vfloat32m1_t& _r9, vfloat32m1_t& _ra, vfloat32m1_t& _rb, size_t vl)
+{
+    float tmp[4][12];
+    vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0, vl);
+    vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1, vl);
+    vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2, vl);
+    vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3, vl);
+    vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4, vl);
+    vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5, vl);
+    vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6, vl);
+    vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7, vl);
+    vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8, vl);
+    vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9, vl);
+    vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ra, vl);
+    vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rb, vl);
+    float* ptr = (float*)tmp;
+    _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
+    _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
+    _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
+    _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
+    _r4 = vle32_v_f32m1(ptr + 4 * 4, vl);
+    _r5 = vle32_v_f32m1(ptr + 5 * 4, vl);
+    _r6 = vle32_v_f32m1(ptr + 6 * 4, vl);
+    _r7 = vle32_v_f32m1(ptr + 7 * 4, vl);
+    _r8 = vle32_v_f32m1(ptr + 8 * 4, vl);
+    _r9 = vle32_v_f32m1(ptr + 9 * 4, vl);
+    _ra = vle32_v_f32m1(ptr + 10 * 4, vl);
+    _rb = vle32_v_f32m1(ptr + 11 * 4, vl);
+}
+
+static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
+                                   vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
+                                   vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
+                                   vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, size_t vl)
+{
+    float tmp[32];
+    vsseg4e32_v_f32m1(&tmp[0], _r0l, _r1l, _r2l, _r3l, vl);
+    vsseg4e32_v_f32m1(&tmp[16], _r0h, _r1h, _r2h, _r3h, vl);
+    float* ptr = (float*)tmp;
+    _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
+    _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
+    _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
+    _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
+    _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
+    _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
+    _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
+    _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
+}
+#endif
+
 #endif // RISCV_USABILITY_H