From 091a19671506ae173a7c014616bdb440994cf3a3 Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Wed, 7 Feb 2024 18:29:53 +0800 Subject: [PATCH 1/2] refine code --- src/layer/riscv/gemm_riscv.cpp | 372 ++++++++++++++++----------------- 1 file changed, 186 insertions(+), 186 deletions(-) diff --git a/src/layer/riscv/gemm_riscv.cpp b/src/layer/riscv/gemm_riscv.cpp index 9b4b58ac651..10c8a49a04a 100644 --- a/src/layer/riscv/gemm_riscv.cpp +++ b/src/layer/riscv/gemm_riscv.cpp @@ -1271,30 +1271,30 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); - _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); - _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); - _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); - _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); - _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); - _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); - _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); - _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); - _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); - _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); - _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); - _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); - _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); - _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); - _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); - _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); +_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); +_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); +_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); +_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); +_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); +_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); +_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); +_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); +_sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); +_sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); +_sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); +_sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); +_sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); +_sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); +_sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); +_sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); +_sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); +_sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); +_sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); +_sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); +_suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); +_suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); +_sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); +_sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); pA += 8; pB += 12; @@ -1302,30 +1302,30 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons _pA0 = vle32_v_f32m1(pA, vl); _pA1 = vle32_v_f32m1(pA + 4, vl); - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); - _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); - _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); - _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); - _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); - _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); - _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); - _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); - _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); - _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); - _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); - _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); - _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); - _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); - _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); - _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); - _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); +_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); +_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); +_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); +_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); +_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); +_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); +_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); +_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); +_sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); +_sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); +_sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); +_sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); +_sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); +_sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); +_sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); +_sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); +_sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); +_sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); +_sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); +_sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); +_suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); +_suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); +_sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); +_sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); pA += 8; pB += 12; @@ -1333,30 +1333,30 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons _pA0 = vle32_v_f32m1(pA, vl); _pA1 = vle32_v_f32m1(pA + 4, vl); - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); - _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); - _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); - _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); - _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); - _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); - _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); - _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); - _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); - _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); - _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); - _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); - _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); - _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); - _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); - _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); - _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); +_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); +_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); +_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); +_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); +_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); +_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); +_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); +_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); +_sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); +_sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); +_sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); +_sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); +_sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); +_sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); +_sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); +_sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); +_sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); +_sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); +_sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); +_sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); +_suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); +_suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); +_sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); +_sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); pA += 8; pB += 12; @@ -1364,30 +1364,30 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons _pA0 = vle32_v_f32m1(pA, vl); _pA1 = vle32_v_f32m1(pA + 4, vl); - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); - _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); - _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); - _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); - _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); - _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); - _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); - _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); - _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); - _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); - _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); - _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); - _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); - _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); - _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); - _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); - _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); +_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); +_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); +_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); +_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); +_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); +_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); +_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); +_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); +_sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); +_sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); +_sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); +_sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); +_sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); +_sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); +_sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); +_sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); +_sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); +_sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); +_sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); +_sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); +_suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); +_suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); +_sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); +_sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); pA += 8; pB += 12; @@ -1397,30 +1397,30 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); - _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); - _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); - _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); - _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); - _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); - _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); - _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); - _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); - _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); - _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); - _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); - _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); - _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); - _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); - _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); - _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); +_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); +_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); +_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); +_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); +_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); +_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); +_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); +_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); +_sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); +_sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); +_sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); +_sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); +_sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); +_sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); +_sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); +_sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); +_sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); +_sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); +_sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); +_sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); +_suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); +_suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); +_sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); +_sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); pA += 8; pB += 12; @@ -1667,22 +1667,22 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); - _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); - _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); - _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); - _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); - _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); - _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); - _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); - _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); +_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); +_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); +_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); +_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); +_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); +_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); +_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); +_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); +_sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); +_sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); +_sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); +_sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); +_sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); +_sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); +_sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); +_sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); pA += 8; pB += 8; @@ -1849,14 +1849,14 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); +_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); +_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); +_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); +_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); +_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); +_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); +_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); +_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); pA += 8; pB += 4; @@ -1971,10 +1971,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); +_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); +_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); +_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); +_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); pA += 8; pB += 2; @@ -2261,18 +2261,18 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons { vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); - _sum0 = vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); - _sum1 = vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); - _sum2 = vfmadd_vf_f32m1(_pA, pB[2], _sum2, vl); - _sum3 = vfmadd_vf_f32m1(_pA, pB[3], _sum3, vl); - _sum4 = vfmadd_vf_f32m1(_pA, pB[4], _sum4, vl); - _sum5 = vfmadd_vf_f32m1(_pA, pB[5], _sum5, vl); - _sum6 = vfmadd_vf_f32m1(_pA, pB[6], _sum6, vl); - _sum7 = vfmadd_vf_f32m1(_pA, pB[7], _sum7, vl); - _sum8 = vfmadd_vf_f32m1(_pA, pB[8], _sum8, vl); - _sum9 = vfmadd_vf_f32m1(_pA, pB[9], _sum9, vl); - _suma = vfmadd_vf_f32m1(_pA, pB[10], _suma, vl); - _sumb = vfmadd_vf_f32m1(_pA, pB[11], _sumb, vl); +_sum0 = vfmadd_vf_f32m1(_sum0, pB[0], _pA, vl); +_sum1 = vfmadd_vf_f32m1(_sum1, pB[1], _pA, vl); +_sum2 = vfmadd_vf_f32m1(_sum2, pB[2], _pA, vl); +_sum3 = vfmadd_vf_f32m1(_sum3, pB[3], _pA, vl); +_sum4 = vfmadd_vf_f32m1(_sum4, pB[4], _pA, vl); +_sum5 = vfmadd_vf_f32m1(_sum5, pB[5], _pA, vl); +_sum6 = vfmadd_vf_f32m1(_sum6, pB[6], _pA, vl); +_sum7 = vfmadd_vf_f32m1(_sum7, pB[7], _pA, vl); +_sum8 = vfmadd_vf_f32m1(_sum8, pB[8], _pA, vl); +_sum9 = vfmadd_vf_f32m1(_sum9, pB[9], _pA, vl); +_suma = vfmadd_vf_f32m1(_suma, pB[10], _pA, vl); +_sumb = vfmadd_vf_f32m1(_sumb, pB[11], _pA, vl); pA += 4; pB += 12; @@ -2423,14 +2423,14 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons { vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); - _sum0 = vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); - _sum1 = vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); - _sum2 = vfmadd_vf_f32m1(_pA, pB[2], _sum2, vl); - _sum3 = vfmadd_vf_f32m1(_pA, pB[3], _sum3, vl); - _sum4 = vfmadd_vf_f32m1(_pA, pB[4], _sum4, vl); - _sum5 = vfmadd_vf_f32m1(_pA, pB[5], _sum5, vl); - _sum6 = vfmadd_vf_f32m1(_pA, pB[6], _sum6, vl); - _sum7 = vfmadd_vf_f32m1(_pA, pB[7], _sum7, vl); +_sum0 = vfmadd_vf_f32m1(_sum0, pB[0], _pA, vl); +_sum1 = vfmadd_vf_f32m1(_sum1, pB[1], _pA, vl); +_sum2 = vfmadd_vf_f32m1(_sum2, pB[2], _pA, vl); +_sum3 = vfmadd_vf_f32m1(_sum3, pB[3], _pA, vl); +_sum4 = vfmadd_vf_f32m1(_sum4, pB[4], _pA, vl); +_sum5 = vfmadd_vf_f32m1(_sum5, pB[5], _pA, vl); +_sum6 = vfmadd_vf_f32m1(_sum6, pB[6], _pA, vl); +_sum7 = vfmadd_vf_f32m1(_sum7, pB[7], _pA, vl); pA += 4; pB += 8; @@ -2541,10 +2541,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons { vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); - _sum0 = vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); - _sum1 = vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); - _sum2 = vfmadd_vf_f32m1(_pA, pB[2], _sum2, vl); - _sum3 = vfmadd_vf_f32m1(_pA, pB[3], _sum3, vl); +_sum0 = vfmadd_vf_f32m1(_sum0, pB[0], _pA, vl); +_sum1 = vfmadd_vf_f32m1(_sum1, pB[1], _pA, vl); +_sum2 = vfmadd_vf_f32m1(_sum2, pB[2], _pA, vl); +_sum3 = vfmadd_vf_f32m1(_sum3, pB[3], _pA, vl); pA += 4; pB += 4; } @@ -2628,8 +2628,8 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons { vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); - _sum0 = vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); - _sum1 = vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); +_sum0 = vfmadd_vf_f32m1(_sum0, pB[0], _pA, vl); +_sum1 = vfmadd_vf_f32m1(_sum1, pB[1], _pA, vl); pA += 4; pB += 2; @@ -2839,12 +2839,12 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons vfloat32m1_t _pB1 = vle32_v_f32m1(pB + 4, vl); vfloat32m1_t _pB2 = vle32_v_f32m1(pB + 8, vl); - _sum00 = vfmadd_vf_f32m1(_pB0, pA[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pB1, pA[0], _sum01, vl); - _sum02 = vfmadd_vf_f32m1(_pB2, pA[0], _sum02, vl); - _sum10 = vfmadd_vf_f32m1(_pB0, pA[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pB1, pA[1], _sum11, vl); - _sum12 = vfmadd_vf_f32m1(_pB2, pA[1], _sum12, vl); +_sum00 = vfmadd_vf_f32m1(_sum00, pA[0], _pB0, vl); +_sum01 = vfmadd_vf_f32m1(_sum01, pA[0], _pB1, vl); +_sum02 = vfmadd_vf_f32m1(_sum02, pA[0], _pB2, vl); +_sum10 = vfmadd_vf_f32m1(_sum10, pA[1], _pB0, vl); +_sum11 = vfmadd_vf_f32m1(_sum11, pA[1], _pB1, vl); +_sum12 = vfmadd_vf_f32m1(_sum12, pA[1], _pB2, vl); pA += 2; pB += 12; @@ -2931,10 +2931,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons vfloat32m1_t _pB0 = vle32_v_f32m1(pB, vl); vfloat32m1_t _pB1 = vle32_v_f32m1(pB + 4, vl); - _sum00 = vfmadd_vf_f32m1(_pB0, pA[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pB1, pA[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pB0, pA[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pB1, pA[1], _sum11, vl); +_sum00 = vfmadd_vf_f32m1(_sum00, pA[0], _pB0, vl); +_sum01 = vfmadd_vf_f32m1(_sum01, pA[0], _pB1, vl); +_sum10 = vfmadd_vf_f32m1(_sum10, pA[1], _pB0, vl); +_sum11 = vfmadd_vf_f32m1(_sum11, pA[1], _pB1, vl); pA += 2; pB += 8; } @@ -3008,8 +3008,8 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons { vfloat32m1_t _pB = vle32_v_f32m1(pB, vl); - _sum0 = vfmadd_vf_f32m1(_pB, pA[0], _sum0, vl); - _sum1 = vfmadd_vf_f32m1(_pB, pA[1], _sum1, vl); +_sum0 = vfmadd_vf_f32m1(_sum0, pA[0], _pB, vl); +_sum1 = vfmadd_vf_f32m1(_sum1, pA[1], _pB, vl); pA += 2; pB += 4; From e30639480a300c4e43cf12f4a509a9eb3df11d3f Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Wed, 7 Feb 2024 10:31:30 +0000 Subject: [PATCH 2/2] apply code-format changes --- src/layer/riscv/gemm_riscv.cpp | 372 ++++++++++++++++----------------- 1 file changed, 186 insertions(+), 186 deletions(-) diff --git a/src/layer/riscv/gemm_riscv.cpp b/src/layer/riscv/gemm_riscv.cpp index 10c8a49a04a..2df7ddbe3e3 100644 --- a/src/layer/riscv/gemm_riscv.cpp +++ b/src/layer/riscv/gemm_riscv.cpp @@ -1271,30 +1271,30 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); -_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); -_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); -_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); -_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); -_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); -_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); -_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); -_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); -_sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); -_sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); -_sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); -_sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); -_sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); -_sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); -_sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); -_sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); -_sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); -_sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); -_sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); -_sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); -_suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); -_suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); -_sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); -_sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); + _sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); + _sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); + _sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); + _sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); + _sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); + _sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); + _sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); + _sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); + _sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); + _sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); + _sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); + _sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); + _sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); + _sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); + _sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); + _sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); + _sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); + _sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); + _sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); + _sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); + _suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); + _suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); + _sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); + _sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); pA += 8; pB += 12; @@ -1302,30 +1302,30 @@ _sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); _pA0 = vle32_v_f32m1(pA, vl); _pA1 = vle32_v_f32m1(pA + 4, vl); -_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); -_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); -_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); -_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); -_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); -_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); -_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); -_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); -_sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); -_sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); -_sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); -_sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); -_sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); -_sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); -_sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); -_sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); -_sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); -_sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); -_sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); -_sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); -_suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); -_suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); -_sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); -_sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); + _sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); + _sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); + _sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); + _sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); + _sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); + _sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); + _sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); + _sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); + _sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); + _sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); + _sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); + _sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); + _sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); + _sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); + _sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); + _sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); + _sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); + _sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); + _sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); + _sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); + _suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); + _suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); + _sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); + _sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); pA += 8; pB += 12; @@ -1333,30 +1333,30 @@ _sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); _pA0 = vle32_v_f32m1(pA, vl); _pA1 = vle32_v_f32m1(pA + 4, vl); -_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); -_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); -_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); -_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); -_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); -_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); -_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); -_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); -_sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); -_sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); -_sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); -_sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); -_sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); -_sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); -_sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); -_sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); -_sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); -_sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); -_sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); -_sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); -_suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); -_suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); -_sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); -_sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); + _sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); + _sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); + _sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); + _sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); + _sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); + _sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); + _sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); + _sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); + _sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); + _sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); + _sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); + _sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); + _sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); + _sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); + _sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); + _sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); + _sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); + _sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); + _sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); + _sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); + _suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); + _suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); + _sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); + _sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); pA += 8; pB += 12; @@ -1364,30 +1364,30 @@ _sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); _pA0 = vle32_v_f32m1(pA, vl); _pA1 = vle32_v_f32m1(pA + 4, vl); -_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); -_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); -_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); -_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); -_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); -_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); -_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); -_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); -_sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); -_sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); -_sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); -_sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); -_sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); -_sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); -_sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); -_sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); -_sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); -_sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); -_sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); -_sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); -_suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); -_suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); -_sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); -_sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); + _sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); + _sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); + _sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); + _sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); + _sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); + _sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); + _sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); + _sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); + _sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); + _sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); + _sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); + _sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); + _sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); + _sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); + _sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); + _sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); + _sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); + _sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); + _sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); + _sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); + _suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); + _suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); + _sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); + _sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); pA += 8; pB += 12; @@ -1397,30 +1397,30 @@ _sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); -_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); -_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); -_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); -_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); -_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); -_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); -_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); -_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); -_sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); -_sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); -_sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); -_sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); -_sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); -_sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); -_sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); -_sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); -_sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); -_sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); -_sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); -_sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); -_suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); -_suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); -_sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); -_sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); + _sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); + _sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); + _sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); + _sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); + _sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); + _sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); + _sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); + _sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); + _sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); + _sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); + _sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); + _sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); + _sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); + _sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); + _sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); + _sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); + _sum80 = vfmadd_vf_f32m1(_sum80, pB[8], _pA0, vl); + _sum81 = vfmadd_vf_f32m1(_sum81, pB[8], _pA1, vl); + _sum90 = vfmadd_vf_f32m1(_sum90, pB[9], _pA0, vl); + _sum91 = vfmadd_vf_f32m1(_sum91, pB[9], _pA1, vl); + _suma0 = vfmadd_vf_f32m1(_suma0, pB[10], _pA0, vl); + _suma1 = vfmadd_vf_f32m1(_suma1, pB[10], _pA1, vl); + _sumb0 = vfmadd_vf_f32m1(_sumb0, pB[11], _pA0, vl); + _sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); pA += 8; pB += 12; @@ -1667,22 +1667,22 @@ _sumb1 = vfmadd_vf_f32m1(_sumb1, pB[11], _pA1, vl); vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); -_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); -_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); -_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); -_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); -_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); -_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); -_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); -_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); -_sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); -_sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); -_sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); -_sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); -_sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); -_sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); -_sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); -_sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); + _sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); + _sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); + _sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); + _sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); + _sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); + _sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); + _sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); + _sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); + _sum40 = vfmadd_vf_f32m1(_sum40, pB[4], _pA0, vl); + _sum41 = vfmadd_vf_f32m1(_sum41, pB[4], _pA1, vl); + _sum50 = vfmadd_vf_f32m1(_sum50, pB[5], _pA0, vl); + _sum51 = vfmadd_vf_f32m1(_sum51, pB[5], _pA1, vl); + _sum60 = vfmadd_vf_f32m1(_sum60, pB[6], _pA0, vl); + _sum61 = vfmadd_vf_f32m1(_sum61, pB[6], _pA1, vl); + _sum70 = vfmadd_vf_f32m1(_sum70, pB[7], _pA0, vl); + _sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); pA += 8; pB += 8; @@ -1849,14 +1849,14 @@ _sum71 = vfmadd_vf_f32m1(_sum71, pB[7], _pA1, vl); vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); -_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); -_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); -_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); -_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); -_sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); -_sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); -_sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); -_sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); + _sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); + _sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); + _sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); + _sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); + _sum20 = vfmadd_vf_f32m1(_sum20, pB[2], _pA0, vl); + _sum21 = vfmadd_vf_f32m1(_sum21, pB[2], _pA1, vl); + _sum30 = vfmadd_vf_f32m1(_sum30, pB[3], _pA0, vl); + _sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); pA += 8; pB += 4; @@ -1971,10 +1971,10 @@ _sum31 = vfmadd_vf_f32m1(_sum31, pB[3], _pA1, vl); vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); -_sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); -_sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); -_sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); -_sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); + _sum00 = vfmadd_vf_f32m1(_sum00, pB[0], _pA0, vl); + _sum01 = vfmadd_vf_f32m1(_sum01, pB[0], _pA1, vl); + _sum10 = vfmadd_vf_f32m1(_sum10, pB[1], _pA0, vl); + _sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); pA += 8; pB += 2; @@ -2261,18 +2261,18 @@ _sum11 = vfmadd_vf_f32m1(_sum11, pB[1], _pA1, vl); { vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); -_sum0 = vfmadd_vf_f32m1(_sum0, pB[0], _pA, vl); -_sum1 = vfmadd_vf_f32m1(_sum1, pB[1], _pA, vl); -_sum2 = vfmadd_vf_f32m1(_sum2, pB[2], _pA, vl); -_sum3 = vfmadd_vf_f32m1(_sum3, pB[3], _pA, vl); -_sum4 = vfmadd_vf_f32m1(_sum4, pB[4], _pA, vl); -_sum5 = vfmadd_vf_f32m1(_sum5, pB[5], _pA, vl); -_sum6 = vfmadd_vf_f32m1(_sum6, pB[6], _pA, vl); -_sum7 = vfmadd_vf_f32m1(_sum7, pB[7], _pA, vl); -_sum8 = vfmadd_vf_f32m1(_sum8, pB[8], _pA, vl); -_sum9 = vfmadd_vf_f32m1(_sum9, pB[9], _pA, vl); -_suma = vfmadd_vf_f32m1(_suma, pB[10], _pA, vl); -_sumb = vfmadd_vf_f32m1(_sumb, pB[11], _pA, vl); + _sum0 = vfmadd_vf_f32m1(_sum0, pB[0], _pA, vl); + _sum1 = vfmadd_vf_f32m1(_sum1, pB[1], _pA, vl); + _sum2 = vfmadd_vf_f32m1(_sum2, pB[2], _pA, vl); + _sum3 = vfmadd_vf_f32m1(_sum3, pB[3], _pA, vl); + _sum4 = vfmadd_vf_f32m1(_sum4, pB[4], _pA, vl); + _sum5 = vfmadd_vf_f32m1(_sum5, pB[5], _pA, vl); + _sum6 = vfmadd_vf_f32m1(_sum6, pB[6], _pA, vl); + _sum7 = vfmadd_vf_f32m1(_sum7, pB[7], _pA, vl); + _sum8 = vfmadd_vf_f32m1(_sum8, pB[8], _pA, vl); + _sum9 = vfmadd_vf_f32m1(_sum9, pB[9], _pA, vl); + _suma = vfmadd_vf_f32m1(_suma, pB[10], _pA, vl); + _sumb = vfmadd_vf_f32m1(_sumb, pB[11], _pA, vl); pA += 4; pB += 12; @@ -2423,14 +2423,14 @@ _sumb = vfmadd_vf_f32m1(_sumb, pB[11], _pA, vl); { vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); -_sum0 = vfmadd_vf_f32m1(_sum0, pB[0], _pA, vl); -_sum1 = vfmadd_vf_f32m1(_sum1, pB[1], _pA, vl); -_sum2 = vfmadd_vf_f32m1(_sum2, pB[2], _pA, vl); -_sum3 = vfmadd_vf_f32m1(_sum3, pB[3], _pA, vl); -_sum4 = vfmadd_vf_f32m1(_sum4, pB[4], _pA, vl); -_sum5 = vfmadd_vf_f32m1(_sum5, pB[5], _pA, vl); -_sum6 = vfmadd_vf_f32m1(_sum6, pB[6], _pA, vl); -_sum7 = vfmadd_vf_f32m1(_sum7, pB[7], _pA, vl); + _sum0 = vfmadd_vf_f32m1(_sum0, pB[0], _pA, vl); + _sum1 = vfmadd_vf_f32m1(_sum1, pB[1], _pA, vl); + _sum2 = vfmadd_vf_f32m1(_sum2, pB[2], _pA, vl); + _sum3 = vfmadd_vf_f32m1(_sum3, pB[3], _pA, vl); + _sum4 = vfmadd_vf_f32m1(_sum4, pB[4], _pA, vl); + _sum5 = vfmadd_vf_f32m1(_sum5, pB[5], _pA, vl); + _sum6 = vfmadd_vf_f32m1(_sum6, pB[6], _pA, vl); + _sum7 = vfmadd_vf_f32m1(_sum7, pB[7], _pA, vl); pA += 4; pB += 8; @@ -2541,10 +2541,10 @@ _sum7 = vfmadd_vf_f32m1(_sum7, pB[7], _pA, vl); { vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); -_sum0 = vfmadd_vf_f32m1(_sum0, pB[0], _pA, vl); -_sum1 = vfmadd_vf_f32m1(_sum1, pB[1], _pA, vl); -_sum2 = vfmadd_vf_f32m1(_sum2, pB[2], _pA, vl); -_sum3 = vfmadd_vf_f32m1(_sum3, pB[3], _pA, vl); + _sum0 = vfmadd_vf_f32m1(_sum0, pB[0], _pA, vl); + _sum1 = vfmadd_vf_f32m1(_sum1, pB[1], _pA, vl); + _sum2 = vfmadd_vf_f32m1(_sum2, pB[2], _pA, vl); + _sum3 = vfmadd_vf_f32m1(_sum3, pB[3], _pA, vl); pA += 4; pB += 4; } @@ -2628,8 +2628,8 @@ _sum3 = vfmadd_vf_f32m1(_sum3, pB[3], _pA, vl); { vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); -_sum0 = vfmadd_vf_f32m1(_sum0, pB[0], _pA, vl); -_sum1 = vfmadd_vf_f32m1(_sum1, pB[1], _pA, vl); + _sum0 = vfmadd_vf_f32m1(_sum0, pB[0], _pA, vl); + _sum1 = vfmadd_vf_f32m1(_sum1, pB[1], _pA, vl); pA += 4; pB += 2; @@ -2839,12 +2839,12 @@ _sum1 = vfmadd_vf_f32m1(_sum1, pB[1], _pA, vl); vfloat32m1_t _pB1 = vle32_v_f32m1(pB + 4, vl); vfloat32m1_t _pB2 = vle32_v_f32m1(pB + 8, vl); -_sum00 = vfmadd_vf_f32m1(_sum00, pA[0], _pB0, vl); -_sum01 = vfmadd_vf_f32m1(_sum01, pA[0], _pB1, vl); -_sum02 = vfmadd_vf_f32m1(_sum02, pA[0], _pB2, vl); -_sum10 = vfmadd_vf_f32m1(_sum10, pA[1], _pB0, vl); -_sum11 = vfmadd_vf_f32m1(_sum11, pA[1], _pB1, vl); -_sum12 = vfmadd_vf_f32m1(_sum12, pA[1], _pB2, vl); + _sum00 = vfmadd_vf_f32m1(_sum00, pA[0], _pB0, vl); + _sum01 = vfmadd_vf_f32m1(_sum01, pA[0], _pB1, vl); + _sum02 = vfmadd_vf_f32m1(_sum02, pA[0], _pB2, vl); + _sum10 = vfmadd_vf_f32m1(_sum10, pA[1], _pB0, vl); + _sum11 = vfmadd_vf_f32m1(_sum11, pA[1], _pB1, vl); + _sum12 = vfmadd_vf_f32m1(_sum12, pA[1], _pB2, vl); pA += 2; pB += 12; @@ -2931,10 +2931,10 @@ _sum12 = vfmadd_vf_f32m1(_sum12, pA[1], _pB2, vl); vfloat32m1_t _pB0 = vle32_v_f32m1(pB, vl); vfloat32m1_t _pB1 = vle32_v_f32m1(pB + 4, vl); -_sum00 = vfmadd_vf_f32m1(_sum00, pA[0], _pB0, vl); -_sum01 = vfmadd_vf_f32m1(_sum01, pA[0], _pB1, vl); -_sum10 = vfmadd_vf_f32m1(_sum10, pA[1], _pB0, vl); -_sum11 = vfmadd_vf_f32m1(_sum11, pA[1], _pB1, vl); + _sum00 = vfmadd_vf_f32m1(_sum00, pA[0], _pB0, vl); + _sum01 = vfmadd_vf_f32m1(_sum01, pA[0], _pB1, vl); + _sum10 = vfmadd_vf_f32m1(_sum10, pA[1], _pB0, vl); + _sum11 = vfmadd_vf_f32m1(_sum11, pA[1], _pB1, vl); pA += 2; pB += 8; } @@ -3008,8 +3008,8 @@ _sum11 = vfmadd_vf_f32m1(_sum11, pA[1], _pB1, vl); { vfloat32m1_t _pB = vle32_v_f32m1(pB, vl); -_sum0 = vfmadd_vf_f32m1(_sum0, pA[0], _pB, vl); -_sum1 = vfmadd_vf_f32m1(_sum1, pA[1], _pB, vl); + _sum0 = vfmadd_vf_f32m1(_sum0, pA[0], _pB, vl); + _sum1 = vfmadd_vf_f32m1(_sum1, pA[1], _pB, vl); pA += 2; pB += 4;