diff --git a/cmake/nncaseruntimeConfig.cmake.in b/cmake/nncaseruntimeConfig.cmake.in
index cce5810298..664c43d7dd 100644
--- a/cmake/nncaseruntimeConfig.cmake.in
+++ b/cmake/nncaseruntimeConfig.cmake.in
@@ -1,5 +1,4 @@
 include(${CMAKE_CURRENT_LIST_DIR}/nncaseruntimeTargets.cmake)
-if(NOT TARGET gsl-lite)
-    find_package(gsl-lite REQUIRED)
-endif()
\ No newline at end of file
+set(nncaseruntime_INCLUDE_DIRS ${CMAKE_CURRENT_LIST_DIR}/../../../include)
+set(nncaseruntime_LIBS ${CMAKE_CURRENT_LIST_DIR}/../../libNncase.Runtime.Native.a)
\ No newline at end of file
diff --git a/src/Native/src/kernels/stackvm/optimized/reduce.cpp b/src/Native/src/kernels/stackvm/optimized/reduce.cpp
index 77c197fb26..4cbe6d6693 100644
--- a/src/Native/src/kernels/stackvm/optimized/reduce.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/reduce.cpp
@@ -32,7 +32,13 @@ result<void> optimized::reduce(
     gsl::span<const size_t> in_shape, gsl::span<const size_t> axis,
     gsl::span<const size_t> in_strides, gsl::span<const size_t> out_strides,
     bool keep_dims, kernel_context &context) noexcept {
+#if __riscv_vector
+    return stackvm::optimized::reduce(typecode, op, init_value, input, output,
+                                      in_shape, axis, in_strides, out_strides,
+                                      keep_dims, context);
+#else
     return stackvm::reference::reduce(typecode, op, init_value, input, output,
                                       in_shape, axis, in_strides, out_strides,
                                       keep_dims, context);
+#endif
 }
diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/CMakeLists.txt b/src/Native/src/kernels/stackvm/optimized/riscv64/CMakeLists.txt
index e69de29bb2..0759b9e504 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/CMakeLists.txt
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/CMakeLists.txt
@@ -0,0 +1 @@
+cmake_minimum_required (VERSION 3.13)
diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
index 86908a2391..62326326d0 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <riscv_vector.h>
 using namespace nncase;
 using namespace nncase::runtime;
@@ -27,139 +28,264 @@ using namespace nncase::kernels::stackvm;
 using namespace nncase::kernels::stackvm::optimized;
 #if __riscv_vector
-static void reduce_block(int dim, int block, const float *input, float *out,
-                         int gap) {
-    __asm volatile(
-        "vsetvli t0, %[block], e32, m8;"
-        "mv a1, %[input];"
-        "mv a3, %[gap];"
-        "mv a0, %[dim];"
-        "fcvt.s.w ft0, a0;"
-        "slli a3, a3, 2;"
-        "vmv.v.x v8, x0;"
-        "reduce_block%=:;"
-        "vle32.v v16, (a1);"
-        "vfadd.vv v8, v16, v8;"
-        "add a1,a1,a3;"
-        "addi a0, a0, -1;"
-        "bnez a0, reduce_block%=;"
-        "vfdiv.vf v8, v8, ft0;"
-        "vse32.v v8, ( %[out]);" ::[dim] "r"(dim),
-        [block] "r"(block), [input] "r"(input), [out] "r"(out), [gap] "r"(gap)
-        : "t0", "a0", "a1", "a3", "ft0", "v8", "v16");
-}
-static void reduce_mean(const float *input, float *out, int dim, int n) {
-    __asm volatile(
-
-        "mv a1, %[input];"
-        "mv a2, %[out]; "
-        "mv a3, %[n];"
-        "fcvt.s.w ft0, %[dim];"
-        "reduce_mean_n_cycle%=:;"
-        "mv a0, %[dim];"
-        "vsetvli t0, a0, e32, m8;"
-        "vmv.s.x v16, x0;"
-        "reduce_mean2%=:;"
-        "vsetvli t0, a0, e32, m8;"
-        "vle32.v v8, (a1);"
-        "slli t1,t0, 2;"
-        "sub a0,a0,t0;"
-        "add a1, a1, t1;"
-        "vfredusum.vs v16,v8,v16;"
-        "bnez a0, reduce_mean2%=;"
-        "vfmv.f.s ft1, v16;"
-        "fdiv.s ft1,ft1,ft0;"
-        "fsw ft1, (a2);"
-        "addi a2, a2, 4;"
-        "addi a3,a3, -1;"
-        "bnez a3, reduce_mean_n_cycle%=;" ::[dim] "r"(dim),
-        [n] "r"(n), [input] "r"(input), [out] "r"(out)
"a2", "a3", "ft0", "ft1", "v8", "v16"); -} +result reduce_max_impl(const float *in, float *out, size_t outter_size, + size_t inner_size, size_t reduce_size) { + for (size_t i = 0; i < outter_size; ++i) { + size_t outer_offset = i * reduce_size * inner_size; + for (size_t j = 0; j < inner_size; ++j) { + size_t base_index = outer_offset + j; + float max_val = in[base_index]; + size_t remaining = reduce_size; -static void reduce_mean_s(int32_t c, int32_t dim, const float *input, - float *out, int32_t gap) { -#define BLOCK_N 32 - while (c--) { - const float *tmp_input = input; - for (int j = 0; j < gap / BLOCK_N; ++j) { - reduce_block(dim, BLOCK_N, tmp_input, out, gap); - tmp_input += BLOCK_N; - out += BLOCK_N; - } - int left_number = gap & (BLOCK_N - 1); - if (left_number) { - reduce_block(dim, left_number, tmp_input, out, gap); - out += left_number; + // set vlen and convert scaler to vector + if (0) // m1 + { + size_t vl = vsetvl_e32m1(remaining); + vfloat32m1_t v_max = vfmv_v_f_f32m1(max_val, vl); + + // process full registers data. + while (remaining / vl > 0) { + vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl); + v_max = vfmax_vv_f32m1(v_max, v_in, vl); + + remaining -= vl; + base_index += vl; + } + vfloat32m1_t reduced_max_ = + vfredmax_vs_f32m1_f32m1(v_max, v_max, v_max, vl); + max_val = vfmv_f_s_f32m1_f32(reduced_max_); + + // process the remaining elements + if (remaining > 0) { + vl = vsetvl_e32m1(remaining); + v_max = vfmv_v_f_f32m1(max_val, vl); + vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl); + v_max = vfmax_vv_f32m1(v_max, v_in, vl); + reduced_max_ = + vfredmax_vs_f32m1_f32m1(v_max, v_max, v_max, vl); + max_val = vfmv_f_s_f32m1_f32(reduced_max_); + } + } else // m4 + { + // set vlen and convert scaler to vector + size_t vl = vsetvl_e32m4(remaining); + vfloat32m4_t v_max = vfmv_v_f_f32m4(max_val, vl); + + // process full registers data. + while (remaining / vl > 0) { + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_max = vfmax_vv_f32m4(v_max, v_in, vl); + + remaining -= vl; + base_index += vl; + } + vfloat32m1_t reduced_max_ = vfredmax_vs_f32m4_f32m1( + vundefined_f32m1(), v_max, vfmv_v_f_f32m1(max_val, vl), vl); + max_val = vfmv_f_s_f32m1_f32(reduced_max_); + + // process the remaining elements + if (remaining > 0) { + vl = vsetvl_e32m4(remaining); + v_max = vfmv_v_f_f32m4(max_val, vl); + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_max = vfmax_vv_f32m4(v_max, v_in, vl); + reduced_max_ = vfredmax_vs_f32m4_f32m1( + vundefined_f32m1(), v_max, vfmv_v_f_f32m1(max_val, vl), + vl); + max_val = vfmv_f_s_f32m1_f32(reduced_max_); + } + } + out[i * inner_size + j] = max_val; } - input += dim * gap; } + return ok(); } -static int compute_size_by_index(gsl::span input, int start_index, - int end_index) { - int init_value = 1; - for (int i = start_index; i < end_index; ++i) { - init_value *= input[i]; +result reduce_min_impl(const float *in, float *out, size_t outter_size, + size_t inner_size, size_t reduce_size) { + for (size_t i = 0; i < outter_size; ++i) { + size_t outer_offset = i * reduce_size * inner_size; + for (size_t j = 0; j < inner_size; ++j) { + size_t base_index = outer_offset + j; + float min_val = in[base_index]; + size_t remaining = reduce_size; + + // set vlen and convert scaler to vector + size_t vl = vsetvl_e32m4(remaining); + vfloat32m4_t v_min = vfmv_v_f_f32m4(min_val, vl); + + // process full registers data. 
+ while (remaining / vl > 0) { + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_min = vfmin_vv_f32m4(v_min, v_in, vl); + + remaining -= vl; + base_index += vl; + } + vfloat32m1_t reduced_min_ = vfredmin_vs_f32m4_f32m1( + vundefined_f32m1(), v_min, vfmv_v_f_f32m1(min_val, vl), vl); + min_val = vfmv_f_s_f32m1_f32(reduced_min_); + + // process the remaining elements + if (remaining > 0) { + vl = vsetvl_e32m4(remaining); + v_min = vfmv_v_f_f32m4(min_val, vl); + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_min = vfmin_vv_f32m4(v_min, v_in, vl); + reduced_min_ = vfredmin_vs_f32m4_f32m1( + vundefined_f32m1(), v_min, vfmv_v_f_f32m1(min_val, vl), vl); + min_val = vfmv_f_s_f32m1_f32(reduced_min_); + } + + out[i * inner_size + j] = min_val; + } } - return init_value; + return ok(); } -static int get_parameter(gsl::span in_shape, - gsl::span axis, gsl::span out) { - int min_index = axis[0]; - int max_index = axis[0]; - for (int i = 1; i < (int)axis.size(); ++i) { - int value = axis[i]; - if (value < min_index) - min_index = value; - else if (value > max_index) - max_index = value; - } - int _sum1 = (max_index + min_index) * (max_index - min_index + 1) >> 1; - int _sum2 = axis[0]; - for (int i = 1; i < (int)axis.size(); ++i) { - _sum2 += axis[i]; +result reduce_sum_impl(const float *in, float *out, size_t outter_size, + size_t inner_size, size_t reduce_size) { + for (size_t i = 0; i < outter_size; ++i) { + size_t outer_offset = i * reduce_size * inner_size; + for (size_t j = 0; j < inner_size; ++j) { + size_t base_index = outer_offset + j; + float sum = 0.0f; + size_t remaining = reduce_size; + + // set vlen and convert scaler to vector + size_t vl = vsetvl_e32m4(remaining); + vfloat32m4_t v_sum = vfmv_v_f_f32m4(0.0f, vl); + + // process full registers data. + while (remaining / vl > 0) { + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_sum = vfadd_vv_f32m4(v_sum, v_in, vl); + + remaining -= vl; + base_index += vl; + } + vfloat32m1_t reduced_sum_ = vfredosum_vs_f32m4_f32m1( + vundefined_f32m1(), v_sum, vfmv_v_f_f32m1(0.0f, vl), vl); + sum += vfmv_f_s_f32m1_f32(reduced_sum_); + + // process the remaining elements + if (remaining > 0) { + vl = vsetvl_e32m4(remaining); + v_sum = vfmv_v_f_f32m4(0.0f, vl); + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_sum = vfadd_vv_f32m4(v_sum, v_in, vl); + reduced_sum_ = vfredosum_vs_f32m4_f32m1( + vundefined_f32m1(), v_sum, vfmv_v_f_f32m1(0.0f, vl), vl); + sum += vfmv_f_s_f32m1_f32(reduced_sum_); + } + + out[i * inner_size + j] = sum; + } } - if (_sum2 != _sum1) { - return 1; + return ok(); +} + +result reduce_prod_impl(const float *in, float *out, size_t outter_size, + size_t inner_size, size_t reduce_size) { + for (size_t i = 0; i < outter_size; ++i) { + size_t outer_offset = i * reduce_size * inner_size; + for (size_t j = 0; j < inner_size; ++j) { + size_t base_index = outer_offset + j; + float acc = 1.0f; + size_t remaining = reduce_size; + + // set vlen and convert scaler to vector + size_t vl = vsetvl_e32m4(remaining); + vfloat32m4_t v_acc = vfmv_v_f_f32m4(1.0f, vl); + + // process full registers data. 
+ while (remaining / vl > 0) { + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_acc = vfmul_vv_f32m4(v_acc, v_in, vl); + + remaining -= vl; + base_index += vl; + } + for (size_t i = 0; i < vl; i++) { + acc *= vfmv_f_s_f32m4_f32( + vslidedown_vx_f32m4(vundefined_f32m4(), v_acc, i, vl)); + } + + // process the remaining elements + if (remaining > 0) { + vl = vsetvl_e32m4(remaining); + v_acc = vfmv_v_f_f32m4(1.0f, vl); + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_acc = vfmul_vv_f32m4(v_acc, v_in, vl); + for (size_t i = 0; i < vl; i++) { + acc *= vfmv_f_s_f32m4_f32( + vslidedown_vx_f32m4(vundefined_f32m4(), v_acc, i, vl)); + } + } + + out[i * inner_size + j] = acc; + } } - out[0] = compute_size_by_index(in_shape, min_index, max_index + 1); - out[1] = compute_size_by_index(in_shape, 0, min_index); - out[2] = compute_size_by_index(in_shape, max_index + 1, in_shape.size()); - return 0; + return ok(); } + #endif result optimized::reduce( - typecode_t typecode, nncase::runtime::stackvm::reduce_op_t op, - const gsl::byte *init_value, const gsl::byte *input, gsl::byte *output, - gsl::span in_shape, gsl::span axis, - gsl::span in_strides, gsl::span out_strides, - bool keep_dims, kernel_context &context) noexcept { + NNCASE_UNUSED typecode_t typecode, nncase::runtime::stackvm::reduce_op_t op, + NNCASE_UNUSED const gsl::byte *init_value, const gsl::byte *input, + gsl::byte *output, gsl::span in_shape, + gsl::span axis, + NNCASE_UNUSED gsl::span in_strides, + NNCASE_UNUSED gsl::span out_strides, + NNCASE_UNUSED bool keep_dims, + NNCASE_UNUSED kernel_context &context) noexcept { #if __riscv_vector - do { - if (op == reduce_op_t::mean && typecode == dt_float32) { - int parameters[3]; - int ret = get_parameter(in_shape, axis, parameters); - if (ret) { - break; - } - auto input_data = IN_CAST(float, input); - auto out_data = OUT_CAST(float, output); - int gap = parameters[2]; - if (gap == 1) { - reduce_mean(input_data, out_data, parameters[0], parameters[1]); - } else { - reduce_mean_s(parameters[1], parameters[0], input_data, - out_data, gap); + // The type of axis is 'size_t'. It is real axis. 
+ // 计算inner_size、outter_size + size_t inner_size = 1, outter_size = 1; + size_t reduce_size = in_shape[axis[0]]; + + for (size_t i = 0; i < axis[0]; i++) { + outter_size *= in_shape[i]; + } + + for (size_t i = axis[0] + 1; i < in_shape.size(); i++) { + inner_size *= in_shape[i]; + } + + const float *in = reinterpret_cast(input); + float *out = reinterpret_cast(output); + if (axis.size() == 1 && axis[0] == in_shape.size() - 1) { + switch (op) { + case reduce_op_t::max: + return reduce_max_impl(in, out, outter_size, inner_size, + reduce_size); + case reduce_op_t::min: + return reduce_min_impl(in, out, outter_size, inner_size, + reduce_size); + case reduce_op_t::sum: + return reduce_sum_impl(in, out, outter_size, inner_size, + reduce_size); + case reduce_op_t::mean: + reduce_sum_impl(in, out, outter_size, inner_size, reduce_size) + .unwrap(); + for (size_t i = 0; i < outter_size; i++) { + out[i] *= 1.0f / reduce_size; } return ok(); + case reduce_op_t::prod: + return reduce_prod_impl(in, out, outter_size, inner_size, + reduce_size); + default: + break; } - } while (0); -#endif + } + // TODO: implement non-last axis reduce + // TODO: implement multi-axis reduce +#endif return stackvm::reference::reduce(typecode, op, init_value, input, output, in_shape, axis, in_strides, out_strides, keep_dims, context); diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp index 58442a40ca..4539baee78 100644 --- a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp +++ b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp @@ -249,51 +249,91 @@ result optimized_softmax_impl(const T *input, T *output, float *ptr_output_vl = ptr_output; // max - float max = std::numeric_limits::lowest(); - while (n) { - auto vl = vsetvl_e32m8(n); - auto v = vle32_v_f32m8(ptr_input_vl, vl); - auto s = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl); + float max = *ptr_input_vl; + { + size_t vl = vsetvl_e32m4(n); + vfloat32m4_t s = vfmv_v_f_f32m4(max, vl); + while (n / vl > 0) { + vfloat32m4_t v = vle32_v_f32m4(ptr_input_vl, vl); + s = vfmax_vv_f32m4(s, v, vl); - s = vfredmax_vs_f32m8_f32m1(s, v, s, vl); - max = vfmv_f_s_f32m1_f32(s); - ptr_input_vl += vl; - n -= vl; + n -= vl; + ptr_input_vl += vl; + } + + vfloat32m1_t reduced_max_ = vfredmax_vs_f32m4_f32m1( + vundefined_f32m1(), s, vfmv_v_f_f32m1(max, vl), vl); + max = vfmv_f_s_f32m1_f32(reduced_max_); + + if (n > 0) { + vl = vsetvl_e32m4(n); + s = vfmv_v_f_f32m4(max, vl); + vfloat32m4_t v = vle32_v_f32m4(ptr_input_vl, vl); + s = vfmax_vv_f32m4(s, v, vl); + reduced_max_ = vfredmax_vs_f32m4_f32m1( + vundefined_f32m1(), s, vfmv_v_f_f32m1(max, vl), vl); + max = vfmv_f_s_f32m1_f32(reduced_max_); + } } // exp((x - max) * beta) and sum(exp) float sum = 0.f; ptr_input_vl = ptr_input; n = axis_dim; - while (n) { - auto vl = vsetvl_e32m8(n); - auto v_in = vle32_v_f32m8(ptr_input_vl, vl); - auto s = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl); + { + auto vl = vsetvl_e32m4(n); + auto s = vfmv_v_f_f32m4(0.0f, vl); + while (n / vl > 0) { + + auto v_in = vle32_v_f32m4(ptr_input_vl, vl); + auto v_out = exp_ps( + vfmul_vf_f32m4(vfsub_vf_f32m4(v_in, max, vl), beta, vl), + vl); + s = vfadd_vv_f32m4(s, v_out, vl); + vse32_v_f32m4(ptr_output_vl, v_out, vl); - auto v_out = exp_ps( - vfmul_vf_f32m8(vfsub_vf_f32m8(v_in, max, vl), beta, vl), - vl); - s = vfredosum_vs_f32m8_f32m1(s, v_out, s, vl); - - vse32_v_f32m8(ptr_output_vl, v_out, vl); - sum = vfmv_f_s_f32m1_f32(s); - ptr_input_vl += vl; - 
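Note (not part of the patch): the five new *_impl kernels above all share one strip-mining pattern — accumulate across full vector-length chunks in an m4 register group, fold the group to a scalar with a single-register reduction, then handle the sub-vl tail with a second vsetvl. The sketch below isolates that pattern for a plain float sum; it is illustrative only, the helper name sum_f32 is hypothetical, and it assumes the pre-1.0 RVV intrinsic spellings (vsetvl_e32m4, vfredosum_vs_f32m4_f32m1, ...) that this patch itself uses.

// Illustrative sketch: same strip-mining + single-register reduction pattern
// as reduce_sum_impl, reduced to a standalone helper.
#include <riscv_vector.h>
#include <stddef.h>

static float sum_f32(const float *in, size_t n) { // hypothetical helper
    if (n == 0)
        return 0.0f;
    size_t vl = vsetvl_e32m4(n);
    vfloat32m4_t acc = vfmv_v_f_f32m4(0.0f, vl); // vector accumulator
    while (n / vl > 0) {                         // full vector-length chunks
        vfloat32m4_t v = vle32_v_f32m4(in, vl);
        acc = vfadd_vv_f32m4(acc, v, vl);
        in += vl;
        n -= vl;
    }
    // fold the m4 accumulator down to one scalar
    vfloat32m1_t red = vfredosum_vs_f32m4_f32m1(
        vundefined_f32m1(), acc, vfmv_v_f_f32m1(0.0f, vl), vl);
    float sum = vfmv_f_s_f32m1_f32(red);
    if (n > 0) { // tail shorter than vl
        vl = vsetvl_e32m4(n);
        vfloat32m4_t v = vle32_v_f32m4(in, vl);
        red = vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), v,
                                       vfmv_v_f_f32m1(0.0f, vl), vl);
        sum += vfmv_f_s_f32m1_f32(red);
    }
    return sum;
}

reduce_max_impl and reduce_min_impl follow the same shape, seeding the accumulator with the first input element and using vfredmax/vfredmin in place of vfredosum.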
diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
index 58442a40ca..4539baee78 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
@@ -249,51 +249,91 @@ result<void> optimized_softmax_impl(const T *input, T *output,
             float *ptr_output_vl = ptr_output;
             // max
-            float max = std::numeric_limits<float>::lowest();
-            while (n) {
-                auto vl = vsetvl_e32m8(n);
-                auto v = vle32_v_f32m8(ptr_input_vl, vl);
-                auto s = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl);
+            float max = *ptr_input_vl;
+            {
+                size_t vl = vsetvl_e32m4(n);
+                vfloat32m4_t s = vfmv_v_f_f32m4(max, vl);
+                while (n / vl > 0) {
+                    vfloat32m4_t v = vle32_v_f32m4(ptr_input_vl, vl);
+                    s = vfmax_vv_f32m4(s, v, vl);
-                s = vfredmax_vs_f32m8_f32m1(s, v, s, vl);
-                max = vfmv_f_s_f32m1_f32(s);
-                ptr_input_vl += vl;
-                n -= vl;
+                    n -= vl;
+                    ptr_input_vl += vl;
+                }
+
+                vfloat32m1_t reduced_max_ = vfredmax_vs_f32m4_f32m1(
+                    vundefined_f32m1(), s, vfmv_v_f_f32m1(max, vl), vl);
+                max = vfmv_f_s_f32m1_f32(reduced_max_);
+
+                if (n > 0) {
+                    vl = vsetvl_e32m4(n);
+                    s = vfmv_v_f_f32m4(max, vl);
+                    vfloat32m4_t v = vle32_v_f32m4(ptr_input_vl, vl);
+                    s = vfmax_vv_f32m4(s, v, vl);
+                    reduced_max_ = vfredmax_vs_f32m4_f32m1(
+                        vundefined_f32m1(), s, vfmv_v_f_f32m1(max, vl), vl);
+                    max = vfmv_f_s_f32m1_f32(reduced_max_);
+                }
             }
             // exp((x - max) * beta) and sum(exp)
             float sum = 0.f;
             ptr_input_vl = ptr_input;
             n = axis_dim;
-            while (n) {
-                auto vl = vsetvl_e32m8(n);
-                auto v_in = vle32_v_f32m8(ptr_input_vl, vl);
-                auto s = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl);
+            {
+                auto vl = vsetvl_e32m4(n);
+                auto s = vfmv_v_f_f32m4(0.0f, vl);
+                while (n / vl > 0) {
+
+                    auto v_in = vle32_v_f32m4(ptr_input_vl, vl);
+                    auto v_out = exp_ps(
+                        vfmul_vf_f32m4(vfsub_vf_f32m4(v_in, max, vl), beta, vl),
+                        vl);
+                    s = vfadd_vv_f32m4(s, v_out, vl);
+                    vse32_v_f32m4(ptr_output_vl, v_out, vl);
-                auto v_out = exp_ps(
-                    vfmul_vf_f32m8(vfsub_vf_f32m8(v_in, max, vl), beta, vl),
-                    vl);
-                s = vfredosum_vs_f32m8_f32m1(s, v_out, s, vl);
-
-                vse32_v_f32m8(ptr_output_vl, v_out, vl);
-                sum = vfmv_f_s_f32m1_f32(s);
-                ptr_input_vl += vl;
-                ptr_output_vl += vl;
-                n -= vl;
+                    ptr_input_vl += vl;
+                    ptr_output_vl += vl;
+                    n -= vl;
+                }
+                vfloat32m1_t reduce_sum_ = vfredosum_vs_f32m4_f32m1(
+                    vundefined_f32m1(), s, vfmv_v_f_f32m1(0.0f, vl), vl);
+                sum += vfmv_f_s_f32m1_f32(reduce_sum_);
+
+                if (n > 0) {
+                    vl = vsetvl_e32m4(n);
+                    auto v_in = vle32_v_f32m4(ptr_input_vl, vl);
+                    auto v_out = exp_ps(
+                        vfmul_vf_f32m4(vfsub_vf_f32m4(v_in, max, vl), beta, vl),
+                        vl);
+                    reduce_sum_ =
+                        vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), v_out,
+                                                 vfmv_v_f_f32m1(0.0f, vl), vl);
+
+                    vse32_v_f32m4(ptr_output_vl, v_out, vl);
+                    sum += vfmv_f_s_f32m1_f32(reduce_sum_);
+                }
             }
-            // div
             ptr_input_vl = ptr_input;
             ptr_output_vl = ptr_output;
             n = axis_dim;
             sum = 1.0f / sum;
-            while (n) {
-                auto vl = vsetvl_e32m8(n);
-                auto v_out = vle32_v_f32m8(ptr_output_vl, vl);
-                v_out = vfmul_vf_f32m8(v_out, sum, vl);
-                vse32_v_f32m8(ptr_output_vl, v_out, vl);
-                ptr_output_vl += vl;
-                n -= vl;
+            {
+                auto vl = vsetvl_e32m4(n);
+                while (n / vl > 0) {
+                    auto v_out = vle32_v_f32m4(ptr_output_vl, vl);
+                    v_out = vfmul_vf_f32m4(v_out, sum, vl);
+                    vse32_v_f32m4(ptr_output_vl, v_out, vl);
+                    ptr_output_vl += vl;
+                    n -= vl;
+                }
+                if (n > 0) {
+                    vl = vsetvl_e32m4(n);
+                    auto v_out = vle32_v_f32m4(ptr_output_vl, vl);
+                    v_out = vfmul_vf_f32m4(v_out, sum, vl);
+                    vse32_v_f32m4(ptr_output_vl, v_out, vl);
+                }
             }
             ptr_input += axis_dim;