diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4ca2a9a5158..1463e8a215a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -422,6 +422,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
        set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv")
        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m8_t _s, _w; float _v; size_t vl; _s = __riscv_vfmacc_vf_f32m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_V)
 
+        set(CMAKE_REQUIRED_FLAGS "-march=rv64gc_zfh -D__fp16=_Float16")
+        check_cxx_source_compiles("int main() { __fp16 s, v; s = v * v; return 0; }" NCNN_COMPILER_SUPPORT_RISCV_ZFH)
+
        set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m8_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH)
@@ -432,8 +435,20 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
 
     if(NCNN_COMPILER_SUPPORT_RISCV_V OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
         option(NCNN_RVV "optimize risc-v platform with v extension" ON)
+    else()
+        message(WARNING "The compiler does not support risc-v v or xtheadvector extension. NCNN_RVV will be OFF.")
+    endif()
+
+    if(NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
+        option(NCNN_XTHEADVECTOR "optimize risc-v platform with xtheadvector extension" ON)
+    else()
+        message(WARNING "The compiler does not support risc-v xtheadvector extension. NCNN_XTHEADVECTOR will be OFF.")
+    endif()
+
+    if(NCNN_COMPILER_SUPPORT_RISCV_ZFH)
+        option(NCNN_ZFH "optimize risc-v platform with zfh extension" ON)
         if(NCNN_COMPILER_SUPPORT_RISCV_ZVFH OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
-            if(NCNN_RVV)
+            if(NCNN_RVV AND NCNN_ZFH)
                 option(NCNN_ZVFH "optimize risc-v platform with zvfh extension" ON)
             endif()
         else()
@@ -458,13 +473,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
         # add_definitions(-D__rvv_tuple)
         # endif()
     else()
-        message(WARNING "The compiler does not support risc-v v or xtheadvector extension. NCNN_RVV will be OFF.")
-    endif()
-
-    if(NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
-        option(NCNN_XTHEADVECTOR "optimize risc-v platform with xtheadvector extension" ON)
-    else()
-        message(WARNING "The compiler does not support risc-v xtheadvector extension. NCNN_XTHEADVECTOR will be OFF.")
+        message(WARNING "The compiler does not support risc-v zfh extension. NCNN_ZFH will be OFF.")
     endif()
 endif()
diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake
index 01e1f3cbafb..6a7e16ce0f2 100644
--- a/cmake/ncnn_add_layer.cmake
+++ b/cmake/ncnn_add_layer.cmake
@@ -54,6 +54,28 @@ macro(ncnn_add_arch_opt_source class NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_C
     endif()
 endmacro()
 
+macro(ncnn_add_arch_opt_layer_source class NCNN_TARGET_ARCH_OPT_BASE NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_CFLAGS)
+    set(NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}.cpp)
+
+    if(WITH_LAYER_${name} AND EXISTS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE})
+
+        set(NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE ${CMAKE_CURRENT_BINARY_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}.cpp)
+
+        add_custom_command(
+            OUTPUT ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE}
+            COMMAND ${CMAKE_COMMAND} -DSRC=${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE} -DDST=${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} -DCLASS=${class} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_${NCNN_TARGET_ARCH_OPT}_source.cmake"
+            DEPENDS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE}
+            COMMENT "Generating source ${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}.cpp"
+            VERBATIM
+        )
+        set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES GENERATED TRUE)
+
+        set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES COMPILE_FLAGS ${NCNN_TARGET_ARCH_OPT_CFLAGS})
+
+        list(APPEND ncnn_SRCS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE})
+    endif()
+endmacro()
+
 macro(ncnn_add_layer class)
     string(TOLOWER ${class} name)
 
@@ -394,11 +416,15 @@ macro(ncnn_add_layer class)
         if(NCNN_RUNTIME_CPU AND NCNN_RVV)
             ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv")
         endif()
+        if(NCNN_ZFH)
+            ncnn_add_arch_opt_source(${class} zfh "-march=rv64gc_zfh -D__fp16=_Float16")
+        endif()
         if(NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR)
-            ncnn_add_arch_opt_layer(${class} xtheadvector "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16")
+            ncnn_add_arch_opt_layer(${class} xtheadvector "-march=rv64gc_xtheadvector")
+            ncnn_add_arch_opt_layer_source(${class} zfh xtheadvector "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16")
         endif()
-        if(NCNN_ZVFH)
-            ncnn_add_arch_opt_source(${class} zvfh "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
+        if(NCNN_RUNTIME_CPU AND NCNN_ZVFH)
+            ncnn_add_arch_opt_layer_source(${class} zfh rvv "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
         endif()
     endif()
diff --git a/src/layer/riscv/absval_riscv.cpp b/src/layer/riscv/absval_riscv.cpp
index 8c84af7284c..805fe8f54c8 100644
--- a/src/layer/riscv/absval_riscv.cpp
+++ b/src/layer/riscv/absval_riscv.cpp
@@ -13,24 +13,27 @@
 // specific language governing permissions and limitations under the License.
#include "absval_riscv.h" -#include "cpu.h" #if __riscv_vector #include #endif // __riscv_vector -namespace ncnn { +#include "cpu.h" -#include "absval_fp16.h" +namespace ncnn { AbsVal_riscv::AbsVal_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH || NCNN_XTHEADVECTOR - support_fp16_storage = cpu_support_riscv_zvfh() || cpu_support_riscv_xtheadvector(); -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } #if __riscv_vector @@ -42,10 +45,10 @@ static inline vfloat32m8_t __riscv_vfabs_v_f32m8_absval(vfloat32m8_t op1, size_t int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if __riscv_vector +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); - if (support_fp16_storage && opt.use_fp16_storage && elembits == 16) + if (opt.use_fp16_storage && elembits == 16) { return forward_inplace_fp16s(bottom_top_blob, opt); } @@ -89,12 +92,4 @@ int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } -#if __riscv_vector -int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - absval_fp16(bottom_top_blob, opt); - return 0; -} -#endif // __riscv_vector - } // namespace ncnn diff --git a/src/layer/riscv/absval_riscv.h b/src/layer/riscv/absval_riscv.h index b08a8a7bb31..ca9bde067fa 100644 --- a/src/layer/riscv/absval_riscv.h +++ b/src/layer/riscv/absval_riscv.h @@ -27,7 +27,7 @@ class AbsVal_riscv : public AbsVal virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/absval_fp16.h b/src/layer/riscv/absval_riscv_zfh.cpp similarity index 78% rename from src/layer/riscv/absval_fp16.h rename to src/layer/riscv/absval_riscv_zfh.cpp index b74d3be7187..43bb6a2b0bc 100644 --- a/src/layer/riscv/absval_fp16.h +++ b/src/layer/riscv/absval_riscv_zfh.cpp @@ -12,9 +12,13 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#if NCNN_RUNTIME_CPU && NCNN_ZVFH && __riscv_vector && !__riscv_zvfh -void absval_fp16_zvfh(Mat& bottom_top_blob, const Option& opt); -#endif +#include "absval_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +namespace ncnn { #if __riscv_zvfh static inline vfloat16m8_t __riscv_vfabs_v_f16m8_absval(vfloat16m8_t op1, size_t vl) @@ -23,17 +27,9 @@ static inline vfloat16m8_t __riscv_vfabs_v_f16m8_absval(vfloat16m8_t op1, size_t } #endif // __riscv_zvfh -static void absval_fp16(Mat& bottom_top_blob, const Option& opt) +#if NCNN_ZFH +int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_RUNTIME_CPU && NCNN_ZVFH && __riscv_vector && !__riscv_xtheadvector && !__riscv_zvfh - if (ncnn::cpu_support_riscv_zvfh()) - { - absval_fp16_zvfh(bottom_top_blob, opt); - return; - } -#endif - -#if __riscv_zvfh const int w = bottom_top_blob.w; const int h = bottom_top_blob.h; const int d = bottom_top_blob.d; @@ -46,6 +42,7 @@ static void absval_fp16(Mat& bottom_top_blob, const Option& opt) { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { @@ -58,9 +55,17 @@ static void absval_fp16(Mat& bottom_top_blob, const Option& opt) ptr += vl; n -= vl; } - } -#else - (void)bottom_top_blob; - (void)opt; +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (*ptr > (__fp16)0.f) ? (*ptr) : (-*ptr); + ptr++; + } #endif // __riscv_zvfh + } + + return 0; } +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/absval_riscv_zvfh.cpp b/src/layer/riscv/absval_riscv_zvfh.cpp deleted file mode 100644 index 1eb9554a6b2..00000000000 --- a/src/layer/riscv/absval_riscv_zvfh.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
-
-#include "cpu.h"
-#include "mat.h"
-
-namespace ncnn {
-
-#include "absval_fp16.h"
-
-void absval_fp16_zvfh(Mat& bottom_top_blob, const Option& opt)
-{
-    absval_fp16(bottom_top_blob, opt);
-}
-
-} // namespace ncnn
diff --git a/src/layer/riscv/batchnorm_riscv.cpp b/src/layer/riscv/batchnorm_riscv.cpp
index c12b554933e..e6e5af89033 100644
--- a/src/layer/riscv/batchnorm_riscv.cpp
+++ b/src/layer/riscv/batchnorm_riscv.cpp
@@ -16,9 +16,10 @@
 
 #if __riscv_vector
 #include <riscv_vector.h>
+#include "riscv_usability.h"
 #endif // __riscv_vector
 
-#include "riscv_usability.h"
+#include "cpu.h"
 
 namespace ncnn {
 
@@ -26,15 +27,19 @@ BatchNorm_riscv::BatchNorm_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 }
 
 int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_top_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
@@ -75,7 +80,6 @@ int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
     }
 #else
     int w = bottom_top_blob.w;
-    #pragma omp parallel for num_threads(opt.num_threads)
     for (int i = 0; i < w; i++)
     {
         ptr[i] = b_data[i] * ptr[i] + a_data[i];
diff --git a/src/layer/riscv/batchnorm_riscv.h b/src/layer/riscv/batchnorm_riscv.h
index 8fc761bcca6..9f9b105cae1 100644
--- a/src/layer/riscv/batchnorm_riscv.h
+++ b/src/layer/riscv/batchnorm_riscv.h
@@ -26,7 +26,7 @@ class BatchNorm_riscv : public BatchNorm
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 protected:
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
     int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
 #endif
diff --git a/src/layer/riscv/batchnorm_riscv_zvfh.cpp b/src/layer/riscv/batchnorm_riscv_zfh.cpp
similarity index 87%
rename from src/layer/riscv/batchnorm_riscv_zvfh.cpp
rename to src/layer/riscv/batchnorm_riscv_zfh.cpp
index 2eef3729525..bd2a9be289a 100644
--- a/src/layer/riscv/batchnorm_riscv_zvfh.cpp
+++ b/src/layer/riscv/batchnorm_riscv_zfh.cpp
@@ -16,23 +16,23 @@
 
 #if __riscv_vector
 #include <riscv_vector.h>
-#endif // __riscv_vector
-
 #include "riscv_usability.h"
+#endif // __riscv_vector
 
 namespace ncnn {
 
-#if __riscv_zvfh
+#if NCNN_ZFH
 int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
 {
     int dims = bottom_top_blob.dims;
     int elempack = bottom_top_blob.elempack;
     if (dims == 1)
     {
-        int n = bottom_top_blob.w * elempack;
         __fp16* ptr = bottom_top_blob;
+#if __riscv_zvfh
         const float* ptr_a = a_data;
         const float* ptr_b = b_data;
+        int n = bottom_top_blob.w * elempack;
         while (n > 0)
         {
             size_t vl = __riscv_vsetvl_e16m4(n);
@@ -50,6 +50,13 @@ int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
             ptr_b += vl;
             n -= vl;
         }
+#else // __riscv_zvfh
+        int w = bottom_top_blob.w;
+        for (int i = 0; i < w; i++)
+        {
+            ptr[i] = (__fp16)(b_data[i] * (float)ptr[i] + a_data[i]);
+        }
+#endif // __riscv_zvfh
 
         return 0;
     }
@@ -67,6 +74,7 @@ int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
             float a = a_data[i];
             float b = b_data[i];
 
+#if __riscv_zvfh
             int n = w;
             while (n > 0)
             {
@@ -79,6 +87,12 @@ int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
                 ptr += vl;
                 n -= vl;
             }
+#else // __riscv_zvfh
+            for (int j = 0; j < w; j++)
+            {
+                ptr[j] = (__fp16)(b * (float)ptr[j] + a);
+            }
+#endif // __riscv_zvfh
         }
     }
     if (dims == 3 || dims == 4)
@@ -94,6 +108,7 @@ int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
             float a = a_data[q];
             float b = b_data[q];
 
+#if __riscv_zvfh
             int n = size;
             while (n > 0)
             {
@@ -106,12 +121,19 @@ int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
                 ptr += vl;
                 n -= vl;
             }
+#else // __riscv_zvfh
+            for (int i = 0; i < size; i++)
+            {
+                ptr[i] = (__fp16)(b * (float)ptr[i] + a);
+            }
+#endif // __riscv_zvfh
         }
     }
 
         return 0;
     }
 
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2; // fp16
     if (elempack == packn)
     {
@@ -172,6 +194,7 @@ int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
             }
         }
     }
+#endif // __riscv_zvfh
 
     return 0;
 }
@@ -182,10 +205,11 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
     int elempack = bottom_top_blob.elempack;
     if (dims == 1)
     {
-        int n = bottom_top_blob.w * elempack;
         __fp16* ptr = bottom_top_blob;
+#if __riscv_zvfh
         const float* ptr_a = a_data;
         const float* ptr_b = b_data;
+        int n = bottom_top_blob.w * elempack;
         while (n > 0)
         {
             size_t vl = __riscv_vsetvl_e16m4(n);
@@ -203,6 +227,13 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
             ptr_b += vl;
             n -= vl;
         }
+#else // __riscv_zvfh
+        int w = bottom_top_blob.w;
+        for (int i = 0; i < w; i++)
+        {
+            ptr[i] = (__fp16)b_data[i] * ptr[i] + (__fp16)a_data[i];
+        }
+#endif // __riscv_zvfh
 
         return 0;
     }
@@ -217,9 +248,10 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
         for (int i = 0; i < h; i++)
         {
             __fp16* ptr = bottom_top_blob.row<__fp16>(i);
-            float a = a_data[i];
-            float b = b_data[i];
+            __fp16 a = (__fp16)a_data[i];
+            __fp16 b = (__fp16)b_data[i];
 
+#if __riscv_zvfh
             int n = w;
             while (n > 0)
             {
@@ -232,6 +264,12 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
                 ptr += vl;
                 n -= vl;
             }
+#else // __riscv_zvfh
+            for (int j = 0; j < w; j++)
+            {
+                ptr[j] = b * ptr[j] + a;
+            }
+#endif // __riscv_zvfh
         }
     }
     if (dims == 3 || dims == 4)
@@ -244,9 +282,10 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
         for (int q = 0; q < c; q++)
         {
             __fp16* ptr = bottom_top_blob.channel(q);
-            float a = a_data[q];
-            float b = b_data[q];
+            __fp16 a = (__fp16)a_data[q];
+            __fp16 b = (__fp16)b_data[q];
 
+#if __riscv_zvfh
             int n = size;
             while (n > 0)
             {
@@ -259,12 +298,19 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
                 ptr += vl;
                 n -= vl;
             }
+#else // __riscv_zvfh
+            for (int i = 0; i < size; i++)
+            {
+                ptr[i] = b * ptr[i] + a;
+            }
+#endif // __riscv_zvfh
         }
     }
 
         return 0;
     }
 
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2; // fp16
     if (elempack == packn)
     {
@@ -325,9 +371,10 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
             }
         }
     }
+#endif // __riscv_zvfh
 
     return 0;
 }
-#endif // __riscv_zvfh
+#endif // NCNN_ZFH
 
 } // namespace ncnn
diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp
index 7b5a77d774d..f7d750614cc 100644
--- a/src/layer/riscv/binaryop_riscv.cpp
+++ b/src/layer/riscv/binaryop_riscv.cpp
@@ -20,9 +20,10 @@
 #if __riscv_vector
 #include <riscv_vector.h>
 #include "rvv_mathfun.h"
+#include "riscv_usability.h"
 #endif // __riscv_vector
 
-#include "riscv_usability.h"
+#include "cpu.h"
 
 namespace ncnn {
 
@@ -30,8 +31,12 @@ BinaryOp_riscv::BinaryOp_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
 #endif
 #endif
 }
@@ -468,7 +473,7 @@ static int get_reverse_op_type(int op_type)
 
 int BinaryOp_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = std::max(bottom_blobs[0].elembits(), bottom_blobs[1].elembits());
 
     if (opt.use_fp16_storage && elembits == 16)
@@ -626,7 +631,7 @@ int BinaryOp_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
 
 int BinaryOp_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_top_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
diff --git a/src/layer/riscv/binaryop_riscv.h b/src/layer/riscv/binaryop_riscv.h
--- a/src/layer/riscv/binaryop_riscv.h
+++ b/src/layer/riscv/binaryop_riscv.h
@@ -29,7 +29,7 @@ class BinaryOp_riscv : public BinaryOp
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 protected:
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
     int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
diff --git a/src/layer/riscv/binaryop_riscv_zvfh.cpp b/src/layer/riscv/binaryop_riscv_zfh.cpp
similarity index 94%
rename from src/layer/riscv/binaryop_riscv_zvfh.cpp
rename to src/layer/riscv/binaryop_riscv_zfh.cpp
index 56ff0fa6701..b37e483cbae 100644
--- a/src/layer/riscv/binaryop_riscv_zvfh.cpp
+++ b/src/layer/riscv/binaryop_riscv_zfh.cpp
@@ -17,21 +17,21 @@
 
 #if __riscv_vector
 #include <riscv_vector.h>
 #include "rvv_mathfun.h"
+#include "riscv_usability.h"
 #if __riscv_zvfh
 #include "rvv_mathfun_fp16s.h"
 #endif
 #endif // __riscv_vector
 
-#include "riscv_usability.h"
-
 namespace ncnn {
 
-#if __riscv_zvfh
+#if NCNN_ZFH
 
 template<typename Op>
 static void binary_op_vector_no_broadcast_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int size)
 {
     const Op op;
 
+#if __riscv_zvfh
     int n = size;
     while (n > 0)
     {
@@ -45,6 +45,15 @@ static void binary_op_vector_no_broadcast_fp16s(const __fp16* ptr, const __fp16*
         ptr1 += vl;
         outptr += vl;
     }
+#else // __riscv_zvfh
+    for (int i = 0; i < size; i++)
+    {
+        *outptr = op(*ptr, *ptr1);
+        ptr += 1;
+        ptr1 += 1;
+        outptr += 1;
+    }
+#endif // __riscv_zvfh
 }
 
 template<typename Op>
@@ -54,6 +63,7 @@ static void binary_op_vector_broadcast_b_fp16s(const __fp16* ptr, const __fp16*
 
     const __fp16 b = *ptr1;
 
+#if __riscv_zvfh
     int n = size;
     vfloat16m8_t _bx = (elempack == 1) ? __riscv_vfmv_v_f_f16m8(b, __riscv_vsetvl_e16m8(n)) : __riscv_vle16_v_f16m8_f16m1(ptr1);
     while (n > 0)
@@ -66,6 +76,14 @@ static void binary_op_vector_broadcast_b_fp16s(const __fp16* ptr, const __fp16*
         ptr += vl;
         outptr += vl;
     }
+#else // __riscv_zvfh
+    for (int i = 0; i < size; i++)
+    {
+        *outptr = op(*ptr, b);
+        ptr += 1;
+        outptr += 1;
+    }
+#endif // __riscv_zvfh
 }
 
 template<typename Op>
@@ -75,6 +93,7 @@ static void binary_op_vector_broadcast_a_fp16s(const __fp16* ptr, const __fp16*
 
     const __fp16 a = *ptr;
 
+#if __riscv_zvfh
     int n = size;
     vfloat16m8_t _ax = (elempack == 1) ? __riscv_vfmv_v_f_f16m8(a, __riscv_vsetvl_e16m8(n)) : __riscv_vle16_v_f16m8_f16m1(ptr);
     while (n > 0)
@@ -87,6 +106,14 @@ static void binary_op_vector_broadcast_a_fp16s(const __fp16* ptr, const __fp16*
         ptr1 += vl;
         outptr += vl;
     }
+#else // __riscv_zvfh
+    for (int i = 0; i < size; i++)
+    {
+        *outptr = op(a, *ptr1);
+        ptr1 += 1;
+        outptr += 1;
+    }
+#endif // __riscv_zvfh
 }
 
 template<typename Op>
@@ -94,6 +121,7 @@ static void binary_op_vector_broadcast_pb_fp16s(const __fp16* ptr, const __fp16*
 {
     const Op op;
 
+#if __riscv_zvfh
     // if (elempack == packn)
     {
         size_t vl = __riscv_vsetvl_e16m8(elempack);
@@ -108,6 +136,7 @@ static void binary_op_vector_broadcast_pb_fp16s(const __fp16* ptr, const __fp16*
             outptr += vl;
         }
     }
+#endif // __riscv_zvfh
 }
 
 template<typename Op>
@@ -115,6 +144,7 @@ static void binary_op_vector_broadcast_pb_b_fp16s(const __fp16* ptr, const __fp1
 {
     const Op op;
 
+#if __riscv_zvfh
     int n = w * elempack;
 
     vfloat16m8_t _bx = __riscv_vfmv_v_f_f16m8(*ptr1, __riscv_vsetvl_e16m8(n));
@@ -128,6 +158,7 @@ static void binary_op_vector_broadcast_pb_b_fp16s(const __fp16* ptr, const __fp1
         ptr += vl;
         outptr += vl;
     }
+#endif // __riscv_zvfh
 }
 
 template<typename Op>
@@ -135,6 +166,7 @@ static void binary_op_vector_broadcast_pb_a_fp16s(const __fp16* ptr, const __fp1
 {
     const Op op;
 
+#if __riscv_zvfh
     // if (elempack == packn)
     {
         size_t vl = __riscv_vsetvl_e16m8(elempack);
@@ -147,6 +179,7 @@ static void binary_op_vector_broadcast_pb_a_fp16s(const __fp16* ptr, const __fp1
             outptr += vl;
         }
     }
+#endif // __riscv_zvfh
 }
 
 template<typename Op>
@@ -203,6 +236,7 @@ static void binary_op_vector_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16
 
 namespace BinaryOp_riscv_functor {
 
+#if __riscv_zvfh
 #define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \
     struct NAME                                           \
     {                                                     \
@@ -223,6 +257,16 @@ namespace BinaryOp_riscv_functor {
             return IMPLSV;                                \
         }                                                 \
     };
+#else
+#define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \
+    struct NAME                                           \
+    {                                                     \
+        __fp16 operator()(const __fp16& x, const __fp16& y) const \
+        {                                                 \
+            return IMPL;                                  \
+        }                                                 \
+    };
+#endif
 
 // clang-format off
 // *INDENT-OFF*
@@ -567,6 +611,6 @@ int BinaryOp_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& op
 
     return 0;
 }
-#endif // __riscv_zvfh
+#endif // NCNN_ZFH
 
 } // namespace ncnn
diff --git a/src/layer/riscv/clip_riscv.cpp b/src/layer/riscv/clip_riscv.cpp
index c1127e90e9f..52a82fe9b81 100644
--- a/src/layer/riscv/clip_riscv.cpp
+++ b/src/layer/riscv/clip_riscv.cpp
@@ -16,32 +16,34 @@
 
 #if __riscv_vector
 #include <riscv_vector.h>
-#include "rvv_mathfun.h"
 #endif // __riscv_vector
 
+#include "cpu.h"
+
 namespace ncnn {
 
 Clip_riscv::Clip_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 }
 
 int Clip_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_top_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
     {
-        if (opt.use_fp16_arithmetic)
-            return forward_inplace_fp16sa(bottom_top_blob, opt);
-        else
-            return forward_inplace_fp16s(bottom_top_blob, opt);
+        return forward_inplace_fp16s(bottom_top_blob, opt);
     }
 #endif
diff --git a/src/layer/riscv/clip_riscv.h b/src/layer/riscv/clip_riscv.h
index e439dd9b34a..241a249a204 100644
--- a/src/layer/riscv/clip_riscv.h
+++ b/src/layer/riscv/clip_riscv.h
@@ -27,9 +27,8 @@ class Clip_riscv : public Clip
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 protected:
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
-    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };
diff --git a/src/layer/riscv/clip_riscv_zvfh.cpp b/src/layer/riscv/clip_riscv_zfh.cpp
similarity index 61%
rename from src/layer/riscv/clip_riscv_zvfh.cpp
rename to src/layer/riscv/clip_riscv_zfh.cpp
index 0776db98a0b..feeca51477f 100644
--- a/src/layer/riscv/clip_riscv_zvfh.cpp
+++ b/src/layer/riscv/clip_riscv_zfh.cpp
@@ -16,15 +16,11 @@
 
 #if __riscv_vector
 #include <riscv_vector.h>
-#include "rvv_mathfun.h"
-#if __riscv_zvfh
-#include "rvv_mathfun_fp16s.h"
-#endif
 #endif // __riscv_vector
 
 namespace ncnn {
 
-#if __riscv_zvfh
+#if NCNN_ZFH
 int Clip_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
@@ -39,55 +35,38 @@ int Clip_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c
     {
         __fp16* ptr = bottom_top_blob.channel(q);
 
+        __fp16 _min = (__fp16)min;
+        __fp16 _max = (__fp16)max;
+#if __riscv_zvfh
         int n = size;
         while (n > 0)
         {
             size_t vl = __riscv_vsetvl_e16m4(n);
 
             vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl);
-            _p = __riscv_vfmax_vf_f32m8(_p, min, vl);
-            _p = __riscv_vfmin_vf_f32m8(_p, max, vl);
+            _p = __riscv_vfmax_vf_f32m8(_p, _min, vl);
+            _p = __riscv_vfmin_vf_f32m8(_p, _max, vl);
             __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl);
 
             ptr += vl;
             n -= vl;
         }
-    }
-
-    return 0;
-}
-
-int Clip_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
-{
-    int w = bottom_top_blob.w;
-    int h = bottom_top_blob.h;
-    int d = bottom_top_blob.d;
-    int channels = bottom_top_blob.c;
-    int elempack = bottom_top_blob.elempack;
-    int size = w * h * d * elempack;
-
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int q = 0; q < channels; q++)
-    {
-        __fp16* ptr = bottom_top_blob.channel(q);
-
-        int n = size;
-        while (n > 0)
+#else // __riscv_zvfh
+        for (int i = 0; i < size; i++)
         {
-            size_t vl = __riscv_vsetvl_e16m8(n);
+            if (*ptr < _min)
+                *ptr = _min;
 
-            vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
-            _p = __riscv_vfmax_vf_f16m8(_p, min, vl);
-            _p = __riscv_vfmin_vf_f16m8(_p, max, vl);
-            __riscv_vse16_v_f16m8(ptr, _p, vl);
+            if (*ptr > _max)
+                *ptr = _max;
 
-            ptr += vl;
-            n -= vl;
+            ptr++;
         }
+#endif // __riscv_zvfh
     }
 
     return 0;
 }
-#endif // __riscv_zvfh
+#endif // NCNN_ZFH
 
 } //namespace ncnn
diff --git a/src/layer/riscv/concat_riscv.cpp b/src/layer/riscv/concat_riscv.cpp
index 89d1565c42d..ae991d9251a 100644
--- a/src/layer/riscv/concat_riscv.cpp
+++ b/src/layer/riscv/concat_riscv.cpp
@@ -16,9 +16,10 @@
 
 #if __riscv_vector
 #include <riscv_vector.h>
+#include "riscv_usability.h"
 #endif // __riscv_vector
 
-#include "riscv_usability.h"
+#include "cpu.h"
 
 namespace ncnn {
 
@@ -26,10 +27,10 @@ Concat_riscv::Concat_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
-    support_fp16_storage = cpu_support_riscv_zvfh();
-#endif
 #endif // __riscv_vector
+#if NCNN_ZFH
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 
 #if NCNN_BF16
     support_bf16_storage = true;
@@ -40,7 +41,7 @@ int Concat_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat
 {
     int elembits = bottom_blobs[0].elembits();
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     if (opt.use_fp16_storage && elembits == 16)
         return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
 #endif
diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp
index 65d89afa788..b2c1c198857 100644
--- a/src/layer/riscv/convolution1d_riscv.cpp
+++ b/src/layer/riscv/convolution1d_riscv.cpp
@@ -30,10 +30,14 @@ Convolution1D_riscv::Convolution1D_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 }
 
 int Convolution1D_riscv::create_pipeline(const Option& opt)
@@ -41,7 +45,7 @@ int Convolution1D_riscv::create_pipeline(const Option& opt)
     if (dynamic_weight)
         return 0;
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     if (opt.use_fp16_storage)
     {
         return create_pipeline_fp16s(opt);
@@ -108,7 +112,7 @@ int Convolution1D_riscv::destroy_pipeline(const Option& /*opt*/)
 
 int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
diff --git a/src/layer/riscv/convolution1d_riscv.h b/src/layer/riscv/convolution1d_riscv.h
index 3e7875199f0..98a21be002f 100644
--- a/src/layer/riscv/convolution1d_riscv.h
+++ b/src/layer/riscv/convolution1d_riscv.h
@@ -32,7 +32,7 @@ class Convolution1D_riscv : public Convolution1D
     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 protected:
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int create_pipeline_fp16s(const Option& opt);
     int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
     int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
diff --git a/src/layer/riscv/convolution1d_riscv_zvfh.cpp b/src/layer/riscv/convolution1d_riscv_zfh.cpp
similarity index 96%
rename from src/layer/riscv/convolution1d_riscv_zvfh.cpp
rename to src/layer/riscv/convolution1d_riscv_zfh.cpp
index 1eb8fa45c4a..47a9e7a5a9a 100644
--- a/src/layer/riscv/convolution1d_riscv_zvfh.cpp
+++ b/src/layer/riscv/convolution1d_riscv_zfh.cpp
@@ -26,21 +26,24 @@
 
 namespace ncnn {
 
-#if __riscv_zvfh
+#if NCNN_ZFH
 
 int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt)
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
+#endif // __riscv_zvfh
 
     const int num_input = weight_data_size / kernel_w / num_output;
 
     int elempack = 1;
     int out_elempack = 1;
-
+#if __riscv_zvfh
     if (opt.use_packing_layout)
     {
         elempack = num_input % packn == 0 ? packn : 1;
         out_elempack = num_output % packn == 0 ? packn : 1;
     }
+#endif // __riscv_zvfh
 
     // src = kw-inch-outch
     // dst = pb-pa-kw-inch/pa-outch/pb
@@ -83,8 +86,10 @@ int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt)
 
 int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
     const size_t vl = __riscv_vsetvl_e16m1(packn);
+#endif // __riscv_zvfh
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -101,7 +106,13 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co
     w = bottom_blob_bordered.w;
     h = bottom_blob_bordered.h;
 
-    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     const int outw = (w - kernel_extent_w) / stride_w + 1;
@@ -111,6 +122,7 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co
     if (top_blob.empty())
         return -100;
 
+#if __riscv_zvfh
     if (elempack == packn && out_elempack == packn)
     {
         {
@@ -256,6 +268,7 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co
             }
         }
     }
+#endif // __riscv_zvfh
 
     if (elempack == 1 && out_elempack == 1)
     {
@@ -304,8 +317,10 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co
 
 int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
     const size_t vl = __riscv_vsetvl_e16m1(packn);
+#endif // __riscv_zvfh
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -322,7 +337,13 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
     w = bottom_blob_bordered.w;
     h = bottom_blob_bordered.h;
 
-    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     const int outw = (w - kernel_extent_w) / stride_w + 1;
@@ -332,6 +353,7 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
     if (top_blob.empty())
         return -100;
 
+#if __riscv_zvfh
     if (elempack == packn && out_elempack == packn)
     {
         {
@@ -467,6 +489,7 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
             }
         }
     }
+#endif // __riscv_zvfh
 
     if (elempack == 1 && out_elempack == 1)
     {
@@ -512,6 +535,6 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
     return 0;
 }
 
-#endif // __riscv_zvfh
+#endif // NCNN_ZFH
 
 } // namespace ncnn
diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp
index 41979c07ba1..c252d25f235 100644
--- a/src/layer/riscv/convolution_riscv.cpp
+++ b/src/layer/riscv/convolution_riscv.cpp
@@ -21,7 +21,6 @@
 #if __riscv_vector
 #include <riscv_vector.h>
 #endif // __riscv_vector
-
 #include "riscv_activation.h"
 #include "riscv_usability.h"
 
@@ -57,10 +56,14 @@ Convolution_riscv::Convolution_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 
     activation = 0;
 }
@@ -116,7 +119,7 @@ int Convolution_riscv::create_pipeline(const Option& opt)
     }
 #endif
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     if (opt.use_fp16_storage)
     {
         return create_pipeline_fp16s(opt);
@@ -302,7 +305,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
         return 0;
     }
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h
index e1c4a9c546d..4add7108643 100644
--- a/src/layer/riscv/convolution_riscv.h
+++ b/src/layer/riscv/convolution_riscv.h
@@ -32,7 +32,7 @@ class Convolution_riscv : public Convolution
     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 protected:
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int create_pipeline_fp16s(const Option& opt);
     int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
     int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
diff --git a/src/layer/riscv/convolution_riscv_zvfh.cpp b/src/layer/riscv/convolution_riscv_zfh.cpp
similarity index 96%
rename from src/layer/riscv/convolution_riscv_zvfh.cpp
rename to src/layer/riscv/convolution_riscv_zfh.cpp
index ad4fcd1fdc2..861510f1ee4 100644
--- a/src/layer/riscv/convolution_riscv_zvfh.cpp
+++ b/src/layer/riscv/convolution_riscv_zfh.cpp
@@ -17,26 +17,25 @@
 #if __riscv_vector
 #include <riscv_vector.h>
 #endif // __riscv_vector
-
 #include "riscv_activation.h"
 #include "riscv_usability.h"
 
 namespace ncnn {
 
-#if __riscv_vector
-#if __riscv_zvfh
+#if NCNN_ZFH
 #include "convolution_fp16s.h"
+#include "convolution_sgemm_fp16s.h"
+#include "convolution_1x1_fp16s.h"
+#if __riscv_zvfh
 #include "convolution_packn_fp16s.h"
 #include "convolution_pack1ton_fp16s.h"
 #include "convolution_packnto1_fp16s.h"
-#include "convolution_sgemm_fp16s.h"
 #include "convolution_sgemm_packn_fp16s.h"
 #include "convolution_sgemm_pack1ton_fp16s.h"
 #include "convolution_sgemm_packnto1_fp16s.h"
 #include "convolution_winograd_transform_packn_fp16s.h"
 #include "convolution_winograd_dot_packn_fp16s.h"
-#include "convolution_1x1_fp16s.h"
 #include "convolution_1x1_packn_fp16s.h"
 #include "convolution_1x1_pack1ton_fp16s.h"
 #include "convolution_1x1_packnto1_fp16s.h"
@@ -44,9 +43,9 @@ namespace ncnn {
 #include "convolution_3x3_pack1ton_fp16s.h"
 #include "convolution_7x7_pack1ton_fp16s.h"
 #endif
-#endif // __riscv_vector
+#endif // NCNN_ZFH
 
-#if __riscv_zvfh
+#if NCNN_ZFH
 static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
 {
     const int maxk = kernel_w * kernel_h;
@@ -85,7 +84,9 @@ static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data
 
 int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
+#endif // __riscv_zvfh
 
     const int maxk = kernel_w * kernel_h;
     const int num_input = weight_data_size / maxk / num_output;
@@ -93,12 +94,15 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
     int elempack = 1;
     int out_elempack = 1;
 
+#if __riscv_zvfh
     if (opt.use_packing_layout)
     {
         elempack = num_input % packn == 0 ? packn : 1;
         out_elempack = num_output % packn == 0 ? packn : 1;
     }
+#endif // __riscv_zvfh
 
+#if __riscv_zvfh
     // packn
     if (elempack == packn && out_elempack == packn)
     {
@@ -143,6 +147,7 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
             convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
         }
     }
+#endif // __riscv_zvfh
 
     // pack1
     if (elempack == 1 && out_elempack == 1)
@@ -174,7 +179,9 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
 
 int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
+#endif // __riscv_zvfh
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -196,13 +203,20 @@ int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
     int outw = (w - kernel_extent_w) / stride_w + 1;
     int outh = (h - kernel_extent_h) / stride_h + 1;
 
-    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
+#if __riscv_zvfh
     if (elempack == packn && out_elempack == packn)
     {
         {
@@ -223,6 +237,7 @@ int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
             convolution_packnto1_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
         }
     }
+#endif // __riscv_zvfh
 
     if (elempack == 1 && out_elempack == 1)
     {
@@ -236,7 +251,9 @@ int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
 
 int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
+#endif // __riscv_zvfh
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -259,7 +276,13 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
     int outw = (w - kernel_extent_w) / stride_w + 1;
     int outh = (h - kernel_extent_h) / stride_h + 1;
 
-    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
@@ -268,6 +291,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
 
     const int num_input = channels * elempack;
 
+#if __riscv_zvfh
     if (elempack == packn && out_elempack == packn)
     {
         if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
@@ -404,6 +428,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
             convolution_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
         }
     }
+#endif // __riscv_zvfh
 
     if (elempack == 1 && out_elempack == 1)
     {
@@ -433,6 +458,6 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
     return 0;
 }
 
-#endif // __riscv_zvfh
+#endif // NCNN_ZFH
 
 } // namespace ncnn
diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp
index 195caf7eb59..a0926077b34 100644
--- a/src/layer/riscv/convolutiondepthwise_riscv.cpp
+++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp
@@ -14,9 +14,6 @@
 
 #include "convolutiondepthwise_riscv.h"
 
-#include "cpu.h"
-#include "layer_type.h"
-
 #if __riscv_vector
 #include <riscv_vector.h>
 #endif // __riscv_vector
@@ -24,6 +21,9 @@
 #include "riscv_activation.h"
 #include "riscv_usability.h"
 
+#include "cpu.h"
+#include "layer_type.h"
+
 namespace ncnn {
 
 #include "convolutiondepthwise_3x3.h"
@@ -37,10 +37,14 @@ ConvolutionDepthWise_riscv::ConvolutionDepthWise_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 
     activation = 0;
 }
@@ -60,7 +64,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)
     }
 #endif
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     if (opt.use_fp16_storage)
     {
         return create_pipeline_fp16s(opt);
@@ -259,7 +263,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c
     }
 #endif
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
diff --git a/src/layer/riscv/convolutiondepthwise_riscv.h b/src/layer/riscv/convolutiondepthwise_riscv.h
index ee9fce28a6a..7f2c66d8e73 100644
--- a/src/layer/riscv/convolutiondepthwise_riscv.h
+++ b/src/layer/riscv/convolutiondepthwise_riscv.h
@@ -33,7 +33,7 @@ class ConvolutionDepthWise_riscv : public ConvolutionDepthWise
 protected:
     int create_group_ops(const Option& opt);
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int create_pipeline_fp16s(const Option& opt);
     int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
     int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
diff --git a/src/layer/riscv/convolutiondepthwise_riscv_zvfh.cpp b/src/layer/riscv/convolutiondepthwise_riscv_zfh.cpp
similarity index 94%
rename from src/layer/riscv/convolutiondepthwise_riscv_zvfh.cpp
rename to src/layer/riscv/convolutiondepthwise_riscv_zfh.cpp
index fa2176d2be9..9764166ca56 100644
--- a/src/layer/riscv/convolutiondepthwise_riscv_zvfh.cpp
+++ b/src/layer/riscv/convolutiondepthwise_riscv_zfh.cpp
@@ -23,17 +23,19 @@
 
 namespace ncnn {
 
-#if __riscv_vector
+#if NCNN_ZFH
 #if __riscv_zvfh
 #include "convolutiondepthwise_3x3_packn_fp16s.h"
 #include "convolutiondepthwise_5x5_packn_fp16s.h"
 #endif
-#endif // __riscv_vector
+#endif // NCNN_ZFH
 
-#if __riscv_zvfh
+#if NCNN_ZFH
 int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
+#endif // __riscv_zvfh
 
     const int maxk = kernel_w * kernel_h;
     int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
@@ -42,11 +44,14 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
     if (channels == group && group == num_output)
     {
         int elempack = 1;
+#if __riscv_zvfh
         if (opt.use_packing_layout)
         {
             elempack = channels % packn == 0 ? packn : 1;
         }
+#endif // __riscv_zvfh
 
+#if __riscv_zvfh
         // packn
         if (elempack == packn)
         {
@@ -56,6 +61,7 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
 
             ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt);
         }
+#endif // __riscv_zvfh
 
         if (elempack == 1)
         {
@@ -81,8 +87,10 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
 
 int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
     const size_t vl = __riscv_vsetvl_e16m1(packn);
+#endif // __riscv_zvfh
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -103,7 +111,13 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b
     int outw = (w - kernel_extent_w) / stride_w + 1;
     int outh = (h - kernel_extent_h) / stride_h + 1;
 
-    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
@@ -113,6 +127,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b
     // depth-wise
     if (channels * elempack == group && group == num_output)
     {
+#if __riscv_zvfh
         if (elempack == packn)
         {
             {
@@ -174,6 +189,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b
                 }
             }
         }
+#endif // __riscv_zvfh
 
         if (elempack == 1)
         {
@@ -242,8 +258,15 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b
     const int channels_g = channels * elempack / group;
     const int num_output_g = num_output / group;
 
-    int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1;
-    int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? packn : 1;
+    int g_elempack = 1;
+    int out_g_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        g_elempack = channels_g % packn == 0 ? packn : 1;
+        out_g_elempack = num_output_g % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
 
     // unpacking
     Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
@@ -291,8 +314,10 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b
 
 int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
     const size_t vl = __riscv_vsetvl_e16m1(packn);
+#endif // __riscv_zvfh
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -313,7 +338,13 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
     int outw = (w - kernel_extent_w) / stride_w + 1;
     int outh = (h - kernel_extent_h) / stride_h + 1;
 
-    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
@@ -323,6 +354,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
     // depth-wise
     if (channels * elempack == group && group == num_output)
     {
+#if __riscv_zvfh
         if (elempack == packn)
         {
             if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
@@ -421,6 +453,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
             }
         }
     }
+#endif // __riscv_zvfh
 
         if (elempack == 1)
         {
@@ -489,8 +522,15 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
     const int channels_g = channels * elempack / group;
     const int num_output_g = num_output / group;
 
-    int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1;
-    int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? packn : 1;
+    int g_elempack = 1;
+    int out_g_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        g_elempack = channels_g % packn == 0 ? packn : 1;
+        out_g_elempack = num_output_g % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
 
     // unpacking
     Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
@@ -535,6 +575,6 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
     return 0;
 }
 
-#endif // __riscv_zvfh
+#endif // NCNN_ZFH
 
 } // namespace ncnn
diff --git a/src/layer/riscv/crop_riscv.cpp b/src/layer/riscv/crop_riscv.cpp
index 98dd6de4af4..95717c45900 100644
--- a/src/layer/riscv/crop_riscv.cpp
+++ b/src/layer/riscv/crop_riscv.cpp
@@ -20,16 +20,22 @@
 
 #include "riscv_usability.h"
 
+#include "cpu.h"
+
 namespace ncnn {
 
 Crop_riscv::Crop_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 
 #if NCNN_BF16
     support_bf16_storage = true;
diff --git a/src/layer/riscv/deconvolution_riscv.cpp b/src/layer/riscv/deconvolution_riscv.cpp
index 5aa5d307956..27afddd795c 100644
--- a/src/layer/riscv/deconvolution_riscv.cpp
+++ b/src/layer/riscv/deconvolution_riscv.cpp
@@ -14,9 +14,6 @@
 
 #include "deconvolution_riscv.h"
 
-#include "cpu.h"
-#include "layer_type.h"
-
 #if __riscv_vector
 #include <riscv_vector.h>
 #endif // __riscv_vector
@@ -24,6 +21,9 @@
 #include "riscv_activation.h"
 #include "riscv_usability.h"
 
+#include "cpu.h"
+#include "layer_type.h"
+
 namespace ncnn {
 
 #if __riscv_vector
@@ -36,10 +36,14 @@ Deconvolution_riscv::Deconvolution_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 }
 
 int Deconvolution_riscv::create_pipeline(const Option& opt)
@@ -47,7 +51,7 @@ int Deconvolution_riscv::create_pipeline(const Option& opt)
     if (dynamic_weight)
         return 0;
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     if (opt.use_fp16_storage)
     {
         return create_pipeline_fp16s(opt);
@@ -154,7 +158,7 @@ int Deconvolution_riscv::destroy_pipeline(const Option& opt)
 
 int Deconvolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
diff --git a/src/layer/riscv/deconvolution_riscv.h b/src/layer/riscv/deconvolution_riscv.h
index 760fe88286c..96233bd79dc 100644
--- a/src/layer/riscv/deconvolution_riscv.h
+++ b/src/layer/riscv/deconvolution_riscv.h
@@ -32,7 +32,7 @@ class Deconvolution_riscv : public Deconvolution
     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 protected:
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int create_pipeline_fp16s(const Option& opt);
     int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
     int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
diff --git a/src/layer/riscv/deconvolution_riscv_zvfh.cpp b/src/layer/riscv/deconvolution_riscv_zfh.cpp
similarity index 93%
rename from src/layer/riscv/deconvolution_riscv_zvfh.cpp
rename to src/layer/riscv/deconvolution_riscv_zfh.cpp
index 79b35871601..80d59a93ce4 100644
--- a/src/layer/riscv/deconvolution_riscv_zvfh.cpp
+++ b/src/layer/riscv/deconvolution_riscv_zfh.cpp
@@ -23,31 +23,34 @@
 
 namespace ncnn {
 
-#if __riscv_vector
-#if __riscv_zvfh
+#if NCNN_ZFH
 #include "deconvolution_fp16s.h"
+#if __riscv_zvfh
 #include "deconvolution_packn_fp16s.h"
 #include "deconvolution_pack1ton_fp16s.h"
#include "deconvolution_packnto1_fp16s.h" #endif -#endif // __riscv_vector +#endif // NCNN_ZFH -#if __riscv_zvfh +#if NCNN_ZFH int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const int maxk = kernel_w * kernel_h; const int num_input = weight_data_size / maxk / num_output; int elempack = 1; int out_elempack = 1; - +#if __riscv_zvfh if (opt.use_packing_layout) { elempack = num_input % packn == 0 ? packn : 1; out_elempack = num_output % packn == 0 ? packn : 1; } +#endif // __riscv_zvfh Mat weight_data_transposed(weight_data.w); { @@ -97,6 +100,7 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) } } +#if __riscv_zvfh // packn if (elempack == packn && out_elempack == packn) { @@ -111,6 +115,7 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) if (elempack == packn && out_elempack == 1) { } +#endif // __riscv_zvfh // pack1 if (elempack == 1 && out_elempack == 1) @@ -127,7 +132,9 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh // deconvolv with NxN kernel // value = value + bias @@ -145,7 +152,13 @@ int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh size_t out_elemsize = elemsize / elempack * out_elempack; Mat top_blob_bordered; @@ -161,6 +174,7 @@ int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co if (top_blob_bordered.empty()) return -100; +#if __riscv_zvfh if (elempack == packn && out_elempack == packn) { { @@ -181,6 +195,7 @@ int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co deconvolution_packnto1_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } +#endif // __riscv_zvfh if (elempack == 1 && out_elempack == 1) { @@ -198,7 +213,9 @@ int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh // deconvolv with NxN kernel // value = value + bias @@ -216,7 +233,13 @@ int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh size_t out_elemsize = elemsize / elempack * out_elempack; Mat top_blob_bordered; @@ -232,6 +255,7 @@ int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c if (top_blob_bordered.empty()) return -100; +#if __riscv_zvfh if (elempack == packn && out_elempack == packn) { { @@ -252,6 +276,7 @@ int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c deconvolution_packnto1_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } +#endif // __riscv_zvfh if (elempack == 1 && out_elempack == 1) { @@ -266,6 +291,6 @@ int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp index 272a74dded7..5fee5c24764 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp @@ -14,9 +14,6 @@ #include "deconvolutiondepthwise_riscv.h" -#include "cpu.h" -#include "layer_type.h" - #if __riscv_vector #include #endif // __riscv_vector @@ -24,16 +21,23 @@ #include "riscv_activation.h" #include "riscv_usability.h" +#include "cpu.h" +#include "layer_type.h" + namespace ncnn { DeconvolutionDepthWise_riscv::DeconvolutionDepthWise_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) @@ -41,7 +45,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) if (dynamic_weight) return 0; -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { return create_pipeline_fp16s(opt); @@ -196,7 +200,7 @@ int DeconvolutionDepthWise_riscv::destroy_pipeline(const Option& opt) int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.h b/src/layer/riscv/deconvolutiondepthwise_riscv.h index 3522edf3ced..a965b5a4699 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.h +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.h @@ -33,7 +33,7 @@ class DeconvolutionDepthWise_riscv : public DeconvolutionDepthWise protected: int create_group_ops(const Option& opt); -#if NCNN_ZVFH +#if NCNN_ZFH int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv_zvfh.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv_zfh.cpp similarity index 94% rename from src/layer/riscv/deconvolutiondepthwise_riscv_zvfh.cpp rename to src/layer/riscv/deconvolutiondepthwise_riscv_zfh.cpp index f8bfe22bb56..fb2d8d2b32c 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv_zvfh.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv_zfh.cpp @@ -23,10 +23,12 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int 
DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const int maxk = kernel_w * kernel_h; int channels = (weight_data_size / group) / maxk / (num_output / group) * group; @@ -35,10 +37,12 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) if (channels == group && group == num_output) { int elempack = 1; +#if __riscv_zvfh if (opt.use_packing_layout) { elempack = channels % packn == 0 ? packn : 1; } +#endif // __riscv_zvfh Mat weight_data_transposed(weight_data.w); { @@ -57,6 +61,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) } } +#if __riscv_zvfh // packn if (elempack == packn) { @@ -66,6 +71,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt); } +#endif // __riscv_zvfh if (elempack == 1) { @@ -91,8 +97,10 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh int w = bottom_blob.w; int h = bottom_blob.h; @@ -105,7 +113,13 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh size_t out_elemsize = elemsize / elempack * out_elempack; Mat top_blob_bordered; @@ -126,6 +140,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top // depth-wise if (channels * elempack == group && group == num_output) { +#if __riscv_zvfh if (elempack == packn) { { @@ -187,6 +202,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top } } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -258,8 +274,15 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top const int channels_g = channels * elempack / group; const int num_output_g = num_output / group; - int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1; - int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? packn : 1; + int g_elempack = 1; + int out_g_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + g_elempack = channels_g % packn == 0 ? packn : 1; + out_g_elempack = num_output_g % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh // unpacking Mat bottom_blob_unpacked = bottom_blob; @@ -312,8 +335,10 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh int w = bottom_blob.w; int h = bottom_blob.h; @@ -326,7 +351,13 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh size_t out_elemsize = elemsize / elempack * out_elempack; Mat top_blob_bordered; @@ -347,6 +378,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to // depth-wise if (channels * elempack == group && group == num_output) { +#if __riscv_zvfh if (elempack == packn) { { @@ -408,6 +440,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to } } } +#endif // __riscv_zvfh if (elempack == 1) {
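// The elempack selections in these fp16 paths all follow one rule: default to
// 1 (the plain zfh layout) and promote to packn only in a zvfh build when
// opt.use_packing_layout is set; note the promotion assigns the outer variable
// rather than declaring a new one, since a shadowed local would leave it stuck
// at 1 and silently disable the packed kernels. A condensed sketch of the rule
// (assumes ncnn's csrr_vlenb() helper from riscv_usability.h; with VLEN=128 a
// vector register holds 16 / 2 = 8 fp16 lanes). The group hunk below applies
// the same rule to g_elempack and out_g_elempack.
static int select_elempack(bool use_packing_layout, int count)
{
    int elempack = 1; // scalar zfh builds always run unpacked
#if __riscv_zvfh
    const int packn = csrr_vlenb() / 2; // fp16 lanes per vector register
    if (use_packing_layout)
        elempack = count % packn == 0 ? packn : 1; // assignment, not a new declaration
#endif
    return elempack;
}
@@ -479,8 +512,15 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to const int channels_g = channels * elempack / group; const int num_output_g = num_output / group; - int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1; - int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? packn : 1; + int g_elempack = 1; + int out_g_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + g_elempack = channels_g % packn == 0 ? packn : 1; + out_g_elempack = num_output_g % packn == 0 ? 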
packn : 1; + } +#endif // __riscv_zvfh // unpacking Mat bottom_blob_unpacked = bottom_blob; @@ -530,6 +570,6 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index f5d1bc0a528..baf4e11fe8a 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -20,16 +20,22 @@ #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { Flatten_riscv::Flatten_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector #if NCNN_BF16 support_bf16_storage = true; @@ -43,7 +49,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (elembits == 8) return forward_int8(bottom_blob, top_blob, opt); -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif diff --git a/src/layer/riscv/gru_riscv.cpp b/src/layer/riscv/gru_riscv.cpp index 46254d70b96..97d7cb82eb0 100644 --- a/src/layer/riscv/gru_riscv.cpp +++ b/src/layer/riscv/gru_riscv.cpp @@ -210,8 +210,12 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we GRU_riscv::GRU_riscv() { -#if NCNN_ZVFH +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif } @@ -225,7 +229,7 @@ int GRU_riscv::create_pipeline(const Option& opt) } #endif -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage && opt.use_fp16_arithmetic) return create_pipeline_fp16sa(opt); #endif @@ -242,9 +246,7 @@ int GRU_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } #endif -#if __riscv_vector - -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -256,6 +258,8 @@ int GRU_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } #endif +#if __riscv_vector + int T = bottom_blob.h; int num_directions = direction == 2 ? 2 : 1; @@ -326,8 +330,7 @@ int GRU_riscv::forward(const std::vector& bottom_blobs, std::vector& t const Mat& bottom_blob = bottom_blobs[0]; -#if __riscv_vector -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -339,6 +342,7 @@ int GRU_riscv::forward(const std::vector& bottom_blobs, std::vector& t } #endif +#if __riscv_vector int T = bottom_blob.h; int num_directions = direction == 2 ? 
2 : 1; diff --git a/src/layer/riscv/gru_riscv.h b/src/layer/riscv/gru_riscv.h index 32d75d83d58..a9434f83083 100644 --- a/src/layer/riscv/gru_riscv.h +++ b/src/layer/riscv/gru_riscv.h @@ -29,7 +29,7 @@ class GRU_riscv : public GRU virtual int create_pipeline(const Option& opt); protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/gru_riscv_zvfh.cpp b/src/layer/riscv/gru_riscv_zfh.cpp similarity index 92% rename from src/layer/riscv/gru_riscv_zvfh.cpp rename to src/layer/riscv/gru_riscv_zfh.cpp index d714d690e88..7ffe9723629 100644 --- a/src/layer/riscv/gru_riscv_zvfh.cpp +++ b/src/layer/riscv/gru_riscv_zfh.cpp @@ -20,7 +20,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt) { int size = bottom_blob.w; @@ -56,10 +56,11 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float R = bias_c_R[q]; float U = bias_c_U[q]; - int n = size; +#if __riscv_zvfh const __fp16* ptr_x = x; const float* ptr_xcr = weight_xc_R; const float* ptr_xcu = weight_xc_U; + int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m4(n); @@ -85,11 +86,21 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M ptr_x = NULL; ptr_xcr = NULL; ptr_xcu = NULL; +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float xi = x[i]; - int n_out = num_output; + R += weight_xc_R[i] * xi; + U += weight_xc_U[i] * xi; + } +#endif // __riscv_zvfh + +#if __riscv_zvfh const float* ptr_hc = hidden_state; const float* ptr_hcr = weight_hc_R; const float* ptr_hcu = weight_hc_U; + int n_out = num_output; while (n_out > 0) { size_t vl = __riscv_vsetvl_e16m4(n_out); @@ -115,6 +126,15 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M ptr_hc = NULL; ptr_hcr = NULL; ptr_hcu = NULL; +#else // __riscv_zvfh + for (int i = 0; i < num_output; i++) + { + float h_cont = hidden_state[i]; + + R += weight_hc_R[i] * h_cont; + U += weight_hc_U[i] * h_cont; + } +#endif // __riscv_zvfh // sigmoid(R) // sigmoid(U) @@ -130,9 +150,10 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float N = bias_c_BN[q]; - int n_out2 = num_output; +#if __riscv_zvfh const float* ptr_hc2 = hidden_state; const float* ptr_whc_n = weight_hc_N; + int n_out2 = num_output; while (n_out2 > 0) { size_t vl = __riscv_vsetvl_e16m4(n_out2); @@ -151,12 +172,21 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M } ptr_hc2 = NULL; ptr_whc_n = NULL; +#else // __riscv_zvfh + for (int i = 0; i < num_output; i++) + { + float h_cont = hidden_state[i]; + + N += weight_hc_N[i] * h_cont; + } +#endif // __riscv_zvfh N = bias_c_WN[q] + R * N; - int n2 = size; +#if __riscv_zvfh const __fp16* ptr_x2 = x; const float* ptr_xcn = weight_xc_N; + int n2 = size; while (n2 > 0) { size_t vl = __riscv_vsetvl_e16m4(n2); @@ -175,6 +205,14 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M } ptr_x2 = NULL; ptr_xcn = NULL; +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float xi = x[i]; + + N += weight_xc_N[i] * xi; + } +#endif // __riscv_zvfh // tanh(N) 
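// Every hunk in this GRU rewrite pairs a strip-mined RVV loop (vsetvl hands
// back up to VLMAX lanes per pass, so no scalar tail loop is needed) with a
// plain scalar twin under #else. A self-contained sketch of that dual body for
// one gate's dot product, assuming the __riscv_ intrinsics used above and
// __fp16 defined as _Float16 as the CMake flags do; illustrative, not ncnn API.
#if __riscv_zvfh
#include <riscv_vector.h>
#endif

// fp16 activations times fp32 weights, accumulated in fp32 - the gru_fp16s shape.
static float gate_dot_fp16s(const __fp16* x, const float* w, int size)
{
    float sum = 0.f;
#if __riscv_zvfh
    vfloat32m1_t _acc = __riscv_vfmv_s_f_f32m1(0.f, __riscv_vsetvlmax_e32m1());
    int n = size;
    while (n > 0)
    {
        size_t vl = __riscv_vsetvl_e16m4(n); // lanes handled this pass
        vfloat32m8_t _x = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(x, vl), vl); // widen fp16 -> fp32
        vfloat32m8_t _w = __riscv_vle32_v_f32m8(w, vl);
        _acc = __riscv_vfredusum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(_x, _w, vl), _acc, vl);
        x += vl;
        w += vl;
        n -= vl;
    }
    sum = __riscv_vfmv_f_s_f32m1_f32(_acc);
#else
    for (int i = 0; i < size; i++)
        sum += (float)x[i] * w[i]; // scalar zfh fallback, same arithmetic
#endif
    return sum;
}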
N = tanh(N); @@ -388,10 +426,11 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const __fp16 R = bias_c_R[q]; __fp16 U = bias_c_U[q]; - int n = size; +#if __riscv_zvfh const __fp16* ptr_x = x; const __fp16* ptr_xcr = weight_xc_R; const __fp16* ptr_xcu = weight_xc_U; + int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); @@ -414,11 +453,21 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const ptr_xcu += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float xi = x[i]; - int n_out = num_output; + R += weight_xc_R[i] * xi; + U += weight_xc_U[i] * xi; + } +#endif // __riscv_zvfh + +#if __riscv_zvfh const float* ptr_hc = hidden_state; const __fp16* ptr_hcr = weight_hc_R; const __fp16* ptr_hcu = weight_hc_U; + int n_out = num_output; while (n_out > 0) { size_t vl = __riscv_vsetvl_e16m4(n_out); @@ -441,6 +490,15 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const ptr_hcu += vl; n_out -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < num_output; i++) + { + float h_cont = hidden_state[i]; + + R += weight_hc_R[i] * h_cont; + U += weight_hc_U[i] * h_cont; + } +#endif // __riscv_zvfh // sigmoid(R) // sigmoid(U) @@ -456,9 +514,10 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const __fp16 N = bias_c_BN[q]; - int n_out2 = num_output; +#if __riscv_zvfh const float* ptr_hc2 = hidden_state; const __fp16* ptr_whc_n = weight_hc_N; + int n_out2 = num_output; while (n_out2 > 0) { size_t vl = __riscv_vsetvl_e16m4(n_out2); @@ -475,11 +534,21 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const ptr_hc2 += vl; ptr_whc_n += vl; } +#else // __riscv_zvfh + for (int i = 0; i < num_output; i++) + { + float h_cont = hidden_state[i]; + + N += weight_hc_N[i] * h_cont; + } +#endif // __riscv_zvfh + N = bias_c_WN[q] + R * N; - int n2 = size; +#if __riscv_zvfh const __fp16* ptr_x2 = x; const __fp16* ptr_xcn = weight_xc_N; + int n2 = size; while (n2 > 0) { size_t vl = __riscv_vsetvl_e16m8(n2); @@ -496,6 +565,14 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const ptr_x2 += vl; ptr_xcn += vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float xi = x[i]; + + N += weight_xc_N[i] * xi; + } +#endif // __riscv_zvfh // tanh(N) N = (__fp16)tanh((float)N); @@ -656,6 +733,6 @@ int GRU_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vector< return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/hardsigmoid_riscv.cpp b/src/layer/riscv/hardsigmoid_riscv.cpp index 459ec5fc53e..370f623e9d8 100644 --- a/src/layer/riscv/hardsigmoid_riscv.cpp +++ b/src/layer/riscv/hardsigmoid_riscv.cpp @@ -21,21 +21,27 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { HardSigmoid_riscv::HardSigmoid_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int HardSigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/hardsigmoid_riscv.h b/src/layer/riscv/hardsigmoid_riscv.h index 3ae1b83d022..0883422101c 100644 --- a/src/layer/riscv/hardsigmoid_riscv.h 
+++ b/src/layer/riscv/hardsigmoid_riscv.h @@ -27,7 +27,7 @@ class HardSigmoid_riscv : public HardSigmoid virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/hardsigmoid_riscv_zvfh.cpp b/src/layer/riscv/hardsigmoid_riscv_zfh.cpp similarity index 65% rename from src/layer/riscv/hardsigmoid_riscv_zvfh.cpp rename to src/layer/riscv/hardsigmoid_riscv_zfh.cpp index b37864f3c7a..f1f8ab6f618 100644 --- a/src/layer/riscv/hardsigmoid_riscv_zvfh.cpp +++ b/src/layer/riscv/hardsigmoid_riscv_zfh.cpp @@ -20,7 +20,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int HardSigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -35,27 +35,44 @@ int HardSigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& { __fp16* ptr = bottom_top_blob.channel(q); + __fp16 _lower = (__fp16)lower; + __fp16 _upper = (__fp16)upper; + __fp16 _alpha = (__fp16)alpha; + __fp16 _beta = (__fp16)beta; + +#if __riscv_zvfh int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, lower, vl); - vbool2_t _higher = __riscv_vmfgt_vf_f16m8_b2(_p, upper, vl); - vbool2_t _apply = __riscv_vmnor_mm_b2(_lower, _higher, vl); - _p = __riscv_vfmerge_vfm_f16m8(_p, .0f, _lower, vl); - _p = __riscv_vfmerge_vfm_f16m8(_p, 1.f, _higher, vl); + vbool2_t _is_lower = __riscv_vmflt_vf_f16m8_b2(_p, _lower, vl); + vbool2_t _is_higher = __riscv_vmfgt_vf_f16m8_b2(_p, _upper, vl); + vbool2_t _apply = __riscv_vmnor_mm_b2(_is_lower, _is_higher, vl); + _p = __riscv_vfmerge_vfm_f16m8(_p, (__fp16)0.f, _is_lower, vl); + _p = __riscv_vfmerge_vfm_f16m8(_p, (__fp16)1.f, _is_higher, vl); - _p = __riscv_vfadd_vf_f16m8_mu(_apply, _p, __riscv_vfmul_vf_f16m8_m(_apply, _p, alpha, vl), beta, vl); + _p = __riscv_vfadd_vf_f16m8_mu(_apply, _p, __riscv_vfmul_vf_f16m8_m(_apply, _p, _alpha, vl), _beta, vl); __riscv_vse16_v_f16m8(ptr, _p, vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + if (ptr[i] < _lower) + ptr[i] = (__fp16)0.f; + else if (ptr[i] > _upper) + ptr[i] = (__fp16)1.f; + else + ptr[i] = ptr[i] * _alpha + _beta; + } +#endif // __riscv_zvfh } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/hardswish_riscv.cpp b/src/layer/riscv/hardswish_riscv.cpp index dd14104592b..5de9b943a60 100644 --- a/src/layer/riscv/hardswish_riscv.cpp +++ b/src/layer/riscv/hardswish_riscv.cpp @@ -21,21 +21,27 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { HardSwish_riscv::HardSwish_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int HardSwish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/hardswish_riscv.h b/src/layer/riscv/hardswish_riscv.h index 758c8dae4dd..b882487ba21 100644 --- a/src/layer/riscv/hardswish_riscv.h +++ b/src/layer/riscv/hardswish_riscv.h @@ -30,7 +30,7 @@ class HardSwish_riscv : public HardSwish virtual 
int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/hardswish_riscv_zvfh.cpp b/src/layer/riscv/hardswish_riscv_zfh.cpp similarity index 67% rename from src/layer/riscv/hardswish_riscv_zvfh.cpp rename to src/layer/riscv/hardswish_riscv_zfh.cpp index 1d3826605b0..2afdf07d9dd 100644 --- a/src/layer/riscv/hardswish_riscv_zvfh.cpp +++ b/src/layer/riscv/hardswish_riscv_zfh.cpp @@ -20,7 +20,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int HardSwish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -35,28 +35,45 @@ int HardSwish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o { __fp16* ptr = bottom_top_blob.channel(q); + __fp16 _lower = (__fp16)lower; + __fp16 _upper = (__fp16)upper; + __fp16 _alpha = (__fp16)alpha; + __fp16 _beta = (__fp16)beta; + +#if __riscv_zvfh int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, lower, vl); - vbool2_t _higher = __riscv_vmfgt_vf_f16m8_b2(_p, upper, vl); - vbool2_t _apply = __riscv_vmnor_mm_b2(_lower, _higher, vl); - _p = __riscv_vfmerge_vfm_f16m8(_p, .0f, _lower, vl); + vbool2_t _is_lower = __riscv_vmflt_vf_f16m8_b2(_p, _lower, vl); + vbool2_t _is_higher = __riscv_vmfgt_vf_f16m8_b2(_p, _upper, vl); + vbool2_t _apply = __riscv_vmnor_mm_b2(_is_lower, _is_higher, vl); + _p = __riscv_vfmerge_vfm_f16m8(_p, (__fp16).0f, _is_lower, vl); - vfloat16m8_t _p0 = __riscv_vfadd_vf_f16m8_m(_apply, __riscv_vfmul_vf_f16m8_m(_apply, _p, alpha, vl), beta, vl); + vfloat16m8_t _p0 = __riscv_vfadd_vf_f16m8_m(_apply, __riscv_vfmul_vf_f16m8_m(_apply, _p, _alpha, vl), _beta, vl); _p = __riscv_vfmul_vv_f16m8_mu(_apply, _p, _p, _p0, vl); __riscv_vse16_v_f16m8(ptr, _p, vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + if (ptr[i] < _lower) + ptr[i] = (__fp16)0.f; + else if (ptr[i] > _upper) + ; + else + ptr[i] = ptr[i] * (ptr[i] * _alpha + _beta); + } +#endif // __riscv_zvfh } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index 8b89da90f41..3b8751f6a2a 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -19,20 +19,25 @@ #if __riscv_vector #include #endif // __riscv_vector - #include "riscv_activation.h" #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { InnerProduct_riscv::InnerProduct_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector flatten = 0; } @@ -57,7 +62,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) } #endif -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { return create_pipeline_fp16s(opt); @@ -153,7 +158,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt } #endif -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h index 4c6384a2e0d..17be502d0b1 100644 --- 
a/src/layer/riscv/innerproduct_riscv.h +++ b/src/layer/riscv/innerproduct_riscv.h @@ -30,7 +30,7 @@ class InnerProduct_riscv : public InnerProduct virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/innerproduct_riscv_zvfh.cpp b/src/layer/riscv/innerproduct_riscv_zfh.cpp similarity index 94% rename from src/layer/riscv/innerproduct_riscv_zvfh.cpp rename to src/layer/riscv/innerproduct_riscv_zfh.cpp index 86c32c0a993..347cca0a640 100644 --- a/src/layer/riscv/innerproduct_riscv_zvfh.cpp +++ b/src/layer/riscv/innerproduct_riscv_zfh.cpp @@ -17,25 +17,28 @@ #if __riscv_vector #include #endif // __riscv_vector - #include "riscv_activation.h" #include "riscv_usability.h" namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt) { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const int num_input = weight_data_size / num_output; int out_elempack = 1; +#if __riscv_zvfh if (opt.use_packing_layout) { out_elempack = num_output % packn == 0 ? packn : 1; } +#endif // __riscv_zvfh // src = inch-outch // dst = pb-inch-outch/pb @@ -68,7 +71,9 @@ int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt) int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const int num_input = weight_data_size / num_output; @@ -83,11 +88,18 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con if (top_blob.empty()) return -100; - int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; + int num_output_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + num_output_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh #pragma omp parallel for num_threads(opt.num_threads) for (int j = 0; j < h; j++) { +#if __riscv_zvfh if (elempack == packn && num_output_elempack == packn) { const size_t vl = __riscv_vsetvl_e16m1(packn); @@ -201,6 +213,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con outptr += packn; } } +#endif // __riscv_zvfh if (elempack == 1 && num_output_elempack == 1) { @@ -247,13 +260,20 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con size_t elemsize = bottom_blob_flattened.elemsize; int elempack = bottom_blob_flattened.elempack; - int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh size_t out_elemsize = elemsize / elempack * out_elempack; top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; +#if __riscv_zvfh if (out_elempack == packn) { // num_output @@ -290,6 +310,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con __riscv_vse16_v_f16m1(outptr + p * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); } } +#endif // __riscv_zvfh if (out_elempack == 1) { @@ -330,7 +351,9 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const int num_input = weight_data_size / num_output; @@ -345,11 +368,18 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co if (top_blob.empty()) return -100; - int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; + int num_output_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + num_output_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh #pragma omp parallel for num_threads(opt.num_threads) for (int j = 0; j < h; j++) { +#if __riscv_zvfh if (elempack == packn && num_output_elempack == packn) { const size_t vl = __riscv_vsetvl_e16m1(packn); @@ -463,6 +493,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co outptr += packn; } } +#endif // __riscv_zvfh if (elempack == 1 && num_output_elempack == 1) { @@ -509,13 +540,20 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co size_t elemsize = bottom_blob_flattened.elemsize; int elempack = bottom_blob_flattened.elempack; - int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh size_t out_elemsize = elemsize / elempack * out_elempack; top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; +#if __riscv_zvfh if (out_elempack == packn) { // num_output @@ -552,6 +590,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co __riscv_vse16_v_f16m1(outptr + p * packn, _sum, vl); } } +#endif // __riscv_zvfh if (out_elempack == 1) { @@ -589,6 +628,6 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/instancenorm_riscv.cpp b/src/layer/riscv/instancenorm_riscv.cpp index 95a39a20ba5..3c32cdb25ff 100644 --- a/src/layer/riscv/instancenorm_riscv.cpp +++ b/src/layer/riscv/instancenorm_riscv.cpp @@ -20,20 +20,26 @@ #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { InstanceNorm_riscv::InstanceNorm_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -54,13 +60,8 @@ int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) int size = w * h; int dims = bottom_top_blob.dims; -#if __riscv_vector if (elempack == 1) -#endif // __riscv_vector { -#if __riscv_vector - size = elempack * size; -#endif #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < c; q++) { diff --git a/src/layer/riscv/instancenorm_riscv.h b/src/layer/riscv/instancenorm_riscv.h index 81807e3af29..006b8bfc1ad 100644 --- a/src/layer/riscv/instancenorm_riscv.h +++ b/src/layer/riscv/instancenorm_riscv.h @@ -26,7 +26,7 @@ class InstanceNorm_riscv : public InstanceNorm virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/instancenorm_riscv_zvfh.cpp b/src/layer/riscv/instancenorm_riscv_zfh.cpp similarity index 91% rename from src/layer/riscv/instancenorm_riscv_zvfh.cpp rename to src/layer/riscv/instancenorm_riscv_zfh.cpp index ec4b1c8d9b9..9751bdbc797 100644 --- a/src/layer/riscv/instancenorm_riscv_zvfh.cpp +++ b/src/layer/riscv/instancenorm_riscv_zfh.cpp @@ -22,7 +22,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { // x = (x - mean) / (sqrt(var + eps)) * gamma + beta @@ -37,7 +37,6 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option int dims = bottom_top_blob.dims; if (elempack == 1) { - size = elempack * size; #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < c; q++) { @@ -46,6 +45,7 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option // mean and var float sum = 0.f; float sqsum = 0.f; +#if __riscv_zvfh vfloat32m1_t _sum = __riscv_vfmv_s_f_f32m1(0.f, __riscv_vsetvlmax_e32m1()); vfloat32m1_t _sqsum = __riscv_vfmv_s_f_f32m1(0.f, 
__riscv_vsetvlmax_e32m1()); { @@ -62,7 +62,15 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option } } sum = __riscv_vfmv_f_s_f32m1_f32(_sum); +#else + for (int i = 0; i < size; i++) + { + sum += ptr[i]; + //sqsum += ptr[i] * ptr[i]; + } +#endif // __riscv_zvfh float mean = sum / size; +#if __riscv_zvfh { int n = size; __fp16* ptr_sqsum = ptr; @@ -77,6 +85,14 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option } } sqsum = __riscv_vfmv_f_s_f32m1_f32(_sqsum); +#else + float tmp = 0.f; + for (int i = 0; i < size; i++) + { + tmp = ptr[i] - mean; + sqsum += tmp * tmp; + } +#endif // __riscv_zvfh float var = sqsum / size; // the var maybe minus due to accuracy //float var = sqsum / size - mean * mean; @@ -96,6 +112,7 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option a = 1.f / (sqrtf(var + eps)); b = -mean * a; } +#if __riscv_zvfh { int n = size; __fp16* ptr_store = ptr; @@ -110,10 +127,17 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option ptr_store += vl; } } +#else + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a + b; + } +#endif // __riscv_zvfh } return 0; } +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; if (elempack == packn) { @@ -166,6 +190,7 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option } return 0; } +#endif // __riscv_zvfh return 0; } @@ -182,7 +207,6 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio int dims = bottom_top_blob.dims; if (elempack == 1) { - size = elempack * size; #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < c; q++) { @@ -191,6 +215,7 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio // mean and var __fp16 sum = 0.f; __fp16 sqsum = 0.f; +#if __riscv_zvfh vfloat16m1_t _sum = __riscv_vfmv_s_f_f16m1(0.f, __riscv_vsetvlmax_e32m1()); vfloat16m1_t _sqsum = __riscv_vfmv_s_f_f16m1(0.f, __riscv_vsetvlmax_e32m1()); { @@ -207,7 +232,15 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio } } sum = __riscv_vfmv_f_s_f16m1_f16(_sum); +#else + for (int i = 0; i < size; i++) + { + sum += ptr[i]; + //sqsum += ptr[i] * ptr[i]; + } +#endif // __riscv_zvfh __fp16 mean = sum / size; +#if __riscv_zvfh { int n = size; __fp16* ptr_sqsum = ptr; @@ -222,6 +255,14 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio } } sqsum = __riscv_vfmv_f_s_f16m1_f16(_sqsum); +#else + float tmp = 0.f; + for (int i = 0; i < size; i++) + { + tmp = ptr[i] - mean; + sqsum += tmp * tmp; + } +#endif // __riscv_zvfh __fp16 var = sqsum / size; // the var maybe minus due to accuracy //float var = sqsum / size - mean * mean; @@ -241,6 +282,7 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio a = static_cast<__fp16>(1.f / (sqrt(var + eps))); b = static_cast<__fp16>(-mean * a); } +#if __riscv_zvfh { int n = size; __fp16* ptr_store = ptr; @@ -255,10 +297,17 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio ptr_store += vl; } } +#else + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a + b; + } +#endif // __riscv_zvfh } return 0; } +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; if (elempack == packn) { @@ -311,9 +360,9 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio } return 0; } +#endif // __riscv_zvfh return 0; } - -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace 
ncnn diff --git a/src/layer/riscv/interp_bilinear_fp16s.h b/src/layer/riscv/interp_bilinear_fp16s.h index 40140a02b47..d0fcde65643 100644 --- a/src/layer/riscv/interp_bilinear_fp16s.h +++ b/src/layer/riscv/interp_bilinear_fp16s.h @@ -128,6 +128,7 @@ static void resize_bilinear_image_fp16s(const Mat& src, Mat& dst, float* alpha, float* rows1p = rows1; __fp16* Dp = dst.row<__fp16>(dy); +#if __riscv_zvfh int n = w; while (n > 0) { @@ -145,6 +146,12 @@ static void resize_bilinear_image_fp16s(const Mat& src, Mat& dst, float* alpha, rows1p += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < w; i++) + { + *Dp++ = (__fp16)(*rows0p++ * b0 + *rows1p++ * b1); + } +#endif // __riscv_zvfh beta += 2; } @@ -229,6 +236,7 @@ static void resize_bilinear_image_fp16sa(const Mat& src, Mat& dst, __fp16* alpha __fp16* rows1p = rows1; __fp16* Dp = dst.row<__fp16>(dy); +#if __riscv_zvfh int n = w; while (n > 0) { @@ -246,6 +254,12 @@ static void resize_bilinear_image_fp16sa(const Mat& src, Mat& dst, __fp16* alpha rows1p += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < w; i++) + { + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; + } +#endif // __riscv_zvfh beta += 2; } diff --git a/src/layer/riscv/interp_riscv.cpp b/src/layer/riscv/interp_riscv.cpp index 0d7e9d7cd2d..1ecb2eaf96d 100644 --- a/src/layer/riscv/interp_riscv.cpp +++ b/src/layer/riscv/interp_riscv.cpp @@ -19,6 +19,8 @@ #include "riscv_usability.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { #include "interp_bicubic.h" @@ -33,10 +35,14 @@ Interp_riscv::Interp_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -45,7 +51,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector const Mat& reference_blob = bottom_blobs[1]; Mat& top_blob = top_blobs[0]; -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/interp_riscv.h b/src/layer/riscv/interp_riscv.h index 4b79c755edc..e9510c4afeb 100644 --- a/src/layer/riscv/interp_riscv.h +++ b/src/layer/riscv/interp_riscv.h @@ -27,7 +27,7 @@ class Interp_riscv : public Interp virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; int forward_fp16sa(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #endif diff --git a/src/layer/riscv/interp_riscv_zvfh.cpp b/src/layer/riscv/interp_riscv_zfh.cpp similarity index 98% rename from src/layer/riscv/interp_riscv_zvfh.cpp rename to src/layer/riscv/interp_riscv_zfh.cpp index 7ebbcc236af..deb39a7bb58 100644 --- a/src/layer/riscv/interp_riscv_zvfh.cpp +++ b/src/layer/riscv/interp_riscv_zfh.cpp @@ -24,19 +24,21 @@ namespace ncnn { #include "interp_bicubic.h" #include "interp_bilinear.h" -#if __riscv_vector -#if __riscv_zvfh +#if NCNN_ZFH #include "interp_bicubic_fp16s.h" -#include "interp_bicubic_packn_fp16s.h" #include "interp_bilinear_fp16s.h" +#if __riscv_zvfh +#include "interp_bicubic_packn_fp16s.h" #include "interp_bilinear_packn_fp16s.h" #endif #endif -#if __riscv_zvfh +#if NCNN_ZFH int 
Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const Mat& bottom_blob = bottom_blobs[0]; const Mat& reference_blob = bottom_blobs[1]; @@ -58,6 +60,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (top_blob.empty()) return -100; +#if __riscv_zvfh if (elempack == packn) { const size_t vl = __riscv_vsetvl_e16m1(packn); @@ -72,6 +75,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto return 0; } +#endif // __riscv_zvfh #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < w; q++) @@ -96,6 +100,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (top_blob.empty()) return -100; +#if __riscv_zvfh if (elempack == packn) { if (resize_type == 1) // nearest @@ -199,6 +204,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto return 0; } +#endif // __riscv_zvfh if (resize_type == 1) // nearest { @@ -292,6 +298,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (top_blob.empty()) return -100; +#if __riscv_zvfh if (elempack == packn) { if (resize_type == 1) // nearest @@ -378,6 +385,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto return 0; } +#endif // __riscv_zvfh if (resize_type == 1) // nearest { @@ -460,7 +468,9 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const Mat& bottom_blob = bottom_blobs[0]; const Mat& reference_blob = bottom_blobs[1]; @@ -493,6 +503,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect if (top_blob.empty()) return -100; +#if __riscv_zvfh if (elempack == packn) { if (resize_type == 2) // bilinear @@ -573,6 +584,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect return 0; } +#endif // __riscv_zvfh if (resize_type == 2) // bilinear { @@ -649,6 +661,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect if (top_blob.empty()) return -100; +#if __riscv_zvfh if (elempack == packn) { if (resize_type == 2) // bilinear @@ -703,6 +716,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect return 0; } +#endif // __riscv_zvfh if (resize_type == 2) // bilinear { @@ -756,6 +770,6 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/mish_riscv.cpp b/src/layer/riscv/mish_riscv.cpp index 9d2c2d34d7c..f1e8e477bd0 100644 --- a/src/layer/riscv/mish_riscv.cpp +++ b/src/layer/riscv/mish_riscv.cpp @@ -19,21 +19,27 @@ #include "rvv_mathfun.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { Mish_riscv::Mish_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int Mish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git 
a/src/layer/riscv/mish_riscv.h b/src/layer/riscv/mish_riscv.h index 52ed8210306..a7dc62018a9 100644 --- a/src/layer/riscv/mish_riscv.h +++ b/src/layer/riscv/mish_riscv.h @@ -27,7 +27,7 @@ class Mish_riscv : public Mish virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/mish_riscv_zvfh.cpp b/src/layer/riscv/mish_riscv_zfh.cpp similarity index 82% rename from src/layer/riscv/mish_riscv_zvfh.cpp rename to src/layer/riscv/mish_riscv_zfh.cpp index 3712aa37ef3..6f614b20abb 100644 --- a/src/layer/riscv/mish_riscv_zvfh.cpp +++ b/src/layer/riscv/mish_riscv_zfh.cpp @@ -24,7 +24,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int Mish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -39,18 +39,27 @@ int Mish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m4(n); vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); - _p = __riscv_vfmul_vv_f32m8(_p, tanh_ps(log_ps(__riscv_vfadd_vf_f32m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); + _p = __riscv_vfmul_vv_f32m8(_p, tanh_ps(log_ps(__riscv_vfadd_vf_f32m8(exp_ps(_p, vl), (__fp16)1.f, vl), vl), vl), vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float v = (float)*ptr; + *ptr = (__fp16)(v * tanh(log(exp(v) + 1.f))); + ptr++; + } +#endif // __riscv_zvfh } return 0; @@ -70,22 +79,30 @@ int Mish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - _p = __riscv_vfmul_vv_f16m8(_p, tanh_ps(log_ps(__riscv_vfadd_vf_f16m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); + _p = __riscv_vfmul_vv_f16m8(_p, tanh_ps(log_ps(__riscv_vfadd_vf_f16m8(exp_ps(_p, vl), (__fp16)1.f, vl), vl), vl), vl); __riscv_vse16_v_f16m8(ptr, _p, vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = *ptr * (__fp16)tanh(log(exp((float)*ptr) + 1.f)); + ptr++; + } +#endif // __riscv_zvfh } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/packing_riscv.cpp b/src/layer/riscv/packing_riscv.cpp index 15b206b063c..aef5522d080 100644 --- a/src/layer/riscv/packing_riscv.cpp +++ b/src/layer/riscv/packing_riscv.cpp @@ -27,8 +27,12 @@ namespace ncnn { Packing_riscv::Packing_riscv() { support_packing = true; -#if NCNN_ZVFH +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif support_bf16_storage = true; } @@ -40,7 +44,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (elembits == 8) return forward_int8(bottom_blob, top_blob, opt); -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index d51333551bb..87a9ef2e827 100644 --- 
a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -20,6 +20,8 @@ #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { #if __riscv_vector @@ -30,10 +32,14 @@ Padding_riscv::Padding_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector #if NCNN_BF16 support_bf16_storage = true; @@ -42,7 +48,7 @@ Padding_riscv::Padding_riscv() int Padding_riscv::create_pipeline(const Option& opt) { -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { value_fp16 = float32_to_float16(value); @@ -81,7 +87,7 @@ int Padding_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (elembits == 8) return forward_int8(bottom_blob, top_blob, opt); -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif @@ -293,7 +299,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co // clang-format off // *INDENT-OFF* vuint16m1_t pad_value; -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { pad_value = __riscv_vmv_v_x_u16m1(value_fp16, vl); @@ -334,7 +340,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co // clang-format off // *INDENT-OFF* vuint16m1_t pad_value; -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { pad_value = __riscv_vmv_v_x_u16m1(value_fp16, vl); @@ -382,7 +388,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co // clang-format off // *INDENT-OFF* vuint16m1_t pad_value; -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { pad_value = per_channel_pad_data_size ? __riscv_vle16_v_u16m1((const unsigned short*)per_channel_pad_data_fp16 + q * packn, vl) : __riscv_vmv_v_x_u16m1(value_fp16, vl); @@ -440,7 +446,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co // clang-format off // *INDENT-OFF* vuint16m1_t pad_value; -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { pad_value = per_channel_pad_data_size ? 
__riscv_vle16_v_u16m1((const unsigned short*)per_channel_pad_data_fp16 + q * packn, vl) : __riscv_vmv_v_x_u16m1(value_fp16, vl); diff --git a/src/layer/riscv/pooling_riscv.cpp b/src/layer/riscv/pooling_riscv.cpp index 9091e3e8b2a..92f0521bbfd 100644 --- a/src/layer/riscv/pooling_riscv.cpp +++ b/src/layer/riscv/pooling_riscv.cpp @@ -18,9 +18,10 @@ #if __riscv_vector #include +#include "riscv_usability.h" #endif // __riscv_vector -#include "riscv_usability.h" +#include "cpu.h" namespace ncnn { @@ -28,10 +29,14 @@ Pooling_riscv::Pooling_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int Pooling_riscv::create_pipeline(const Option& /*opt*/) @@ -55,7 +60,7 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& return Pooling::forward(bottom_blob, top_blob, opt); } -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/pooling_riscv.h b/src/layer/riscv/pooling_riscv.h index 4c14577b3ca..8f14df8fabb 100644 --- a/src/layer/riscv/pooling_riscv.h +++ b/src/layer/riscv/pooling_riscv.h @@ -28,7 +28,7 @@ class Pooling_riscv : public Pooling virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/pooling_riscv_zvfh.cpp b/src/layer/riscv/pooling_riscv_zfh.cpp similarity index 98% rename from src/layer/riscv/pooling_riscv_zvfh.cpp rename to src/layer/riscv/pooling_riscv_zfh.cpp index 214c56975a0..8da0afbb737 100644 --- a/src/layer/riscv/pooling_riscv_zvfh.cpp +++ b/src/layer/riscv/pooling_riscv_zfh.cpp @@ -18,20 +18,21 @@ #if __riscv_vector #include -#endif // __riscv_vector - #include "riscv_usability.h" +#endif // __riscv_vector namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // max value in NxN window // avg value in NxN window +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh int w = bottom_blob.w; int h = bottom_blob.h; @@ -51,6 +52,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op if (pooling_type == PoolMethod_MAX) { +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -70,6 +72,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op __riscv_vse16_v_f16m1(outptr + q * packn, _max, vl); } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -92,6 +95,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op if (pooling_type == PoolMethod_AVE) { +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -113,6 +117,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op __riscv_vse16_v_f16m1(outptr + q * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl); } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -174,6 +179,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op if (pooling_type == 
PoolMethod_MAX) { +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -203,6 +209,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op } } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -248,6 +255,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom; } +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -301,6 +309,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op } } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -358,6 +367,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op if (avgpool_count_include_pad == 1) { +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -390,6 +400,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op } } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -436,8 +447,10 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O return forward_fp16s(bottom_blob, top_blob, opt); } +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh int w = bottom_blob.w; int h = bottom_blob.h; @@ -496,6 +509,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom; } +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -512,7 +526,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O { int sx0 = j * stride_w; - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); int area = 0; for (int ki = 0; ki < kernel_h; ki++) @@ -549,6 +563,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O } } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -606,6 +621,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O if (avgpool_count_include_pad == 1) { +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -622,7 +638,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O { const __fp16* sptr = m.row(i * stride_h) + j * stride_w * packn; - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); for (int k = 0; k < maxk; k++) { @@ -638,6 +654,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O } } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -673,6 +690,6 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/prelu_riscv.cpp b/src/layer/riscv/prelu_riscv.cpp index 75a1876519b..19682cd1171 100644 --- a/src/layer/riscv/prelu_riscv.cpp +++ b/src/layer/riscv/prelu_riscv.cpp @@ -18,21 +18,27 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { PReLU_riscv::PReLU_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); 
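// The constructor pattern above, repeated for every layer this patch touches:
// a vector build emits RVV fp16 kernels, so fp16 storage must be gated on the
// runtime Zvfh probe; a scalar build only needs scalar Zfh. Condensed sketch
// under NCNN_ZFH, using the cpu.h probes the patch includes (the helper name
// is illustrative, not ncnn API):
static bool runtime_supports_fp16_storage()
{
#if __riscv_vector
    return cpu_support_riscv_zvfh(); // fp16 kernels use vector intrinsics
#else
    return cpu_support_riscv_zfh(); // scalar __fp16 arithmetic only
#endif
}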
#endif #endif } int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -44,23 +50,19 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const } #endif - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; int elempack = bottom_top_blob.elempack; int dims = bottom_top_blob.dims; -#if __riscv_vector + if (dims == 1) { int w = bottom_top_blob.w; float* ptr = bottom_top_blob; - const float* ptr_slope = slope_data; if (num_slope > 1) { - int n = w * elempack; +#if __riscv_vector + const float* ptr_slope = slope_data; - // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * elempack; while (n > 0) { size_t vl = __riscv_vsetvl_e32m8(n); @@ -75,13 +77,21 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const ptr_slope += vl; n -= vl; } +#else // __riscv_vector + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < 0) + ptr[i] *= slope_data[i]; + } +#endif // __riscv_vector } else { float slope = slope_data[0]; +#if __riscv_vector int n = w * elempack; - // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { size_t vl = __riscv_vsetvl_e32m8(n); @@ -94,6 +104,14 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const ptr += vl; n -= vl; } +#else // __riscv_vector + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < 0) + ptr[i] *= slope; + } +#endif // __riscv_vector } } @@ -106,6 +124,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const for (int i = 0; i < h; i++) { float* ptr = bottom_top_blob.row(i); +#if __riscv_vector if (num_slope > 1) { for (int j = 0; j < w; j++) @@ -146,6 +165,15 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const n -= vl; } } +#else // __riscv_vector + float slope = num_slope > 1 ? slope_data[i] : slope_data[0]; + + for (int j = 0; j < w; j++) + { + if (ptr[j] < 0) + ptr[j] *= slope; + } +#endif // __riscv_vector } } @@ -160,6 +188,8 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const for (int q = 0; q < channels; q++) { float* ptr = bottom_top_blob.channel(q); + +#if __riscv_vector int n = size * elempack; if (num_slope > 1 && elempack != 1) @@ -202,68 +232,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const n -= vl; } } - } - } - -#else - if (dims == 1) - { - int w = bottom_top_blob.w; - - float* ptr = bottom_top_blob; - - if (num_slope > 1) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - if (ptr[i] < 0) - ptr[i] *= slope_data[i]; - } - } - else - { - float slope = slope_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - if (ptr[i] < 0) - ptr[i] *= slope; - } - } - } - - if (dims == 2) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - float* ptr = bottom_top_blob.row(i); - float slope = num_slope > 1 ? 
slope_data[i] : slope_data[0]; - - for (int j = 0; j < w; j++) - { - if (ptr[j] < 0) - ptr[j] *= slope; - } - } - } - - if (dims == 3) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* ptr = bottom_top_blob.channel(q); +#else // __riscv_vector float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; for (int i = 0; i < size; i++) @@ -271,11 +240,10 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const if (ptr[i] < 0) ptr[i] *= slope; } +#endif // __riscv_vector } } -#endif - return 0; } diff --git a/src/layer/riscv/prelu_riscv.h b/src/layer/riscv/prelu_riscv.h index d957b4e1474..4f56f8ce1ab 100644 --- a/src/layer/riscv/prelu_riscv.h +++ b/src/layer/riscv/prelu_riscv.h @@ -27,7 +27,7 @@ class PReLU_riscv : public PReLU virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/prelu_riscv_zvfh.cpp b/src/layer/riscv/prelu_riscv_zfh.cpp similarity index 80% rename from src/layer/riscv/prelu_riscv_zvfh.cpp rename to src/layer/riscv/prelu_riscv_zfh.cpp index b84ef6d4497..565f457bcd6 100644 --- a/src/layer/riscv/prelu_riscv_zvfh.cpp +++ b/src/layer/riscv/prelu_riscv_zfh.cpp @@ -20,15 +20,12 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH //fp16s(a) //hint: slope always store as fp32 int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int size = w * h; int elempack = bottom_top_blob.elempack; int dims = bottom_top_blob.dims; @@ -36,19 +33,19 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { int w = bottom_top_blob.w; __fp16* ptr = bottom_top_blob; - const float* ptr_slope = slope_data; if (num_slope > 1) { - int n = w * elempack; +#if __riscv_zvfh + const float* ptr_slope = slope_data; - // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * elempack; while (n > 0) { size_t vl = __riscv_vsetvl_e16m4(n); vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = __riscv_vle32_v_f32m8(ptr_slope, vl); - vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vv_f32m8_mu(_lower, _p, _p, _slope, vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); @@ -56,18 +53,26 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) ptr_slope += vl; n -= vl; } +#else // __riscv_zvfh + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] = (__fp16)((float)ptr[i] * slope_data[i]); + } +#endif // __riscv_zvfh } else { float slope = slope_data[0]; +#if __riscv_zvfh int n = w * elempack; - // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { size_t vl = __riscv_vsetvl_e16m4(n); vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); - vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vf_f32m8_mu(_lower, _p, _p, slope, vl); 
__riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); @@ -75,6 +80,14 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) ptr += vl; n -= vl; } +#else // __riscv_zvfh + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] = (__fp16)((float)ptr[i] * slope); + } +#endif // __riscv_zvfh } } @@ -87,6 +100,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (int i = 0; i < h; i++) { __fp16* ptr = bottom_top_blob.row<__fp16>(i); +#if __riscv_zvfh if (num_slope > 1) { for (int j = 0; j < w; j++) @@ -100,7 +114,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = __riscv_vle32_v_f32m8(ptr_slope, vl); - vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vv_f32m8_mu(_lower, _p, _p, _slope, vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); @@ -118,7 +132,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { size_t vl = __riscv_vsetvl_e16m4(n); vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); - vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vf_f32m8_mu(_lower, _p, _p, slope, vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); @@ -127,6 +141,15 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) n -= vl; } } +#else // __riscv_zvfh + float slope = num_slope > 1 ? slope_data[i] : slope_data[0]; + + for (int j = 0; j < w; j++) + { + if (ptr[j] < (__fp16)0.f) + ptr[j] = (__fp16)((float)ptr[j] * slope); + } +#endif // __riscv_zvfh } } @@ -141,6 +164,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (int q = 0; q < channels; q++) { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size * elempack; if (num_slope > 1 && elempack != 1) @@ -155,7 +179,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = __riscv_vle32_v_f32m8(slope_ptr, vl); - vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vv_f32m8_mu(_lower, _p, _p, _slope, vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); @@ -175,7 +199,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) size_t vl = __riscv_vsetvl_e16m4(n); vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); - vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vf_f32m8_mu(_lower, _p, _p, slope, vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); @@ -183,6 +207,15 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) n -= vl; } } +#else // __riscv_zvfh + float slope = num_slope > 1 ? 
slope_data[q] : slope_data[0]; + + for (int i = 0; i < size; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] = (__fp16)((float)ptr[i] * slope); + } +#endif // __riscv_zvfh } } @@ -191,9 +224,6 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int size = w * h; int elempack = bottom_top_blob.elempack; int dims = bottom_top_blob.dims; @@ -201,18 +231,18 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) { int w = bottom_top_blob.w; __fp16* ptr = bottom_top_blob; - const float* ptr_slope = slope_data; if (num_slope > 1) { - int n = w * elempack; +#if __riscv_zvfh + const float* ptr_slope = slope_data; - // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * elempack; while (n > 0) { size_t vl = __riscv_vsetvl_e16m4(n); vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_slope, vl), vl); - vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vv_f16m4_mu(_lower, _p, _p, _slope, vl); __riscv_vse16_v_f16m4(ptr, _p, vl); @@ -221,18 +251,26 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) ptr_slope += vl; n -= vl; } +#else // __riscv_zvfh + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] *= (__fp16)slope_data[i]; + } +#endif // __riscv_zvfh } else { - __fp16 slope = slope_data[0]; + __fp16 slope = (__fp16)slope_data[0]; +#if __riscv_zvfh int n = w * elempack; - // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, .0f, vl); + vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vf_f16m8_mu(_lower, _p, _p, slope, vl); __riscv_vse16_v_f16m8(ptr, _p, vl); @@ -240,6 +278,14 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) ptr += vl; n -= vl; } +#else // __riscv_zvfh + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] *= slope; + } +#endif // __riscv_zvfh } } @@ -252,6 +298,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) for (int i = 0; i < h; i++) { __fp16* ptr = bottom_top_blob.row<__fp16>(i); +#if __riscv_zvfh if (num_slope > 1) { for (int j = 0; j < w; j++) @@ -265,7 +312,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_slope, vl), vl); - vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vv_f16m4_mu(_lower, _p, _p, _slope, vl); __riscv_vse16_v_f16m4(ptr, _p, vl); @@ -277,13 +324,13 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) } else { - __fp16 slope = slope_data[0]; + __fp16 slope = (__fp16)slope_data[0]; int n = w * elempack; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, .0f, 
vl); + vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vf_f16m8_mu(_lower, _p, _p, slope, vl); __riscv_vse16_v_f16m8(ptr, _p, vl); @@ -292,6 +339,15 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) n -= vl; } } +#else // __riscv_zvfh + __fp16 slope = num_slope > 1 ? (__fp16)slope_data[i] : (__fp16)slope_data[0]; + + for (int j = 0; j < w; j++) + { + if (ptr[j] < (__fp16)0.f) + ptr[j] *= slope; + } +#endif // __riscv_zvfh } } @@ -306,6 +362,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) for (int q = 0; q < channels; q++) { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size * elempack; if (num_slope > 1 && elempack != 1) @@ -320,7 +377,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(slope_ptr, vl), vl); - vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vv_f16m4_mu(_lower, _p, _p, _slope, vl); __riscv_vse16_v_f16m4(ptr, _p, vl); @@ -340,7 +397,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, .0f, vl); + vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vf_f16m8_mu(_lower, _p, _p, (__fp16)slope, vl); __riscv_vse16_v_f16m8(ptr, _p, vl); @@ -348,11 +405,20 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) n -= vl; } } +#else // __riscv_zvfh + __fp16 slope = num_slope > 1 ? 
(__fp16)slope_data[q] : (__fp16)slope_data[0]; + + for (int i = 0; i < size; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] *= slope; + } +#endif // __riscv_zvfh } } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/relu_riscv.cpp b/src/layer/riscv/relu_riscv.cpp index cad03e24bfb..fe4291331c7 100644 --- a/src/layer/riscv/relu_riscv.cpp +++ b/src/layer/riscv/relu_riscv.cpp @@ -18,21 +18,27 @@ #include <riscv_vector.h> #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { ReLU_riscv::ReLU_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); #endif #endif } int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/relu_riscv.h b/src/layer/riscv/relu_riscv.h index 9dd6c503bdb..7fae384abd1 100644 --- a/src/layer/riscv/relu_riscv.h +++ b/src/layer/riscv/relu_riscv.h @@ -27,7 +27,7 @@ class ReLU_riscv : public ReLU virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/relu_riscv_zvfh.cpp b/src/layer/riscv/relu_riscv_zfh.cpp similarity index 81% rename from src/layer/riscv/relu_riscv_zvfh.cpp rename to src/layer/riscv/relu_riscv_zfh.cpp index bca2891a919..b4503421755 100644 --- a/src/layer/riscv/relu_riscv_zvfh.cpp +++ b/src/layer/riscv/relu_riscv_zfh.cpp @@ -20,7 +20,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -36,6 +36,7 @@ int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c __fp16* ptr = bottom_top_blob.channel(q); if (slope == 0.f) { +#if __riscv_zvfh int n = size; while (n > 0) { @@ -48,27 +49,44 @@ int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + if (*ptr < (__fp16)0.f) + *ptr = (__fp16)0.f; + ptr++; + } +#endif // __riscv_zvfh } else { - int n = size; __fp16 _slope = (__fp16)slope; +#if __riscv_zvfh + int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - _p = __riscv_vfmul_vf_f16m8_mu(__riscv_vmflt_vf_f16m8_b2(_p, .0f, vl), _p, _p, _slope, vl); + _p = __riscv_vfmul_vf_f16m8_mu(__riscv_vmflt_vf_f16m8_b2(_p, (__fp16)0.f, vl), _p, _p, _slope, vl); __riscv_vse16_v_f16m8(ptr, _p, vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + if (*ptr < (__fp16)0.f) + *ptr *= _slope; + ptr++; + } +#endif // __riscv_zvfh } } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index e5c60c882d1..3983baee8a5 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -411,57 +411,10 @@ static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, vfloat32m1_t& _r7l, vfloat32m1_t& _r7h, size_t vl) { float tmp[64]; -#if __riscv_vector -#if 1 //__riscv_v_intrinsic > 12000 -#warning A vfloat32m1x8_t _rl = __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l,
_r4l, _r5l, _r6l, _r7l); vfloat32m1x8_t _rh = __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h); -#else -#warning B - vfloat32m1x8_t _rl = vfloat32m1x8_t(); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 0, _r0l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 1, _r1l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 2, _r2l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 3, _r3l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 4, _r4l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 5, _r5l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 6, _r6l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 7, _r7l); - vfloat32m1x8_t _rh = vfloat32m1x8_t(); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 0, _r0h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 1, _r1h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 2, _r2h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 3, _r3h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 4, _r4h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 5, _r5h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 6, _r6h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 7, _r7h); -#endif __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl); __riscv_vsseg8e32_v_f32m1x8(&tmp[32], _rh, vl); -#elif __riscv_xtheadvector - vfloat32m1x8_t _rl = vfloat32m1x8_t(); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 0, _r0l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 1, _r1l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 2, _r2l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 3, _r3l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 4, _r4l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 5, _r5l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 6, _r6l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 7, _r7l); - vfloat32m1x8_t _rh = vfloat32m1x8_t(); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 0, _r0h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 1, _r1h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 2, _r2h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 3, _r3h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 4, _r4h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 5, _r5h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 6, _r6h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 7, _r7h); - __riscv_th_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl); - __riscv_th_vsseg8e32_v_f32m1x8(&tmp[32], _rh, vl); -#endif - float* ptr = (float*)tmp; _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); _r0h = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl); @@ -484,15 +437,7 @@ static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, static inline void transpose4x4_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, size_t vl) { float tmp[16]; -#if __riscv_vector && __riscv_v_intrinsic > 12000 vfloat32m1x4_t _r = __riscv_vcreate_v_f32m1x4(_r0, _r1, _r2, _r3); -#else - vfloat32m1x4_t _r = vfloat32m1x4_t(); - _r = __riscv_vset_v_f32m1_f32m1x4(_r, 0, _r0); - _r = __riscv_vset_v_f32m1_f32m1x4(_r, 1, _r1); - _r = __riscv_vset_v_f32m1_f32m1x4(_r, 2, _r2); - _r = __riscv_vset_v_f32m1_f32m1x4(_r, 3, _r3); -#endif __riscv_vsseg4e32_v_f32m1x4(&tmp[0], _r, vl); float* ptr = (float*)tmp; _r0 = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); @@ -515,7 +460,6 @@ static inline void transpose8x12_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, vfloat32m1_t& _rbl, vfloat32m1_t& _rbh, size_t vl) { float tmp[8][12]; - __riscv_vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0l, vl); __riscv_vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 12, _r0h, vl); __riscv_vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1l, vl); @@ -577,43 +521,12 @@ static inline 
void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vflo vfloat32m1_t& _r7l, vfloat32m1_t& _r7m, vfloat32m1_t& _r7h, size_t vl) { float tmp[96]; -#if __riscv_vector && __riscv_v_intrinsic > 12000 vfloat32m1x8_t _rl = __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l); - vfloat32m1x8_t _rh = __riscv_vcreate_v_f32m1x8(_r0m, _r1m, _r2m, _r3m, _r4m, _r5m, _r6m, _r7m); + vfloat32m1x8_t _rm = __riscv_vcreate_v_f32m1x8(_r0m, _r1m, _r2m, _r3m, _r4m, _r5m, _r6m, _r7m); vfloat32m1x8_t _rh = __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h); -#else - vfloat32m1x8_t _rl = vfloat32m1x8_t(); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 0, _r0l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 1, _r1l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 2, _r2l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 3, _r3l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 4, _r4l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 5, _r5l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 6, _r6l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 7, _r7l); - vfloat32m1x8_t _rm = vfloat32m1x8_t(); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 0, _r0m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 1, _r1m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 2, _r2m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 3, _r3m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 4, _r4m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 5, _r5m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 6, _r6m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 7, _r7m); - vfloat32m1x8_t _rh = vfloat32m1x8_t(); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 0, _r0h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 1, _r1h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 2, _r2h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 3, _r3h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 4, _r4h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 5, _r5h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 6, _r6h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 7, _r7h); -#endif __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl); __riscv_vsseg8e32_v_f32m1x8(&tmp[32], _rm, vl); __riscv_vsseg8e32_v_f32m1x8(&tmp[64], _rh, vl); - float* ptr = (float*)tmp; _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); _r0m = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl); @@ -644,21 +557,8 @@ static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vflo static inline void transpose4x8_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, size_t vl) { float tmp[32]; -#if __riscv_vector && __riscv_v_intrinsic > 12000 vfloat32m1x8_t _r = __riscv_vcreate_v_f32m1x8(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); -#else - vfloat32m1x8_t _r = vfloat32m1x8_t(); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 0, _r0); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 1, _r1); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 2, _r2); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 3, _r3); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 4, _r4); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 5, _r5); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 6, _r6); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 7, _r7); -#endif __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _r, vl); - float* ptr = (float*)tmp; _r0 = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); _r1 = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl); @@ -706,21 +606,8 @@ static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, size_t vl) { float tmp[32]; -#if __riscv_vector && __riscv_v_intrinsic > 12000 vfloat32m1x4_t _rl = 
__riscv_vcreate_v_f32m1x4(_r0l, _r1l, _r2l, _r3l); vfloat32m1x4_t _rh = __riscv_vcreate_v_f32m1x4(_r0h, _r1h, _r2h, _r3h); -#else - vfloat32m1x4_t _rl = vfloat32m1x4_t(); - _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 0, _r0l); - _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 1, _r1l); - _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 2, _r2l); - _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 3, _r3l); - vfloat32m1x4_t _rh = vfloat32m1x4_t(); - _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 0, _r0h); - _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 1, _r1h); - _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 2, _r2h); - _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 3, _r3h); -#endif __riscv_vsseg4e32_v_f32m1x4(&tmp[0], _rl, vl); __riscv_vsseg4e32_v_f32m1x4(&tmp[16], _rh, vl); float* ptr = (float*)tmp; diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index 107492c832b..69be624a23a 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -562,7 +562,7 @@ _RVV_FLOAT32_ERFC_OP(8, 4) //TODO rvv optimize #define _RVV_FLOAT32_ATAN2_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t atan2_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, size_t vl) \ + static inline vfloat32m##LMUL##_t atan2_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, volatile size_t vl) \ { \ std::vector<float> tmpx(vl); \ std::vector<float> tmpy(vl); \ diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index ade4b05f237..0db6b0c15c2 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -411,9 +411,9 @@ _RVV_FLOAT16_SIGMOID_OP(8, 2) return __riscv_vle16_v_f16m##LMUL(tmpx.data(), vl); \ } -_RVV_FLOAT16_ATAN2_OP(1, 32) -_RVV_FLOAT16_ATAN2_OP(2, 16) -_RVV_FLOAT16_ATAN2_OP(4, 8) -_RVV_FLOAT16_ATAN2_OP(8, 4) +_RVV_FLOAT16_ATAN2_OP(1, 16) +_RVV_FLOAT16_ATAN2_OP(2, 8) +_RVV_FLOAT16_ATAN2_OP(4, 4) +_RVV_FLOAT16_ATAN2_OP(8, 2) #endif // RVV_MATHFUN_FP16S_H diff --git a/src/layer/riscv/sigmoid_riscv.cpp b/src/layer/riscv/sigmoid_riscv.cpp index 2311231d6d2..e929ba82a4e 100644 --- a/src/layer/riscv/sigmoid_riscv.cpp +++ b/src/layer/riscv/sigmoid_riscv.cpp @@ -19,21 +19,27 @@ #include "rvv_mathfun.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { Sigmoid_riscv::Sigmoid_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int Sigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/sigmoid_riscv.h b/src/layer/riscv/sigmoid_riscv.h index 02fd3adb4cb..67378486789 100644 --- a/src/layer/riscv/sigmoid_riscv.h +++ b/src/layer/riscv/sigmoid_riscv.h @@ -27,7 +27,7 @@ class Sigmoid_riscv : public Sigmoid virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/sigmoid_riscv_zvfh.cpp b/src/layer/riscv/sigmoid_riscv_zfh.cpp similarity index 86% rename from src/layer/riscv/sigmoid_riscv_zvfh.cpp rename to src/layer/riscv/sigmoid_riscv_zfh.cpp index 776d0b44748..f4806f64e8f 100644 ---
b/src/layer/riscv/sigmoid_riscv_zfh.cpp @@ -24,7 +24,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int Sigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -39,6 +39,7 @@ int Sigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { @@ -51,6 +52,14 @@ int Sigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (__fp16)(1.f / (1.f + exp(-(float)*ptr))); + + ptr++; + } +#endif // __riscv_zvfh } return 0; @@ -70,6 +79,7 @@ int Sigmoid_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { @@ -82,10 +92,18 @@ int Sigmoid_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (__fp16)1.f / ((__fp16)1.f + (__fp16)exp((float)-*ptr)); + + ptr++; + } +#endif // __riscv_zvfh } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/swish_riscv.cpp b/src/layer/riscv/swish_riscv.cpp index 4c6db7a31ea..f09e4065d2a 100644 --- a/src/layer/riscv/swish_riscv.cpp +++ b/src/layer/riscv/swish_riscv.cpp @@ -19,21 +19,27 @@ #include "rvv_mathfun.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { Swish_riscv::Swish_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int Swish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/swish_riscv.h b/src/layer/riscv/swish_riscv.h index 977d34b8ef0..971b5cb2b40 100644 --- a/src/layer/riscv/swish_riscv.h +++ b/src/layer/riscv/swish_riscv.h @@ -27,7 +27,7 @@ class Swish_riscv : public Swish virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/swish_riscv_zvfh.cpp b/src/layer/riscv/swish_riscv_zfh.cpp similarity index 82% rename from src/layer/riscv/swish_riscv_zvfh.cpp rename to src/layer/riscv/swish_riscv_zfh.cpp index bfa615b8c76..2fa4e028cd2 100644 --- a/src/layer/riscv/swish_riscv_zvfh.cpp +++ b/src/layer/riscv/swish_riscv_zfh.cpp @@ -24,7 +24,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int Swish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -39,18 +39,27 @@ int Swish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m4(n); vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); - _p = __riscv_vfdiv_vv_f32m8(_p, __riscv_vfadd_vf_f32m8(exp_ps(__riscv_vfneg_v_f32m8(_p, vl), vl), 1.f, vl), vl); + _p = __riscv_vfdiv_vv_f32m8(_p, 
__riscv_vfadd_vf_f32m8(exp_ps(__riscv_vfneg_v_f32m8(_p, vl), vl), (__fp16)1.f, vl), vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float v = (float)*ptr; + *ptr = (__fp16)(v / (1.f + exp(-v))); + ptr++; + } +#endif // __riscv_zvfh } return 0; @@ -70,22 +79,30 @@ int Swish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - _p = __riscv_vfdiv_vv_f16m8(_p, __riscv_vfadd_vf_f16m8(exp_ps(__riscv_vfneg_v_f16m8(_p, vl), vl), 1.f, vl), vl); + _p = __riscv_vfdiv_vv_f16m8(_p, __riscv_vfadd_vf_f16m8(exp_ps(__riscv_vfneg_v_f16m8(_p, vl), vl), (__fp16)1.f, vl), vl); __riscv_vse16_v_f16m8(ptr, _p, vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = *ptr / ((__fp16)1.f + (__fp16)exp((float)-*ptr)); + ptr++; + } +#endif // __riscv_zvfh } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/tanh_riscv.cpp b/src/layer/riscv/tanh_riscv.cpp index e799e782819..8c3bdb22b3c 100644 --- a/src/layer/riscv/tanh_riscv.cpp +++ b/src/layer/riscv/tanh_riscv.cpp @@ -19,21 +19,27 @@ #include "rvv_mathfun.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { TanH_riscv::TanH_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int TanH_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/tanh_riscv.h b/src/layer/riscv/tanh_riscv.h index 22c919a9943..69cb0d4e7cc 100644 --- a/src/layer/riscv/tanh_riscv.h +++ b/src/layer/riscv/tanh_riscv.h @@ -27,7 +27,7 @@ class TanH_riscv : public TanH virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/tanh_riscv_zvfh.cpp b/src/layer/riscv/tanh_riscv_zfh.cpp similarity index 87% rename from src/layer/riscv/tanh_riscv_zvfh.cpp rename to src/layer/riscv/tanh_riscv_zfh.cpp index 24f7b2c4ad8..6cdb9113231 100644 --- a/src/layer/riscv/tanh_riscv_zvfh.cpp +++ b/src/layer/riscv/tanh_riscv_zfh.cpp @@ -24,7 +24,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int TanH_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -39,6 +39,7 @@ int TanH_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { @@ -51,6 +52,13 @@ int TanH_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (__fp16)tanh((float)*ptr); + ptr++; + } +#endif // __riscv_zvfh } return 0; @@ -70,6 +78,7 @@ int TanH_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) { __fp16* ptr = bottom_top_blob.channel(q); 
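// NOTE: throughout this patch the #if __riscv_zvfh branch is the vector body
// and the #else branch is the scalar __fp16 fallback built for plain zfh targets.
// The fp16sa variants compute directly in half precision (f16m8 intrinsics),
// while the fp16s variants widen to f32m8 first; compare the swish fp16s loop
// above (vfwcvt_f_f_v_f32m8 on load, vfncvt_f_f_w_f16m4 on store).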
+#if __riscv_zvfh int n = size; while (n > 0) { @@ -82,10 +91,17 @@ int TanH_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (__fp16)tanh((float)*ptr); + ptr++; + } +#endif // __riscv_zvfh } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/unaryop_riscv.cpp b/src/layer/riscv/unaryop_riscv.cpp index 58af0802d33..30fb05fb837 100644 --- a/src/layer/riscv/unaryop_riscv.cpp +++ b/src/layer/riscv/unaryop_riscv.cpp @@ -19,16 +19,22 @@ #include "rvv_mathfun.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { UnaryOp_riscv::UnaryOp_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } #if __riscv_vector @@ -291,7 +297,7 @@ struct unary_op_trunc int UnaryOp_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/unaryop_riscv.h b/src/layer/riscv/unaryop_riscv.h index c21ab3ec95b..c3db29bb4aa 100644 --- a/src/layer/riscv/unaryop_riscv.h +++ b/src/layer/riscv/unaryop_riscv.h @@ -27,7 +27,7 @@ class UnaryOp_riscv : public UnaryOp virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/unaryop_riscv_zvfh.cpp b/src/layer/riscv/unaryop_riscv_zfh.cpp similarity index 69% rename from src/layer/riscv/unaryop_riscv_zvfh.cpp rename to src/layer/riscv/unaryop_riscv_zfh.cpp index e706002dd3c..09f26cb7e28 100644 --- a/src/layer/riscv/unaryop_riscv_zvfh.cpp +++ b/src/layer/riscv/unaryop_riscv_zfh.cpp @@ -22,9 +22,11 @@ #endif #endif // __riscv_vector +#include <fenv.h> + namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH template<typename Op> static int unary_op_inplace_fp16s(Mat& a, const Option& opt) { @@ -42,6 +44,7 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt) { __fp16* ptr = a.channel(q); +#if __riscv_zvfh int n = size * elempack; while (n > 0) { @@ -54,6 +57,12 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt) ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + ptr[i] = op(ptr[i]); + } +#endif // __riscv_zvfh } return 0; @@ -63,101 +72,179 @@ namespace UnaryOp_riscv_functor { struct unary_op_abs_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { - return __riscv_vfsgnj_vf_f16m8(x, 1.f, vl); + return __riscv_vfsgnj_vf_f16m8(x, (__fp16)1.f, vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)fabsf((float)x); } +#endif // __riscv_zvfh }; struct unary_op_neg_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return __riscv_vfneg_v_f16m8(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return -x; + } +#endif // __riscv_zvfh }; struct unary_op_floor_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return __riscv_vfcvt_f_x_v_f16m8(__riscv_vfcvt_x_f_v_i16m8_rm(x, __RISCV_FRM_RDN, vl), vl); } +#else // __riscv_zvfh + __fp16
operator()(const __fp16& x) const + { + return (__fp16)floorf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_ceil_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return __riscv_vfcvt_f_x_v_f16m8(__riscv_vfcvt_x_f_v_i16m8_rm(x, __RISCV_FRM_RUP, vl), vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)ceilf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_square_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return __riscv_vfmul_vv_f16m8(x, x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return x * x; + } +#endif // __riscv_zvfh }; struct unary_op_sqrt_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return __riscv_vfsqrt_v_f16m8(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)sqrtf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_rsqrt_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { #if __riscv_xtheadvector - vfloat16m8_t _reciprocal = __riscv_vfrdiv_vf_f16m8(__riscv_vfsqrt_v_f16m8(x, vl), 1.f, vl); + vfloat16m8_t _reciprocal = __riscv_vfrdiv_vf_f16m8(__riscv_vfsqrt_v_f16m8(x, vl), (__fp16)1.f, vl); #else vfloat16m8_t _reciprocal = __riscv_vfrsqrt7_v_f16m8(x, vl); - _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(__riscv_vfmul_vf_f16m8(x, 0.5f, vl), __riscv_vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); - // _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(__riscv_vfmul_vf_f16m8(x, 0.5f, vl), __riscv_vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); + _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(__riscv_vfmul_vf_f16m8(x, (__fp16)0.5f, vl), __riscv_vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), (__fp16)1.5f, vl), _reciprocal, vl); + // _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(__riscv_vfmul_vf_f16m8(x, (__fp16)0.5f, vl), __riscv_vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), (__fp16)1.5f, vl), _reciprocal, vl); #endif return _reciprocal; } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)(1.f / sqrtf((float)x)); + } +#endif // __riscv_zvfh }; struct unary_op_exp_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return exp_ps(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)expf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_log_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return log_ps(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)logf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_sin_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return sin_ps(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)sinf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_cos_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return cos_ps(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return 
(__fp16)cosf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_tan_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize @@ -169,10 +256,17 @@ struct unary_op_tan_fp16s } return __riscv_vle16_v_f16m8(tmp.data(), vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)tanf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_asin_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize @@ -184,10 +278,17 @@ struct unary_op_asin_fp16s } return __riscv_vle16_v_f16m8(tmp.data(), vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)asin((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_acos_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize @@ -199,10 +300,17 @@ struct unary_op_acos_fp16s } return __riscv_vle16_v_f16m8(tmp.data(), vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)acos((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_atan_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize @@ -214,49 +322,93 @@ struct unary_op_atan_fp16s } return __riscv_vle16_v_f16m8(tmp.data(), vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)atan((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_reciprocal_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { #if __riscv_xtheadvector - vfloat16m8_t _reciprocal = __riscv_vfrdiv_vf_f16m8(x, 1.f, vl); + vfloat16m8_t _reciprocal = __riscv_vfrdiv_vf_f16m8(x, (__fp16)1.f, vl); #else vfloat16m8_t _reciprocal = __riscv_vfrec7_v_f16m8(x, vl); - _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); - // _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); + _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(x, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); + // _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(x, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); #endif return _reciprocal; } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)1.f / x; + } +#endif // __riscv_zvfh }; struct unary_op_tanh_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return tanh_ps(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)tanhf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_log10_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { - return __riscv_vfmul_vf_f16m8(log_ps(x, vl), 0.434294481903, vl); + return __riscv_vfmul_vf_f16m8(log_ps(x, vl), (__fp16)0.434294481903, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)log10f((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_round_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return __riscv_vfcvt_f_x_v_f16m8(__riscv_vfcvt_x_f_v_i16m8(x, vl), vl); } +#else // __riscv_zvfh + __fp16 operator()(const 
__fp16& x) const + { + // round to nearest even +#ifdef FE_TONEAREST + int old_rm = fegetround(); + fesetround(FE_TONEAREST); +#endif + float y = nearbyintf((float)x); +#ifdef FE_TONEAREST + fesetround(old_rm); +#endif + return (__fp16)y; + } +#endif // __riscv_zvfh }; struct unary_op_trunc_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { #if __riscv_xtheadvector @@ -271,12 +423,18 @@ struct unary_op_trunc_fp16s vint16m8_t _floorx = __riscv_vsub_vx_i16m8_mu(_floormask, _xi, _xi, 1, vl); vbool2_t _ceilmask = __riscv_vmflt_vv_f16m8_b2(_xf, x, vl); vint16m8_t _ceilx = __riscv_vadd_vx_i16m8_mu(_ceilmask, _xi, _xi, 1, vl); - vbool2_t _negative = __riscv_vmflt_vf_f16m8_b2(x, 0.f, vl); + vbool2_t _negative = __riscv_vmflt_vf_f16m8_b2(x, (__fp16)0.f, vl); return __riscv_vfcvt_f_x_v_f16m8(__riscv_vmerge_vvm_i16m8(_floorx, _ceilx, _negative, vl), vl); #else return __riscv_vfcvt_f_x_v_f16m8(__riscv_vfcvt_rtz_x_f_v_i16m8(x, vl), vl); #endif } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)truncf((float)x); + } +#endif // __riscv_zvfh }; } // namespace UnaryOp_riscv_functor @@ -347,6 +505,6 @@ int UnaryOp_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/platform.h.in b/src/platform.h.in index d9efed0b3bc..023d5d11023 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -62,6 +62,7 @@ #cmakedefine01 NCNN_LSX #cmakedefine01 NCNN_MMI #cmakedefine01 NCNN_RVV +#cmakedefine01 NCNN_ZFH #cmakedefine01 NCNN_ZVFH #cmakedefine01 NCNN_XTHEADVECTOR #cmakedefine01 NCNN_INT8
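The layering that repeats across every layer above can be condensed into one self-contained kernel. A minimal sketch, assuming the patch's conventions (scale_fp16s is a hypothetical helper, not ncnn code): NCNN_ZFH gates whether the fp16 code is compiled at all, __riscv_zvfh selects the vector or scalar body, and the same _zfh.cpp source is built once with -march=rv64gcv_zfh_zvfh for the vector body and once with -march=rv64gc_zfh -D__fp16=_Float16 for the scalar one.

#if NCNN_ZFH
#if __riscv_zvfh
#include <riscv_vector.h>
#endif

// hypothetical example kernel, not part of the patch
static void scale_fp16s(__fp16* ptr, int n, float k)
{
#if __riscv_zvfh
    // strip-mined RVV loop: vsetvl clamps vl to the remaining element count
    while (n > 0)
    {
        size_t vl = __riscv_vsetvl_e16m8(n);
        vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
        _p = __riscv_vfmul_vf_f16m8(_p, (__fp16)k, vl);
        __riscv_vse16_v_f16m8(ptr, _p, vl);
        ptr += vl;
        n -= vl;
    }
#else // __riscv_zvfh
    // scalar zfh fallback: same arithmetic, one element at a time
    for (int i = 0; i < n; i++)
        ptr[i] = (__fp16)((float)ptr[i] * k);
#endif // __riscv_zvfh
}
#endif // NCNN_ZFH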
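The refinement constants that gained (__fp16) casts in the reciprocal and rsqrt functors are one Newton-Raphson step applied to the roughly 7-bit estimates produced by vfrec7 and vfrsqrt7; a single step about doubles the correct bits, which is enough for __fp16's 11-bit significand. The same algebra in scalar form (illustrative helpers, not ncnn code):

// y0 ~= 1/x from vfrec7; mirrors vfrsub_vf(x * y0, 2.f) then vfmul by y0
static inline float refine_recip(float x, float y0)
{
    return (2.f - x * y0) * y0;
}

// y0 ~= 1/sqrt(x) from vfrsqrt7; mirrors the 0.5f / 1.5f chain in the functor
static inline float refine_rsqrt(float x, float y0)
{
    return (1.5f - 0.5f * x * y0 * y0) * y0;
}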