diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4ca2a9a5158..1463e8a215a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -422,6 +422,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
        set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv")
        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m8_t _s, _w; float _v; size_t vl; _s = __riscv_vfmacc_vf_f32m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_V)
 
+        set(CMAKE_REQUIRED_FLAGS "-march=rv64gc_zfh -D__fp16=_Float16")
+        check_cxx_source_compiles("int main() { __fp16 s, v; s = v * v; return 0; }" NCNN_COMPILER_SUPPORT_RISCV_ZFH)
+
        set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m8_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH)
@@ -432,8 +435,20 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
 
     if(NCNN_COMPILER_SUPPORT_RISCV_V OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
         option(NCNN_RVV "optimize risc-v platform with v extension" ON)
+    else()
+        message(WARNING "The compiler does not support risc-v v or xtheadvector extension. NCNN_RVV will be OFF.")
+    endif()
+
+    if(NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
+        option(NCNN_XTHEADVECTOR "optimize risc-v platform with xtheadvector extension" ON)
+    else()
+        message(WARNING "The compiler does not support risc-v xtheadvector extension. NCNN_XTHEADVECTOR will be OFF.")
+    endif()
+
+    if(NCNN_COMPILER_SUPPORT_RISCV_ZFH)
+        option(NCNN_ZFH "optimize risc-v platform with zfh extension" ON)
         if(NCNN_COMPILER_SUPPORT_RISCV_ZVFH OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
-            if(NCNN_RVV)
+            if(NCNN_RVV AND NCNN_ZFH)
                 option(NCNN_ZVFH "optimize risc-v platform with zvfh extension" ON)
             endif()
         else()
@@ -458,13 +473,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
         # add_definitions(-D__rvv_tuple)
         # endif()
     else()
-        message(WARNING "The compiler does not support risc-v v or xtheadvector extension. NCNN_RVV will be OFF.")
-    endif()
-
-    if(NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
-        option(NCNN_XTHEADVECTOR "optimize risc-v platform with xtheadvector extension" ON)
-    else()
-        message(WARNING "The compiler does not support risc-v xtheadvector extension. NCNN_XTHEADVECTOR will be OFF.")
+        message(WARNING "The compiler does not support risc-v zfh extension. NCNN_ZFH will be OFF.")
     endif()
 endif()
diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake
index 01e1f3cbafb..6a7e16ce0f2 100644
--- a/cmake/ncnn_add_layer.cmake
+++ b/cmake/ncnn_add_layer.cmake
@@ -54,6 +54,28 @@ macro(ncnn_add_arch_opt_source class NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_C
     endif()
 endmacro()
 
+macro(ncnn_add_arch_opt_layer_source class NCNN_TARGET_ARCH_OPT_BASE NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_CFLAGS)
+    set(NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}.cpp)
+
+    if(WITH_LAYER_${name} AND EXISTS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE})
+
+        set(NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE ${CMAKE_CURRENT_BINARY_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}.cpp)
+
+        add_custom_command(
+            OUTPUT ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE}
+            COMMAND ${CMAKE_COMMAND} -DSRC=${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE} -DDST=${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} -DCLASS=${class} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_${NCNN_TARGET_ARCH_OPT}_source.cmake"
+            DEPENDS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE}
+            COMMENT "Generating source ${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}.cpp"
+            VERBATIM
+        )
+        set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES GENERATED TRUE)
+
+        set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES COMPILE_FLAGS ${NCNN_TARGET_ARCH_OPT_CFLAGS})
+
+        list(APPEND ncnn_SRCS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE})
+    endif()
+endmacro()
+
 macro(ncnn_add_layer class)
     string(TOLOWER ${class} name)
 
@@ -394,11 +416,15 @@ macro(ncnn_add_layer class)
         if(NCNN_RUNTIME_CPU AND NCNN_RVV)
             ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv")
         endif()
+        if(NCNN_ZFH)
+            ncnn_add_arch_opt_source(${class} zfh "-march=rv64gc_zfh -D__fp16=_Float16")
+        endif()
         if(NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR)
-            ncnn_add_arch_opt_layer(${class} xtheadvector "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16")
+            ncnn_add_arch_opt_layer(${class} xtheadvector "-march=rv64gc_xtheadvector")
+            ncnn_add_arch_opt_layer_source(${class} zfh xtheadvector "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16")
         endif()
-        if(NCNN_ZVFH)
-            ncnn_add_arch_opt_source(${class} zvfh "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
+        if(NCNN_RUNTIME_CPU AND NCNN_ZVFH)
+            ncnn_add_arch_opt_layer_source(${class} zfh rvv "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
         endif()
     endif()
diff --git a/src/layer/riscv/absval_riscv.cpp b/src/layer/riscv/absval_riscv.cpp
index 8c84af7284c..805fe8f54c8 100644
--- a/src/layer/riscv/absval_riscv.cpp
+++ b/src/layer/riscv/absval_riscv.cpp
@@ -13,24 +13,27 @@
 // specific language governing permissions and limitations under the License.
#include "absval_riscv.h" -#include "cpu.h" #if __riscv_vector #include #endif // __riscv_vector -namespace ncnn { +#include "cpu.h" -#include "absval_fp16.h" +namespace ncnn { AbsVal_riscv::AbsVal_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH || NCNN_XTHEADVECTOR - support_fp16_storage = cpu_support_riscv_zvfh() || cpu_support_riscv_xtheadvector(); -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } #if __riscv_vector @@ -42,10 +45,10 @@ static inline vfloat32m8_t __riscv_vfabs_v_f32m8_absval(vfloat32m8_t op1, size_t int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if __riscv_vector +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); - if (support_fp16_storage && opt.use_fp16_storage && elembits == 16) + if (opt.use_fp16_storage && elembits == 16) { return forward_inplace_fp16s(bottom_top_blob, opt); } @@ -89,12 +92,4 @@ int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } -#if __riscv_vector -int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - absval_fp16(bottom_top_blob, opt); - return 0; -} -#endif // __riscv_vector - } // namespace ncnn diff --git a/src/layer/riscv/absval_riscv.h b/src/layer/riscv/absval_riscv.h index b08a8a7bb31..ca9bde067fa 100644 --- a/src/layer/riscv/absval_riscv.h +++ b/src/layer/riscv/absval_riscv.h @@ -27,7 +27,7 @@ class AbsVal_riscv : public AbsVal virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/absval_fp16.h b/src/layer/riscv/absval_riscv_zfh.cpp similarity index 78% rename from src/layer/riscv/absval_fp16.h rename to src/layer/riscv/absval_riscv_zfh.cpp index b74d3be7187..43bb6a2b0bc 100644 --- a/src/layer/riscv/absval_fp16.h +++ b/src/layer/riscv/absval_riscv_zfh.cpp @@ -12,9 +12,13 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#if NCNN_RUNTIME_CPU && NCNN_ZVFH && __riscv_vector && !__riscv_zvfh -void absval_fp16_zvfh(Mat& bottom_top_blob, const Option& opt); -#endif +#include "absval_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +namespace ncnn { #if __riscv_zvfh static inline vfloat16m8_t __riscv_vfabs_v_f16m8_absval(vfloat16m8_t op1, size_t vl) @@ -23,17 +27,9 @@ static inline vfloat16m8_t __riscv_vfabs_v_f16m8_absval(vfloat16m8_t op1, size_t } #endif // __riscv_zvfh -static void absval_fp16(Mat& bottom_top_blob, const Option& opt) +#if NCNN_ZFH +int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_RUNTIME_CPU && NCNN_ZVFH && __riscv_vector && !__riscv_xtheadvector && !__riscv_zvfh - if (ncnn::cpu_support_riscv_zvfh()) - { - absval_fp16_zvfh(bottom_top_blob, opt); - return; - } -#endif - -#if __riscv_zvfh const int w = bottom_top_blob.w; const int h = bottom_top_blob.h; const int d = bottom_top_blob.d; @@ -46,6 +42,7 @@ static void absval_fp16(Mat& bottom_top_blob, const Option& opt) { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { @@ -58,9 +55,17 @@ static void absval_fp16(Mat& bottom_top_blob, const Option& opt) ptr += vl; n -= vl; } - } -#else - (void)bottom_top_blob; - (void)opt; +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (*ptr > (__fp16)0.f) ? (*ptr) : (-*ptr); + ptr++; + } #endif // __riscv_zvfh + } + + return 0; } +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/absval_riscv_zvfh.cpp b/src/layer/riscv/absval_riscv_zvfh.cpp deleted file mode 100644 index 1eb9554a6b2..00000000000 --- a/src/layer/riscv/absval_riscv_zvfh.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
-
-#include "cpu.h"
-#include "mat.h"
-
-namespace ncnn {
-
-#include "absval_fp16.h"
-
-void absval_fp16_zvfh(Mat& bottom_top_blob, const Option& opt)
-{
-    absval_fp16(bottom_top_blob, opt);
-}
-
-} // namespace ncnn
diff --git a/src/layer/riscv/batchnorm_riscv.cpp b/src/layer/riscv/batchnorm_riscv.cpp
index c12b554933e..e6e5af89033 100644
--- a/src/layer/riscv/batchnorm_riscv.cpp
+++ b/src/layer/riscv/batchnorm_riscv.cpp
@@ -16,9 +16,10 @@
 
 #if __riscv_vector
 #include <riscv_vector.h>
+#include "riscv_usability.h"
 #endif // __riscv_vector
 
-#include "riscv_usability.h"
+#include "cpu.h"
 
 namespace ncnn {
 
@@ -26,15 +27,19 @@ BatchNorm_riscv::BatchNorm_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 }
 
 int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_top_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
@@ -75,7 +80,6 @@ int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
     }
 #else
     int w = bottom_top_blob.w;
-    #pragma omp parallel for num_threads(opt.num_threads)
     for (int i = 0; i < w; i++)
     {
         ptr[i] = b_data[i] * ptr[i] + a_data[i];
diff --git a/src/layer/riscv/batchnorm_riscv.h b/src/layer/riscv/batchnorm_riscv.h
index 8fc761bcca6..9f9b105cae1 100644
--- a/src/layer/riscv/batchnorm_riscv.h
+++ b/src/layer/riscv/batchnorm_riscv.h
@@ -26,7 +26,7 @@ class BatchNorm_riscv : public BatchNorm
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 protected:
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
     int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
 #endif
diff --git a/src/layer/riscv/batchnorm_riscv_zvfh.cpp b/src/layer/riscv/batchnorm_riscv_zfh.cpp
similarity index 87%
rename from src/layer/riscv/batchnorm_riscv_zvfh.cpp
rename to src/layer/riscv/batchnorm_riscv_zfh.cpp
index 2eef3729525..bd2a9be289a 100644
--- a/src/layer/riscv/batchnorm_riscv_zvfh.cpp
+++ b/src/layer/riscv/batchnorm_riscv_zfh.cpp
@@ -16,23 +16,23 @@
 
 #if __riscv_vector
 #include <riscv_vector.h>
-#endif // __riscv_vector
-
 #include "riscv_usability.h"
+#endif // __riscv_vector
 
 namespace ncnn {
 
-#if __riscv_zvfh
+#if NCNN_ZFH
 int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
 {
     int dims = bottom_top_blob.dims;
     int elempack = bottom_top_blob.elempack;
     if (dims == 1)
     {
-        int n = bottom_top_blob.w * elempack;
         __fp16* ptr = bottom_top_blob;
+#if __riscv_zvfh
         const float* ptr_a = a_data;
         const float* ptr_b = b_data;
+        int n = bottom_top_blob.w * elempack;
         while (n > 0)
         {
             size_t vl = __riscv_vsetvl_e16m4(n);
@@ -50,6 +50,13 @@ int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
             ptr_b += vl;
             n -= vl;
         }
+#else // __riscv_zvfh
+        int w = bottom_top_blob.w;
+        for (int i = 0; i < w; i++)
+        {
+            ptr[i] = (__fp16)(b_data[i] * (float)ptr[i] + a_data[i]);
+        }
+#endif // __riscv_zvfh
 
         return 0;
     }
@@ -67,6 +74,7 @@ int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
             float a = a_data[i];
             float b = b_data[i];
 
+#if __riscv_zvfh
             int n = w;
             while (n > 0)
             {
@@ -79,6 +87,12 @@ int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
                 ptr += vl;
                 n -= vl;
             }
+#else // __riscv_zvfh
+            for (int j = 0; j < w; j++)
+            {
+                ptr[j] = (__fp16)(b * (float)ptr[j] + a);
+            }
+#endif // __riscv_zvfh
         }
     }
     if (dims == 3 || dims == 4)
@@ -94,6 +108,7 @@ int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
             float a = a_data[q];
             float b = b_data[q];
 
+#if __riscv_zvfh
             int n = size;
             while (n > 0)
             {
@@ -106,12 +121,19 @@ int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
                 ptr += vl;
                 n -= vl;
             }
+#else // __riscv_zvfh
+            for (int i = 0; i < size; i++)
+            {
+                ptr[i] = (__fp16)(b * (float)ptr[i] + a);
+            }
+#endif // __riscv_zvfh
         }
     }
 
         return 0;
     }
 
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2; // fp16
     if (elempack == packn)
     {
@@ -172,6 +194,7 @@ int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
             }
         }
     }
+#endif // __riscv_zvfh
 
     return 0;
 }
@@ -182,10 +205,11 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
     int elempack = bottom_top_blob.elempack;
     if (dims == 1)
     {
-        int n = bottom_top_blob.w * elempack;
         __fp16* ptr = bottom_top_blob;
+#if __riscv_zvfh
         const float* ptr_a = a_data;
         const float* ptr_b = b_data;
+        int n = bottom_top_blob.w * elempack;
         while (n > 0)
         {
             size_t vl = __riscv_vsetvl_e16m4(n);
@@ -203,6 +227,13 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
             ptr_b += vl;
             n -= vl;
         }
+#else // __riscv_zvfh
+        int w = bottom_top_blob.w;
+        for (int i = 0; i < w; i++)
+        {
+            ptr[i] = (__fp16)b_data[i] * ptr[i] + (__fp16)a_data[i];
+        }
+#endif // __riscv_zvfh
 
         return 0;
     }
@@ -217,9 +248,10 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
         for (int i = 0; i < h; i++)
         {
             __fp16* ptr = bottom_top_blob.row<__fp16>(i);
-            float a = a_data[i];
-            float b = b_data[i];
+            __fp16 a = (__fp16)a_data[i];
+            __fp16 b = (__fp16)b_data[i];
 
+#if __riscv_zvfh
             int n = w;
             while (n > 0)
             {
@@ -232,6 +264,12 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
                 ptr += vl;
                 n -= vl;
             }
+#else // __riscv_zvfh
+            for (int j = 0; j < w; j++)
+            {
+                ptr[j] = b * ptr[j] + a;
+            }
+#endif // __riscv_zvfh
         }
     }
     if (dims == 3 || dims == 4)
@@ -244,9 +282,10 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
         for (int q = 0; q < c; q++)
         {
             __fp16* ptr = bottom_top_blob.channel(q);
-            float a = a_data[q];
-            float b = b_data[q];
+            __fp16 a = (__fp16)a_data[q];
+            __fp16 b = (__fp16)b_data[q];
 
+#if __riscv_zvfh
             int n = size;
             while (n > 0)
             {
@@ -259,12 +298,19 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
                 ptr += vl;
                 n -= vl;
             }
+#else // __riscv_zvfh
+            for (int i = 0; i < size; i++)
+            {
+                ptr[i] = b * ptr[i] + a;
+            }
+#endif // __riscv_zvfh
         }
     }
 
         return 0;
     }
 
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2; // fp16
     if (elempack == packn)
     {
@@ -325,9 +371,10 @@ int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
             }
         }
     }
+#endif // __riscv_zvfh
 
     return 0;
 }
-#endif // __riscv_zvfh
+#endif // NCNN_ZFH
 
 } // namespace ncnn
diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp
index 7b5a77d774d..f7d750614cc 100644
--- a/src/layer/riscv/binaryop_riscv.cpp
+++ b/src/layer/riscv/binaryop_riscv.cpp
@@ -20,9 +20,10 @@
 #if __riscv_vector
 #include <riscv_vector.h>
 #include "rvv_mathfun.h"
+#include "riscv_usability.h"
 #endif // __riscv_vector
 
-#include "riscv_usability.h"
+#include "cpu.h"
 
 namespace ncnn {
 
@@ -30,8 +31,12 @@ BinaryOp_riscv::BinaryOp_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
 #endif
 #endif
 }
@@ -468,7 +473,7 @@ static int get_reverse_op_type(int op_type)
 
 int BinaryOp_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = std::max(bottom_blobs[0].elembits(), bottom_blobs[1].elembits());
 
     if (opt.use_fp16_storage && elembits == 16)
@@ -626,7 +631,7 @@ int BinaryOp_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
 
 int BinaryOp_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_top_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
diff --git a/src/layer/riscv/binaryop_riscv.h b/src/layer/riscv/binaryop_riscv.h
--- a/src/layer/riscv/binaryop_riscv.h
+++ b/src/layer/riscv/binaryop_riscv.h
@@ -29,7 +29,7 @@ class BinaryOp_riscv : public BinaryOp
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 protected:
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
     int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
diff --git a/src/layer/riscv/binaryop_riscv_zvfh.cpp b/src/layer/riscv/binaryop_riscv_zfh.cpp
similarity index 94%
rename from src/layer/riscv/binaryop_riscv_zvfh.cpp
rename to src/layer/riscv/binaryop_riscv_zfh.cpp
index 56ff0fa6701..b37e483cbae 100644
--- a/src/layer/riscv/binaryop_riscv_zvfh.cpp
+++ b/src/layer/riscv/binaryop_riscv_zfh.cpp
@@ -17,21 +17,21 @@
 
 #if __riscv_vector
 #include <riscv_vector.h>
 #include "rvv_mathfun.h"
+#include "riscv_usability.h"
 #if __riscv_zvfh
 #include "rvv_mathfun_fp16s.h"
 #endif
 #endif // __riscv_vector
 
-#include "riscv_usability.h"
-
 namespace ncnn {
 
-#if __riscv_zvfh
+#if NCNN_ZFH
 
 template<typename Op>
 static void binary_op_vector_no_broadcast_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int size)
 {
     const Op op;
 
+#if __riscv_zvfh
     int n = size;
     while (n > 0)
     {
@@ -45,6 +45,15 @@ static void binary_op_vector_no_broadcast_fp16s(const __fp16* ptr, const __fp16*
         ptr1 += vl;
         outptr += vl;
     }
+#else // __riscv_zvfh
+    for (int i = 0; i < size; i++)
+    {
+        *outptr = op(*ptr, *ptr1);
+        ptr += 1;
+        ptr1 += 1;
+        outptr += 1;
+    }
+#endif // __riscv_zvfh
 }
 
 template<typename Op>
@@ -54,6 +63,7 @@ static void binary_op_vector_broadcast_b_fp16s(const __fp16* ptr, const __fp16*
 
     const __fp16 b = *ptr1;
 
+#if __riscv_zvfh
     int n = size;
     vfloat16m8_t _bx = (elempack == 1) ? __riscv_vfmv_v_f_f16m8(b, __riscv_vsetvl_e16m8(n)) : __riscv_vle16_v_f16m8_f16m1(ptr1);
     while (n > 0)
@@ -66,6 +76,14 @@ static void binary_op_vector_broadcast_b_fp16s(const __fp16* ptr, const __fp16*
         ptr += vl;
         outptr += vl;
     }
+#else // __riscv_zvfh
+    for (int i = 0; i < size; i++)
+    {
+        *outptr = op(*ptr, b);
+        ptr += 1;
+        outptr += 1;
+    }
+#endif // __riscv_zvfh
 }
 
 template<typename Op>
@@ -75,6 +93,7 @@ static void binary_op_vector_broadcast_a_fp16s(const __fp16* ptr, const __fp16*
 
     const __fp16 a = *ptr;
 
+#if __riscv_zvfh
     int n = size;
     vfloat16m8_t _ax = (elempack == 1) ? __riscv_vfmv_v_f_f16m8(a, __riscv_vsetvl_e16m8(n)) : __riscv_vle16_v_f16m8_f16m1(ptr);
     while (n > 0)
@@ -87,6 +106,14 @@ static void binary_op_vector_broadcast_a_fp16s(const __fp16* ptr, const __fp16*
         ptr1 += vl;
         outptr += vl;
     }
+#else // __riscv_zvfh
+    for (int i = 0; i < size; i++)
+    {
+        *outptr = op(a, *ptr1);
+        ptr1 += 1;
+        outptr += 1;
+    }
+#endif // __riscv_zvfh
 }
 
 template<typename Op>
@@ -94,6 +121,7 @@ static void binary_op_vector_broadcast_pb_fp16s(const __fp16* ptr, const __fp16*
 {
     const Op op;
 
+#if __riscv_zvfh
     // if (elempack == packn)
     {
         size_t vl = __riscv_vsetvl_e16m8(elempack);
@@ -108,6 +136,7 @@ static void binary_op_vector_broadcast_pb_fp16s(const __fp16* ptr, const __fp16*
             outptr += vl;
         }
     }
+#endif // __riscv_zvfh
 }
 
 template<typename Op>
@@ -115,6 +144,7 @@ static void binary_op_vector_broadcast_pb_b_fp16s(const __fp16* ptr, const __fp1
 {
     const Op op;
 
+#if __riscv_zvfh
     int n = w * elempack;
 
     vfloat16m8_t _bx = __riscv_vfmv_v_f_f16m8(*ptr1, __riscv_vsetvl_e16m8(n));
@@ -128,6 +158,7 @@ static void binary_op_vector_broadcast_pb_b_fp16s(const __fp16* ptr, const __fp1
         ptr += vl;
         outptr += vl;
     }
+#endif // __riscv_zvfh
 }
 
 template<typename Op>
@@ -135,6 +166,7 @@ static void binary_op_vector_broadcast_pb_a_fp16s(const __fp16* ptr, const __fp1
 {
     const Op op;
 
+#if __riscv_zvfh
     // if (elempack == packn)
     {
         size_t vl = __riscv_vsetvl_e16m8(elempack);
@@ -147,6 +179,7 @@ static void binary_op_vector_broadcast_pb_a_fp16s(const __fp16* ptr, const __fp1
             outptr += vl;
         }
     }
+#endif // __riscv_zvfh
 }
 
 template<typename Op>
@@ -203,6 +236,7 @@ static void binary_op_vector_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16
 
 namespace BinaryOp_riscv_functor {
 
+#if __riscv_zvfh
 #define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \
     struct NAME                                           \
     {                                                     \
@@ -223,6 +257,16 @@ namespace BinaryOp_riscv_functor {
             return IMPLSV;                                \
         }                                                 \
     };
+#else
+#define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \
+    struct NAME                                           \
+    {                                                     \
+        __fp16 operator()(const __fp16& x, const __fp16& y) const \
+        {                                                 \
+            return IMPL;                                  \
+        }                                                 \
+    };
+#endif
 
 // clang-format off
 // *INDENT-OFF*
@@ -567,6 +611,6 @@ int BinaryOp_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& op
 
     return 0;
 }
-#endif // __riscv_zvfh
+#endif // NCNN_ZFH
 
 } // namespace ncnn
diff --git a/src/layer/riscv/clip_riscv.cpp b/src/layer/riscv/clip_riscv.cpp
index c1127e90e9f..52a82fe9b81 100644
--- a/src/layer/riscv/clip_riscv.cpp
+++ b/src/layer/riscv/clip_riscv.cpp
@@ -16,32 +16,34 @@
 
 #if __riscv_vector
 #include <riscv_vector.h>
-#include "rvv_mathfun.h"
 #endif // __riscv_vector
 
+#include "cpu.h"
+
 namespace ncnn {
 
 Clip_riscv::Clip_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 }
 
 int Clip_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_top_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
     {
-        if (opt.use_fp16_arithmetic)
-            return forward_inplace_fp16sa(bottom_top_blob, opt);
-        else
-            return forward_inplace_fp16s(bottom_top_blob, opt);
+        return forward_inplace_fp16s(bottom_top_blob, opt);
     }
 #endif
diff --git a/src/layer/riscv/clip_riscv.h b/src/layer/riscv/clip_riscv.h
index e439dd9b34a..241a249a204 100644
--- a/src/layer/riscv/clip_riscv.h
+++ b/src/layer/riscv/clip_riscv.h
@@ -27,9 +27,8 @@ class Clip_riscv : public Clip
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 protected:
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
-    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };
diff --git a/src/layer/riscv/clip_riscv_zvfh.cpp b/src/layer/riscv/clip_riscv_zfh.cpp
similarity index 61%
rename from src/layer/riscv/clip_riscv_zvfh.cpp
rename to src/layer/riscv/clip_riscv_zfh.cpp
index 0776db98a0b..feeca51477f 100644
--- a/src/layer/riscv/clip_riscv_zvfh.cpp
+++ b/src/layer/riscv/clip_riscv_zfh.cpp
@@ -16,15 +16,11 @@
 
 #if __riscv_vector
 #include <riscv_vector.h>
-#include "rvv_mathfun.h"
-#if __riscv_zvfh
-#include "rvv_mathfun_fp16s.h"
-#endif
 #endif // __riscv_vector
 
 namespace ncnn {
 
-#if __riscv_zvfh
+#if NCNN_ZFH
 int Clip_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
@@ -39,55 +35,38 @@ int Clip_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c
     {
         __fp16* ptr = bottom_top_blob.channel(q);
 
+        __fp16 _min = (__fp16)min;
+        __fp16 _max = (__fp16)max;
+#if __riscv_zvfh
         int n = size;
         while (n > 0)
         {
             size_t vl = __riscv_vsetvl_e16m4(n);
 
             vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl);
-            _p = __riscv_vfmax_vf_f32m8(_p, min, vl);
-            _p = __riscv_vfmin_vf_f32m8(_p, max, vl);
+            _p = __riscv_vfmax_vf_f32m8(_p, _min, vl);
+            _p = __riscv_vfmin_vf_f32m8(_p, _max, vl);
             __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl);
 
             ptr += vl;
             n -= vl;
         }
-    }
-
-    return 0;
-}
-
-int Clip_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
-{
-    int w = bottom_top_blob.w;
-    int h = bottom_top_blob.h;
-    int d = bottom_top_blob.d;
-    int channels = bottom_top_blob.c;
-    int elempack = bottom_top_blob.elempack;
-    int size = w * h * d * elempack;
-
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int q = 0; q < channels; q++)
-    {
-        __fp16* ptr = bottom_top_blob.channel(q);
-
-        int n = size;
-        while (n > 0)
+#else // __riscv_zvfh
+        for (int i = 0; i < size; i++)
         {
-            size_t vl = __riscv_vsetvl_e16m8(n);
+            if (*ptr < _min)
+                *ptr = _min;
 
-            vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
-            _p = __riscv_vfmax_vf_f16m8(_p, min, vl);
-            _p = __riscv_vfmin_vf_f16m8(_p, max, vl);
-            __riscv_vse16_v_f16m8(ptr, _p, vl);
+            if (*ptr > _max)
+                *ptr = _max;
 
-            ptr += vl;
-            n -= vl;
+            ptr++;
         }
+#endif // __riscv_zvfh
     }
 
     return 0;
 }
-#endif // __riscv_zvfh
+#endif // NCNN_ZFH
 
 } //namespace ncnn
diff --git a/src/layer/riscv/concat_riscv.cpp b/src/layer/riscv/concat_riscv.cpp
index 89d1565c42d..ae991d9251a 100644
--- a/src/layer/riscv/concat_riscv.cpp
+++ b/src/layer/riscv/concat_riscv.cpp
@@ -16,9 +16,10 @@
 
 #if __riscv_vector
 #include <riscv_vector.h>
+#include "riscv_usability.h"
 #endif // __riscv_vector
 
-#include "riscv_usability.h"
+#include "cpu.h"
 
 namespace ncnn {
 
@@ -26,10 +27,10 @@ Concat_riscv::Concat_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
-    support_fp16_storage = cpu_support_riscv_zvfh();
-#endif
 #endif // __riscv_vector
+#if NCNN_ZFH
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 
 #if NCNN_BF16
     support_bf16_storage = true;
@@ -40,7 +41,7 @@ int Concat_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat
 {
     int elembits = bottom_blobs[0].elembits();
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     if (opt.use_fp16_storage && elembits == 16)
         return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
 #endif
diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp
index 65d89afa788..b2c1c198857 100644
--- a/src/layer/riscv/convolution1d_riscv.cpp
+++ b/src/layer/riscv/convolution1d_riscv.cpp
@@ -30,10 +30,14 @@ Convolution1D_riscv::Convolution1D_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 }
 
 int Convolution1D_riscv::create_pipeline(const Option& opt)
@@ -41,7 +45,7 @@ int Convolution1D_riscv::create_pipeline(const Option& opt)
     if (dynamic_weight)
         return 0;
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     if (opt.use_fp16_storage)
     {
         return create_pipeline_fp16s(opt);
@@ -108,7 +112,7 @@ int Convolution1D_riscv::destroy_pipeline(const Option& /*opt*/)
 
 int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
diff --git a/src/layer/riscv/convolution1d_riscv.h b/src/layer/riscv/convolution1d_riscv.h
index 3e7875199f0..98a21be002f 100644
--- a/src/layer/riscv/convolution1d_riscv.h
+++ b/src/layer/riscv/convolution1d_riscv.h
@@ -32,7 +32,7 @@ class Convolution1D_riscv : public Convolution1D
     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 protected:
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int create_pipeline_fp16s(const Option& opt);
     int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
     int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
diff --git a/src/layer/riscv/convolution1d_riscv_zvfh.cpp b/src/layer/riscv/convolution1d_riscv_zfh.cpp
similarity index 96%
rename from src/layer/riscv/convolution1d_riscv_zvfh.cpp
rename to src/layer/riscv/convolution1d_riscv_zfh.cpp
index 1eb8fa45c4a..47a9e7a5a9a 100644
--- a/src/layer/riscv/convolution1d_riscv_zvfh.cpp
+++ b/src/layer/riscv/convolution1d_riscv_zfh.cpp
@@ -26,21 +26,24 @@
 
 namespace ncnn {
 
-#if __riscv_zvfh
+#if NCNN_ZFH
 
 int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt)
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
+#endif // __riscv_zvfh
 
     const int num_input = weight_data_size / kernel_w / num_output;
 
     int elempack = 1;
     int out_elempack = 1;
-
+#if __riscv_zvfh
     if (opt.use_packing_layout)
     {
         elempack = num_input % packn == 0 ? packn : 1;
         out_elempack = num_output % packn == 0 ? packn : 1;
     }
+#endif // __riscv_zvfh
 
     // src = kw-inch-outch
     // dst = pb-pa-kw-inch/pa-outch/pb
@@ -83,8 +86,10 @@ int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt)
 
 int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
     const size_t vl = __riscv_vsetvl_e16m1(packn);
+#endif // __riscv_zvfh
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -101,7 +106,13 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co
     w = bottom_blob_bordered.w;
     h = bottom_blob_bordered.h;
 
-    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     const int outw = (w - kernel_extent_w) / stride_w + 1;
@@ -111,6 +122,7 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co
     if (top_blob.empty())
         return -100;
 
+#if __riscv_zvfh
     if (elempack == packn && out_elempack == packn)
     {
         {
@@ -256,6 +268,7 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co
             }
         }
     }
+#endif // __riscv_zvfh
 
     if (elempack == 1 && out_elempack == 1)
     {
@@ -304,8 +317,10 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co
 
 int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
     const size_t vl = __riscv_vsetvl_e16m1(packn);
+#endif // __riscv_zvfh
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -322,7 +337,13 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
     w = bottom_blob_bordered.w;
     h = bottom_blob_bordered.h;
 
-    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     const int outw = (w - kernel_extent_w) / stride_w + 1;
@@ -332,6 +353,7 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
     if (top_blob.empty())
         return -100;
 
+#if __riscv_zvfh
     if (elempack == packn && out_elempack == packn)
     {
         {
@@ -467,6 +489,7 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
             }
         }
     }
+#endif // __riscv_zvfh
 
     if (elempack == 1 && out_elempack == 1)
     {
@@ -512,6 +535,6 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
     return 0;
 }
 
-#endif // __riscv_zvfh
+#endif // NCNN_ZFH
 
 } // namespace ncnn
diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp
index 41979c07ba1..c252d25f235 100644
--- a/src/layer/riscv/convolution_riscv.cpp
+++ b/src/layer/riscv/convolution_riscv.cpp
@@ -21,7 +21,6 @@
 #if __riscv_vector
 #include <riscv_vector.h>
 #endif // __riscv_vector
-
 #include "riscv_activation.h"
 #include "riscv_usability.h"
 
@@ -57,10 +56,14 @@ Convolution_riscv::Convolution_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 
     activation = 0;
 }
@@ -116,7 +119,7 @@ int Convolution_riscv::create_pipeline(const Option& opt)
     }
 #endif
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     if (opt.use_fp16_storage)
     {
         return create_pipeline_fp16s(opt);
@@ -302,7 +305,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
         return 0;
     }
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h
index e1c4a9c546d..4add7108643 100644
--- a/src/layer/riscv/convolution_riscv.h
+++ b/src/layer/riscv/convolution_riscv.h
@@ -32,7 +32,7 @@ class Convolution_riscv : public Convolution
     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 protected:
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int create_pipeline_fp16s(const Option& opt);
     int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
     int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
diff --git a/src/layer/riscv/convolution_riscv_zvfh.cpp b/src/layer/riscv/convolution_riscv_zfh.cpp
similarity index 96%
rename from src/layer/riscv/convolution_riscv_zvfh.cpp
rename to src/layer/riscv/convolution_riscv_zfh.cpp
index ad4fcd1fdc2..861510f1ee4 100644
--- a/src/layer/riscv/convolution_riscv_zvfh.cpp
+++ b/src/layer/riscv/convolution_riscv_zfh.cpp
@@ -17,26 +17,25 @@
 #if __riscv_vector
 #include <riscv_vector.h>
 #endif // __riscv_vector
-
 #include "riscv_activation.h"
 #include "riscv_usability.h"
 
 namespace ncnn {
 
-#if __riscv_vector
-#if __riscv_zvfh
+#if NCNN_ZFH
 #include "convolution_fp16s.h"
+#include "convolution_sgemm_fp16s.h"
+#include "convolution_1x1_fp16s.h"
+#if __riscv_zvfh
 #include "convolution_packn_fp16s.h"
 #include "convolution_pack1ton_fp16s.h"
 #include "convolution_packnto1_fp16s.h"
-#include "convolution_sgemm_fp16s.h"
 #include "convolution_sgemm_packn_fp16s.h"
 #include "convolution_sgemm_pack1ton_fp16s.h"
 #include "convolution_sgemm_packnto1_fp16s.h"
 #include "convolution_winograd_transform_packn_fp16s.h"
 #include "convolution_winograd_dot_packn_fp16s.h"
-#include "convolution_1x1_fp16s.h"
 #include "convolution_1x1_packn_fp16s.h"
 #include "convolution_1x1_pack1ton_fp16s.h"
 #include "convolution_1x1_packnto1_fp16s.h"
@@ -44,9 +43,9 @@ namespace ncnn {
 #include "convolution_3x3_pack1ton_fp16s.h"
 #include "convolution_7x7_pack1ton_fp16s.h"
 #endif
-#endif // __riscv_vector
+#endif // NCNN_ZFH
 
-#if __riscv_zvfh
+#if NCNN_ZFH
 static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
 {
     const int maxk = kernel_w * kernel_h;
@@ -85,7 +84,9 @@ static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data
 
 int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
+#endif // __riscv_zvfh
 
     const int maxk = kernel_w * kernel_h;
     const int num_input = weight_data_size / maxk / num_output;
@@ -93,12 +94,15 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
     int elempack = 1;
     int out_elempack = 1;
 
+#if __riscv_zvfh
     if (opt.use_packing_layout)
     {
         elempack = num_input % packn == 0 ? packn : 1;
         out_elempack = num_output % packn == 0 ? packn : 1;
     }
+#endif // __riscv_zvfh
 
+#if __riscv_zvfh
     // packn
     if (elempack == packn && out_elempack == packn)
     {
@@ -143,6 +147,7 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
             convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
         }
     }
+#endif // __riscv_zvfh
 
     // pack1
     if (elempack == 1 && out_elempack == 1)
@@ -174,7 +179,9 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
 
 int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
+#endif // __riscv_zvfh
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -196,13 +203,20 @@ int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
     int outw = (w - kernel_extent_w) / stride_w + 1;
     int outh = (h - kernel_extent_h) / stride_h + 1;
 
-    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
+#if __riscv_zvfh
     if (elempack == packn && out_elempack == packn)
     {
         {
@@ -223,6 +237,7 @@ int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
             convolution_packnto1_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
         }
     }
+#endif // __riscv_zvfh
 
     if (elempack == 1 && out_elempack == 1)
     {
@@ -236,7 +251,9 @@ int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
 
 int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
+#endif // __riscv_zvfh
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -259,7 +276,13 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
     int outw = (w - kernel_extent_w) / stride_w + 1;
     int outh = (h - kernel_extent_h) / stride_h + 1;
 
-    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
@@ -268,6 +291,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
 
     const int num_input = channels * elempack;
 
+#if __riscv_zvfh
     if (elempack == packn && out_elempack == packn)
     {
         if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
@@ -404,6 +428,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
             convolution_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
         }
     }
+#endif // __riscv_zvfh
 
     if (elempack == 1 && out_elempack == 1)
     {
@@ -433,6 +458,6 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
     return 0;
 }
 
-#endif // __riscv_zvfh
+#endif // NCNN_ZFH
 
 } // namespace ncnn
diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp
index 195caf7eb59..a0926077b34 100644
--- a/src/layer/riscv/convolutiondepthwise_riscv.cpp
+++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp
@@ -14,9 +14,6 @@
 
 #include "convolutiondepthwise_riscv.h"
 
-#include "cpu.h"
-#include "layer_type.h"
-
 #if __riscv_vector
 #include <riscv_vector.h>
 #endif // __riscv_vector
@@ -24,6 +21,9 @@
 #include "riscv_activation.h"
 #include "riscv_usability.h"
 
+#include "cpu.h"
+#include "layer_type.h"
+
 namespace ncnn {
 
 #include "convolutiondepthwise_3x3.h"
@@ -37,10 +37,14 @@ ConvolutionDepthWise_riscv::ConvolutionDepthWise_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 
     activation = 0;
 }
@@ -60,7 +64,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)
     }
 #endif
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     if (opt.use_fp16_storage)
     {
         return create_pipeline_fp16s(opt);
@@ -259,7 +263,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c
     }
 #endif
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
diff --git a/src/layer/riscv/convolutiondepthwise_riscv.h b/src/layer/riscv/convolutiondepthwise_riscv.h
index ee9fce28a6a..7f2c66d8e73 100644
--- a/src/layer/riscv/convolutiondepthwise_riscv.h
+++ b/src/layer/riscv/convolutiondepthwise_riscv.h
@@ -33,7 +33,7 @@ class ConvolutionDepthWise_riscv : public ConvolutionDepthWise
 protected:
     int create_group_ops(const Option& opt);
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int create_pipeline_fp16s(const Option& opt);
     int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
     int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
diff --git a/src/layer/riscv/convolutiondepthwise_riscv_zvfh.cpp b/src/layer/riscv/convolutiondepthwise_riscv_zfh.cpp
similarity index 94%
rename from src/layer/riscv/convolutiondepthwise_riscv_zvfh.cpp
rename to src/layer/riscv/convolutiondepthwise_riscv_zfh.cpp
index fa2176d2be9..9764166ca56 100644
--- a/src/layer/riscv/convolutiondepthwise_riscv_zvfh.cpp
+++ b/src/layer/riscv/convolutiondepthwise_riscv_zfh.cpp
@@ -23,17 +23,19 @@
 
 namespace ncnn {
 
-#if __riscv_vector
+#if NCNN_ZFH
 #if __riscv_zvfh
 #include "convolutiondepthwise_3x3_packn_fp16s.h"
 #include "convolutiondepthwise_5x5_packn_fp16s.h"
 #endif
-#endif // __riscv_vector
+#endif // NCNN_ZFH
 
-#if __riscv_zvfh
+#if NCNN_ZFH
 int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
+#endif // __riscv_zvfh
 
     const int maxk = kernel_w * kernel_h;
     int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
@@ -42,11 +44,14 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
     if (channels == group && group == num_output)
     {
         int elempack = 1;
+#if __riscv_zvfh
         if (opt.use_packing_layout)
         {
             elempack = channels % packn == 0 ? packn : 1;
         }
+#endif // __riscv_zvfh
 
+#if __riscv_zvfh
         // packn
         if (elempack == packn)
         {
@@ -56,6 +61,7 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
 
             ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt);
         }
+#endif // __riscv_zvfh
 
         if (elempack == 1)
         {
@@ -81,8 +87,10 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
 
 int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
     const size_t vl = __riscv_vsetvl_e16m1(packn);
+#endif // __riscv_zvfh
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -103,7 +111,13 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b
     int outw = (w - kernel_extent_w) / stride_w + 1;
     int outh = (h - kernel_extent_h) / stride_h + 1;
 
-    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
@@ -113,6 +127,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b
     // depth-wise
     if (channels * elempack == group && group == num_output)
     {
+#if __riscv_zvfh
         if (elempack == packn)
         {
             {
@@ -174,6 +189,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b
                 }
             }
         }
+#endif // __riscv_zvfh
 
         if (elempack == 1)
         {
@@ -242,8 +258,15 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b
     const int channels_g = channels * elempack / group;
     const int num_output_g = num_output / group;
 
-    int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1;
-    int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? packn : 1;
+    int g_elempack = 1;
+    int out_g_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        g_elempack = channels_g % packn == 0 ? packn : 1;
+        out_g_elempack = num_output_g % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
 
     // unpacking
     Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
@@ -291,8 +314,10 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b
 
 int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+#if __riscv_zvfh
     const int packn = csrr_vlenb() / 2;
     const size_t vl = __riscv_vsetvl_e16m1(packn);
+#endif // __riscv_zvfh
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -313,7 +338,13 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
     int outw = (w - kernel_extent_w) / stride_w + 1;
     int outh = (h - kernel_extent_h) / stride_h + 1;
 
-    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
@@ -323,6 +354,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
     // depth-wise
     if (channels * elempack == group && group == num_output)
     {
+#if __riscv_zvfh
         if (elempack == packn)
         {
             if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
@@ -421,6 +453,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
             }
         }
     }
+#endif // __riscv_zvfh
 
         if (elempack == 1)
         {
@@ -489,8 +522,15 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
     const int channels_g = channels * elempack / group;
     const int num_output_g = num_output / group;
 
-    int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1;
-    int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? packn : 1;
+    int g_elempack = 1;
+    int out_g_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        g_elempack = channels_g % packn == 0 ? packn : 1;
+        out_g_elempack = num_output_g % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
 
     // unpacking
     Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
@@ -535,6 +575,6 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
     return 0;
 }
 
-#endif // __riscv_zvfh
+#endif // NCNN_ZFH
 
 } // namespace ncnn
diff --git a/src/layer/riscv/crop_riscv.cpp b/src/layer/riscv/crop_riscv.cpp
index 98dd6de4af4..95717c45900 100644
--- a/src/layer/riscv/crop_riscv.cpp
+++ b/src/layer/riscv/crop_riscv.cpp
@@ -20,16 +20,22 @@
 
 #include "riscv_usability.h"
 
+#include "cpu.h"
+
 namespace ncnn {
 
 Crop_riscv::Crop_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 
 #if NCNN_BF16
     support_bf16_storage = true;
diff --git a/src/layer/riscv/deconvolution_riscv.cpp b/src/layer/riscv/deconvolution_riscv.cpp
index 5aa5d307956..27afddd795c 100644
--- a/src/layer/riscv/deconvolution_riscv.cpp
+++ b/src/layer/riscv/deconvolution_riscv.cpp
@@ -14,9 +14,6 @@
 
 #include "deconvolution_riscv.h"
 
-#include "cpu.h"
-#include "layer_type.h"
-
 #if __riscv_vector
 #include <riscv_vector.h>
 #endif // __riscv_vector
@@ -24,6 +21,9 @@
 #include "riscv_activation.h"
 #include "riscv_usability.h"
 
+#include "cpu.h"
+#include "layer_type.h"
+
 namespace ncnn {
 
 #if __riscv_vector
@@ -36,10 +36,14 @@ Deconvolution_riscv::Deconvolution_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
     support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
 #endif
-#endif // __riscv_vector
 }
 
 int Deconvolution_riscv::create_pipeline(const Option& opt)
@@ -47,7 +51,7 @@ int Deconvolution_riscv::create_pipeline(const Option& opt)
     if (dynamic_weight)
         return 0;
 
-#if NCNN_ZVFH
+#if NCNN_ZFH
     if (opt.use_fp16_storage)
     {
         return create_pipeline_fp16s(opt);
@@ -154,7 +158,7 @@ int Deconvolution_riscv::destroy_pipeline(const Option& opt)
 
 int Deconvolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int elembits = bottom_blob.elembits();
 
     if (opt.use_fp16_storage && elembits == 16)
diff --git a/src/layer/riscv/deconvolution_riscv.h b/src/layer/riscv/deconvolution_riscv.h
index 760fe88286c..96233bd79dc 100644
--- a/src/layer/riscv/deconvolution_riscv.h
+++ b/src/layer/riscv/deconvolution_riscv.h
@@ -32,7 +32,7 @@ class Deconvolution_riscv : public Deconvolution
     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 protected:
-#if NCNN_ZVFH
+#if NCNN_ZFH
     int create_pipeline_fp16s(const Option& opt);
     int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
     int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
diff --git a/src/layer/riscv/deconvolution_riscv_zvfh.cpp b/src/layer/riscv/deconvolution_riscv_zfh.cpp
similarity index 93%
rename from src/layer/riscv/deconvolution_riscv_zvfh.cpp
rename to src/layer/riscv/deconvolution_riscv_zfh.cpp
index 79b35871601..80d59a93ce4 100644
--- a/src/layer/riscv/deconvolution_riscv_zvfh.cpp
+++ b/src/layer/riscv/deconvolution_riscv_zfh.cpp
@@ -23,31 +23,34 @@
 
 namespace ncnn {
 
-#if __riscv_vector
-#if __riscv_zvfh
+#if NCNN_ZFH
 #include "deconvolution_fp16s.h"
+#if __riscv_zvfh
 #include "deconvolution_packn_fp16s.h"
 #include "deconvolution_pack1ton_fp16s.h"
#include "deconvolution_packnto1_fp16s.h" #endif -#endif // __riscv_vector +#endif // NCNN_ZFH -#if __riscv_zvfh +#if NCNN_ZFH int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const int maxk = kernel_w * kernel_h; const int num_input = weight_data_size / maxk / num_output; int elempack = 1; int out_elempack = 1; - +#if __riscv_zvfh if (opt.use_packing_layout) { elempack = num_input % packn == 0 ? packn : 1; out_elempack = num_output % packn == 0 ? packn : 1; } +#endif // __riscv_zvfh Mat weight_data_transposed(weight_data.w); { @@ -97,6 +100,7 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) } } +#if __riscv_zvfh // packn if (elempack == packn && out_elempack == packn) { @@ -111,6 +115,7 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) if (elempack == packn && out_elempack == 1) { } +#endif // __riscv_zvfh // pack1 if (elempack == 1 && out_elempack == 1) @@ -127,7 +132,9 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh // deconvolv with NxN kernel // value = value + bias @@ -145,7 +152,13 @@ int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh size_t out_elemsize = elemsize / elempack * out_elempack; Mat top_blob_bordered; @@ -161,6 +174,7 @@ int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co if (top_blob_bordered.empty()) return -100; +#if __riscv_zvfh if (elempack == packn && out_elempack == packn) { { @@ -181,6 +195,7 @@ int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co deconvolution_packnto1_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } +#endif // __riscv_zvfh if (elempack == 1 && out_elempack == 1) { @@ -198,7 +213,9 @@ int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh // deconvolv with NxN kernel // value = value + bias @@ -216,7 +233,13 @@ int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh size_t out_elemsize = elemsize / elempack * out_elempack; Mat top_blob_bordered; @@ -232,6 +255,7 @@ int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c if (top_blob_bordered.empty()) return -100; +#if __riscv_zvfh if (elempack == packn && out_elempack == packn) { { @@ -252,6 +276,7 @@ int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c deconvolution_packnto1_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } +#endif // __riscv_zvfh if (elempack == 1 && out_elempack == 1) { @@ -266,6 +291,6 @@ int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp index 272a74dded7..5fee5c24764 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp @@ -14,9 +14,6 @@ #include "deconvolutiondepthwise_riscv.h" -#include "cpu.h" -#include "layer_type.h" - #if __riscv_vector #include #endif // __riscv_vector @@ -24,16 +21,23 @@ #include "riscv_activation.h" #include "riscv_usability.h" +#include "cpu.h" +#include "layer_type.h" + namespace ncnn { DeconvolutionDepthWise_riscv::DeconvolutionDepthWise_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) @@ -41,7 +45,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) if (dynamic_weight) return 0; -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { return create_pipeline_fp16s(opt); @@ -196,7 +200,7 @@ int DeconvolutionDepthWise_riscv::destroy_pipeline(const Option& opt) int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.h b/src/layer/riscv/deconvolutiondepthwise_riscv.h index 3522edf3ced..a965b5a4699 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.h +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.h @@ -33,7 +33,7 @@ class DeconvolutionDepthWise_riscv : public DeconvolutionDepthWise protected: int create_group_ops(const Option& opt); -#if NCNN_ZVFH +#if NCNN_ZFH int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv_zvfh.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv_zfh.cpp similarity index 94% rename from src/layer/riscv/deconvolutiondepthwise_riscv_zvfh.cpp rename to src/layer/riscv/deconvolutiondepthwise_riscv_zfh.cpp index f8bfe22bb56..fb2d8d2b32c 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv_zvfh.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv_zfh.cpp @@ -23,10 +23,12 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int 
DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const int maxk = kernel_w * kernel_h; int channels = (weight_data_size / group) / maxk / (num_output / group) * group; @@ -35,10 +37,12 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) if (channels == group && group == num_output) { int elempack = 1; +#if __riscv_zvfh if (opt.use_packing_layout) { elempack = channels % packn == 0 ? packn : 1; } +#endif // __riscv_zvfh Mat weight_data_transposed(weight_data.w); { @@ -57,6 +61,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) } } +#if __riscv_zvfh // packn if (elempack == packn) { @@ -66,6 +71,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt); } +#endif // __riscv_zvfh if (elempack == 1) { @@ -91,8 +97,10 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh int w = bottom_blob.w; int h = bottom_blob.h; @@ -105,7 +113,13 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh size_t out_elemsize = elemsize / elempack * out_elempack; Mat top_blob_bordered; @@ -126,6 +140,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top // depth-wise if (channels * elempack == group && group == num_output) { +#if __riscv_zvfh if (elempack == packn) { { @@ -187,6 +202,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top } } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -258,8 +274,15 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top const int channels_g = channels * elempack / group; const int num_output_g = num_output / group; - int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1; - int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? packn : 1; + int g_elempack = 1; + int out_g_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + g_elempack = channels_g % packn == 0 ? packn : 1; + out_g_elempack = num_output_g % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh // unpacking Mat bottom_blob_unpacked = bottom_blob; @@ -312,8 +335,10 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh int w = bottom_blob.w; int h = bottom_blob.h; @@ -326,7 +351,13 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh size_t out_elemsize = elemsize / elempack * out_elempack; Mat top_blob_bordered; @@ -347,6 +378,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to // depth-wise if (channels * elempack == group && group == num_output) { +#if __riscv_zvfh if (elempack == packn) { { @@ -408,6 +440,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to } } } +#endif // __riscv_zvfh if (elempack == 1) {
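// The elempack selections in these fp16 paths all follow one rule: default to
// 1 (the plain zfh layout) and promote to packn only in a zvfh build when
// opt.use_packing_layout is set; note the promotion assigns the outer variable
// rather than declaring a new one, since a shadowed local would leave it stuck
// at 1 and silently disable the packed kernels. A condensed sketch of the rule
// (assumes ncnn's csrr_vlenb() helper from riscv_usability.h; with VLEN=128 a
// vector register holds 16 / 2 = 8 fp16 lanes). The group hunk below applies
// the same rule to g_elempack and out_g_elempack.
static int select_elempack(bool use_packing_layout, int count)
{
    int elempack = 1; // scalar zfh builds always run unpacked
#if __riscv_zvfh
    const int packn = csrr_vlenb() / 2; // fp16 lanes per vector register
    if (use_packing_layout)
        elempack = count % packn == 0 ? packn : 1; // assignment, not a new declaration
#endif
    return elempack;
}
@@ -479,8 +512,15 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to const int channels_g = channels * elempack / group; const int num_output_g = num_output / group; - int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1; - int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? packn : 1; + int g_elempack = 1; + int out_g_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + g_elempack = channels_g % packn == 0 ? packn : 1; + out_g_elempack = num_output_g % packn == 0 ? 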
packn : 1; + } +#endif // __riscv_zvfh // unpacking Mat bottom_blob_unpacked = bottom_blob; @@ -530,6 +570,6 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index f5d1bc0a528..baf4e11fe8a 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -20,16 +20,22 @@ #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { Flatten_riscv::Flatten_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector #if NCNN_BF16 support_bf16_storage = true; @@ -43,7 +49,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (elembits == 8) return forward_int8(bottom_blob, top_blob, opt); -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif diff --git a/src/layer/riscv/gru_riscv.cpp b/src/layer/riscv/gru_riscv.cpp index 46254d70b96..97d7cb82eb0 100644 --- a/src/layer/riscv/gru_riscv.cpp +++ b/src/layer/riscv/gru_riscv.cpp @@ -210,8 +210,12 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we GRU_riscv::GRU_riscv() { -#if NCNN_ZVFH +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif } @@ -225,7 +229,7 @@ int GRU_riscv::create_pipeline(const Option& opt) } #endif -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage && opt.use_fp16_arithmetic) return create_pipeline_fp16sa(opt); #endif @@ -242,9 +246,7 @@ int GRU_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } #endif -#if __riscv_vector - -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -256,6 +258,8 @@ int GRU_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } #endif +#if __riscv_vector + int T = bottom_blob.h; int num_directions = direction == 2 ? 2 : 1; @@ -326,8 +330,7 @@ int GRU_riscv::forward(const std::vector& bottom_blobs, std::vector& t const Mat& bottom_blob = bottom_blobs[0]; -#if __riscv_vector -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -339,6 +342,7 @@ int GRU_riscv::forward(const std::vector& bottom_blobs, std::vector& t } #endif +#if __riscv_vector int T = bottom_blob.h; int num_directions = direction == 2 ? 
2 : 1; diff --git a/src/layer/riscv/gru_riscv.h b/src/layer/riscv/gru_riscv.h index 32d75d83d58..a9434f83083 100644 --- a/src/layer/riscv/gru_riscv.h +++ b/src/layer/riscv/gru_riscv.h @@ -29,7 +29,7 @@ class GRU_riscv : public GRU virtual int create_pipeline(const Option& opt); protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/gru_riscv_zvfh.cpp b/src/layer/riscv/gru_riscv_zfh.cpp similarity index 92% rename from src/layer/riscv/gru_riscv_zvfh.cpp rename to src/layer/riscv/gru_riscv_zfh.cpp index d714d690e88..7ffe9723629 100644 --- a/src/layer/riscv/gru_riscv_zvfh.cpp +++ b/src/layer/riscv/gru_riscv_zfh.cpp @@ -20,7 +20,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt) { int size = bottom_blob.w; @@ -56,10 +56,11 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float R = bias_c_R[q]; float U = bias_c_U[q]; - int n = size; +#if __riscv_zvfh const __fp16* ptr_x = x; const float* ptr_xcr = weight_xc_R; const float* ptr_xcu = weight_xc_U; + int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m4(n); @@ -85,11 +86,21 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M ptr_x = NULL; ptr_xcr = NULL; ptr_xcu = NULL; +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float xi = x[i]; - int n_out = num_output; + R += weight_xc_R[i] * xi; + U += weight_xc_U[i] * xi; + } +#endif // __riscv_zvfh + +#if __riscv_zvfh const float* ptr_hc = hidden_state; const float* ptr_hcr = weight_hc_R; const float* ptr_hcu = weight_hc_U; + int n_out = num_output; while (n_out > 0) { size_t vl = __riscv_vsetvl_e16m4(n_out); @@ -115,6 +126,15 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M ptr_hc = NULL; ptr_hcr = NULL; ptr_hcu = NULL; +#else // __riscv_zvfh + for (int i = 0; i < num_output; i++) + { + float h_cont = hidden_state[i]; + + R += weight_hc_R[i] * h_cont; + U += weight_hc_U[i] * h_cont; + } +#endif // __riscv_zvfh // sigmoid(R) // sigmoid(U) @@ -130,9 +150,10 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float N = bias_c_BN[q]; - int n_out2 = num_output; +#if __riscv_zvfh const float* ptr_hc2 = hidden_state; const float* ptr_whc_n = weight_hc_N; + int n_out2 = num_output; while (n_out2 > 0) { size_t vl = __riscv_vsetvl_e16m4(n_out2); @@ -151,12 +172,21 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M } ptr_hc2 = NULL; ptr_whc_n = NULL; +#else // __riscv_zvfh + for (int i = 0; i < num_output; i++) + { + float h_cont = hidden_state[i]; + + N += weight_hc_N[i] * h_cont; + } +#endif // __riscv_zvfh N = bias_c_WN[q] + R * N; - int n2 = size; +#if __riscv_zvfh const __fp16* ptr_x2 = x; const float* ptr_xcn = weight_xc_N; + int n2 = size; while (n2 > 0) { size_t vl = __riscv_vsetvl_e16m4(n2); @@ -175,6 +205,14 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M } ptr_x2 = NULL; ptr_xcn = NULL; +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float xi = x[i]; + + N += weight_xc_N[i] * xi; + } +#endif // __riscv_zvfh // tanh(N) 
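// Every hunk in this GRU rewrite pairs a strip-mined RVV loop (vsetvl hands
// back up to VLMAX lanes per pass, so no scalar tail loop is needed) with a
// plain scalar twin under #else. A self-contained sketch of that dual body for
// one gate's dot product, assuming the __riscv_ intrinsics used above and
// __fp16 defined as _Float16 as the CMake flags do; illustrative, not ncnn API.
#if __riscv_zvfh
#include <riscv_vector.h>
#endif

// fp16 activations times fp32 weights, accumulated in fp32 - the gru_fp16s shape.
static float gate_dot_fp16s(const __fp16* x, const float* w, int size)
{
    float sum = 0.f;
#if __riscv_zvfh
    vfloat32m1_t _acc = __riscv_vfmv_s_f_f32m1(0.f, __riscv_vsetvlmax_e32m1());
    int n = size;
    while (n > 0)
    {
        size_t vl = __riscv_vsetvl_e16m4(n); // lanes handled this pass
        vfloat32m8_t _x = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(x, vl), vl); // widen fp16 -> fp32
        vfloat32m8_t _w = __riscv_vle32_v_f32m8(w, vl);
        _acc = __riscv_vfredusum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(_x, _w, vl), _acc, vl);
        x += vl;
        w += vl;
        n -= vl;
    }
    sum = __riscv_vfmv_f_s_f32m1_f32(_acc);
#else
    for (int i = 0; i < size; i++)
        sum += (float)x[i] * w[i]; // scalar zfh fallback, same arithmetic
#endif
    return sum;
}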
N = tanh(N); @@ -388,10 +426,11 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const __fp16 R = bias_c_R[q]; __fp16 U = bias_c_U[q]; - int n = size; +#if __riscv_zvfh const __fp16* ptr_x = x; const __fp16* ptr_xcr = weight_xc_R; const __fp16* ptr_xcu = weight_xc_U; + int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); @@ -414,11 +453,21 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const ptr_xcu += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float xi = x[i]; - int n_out = num_output; + R += weight_xc_R[i] * xi; + U += weight_xc_U[i] * xi; + } +#endif // __riscv_zvfh + +#if __riscv_zvfh const float* ptr_hc = hidden_state; const __fp16* ptr_hcr = weight_hc_R; const __fp16* ptr_hcu = weight_hc_U; + int n_out = num_output; while (n_out > 0) { size_t vl = __riscv_vsetvl_e16m4(n_out); @@ -441,6 +490,15 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const ptr_hcu += vl; n_out -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < num_output; i++) + { + float h_cont = hidden_state[i]; + + R += weight_hc_R[i] * h_cont; + U += weight_hc_U[i] * h_cont; + } +#endif // __riscv_zvfh // sigmoid(R) // sigmoid(U) @@ -456,9 +514,10 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const __fp16 N = bias_c_BN[q]; - int n_out2 = num_output; +#if __riscv_zvfh const float* ptr_hc2 = hidden_state; const __fp16* ptr_whc_n = weight_hc_N; + int n_out2 = num_output; while (n_out2 > 0) { size_t vl = __riscv_vsetvl_e16m4(n_out2); @@ -475,11 +534,21 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const ptr_hc2 += vl; ptr_whc_n += vl; } +#else // __riscv_zvfh + for (int i = 0; i < num_output; i++) + { + float h_cont = hidden_state[i]; + + N += weight_hc_N[i] * h_cont; + } +#endif // __riscv_zvfh + N = bias_c_WN[q] + R * N; - int n2 = size; +#if __riscv_zvfh const __fp16* ptr_x2 = x; const __fp16* ptr_xcn = weight_xc_N; + int n2 = size; while (n2 > 0) { size_t vl = __riscv_vsetvl_e16m8(n2); @@ -496,6 +565,14 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const ptr_x2 += vl; ptr_xcn += vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float xi = x[i]; + + N += weight_xc_N[i] * xi; + } +#endif // __riscv_zvfh // tanh(N) N = (__fp16)tanh((float)N); @@ -656,6 +733,6 @@ int GRU_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vector< return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/hardsigmoid_riscv.cpp b/src/layer/riscv/hardsigmoid_riscv.cpp index 459ec5fc53e..370f623e9d8 100644 --- a/src/layer/riscv/hardsigmoid_riscv.cpp +++ b/src/layer/riscv/hardsigmoid_riscv.cpp @@ -21,21 +21,27 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { HardSigmoid_riscv::HardSigmoid_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int HardSigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/hardsigmoid_riscv.h b/src/layer/riscv/hardsigmoid_riscv.h index 3ae1b83d022..0883422101c 100644 --- a/src/layer/riscv/hardsigmoid_riscv.h 
+++ b/src/layer/riscv/hardsigmoid_riscv.h @@ -27,7 +27,7 @@ class HardSigmoid_riscv : public HardSigmoid virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/hardsigmoid_riscv_zvfh.cpp b/src/layer/riscv/hardsigmoid_riscv_zfh.cpp similarity index 65% rename from src/layer/riscv/hardsigmoid_riscv_zvfh.cpp rename to src/layer/riscv/hardsigmoid_riscv_zfh.cpp index b37864f3c7a..f1f8ab6f618 100644 --- a/src/layer/riscv/hardsigmoid_riscv_zvfh.cpp +++ b/src/layer/riscv/hardsigmoid_riscv_zfh.cpp @@ -20,7 +20,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int HardSigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -35,27 +35,44 @@ int HardSigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& { __fp16* ptr = bottom_top_blob.channel(q); + __fp16 _lower = (__fp16)lower; + __fp16 _upper = (__fp16)upper; + __fp16 _alpha = (__fp16)alpha; + __fp16 _beta = (__fp16)beta; + +#if __riscv_zvfh int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, lower, vl); - vbool2_t _higher = __riscv_vmfgt_vf_f16m8_b2(_p, upper, vl); - vbool2_t _apply = __riscv_vmnor_mm_b2(_lower, _higher, vl); - _p = __riscv_vfmerge_vfm_f16m8(_p, .0f, _lower, vl); - _p = __riscv_vfmerge_vfm_f16m8(_p, 1.f, _higher, vl); + vbool2_t _is_lower = __riscv_vmflt_vf_f16m8_b2(_p, _lower, vl); + vbool2_t _is_higher = __riscv_vmfgt_vf_f16m8_b2(_p, _upper, vl); + vbool2_t _apply = __riscv_vmnor_mm_b2(_is_lower, _is_higher, vl); + _p = __riscv_vfmerge_vfm_f16m8(_p, (__fp16)0.f, _is_lower, vl); + _p = __riscv_vfmerge_vfm_f16m8(_p, (__fp16)1.f, _is_higher, vl); - _p = __riscv_vfadd_vf_f16m8_mu(_apply, _p, __riscv_vfmul_vf_f16m8_m(_apply, _p, alpha, vl), beta, vl); + _p = __riscv_vfadd_vf_f16m8_mu(_apply, _p, __riscv_vfmul_vf_f16m8_m(_apply, _p, _alpha, vl), _beta, vl); __riscv_vse16_v_f16m8(ptr, _p, vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + if (ptr[i] < _lower) + ptr[i] = (__fp16)0.f; + else if (ptr[i] > _upper) + ptr[i] = (__fp16)1.f; + else + ptr[i] = ptr[i] * _alpha + _beta; + } +#endif // __riscv_zvfh } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/hardswish_riscv.cpp b/src/layer/riscv/hardswish_riscv.cpp index dd14104592b..5de9b943a60 100644 --- a/src/layer/riscv/hardswish_riscv.cpp +++ b/src/layer/riscv/hardswish_riscv.cpp @@ -21,21 +21,27 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { HardSwish_riscv::HardSwish_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int HardSwish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/hardswish_riscv.h b/src/layer/riscv/hardswish_riscv.h index 758c8dae4dd..b882487ba21 100644 --- a/src/layer/riscv/hardswish_riscv.h +++ b/src/layer/riscv/hardswish_riscv.h @@ -30,7 +30,7 @@ class HardSwish_riscv : public HardSwish virtual 
int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/hardswish_riscv_zvfh.cpp b/src/layer/riscv/hardswish_riscv_zfh.cpp similarity index 67% rename from src/layer/riscv/hardswish_riscv_zvfh.cpp rename to src/layer/riscv/hardswish_riscv_zfh.cpp index 1d3826605b0..2afdf07d9dd 100644 --- a/src/layer/riscv/hardswish_riscv_zvfh.cpp +++ b/src/layer/riscv/hardswish_riscv_zfh.cpp @@ -20,7 +20,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int HardSwish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -35,28 +35,45 @@ int HardSwish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o { __fp16* ptr = bottom_top_blob.channel(q); + __fp16 _lower = (__fp16)lower; + __fp16 _upper = (__fp16)upper; + __fp16 _alpha = (__fp16)alpha; + __fp16 _beta = (__fp16)beta; + +#if __riscv_zvfh int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, lower, vl); - vbool2_t _higher = __riscv_vmfgt_vf_f16m8_b2(_p, upper, vl); - vbool2_t _apply = __riscv_vmnor_mm_b2(_lower, _higher, vl); - _p = __riscv_vfmerge_vfm_f16m8(_p, .0f, _lower, vl); + vbool2_t _is_lower = __riscv_vmflt_vf_f16m8_b2(_p, _lower, vl); + vbool2_t _is_higher = __riscv_vmfgt_vf_f16m8_b2(_p, _upper, vl); + vbool2_t _apply = __riscv_vmnor_mm_b2(_is_lower, _is_higher, vl); + _p = __riscv_vfmerge_vfm_f16m8(_p, (__fp16).0f, _is_lower, vl); - vfloat16m8_t _p0 = __riscv_vfadd_vf_f16m8_m(_apply, __riscv_vfmul_vf_f16m8_m(_apply, _p, alpha, vl), beta, vl); + vfloat16m8_t _p0 = __riscv_vfadd_vf_f16m8_m(_apply, __riscv_vfmul_vf_f16m8_m(_apply, _p, _alpha, vl), _beta, vl); _p = __riscv_vfmul_vv_f16m8_mu(_apply, _p, _p, _p0, vl); __riscv_vse16_v_f16m8(ptr, _p, vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + if (ptr[i] < _lower) + ptr[i] = (__fp16)0.f; + else if (ptr[i] > _upper) + ; + else + ptr[i] = ptr[i] * (ptr[i] * _alpha + _beta); + } +#endif // __riscv_zvfh } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index 8b89da90f41..3b8751f6a2a 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -19,20 +19,25 @@ #if __riscv_vector #include #endif // __riscv_vector - #include "riscv_activation.h" #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { InnerProduct_riscv::InnerProduct_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector flatten = 0; } @@ -57,7 +62,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) } #endif -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { return create_pipeline_fp16s(opt); @@ -153,7 +158,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt } #endif -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h index 4c6384a2e0d..17be502d0b1 100644 --- 
a/src/layer/riscv/innerproduct_riscv.h +++ b/src/layer/riscv/innerproduct_riscv.h @@ -30,7 +30,7 @@ class InnerProduct_riscv : public InnerProduct virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/innerproduct_riscv_zvfh.cpp b/src/layer/riscv/innerproduct_riscv_zfh.cpp similarity index 94% rename from src/layer/riscv/innerproduct_riscv_zvfh.cpp rename to src/layer/riscv/innerproduct_riscv_zfh.cpp index 86c32c0a993..347cca0a640 100644 --- a/src/layer/riscv/innerproduct_riscv_zvfh.cpp +++ b/src/layer/riscv/innerproduct_riscv_zfh.cpp @@ -17,25 +17,28 @@ #if __riscv_vector #include #endif // __riscv_vector - #include "riscv_activation.h" #include "riscv_usability.h" namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt) { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const int num_input = weight_data_size / num_output; int out_elempack = 1; +#if __riscv_zvfh if (opt.use_packing_layout) { out_elempack = num_output % packn == 0 ? packn : 1; } +#endif // __riscv_zvfh // src = inch-outch // dst = pb-inch-outch/pb @@ -68,7 +71,9 @@ int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt) int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const int num_input = weight_data_size / num_output; @@ -83,11 +88,18 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con if (top_blob.empty()) return -100; - int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; + int num_output_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + num_output_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh #pragma omp parallel for num_threads(opt.num_threads) for (int j = 0; j < h; j++) { +#if __riscv_zvfh if (elempack == packn && num_output_elempack == packn) { const size_t vl = __riscv_vsetvl_e16m1(packn); @@ -201,6 +213,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con outptr += packn; } } +#endif // __riscv_zvfh if (elempack == 1 && num_output_elempack == 1) { @@ -247,13 +260,20 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con size_t elemsize = bottom_blob_flattened.elemsize; int elempack = bottom_blob_flattened.elempack; - int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh size_t out_elemsize = elemsize / elempack * out_elempack; top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; +#if __riscv_zvfh if (out_elempack == packn) { // num_output @@ -290,6 +310,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con __riscv_vse16_v_f16m1(outptr + p * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); } } +#endif // __riscv_zvfh if (out_elempack == 1) { @@ -330,7 +351,9 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const int num_input = weight_data_size / num_output; @@ -345,11 +368,18 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co if (top_blob.empty()) return -100; - int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; + int num_output_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + num_output_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh #pragma omp parallel for num_threads(opt.num_threads) for (int j = 0; j < h; j++) { +#if __riscv_zvfh if (elempack == packn && num_output_elempack == packn) { const size_t vl = __riscv_vsetvl_e16m1(packn); @@ -463,6 +493,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co outptr += packn; } } +#endif // __riscv_zvfh if (elempack == 1 && num_output_elempack == 1) { @@ -509,13 +540,20 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co size_t elemsize = bottom_blob_flattened.elemsize; int elempack = bottom_blob_flattened.elempack; - int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh size_t out_elemsize = elemsize / elempack * out_elempack; top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; +#if __riscv_zvfh if (out_elempack == packn) { // num_output @@ -552,6 +590,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co __riscv_vse16_v_f16m1(outptr + p * packn, _sum, vl); } } +#endif // __riscv_zvfh if (out_elempack == 1) { @@ -589,6 +628,6 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/instancenorm_riscv.cpp b/src/layer/riscv/instancenorm_riscv.cpp index 95a39a20ba5..3c32cdb25ff 100644 --- a/src/layer/riscv/instancenorm_riscv.cpp +++ b/src/layer/riscv/instancenorm_riscv.cpp @@ -20,20 +20,26 @@ #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { InstanceNorm_riscv::InstanceNorm_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -54,13 +60,8 @@ int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) int size = w * h; int dims = bottom_top_blob.dims; -#if __riscv_vector if (elempack == 1) -#endif // __riscv_vector { -#if __riscv_vector - size = elempack * size; -#endif #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < c; q++) { diff --git a/src/layer/riscv/instancenorm_riscv.h b/src/layer/riscv/instancenorm_riscv.h index 81807e3af29..006b8bfc1ad 100644 --- a/src/layer/riscv/instancenorm_riscv.h +++ b/src/layer/riscv/instancenorm_riscv.h @@ -26,7 +26,7 @@ class InstanceNorm_riscv : public InstanceNorm virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/instancenorm_riscv_zvfh.cpp b/src/layer/riscv/instancenorm_riscv_zfh.cpp similarity index 91% rename from src/layer/riscv/instancenorm_riscv_zvfh.cpp rename to src/layer/riscv/instancenorm_riscv_zfh.cpp index ec4b1c8d9b9..9751bdbc797 100644 --- a/src/layer/riscv/instancenorm_riscv_zvfh.cpp +++ b/src/layer/riscv/instancenorm_riscv_zfh.cpp @@ -22,7 +22,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { // x = (x - mean) / (sqrt(var + eps)) * gamma + beta @@ -37,7 +37,6 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option int dims = bottom_top_blob.dims; if (elempack == 1) { - size = elempack * size; #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < c; q++) { @@ -46,6 +45,7 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option // mean and var float sum = 0.f; float sqsum = 0.f; +#if __riscv_zvfh vfloat32m1_t _sum = __riscv_vfmv_s_f_f32m1(0.f, __riscv_vsetvlmax_e32m1()); vfloat32m1_t _sqsum = __riscv_vfmv_s_f_f32m1(0.f, 
__riscv_vsetvlmax_e32m1()); { @@ -62,7 +62,15 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option } } sum = __riscv_vfmv_f_s_f32m1_f32(_sum); +#else + for (int i = 0; i < size; i++) + { + sum += ptr[i]; + //sqsum += ptr[i] * ptr[i]; + } +#endif // __riscv_zvfh float mean = sum / size; +#if __riscv_zvfh { int n = size; __fp16* ptr_sqsum = ptr; @@ -77,6 +85,14 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option } } sqsum = __riscv_vfmv_f_s_f32m1_f32(_sqsum); +#else + float tmp = 0.f; + for (int i = 0; i < size; i++) + { + tmp = ptr[i] - mean; + sqsum += tmp * tmp; + } +#endif // __riscv_zvfh float var = sqsum / size; // the var maybe minus due to accuracy //float var = sqsum / size - mean * mean; @@ -96,6 +112,7 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option a = 1.f / (sqrtf(var + eps)); b = -mean * a; } +#if __riscv_zvfh { int n = size; __fp16* ptr_store = ptr; @@ -110,10 +127,17 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option ptr_store += vl; } } +#else + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a + b; + } +#endif // __riscv_zvfh } return 0; } +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; if (elempack == packn) { @@ -166,6 +190,7 @@ int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option } return 0; } +#endif // __riscv_zvfh return 0; } @@ -182,7 +207,6 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio int dims = bottom_top_blob.dims; if (elempack == 1) { - size = elempack * size; #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < c; q++) { @@ -191,6 +215,7 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio // mean and var __fp16 sum = 0.f; __fp16 sqsum = 0.f; +#if __riscv_zvfh vfloat16m1_t _sum = __riscv_vfmv_s_f_f16m1(0.f, __riscv_vsetvlmax_e32m1()); vfloat16m1_t _sqsum = __riscv_vfmv_s_f_f16m1(0.f, __riscv_vsetvlmax_e32m1()); { @@ -207,7 +232,15 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio } } sum = __riscv_vfmv_f_s_f16m1_f16(_sum); +#else + for (int i = 0; i < size; i++) + { + sum += ptr[i]; + //sqsum += ptr[i] * ptr[i]; + } +#endif // __riscv_zvfh __fp16 mean = sum / size; +#if __riscv_zvfh { int n = size; __fp16* ptr_sqsum = ptr; @@ -222,6 +255,14 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio } } sqsum = __riscv_vfmv_f_s_f16m1_f16(_sqsum); +#else + float tmp = 0.f; + for (int i = 0; i < size; i++) + { + tmp = ptr[i] - mean; + sqsum += tmp * tmp; + } +#endif // __riscv_zvfh __fp16 var = sqsum / size; // the var maybe minus due to accuracy //float var = sqsum / size - mean * mean; @@ -241,6 +282,7 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio a = static_cast<__fp16>(1.f / (sqrt(var + eps))); b = static_cast<__fp16>(-mean * a); } +#if __riscv_zvfh { int n = size; __fp16* ptr_store = ptr; @@ -255,10 +297,17 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio ptr_store += vl; } } +#else + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a + b; + } +#endif // __riscv_zvfh } return 0; } +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; if (elempack == packn) { @@ -311,9 +360,9 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio } return 0; } +#endif // __riscv_zvfh return 0; } - -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace 
ncnn diff --git a/src/layer/riscv/interp_bilinear_fp16s.h b/src/layer/riscv/interp_bilinear_fp16s.h index 40140a02b47..d0fcde65643 100644 --- a/src/layer/riscv/interp_bilinear_fp16s.h +++ b/src/layer/riscv/interp_bilinear_fp16s.h @@ -128,6 +128,7 @@ static void resize_bilinear_image_fp16s(const Mat& src, Mat& dst, float* alpha, float* rows1p = rows1; __fp16* Dp = dst.row<__fp16>(dy); +#if __riscv_zvfh int n = w; while (n > 0) { @@ -145,6 +146,12 @@ static void resize_bilinear_image_fp16s(const Mat& src, Mat& dst, float* alpha, rows1p += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < w; i++) + { + *Dp++ = (__fp16)(*rows0p++ * b0 + *rows1p++ * b1); + } +#endif // __riscv_zvfh beta += 2; } @@ -229,6 +236,7 @@ static void resize_bilinear_image_fp16sa(const Mat& src, Mat& dst, __fp16* alpha __fp16* rows1p = rows1; __fp16* Dp = dst.row<__fp16>(dy); +#if __riscv_zvfh int n = w; while (n > 0) { @@ -246,6 +254,12 @@ static void resize_bilinear_image_fp16sa(const Mat& src, Mat& dst, __fp16* alpha rows1p += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < w; i++) + { + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; + } +#endif // __riscv_zvfh beta += 2; } diff --git a/src/layer/riscv/interp_riscv.cpp b/src/layer/riscv/interp_riscv.cpp index 0d7e9d7cd2d..1ecb2eaf96d 100644 --- a/src/layer/riscv/interp_riscv.cpp +++ b/src/layer/riscv/interp_riscv.cpp @@ -19,6 +19,8 @@ #include "riscv_usability.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { #include "interp_bicubic.h" @@ -33,10 +35,14 @@ Interp_riscv::Interp_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -45,7 +51,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector const Mat& reference_blob = bottom_blobs[1]; Mat& top_blob = top_blobs[0]; -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/interp_riscv.h b/src/layer/riscv/interp_riscv.h index 4b79c755edc..e9510c4afeb 100644 --- a/src/layer/riscv/interp_riscv.h +++ b/src/layer/riscv/interp_riscv.h @@ -27,7 +27,7 @@ class Interp_riscv : public Interp virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; int forward_fp16sa(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #endif diff --git a/src/layer/riscv/interp_riscv_zvfh.cpp b/src/layer/riscv/interp_riscv_zfh.cpp similarity index 98% rename from src/layer/riscv/interp_riscv_zvfh.cpp rename to src/layer/riscv/interp_riscv_zfh.cpp index 7ebbcc236af..deb39a7bb58 100644 --- a/src/layer/riscv/interp_riscv_zvfh.cpp +++ b/src/layer/riscv/interp_riscv_zfh.cpp @@ -24,19 +24,21 @@ namespace ncnn { #include "interp_bicubic.h" #include "interp_bilinear.h" -#if __riscv_vector -#if __riscv_zvfh +#if NCNN_ZFH #include "interp_bicubic_fp16s.h" -#include "interp_bicubic_packn_fp16s.h" #include "interp_bilinear_fp16s.h" +#if __riscv_zvfh +#include "interp_bicubic_packn_fp16s.h" #include "interp_bilinear_packn_fp16s.h" #endif #endif -#if __riscv_zvfh +#if NCNN_ZFH int 
Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const Mat& bottom_blob = bottom_blobs[0]; const Mat& reference_blob = bottom_blobs[1]; @@ -58,6 +60,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (top_blob.empty()) return -100; +#if __riscv_zvfh if (elempack == packn) { const size_t vl = __riscv_vsetvl_e16m1(packn); @@ -72,6 +75,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto return 0; } +#endif // __riscv_zvfh #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < w; q++) @@ -96,6 +100,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (top_blob.empty()) return -100; +#if __riscv_zvfh if (elempack == packn) { if (resize_type == 1) // nearest @@ -199,6 +204,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto return 0; } +#endif // __riscv_zvfh if (resize_type == 1) // nearest { @@ -292,6 +298,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (top_blob.empty()) return -100; +#if __riscv_zvfh if (elempack == packn) { if (resize_type == 1) // nearest @@ -378,6 +385,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto return 0; } +#endif // __riscv_zvfh if (resize_type == 1) // nearest { @@ -460,7 +468,9 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh const Mat& bottom_blob = bottom_blobs[0]; const Mat& reference_blob = bottom_blobs[1]; @@ -493,6 +503,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect if (top_blob.empty()) return -100; +#if __riscv_zvfh if (elempack == packn) { if (resize_type == 2) // bilinear @@ -573,6 +584,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect return 0; } +#endif // __riscv_zvfh if (resize_type == 2) // bilinear { @@ -649,6 +661,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect if (top_blob.empty()) return -100; +#if __riscv_zvfh if (elempack == packn) { if (resize_type == 2) // bilinear @@ -703,6 +716,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect return 0; } +#endif // __riscv_zvfh if (resize_type == 2) // bilinear { @@ -756,6 +770,6 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/mish_riscv.cpp b/src/layer/riscv/mish_riscv.cpp index 9d2c2d34d7c..f1e8e477bd0 100644 --- a/src/layer/riscv/mish_riscv.cpp +++ b/src/layer/riscv/mish_riscv.cpp @@ -19,21 +19,27 @@ #include "rvv_mathfun.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { Mish_riscv::Mish_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int Mish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git 
a/src/layer/riscv/mish_riscv.h b/src/layer/riscv/mish_riscv.h index 52ed8210306..a7dc62018a9 100644 --- a/src/layer/riscv/mish_riscv.h +++ b/src/layer/riscv/mish_riscv.h @@ -27,7 +27,7 @@ class Mish_riscv : public Mish virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/mish_riscv_zvfh.cpp b/src/layer/riscv/mish_riscv_zfh.cpp similarity index 82% rename from src/layer/riscv/mish_riscv_zvfh.cpp rename to src/layer/riscv/mish_riscv_zfh.cpp index 3712aa37ef3..6f614b20abb 100644 --- a/src/layer/riscv/mish_riscv_zvfh.cpp +++ b/src/layer/riscv/mish_riscv_zfh.cpp @@ -24,7 +24,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int Mish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -39,18 +39,27 @@ int Mish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m4(n); vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); - _p = __riscv_vfmul_vv_f32m8(_p, tanh_ps(log_ps(__riscv_vfadd_vf_f32m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); + _p = __riscv_vfmul_vv_f32m8(_p, tanh_ps(log_ps(__riscv_vfadd_vf_f32m8(exp_ps(_p, vl), (__fp16)1.f, vl), vl), vl), vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float v = (float)*ptr; + *ptr = (__fp16)(v * tanh(log(exp(v) + 1.f))); + ptr++; + } +#endif // __riscv_zvfh } return 0; @@ -70,22 +79,30 @@ int Mish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - _p = __riscv_vfmul_vv_f16m8(_p, tanh_ps(log_ps(__riscv_vfadd_vf_f16m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); + _p = __riscv_vfmul_vv_f16m8(_p, tanh_ps(log_ps(__riscv_vfadd_vf_f16m8(exp_ps(_p, vl), (__fp16)1.f, vl), vl), vl), vl); __riscv_vse16_v_f16m8(ptr, _p, vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = *ptr * (__fp16)tanh(log(exp((float)*ptr) + 1.f)); + ptr++; + } +#endif // __riscv_zvfh } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/packing_riscv.cpp b/src/layer/riscv/packing_riscv.cpp index 15b206b063c..aef5522d080 100644 --- a/src/layer/riscv/packing_riscv.cpp +++ b/src/layer/riscv/packing_riscv.cpp @@ -27,8 +27,12 @@ namespace ncnn { Packing_riscv::Packing_riscv() { support_packing = true; -#if NCNN_ZVFH +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif support_bf16_storage = true; } @@ -40,7 +44,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (elembits == 8) return forward_int8(bottom_blob, top_blob, opt); -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index d51333551bb..87a9ef2e827 100644 --- 
a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -20,6 +20,8 @@ #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { #if __riscv_vector @@ -30,10 +32,14 @@ Padding_riscv::Padding_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector #if NCNN_BF16 support_bf16_storage = true; @@ -42,7 +48,7 @@ Padding_riscv::Padding_riscv() int Padding_riscv::create_pipeline(const Option& opt) { -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { value_fp16 = float32_to_float16(value); @@ -81,7 +87,7 @@ int Padding_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (elembits == 8) return forward_int8(bottom_blob, top_blob, opt); -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif @@ -293,7 +299,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co // clang-format off // *INDENT-OFF* vuint16m1_t pad_value; -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { pad_value = __riscv_vmv_v_x_u16m1(value_fp16, vl); @@ -334,7 +340,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co // clang-format off // *INDENT-OFF* vuint16m1_t pad_value; -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { pad_value = __riscv_vmv_v_x_u16m1(value_fp16, vl); @@ -382,7 +388,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co // clang-format off // *INDENT-OFF* vuint16m1_t pad_value; -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { pad_value = per_channel_pad_data_size ? __riscv_vle16_v_u16m1((const unsigned short*)per_channel_pad_data_fp16 + q * packn, vl) : __riscv_vmv_v_x_u16m1(value_fp16, vl); @@ -440,7 +446,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co // clang-format off // *INDENT-OFF* vuint16m1_t pad_value; -#if NCNN_ZVFH +#if NCNN_ZFH if (opt.use_fp16_storage) { pad_value = per_channel_pad_data_size ? 
__riscv_vle16_v_u16m1((const unsigned short*)per_channel_pad_data_fp16 + q * packn, vl) : __riscv_vmv_v_x_u16m1(value_fp16, vl); diff --git a/src/layer/riscv/pooling_riscv.cpp b/src/layer/riscv/pooling_riscv.cpp index 9091e3e8b2a..92f0521bbfd 100644 --- a/src/layer/riscv/pooling_riscv.cpp +++ b/src/layer/riscv/pooling_riscv.cpp @@ -18,9 +18,10 @@ #if __riscv_vector #include +#include "riscv_usability.h" #endif // __riscv_vector -#include "riscv_usability.h" +#include "cpu.h" namespace ncnn { @@ -28,10 +29,14 @@ Pooling_riscv::Pooling_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int Pooling_riscv::create_pipeline(const Option& /*opt*/) @@ -55,7 +60,7 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& return Pooling::forward(bottom_blob, top_blob, opt); } -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/pooling_riscv.h b/src/layer/riscv/pooling_riscv.h index 4c14577b3ca..8f14df8fabb 100644 --- a/src/layer/riscv/pooling_riscv.h +++ b/src/layer/riscv/pooling_riscv.h @@ -28,7 +28,7 @@ class Pooling_riscv : public Pooling virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/pooling_riscv_zvfh.cpp b/src/layer/riscv/pooling_riscv_zfh.cpp similarity index 98% rename from src/layer/riscv/pooling_riscv_zvfh.cpp rename to src/layer/riscv/pooling_riscv_zfh.cpp index 214c56975a0..8da0afbb737 100644 --- a/src/layer/riscv/pooling_riscv_zvfh.cpp +++ b/src/layer/riscv/pooling_riscv_zfh.cpp @@ -18,20 +18,21 @@ #if __riscv_vector #include -#endif // __riscv_vector - #include "riscv_usability.h" +#endif // __riscv_vector namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // max value in NxN window // avg value in NxN window +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh int w = bottom_blob.w; int h = bottom_blob.h; @@ -51,6 +52,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op if (pooling_type == PoolMethod_MAX) { +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -70,6 +72,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op __riscv_vse16_v_f16m1(outptr + q * packn, _max, vl); } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -92,6 +95,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op if (pooling_type == PoolMethod_AVE) { +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -113,6 +117,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op __riscv_vse16_v_f16m1(outptr + q * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl); } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -174,6 +179,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op if (pooling_type == 
PoolMethod_MAX) { +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -203,6 +209,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op } } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -248,6 +255,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom; } +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -301,6 +309,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op } } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -358,6 +367,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op if (avgpool_count_include_pad == 1) { +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -390,6 +400,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op } } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -436,8 +447,10 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O return forward_fp16s(bottom_blob, top_blob, opt); } +#if __riscv_zvfh const int packn = csrr_vlenb() / 2; const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh int w = bottom_blob.w; int h = bottom_blob.h; @@ -496,6 +509,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom; } +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -512,7 +526,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O { int sx0 = j * stride_w; - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); int area = 0; for (int ki = 0; ki < kernel_h; ki++) @@ -549,6 +563,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O } } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -606,6 +621,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O if (avgpool_count_include_pad == 1) { +#if __riscv_zvfh if (elempack == packn) { #pragma omp parallel for num_threads(opt.num_threads) @@ -622,7 +638,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O { const __fp16* sptr = m.row(i * stride_h) + j * stride_w * packn; - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); for (int k = 0; k < maxk; k++) { @@ -638,6 +654,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O } } } +#endif // __riscv_zvfh if (elempack == 1) { @@ -673,6 +690,6 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/prelu_riscv.cpp b/src/layer/riscv/prelu_riscv.cpp index 75a1876519b..19682cd1171 100644 --- a/src/layer/riscv/prelu_riscv.cpp +++ b/src/layer/riscv/prelu_riscv.cpp @@ -18,21 +18,27 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { PReLU_riscv::PReLU_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); 
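// The constructor pattern above, repeated for every layer this patch touches:
// a vector build emits RVV fp16 kernels, so fp16 storage must be gated on the
// runtime Zvfh probe; a scalar build only needs scalar Zfh. Condensed sketch
// under NCNN_ZFH, using the cpu.h probes the patch includes (the helper name
// is illustrative, not ncnn API):
static bool runtime_supports_fp16_storage()
{
#if __riscv_vector
    return cpu_support_riscv_zvfh(); // fp16 kernels use vector intrinsics
#else
    return cpu_support_riscv_zfh(); // scalar __fp16 arithmetic only
#endif
}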
#endif #endif } int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -44,23 +50,19 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const } #endif - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; int elempack = bottom_top_blob.elempack; int dims = bottom_top_blob.dims; -#if __riscv_vector + if (dims == 1) { int w = bottom_top_blob.w; float* ptr = bottom_top_blob; - const float* ptr_slope = slope_data; if (num_slope > 1) { - int n = w * elempack; +#if __riscv_vector + const float* ptr_slope = slope_data; - // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * elempack; while (n > 0) { size_t vl = __riscv_vsetvl_e32m8(n); @@ -75,13 +77,21 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const ptr_slope += vl; n -= vl; } +#else // __riscv_vector + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < 0) + ptr[i] *= slope_data[i]; + } +#endif // __riscv_vector } else { float slope = slope_data[0]; +#if __riscv_vector int n = w * elempack; - // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { size_t vl = __riscv_vsetvl_e32m8(n); @@ -94,6 +104,14 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const ptr += vl; n -= vl; } +#else // __riscv_vector + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < 0) + ptr[i] *= slope; + } +#endif // __riscv_vector } } @@ -106,6 +124,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const for (int i = 0; i < h; i++) { float* ptr = bottom_top_blob.row(i); +#if __riscv_vector if (num_slope > 1) { for (int j = 0; j < w; j++) @@ -146,6 +165,15 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const n -= vl; } } +#else // __riscv_vector + float slope = num_slope > 1 ? slope_data[i] : slope_data[0]; + + for (int j = 0; j < w; j++) + { + if (ptr[j] < 0) + ptr[j] *= slope; + } +#endif // __riscv_vector } } @@ -160,6 +188,8 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const for (int q = 0; q < channels; q++) { float* ptr = bottom_top_blob.channel(q); + +#if __riscv_vector int n = size * elempack; if (num_slope > 1 && elempack != 1) @@ -202,68 +232,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const n -= vl; } } - } - } - -#else - if (dims == 1) - { - int w = bottom_top_blob.w; - - float* ptr = bottom_top_blob; - - if (num_slope > 1) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - if (ptr[i] < 0) - ptr[i] *= slope_data[i]; - } - } - else - { - float slope = slope_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - if (ptr[i] < 0) - ptr[i] *= slope; - } - } - } - - if (dims == 2) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - float* ptr = bottom_top_blob.row(i); - float slope = num_slope > 1 ? 
slope_data[i] : slope_data[0]; - - for (int j = 0; j < w; j++) - { - if (ptr[j] < 0) - ptr[j] *= slope; - } - } - } - - if (dims == 3) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* ptr = bottom_top_blob.channel(q); +#else // __riscv_vector float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; for (int i = 0; i < size; i++) @@ -271,11 +240,10 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const if (ptr[i] < 0) ptr[i] *= slope; } +#endif // __riscv_vector } } -#endif - return 0; } diff --git a/src/layer/riscv/prelu_riscv.h b/src/layer/riscv/prelu_riscv.h index d957b4e1474..4f56f8ce1ab 100644 --- a/src/layer/riscv/prelu_riscv.h +++ b/src/layer/riscv/prelu_riscv.h @@ -27,7 +27,7 @@ class PReLU_riscv : public PReLU virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/prelu_riscv_zvfh.cpp b/src/layer/riscv/prelu_riscv_zfh.cpp similarity index 80% rename from src/layer/riscv/prelu_riscv_zvfh.cpp rename to src/layer/riscv/prelu_riscv_zfh.cpp index b84ef6d4497..565f457bcd6 100644 --- a/src/layer/riscv/prelu_riscv_zvfh.cpp +++ b/src/layer/riscv/prelu_riscv_zfh.cpp @@ -20,15 +20,12 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH //fp16s(a) //hint: slope always store as fp32 int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int size = w * h; int elempack = bottom_top_blob.elempack; int dims = bottom_top_blob.dims; @@ -36,19 +33,19 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { int w = bottom_top_blob.w; __fp16* ptr = bottom_top_blob; - const float* ptr_slope = slope_data; if (num_slope > 1) { - int n = w * elempack; +#if __riscv_zvfh + const float* ptr_slope = slope_data; - // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * elempack; while (n > 0) { size_t vl = __riscv_vsetvl_e16m4(n); vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = __riscv_vle32_v_f32m8(ptr_slope, vl); - vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vv_f32m8_mu(_lower, _p, _p, _slope, vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); @@ -56,18 +53,26 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) ptr_slope += vl; n -= vl; } +#else // __riscv_zvfh + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] = (__fp16)((float)ptr[i] * slope_data[i]); + } +#endif // __riscv_zvfh } else { float slope = slope_data[0]; +#if __riscv_zvfh int n = w * elempack; - // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { size_t vl = __riscv_vsetvl_e16m4(n); vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); - vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vf_f32m8_mu(_lower, _p, _p, slope, vl); 
__riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); @@ -75,6 +80,14 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) ptr += vl; n -= vl; } +#else // __riscv_zvfh + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] = (__fp16)((float)ptr[i] * slope); + } +#endif // __riscv_zvfh } } @@ -87,6 +100,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (int i = 0; i < h; i++) { __fp16* ptr = bottom_top_blob.row<__fp16>(i); +#if __riscv_zvfh if (num_slope > 1) { for (int j = 0; j < w; j++) @@ -100,7 +114,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = __riscv_vle32_v_f32m8(ptr_slope, vl); - vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vv_f32m8_mu(_lower, _p, _p, _slope, vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); @@ -118,7 +132,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { size_t vl = __riscv_vsetvl_e16m4(n); vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); - vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vf_f32m8_mu(_lower, _p, _p, slope, vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); @@ -127,6 +141,15 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) n -= vl; } } +#else // __riscv_zvfh + float slope = num_slope > 1 ? slope_data[i] : slope_data[0]; + + for (int j = 0; j < w; j++) + { + if (ptr[j] < (__fp16)0.f) + ptr[j] = (__fp16)((float)ptr[j] * slope); + } +#endif // __riscv_zvfh } } @@ -141,6 +164,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (int q = 0; q < channels; q++) { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size * elempack; if (num_slope > 1 && elempack != 1) @@ -155,7 +179,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = __riscv_vle32_v_f32m8(slope_ptr, vl); - vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vv_f32m8_mu(_lower, _p, _p, _slope, vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); @@ -175,7 +199,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) size_t vl = __riscv_vsetvl_e16m4(n); vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); - vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vf_f32m8_mu(_lower, _p, _p, slope, vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); @@ -183,6 +207,15 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) n -= vl; } } +#else // __riscv_zvfh + float slope = num_slope > 1 ? 
slope_data[q] : slope_data[0]; + + for (int i = 0; i < size; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] = (__fp16)((float)ptr[i] * slope); + } +#endif // __riscv_zvfh } } @@ -191,9 +224,6 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int size = w * h; int elempack = bottom_top_blob.elempack; int dims = bottom_top_blob.dims; @@ -201,18 +231,18 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) { int w = bottom_top_blob.w; __fp16* ptr = bottom_top_blob; - const float* ptr_slope = slope_data; if (num_slope > 1) { - int n = w * elempack; +#if __riscv_zvfh + const float* ptr_slope = slope_data; - // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * elempack; while (n > 0) { size_t vl = __riscv_vsetvl_e16m4(n); vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_slope, vl), vl); - vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vv_f16m4_mu(_lower, _p, _p, _slope, vl); __riscv_vse16_v_f16m4(ptr, _p, vl); @@ -221,18 +251,26 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) ptr_slope += vl; n -= vl; } +#else // __riscv_zvfh + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] *= (__fp16)slope_data[i]; + } +#endif // __riscv_zvfh } else { - __fp16 slope = slope_data[0]; + __fp16 slope = (__fp16)slope_data[0]; +#if __riscv_zvfh int n = w * elempack; - // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, .0f, vl); + vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vf_f16m8_mu(_lower, _p, _p, slope, vl); __riscv_vse16_v_f16m8(ptr, _p, vl); @@ -240,6 +278,14 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) ptr += vl; n -= vl; } +#else // __riscv_zvfh + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] *= slope; + } +#endif // __riscv_zvfh } } @@ -252,6 +298,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) for (int i = 0; i < h; i++) { __fp16* ptr = bottom_top_blob.row<__fp16>(i); +#if __riscv_zvfh if (num_slope > 1) { for (int j = 0; j < w; j++) @@ -265,7 +312,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_slope, vl), vl); - vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vv_f16m4_mu(_lower, _p, _p, _slope, vl); __riscv_vse16_v_f16m4(ptr, _p, vl); @@ -277,13 +324,13 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) } else { - __fp16 slope = slope_data[0]; + __fp16 slope = (__fp16)slope_data[0]; int n = w * elempack; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, .0f, 
vl); + vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vf_f16m8_mu(_lower, _p, _p, slope, vl); __riscv_vse16_v_f16m8(ptr, _p, vl); @@ -292,6 +339,15 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) n -= vl; } } +#else // __riscv_zvfh + __fp16 slope = num_slope > 1 ? (__fp16)slope_data[i] : (__fp16)slope_data[0]; + + for (int j = 0; j < w; j++) + { + if (ptr[j] < (__fp16)0.f) + ptr[j] *= slope; + } +#endif // __riscv_zvfh } } @@ -306,6 +362,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) for (int q = 0; q < channels; q++) { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size * elempack; if (num_slope > 1 && elempack != 1) @@ -320,7 +377,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(slope_ptr, vl), vl); - vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vv_f16m4_mu(_lower, _p, _p, _slope, vl); __riscv_vse16_v_f16m4(ptr, _p, vl); @@ -340,7 +397,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, .0f, vl); + vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, (__fp16)0.f, vl); _p = __riscv_vfmul_vf_f16m8_mu(_lower, _p, _p, (__fp16)slope, vl); __riscv_vse16_v_f16m8(ptr, _p, vl); @@ -348,11 +405,20 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) n -= vl; } } +#else // __riscv_zvfh + __fp16 slope = num_slope > 1 ? 
(__fp16)slope_data[q] : (__fp16)slope_data[0]; + + for (int i = 0; i < size; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] *= slope; + } +#endif // __riscv_zvfh } } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/relu_riscv.cpp b/src/layer/riscv/relu_riscv.cpp index cad03e24bfb..fe4291331c7 100644 --- a/src/layer/riscv/relu_riscv.cpp +++ b/src/layer/riscv/relu_riscv.cpp @@ -18,21 +18,27 @@ #include <riscv_vector.h> #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { ReLU_riscv::ReLU_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); #endif #endif } int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/relu_riscv.h b/src/layer/riscv/relu_riscv.h index 9dd6c503bdb..7fae384abd1 100644 --- a/src/layer/riscv/relu_riscv.h +++ b/src/layer/riscv/relu_riscv.h @@ -27,7 +27,7 @@ class ReLU_riscv : public ReLU virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/relu_riscv_zvfh.cpp b/src/layer/riscv/relu_riscv_zfh.cpp similarity index 81% rename from src/layer/riscv/relu_riscv_zvfh.cpp rename to src/layer/riscv/relu_riscv_zfh.cpp index bca2891a919..b4503421755 100644 --- a/src/layer/riscv/relu_riscv_zvfh.cpp +++ b/src/layer/riscv/relu_riscv_zfh.cpp @@ -20,7 +20,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -36,6 +36,7 @@ int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c __fp16* ptr = bottom_top_blob.channel(q); if (slope == 0.f) { +#if __riscv_zvfh int n = size; while (n > 0) { @@ -48,27 +49,44 @@ int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + if (*ptr < (__fp16)0.f) + *ptr = (__fp16)0.f; + ptr++; + } +#endif // __riscv_zvfh } else { - int n = size; __fp16 _slope = (__fp16)slope; +#if __riscv_zvfh + int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - _p = __riscv_vfmul_vf_f16m8_mu(__riscv_vmflt_vf_f16m8_b2(_p, .0f, vl), _p, _p, _slope, vl); + _p = __riscv_vfmul_vf_f16m8_mu(__riscv_vmflt_vf_f16m8_b2(_p, (__fp16)0.f, vl), _p, _p, _slope, vl); __riscv_vse16_v_f16m8(ptr, _p, vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + if (*ptr < (__fp16)0.f) + *ptr *= _slope; + ptr++; + } +#endif // __riscv_zvfh } } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index e5c60c882d1..3983baee8a5 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -411,57 +411,10 @@ static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, vfloat32m1_t& _r7l, vfloat32m1_t& _r7h, size_t vl) { float tmp[64]; -#if __riscv_vector -#if 1 //__riscv_v_intrinsic > 12000 -#warning A vfloat32m1x8_t _rl = __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l,
_r4l, _r5l, _r6l, _r7l); vfloat32m1x8_t _rh = __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h); -#else -#warning B - vfloat32m1x8_t _rl = vfloat32m1x8_t(); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 0, _r0l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 1, _r1l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 2, _r2l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 3, _r3l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 4, _r4l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 5, _r5l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 6, _r6l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 7, _r7l); - vfloat32m1x8_t _rh = vfloat32m1x8_t(); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 0, _r0h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 1, _r1h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 2, _r2h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 3, _r3h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 4, _r4h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 5, _r5h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 6, _r6h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 7, _r7h); -#endif __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl); __riscv_vsseg8e32_v_f32m1x8(&tmp[32], _rh, vl); -#elif __riscv_xtheadvector - vfloat32m1x8_t _rl = vfloat32m1x8_t(); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 0, _r0l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 1, _r1l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 2, _r2l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 3, _r3l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 4, _r4l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 5, _r5l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 6, _r6l); - // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 7, _r7l); - vfloat32m1x8_t _rh = vfloat32m1x8_t(); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 0, _r0h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 1, _r1h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 2, _r2h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 3, _r3h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 4, _r4h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 5, _r5h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 6, _r6h); - // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 7, _r7h); - __riscv_th_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl); - __riscv_th_vsseg8e32_v_f32m1x8(&tmp[32], _rh, vl); -#endif - float* ptr = (float*)tmp; _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); _r0h = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl); @@ -484,15 +437,7 @@ static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, static inline void transpose4x4_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, size_t vl) { float tmp[16]; -#if __riscv_vector && __riscv_v_intrinsic > 12000 vfloat32m1x4_t _r = __riscv_vcreate_v_f32m1x4(_r0, _r1, _r2, _r3); -#else - vfloat32m1x4_t _r = vfloat32m1x4_t(); - _r = __riscv_vset_v_f32m1_f32m1x4(_r, 0, _r0); - _r = __riscv_vset_v_f32m1_f32m1x4(_r, 1, _r1); - _r = __riscv_vset_v_f32m1_f32m1x4(_r, 2, _r2); - _r = __riscv_vset_v_f32m1_f32m1x4(_r, 3, _r3); -#endif __riscv_vsseg4e32_v_f32m1x4(&tmp[0], _r, vl); float* ptr = (float*)tmp; _r0 = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); @@ -515,7 +460,6 @@ static inline void transpose8x12_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, vfloat32m1_t& _rbl, vfloat32m1_t& _rbh, size_t vl) { float tmp[8][12]; - __riscv_vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0l, vl); __riscv_vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 12, _r0h, vl); __riscv_vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1l, vl); @@ -577,43 +521,12 @@ static inline 
void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vflo vfloat32m1_t& _r7l, vfloat32m1_t& _r7m, vfloat32m1_t& _r7h, size_t vl) { float tmp[96]; -#if __riscv_vector && __riscv_v_intrinsic > 12000 vfloat32m1x8_t _rl = __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l); - vfloat32m1x8_t _rh = __riscv_vcreate_v_f32m1x8(_r0m, _r1m, _r2m, _r3m, _r4m, _r5m, _r6m, _r7m); + vfloat32m1x8_t _rm = __riscv_vcreate_v_f32m1x8(_r0m, _r1m, _r2m, _r3m, _r4m, _r5m, _r6m, _r7m); vfloat32m1x8_t _rh = __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h); -#else - vfloat32m1x8_t _rl = vfloat32m1x8_t(); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 0, _r0l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 1, _r1l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 2, _r2l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 3, _r3l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 4, _r4l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 5, _r5l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 6, _r6l); - _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 7, _r7l); - vfloat32m1x8_t _rm = vfloat32m1x8_t(); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 0, _r0m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 1, _r1m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 2, _r2m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 3, _r3m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 4, _r4m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 5, _r5m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 6, _r6m); - _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 7, _r7m); - vfloat32m1x8_t _rh = vfloat32m1x8_t(); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 0, _r0h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 1, _r1h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 2, _r2h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 3, _r3h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 4, _r4h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 5, _r5h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 6, _r6h); - _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 7, _r7h); -#endif __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl); __riscv_vsseg8e32_v_f32m1x8(&tmp[32], _rm, vl); __riscv_vsseg8e32_v_f32m1x8(&tmp[64], _rh, vl); - float* ptr = (float*)tmp; _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); _r0m = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl); @@ -644,21 +557,8 @@ static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vflo static inline void transpose4x8_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, size_t vl) { float tmp[32]; -#if __riscv_vector && __riscv_v_intrinsic > 12000 vfloat32m1x8_t _r = __riscv_vcreate_v_f32m1x8(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); -#else - vfloat32m1x8_t _r = vfloat32m1x8_t(); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 0, _r0); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 1, _r1); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 2, _r2); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 3, _r3); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 4, _r4); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 5, _r5); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 6, _r6); - _r = __riscv_vset_v_f32m1_f32m1x8(_r, 7, _r7); -#endif __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _r, vl); - float* ptr = (float*)tmp; _r0 = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); _r1 = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl); @@ -706,21 +606,8 @@ static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, size_t vl) { float tmp[32]; -#if __riscv_vector && __riscv_v_intrinsic > 12000 vfloat32m1x4_t _rl = 
__riscv_vcreate_v_f32m1x4(_r0l, _r1l, _r2l, _r3l); vfloat32m1x4_t _rh = __riscv_vcreate_v_f32m1x4(_r0h, _r1h, _r2h, _r3h); -#else - vfloat32m1x4_t _rl = vfloat32m1x4_t(); - _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 0, _r0l); - _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 1, _r1l); - _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 2, _r2l); - _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 3, _r3l); - vfloat32m1x4_t _rh = vfloat32m1x4_t(); - _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 0, _r0h); - _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 1, _r1h); - _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 2, _r2h); - _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 3, _r3h); -#endif __riscv_vsseg4e32_v_f32m1x4(&tmp[0], _rl, vl); __riscv_vsseg4e32_v_f32m1x4(&tmp[16], _rh, vl); float* ptr = (float*)tmp; diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index 107492c832b..69be624a23a 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -562,7 +562,7 @@ _RVV_FLOAT32_ERFC_OP(8, 4) //TODO rvv optimize #define _RVV_FLOAT32_ATAN2_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t atan2_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, size_t vl) \ + static inline vfloat32m##LMUL##_t atan2_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, volatile size_t vl) \ { \ std::vector<float> tmpx(vl); \ std::vector<float> tmpy(vl); \ diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index ade4b05f237..0db6b0c15c2 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -411,9 +411,9 @@ _RVV_FLOAT16_SIGMOID_OP(8, 2) return __riscv_vle16_v_f16m##LMUL(tmpx.data(), vl); \ } -_RVV_FLOAT16_ATAN2_OP(1, 32) -_RVV_FLOAT16_ATAN2_OP(2, 16) -_RVV_FLOAT16_ATAN2_OP(4, 8) -_RVV_FLOAT16_ATAN2_OP(8, 4) +_RVV_FLOAT16_ATAN2_OP(1, 16) +_RVV_FLOAT16_ATAN2_OP(2, 8) +_RVV_FLOAT16_ATAN2_OP(4, 4) +_RVV_FLOAT16_ATAN2_OP(8, 2) #endif // RVV_MATHFUN_FP16S_H diff --git a/src/layer/riscv/sigmoid_riscv.cpp b/src/layer/riscv/sigmoid_riscv.cpp index 2311231d6d2..e929ba82a4e 100644 --- a/src/layer/riscv/sigmoid_riscv.cpp +++ b/src/layer/riscv/sigmoid_riscv.cpp @@ -19,21 +19,27 @@ #include "rvv_mathfun.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { Sigmoid_riscv::Sigmoid_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int Sigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/sigmoid_riscv.h b/src/layer/riscv/sigmoid_riscv.h index 02fd3adb4cb..67378486789 100644 --- a/src/layer/riscv/sigmoid_riscv.h +++ b/src/layer/riscv/sigmoid_riscv.h @@ -27,7 +27,7 @@ class Sigmoid_riscv : public Sigmoid virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/sigmoid_riscv_zvfh.cpp b/src/layer/riscv/sigmoid_riscv_zfh.cpp similarity index 86% rename from src/layer/riscv/sigmoid_riscv_zvfh.cpp rename to src/layer/riscv/sigmoid_riscv_zfh.cpp index 776d0b44748..f4806f64e8f 100644 ---
b/src/layer/riscv/sigmoid_riscv_zfh.cpp @@ -24,7 +24,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int Sigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -39,6 +39,7 @@ int Sigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { @@ -51,6 +52,14 @@ int Sigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (__fp16)(1.f / (1.f + exp(-(float)*ptr))); + + ptr++; + } +#endif // __riscv_zvfh } return 0; @@ -70,6 +79,7 @@ int Sigmoid_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { @@ -82,10 +92,18 @@ int Sigmoid_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (__fp16)1.f / ((__fp16)1.f + (__fp16)exp((float)-*ptr)); + + ptr++; + } +#endif // __riscv_zvfh } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/swish_riscv.cpp b/src/layer/riscv/swish_riscv.cpp index 4c6db7a31ea..f09e4065d2a 100644 --- a/src/layer/riscv/swish_riscv.cpp +++ b/src/layer/riscv/swish_riscv.cpp @@ -19,21 +19,27 @@ #include "rvv_mathfun.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { Swish_riscv::Swish_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int Swish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/swish_riscv.h b/src/layer/riscv/swish_riscv.h index 977d34b8ef0..971b5cb2b40 100644 --- a/src/layer/riscv/swish_riscv.h +++ b/src/layer/riscv/swish_riscv.h @@ -27,7 +27,7 @@ class Swish_riscv : public Swish virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/swish_riscv_zvfh.cpp b/src/layer/riscv/swish_riscv_zfh.cpp similarity index 82% rename from src/layer/riscv/swish_riscv_zvfh.cpp rename to src/layer/riscv/swish_riscv_zfh.cpp index bfa615b8c76..2fa4e028cd2 100644 --- a/src/layer/riscv/swish_riscv_zvfh.cpp +++ b/src/layer/riscv/swish_riscv_zfh.cpp @@ -24,7 +24,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int Swish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -39,18 +39,27 @@ int Swish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m4(n); vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); - _p = __riscv_vfdiv_vv_f32m8(_p, __riscv_vfadd_vf_f32m8(exp_ps(__riscv_vfneg_v_f32m8(_p, vl), vl), 1.f, vl), vl); + _p = __riscv_vfdiv_vv_f32m8(_p, 
__riscv_vfadd_vf_f32m8(exp_ps(__riscv_vfneg_v_f32m8(_p, vl), vl), (__fp16)1.f, vl), vl); __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float v = (float)*ptr; + *ptr = (__fp16)(v / (1.f + exp(-v))); + ptr++; + } +#endif // __riscv_zvfh } return 0; @@ -70,22 +79,30 @@ int Swish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - _p = __riscv_vfdiv_vv_f16m8(_p, __riscv_vfadd_vf_f16m8(exp_ps(__riscv_vfneg_v_f16m8(_p, vl), vl), 1.f, vl), vl); + _p = __riscv_vfdiv_vv_f16m8(_p, __riscv_vfadd_vf_f16m8(exp_ps(__riscv_vfneg_v_f16m8(_p, vl), vl), (__fp16)1.f, vl), vl); __riscv_vse16_v_f16m8(ptr, _p, vl); ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = *ptr / ((__fp16)1.f + (__fp16)exp((float)-*ptr)); + ptr++; + } +#endif // __riscv_zvfh } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/tanh_riscv.cpp b/src/layer/riscv/tanh_riscv.cpp index e799e782819..8c3bdb22b3c 100644 --- a/src/layer/riscv/tanh_riscv.cpp +++ b/src/layer/riscv/tanh_riscv.cpp @@ -19,21 +19,27 @@ #include "rvv_mathfun.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { TanH_riscv::TanH_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } int TanH_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/tanh_riscv.h b/src/layer/riscv/tanh_riscv.h index 22c919a9943..69cb0d4e7cc 100644 --- a/src/layer/riscv/tanh_riscv.h +++ b/src/layer/riscv/tanh_riscv.h @@ -27,7 +27,7 @@ class TanH_riscv : public TanH virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/tanh_riscv_zvfh.cpp b/src/layer/riscv/tanh_riscv_zfh.cpp similarity index 87% rename from src/layer/riscv/tanh_riscv_zvfh.cpp rename to src/layer/riscv/tanh_riscv_zfh.cpp index 24f7b2c4ad8..6cdb9113231 100644 --- a/src/layer/riscv/tanh_riscv_zvfh.cpp +++ b/src/layer/riscv/tanh_riscv_zfh.cpp @@ -24,7 +24,7 @@ namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH int TanH_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -39,6 +39,7 @@ int TanH_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c { __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh int n = size; while (n > 0) { @@ -51,6 +52,13 @@ int TanH_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (__fp16)tanh((float)*ptr); + ptr++; + } +#endif // __riscv_zvfh } return 0; @@ -70,6 +78,7 @@ int TanH_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) { __fp16* ptr = bottom_top_blob.channel(q); 
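// NOTE: throughout this patch the #if __riscv_zvfh branch is the vector body
// and the #else branch is the scalar __fp16 fallback built for plain zfh targets.
// The fp16sa variants compute directly in half precision (f16m8 intrinsics),
// while the fp16s variants widen to f32m8 first; compare the swish fp16s loop
// above (vfwcvt_f_f_v_f32m8 on load, vfncvt_f_f_w_f16m4 on store).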
+#if __riscv_zvfh int n = size; while (n > 0) { @@ -82,10 +91,17 @@ int TanH_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (__fp16)tanh((float)*ptr); + ptr++; + } +#endif // __riscv_zvfh } return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/layer/riscv/unaryop_riscv.cpp b/src/layer/riscv/unaryop_riscv.cpp index 58af0802d33..30fb05fb837 100644 --- a/src/layer/riscv/unaryop_riscv.cpp +++ b/src/layer/riscv/unaryop_riscv.cpp @@ -19,16 +19,22 @@ #include "rvv_mathfun.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { UnaryOp_riscv::UnaryOp_riscv() { #if __riscv_vector support_packing = true; -#if NCNN_ZVFH +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif -#endif // __riscv_vector } #if __riscv_vector @@ -291,7 +297,7 @@ struct unary_op_trunc int UnaryOp_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if NCNN_ZVFH +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) diff --git a/src/layer/riscv/unaryop_riscv.h b/src/layer/riscv/unaryop_riscv.h index c21ab3ec95b..c3db29bb4aa 100644 --- a/src/layer/riscv/unaryop_riscv.h +++ b/src/layer/riscv/unaryop_riscv.h @@ -27,7 +27,7 @@ class UnaryOp_riscv : public UnaryOp virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if NCNN_ZVFH +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/unaryop_riscv_zvfh.cpp b/src/layer/riscv/unaryop_riscv_zfh.cpp similarity index 69% rename from src/layer/riscv/unaryop_riscv_zvfh.cpp rename to src/layer/riscv/unaryop_riscv_zfh.cpp index e706002dd3c..09f26cb7e28 100644 --- a/src/layer/riscv/unaryop_riscv_zvfh.cpp +++ b/src/layer/riscv/unaryop_riscv_zfh.cpp @@ -22,9 +22,11 @@ #endif #endif // __riscv_vector +#include <fenv.h> + namespace ncnn { -#if __riscv_zvfh +#if NCNN_ZFH template<typename Op> static int unary_op_inplace_fp16s(Mat& a, const Option& opt) { @@ -42,6 +44,7 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt) { __fp16* ptr = a.channel(q); +#if __riscv_zvfh int n = size * elempack; while (n > 0) { @@ -54,6 +57,12 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt) ptr += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + ptr[i] = op(ptr[i]); + } +#endif // __riscv_zvfh } return 0; @@ -63,101 +72,179 @@ namespace UnaryOp_riscv_functor { struct unary_op_abs_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { - return __riscv_vfsgnj_vf_f16m8(x, 1.f, vl); + return __riscv_vfsgnj_vf_f16m8(x, (__fp16)1.f, vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)fabsf((float)x); } +#endif // __riscv_zvfh }; struct unary_op_neg_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return __riscv_vfneg_v_f16m8(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return -x; + } +#endif // __riscv_zvfh }; struct unary_op_floor_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return __riscv_vfcvt_f_x_v_f16m8(__riscv_vfcvt_x_f_v_i16m8_rm(x, __RISCV_FRM_RDN, vl), vl); } +#else // __riscv_zvfh + __fp16
operator()(const __fp16& x) const + { + return (__fp16)floorf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_ceil_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return __riscv_vfcvt_f_x_v_f16m8(__riscv_vfcvt_x_f_v_i16m8_rm(x, __RISCV_FRM_RUP, vl), vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)ceilf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_square_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return __riscv_vfmul_vv_f16m8(x, x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return x * x; + } +#endif // __riscv_zvfh }; struct unary_op_sqrt_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return __riscv_vfsqrt_v_f16m8(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)sqrtf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_rsqrt_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { #if __riscv_xtheadvector - vfloat16m8_t _reciprocal = __riscv_vfrdiv_vf_f16m8(__riscv_vfsqrt_v_f16m8(x, vl), 1.f, vl); + vfloat16m8_t _reciprocal = __riscv_vfrdiv_vf_f16m8(__riscv_vfsqrt_v_f16m8(x, vl), (__fp16)1.f, vl); #else vfloat16m8_t _reciprocal = __riscv_vfrsqrt7_v_f16m8(x, vl); - _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(__riscv_vfmul_vf_f16m8(x, 0.5f, vl), __riscv_vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); - // _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(__riscv_vfmul_vf_f16m8(x, 0.5f, vl), __riscv_vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); + _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(__riscv_vfmul_vf_f16m8(x, (__fp16)0.5f, vl), __riscv_vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), (__fp16)1.5f, vl), _reciprocal, vl); + // _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(__riscv_vfmul_vf_f16m8(x, (__fp16)0.5f, vl), __riscv_vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), (__fp16)1.5f, vl), _reciprocal, vl); #endif return _reciprocal; } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)(1.f / sqrtf((float)x)); + } +#endif // __riscv_zvfh }; struct unary_op_exp_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return exp_ps(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)expf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_log_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return log_ps(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)logf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_sin_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return sin_ps(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)sinf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_cos_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return cos_ps(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return 
(__fp16)cosf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_tan_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize @@ -169,10 +256,17 @@ struct unary_op_tan_fp16s } return __riscv_vle16_v_f16m8(tmp.data(), vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)tanf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_asin_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize @@ -184,10 +278,17 @@ struct unary_op_asin_fp16s } return __riscv_vle16_v_f16m8(tmp.data(), vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)asin((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_acos_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize @@ -199,10 +300,17 @@ struct unary_op_acos_fp16s } return __riscv_vle16_v_f16m8(tmp.data(), vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)acos((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_atan_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize @@ -214,49 +322,93 @@ struct unary_op_atan_fp16s } return __riscv_vle16_v_f16m8(tmp.data(), vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)atan((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_reciprocal_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { #if __riscv_xtheadvector - vfloat16m8_t _reciprocal = __riscv_vfrdiv_vf_f16m8(x, 1.f, vl); + vfloat16m8_t _reciprocal = __riscv_vfrdiv_vf_f16m8(x, (__fp16)1.f, vl); #else vfloat16m8_t _reciprocal = __riscv_vfrec7_v_f16m8(x, vl); - _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); - // _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); + _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(x, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); + // _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(x, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); #endif return _reciprocal; } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)1.f / x; + } +#endif // __riscv_zvfh }; struct unary_op_tanh_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return tanh_ps(x, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)tanhf((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_log10_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { - return __riscv_vfmul_vf_f16m8(log_ps(x, vl), 0.434294481903, vl); + return __riscv_vfmul_vf_f16m8(log_ps(x, vl), (__fp16)0.434294481903, vl); } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)log10f((float)x); + } +#endif // __riscv_zvfh }; struct unary_op_round_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return __riscv_vfcvt_f_x_v_f16m8(__riscv_vfcvt_x_f_v_i16m8(x, vl), vl); } +#else // __riscv_zvfh + __fp16 operator()(const 
__fp16& x) const + { + // round to nearest even +#ifdef FE_TONEAREST + int old_rm = fegetround(); + fesetround(FE_TONEAREST); +#endif + float y = nearbyintf((float)x); +#ifdef FE_TONEAREST + fesetround(old_rm); +#endif + return (__fp16)y; + } +#endif // __riscv_zvfh }; struct unary_op_trunc_fp16s { +#if __riscv_zvfh vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { #if __riscv_xtheadvector @@ -271,12 +423,18 @@ struct unary_op_trunc_fp16s vint16m8_t _floorx = __riscv_vsub_vx_i16m8_mu(_floormask, _xi, _xi, 1, vl); vbool2_t _ceilmask = __riscv_vmflt_vv_f16m8_b2(_xf, x, vl); vint16m8_t _ceilx = __riscv_vadd_vx_i16m8_mu(_ceilmask, _xi, _xi, 1, vl); - vbool2_t _negative = __riscv_vmflt_vf_f16m8_b2(x, 0.f, vl); + vbool2_t _negative = __riscv_vmflt_vf_f16m8_b2(x, (__fp16)0.f, vl); return __riscv_vfcvt_f_x_v_f16m8(__riscv_vmerge_vvm_i16m8(_floorx, _ceilx, _negative, vl), vl); #else return __riscv_vfcvt_f_x_v_f16m8(__riscv_vfcvt_rtz_x_f_v_i16m8(x, vl), vl); #endif } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)truncf((float)x); + } +#endif // __riscv_zvfh }; } // namespace UnaryOp_riscv_functor @@ -347,6 +505,6 @@ int UnaryOp_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt return 0; } -#endif // __riscv_zvfh +#endif // NCNN_ZFH } // namespace ncnn diff --git a/src/platform.h.in b/src/platform.h.in index d9efed0b3bc..023d5d11023 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -62,6 +62,7 @@ #cmakedefine01 NCNN_LSX #cmakedefine01 NCNN_MMI #cmakedefine01 NCNN_RVV +#cmakedefine01 NCNN_ZFH #cmakedefine01 NCNN_ZVFH #cmakedefine01 NCNN_XTHEADVECTOR #cmakedefine01 NCNN_INT8
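The layering that repeats across every layer above can be condensed into one self-contained kernel. A minimal sketch, assuming the patch's conventions (scale_fp16s is a hypothetical helper, not ncnn code): NCNN_ZFH gates whether the fp16 code is compiled at all, __riscv_zvfh selects the vector or scalar body, and the same _zfh.cpp source is built once with -march=rv64gcv_zfh_zvfh for the vector body and once with -march=rv64gc_zfh -D__fp16=_Float16 for the scalar one.

#if NCNN_ZFH
#if __riscv_zvfh
#include <riscv_vector.h>
#endif

// hypothetical example kernel, not part of the patch
static void scale_fp16s(__fp16* ptr, int n, float k)
{
#if __riscv_zvfh
    // strip-mined RVV loop: vsetvl clamps vl to the remaining element count
    while (n > 0)
    {
        size_t vl = __riscv_vsetvl_e16m8(n);
        vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
        _p = __riscv_vfmul_vf_f16m8(_p, (__fp16)k, vl);
        __riscv_vse16_v_f16m8(ptr, _p, vl);
        ptr += vl;
        n -= vl;
    }
#else // __riscv_zvfh
    // scalar zfh fallback: same arithmetic, one element at a time
    for (int i = 0; i < n; i++)
        ptr[i] = (__fp16)((float)ptr[i] * k);
#endif // __riscv_zvfh
}
#endif // NCNN_ZFH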
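The refinement constants that gained (__fp16) casts in the reciprocal and rsqrt functors are one Newton-Raphson step applied to the roughly 7-bit estimates produced by vfrec7 and vfrsqrt7; a single step about doubles the correct bits, which is enough for __fp16's 11-bit significand. The same algebra in scalar form (illustrative helpers, not ncnn code):

// y0 ~= 1/x from vfrec7; mirrors vfrsub_vf(x * y0, 2.f) then vfmul by y0
static inline float refine_recip(float x, float y0)
{
    return (2.f - x * y0) * y0;
}

// y0 ~= 1/sqrt(x) from vfrsqrt7; mirrors the 0.5f / 1.5f chain in the functor
static inline float refine_rsqrt(float x, float y0)
{
    return (1.5f - 0.5f * x * y0 * y0) * y0;
}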