From 9634977303608833bb41437ee310bbdf37976637 Mon Sep 17 00:00:00 2001 From: nihui Date: Sat, 24 Jul 2021 20:25:18 +0800 Subject: [PATCH] [WIP] auto code-format bot (#849) * Create code-format.yml * trigger on push only * Update code-format.yml * apply code-format changes --- .clang-format | 298 +- .github/workflows/code-format.yml | 21 + benchmark/common/cmdline.hpp | 1412 ++++--- examples/common/common.h | 10 +- examples/common/compiler_fp16.h | 20 +- examples/common/msc_getopt.h | 735 ++-- examples/common/stb_image.h | 3627 ++++++++--------- examples/common/stb_image_write.h | 787 ++-- examples/common/tengine_operations.c | 134 +- examples/common/test_nnie_all.hpp | 48 +- examples/cpp_tm_classification.cpp | 91 +- examples/cpp_tm_mobilenet_ssd.cpp | 50 +- examples/tm_alphapose.cpp | 135 +- examples/tm_classification.c | 88 +- examples/tm_classification_acl.c | 88 +- examples/tm_classification_cuda.cpp | 92 +- examples/tm_classification_fp16.c | 84 +- examples/tm_classification_int8.c | 88 +- examples/tm_classification_timvx.c | 96 +- examples/tm_classification_trt.cpp | 91 +- examples/tm_classification_uint8.c | 88 +- examples/tm_classification_vulkan.c | 80 +- examples/tm_crnn.cpp | 48 +- examples/tm_efficientdet.c | 258 +- examples/tm_efficientdet_uint8.c | 266 +- examples/tm_hrnet.cpp | 175 +- examples/tm_hrnet_timvx.cpp | 204 +- examples/tm_landmark.cpp | 46 +- examples/tm_landmark_timvx.cpp | 46 +- examples/tm_landmark_uint8.cpp | 46 +- examples/tm_mobilefacenet.cpp | 32 +- examples/tm_mobilefacenet_uint8.cpp | 46 +- examples/tm_mobilenet_ssd.c | 56 +- examples/tm_mobilenet_ssd_acl.c | 56 +- examples/tm_mobilenet_ssd_uint8.cpp | 66 +- examples/tm_nanodet_m.cpp | 234 +- examples/tm_nanodet_m_timvx.cpp | 259 +- examples/tm_openpose.cpp | 67 +- examples/tm_retinaface.cpp | 86 +- examples/tm_ultraface.cpp | 121 +- examples/tm_unet.cpp | 206 +- examples/tm_yolact.cpp | 73 +- examples/tm_yolact_uint8.cpp | 112 +- examples/tm_yolofastest.cpp | 53 +- examples/tm_yolov3.cpp | 74 +- examples/tm_yolov3_tiny.cpp | 70 +- examples/tm_yolov3_tiny_uint8.cpp | 76 +- examples/tm_yolov3_uint8.cpp | 91 +- examples/tm_yolov4.cpp | 78 +- examples/tm_yolov4_tiny.cpp | 73 +- examples/tm_yolov4_tiny_timvx.cpp | 80 +- examples/tm_yolov4_tiny_uint8.cpp | 80 +- examples/tm_yolov4_uint8.cpp | 94 +- examples/tm_yolov5.cpp | 66 +- examples/tm_yolov5s.cpp | 120 +- examples/tm_yolov5s_timvx.cpp | 125 +- examples/tm_yolox.cpp | 114 +- examples/tm_yolox_timvx.cpp | 1153 +++--- source/api/c_api.c | 167 +- source/api/c_api.h | 101 +- source/api/plugin.c | 9 +- source/device/acl/acl_define.h | 3 +- source/device/acl/acl_device.hpp | 3 +- source/device/acl/acl_executor.hpp | 15 +- source/device/acl/acl_graph.hpp | 4 +- source/device/acl/acl_limit.hpp | 48 +- source/device/cpu/cpu_define.h | 35 +- source/device/cpu/cpu_device.c | 74 +- source/device/cpu/cpu_device.h | 3 - source/device/cpu/cpu_dump.c | 614 ++- source/device/cpu/cpu_dump.h | 1 - source/device/cpu/cpu_graph.c | 14 +- source/device/cpu/cpu_graph.h | 26 +- source/device/cpu/cpu_module.c | 17 +- source/device/cpu/cpu_module.h | 2 - source/device/cpu/cpu_node.c | 4 +- source/device/cpu/cpu_node.h | 17 +- source/device/cpu/cpu_pool.c | 37 +- source/device/cpu/cpu_pool.h | 10 +- source/device/cpu/op/absval/absval_ref.c | 11 +- .../cpu/op/absval/cortex-a/absval_hcl_arm.c | 12 +- source/device/cpu/op/add_n/add_n_ref.c | 15 +- source/device/cpu/op/argmax/argmax_ref.c | 15 +- source/device/cpu/op/argmin/argmin_ref.c | 15 +- .../cpu/op/batchnorm/batchnorm_kernel_ref.h | 1 - .../op/batchnorm/batchnorm_kernel_ref_fp32.c | 1 - .../op/batchnorm/batchnorm_kernel_ref_uint8.c | 9 +- .../device/cpu/op/batchnorm/batchnorm_ref.c | 30 +- .../op/batchnorm/cortex-a/batchnorm_hcl_arm.c | 28 +- .../batchnorm/cortex-a/batchnorm_kernel_arm.c | 11 +- .../batchnorm/cortex-a/batchnorm_kernel_arm.h | 1 - .../op/batchtospacend/batchtospacend_ref.c | 15 +- source/device/cpu/op/bias/bias_ref.c | 1 - source/device/cpu/op/broadmul/broadmul_ref.c | 2 - source/device/cpu/op/cast/cast_ref.c | 2 - source/device/cpu/op/ceil/ceil_ref.c | 11 +- source/device/cpu/op/clip/clip_kernel_ref.h | 2 - .../device/cpu/op/clip/clip_kernel_ref_fp32.c | 1 - .../cpu/op/clip/clip_kernel_ref_uint8.c | 13 +- source/device/cpu/op/clip/clip_ref.c | 3 +- .../comparison/comparison_kernel_ref_fp32.c | 61 +- .../device/cpu/op/comparison/comparison_ref.c | 3 +- .../device/cpu/op/concat/concat_kernel_ref.h | 2 - .../cpu/op/concat/concat_kernel_ref_fp32.c | 71 +- .../cpu/op/concat/concat_kernel_ref_int8.c | 105 +- .../cpu/op/concat/concat_kernel_ref_uint8.c | 111 +- source/device/cpu/op/concat/concat_ref.c | 19 +- source/device/cpu/op/conv/conv_kernel_ref.h | 9 +- .../device/cpu/op/conv/conv_kernel_ref_fp16.c | 29 +- .../device/cpu/op/conv/conv_kernel_ref_fp32.c | 16 +- .../device/cpu/op/conv/conv_kernel_ref_int8.c | 30 +- .../cpu/op/conv/conv_kernel_ref_uint8.c | 38 +- source/device/cpu/op/conv/conv_ref.c | 29 +- .../armv8.2/conv_dw_kernel_fp16_arm82.c | 16 +- .../armv8.2/conv_dw_kernel_fp16_arm82.h | 3 +- .../cortex-a/armv8.2/conv_kernel_fp16_arm82.c | 224 +- .../cortex-a/conv_dw_dilation_kernel_arm.h | 12 +- .../cpu/op/conv/cortex-a/conv_dw_hcl_arm.c | 45 +- .../conv/cortex-a/conv_dw_k5_k7_kernel_arm.h | 167 +- .../cpu/op/conv/cortex-a/conv_dw_kernel_arm.c | 17 +- .../cpu/op/conv/cortex-a/conv_dw_kernel_arm.h | 13 +- .../conv/cortex-a/conv_dw_kernel_int8_arm.c | 69 +- .../conv/cortex-a/conv_dw_kernel_int8_arm.h | 7 +- .../cpu/op/conv/cortex-a/conv_hcl_arm.c | 61 +- .../cpu/op/conv/cortex-a/conv_kernel_arm.c | 88 +- .../cpu/op/conv/cortex-a/conv_kernel_arm.h | 23 +- .../op/conv/cortex-a/conv_kernel_int8_arm.c | 790 ++-- .../op/conv/cortex-a/conv_kernel_int8_arm.h | 7 +- .../op/conv/cortex-a/wino_conv_kernel_1_arm.c | 424 +- .../op/conv/cortex-a/wino_conv_kernel_1_arm.h | 8 +- .../op/conv/cortex-a/wino_conv_kernel_arm.c | 251 +- .../op/conv/cortex-a/wino_conv_kernel_arm.h | 4 +- .../device/cpu/op/conv/cortex-m/conv_cmsis.c | 32 +- .../cpu/op/conv/mips/conv_dw_hcl_mips.c | 10 +- .../cpu/op/conv/mips/conv_dw_kernel_mips.c | 53 +- .../cpu/op/conv/mips/conv_dw_kernel_mips.h | 4 +- .../device/cpu/op/conv/mips/conv_hcl_mips.c | 28 +- .../cpu/op/conv/mips/conv_kernel_mips.c | 83 +- .../cpu/op/conv/mips/wino_conv_kernel_mips.c | 79 +- .../cpu/op/conv/mips/wino_conv_kernel_mips.h | 4 +- .../op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c | 19 +- .../conv/risc-v/lp64dv/conv_dw_kernel_rv64.c | 27 +- .../conv/risc-v/lp64dv/conv_dw_kernel_rv64.h | 1 - .../cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c | 42 +- .../op/conv/risc-v/lp64dv/conv_kernel_rv64.c | 97 +- .../op/conv/risc-v/lp64dv/conv_kernel_rv64.h | 2 +- .../op/conv/x86/conv_direct_hcl_int8_x86.c | 96 +- .../device/cpu/op/conv/x86/conv_dw_hcl_x86.c | 105 +- .../cpu/op/conv/x86/conv_dw_kernel_x86.c | 49 +- .../cpu/op/conv/x86/conv_dw_kernel_x86.h | 1 - source/device/cpu/op/conv/x86/conv_hcl_x86.c | 46 +- .../device/cpu/op/conv/x86/conv_kernel_x86.c | 243 +- .../device/cpu/op/conv/x86/conv_kernel_x86.h | 1 - .../cpu/op/conv/x86/wino_conv_kernel_x86.c | 80 +- .../cpu/op/conv/x86/wino_conv_kernel_x86.h | 1 - source/device/cpu/op/crop/crop_ref.c | 31 +- .../op/deconv/cortex_a/deconv_dw_hcl_arm.c | 11 +- .../op/deconv/cortex_a/deconv_dw_kernel_arm.c | 49 +- .../op/deconv/cortex_a/deconv_dw_kernel_arm.h | 15 +- .../cpu/op/deconv/cortex_a/deconv_hcl_arm.c | 23 +- .../op/deconv/cortex_a/deconv_kernel_arm.c | 273 +- .../op/deconv/cortex_a/deconv_kernel_arm.h | 32 +- source/device/cpu/op/deconv/deconv_ref.c | 72 +- .../cpu/op/depthtospace/depthtospace_ref.c | 1 - .../detection_output/detection_output_ref.c | 73 +- .../detection_postprocess_ref.c | 160 +- source/device/cpu/op/dropout/dropout_ref.c | 1 - .../cpu/op/eltwise/cortex-a/eltwise_hcl_arm.c | 5 +- .../cpu/op/eltwise/cortex-a/eltwise_hcl_arm.h | 1 - .../op/eltwise/cortex-a/eltwise_kernel_arm.c | 83 +- .../op/eltwise/cortex-a/eltwise_kernel_arm.h | 1 - source/device/cpu/op/eltwise/eltwise_ref.c | 1032 ++--- .../device/cpu/op/elu/cortex-a/elu_hcl_arm.c | 3 +- .../cpu/op/elu/cortex-a/elu_kernel_arm.c | 7 +- .../cpu/op/elu/cortex-a/elu_kernel_arm.h | 1 - source/device/cpu/op/elu/elu_ref.c | 14 +- .../device/cpu/op/embedding/embedding_ref.c | 11 +- source/device/cpu/op/expand/expand_ref.c | 61 +- .../device/cpu/op/expanddims/expanddims_ref.c | 1 - .../cortex-a/armv8.2/fc_kernel_fp16_arm82.c | 95 +- .../cortex-a/armv8.2/fc_kernel_fp16_arm82.h | 28 +- source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c | 40 +- .../device/cpu/op/fc/cortex-a/fc_kernel_arm.c | 21 +- .../device/cpu/op/fc/cortex-a/fc_kernel_arm.h | 1 - .../cpu/op/fc/cortex-a/fc_kernel_int8_arm.c | 179 +- .../cpu/op/fc/cortex-a/fc_kernel_int8_arm.h | 26 +- source/device/cpu/op/fc/cortex-m/fc_cmsis.c | 12 +- source/device/cpu/op/fc/fc_ref.c | 63 +- source/device/cpu/op/fc/x86/fc_hcl_x86.c | 30 +- source/device/cpu/op/flatten/flatten_ref.c | 1 - source/device/cpu/op/gather/gather_ref.c | 38 +- source/device/cpu/op/gru/gru_ref.c | 82 +- .../cpu/op/hardsigmoid/hardsigmoid_ref.c | 5 +- .../cpu/op/hardswish/hardswish_kernel_ref.h | 2 - .../op/hardswish/hardswish_kernel_ref_fp32.c | 3 +- .../op/hardswish/hardswish_kernel_ref_uint8.c | 7 +- .../device/cpu/op/hardswish/hardswish_ref.c | 4 +- source/device/cpu/op/input/input_ref.c | 1 - .../cpu/op/instancenorm/instancenorm_ref.c | 19 +- .../cpu/op/interp/cortex-a/interp_hcl_arm.c | 3 +- .../op/interp/cortex-a/interp_kernel_arm.c | 93 +- .../op/interp/cortex-a/interp_kernel_arm.h | 1 - source/device/cpu/op/interp/interp_ref.c | 68 +- .../op/l2normalization/l2normalization_ref.c | 8 +- source/device/cpu/op/l2pool/l2pool_ref.c | 46 +- source/device/cpu/op/logical/logical_ref.c | 49 +- source/device/cpu/op/logistic/logistic_ref.c | 7 +- .../device/cpu/op/logsoftmax/logsoftmax_ref.c | 48 +- .../device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c | 3 +- .../cpu/op/lrn/cortex-a/lrn_kernel_arm.c | 17 +- .../cpu/op/lrn/cortex-a/lrn_kernel_arm.h | 1 - source/device/cpu/op/lrn/lrn_ref.c | 7 +- source/device/cpu/op/lstm/lstm_ref.c | 146 +- source/device/cpu/op/matmul/matmul_ref.c | 3 +- source/device/cpu/op/maximum/maximum_ref.c | 15 +- source/device/cpu/op/mean/mean_ref.c | 17 +- source/device/cpu/op/minimum/minimum_ref.c | 15 +- .../cpu/op/mish/cortex-a/mish_hcl_arm.c | 5 +- .../cpu/op/mish/cortex-a/mish_kernel_arm.c | 7 +- .../cpu/op/mish/cortex-a/mish_kernel_arm.h | 1 - .../cpu/op/mish/cortex-a/mish_math_func.h | 1 - source/device/cpu/op/mish/mish_kernel_ref.h | 6 +- .../device/cpu/op/mish/mish_kernel_ref_fp32.c | 1 - .../cpu/op/mish/mish_kernel_ref_uint8.c | 10 +- source/device/cpu/op/mish/mish_ref.c | 7 +- source/device/cpu/op/mvn/mvn_ref.c | 7 +- source/device/cpu/op/noop/noop_ref.c | 38 +- .../device/cpu/op/normalize/normalize_ref.c | 17 +- source/device/cpu/op/pad/pad_ref.c | 26 +- source/device/cpu/op/permute/permute_ref.c | 15 +- .../cpu/op/pooling/cortex-a/pooling_hcl_arm.c | 17 +- .../cpu/op/pooling/cortex-a/pooling_hcl_arm.h | 45 +- .../pooling/cortex-a/pooling_hcl_arm_int8.h | 251 +- .../cpu/op/pooling/cortex-m/pooling_cmsis.c | 2 +- .../cpu/op/pooling/pooling_kernel_ref.h | 9 +- .../cpu/op/pooling/pooling_kernel_ref_fp16.c | 44 +- .../cpu/op/pooling/pooling_kernel_ref_fp32.c | 10 +- .../cpu/op/pooling/pooling_kernel_ref_int8.c | 31 +- .../cpu/op/pooling/pooling_kernel_ref_uint8.c | 17 +- source/device/cpu/op/pooling/pooling_ref.c | 28 +- .../cpu/op/prelu/cortex_a/prelu_hcl_arm.c | 4 +- .../cpu/op/prelu/cortex_a/prelu_kernel_arm.c | 1 - source/device/cpu/op/prelu/prelu_ref.c | 28 +- source/device/cpu/op/priorbox/priorbox_ref.c | 23 +- .../cpu/op/psroipooling/psroipooling_ref.c | 17 +- .../device/cpu/op/reciprocal/reciprocal_ref.c | 18 +- source/device/cpu/op/reducel2/reducel2_ref.c | 7 +- .../cpu/op/reduction/reduction_kernel_ref.h | 910 ++--- .../device/cpu/op/reduction/reduction_ref.c | 16 +- source/device/cpu/op/region/region_ref.c | 3 +- .../cpu/op/relu/cortex-a/relu_hcl_arm.c | 3 +- .../cpu/op/relu/cortex-a/relu_hcl_arm.h | 5 +- .../cpu/op/relu/cortex-a/relu_kernel_arm.c | 7 +- .../cpu/op/relu/cortex-a/relu_kernel_arm.h | 1 - .../device/cpu/op/relu/cortex-m/relu_cmsis.c | 1 - source/device/cpu/op/relu/relu_kernel_ref.h | 2 - .../device/cpu/op/relu/relu_kernel_ref_fp16.c | 5 +- .../device/cpu/op/relu/relu_kernel_ref_fp32.c | 1 - .../device/cpu/op/relu/relu_kernel_ref_int8.c | 7 +- .../cpu/op/relu/relu_kernel_ref_uint8.c | 7 +- source/device/cpu/op/relu/relu_ref.c | 9 +- source/device/cpu/op/relu1/relu1_ref.c | 1 - source/device/cpu/op/relu6/relu6_ref.c | 18 +- source/device/cpu/op/reorg/reorg_ref.c | 3 +- source/device/cpu/op/reshape/reshape_ref.c | 52 +- source/device/cpu/op/resize/resize_ref.c | 14 +- source/device/cpu/op/reverse/reverse_ref.c | 31 +- source/device/cpu/op/rnn/rnn_ref.c | 19 +- source/device/cpu/op/roialign/roialign_ref.c | 21 +- .../device/cpu/op/roipooling/roipooling_ref.c | 17 +- source/device/cpu/op/round/round_ref.c | 1 - source/device/cpu/op/rpn/rpn_ref.c | 37 +- source/device/cpu/op/scale/scale_ref.c | 3 +- source/device/cpu/op/scatter/scatter_ref.c | 276 +- .../cpu/op/selu/cortex-a/selu_hcl_arm.c | 3 +- .../cpu/op/selu/cortex-a/selu_kernel_arm.c | 7 +- .../cpu/op/selu/cortex-a/selu_kernel_arm.h | 1 - source/device/cpu/op/selu/selu_ref.c | 27 +- source/device/cpu/op/shape/shape_ref.c | 4 +- .../op/shuffle_channel/shuffle_channel_ref.c | 11 +- .../cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c | 1 - .../op/sigmoid/cortex-a/sigmoid_kernel_arm.c | 23 +- .../op/sigmoid/cortex-a/sigmoid_kernel_arm.h | 1 - source/device/cpu/op/sigmoid/sigmoid_ref.c | 35 +- source/device/cpu/op/slice/slice_ref.c | 67 +- .../cpu/op/softmax/cortex-a/softmax_hcl_arm.c | 34 +- .../cpu/op/softmax/cortex-m/softmax_cmsis.c | 6 +- .../cpu/op/softmax/softmax_kernel_ref.h | 14 +- .../cpu/op/softmax/softmax_kernel_ref_fp32.c | 7 +- .../cpu/op/softmax/softmax_kernel_ref_int8.c | 11 +- .../cpu/op/softmax/softmax_kernel_ref_uint8.c | 11 +- source/device/cpu/op/softmax/softmax_ref.c | 21 +- source/device/cpu/op/softplus/softplus_ref.c | 19 +- .../op/spacetobatchnd/spacetobatchnd_ref.c | 39 +- .../cpu/op/spacetodepth/spacetodepth_ref.c | 4 +- .../cpu/op/sparsetodense/sparsetodense_ref.c | 3 +- .../spatialtransformer_ref.c | 135 +- source/device/cpu/op/split/split_ref.c | 19 +- .../squareddifference/squareddifference_ref.c | 15 +- source/device/cpu/op/squeeze/squeeze_ref.c | 5 +- .../cpu/op/strided_slice/strided_slice_ref.c | 17 +- .../device/cpu/op/swap_axis/swap_axis_ref.c | 27 +- .../cpu/op/tanh/cortex-a/tanh_hcl_arm.c | 5 +- .../cpu/op/tanh/cortex-a/tanh_kernel_arm.c | 9 +- source/device/cpu/op/tanh/tanh_ref.c | 11 +- .../device/cpu/op/threshold/threshold_ref.c | 15 +- source/device/cpu/op/tile/tile_ref.c | 68 +- source/device/cpu/op/topkv2/topkv2_ref.c | 27 +- .../device/cpu/op/transpose/transpose_ref.c | 148 +- source/device/cpu/op/unary/unary_kernel_ref.h | 1 - .../cpu/op/unary/unary_kernel_ref_fp32.c | 209 +- .../cpu/op/unary/unary_kernel_ref_uint8.c | 215 +- source/device/cpu/op/unary/unary_ref.c | 9 +- .../device/cpu/op/unsqueeze/unsqueeze_ref.c | 5 +- source/device/cpu/op/upsample/upsample_ref.c | 9 +- source/device/cpu/op/where/where_ref.c | 15 +- .../device/cpu/op/zeroslike/zeroslike_ref.c | 3 +- source/device/cuda/cuda_device.hpp | 3 +- source/device/cuda/cuda_executor.hpp | 7 +- source/device/cuda/cuda_graph.hpp | 4 +- source/device/cuda/cuda_limit.hpp | 35 +- source/device/device.c | 4 +- source/device/device.h | 20 +- source/device/opencl/ocl_define.h | 3 +- source/device/opencl/ocl_device.hpp | 3 +- source/device/opencl/ocl_executor.hpp | 28 +- source/device/opencl/ocl_graph.hpp | 4 +- source/device/opencl/ocl_helper.hpp | 10 +- source/device/opencl/ocl_limit.hpp | 245 +- source/device/tensorrt/trt_define.h | 13 +- source/device/tensorrt/trt_device.hpp | 2 - source/device/tensorrt/trt_executor.hpp | 5 +- source/device/tensorrt/trt_graph.hpp | 1 - source/device/tensorrt/trt_helper.hpp | 97 +- source/device/tensorrt/trt_limit.hpp | 159 +- source/device/tim-vx/timvx_device.hpp | 3 +- source/device/tim-vx/timvx_dump.c | 1119 +++-- source/device/tim-vx/timvx_dump.h | 68 +- source/device/tim-vx/timvx_executor.hpp | 34 +- source/device/tim-vx/timvx_graph.hpp | 4 +- source/device/tim-vx/timvx_limit.hpp | 138 +- source/device/vulkan/layer/concat_vulkan.cpp | 63 +- source/device/vulkan/layer/concat_vulkan.hpp | 6 +- .../vulkan/layer/convolution_vulkan.cpp | 114 +- .../vulkan/layer/convolution_vulkan.hpp | 10 +- .../layer/convolutiondepthwise_vulkan.cpp | 157 +- .../layer/convolutiondepthwise_vulkan.hpp | 9 +- source/device/vulkan/layer/crop_vulkan.cpp | 92 +- source/device/vulkan/layer/crop_vulkan.hpp | 8 +- source/device/vulkan/layer/dropout_vulkan.cpp | 29 +- source/device/vulkan/layer/dropout_vulkan.hpp | 7 +- source/device/vulkan/layer/eltwise_vulkan.cpp | 49 +- source/device/vulkan/layer/eltwise_vulkan.hpp | 8 +- source/device/vulkan/layer/flatten_vulkan.cpp | 50 +- source/device/vulkan/layer/flatten_vulkan.hpp | 7 +- .../vulkan/layer/innerproduct_vulkan.cpp | 59 +- .../vulkan/layer/innerproduct_vulkan.hpp | 6 +- source/device/vulkan/layer/interp_vulkan.cpp | 86 +- source/device/vulkan/layer/interp_vulkan.hpp | 10 +- source/device/vulkan/layer/packing_vulkan.cpp | 9 +- source/device/vulkan/layer/packing_vulkan.hpp | 3 +- source/device/vulkan/layer/padding_vulkan.cpp | 44 +- source/device/vulkan/layer/padding_vulkan.hpp | 5 +- source/device/vulkan/layer/permute_vulkan.cpp | 61 +- source/device/vulkan/layer/permute_vulkan.hpp | 6 +- source/device/vulkan/layer/pooling_vulkan.cpp | 67 +- source/device/vulkan/layer/pooling_vulkan.hpp | 23 +- .../device/vulkan/layer/priorbox_vulkan.cpp | 41 +- .../device/vulkan/layer/priorbox_vulkan.hpp | 6 +- source/device/vulkan/layer/relu_vulkan.cpp | 38 +- source/device/vulkan/layer/relu_vulkan.hpp | 6 +- source/device/vulkan/layer/reshape_vulkan.cpp | 79 +- source/device/vulkan/layer/reshape_vulkan.hpp | 7 +- source/device/vulkan/layer/softmax_vulkan.cpp | 63 +- source/device/vulkan/layer/softmax_vulkan.hpp | 7 +- source/device/vulkan/layer_shader_type.h | 2 +- source/device/vulkan/vulkan_allocator.cpp | 200 +- source/device/vulkan/vulkan_allocator.hpp | 53 +- source/device/vulkan/vulkan_command.cpp | 376 +- source/device/vulkan/vulkan_command.hpp | 113 +- source/device/vulkan/vulkan_define.h | 3 +- source/device/vulkan/vulkan_device.hpp | 3 +- source/device/vulkan/vulkan_executor.hpp | 19 +- source/device/vulkan/vulkan_gpu.cpp | 340 +- source/device/vulkan/vulkan_gpu.hpp | 28 +- source/device/vulkan/vulkan_graph.hpp | 32 +- source/device/vulkan/vulkan_helper.hpp | 6 +- source/device/vulkan/vulkan_layer.cpp | 2 +- source/device/vulkan/vulkan_layer.hpp | 5 +- source/device/vulkan/vulkan_limit.hpp | 245 +- source/device/vulkan/vulkan_option.cpp | 2 +- source/device/vulkan/vulkan_pipeline.cpp | 53 +- source/device/vulkan/vulkan_pipeline.hpp | 2 +- source/device/vulkan/vulkan_platform.hpp | 74 +- source/device/vulkan/vulkan_tensor.cpp | 16 +- source/device/vulkan/vulkan_tensor.hpp | 50 +- source/executer/executer.c | 28 +- source/executer/executer.h | 33 +- source/graph/graph.c | 46 +- source/graph/graph.h | 47 +- source/graph/node.c | 44 +- source/graph/node.h | 31 +- source/graph/subgraph.c | 28 +- source/graph/subgraph.h | 29 +- source/graph/tensor.c | 54 +- source/graph/tensor.h | 67 +- source/module/module.c | 45 +- source/module/module.h | 20 - source/operator/op.c | 2 - source/operator/op.h | 27 +- source/operator/op_name.h | 204 +- source/operator/prototype/absval.c | 6 - source/operator/prototype/add_n.c | 4 - source/operator/prototype/argmax.c | 11 +- source/operator/prototype/argmin.c | 13 +- source/operator/prototype/batchnorm.c | 7 +- source/operator/prototype/batchtospacend.c | 17 +- source/operator/prototype/bias.c | 9 +- source/operator/prototype/broadmul.c | 9 +- source/operator/prototype/cast.c | 4 - source/operator/prototype/ceil.c | 8 +- source/operator/prototype/clip.c | 12 +- source/operator/prototype/comparison.c | 9 +- source/operator/prototype/concat.c | 9 +- source/operator/prototype/const.c | 10 +- source/operator/prototype/convolution.c | 17 +- source/operator/prototype/convolution_param.h | 22 +- source/operator/prototype/crop.c | 10 +- source/operator/prototype/deconvolution.c | 6 +- source/operator/prototype/depthtospace.c | 19 +- source/operator/prototype/detection_output.c | 11 +- .../prototype/detection_postprocess.c | 17 +- .../prototype/detection_postprocess_param.h | 2 +- source/operator/prototype/dropout.c | 8 +- source/operator/prototype/eltwise.c | 10 +- source/operator/prototype/elu.c | 7 +- source/operator/prototype/embedding.c | 10 +- source/operator/prototype/embedding_param.h | 2 +- source/operator/prototype/expand.c | 76 +- source/operator/prototype/expanddims.c | 9 +- source/operator/prototype/fc.c | 3 +- source/operator/prototype/flatten.c | 10 +- source/operator/prototype/gather.c | 50 +- source/operator/prototype/gemm.c | 11 +- source/operator/prototype/generic.c | 7 +- source/operator/prototype/generic_param.h | 2 +- source/operator/prototype/gru.c | 10 +- source/operator/prototype/gru_param.h | 2 +- source/operator/prototype/hardsigmoid.c | 7 +- source/operator/prototype/hardswish.c | 6 +- source/operator/prototype/input.c | 10 +- source/operator/prototype/instancenorm.c | 7 +- source/operator/prototype/interp.c | 13 +- source/operator/prototype/l2normalization.c | 9 +- source/operator/prototype/l2pool.c | 29 +- source/operator/prototype/l2pool_param.h | 3 +- source/operator/prototype/logical.c | 7 +- source/operator/prototype/logsoftmax.c | 9 +- source/operator/prototype/lrn.c | 8 +- source/operator/prototype/lstm.c | 9 +- source/operator/prototype/lstm_param.h | 2 +- source/operator/prototype/matmul.c | 5 - source/operator/prototype/maximum.c | 3 - source/operator/prototype/mean.c | 3 - source/operator/prototype/minimum.c | 3 - source/operator/prototype/mish.c | 9 +- source/operator/prototype/mvn.c | 6 +- source/operator/prototype/noop.c | 4 - source/operator/prototype/normalize.c | 6 +- source/operator/prototype/pad.c | 20 +- source/operator/prototype/pad_param.h | 10 +- source/operator/prototype/permute.c | 9 +- source/operator/prototype/pooling.c | 13 +- source/operator/prototype/pooling_param.h | 4 +- source/operator/prototype/prelu.c | 10 +- source/operator/prototype/priorbox.c | 12 +- source/operator/prototype/psroipooling.c | 7 +- source/operator/prototype/reciprocal.c | 4 +- source/operator/prototype/reducel2.c | 13 +- source/operator/prototype/reduction.c | 18 +- source/operator/prototype/region.c | 6 +- source/operator/prototype/relu.c | 7 +- source/operator/prototype/relu1.c | 10 +- source/operator/prototype/relu6.c | 9 +- source/operator/prototype/reorg.c | 9 +- source/operator/prototype/reshape.c | 42 +- source/operator/prototype/resize.c | 17 +- source/operator/prototype/resize_param.h | 2 +- source/operator/prototype/reverse.c | 5 - source/operator/prototype/rnn.c | 9 +- source/operator/prototype/roialign.c | 10 +- source/operator/prototype/roipooling.c | 8 +- source/operator/prototype/round.c | 5 - source/operator/prototype/rpn.c | 32 +- source/operator/prototype/scale.c | 7 +- source/operator/prototype/scatter.c | 8 +- source/operator/prototype/selu.c | 7 +- source/operator/prototype/shape.c | 9 +- source/operator/prototype/shuffle_channel.c | 8 +- source/operator/prototype/sigmoid.c | 5 - source/operator/prototype/slice.c | 26 +- source/operator/prototype/slice_param.h | 1 - source/operator/prototype/softmax.c | 7 +- source/operator/prototype/softplus.c | 5 +- source/operator/prototype/spacetobatchnd.c | 14 +- source/operator/prototype/spacetodepth.c | 19 +- source/operator/prototype/sparsetodense.c | 7 +- .../operator/prototype/spatialtransformer.c | 27 +- source/operator/prototype/split.c | 15 +- source/operator/prototype/squareddifference.c | 9 +- source/operator/prototype/squeeze.c | 13 +- source/operator/prototype/strided_slice.c | 32 +- source/operator/prototype/swap_axis.c | 11 +- source/operator/prototype/tanh.c | 7 +- source/operator/prototype/threshold.c | 7 +- source/operator/prototype/tile.c | 89 +- source/operator/prototype/topkv2.c | 10 +- source/operator/prototype/transpose.c | 14 +- source/operator/prototype/unary.c | 8 +- source/operator/prototype/unsqueeze.c | 12 +- source/operator/prototype/upsample.c | 9 +- source/operator/prototype/where.c | 9 +- source/operator/prototype/zeroslike.c | 9 +- source/optimizer/estimation.c | 11 +- source/optimizer/estimation.h | 15 +- source/optimizer/helper.c | 4 - source/optimizer/helper.h | 1 - source/optimizer/split.c | 13 +- source/optimizer/split.h | 1 - source/scheduler/scheduler.c | 22 +- source/scheduler/scheduler.h | 10 +- source/serializer/serializer.c | 1 - source/serializer/serializer.h | 2 - source/serializer/tmfile/op/tm2_add_n.c | 3 - source/serializer/tmfile/op/tm2_argmax.c | 6 +- source/serializer/tmfile/op/tm2_argmin.c | 8 +- source/serializer/tmfile/op/tm2_batchnorm.c | 8 +- .../serializer/tmfile/op/tm2_batchtospacend.c | 8 +- source/serializer/tmfile/op/tm2_bias.c | 4 - source/serializer/tmfile/op/tm2_broadmul.c | 4 - source/serializer/tmfile/op/tm2_cast.c | 8 +- source/serializer/tmfile/op/tm2_ceil.c | 4 - source/serializer/tmfile/op/tm2_clip.c | 8 +- source/serializer/tmfile/op/tm2_comparison.c | 8 +- source/serializer/tmfile/op/tm2_concat.c | 8 +- source/serializer/tmfile/op/tm2_conv.c | 8 +- source/serializer/tmfile/op/tm2_crop.c | 8 +- source/serializer/tmfile/op/tm2_deconv.c | 16 +- .../serializer/tmfile/op/tm2_depthtospace.c | 8 +- .../tmfile/op/tm2_detection_output.c | 8 +- .../tmfile/op/tm2_detection_postprocess.c | 12 +- source/serializer/tmfile/op/tm2_dropout.c | 4 - source/serializer/tmfile/op/tm2_eltwise.c | 8 +- source/serializer/tmfile/op/tm2_elu.c | 8 +- source/serializer/tmfile/op/tm2_embedding.c | 8 +- source/serializer/tmfile/op/tm2_expand.c | 15 +- source/serializer/tmfile/op/tm2_expanddims.c | 8 +- source/serializer/tmfile/op/tm2_fc.c | 8 +- source/serializer/tmfile/op/tm2_flatten.c | 8 +- source/serializer/tmfile/op/tm2_gather.c | 14 +- source/serializer/tmfile/op/tm2_gemm.c | 8 +- source/serializer/tmfile/op/tm2_generic.c | 10 +- source/serializer/tmfile/op/tm2_gru.c | 8 +- source/serializer/tmfile/op/tm2_hardsigmoid.c | 8 +- source/serializer/tmfile/op/tm2_hardswish.c | 8 +- .../serializer/tmfile/op/tm2_instancenorm.c | 8 +- source/serializer/tmfile/op/tm2_interp.c | 8 +- .../tmfile/op/tm2_l2normalization.c | 4 - source/serializer/tmfile/op/tm2_l2pool.c | 6 +- source/serializer/tmfile/op/tm2_logical.c | 8 +- source/serializer/tmfile/op/tm2_logistic.c | 4 - source/serializer/tmfile/op/tm2_logsoftmax.c | 4 - source/serializer/tmfile/op/tm2_lrn.c | 8 +- source/serializer/tmfile/op/tm2_lstm.c | 8 +- source/serializer/tmfile/op/tm2_matmul.c | 4 - source/serializer/tmfile/op/tm2_maximum.c | 6 +- source/serializer/tmfile/op/tm2_mean.c | 4 - source/serializer/tmfile/op/tm2_mish.c | 4 - source/serializer/tmfile/op/tm2_mvn.c | 8 +- source/serializer/tmfile/op/tm2_noop.c | 4 - source/serializer/tmfile/op/tm2_normalize.c | 8 +- source/serializer/tmfile/op/tm2_pad.c | 8 +- source/serializer/tmfile/op/tm2_permute.c | 8 +- source/serializer/tmfile/op/tm2_pool.c | 7 +- source/serializer/tmfile/op/tm2_prelu.c | 4 - source/serializer/tmfile/op/tm2_priorbox.c | 5 - .../serializer/tmfile/op/tm2_psroipooling.c | 8 +- source/serializer/tmfile/op/tm2_reciprocal.c | 2 +- source/serializer/tmfile/op/tm2_reducel2.c | 8 +- source/serializer/tmfile/op/tm2_reduction.c | 8 +- source/serializer/tmfile/op/tm2_region.c | 8 +- source/serializer/tmfile/op/tm2_relu.c | 8 +- source/serializer/tmfile/op/tm2_relu1.c | 4 - source/serializer/tmfile/op/tm2_relu6.c | 4 - source/serializer/tmfile/op/tm2_reorg.c | 8 +- source/serializer/tmfile/op/tm2_reshape.c | 12 +- source/serializer/tmfile/op/tm2_resize.c | 8 +- source/serializer/tmfile/op/tm2_reverse.c | 4 - source/serializer/tmfile/op/tm2_rnn.c | 16 +- source/serializer/tmfile/op/tm2_roialign.c | 8 +- source/serializer/tmfile/op/tm2_roipooling.c | 8 +- source/serializer/tmfile/op/tm2_round.c | 6 +- source/serializer/tmfile/op/tm2_rpn.c | 12 +- source/serializer/tmfile/op/tm2_scale.c | 8 +- source/serializer/tmfile/op/tm2_scatter.c | 6 +- source/serializer/tmfile/op/tm2_selu.c | 8 +- source/serializer/tmfile/op/tm2_shape.c | 4 - .../tmfile/op/tm2_shuffle_channel.c | 8 +- source/serializer/tmfile/op/tm2_sigmoid.c | 4 - source/serializer/tmfile/op/tm2_slice.c | 18 +- source/serializer/tmfile/op/tm2_softmax.c | 8 +- source/serializer/tmfile/op/tm2_softplus.c | 2 +- .../serializer/tmfile/op/tm2_spacetobatchnd.c | 8 +- .../serializer/tmfile/op/tm2_spacetodepth.c | 4 - .../serializer/tmfile/op/tm2_sparsetodense.c | 8 +- .../tmfile/op/tm2_spatialtransformer.c | 14 +- source/serializer/tmfile/op/tm2_split.c | 12 +- .../tmfile/op/tm2_squareddifference.c | 4 - source/serializer/tmfile/op/tm2_squeeze.c | 8 +- .../serializer/tmfile/op/tm2_strided_slice.c | 8 +- source/serializer/tmfile/op/tm2_swap_axis.c | 8 +- source/serializer/tmfile/op/tm2_tanh.c | 4 - source/serializer/tmfile/op/tm2_threshold.c | 8 +- source/serializer/tmfile/op/tm2_tile.c | 10 +- source/serializer/tmfile/op/tm2_topkv2.c | 8 +- source/serializer/tmfile/op/tm2_transpose.c | 6 +- source/serializer/tmfile/op/tm2_unary.c | 8 +- source/serializer/tmfile/op/tm2_unsqueeze.c | 12 +- source/serializer/tmfile/op/tm2_upsample.c | 8 +- source/serializer/tmfile/op/tm2_where.c | 4 - source/serializer/tmfile/op/tm2_zeroslike.c | 4 - source/serializer/tmfile/tm2_format.h | 521 ++- source/serializer/tmfile/tm2_serializer.c | 173 +- source/serializer/tmfile/tm2_serializer.h | 9 +- source/system/cpu.c | 40 +- source/utility/float.c | 60 +- source/utility/float.h | 56 +- source/utility/lock.c | 30 +- source/utility/lock.h | 15 +- source/utility/log.c | 78 +- source/utility/log.h | 124 +- source/utility/math.c | 9 - source/utility/math.h | 8 - source/utility/mem_stat.c | 6 +- source/utility/sys_port.c | 2 +- source/utility/sys_port.h | 4 +- source/utility/utils.c | 126 +- source/utility/utils.h | 11 - source/utility/vector.c | 25 +- source/utility/vector.h | 27 +- tests/common/common.h | 10 +- tests/common/compiler_fp16.h | 20 +- tests/common/stb_image.h | 3627 ++++++++--------- tests/common/stb_image_write.h | 787 ++-- tests/common/tengine_operations.c | 136 +- tests/common/util/mathp.c | 9 - tests/common/util/mathp.h | 8 - tests/common/util/vector.c | 25 +- tests/common/util/vector.h | 28 +- tests/models/test_model_alphapose.cpp | 57 +- tests/models/test_model_classification.cpp | 100 +- tests/models/test_model_common.cpp | 48 +- tests/models/test_model_crnn.cpp | 44 +- tests/models/test_model_efficientdet.c | 99 +- tests/models/test_model_hrnet.cpp | 63 +- tests/models/test_model_landmark.cpp | 48 +- tests/models/test_model_mobilefacenet.cpp | 38 +- tests/models/test_model_mobilenet_ssd.c | 54 +- tests/models/test_model_nanodet_m.cpp | 113 +- tests/models/test_model_openpose.cpp | 45 +- tests/models/test_model_retinaface.cpp | 77 +- tests/models/test_model_ultraface.cpp | 61 +- tests/models/test_model_unet.cpp | 105 +- tests/models/test_model_yolact.cpp | 57 +- tests/models/test_model_yolofastest.cpp | 41 +- tests/models/test_model_yolov3.cpp | 28 +- tests/models/test_model_yolov3_tiny.cpp | 62 +- tests/models/test_model_yolov4.cpp | 74 +- tests/models/test_model_yolov4_tiny.cpp | 61 +- tests/models/test_model_yolov5s.cpp | 53 +- tests/models/test_timvx_model_yolov5s.cpp | 125 +- tests/op/test_onnx_op.h | 7 +- tests/op/test_onnx_op_abs.cpp | 11 +- tests/op/test_onnx_op_acos.cpp | 11 +- tests/op/test_onnx_op_add.cpp | 13 +- tests/op/test_onnx_op_asin.cpp | 11 +- tests/op/test_onnx_op_atan.cpp | 11 +- .../test_onnx_op_averagepool_2d_default.cpp | 11 +- tests/op/test_onnx_op_averagepool_2d_pads.cpp | 11 +- .../test_onnx_op_basic_conv_with_padding.cpp | 13 +- ...est_onnx_op_basic_conv_without_padding.cpp | 13 +- tests/op/test_onnx_op_ceil.cpp | 11 +- tests/op/test_onnx_op_clip_example.cpp | 17 +- tests/op/test_onnx_op_concat_1d_axis_0.cpp | 15 +- tests/op/test_onnx_op_concat_2d_axis_0.cpp | 15 +- tests/op/test_onnx_op_concat_2d_axis_1.cpp | 15 +- tests/op/test_onnx_op_concat_3d_axis_0.cpp | 13 +- tests/op/test_onnx_op_concat_3d_axis_1.cpp | 13 +- tests/op/test_onnx_op_concat_3d_axis_2.cpp | 13 +- ...t_onnx_op_conv_with_strides_no_padding.cpp | 13 +- ...test_onnx_op_conv_with_strides_padding.cpp | 13 +- tests/op/test_onnx_op_convtranspose.cpp | 13 +- .../test_onnx_op_convtranspose_dilations.cpp | 13 +- tests/op/test_onnx_op_convtranspose_pad.cpp | 13 +- tests/op/test_onnx_op_convtranspose_pads.cpp | 13 +- tests/op/test_onnx_op_cos.cpp | 11 +- .../op/test_onnx_op_depthtospace_dcr_mode.cpp | 11 +- tests/op/test_onnx_op_div.cpp | 13 +- tests/op/test_onnx_op_dropout_default.cpp | 11 +- tests/op/test_onnx_op_elu.cpp | 11 +- tests/op/test_onnx_op_equal.cpp | 13 +- tests/op/test_onnx_op_exp.cpp | 11 +- .../op/test_onnx_op_expand_dim_unchanged.cpp | 14 +- tests/op/test_onnx_op_floor.cpp | 11 +- tests/op/test_onnx_op_globalaveragepool.cpp | 11 +- tests/op/test_onnx_op_greater.cpp | 11 +- tests/op/test_onnx_op_gru_defaults.cpp | 17 +- tests/op/test_onnx_op_gru_seq_length.cpp | 23 +- .../op/test_onnx_op_gru_with_initial_bias.cpp | 23 +- tests/op/test_onnx_op_hardsigmoid.cpp | 11 +- .../op/test_onnx_op_instancenorm_epsilon.cpp | 27 +- .../op/test_onnx_op_instancenorm_example.cpp | 27 +- tests/op/test_onnx_op_leakyrelu.cpp | 11 +- tests/op/test_onnx_op_less.cpp | 13 +- tests/op/test_onnx_op_log.cpp | 11 +- .../test_onnx_op_logsoftmax_default_axis.cpp | 11 +- tests/op/test_onnx_op_lstm_defaults.cpp | 17 +- .../test_onnx_op_lstm_with_initial_bias.cpp | 23 +- tests/op/test_onnx_op_matmul_2d.cpp | 13 +- tests/op/test_onnx_op_matmul_3d.cpp | 13 +- tests/op/test_onnx_op_matmul_4d.cpp | 13 +- tests/op/test_onnx_op_maxpool_2d_default.cpp | 11 +- .../op/test_onnx_op_maxpool_2d_dilations.cpp | 11 +- tests/op/test_onnx_op_maxpool_2d_pads.cpp | 11 +- tests/op/test_onnx_op_neg.cpp | 11 +- tests/op/test_onnx_op_pow.cpp | 13 +- tests/op/test_onnx_op_reciprocal.cpp | 11 +- .../test_onnx_op_reduce_log_sum_default.cpp | 11 +- ...educe_max_default_axes_keepdim_example.cpp | 11 +- ...uce_mean_default_axes_keepdims_example.cpp | 11 +- ...duce_min_default_axes_keepdims_example.cpp | 11 +- ...m_square_default_axes_keepdims_example.cpp | 11 +- tests/op/test_onnx_op_relu.cpp | 11 +- tests/op/test_onnx_op_selu.cpp | 11 +- tests/op/test_onnx_op_selu_default.cpp | 11 +- .../op/test_onnx_op_softmax_default_axis.cpp | 11 +- tests/op/test_onnx_op_softplus.cpp | 11 +- tests/op/test_onnx_op_squeeze.cpp | 11 +- tests/op/test_onnx_op_sub.cpp | 13 +- tests/op/test_onnx_op_tanh.cpp | 11 +- tests/op/test_onnx_op_unsqueeze_axis_1.cpp | 11 +- tests/op/test_op.h | 499 ++- tests/op/test_op_conv.c | 70 +- tests/op/test_op_prelu.c | 17 +- tests/op/test_op_relu.c | 14 +- tests/op/test_op_relu6.c | 14 +- tests/op/test_tensorrt_op_clip.cpp | 239 +- tests/op/test_tensorrt_op_concat.cpp | 338 +- tests/op/test_tensorrt_op_deconv.cpp | 469 ++- tests/op/test_tensorrt_op_dropout.cpp | 277 +- tests/op/test_tensorrt_op_eltwise.cpp | 343 +- tests/op/test_tensorrt_op_fc.cpp | 315 +- tests/op/test_timvx_op_clip.cpp | 22 +- tests/op/test_timvx_op_concat.cpp | 412 +- tests/op/test_timvx_op_convolution.cpp | 41 +- tests/op/test_timvx_op_deconv.cpp | 537 +-- tests/op/test_timvx_op_dropout.cpp | 22 +- tests/op/test_timvx_op_eltwise_mul.cpp | 409 +- tests/op/test_timvx_op_eltwise_sum.cpp | 409 +- tests/op/test_timvx_op_elu.cpp | 22 +- tests/op/test_timvx_op_fc.cpp | 401 +- tests/op/test_timvx_op_flatten.cpp | 346 +- tests/op/test_timvx_op_gather.cpp | 356 +- tests/op/test_timvx_op_hardswish.cpp | 22 +- tests/op/test_timvx_op_interp.cpp | 364 +- tests/op/test_timvx_op_leakyrelu.cpp | 24 +- tests/op/test_timvx_op_mish.cpp | 21 +- tests/op/test_timvx_op_permute.cpp | 361 +- tests/op/test_timvx_op_pooling.cpp | 26 +- tests/op/test_timvx_op_prelu.cpp | 24 +- tests/op/test_timvx_op_relu.cpp | 22 +- tests/op/test_timvx_op_relu1.cpp | 22 +- tests/op/test_timvx_op_reshape.cpp | 372 +- tests/op/test_timvx_op_resize.cpp | 360 +- tests/op/test_timvx_op_sigmoid.cpp | 22 +- tests/op/test_timvx_op_slice.cpp | 354 +- tests/op/test_timvx_op_softmax.cpp | 344 +- tests/op/test_timvx_op_split.cpp | 424 +- tests/op/test_timvx_op_tanh.cpp | 21 +- tests/op/test_timvx_op_transpose.cpp | 406 +- tests/op/test_timvx_op_upsampling.cpp | 356 +- tools/convert_tool/caffe/caffe2tengine.cpp | 177 +- tools/convert_tool/caffe/caffe2tengine.hpp | 33 +- tools/convert_tool/convert_tool.cpp | 57 +- tools/convert_tool/ncnn/ncnn2tengine.cpp | 311 +- tools/convert_tool/ncnn/ncnn2tengine.hpp | 39 +- tools/convert_tool/onnx/onnx2tengine.cpp | 564 ++- tools/convert_tool/onnx/onnx2tengine.hpp | 31 +- .../utils/graph_optimizer/graph_opt.cpp | 112 +- .../utils/graph_optimizer/graph_opt.hpp | 29 +- .../utils/save_graph/save_graph.cpp | 96 +- .../utils/save_graph/save_graph.hpp | 5 +- .../utils/save_graph/tm2_generate.c | 2 +- .../utils/save_graph/tm2_op_save.cpp | 483 ++- .../utils/save_graph/tm2_op_save.hpp | 14 +- tools/quantize/compiler_fp16.h | 20 +- tools/quantize/quant_save_graph.cpp | 2044 +++++----- tools/quantize/quant_save_graph.hpp | 106 +- tools/quantize/quant_tool.hpp | 32 +- tools/quantize/quant_tool_int8.cpp | 144 +- tools/quantize/quant_tool_uint8.cpp | 166 +- .../quantize/quant_tool_uint8_perchannel.cpp | 1107 +++-- tools/quantize/quant_utils.cpp | 1093 +++-- tools/quantize/quant_utils.hpp | 97 +- tools/quantize/savegraph/save_graph.cpp | 95 +- tools/quantize/savegraph/save_graph.hpp | 5 +- tools/quantize/savegraph/tm2_format.h | 594 +-- tools/quantize/savegraph/tm2_generate.c | 2 +- tools/quantize/savegraph/tm2_op_save.cpp | 456 +-- tools/quantize/savegraph/tm2_op_save.hpp | 13 +- 820 files changed, 28221 insertions(+), 29759 deletions(-) create mode 100644 .github/workflows/code-format.yml diff --git a/.clang-format b/.clang-format index a969255c2..3519ad950 100644 --- a/.clang-format +++ b/.clang-format @@ -1,170 +1,132 @@ -Language: Cpp - -AccessModifierOffset: -4 - -AlignAfterOpenBracket: Align -AlignConsecutiveAssignments: AcrossEmptyLines -AlignConsecutiveBitFields: AcrossEmptyLinesAndComments -AlignConsecutiveDeclarations: Consecutive -AlignConsecutiveMacros: AcrossEmptyLines -AlignEscapedNewlines: Right -AlignOperands: true -AlignTrailingComments: true - -AllowAllArgumentsOnNextLine: true -AllowAllConstructorInitializersOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true - -AllowShortBlocksOnASingleLine: Empty -AllowShortCaseLabelsOnASingleLine: false -AllowShortEnumsOnASingleLine: false -AllowShortFunctionsOnASingleLine: Empty -AllowShortIfStatementsOnASingleLine: Never -AllowShortLambdasOnASingleLine: Inline -AllowShortLoopsOnASingleLine: false - -# AlwaysBreakAfterDefinitionReturnType is deprecated -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakTemplateDeclarations: Yes - -BinPackArguments: true -BinPackParameters: true - -BitFieldColonSpacing: Both - +# find src/ tools/ tests/ examples/ benchmark/ -type f -name '*.c' -o -name '*.cpp' -o -name '*.h' | xargs -i clang-format -i {} + +# need clang-format >= 10.0 + +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +# AlignConsecutiveBitFields: true +AlignConsecutiveDeclarations: false +AlignConsecutiveMacros: true +AlignEscapedNewlines: Left +# AlignOperands: AlignAfterOperator +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: Always +AllowShortCaseLabelsOnASingleLine: true +# AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: WithoutElse +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: true +BinPackParameters: true BraceWrapping: - AfterCaseLabel: true - AfterClass: false - AfterControlStatement: Always - AfterEnum: true - AfterFunction: true - AfterNamespace: true - AfterObjCDeclaration: true - AfterStruct: true - AfterUnion: true - AfterExternBlock: true - BeforeCatch: true - BeforeElse: true - BeforeLambdaBody: false - BeforeWhile: true - IndentBraces: false - SplitEmptyFunction: false - SplitEmptyRecord: false - SplitEmptyNamespace: false -BreakBeforeBinaryOperators: NonAssignment -BreakBeforeBraces: Custom -BreakBeforeConceptDeclarations: true -BreakBeforeTernaryOperators: false -BreakConstructorInitializers: BeforeColon -BreakConstructorInitializersBeforeComma: false -BreakInheritanceList: BeforeColon -BreakStringLiterals: false - -ColumnLimit: 120 - -CommentPragmas: '^ AYU pragma:' - -CompactNamespaces: false - -ConstructorInitializerAllOnOneLineOrOnePerLine: false -ConstructorInitializerIndentWidth: 4 - -ContinuationIndentWidth: 4 - -Cpp11BracedListStyle: false - -DeriveLineEnding: true -DerivePointerAlignment: false - -DisableFormat: false - -ExperimentalAutoDetectBinPacking: false - -FixNamespaceComments: true - -ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] - -IncludeCategories: - - Regex: '^"(llvm|llvm-c|clang|clang-c)/' - Priority: 2 - - Regex: '^(<|"(gtest|isl|json)/)' - Priority: 3 - - Regex: '.*' - Priority: 1 - -IndentCaseBlocks: true -IndentCaseLabels: false -IndentExternBlock: AfterExternBlock -IndentGotoLabels: false -IndentPPDirectives: BeforeHash -IndentRequires: true -IndentWidth: 4 -IndentWrappedFunctionNames: false - -InsertTrailingCommas: Wrapped - -KeepEmptyLinesAtTheStartOfBlocks: false - -MacroBlockBegin: '' -MacroBlockEnd: '' - -MaxEmptyLinesToKeep: 3 - + AfterCaseLabel: true + AfterClass: true + AfterControlStatement: Always + AfterEnum: true + AfterFunction: true + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: true + AfterUnion: true + AfterExternBlock: false + BeforeCatch: true + BeforeElse: true +# BeforeLambdaBody: false +# BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: false +BreakAfterJavaFieldAnnotations: true +BreakBeforeBinaryOperators: All +BreakBeforeBraces: Custom +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeColon +BreakInheritanceList: BeforeColon +BreakStringLiterals: false +ColumnLimit: 0 +# CommentPragmas: +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: false +DerivePointerAlignment: false +# DisableFormat: +# ExperimentalAutoDetectBinPacking: +FixNamespaceComments: true +# ForEachMacros: +IncludeBlocks: Regroup +# IncludeCategories: +# IncludeIsMainRegex: +# IncludeIsMainSourceRegex: +# IndentCaseBlocks: false +IndentCaseLabels: false +# IndentExternBlock: NoIndent +IndentGotoLabels: false +IndentPPDirectives: None +IndentWidth: 4 +# IndentWrappedFunctionNames: 4 +# InsertTrailingCommas: None +# JavaImportGroups: +# JavaScriptQuotes +# JavaScriptWrapImports: +KeepEmptyLinesAtTheStartOfBlocks: false +Language: Cpp +# MacroBlockBegin: +# MacroBlockEnd: +MaxEmptyLinesToKeep: 1 NamespaceIndentation: None - -ObjCBinPackProtocolList: Auto -ObjCBlockIndentWidth: 4 -ObjCBreakBeforeNestedBlockParam: false -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true - -PenaltyBreakBeforeFirstCallParameter: 19 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 60 - -PointerAlignment: Left - -#RawStringFormats: - -ReflowComments: false - -SortIncludes: Never -SortUsingDeclarations: false - -SpaceAfterCStyleCast: false -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceAroundPointerQualifiers: Before -SpaceBeforeAssignmentOperators: true -SpaceBeforeCaseColon: false -SpaceBeforeCpp11BracedList: true -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceBeforeSquareBrackets: false -SpaceInEmptyBlock: false -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 4 -SpacesInAngles: false -SpacesInCStyleCastParentheses: false -SpacesInConditionalStatement: false -SpacesInContainerLiterals: true -SpacesInParentheses: false -SpacesInSquareBrackets: false - -Standard: c++11 - -TabWidth: 4 - -UseCRLF: false - -UseTab: Never - -# http://clang.llvm.org/docs/ClangFormatStyleOptions.html -# https://www.cnblogs.com/PaulpauL/p/5929753.html -# https://my.oschina.net/u/4393102/blog/3349736 +# NamespaceMacros: +# ObjCBinPackProtocolList: +# ObjCBlockIndentWidth: +# ObjCBreakBeforeNestedBlockParam: +# ObjCSpaceAfterProperty: +# ObjCSpaceBeforeProtocolList: +# PenaltyBreakAssignment: +# PenaltyBreakBeforeFirstCallParameter: +# PenaltyBreakComment: +# PenaltyBreakFirstLessLess: +# PenaltyBreakString: +# PenaltyBreakTemplateDeclaration: +# PenaltyExcessCharacter: +# PenaltyReturnTypeOnItsOwnLine: +PointerAlignment: Left +# RawStringFormats: +ReflowComments: false +SortIncludes: false +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInCStyleCastParentheses: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: c++03 +#StatementMacros: +TabWidth: 4 +# TypenameMacros: +UseCRLF: false +UseTab: Never diff --git a/.github/workflows/code-format.yml b/.github/workflows/code-format.yml new file mode 100644 index 000000000..60441b8da --- /dev/null +++ b/.github/workflows/code-format.yml @@ -0,0 +1,21 @@ +name: code-format + +on: [push, pull_request, pull_request_target] + +jobs: + code-format: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + + - name: apt + run: | + sudo apt-get update + sudo apt-get install -y dos2unix clang-format-12 + sudo update-alternatives --install /usr/bin/clang-format clang-format /usr/bin/clang-format-12 120 + - name: code-format + run: | + python scripts/clang-format-all.py + - uses: stefanzweifel/git-auto-commit-action@v4 + with: + commit_message: apply code-format changes diff --git a/benchmark/common/cmdline.hpp b/benchmark/common/cmdline.hpp index 5b88c778a..b26c944c3 100644 --- a/benchmark/common/cmdline.hpp +++ b/benchmark/common/cmdline.hpp @@ -43,778 +43,914 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace cmdline { - namespace detail { - - template - class lexical_cast_t { - public: - static Target cast(const Source &arg) { - Target ret; - std::stringstream ss; - if (!(ss << arg && ss >> ret && ss.eof())) - throw std::bad_cast(); - - return ret; - } - }; +namespace detail { - template - class lexical_cast_t { - public: - static Target cast(const Source &arg) { - return arg; - } - }; - - template - class lexical_cast_t { - public: - static std::string cast(const Source &arg) { - std::ostringstream ss; - ss << arg; - return ss.str(); - } - }; - - template - class lexical_cast_t { - public: - static Target cast(const std::string &arg) { - Target ret; - std::istringstream ss(arg); - if (!(ss >> ret && ss.eof())) - throw std::bad_cast(); - return ret; - } - }; +template +class lexical_cast_t +{ +public: + static Target cast(const Source& arg) + { + Target ret; + std::stringstream ss; + if (!(ss << arg && ss >> ret && ss.eof())) + throw std::bad_cast(); - template - struct is_same { - static const bool value = false; - }; + return ret; + } +}; - template - struct is_same { - static const bool value = true; - }; +template +class lexical_cast_t +{ +public: + static Target cast(const Source& arg) + { + return arg; + } +}; - template - Target lexical_cast(const Source &arg) - { - return lexical_cast_t::value>::cast(arg); - } +template +class lexical_cast_t +{ +public: + static std::string cast(const Source& arg) + { + std::ostringstream ss; + ss << arg; + return ss.str(); + } +}; - static inline std::string demangle(const std::string &name) - { +template +class lexical_cast_t +{ +public: + static Target cast(const std::string& arg) + { + Target ret; + std::istringstream ss(arg); + if (!(ss >> ret && ss.eof())) + throw std::bad_cast(); + return ret; + } +}; + +template +struct is_same +{ + static const bool value = false; +}; + +template +struct is_same +{ + static const bool value = true; +}; + +template +Target lexical_cast(const Source& arg) +{ + return lexical_cast_t::value>::cast(arg); +} + +static inline std::string demangle(const std::string& name) +{ #ifdef _MSC_VER - return name; // MSVC return name -#elif defined(__GNUC__) - // call the original methods when compiler is GCC - int status = 0; - char *p = abi::__cxa_demangle(name.c_str(), 0, 0, &status); - std::string ret(p); - free(p); - return ret; + return name; // MSVC return name +#elif defined(__GNUC__) + // call the original methods when compiler is GCC + int status = 0; + char* p = abi::__cxa_demangle(name.c_str(), 0, 0, &status); + std::string ret(p); + free(p); + return ret; #else - // other compiler need more work + // other compiler need more work #error unexpected c complier (msc/gcc), Need to implement this method for demangle #endif - } - - template - std::string readable_typename() - { - return demangle(typeid(T).name()); - } - - template - std::string default_value(T def) - { - return detail::lexical_cast(def); - } - - template <> - inline std::string readable_typename() - { - return "string"; - } - - } // detail - - //----- - - class cmdline_error : public std::exception { - public: - cmdline_error(const std::string &msg) : msg(msg) {} - ~cmdline_error() throw() {} - const char *what() const throw() { return msg.c_str(); } - private: - std::string msg; - }; - - template - struct default_reader { - T operator()(const std::string &str) { - return detail::lexical_cast(str); - } - }; - - template - struct range_reader { - range_reader(const T &low, const T &high) : low(low), high(high) {} - T operator()(const std::string &s) const { - T ret = default_reader()(s); - if (!(ret >= low && ret <= high)) throw cmdline::cmdline_error("range_error"); - return ret; - } - private: - T low, high; - }; - - template - range_reader range(const T &low, const T &high) +} + +template +std::string readable_typename() +{ + return demangle(typeid(T).name()); +} + +template +std::string default_value(T def) +{ + return detail::lexical_cast(def); +} + +template<> +inline std::string readable_typename() +{ + return "string"; +} + +} // namespace detail + +//----- + +class cmdline_error : public std::exception +{ +public: + cmdline_error(const std::string& msg) + : msg(msg) + { + } + ~cmdline_error() throw() + { + } + const char* what() const throw() { - return range_reader(low, high); + return msg.c_str(); } - template - struct oneof_reader { - T operator()(const std::string &s) { - T ret = default_reader()(s); - if (std::find(alt.begin(), alt.end(), ret) == alt.end()) - throw cmdline_error(""); - return ret; - } - void add(const T &v) { alt.push_back(v); } - private: - std::vector alt; - }; +private: + std::string msg; +}; - template - oneof_reader oneof(T a1) +template +struct default_reader +{ + T operator()(const std::string& str) { - oneof_reader ret; - ret.add(a1); - return ret; + return detail::lexical_cast(str); } +}; - template - oneof_reader oneof(T a1, T a2) +template +struct range_reader +{ + range_reader(const T& low, const T& high) + : low(low), high(high) { - oneof_reader ret; - ret.add(a1); - ret.add(a2); - return ret; } - - template - oneof_reader oneof(T a1, T a2, T a3) + T operator()(const std::string& s) const { - oneof_reader ret; - ret.add(a1); - ret.add(a2); - ret.add(a3); + T ret = default_reader()(s); + if (!(ret >= low && ret <= high)) throw cmdline::cmdline_error("range_error"); return ret; } - template - oneof_reader oneof(T a1, T a2, T a3, T a4) +private: + T low, high; +}; + +template +range_reader range(const T& low, const T& high) +{ + return range_reader(low, high); +} + +template +struct oneof_reader +{ + T operator()(const std::string& s) { - oneof_reader ret; - ret.add(a1); - ret.add(a2); - ret.add(a3); - ret.add(a4); + T ret = default_reader()(s); + if (std::find(alt.begin(), alt.end(), ret) == alt.end()) + throw cmdline_error(""); return ret; } - - template - oneof_reader oneof(T a1, T a2, T a3, T a4, T a5) + void add(const T& v) { - oneof_reader ret; - ret.add(a1); - ret.add(a2); - ret.add(a3); - ret.add(a4); - ret.add(a5); - return ret; + alt.push_back(v); } - template - oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6) +private: + std::vector alt; +}; + +template +oneof_reader oneof(T a1) +{ + oneof_reader ret; + ret.add(a1); + return ret; +} + +template +oneof_reader oneof(T a1, T a2) +{ + oneof_reader ret; + ret.add(a1); + ret.add(a2); + return ret; +} + +template +oneof_reader oneof(T a1, T a2, T a3) +{ + oneof_reader ret; + ret.add(a1); + ret.add(a2); + ret.add(a3); + return ret; +} + +template +oneof_reader oneof(T a1, T a2, T a3, T a4) +{ + oneof_reader ret; + ret.add(a1); + ret.add(a2); + ret.add(a3); + ret.add(a4); + return ret; +} + +template +oneof_reader oneof(T a1, T a2, T a3, T a4, T a5) +{ + oneof_reader ret; + ret.add(a1); + ret.add(a2); + ret.add(a3); + ret.add(a4); + ret.add(a5); + return ret; +} + +template +oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6) +{ + oneof_reader ret; + ret.add(a1); + ret.add(a2); + ret.add(a3); + ret.add(a4); + ret.add(a5); + ret.add(a6); + return ret; +} + +template +oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7) +{ + oneof_reader ret; + ret.add(a1); + ret.add(a2); + ret.add(a3); + ret.add(a4); + ret.add(a5); + ret.add(a6); + ret.add(a7); + return ret; +} + +template +oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8) +{ + oneof_reader ret; + ret.add(a1); + ret.add(a2); + ret.add(a3); + ret.add(a4); + ret.add(a5); + ret.add(a6); + ret.add(a7); + ret.add(a8); + return ret; +} + +template +oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9) +{ + oneof_reader ret; + ret.add(a1); + ret.add(a2); + ret.add(a3); + ret.add(a4); + ret.add(a5); + ret.add(a6); + ret.add(a7); + ret.add(a8); + ret.add(a9); + return ret; +} + +template +oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9, T a10) +{ + oneof_reader ret; + ret.add(a1); + ret.add(a2); + ret.add(a3); + ret.add(a4); + ret.add(a5); + ret.add(a6); + ret.add(a7); + ret.add(a8); + ret.add(a9); + ret.add(a10); + return ret; +} + +//----- + +class parser +{ +public: + parser() { - oneof_reader ret; - ret.add(a1); - ret.add(a2); - ret.add(a3); - ret.add(a4); - ret.add(a5); - ret.add(a6); - return ret; + } + ~parser() + { + for (std::map::iterator p = options.begin(); + p != options.end(); p++) + delete p->second; } - template - oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7) + void add(const std::string& name, + char short_name = 0, + const std::string& desc = "") { - oneof_reader ret; - ret.add(a1); - ret.add(a2); - ret.add(a3); - ret.add(a4); - ret.add(a5); - ret.add(a6); - ret.add(a7); - return ret; + if (options.count(name)) throw cmdline_error("multiple definition: " + name); + options[name] = new option_without_value(name, short_name, desc); + ordered.push_back(options[name]); } - template - oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8) + template + void add(const std::string& name, + char short_name = 0, + const std::string& desc = "", + bool need = true, + const T def = T()) { - oneof_reader ret; - ret.add(a1); - ret.add(a2); - ret.add(a3); - ret.add(a4); - ret.add(a5); - ret.add(a6); - ret.add(a7); - ret.add(a8); - return ret; + add(name, short_name, desc, need, def, default_reader()); } - template - oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9) + template + void add(const std::string& name, + char short_name = 0, + const std::string& desc = "", + bool need = true, + const T def = T(), + F reader = F()) { - oneof_reader ret; - ret.add(a1); - ret.add(a2); - ret.add(a3); - ret.add(a4); - ret.add(a5); - ret.add(a6); - ret.add(a7); - ret.add(a8); - ret.add(a9); - return ret; + if (options.count(name)) throw cmdline_error("multiple definition: " + name); + options[name] = new option_with_value_with_reader(name, short_name, need, def, desc, reader); + ordered.push_back(options[name]); } - template - oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9, T a10) - { - oneof_reader ret; - ret.add(a1); - ret.add(a2); - ret.add(a3); - ret.add(a4); - ret.add(a5); - ret.add(a6); - ret.add(a7); - ret.add(a8); - ret.add(a9); - ret.add(a10); - return ret; + void footer(const std::string& f) + { + ftr = f; } - //----- + void set_program_name(const std::string& name) + { + prog_name = name; + } - class parser { - public: - parser() { - } - ~parser() { - for (std::map::iterator p = options.begin(); - p != options.end(); p++) - delete p->second; - } + bool exist(const std::string& name) const + { + if (options.count(name) == 0) throw cmdline_error("there is no flag: --" + name); + return options.find(name)->second->has_set(); + } - void add(const std::string &name, - char short_name = 0, - const std::string &desc = "") { - if (options.count(name)) throw cmdline_error("multiple definition: " + name); - options[name] = new option_without_value(name, short_name, desc); - ordered.push_back(options[name]); - } + template + const T& get(const std::string& name) const + { + if (options.count(name) == 0) throw cmdline_error("there is no flag: --" + name); + const option_with_value* p = dynamic_cast*>(options.find(name)->second); + if (p == NULL) throw cmdline_error("type mismatch flag '" + name + "'"); + return p->get(); + } - template - void add(const std::string &name, - char short_name = 0, - const std::string &desc = "", - bool need = true, - const T def = T()) { - add(name, short_name, desc, need, def, default_reader()); - } + const std::vector& rest() const + { + return others; + } - template - void add(const std::string &name, - char short_name = 0, - const std::string &desc = "", - bool need = true, - const T def = T(), - F reader = F()) { - if (options.count(name)) throw cmdline_error("multiple definition: " + name); - options[name] = new option_with_value_with_reader(name, short_name, need, def, desc, reader); - ordered.push_back(options[name]); - } + bool parse(const std::string& arg) + { + std::vector args; - void footer(const std::string &f) { - ftr = f; - } + std::string buf; + bool in_quote = false; + for (std::string::size_type i = 0; i < arg.length(); i++) + { + if (arg[i] == '\"') + { + in_quote = !in_quote; + continue; + } - void set_program_name(const std::string &name) { - prog_name = name; - } + if (arg[i] == ' ' && !in_quote) + { + args.push_back(buf); + buf = ""; + continue; + } - bool exist(const std::string &name) const { - if (options.count(name) == 0) throw cmdline_error("there is no flag: --" + name); - return options.find(name)->second->has_set(); - } + if (arg[i] == '\\') + { + i++; + if (i >= arg.length()) + { + errors.push_back("unexpected occurrence of '\\' at end of string"); + return false; + } + } - template - const T &get(const std::string &name) const { - if (options.count(name) == 0) throw cmdline_error("there is no flag: --" + name); - const option_with_value *p = dynamic_cast*>(options.find(name)->second); - if (p == NULL) throw cmdline_error("type mismatch flag '" + name + "'"); - return p->get(); + buf += arg[i]; } - const std::vector &rest() const { - return others; + if (in_quote) + { + errors.push_back("quote is not closed"); + return false; } - bool parse(const std::string &arg) { - std::vector args; + if (buf.length() > 0) + args.push_back(buf); - std::string buf; - bool in_quote = false; - for (std::string::size_type i = 0; i < arg.length(); i++) { - if (arg[i] == '\"') { - in_quote = !in_quote; - continue; - } + for (size_t i = 0; i < args.size(); i++) + std::cout << "\"" << args[i] << "\"" << std::endl; - if (arg[i] == ' ' && !in_quote) { - args.push_back(buf); - buf = ""; - continue; - } - - if (arg[i] == '\\') { - i++; - if (i >= arg.length()) { - errors.push_back("unexpected occurrence of '\\' at end of string"); - return false; - } - } - - buf += arg[i]; - } - - if (in_quote) { - errors.push_back("quote is not closed"); - return false; - } - - if (buf.length() > 0) - args.push_back(buf); + return parse(args); + } - for (size_t i = 0; i < args.size(); i++) - std::cout << "\"" << args[i] << "\"" << std::endl; + bool parse(const std::vector& args) + { + int argc = static_cast(args.size()); + std::vector argv(argc); - return parse(args); - } + for (int i = 0; i < argc; i++) + argv[i] = args[i].c_str(); - bool parse(const std::vector &args) { - int argc = static_cast(args.size()); - std::vector argv(argc); + return parse(argc, &argv[0]); + } - for (int i = 0; i < argc; i++) - argv[i] = args[i].c_str(); + bool parse(int argc, const char* const argv[]) + { + errors.clear(); + others.clear(); - return parse(argc, &argv[0]); + if (argc < 1) + { + errors.push_back("argument number must be longer than 0"); + return false; } + if (prog_name == "") + prog_name = argv[0]; - bool parse(int argc, const char * const argv[]) { - errors.clear(); - others.clear(); - - if (argc < 1) { - errors.push_back("argument number must be longer than 0"); - return false; - } - if (prog_name == "") - prog_name = argv[0]; - - std::map lookup; - for (std::map::iterator p = options.begin(); - p != options.end(); p++) { - if (p->first.length() == 0) continue; - char initial = p->second->short_name(); - if (initial) { - if (lookup.count(initial) > 0) { - lookup[initial] = ""; - errors.push_back(std::string("short option '") + initial + "' is ambiguous"); - return false; - } - else lookup[initial] = p->first; + std::map lookup; + for (std::map::iterator p = options.begin(); + p != options.end(); p++) + { + if (p->first.length() == 0) continue; + char initial = p->second->short_name(); + if (initial) + { + if (lookup.count(initial) > 0) + { + lookup[initial] = ""; + errors.push_back(std::string("short option '") + initial + "' is ambiguous"); + return false; } + else + lookup[initial] = p->first; } + } - for (int i = 1; i < argc; i++) { - if (strncmp(argv[i], "--", 2) == 0) { - const char *p = strchr(argv[i] + 2, '='); - if (p) { - std::string name(argv[i] + 2, p); - std::string val(p + 1); - set_option(name, val); + for (int i = 1; i < argc; i++) + { + if (strncmp(argv[i], "--", 2) == 0) + { + const char* p = strchr(argv[i] + 2, '='); + if (p) + { + std::string name(argv[i] + 2, p); + std::string val(p + 1); + set_option(name, val); + } + else + { + std::string name(argv[i] + 2); + if (options.count(name) == 0) + { + errors.push_back("undefined option: --" + name); + continue; } - else { - std::string name(argv[i] + 2); - if (options.count(name) == 0) { - errors.push_back("undefined option: --" + name); + if (options[name]->has_value()) + { + if (i + 1 >= argc) + { + errors.push_back("option needs value: --" + name); continue; } - if (options[name]->has_value()) { - if (i + 1 >= argc) { - errors.push_back("option needs value: --" + name); - continue; - } - else { - i++; - set_option(name, argv[i]); - } - } - else { - set_option(name); + else + { + i++; + set_option(name, argv[i]); } } - } - else if (strncmp(argv[i], "-", 1) == 0) { - if (!argv[i][1]) continue; - char last = argv[i][1]; - for (int j = 2; argv[i][j]; j++) { - last = argv[i][j]; - if (lookup.count(argv[i][j - 1]) == 0) { - errors.push_back(std::string("undefined short option: -") + argv[i][j - 1]); - continue; - } - if (lookup[argv[i][j - 1]] == "") { - errors.push_back(std::string("ambiguous short option: -") + argv[i][j - 1]); - continue; - } - set_option(lookup[argv[i][j - 1]]); + else + { + set_option(name); } - - if (lookup.count(last) == 0) { - errors.push_back(std::string("undefined short option: -") + last); + } + } + else if (strncmp(argv[i], "-", 1) == 0) + { + if (!argv[i][1]) continue; + char last = argv[i][1]; + for (int j = 2; argv[i][j]; j++) + { + last = argv[i][j]; + if (lookup.count(argv[i][j - 1]) == 0) + { + errors.push_back(std::string("undefined short option: -") + argv[i][j - 1]); continue; } - if (lookup[last] == "") { - errors.push_back(std::string("ambiguous short option: -") + last); + if (lookup[argv[i][j - 1]] == "") + { + errors.push_back(std::string("ambiguous short option: -") + argv[i][j - 1]); continue; } + set_option(lookup[argv[i][j - 1]]); + } - if (i + 1 < argc && options[lookup[last]]->has_value()) { - set_option(lookup[last], argv[i + 1]); - i++; - } - else { - set_option(lookup[last]); - } + if (lookup.count(last) == 0) + { + errors.push_back(std::string("undefined short option: -") + last); + continue; + } + if (lookup[last] == "") + { + errors.push_back(std::string("ambiguous short option: -") + last); + continue; + } + + if (i + 1 < argc && options[lookup[last]]->has_value()) + { + set_option(lookup[last], argv[i + 1]); + i++; } - else { - others.push_back(argv[i]); + else + { + set_option(lookup[last]); } } + else + { + others.push_back(argv[i]); + } + } + + for (std::map::iterator p = options.begin(); + p != options.end(); p++) + if (!p->second->valid()) + errors.push_back("need option: --" + std::string(p->first)); + + return errors.size() == 0; + } + + void parse_check(const std::string& arg) + { + if (!options.count("help")) + add("help", '?', "print this message"); + check(0, parse(arg)); + } + + void parse_check(const std::vector& args) + { + if (!options.count("help")) + add("help", '?', "print this message"); + check((int)(args.size()), parse(args)); + } + + void parse_check(int argc, char* argv[]) + { + if (!options.count("help")) + add("help", '?', "print this message"); + check(argc, parse(argc, argv)); + } - for (std::map::iterator p = options.begin(); - p != options.end(); p++) - if (!p->second->valid()) - errors.push_back("need option: --" + std::string(p->first)); + std::string error() const + { + return errors.size() > 0 ? errors[0] : ""; + } + + std::string error_full() const + { + std::ostringstream oss; + for (size_t i = 0; i < errors.size(); i++) + oss << errors[i] << std::endl; + return oss.str(); + } - return errors.size() == 0; + std::string usage() const + { + std::ostringstream oss; + oss << "usage: " << prog_name << " "; + for (size_t i = 0; i < ordered.size(); i++) + { + if (ordered[i]->must()) + oss << ordered[i]->short_description() << " "; } - void parse_check(const std::string &arg) { - if (!options.count("help")) - add("help", '?', "print this message"); - check(0, parse(arg)); + oss << "[options] ... " << ftr << std::endl; + oss << "options:" << std::endl; + + size_t max_width = 0; + for (size_t i = 0; i < ordered.size(); i++) + { + max_width = std::max(max_width, ordered[i]->name().length()); } + for (size_t i = 0; i < ordered.size(); i++) + { + if (ordered[i]->short_name()) + { + oss << " -" << ordered[i]->short_name() << ", "; + } + else + { + oss << " "; + } - void parse_check(const std::vector &args) { - if (!options.count("help")) - add("help", '?', "print this message"); - check((int)(args.size()), parse(args)); + oss << "--" << ordered[i]->name(); + for (size_t j = ordered[i]->name().length(); j < max_width + 4; j++) + oss << ' '; + oss << ordered[i]->description() << std::endl; } + return oss.str(); + } - void parse_check(int argc, char *argv[]) { - if (!options.count("help")) - add("help", '?', "print this message"); - check(argc, parse(argc, argv)); +private: + void check(int argc, bool ok) + { + if ((argc == 1 && !ok) || exist("help")) + { + std::cerr << usage(); + exit(0); } - std::string error() const { - return errors.size() > 0 ? errors[0] : ""; + if (!ok) + { + std::cerr << error() << std::endl + << usage(); + exit(1); } + } - std::string error_full() const { - std::ostringstream oss; - for (size_t i = 0; i < errors.size(); i++) - oss << errors[i] << std::endl; - return oss.str(); + void set_option(const std::string& name) + { + if (options.count(name) == 0) + { + errors.push_back("undefined option: --" + name); + return; } + if (!options[name]->set()) + { + errors.push_back("option needs value: --" + name); + return; + } + } - std::string usage() const { - std::ostringstream oss; - oss << "usage: " << prog_name << " "; - for (size_t i = 0; i < ordered.size(); i++) { - if (ordered[i]->must()) - oss << ordered[i]->short_description() << " "; - } + void set_option(const std::string& name, const std::string& value) + { + if (options.count(name) == 0) + { + errors.push_back("undefined option: --" + name); + return; + } + if (!options[name]->set(value)) + { + errors.push_back("option value is invalid: --" + name + "=" + value); + return; + } + } - oss << "[options] ... " << ftr << std::endl; - oss << "options:" << std::endl; + class option_base + { + public: + virtual ~option_base() + { + } - size_t max_width = 0; - for (size_t i = 0; i < ordered.size(); i++) { - max_width = std::max(max_width, ordered[i]->name().length()); - } - for (size_t i = 0; i < ordered.size(); i++) { - if (ordered[i]->short_name()) { - oss << " -" << ordered[i]->short_name() << ", "; - } - else { - oss << " "; - } + virtual bool has_value() const = 0; + virtual bool set() = 0; + virtual bool set(const std::string& value) = 0; + virtual bool has_set() const = 0; + virtual bool valid() const = 0; + virtual bool must() const = 0; + + virtual const std::string& name() const = 0; + virtual char short_name() const = 0; + virtual const std::string& description() const = 0; + virtual std::string short_description() const = 0; + }; - oss << "--" << ordered[i]->name(); - for (size_t j = ordered[i]->name().length(); j < max_width + 4; j++) - oss << ' '; - oss << ordered[i]->description() << std::endl; - } - return oss.str(); + class option_without_value : public option_base + { + public: + option_without_value(const std::string& name, + char short_name, + const std::string& desc) + : nam(name), snam(short_name), desc(desc), has(false) + { + } + ~option_without_value() + { } - private: - - void check(int argc, bool ok) { - if ((argc == 1 && !ok) || exist("help")) { - std::cerr << usage(); - exit(0); - } + bool has_value() const + { + return false; + } - if (!ok) { - std::cerr << error() << std::endl << usage(); - exit(1); - } + bool set() + { + has = true; + return true; } - void set_option(const std::string &name) { - if (options.count(name) == 0) { - errors.push_back("undefined option: --" + name); - return; - } - if (!options[name]->set()) { - errors.push_back("option needs value: --" + name); - return; - } + bool set(const std::string&) + { + return false; } - void set_option(const std::string &name, const std::string &value) { - if (options.count(name) == 0) { - errors.push_back("undefined option: --" + name); - return; - } - if (!options[name]->set(value)) { - errors.push_back("option value is invalid: --" + name + "=" + value); - return; - } + bool has_set() const + { + return has; } - class option_base { - public: - virtual ~option_base() {} - - virtual bool has_value() const = 0; - virtual bool set() = 0; - virtual bool set(const std::string &value) = 0; - virtual bool has_set() const = 0; - virtual bool valid() const = 0; - virtual bool must() const = 0; - - virtual const std::string &name() const = 0; - virtual char short_name() const = 0; - virtual const std::string &description() const = 0; - virtual std::string short_description() const = 0; - }; - - class option_without_value : public option_base { - public: - option_without_value(const std::string &name, - char short_name, - const std::string &desc) - :nam(name), snam(short_name), desc(desc), has(false) { - } - ~option_without_value() {} + bool valid() const + { + return true; + } - bool has_value() const { return false; } + bool must() const + { + return false; + } - bool set() { - has = true; - return true; - } + const std::string& name() const + { + return nam; + } - bool set(const std::string &) { - return false; - } + char short_name() const + { + return snam; + } - bool has_set() const { - return has; - } + const std::string& description() const + { + return desc; + } - bool valid() const { - return true; - } + std::string short_description() const + { + return "--" + nam; + } - bool must() const { - return false; - } + private: + std::string nam; + char snam; + std::string desc; + bool has; + }; - const std::string &name() const { - return nam; - } + template + class option_with_value : public option_base + { + public: + option_with_value(const std::string& name, + char short_name, + bool need, + const T& def, + const std::string& desc) + : nam(name), snam(short_name), need(need), has(false), def(def), actual(def) + { + this->desc = full_description(desc); + } + ~option_with_value() + { + } - char short_name() const { - return snam; - } + const T& get() const + { + return actual; + } - const std::string &description() const { - return desc; - } + bool has_value() const + { + return true; + } - std::string short_description() const { - return "--" + nam; - } + bool set() + { + return false; + } - private: - std::string nam; - char snam; - std::string desc; - bool has; - }; - - template - class option_with_value : public option_base { - public: - option_with_value(const std::string &name, - char short_name, - bool need, - const T &def, - const std::string &desc) - : nam(name), snam(short_name), need(need), has(false) - , def(def), actual(def) { - this->desc = full_description(desc); + bool set(const std::string& value) + { + try + { + actual = read(value); + has = true; } - ~option_with_value() {} - - const T &get() const { - return actual; + catch (const std::exception& e) + { + (void)e; + return false; } + return true; + } - bool has_value() const { return true; } + bool has_set() const + { + return has; + } - bool set() { - return false; - } + bool valid() const + { + if (need && !has) return false; + return true; + } - bool set(const std::string &value) { - try { - actual = read(value); - has = true; - } - catch (const std::exception &e) { - (void)e; - return false; - } - return true; - } + bool must() const + { + return need; + } - bool has_set() const { - return has; - } + const std::string& name() const + { + return nam; + } - bool valid() const { - if (need && !has) return false; - return true; - } + char short_name() const + { + return snam; + } - bool must() const { - return need; - } + const std::string& description() const + { + return desc; + } - const std::string &name() const { - return nam; - } + std::string short_description() const + { + return "--" + nam + "=" + detail::readable_typename(); + } - char short_name() const { - return snam; - } + protected: + std::string full_description(const std::string& desc_str) + { + return desc_str + " (" + detail::readable_typename() + (need ? "" : " [=" + detail::default_value(def) + "]") + + ")"; + } - const std::string &description() const { - return desc; - } + virtual T read(const std::string& s) = 0; - std::string short_description() const { - return "--" + nam + "=" + detail::readable_typename(); - } + std::string nam; + char snam; + bool need; + std::string desc; - protected: - std::string full_description(const std::string& desc_str) { - return - desc_str + " (" + detail::readable_typename() + - (need ? "" : " [=" + detail::default_value(def) + "]") - + ")"; - } + bool has; + T def; + T actual; + }; - virtual T read(const std::string &s) = 0; - - std::string nam; - char snam; - bool need; - std::string desc; - - bool has; - T def; - T actual; - }; - - template - class option_with_value_with_reader : public option_with_value { - public: - option_with_value_with_reader(const std::string &name, - char short_name, - bool need, - const T def, - const std::string &desc, - F reader) - : option_with_value(name, short_name, need, def, desc), reader(reader) { - } + template + class option_with_value_with_reader : public option_with_value + { + public: + option_with_value_with_reader(const std::string& name, + char short_name, + bool need, + const T def, + const std::string& desc, + F reader) + : option_with_value(name, short_name, need, def, desc), reader(reader) + { + } - private: - T read(const std::string &s) { - return reader(s); - } + private: + T read(const std::string& s) + { + return reader(s); + } - F reader; - }; + F reader; + }; - std::map options; - std::vector ordered; - std::string ftr; + std::map options; + std::vector ordered; + std::string ftr; - std::string prog_name; - std::vector others; + std::string prog_name; + std::vector others; - std::vector errors; - }; + std::vector errors; +}; -} // cmdline +} // namespace cmdline diff --git a/examples/common/common.h b/examples/common/common.h index 40a263aba..9ab861855 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -42,9 +42,9 @@ #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN #include -#else // _WIN32 +#else // _WIN32 #include -#endif // _WIN32 +#endif // _WIN32 #ifdef _WIN32 static double get_current_time() @@ -56,7 +56,7 @@ static double get_current_time() return pc.QuadPart * 1000.0 / freq.QuadPart; } -#else // _WIN32 +#else // _WIN32 static double get_current_time() { @@ -65,7 +65,7 @@ static double get_current_time() return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0; } -#endif // _WIN32 +#endif // _WIN32 static void split(float* array, char* str, const char* del) { @@ -78,4 +78,4 @@ static void split(float* array, char* str, const char* del) } } -#endif // __COMMON_H__ +#endif // __COMMON_H__ diff --git a/examples/common/compiler_fp16.h b/examples/common/compiler_fp16.h index 1857d7eec..d770707c2 100644 --- a/examples/common/compiler_fp16.h +++ b/examples/common/compiler_fp16.h @@ -48,7 +48,7 @@ extern "C" { #else #ifdef _MSC_VER -#pragma pack (push,1) +#pragma pack(push, 1) struct fp16_pack { unsigned short frac : 10; @@ -84,12 +84,12 @@ typedef struct fp16_pack __fp16; static inline float fp16_to_fp32(__fp16 data) { float f; - struct fp32_pack* fp32 = ( struct fp32_pack* )&f; + struct fp32_pack* fp32 = (struct fp32_pack*)&f; struct fp16_pack* fp16 = &data; int exp = fp16->exp; - if(exp == 31 && fp16->frac != 0) + if (exp == 31 && fp16->frac != 0) { // return __builtin_inf()-__builtin_inf(); fp32->sign = fp16->sign; @@ -99,28 +99,28 @@ static inline float fp16_to_fp32(__fp16 data) return f; } - if(exp == 31) + if (exp == 31) exp = 255; - if(exp == 0) + if (exp == 0) exp = 0; else exp = (exp - 15) + 127; fp32->exp = exp; fp32->sign = fp16->sign; - fp32->frac = (( int )fp16->frac) << 13; + fp32->frac = ((int)fp16->frac) << 13; return f; } static inline __fp16 fp32_to_fp16(float data) { - struct fp32_pack* fp32 = ( struct fp32_pack* )&data; + struct fp32_pack* fp32 = (struct fp32_pack*)&data; struct fp16_pack fp16; int exp = fp32->exp; - if(fp32->exp == 255 && fp32->frac != 0) + if (fp32->exp == 255 && fp32->frac != 0) { // NaN fp16.exp = 31; @@ -130,9 +130,9 @@ static inline __fp16 fp32_to_fp16(float data) return fp16; } - if((exp - 127) < -14) + if ((exp - 127) < -14) exp = 0; - else if((exp - 127) > 15) + else if ((exp - 127) > 15) exp = 31; else exp = exp - 127 + 15; diff --git a/examples/common/msc_getopt.h b/examples/common/msc_getopt.h index 0cb88895d..caafad5b2 100644 --- a/examples/common/msc_getopt.h +++ b/examples/common/msc_getopt.h @@ -8,7 +8,7 @@ * IMPLIED ARE HEREBY DISCLAIMED. This includes but is not limited to * warranties of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ - /* +/* * Copyright (c) 2002 Todd C. Miller * * Permission to use, copy, modify, and distribute this software for any @@ -56,7 +56,7 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#pragma warning(disable:4996); +#pragma warning(disable : 4996); #define __GETOPT_H__ @@ -73,16 +73,16 @@ extern "C" { #endif -#define REPLACE_GETOPT /* use this getopt as the system getopt(3) */ +#define REPLACE_GETOPT /* use this getopt as the system getopt(3) */ #ifdef REPLACE_GETOPT -int opterr = 1; /* if error message should be printed */ -int optind = 1; /* index into parent argv vector */ -int optopt = '?'; /* character checked for validity */ -#undef optreset /* see getopt.h */ -#define optreset __mingw_optreset -int optreset; /* reset getopt */ -char *optarg; /* argument associated with option */ +int opterr = 1; /* if error message should be printed */ +int optind = 1; /* index into parent argv vector */ +int optopt = '?'; /* character checked for validity */ +#undef optreset /* see getopt.h */ +#define optreset __mingw_optreset +int optreset; /* reset getopt */ +char* optarg; /* argument associated with option */ #endif //extern int optind; /* index of first non-option in argv */ @@ -92,37 +92,37 @@ char *optarg; /* argument associated with option */ // //extern char *optarg; /* pointer to argument of current option */ -#define PRINT_ERROR ((opterr) && (*options != ':')) +#define PRINT_ERROR ((opterr) && (*options != ':')) -#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */ -#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */ -#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */ +#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */ +#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */ +#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */ /* return values */ -#define BADCH (int)'?' -#define BADARG ((*options == ':') ? (int)':' : (int)'?') -#define INORDER (int)1 +#define BADCH (int)'?' +#define BADARG ((*options == ':') ? (int)':' : (int)'?') +#define INORDER (int)1 #ifndef __CYGWIN__ #define __progname __argv[0] #else -extern char __declspec(dllimport) *__progname; +extern char __declspec(dllimport) * __progname; #endif #ifdef __CYGWIN__ static char EMSG[] = ""; #else -#define EMSG "" +#define EMSG "" #endif -static int getopt_internal(int, char * const *, const char *, - const struct option *, int *, int); -static int parse_long_options(char * const *, const char *, - const struct option *, int *, int); +static int getopt_internal(int, char* const*, const char*, + const struct option*, int*, int); +static int parse_long_options(char* const*, const char*, + const struct option*, int*, int); static int gcd(int, int); -static void permute_args(int, int, int, char * const *); +static void permute_args(int, int, int, char* const*); -static char *place = EMSG; /* option letter processing */ +static char* place = EMSG; /* option letter processing */ /* XXX: set optreset to 1 rather than these two */ static int nonopt_start = -1; /* first non option argument (for permute) */ @@ -137,21 +137,21 @@ static const char illoptchar[] = "unknown option -- %c"; static const char illoptstring[] = "unknown option -- %s"; static void -_vwarnx(const char *fmt,va_list ap) +_vwarnx(const char* fmt, va_list ap) { - (void)fprintf(stderr,"%s: ",__progname); - if (fmt != NULL) - (void)vfprintf(stderr,fmt,ap); - (void)fprintf(stderr,"\n"); + (void)fprintf(stderr, "%s: ", __progname); + if (fmt != NULL) + (void)vfprintf(stderr, fmt, ap); + (void)fprintf(stderr, "\n"); } static void -warnx(const char *fmt,...) +warnx(const char* fmt, ...) { - va_list ap; - va_start(ap,fmt); - _vwarnx(fmt,ap); - va_end(ap); + va_list ap; + va_start(ap, fmt); + _vwarnx(fmt, ap); + va_end(ap); } /* @@ -160,16 +160,17 @@ warnx(const char *fmt,...) static int gcd(int a, int b) { - int c; + int c; - c = a % b; - while (c != 0) { - a = b; - b = c; - c = a % b; - } + c = a % b; + while (c != 0) + { + a = b; + b = c; + c = a % b; + } - return (b); + return (b); } /* @@ -179,34 +180,36 @@ gcd(int a, int b) */ static void permute_args(int panonopt_start, int panonopt_end, int opt_end, - char * const *nargv) + char* const* nargv) { - int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; - char *swap; + int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; + char* swap; - /* + /* * compute lengths of blocks and number and size of cycles */ - nnonopts = panonopt_end - panonopt_start; - nopts = opt_end - panonopt_end; - ncycle = gcd(nnonopts, nopts); - cyclelen = (opt_end - panonopt_start) / ncycle; - - for (i = 0; i < ncycle; i++) { - cstart = panonopt_end+i; - pos = cstart; - for (j = 0; j < cyclelen; j++) { - if (pos >= panonopt_end) - pos -= nnonopts; - else - pos += nopts; - swap = nargv[pos]; - /* LINTED const cast */ - ((char **) nargv)[pos] = nargv[cstart]; - /* LINTED const cast */ - ((char **)nargv)[cstart] = swap; - } - } + nnonopts = panonopt_end - panonopt_start; + nopts = opt_end - panonopt_end; + ncycle = gcd(nnonopts, nopts); + cyclelen = (opt_end - panonopt_start) / ncycle; + + for (i = 0; i < ncycle; i++) + { + cstart = panonopt_end + i; + pos = cstart; + for (j = 0; j < cyclelen; j++) + { + if (pos >= panonopt_end) + pos -= nnonopts; + else + pos += nopts; + swap = nargv[pos]; + /* LINTED const cast */ + ((char**)nargv)[pos] = nargv[cstart]; + /* LINTED const cast */ + ((char**)nargv)[cstart] = swap; + } + } } #ifdef REPLACE_GETOPT @@ -216,11 +219,9 @@ permute_args(int panonopt_start, int panonopt_end, int opt_end, * * [eventually this will replace the BSD getopt] */ -int -getopt(int nargc, char * const *nargv, const char *options) +int getopt(int nargc, char* const* nargv, const char* options) { - - /* + /* * We don't pass FLAG_PERMUTE to getopt_internal() since * the BSD getopt(3) (unlike GNU) has never done this. * @@ -228,7 +229,7 @@ getopt(int nargc, char * const *nargv, const char *options) * before dropping privileges it makes sense to keep things * as simple (and bug-free) as possible. */ - return (getopt_internal(nargc, nargv, options, NULL, NULL, 0)); + return (getopt_internal(nargc, nargv, options, NULL, NULL, 0)); } #endif /* REPLACE_GETOPT */ @@ -241,7 +242,7 @@ getopt(int nargc, char * const *nargv, const char *options) * proclaim their BSD heritage, before including this header; however, * to maintain portability, developers are advised to avoid it. */ -# define optreset __mingw_optreset +#define optreset __mingw_optreset extern int optreset; #endif #ifdef __cplusplus @@ -265,19 +266,19 @@ extern int optreset; extern "C" { #endif -struct option /* specification for a long form option... */ +struct option /* specification for a long form option... */ { - const char *name; /* option name, without leading hyphens */ - int has_arg; /* does it take an argument? */ - int *flag; /* where to save its status, or NULL */ - int val; /* its associated status value */ + const char* name; /* option name, without leading hyphens */ + int has_arg; /* does it take an argument? */ + int* flag; /* where to save its status, or NULL */ + int val; /* its associated status value */ }; -enum /* permitted values for its `has_arg' field... */ +enum /* permitted values for its `has_arg' field... */ { - no_argument = 0, /* option never takes an argument */ - required_argument, /* option always requires an argument */ - optional_argument /* option may take an argument */ + no_argument = 0, /* option never takes an argument */ + required_argument, /* option always requires an argument */ + optional_argument /* option may take an argument */ }; /* @@ -286,126 +287,137 @@ enum /* permitted values for its `has_arg' field... */ * Returns -1 if short_too is set and the option does not match long_options. */ static int -parse_long_options(char * const *nargv, const char *options, - const struct option *long_options, int *idx, int short_too) +parse_long_options(char* const* nargv, const char* options, + const struct option* long_options, int* idx, int short_too) { - char *current_argv, *has_equal; - size_t current_argv_len; - int i, ambiguous, match; - -#define IDENTICAL_INTERPRETATION(_x, _y) \ - (long_options[(_x)].has_arg == long_options[(_y)].has_arg && \ - long_options[(_x)].flag == long_options[(_y)].flag && \ - long_options[(_x)].val == long_options[(_y)].val) - - current_argv = place; - match = -1; - ambiguous = 0; - - optind++; - - if ((has_equal = strchr(current_argv, '=')) != NULL) { - /* argument found (--option=arg) */ - current_argv_len = has_equal - current_argv; - has_equal++; - } else - current_argv_len = strlen(current_argv); - - for (i = 0; long_options[i].name; i++) { - /* find matching long option */ - if (strncmp(current_argv, long_options[i].name, - current_argv_len)) - continue; - - if (strlen(long_options[i].name) == current_argv_len) { - /* exact match */ - match = i; - ambiguous = 0; - break; - } - /* + char *current_argv, *has_equal; + size_t current_argv_len; + int i, ambiguous, match; + +#define IDENTICAL_INTERPRETATION(_x, _y) \ + (long_options[(_x)].has_arg == long_options[(_y)].has_arg && long_options[(_x)].flag == long_options[(_y)].flag && long_options[(_x)].val == long_options[(_y)].val) + + current_argv = place; + match = -1; + ambiguous = 0; + + optind++; + + if ((has_equal = strchr(current_argv, '=')) != NULL) + { + /* argument found (--option=arg) */ + current_argv_len = has_equal - current_argv; + has_equal++; + } + else + current_argv_len = strlen(current_argv); + + for (i = 0; long_options[i].name; i++) + { + /* find matching long option */ + if (strncmp(current_argv, long_options[i].name, + current_argv_len)) + continue; + + if (strlen(long_options[i].name) == current_argv_len) + { + /* exact match */ + match = i; + ambiguous = 0; + break; + } + /* * If this is a known short option, don't allow * a partial match of a single character. */ - if (short_too && current_argv_len == 1) - continue; - - if (match == -1) /* partial match */ - match = i; - else if (!IDENTICAL_INTERPRETATION(i, match)) - ambiguous = 1; - } - if (ambiguous) { - /* ambiguous abbreviation */ - if (PRINT_ERROR) - warnx(ambig, (int)current_argv_len, - current_argv); - optopt = 0; - return (BADCH); - } - if (match != -1) { /* option found */ - if (long_options[match].has_arg == no_argument - && has_equal) { - if (PRINT_ERROR) - warnx(noarg, (int)current_argv_len, - current_argv); - /* + if (short_too && current_argv_len == 1) + continue; + + if (match == -1) /* partial match */ + match = i; + else if (!IDENTICAL_INTERPRETATION(i, match)) + ambiguous = 1; + } + if (ambiguous) + { + /* ambiguous abbreviation */ + if (PRINT_ERROR) + warnx(ambig, (int)current_argv_len, + current_argv); + optopt = 0; + return (BADCH); + } + if (match != -1) + { /* option found */ + if (long_options[match].has_arg == no_argument + && has_equal) + { + if (PRINT_ERROR) + warnx(noarg, (int)current_argv_len, + current_argv); + /* * XXX: GNU sets optopt to val regardless of flag */ - if (long_options[match].flag == NULL) - optopt = long_options[match].val; - else - optopt = 0; - return (BADARG); - } - if (long_options[match].has_arg == required_argument || - long_options[match].has_arg == optional_argument) { - if (has_equal) - optarg = has_equal; - else if (long_options[match].has_arg == - required_argument) { - /* + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + return (BADARG); + } + if (long_options[match].has_arg == required_argument || long_options[match].has_arg == optional_argument) + { + if (has_equal) + optarg = has_equal; + else if (long_options[match].has_arg == required_argument) + { + /* * optional argument doesn't use next nargv */ - optarg = nargv[optind++]; - } - } - if ((long_options[match].has_arg == required_argument) - && (optarg == NULL)) { - /* + optarg = nargv[optind++]; + } + } + if ((long_options[match].has_arg == required_argument) + && (optarg == NULL)) + { + /* * Missing argument; leading ':' indicates no error * should be generated. */ - if (PRINT_ERROR) - warnx(recargstring, - current_argv); - /* + if (PRINT_ERROR) + warnx(recargstring, + current_argv); + /* * XXX: GNU sets optopt to val regardless of flag */ - if (long_options[match].flag == NULL) - optopt = long_options[match].val; - else - optopt = 0; - --optind; - return (BADARG); - } - } else { /* unknown option */ - if (short_too) { - --optind; - return (-1); - } - if (PRINT_ERROR) - warnx(illoptstring, current_argv); - optopt = 0; - return (BADCH); - } - if (idx) - *idx = match; - if (long_options[match].flag) { - *long_options[match].flag = long_options[match].val; - return (0); - } else - return (long_options[match].val); + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + --optind; + return (BADARG); + } + } + else + { /* unknown option */ + if (short_too) + { + --optind; + return (-1); + } + if (PRINT_ERROR) + warnx(illoptstring, current_argv); + optopt = 0; + return (BADCH); + } + if (idx) + *idx = match; + if (long_options[match].flag) + { + *long_options[match].flag = long_options[match].val; + return (0); + } + else + return (long_options[match].val); #undef IDENTICAL_INTERPRETATION } @@ -414,222 +426,235 @@ parse_long_options(char * const *nargv, const char *options, * Parse argc/argv argument vector. Called by user level routines. */ static int -getopt_internal(int nargc, char * const *nargv, const char *options, - const struct option *long_options, int *idx, int flags) +getopt_internal(int nargc, char* const* nargv, const char* options, + const struct option* long_options, int* idx, int flags) { - char *oli; /* option letter list index */ - int optchar, short_too; - static int posixly_correct = -1; + char* oli; /* option letter list index */ + int optchar, short_too; + static int posixly_correct = -1; - if (options == NULL) - return (-1); + if (options == NULL) + return (-1); - /* + /* * XXX Some GNU programs (like cvs) set optind to 0 instead of * XXX using optreset. Work around this braindamage. */ - if (optind == 0) - optind = optreset = 1; + if (optind == 0) + optind = optreset = 1; - /* + /* * Disable GNU extensions if POSIXLY_CORRECT is set or options * string begins with a '+'. * * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or * optreset != 0 for GNU compatibility. */ - if (posixly_correct == -1 || optreset != 0) - posixly_correct = (getenv("POSIXLY_CORRECT") != NULL); - if (*options == '-') - flags |= FLAG_ALLARGS; - else if (posixly_correct || *options == '+') - flags &= ~FLAG_PERMUTE; - if (*options == '+' || *options == '-') - options++; - - optarg = NULL; - if (optreset) - nonopt_start = nonopt_end = -1; + if (posixly_correct == -1 || optreset != 0) + posixly_correct = (getenv("POSIXLY_CORRECT") != NULL); + if (*options == '-') + flags |= FLAG_ALLARGS; + else if (posixly_correct || *options == '+') + flags &= ~FLAG_PERMUTE; + if (*options == '+' || *options == '-') + options++; + + optarg = NULL; + if (optreset) + nonopt_start = nonopt_end = -1; start: - if (optreset || !*place) { /* update scanning pointer */ - optreset = 0; - if (optind >= nargc) { /* end of argument vector */ - place = EMSG; - if (nonopt_end != -1) { - /* do permutation, if we have to */ - permute_args(nonopt_start, nonopt_end, - optind, nargv); - optind -= nonopt_end - nonopt_start; - } - else if (nonopt_start != -1) { - /* + if (optreset || !*place) + { /* update scanning pointer */ + optreset = 0; + if (optind >= nargc) + { /* end of argument vector */ + place = EMSG; + if (nonopt_end != -1) + { + /* do permutation, if we have to */ + permute_args(nonopt_start, nonopt_end, + optind, nargv); + optind -= nonopt_end - nonopt_start; + } + else if (nonopt_start != -1) + { + /* * If we skipped non-options, set optind * to the first of them. */ - optind = nonopt_start; - } - nonopt_start = nonopt_end = -1; - return (-1); - } - if (*(place = nargv[optind]) != '-' || - (place[1] == '\0' && strchr(options, '-') == NULL)) { - place = EMSG; /* found non-option */ - if (flags & FLAG_ALLARGS) { - /* + optind = nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + if (*(place = nargv[optind]) != '-' || (place[1] == '\0' && strchr(options, '-') == NULL)) + { + place = EMSG; /* found non-option */ + if (flags & FLAG_ALLARGS) + { + /* * GNU extension: * return non-option as argument to option 1 */ - optarg = nargv[optind++]; - return (INORDER); - } - if (!(flags & FLAG_PERMUTE)) { - /* + optarg = nargv[optind++]; + return (INORDER); + } + if (!(flags & FLAG_PERMUTE)) + { + /* * If no permutation wanted, stop parsing * at first non-option. */ - return (-1); - } - /* do permutation */ - if (nonopt_start == -1) - nonopt_start = optind; - else if (nonopt_end != -1) { - permute_args(nonopt_start, nonopt_end, - optind, nargv); - nonopt_start = optind - - (nonopt_end - nonopt_start); - nonopt_end = -1; - } - optind++; - /* process next argument */ - goto start; - } - if (nonopt_start != -1 && nonopt_end == -1) - nonopt_end = optind; - - /* + return (-1); + } + /* do permutation */ + if (nonopt_start == -1) + nonopt_start = optind; + else if (nonopt_end != -1) + { + permute_args(nonopt_start, nonopt_end, + optind, nargv); + nonopt_start = optind - (nonopt_end - nonopt_start); + nonopt_end = -1; + } + optind++; + /* process next argument */ + goto start; + } + if (nonopt_start != -1 && nonopt_end == -1) + nonopt_end = optind; + + /* * If we have "-" do nothing, if "--" we are done. */ - if (place[1] != '\0' && *++place == '-' && place[1] == '\0') { - optind++; - place = EMSG; - /* + if (place[1] != '\0' && *++place == '-' && place[1] == '\0') + { + optind++; + place = EMSG; + /* * We found an option (--), so if we skipped * non-options, we have to permute. */ - if (nonopt_end != -1) { - permute_args(nonopt_start, nonopt_end, - optind, nargv); - optind -= nonopt_end - nonopt_start; - } - nonopt_start = nonopt_end = -1; - return (-1); - } - } - - /* + if (nonopt_end != -1) + { + permute_args(nonopt_start, nonopt_end, + optind, nargv); + optind -= nonopt_end - nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + } + + /* * Check long options if: * 1) we were passed some * 2) the arg is not just "-" * 3) either the arg starts with -- we are getopt_long_only() */ - if (long_options != NULL && place != nargv[optind] && - (*place == '-' || (flags & FLAG_LONGONLY))) { - short_too = 0; - if (*place == '-') - place++; /* --foo long option */ - else if (*place != ':' && strchr(options, *place) != NULL) - short_too = 1; /* could be short option too */ - - optchar = parse_long_options(nargv, options, long_options, - idx, short_too); - if (optchar != -1) { - place = EMSG; - return (optchar); - } - } - - if ((optchar = (int)*place++) == (int)':' || - (optchar == (int)'-' && *place != '\0') || - (oli = (char*)strchr(options, optchar)) == NULL) { - /* + if (long_options != NULL && place != nargv[optind] && (*place == '-' || (flags & FLAG_LONGONLY))) + { + short_too = 0; + if (*place == '-') + place++; /* --foo long option */ + else if (*place != ':' && strchr(options, *place) != NULL) + short_too = 1; /* could be short option too */ + + optchar = parse_long_options(nargv, options, long_options, + idx, short_too); + if (optchar != -1) + { + place = EMSG; + return (optchar); + } + } + + if ((optchar = (int)*place++) == (int)':' || (optchar == (int)'-' && *place != '\0') || (oli = (char*)strchr(options, optchar)) == NULL) + { + /* * If the user specified "-" and '-' isn't listed in * options, return -1 (non-option) as per POSIX. * Otherwise, it is an unknown option character (or ':'). */ - if (optchar == (int)'-' && *place == '\0') - return (-1); - if (!*place) - ++optind; - if (PRINT_ERROR) - warnx(illoptchar, optchar); - optopt = optchar; - return (BADCH); - } - if (long_options != NULL && optchar == 'W' && oli[1] == ';') { - /* -W long-option */ - if (*place) /* no space */ - /* NOTHING */; - else if (++optind >= nargc) { /* no arg */ - place = EMSG; - if (PRINT_ERROR) - warnx(recargchar, optchar); - optopt = optchar; - return (BADARG); - } else /* white space */ - place = nargv[optind]; - optchar = parse_long_options(nargv, options, long_options, - idx, 0); - place = EMSG; - return (optchar); - } - if (*++oli != ':') { /* doesn't take argument */ - if (!*place) - ++optind; - } else { /* takes (optional) argument */ - optarg = NULL; - if (*place) /* no white space */ - optarg = place; - else if (oli[1] != ':') { /* arg not optional */ - if (++optind >= nargc) { /* no arg */ - place = EMSG; - if (PRINT_ERROR) - warnx(recargchar, optchar); - optopt = optchar; - return (BADARG); - } else - optarg = nargv[optind]; - } - place = EMSG; - ++optind; - } - /* dump back option letter */ - return (optchar); + if (optchar == (int)'-' && *place == '\0') + return (-1); + if (!*place) + ++optind; + if (PRINT_ERROR) + warnx(illoptchar, optchar); + optopt = optchar; + return (BADCH); + } + if (long_options != NULL && optchar == 'W' && oli[1] == ';') + { + /* -W long-option */ + if (*place) /* no space */ + /* NOTHING */; + else if (++optind >= nargc) + { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } + else /* white space */ + place = nargv[optind]; + optchar = parse_long_options(nargv, options, long_options, + idx, 0); + place = EMSG; + return (optchar); + } + if (*++oli != ':') + { /* doesn't take argument */ + if (!*place) + ++optind; + } + else + { /* takes (optional) argument */ + optarg = NULL; + if (*place) /* no white space */ + optarg = place; + else if (oli[1] != ':') + { /* arg not optional */ + if (++optind >= nargc) + { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } + else + optarg = nargv[optind]; + } + place = EMSG; + ++optind; + } + /* dump back option letter */ + return (optchar); } /* * getopt_long -- * Parse argc/argv argument vector. */ -int -getopt_long(int nargc, char * const *nargv, const char *options, - const struct option *long_options, int *idx) +int getopt_long(int nargc, char* const* nargv, const char* options, + const struct option* long_options, int* idx) { - - return (getopt_internal(nargc, nargv, options, long_options, idx, - FLAG_PERMUTE)); + return (getopt_internal(nargc, nargv, options, long_options, idx, + FLAG_PERMUTE)); } /* * getopt_long_only -- * Parse argc/argv argument vector. */ -int -getopt_long_only(int nargc, char * const *nargv, const char *options, - const struct option *long_options, int *idx) +int getopt_long_only(int nargc, char* const* nargv, const char* options, + const struct option* long_options, int* idx) { - - return (getopt_internal(nargc, nargv, options, long_options, idx, - FLAG_PERMUTE|FLAG_LONGONLY)); + return (getopt_internal(nargc, nargv, options, long_options, idx, + FLAG_PERMUTE | FLAG_LONGONLY)); } //extern int getopt_long(int nargc, char * const *nargv, const char *options, @@ -643,7 +668,7 @@ getopt_long_only(int nargc, char * const *nargv, const char *options, /* * ...for the long form API only; keep this for compatibility. */ -# define HAVE_DECL_GETOPT 1 +#define HAVE_DECL_GETOPT 1 #endif #ifdef __cplusplus diff --git a/examples/common/stb_image.h b/examples/common/stb_image.h index aa445aadf..142610cf4 100644 --- a/examples/common/stb_image.h +++ b/examples/common/stb_image.h @@ -3,13 +3,13 @@ #ifndef STBI_NO_STDIO #include -#endif // STBI_NO_STDIO +#endif // STBI_NO_STDIO #define STBI_VERSION 1 enum { - STBI_default = 0, // only used for desired_channels + STBI_default = 0, // only used for desired_channels STBI_grey = 1, STBI_grey_alpha = 2, @@ -36,9 +36,9 @@ extern "C" { typedef struct { int (*read)(void* user, char* data, - int size); // fill 'data' with 'size' bytes. return number of bytes actually read - void (*skip)(void* user, int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative - int (*eof)(void* user); // returns nonzero if we are at end of file/data + int size); // fill 'data' with 'size' bytes. return number of bytes actually read + void (*skip)(void* user, int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative + int (*eof)(void* user); // returns nonzero if we are at end of file/data } stbi_io_callbacks; //////////////////////////////////// @@ -95,12 +95,12 @@ extern float* stbi_loadf_from_file(FILE* f, int* x, int* y, int* channels_in_fil #ifndef STBI_NO_HDR extern void stbi_hdr_to_ldr_gamma(float gamma); extern void stbi_hdr_to_ldr_scale(float scale); -#endif // STBI_NO_HDR +#endif // STBI_NO_HDR #ifndef STBI_NO_LINEAR extern void stbi_ldr_to_hdr_gamma(float gamma); extern void stbi_ldr_to_hdr_scale(float scale); -#endif // STBI_NO_LINEAR +#endif // STBI_NO_LINEAR // stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR extern int stbi_is_hdr_from_callbacks(stbi_io_callbacks const* clbk, void* user); @@ -108,7 +108,7 @@ extern int stbi_is_hdr_from_memory(stbi_uc const* buffer, int len); #ifndef STBI_NO_STDIO extern int stbi_is_hdr(char const* filename); extern int stbi_is_hdr_from_file(FILE* f); -#endif // STBI_NO_STDIO +#endif // STBI_NO_STDIO // get a VERY brief reason for failure // NOT THREADSAFE @@ -160,14 +160,12 @@ extern int stbi_zlib_decode_noheader_buffer(char* obuffer, int olen, const char* // // //// end header file ///////////////////////////////////////////////////// -#endif // STBI_INCLUDE_STB_IMAGE_H +#endif // STBI_INCLUDE_STB_IMAGE_H #define STB_IMAGE_IMPLEMENTATION #ifdef STB_IMAGE_IMPLEMENTATION -#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) || \ - defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || \ - defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB) +#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB) #ifndef STBI_ONLY_JPEG #define STBI_NO_JPEG #endif @@ -202,13 +200,13 @@ extern int stbi_zlib_decode_noheader_buffer(char* obuffer, int olen, const char* #endif #include -#include // ptrdiff_t on osx +#include // ptrdiff_t on osx #include #include #include #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) -#include // ldexp, pow +#include // ldexp, pow #endif #ifndef STBI_NO_STDIO @@ -247,9 +245,9 @@ typedef int32_t stbi__int32; typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1]; #ifdef _MSC_VER -#define STBI_NOTUSED(v) ( void )(v) +#define STBI_NOTUSED(v) (void)(v) #else -#define STBI_NOTUSED(v) ( void )sizeof(v) +#define STBI_NOTUSED(v) (void)sizeof(v) #endif #ifdef _MSC_VER @@ -271,9 +269,9 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1]; #endif #ifndef STBI_MALLOC -#define STBI_MALLOC(sz) malloc(sz) +#define STBI_MALLOC(sz) malloc(sz) #define STBI_REALLOC(p, newsz) realloc(p, newsz) -#define STBI_FREE(p) free(p) +#define STBI_FREE(p) free(p) #endif #ifndef STBI_REALLOC_SIZED @@ -319,8 +317,8 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1]; #ifdef _MSC_VER -#if _MSC_VER >= 1400 // not VC6 -#include // __cpuid +#if _MSC_VER >= 1400 // not VC6 +#include // __cpuid static int stbi__cpuid3(void) { int info[4]; @@ -347,7 +345,7 @@ static int stbi__sse2_available(void) int info3 = stbi__cpuid3(); return ((info3 >> 26) & 1) != 0; } -#else // assume GCC-style if not VC++ +#else // assume GCC-style if not VC++ #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) static int stbi__sse2_available(void) @@ -404,8 +402,8 @@ static void stbi__start_mem(stbi__context* s, stbi_uc const* buffer, int len) { s->io.read = NULL; s->read_from_callbacks = 0; - s->img_buffer = s->img_buffer_original = ( stbi_uc* )buffer; - s->img_buffer_end = s->img_buffer_original_end = ( stbi_uc* )buffer + len; + s->img_buffer = s->img_buffer_original = (stbi_uc*)buffer; + s->img_buffer_end = s->img_buffer_original_end = (stbi_uc*)buffer + len; } // initialize a callback-based context @@ -424,17 +422,17 @@ static void stbi__start_callbacks(stbi__context* s, stbi_io_callbacks* c, void* static int stbi__stdio_read(void* user, char* data, int size) { - return ( int )fread(data, 1, size, ( FILE* )user); + return (int)fread(data, 1, size, (FILE*)user); } static void stbi__stdio_skip(void* user, int n) { - fseek(( FILE* )user, n, SEEK_CUR); + fseek((FILE*)user, n, SEEK_CUR); } static int stbi__stdio_eof(void* user) { - return feof(( FILE* )user); + return feof((FILE*)user); } static stbi_io_callbacks stbi__stdio_callbacks = { @@ -445,12 +443,12 @@ static stbi_io_callbacks stbi__stdio_callbacks = { static void stbi__start_file(stbi__context* s, FILE* f) { - stbi__start_callbacks(s, &stbi__stdio_callbacks, ( void* )f); + stbi__start_callbacks(s, &stbi__stdio_callbacks, (void*)f); } // static void stop_file(stbi__context *s) { } -#endif // !STBI_NO_STDIO +#endif // !STBI_NO_STDIO static void stbi__rewind(stbi__context* s) { @@ -564,7 +562,7 @@ static void* stbi__malloc(size_t size) // negative terms are considered invalid. static int stbi__addsizes_valid(int a, int b) { - if(b < 0) + if (b < 0) return 0; // now 0 <= b <= INT_MAX, hence also // 0 <= INT_MAX - b <= INTMAX. @@ -577,10 +575,10 @@ static int stbi__addsizes_valid(int a, int b) // negative factors are considered invalid. static int stbi__mul2sizes_valid(int a, int b) { - if(a < 0 || b < 0) + if (a < 0 || b < 0) return 0; - if(b == 0) - return 1; // mul-by-0 is always safe + if (b == 0) + return 1; // mul-by-0 is always safe // portable way to check for no overflows in a*b return a <= INT_MAX / b; } @@ -601,22 +599,21 @@ static int stbi__mad3sizes_valid(int a, int b, int c, int add) #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) { - return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__mul2sizes_valid(a * b * c, d) && - stbi__addsizes_valid(a * b * c * d, add); + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__mul2sizes_valid(a * b * c, d) && stbi__addsizes_valid(a * b * c * d, add); } #endif // mallocs with size overflow checking static void* stbi__malloc_mad2(int a, int b, int add) { - if(!stbi__mad2sizes_valid(a, b, add)) + if (!stbi__mad2sizes_valid(a, b, add)) return NULL; return stbi__malloc(a * b + add); } static void* stbi__malloc_mad3(int a, int b, int c, int add) { - if(!stbi__mad3sizes_valid(a, b, c, add)) + if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL; return stbi__malloc(a * b * c + add); } @@ -624,7 +621,7 @@ static void* stbi__malloc_mad3(int a, int b, int c, int add) #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) static void* stbi__malloc_mad4(int a, int b, int c, int d, int add) { - if(!stbi__mad4sizes_valid(a, b, c, d, add)) + if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL; return stbi__malloc(a * b * c * d + add); } @@ -642,8 +639,8 @@ static void* stbi__malloc_mad4(int a, int b, int c, int d, int add) #define stbi__err(x, y) stbi__err(x) #endif -#define stbi__errpf(x, y) (( float* )(size_t)(stbi__err(x, y) ? NULL : NULL)) -#define stbi__errpuc(x, y) (( unsigned char* )(size_t)(stbi__err(x, y) ? NULL : NULL)) +#define stbi__errpf(x, y) ((float*)(size_t)(stbi__err(x, y) ? NULL : NULL)) +#define stbi__errpuc(x, y) ((unsigned char*)(size_t)(stbi__err(x, y) ? NULL : NULL)) extern void stbi_image_free(void* retval_from_stbi_load) { @@ -667,43 +664,42 @@ extern void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) static void* stbi__load_main(stbi__context* s, int* x, int* y, int* comp, int req_comp, stbi__result_info* ri, int bpc) { - memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields - ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed - ri->channel_order = - STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order + memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields + ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed + ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order ri->num_channels = 0; #ifndef STBI_NO_JPEG - if(stbi__jpeg_test(s)) + if (stbi__jpeg_test(s)) return stbi__jpeg_load(s, x, y, comp, req_comp, ri); #endif #ifndef STBI_NO_PNG - if(stbi__png_test(s)) + if (stbi__png_test(s)) return stbi__png_load(s, x, y, comp, req_comp, ri); #endif #ifndef STBI_NO_BMP - if(stbi__bmp_test(s)) + if (stbi__bmp_test(s)) return stbi__bmp_load(s, x, y, comp, req_comp, ri); #endif #ifndef STBI_NO_GIF - if(stbi__gif_test(s)) + if (stbi__gif_test(s)) return stbi__gif_load(s, x, y, comp, req_comp, ri); #endif #ifndef STBI_NO_PSD - if(stbi__psd_test(s)) + if (stbi__psd_test(s)) return stbi__psd_load(s, x, y, comp, req_comp, ri, bpc); #endif #ifndef STBI_NO_PIC - if(stbi__pic_test(s)) + if (stbi__pic_test(s)) return stbi__pic_load(s, x, y, comp, req_comp, ri); #endif #ifndef STBI_NO_PNM - if(stbi__pnm_test(s)) + if (stbi__pnm_test(s)) return stbi__pnm_load(s, x, y, comp, req_comp, ri); #endif #ifndef STBI_NO_HDR - if(stbi__hdr_test(s)) + if (stbi__hdr_test(s)) { float* hdr = stbi__hdr_load(s, x, y, comp, req_comp, ri); return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); @@ -712,7 +708,7 @@ static void* stbi__load_main(stbi__context* s, int* x, int* y, int* comp, int re #ifndef STBI_NO_TGA // test tga last because it's a crappy test! - if(stbi__tga_test(s)) + if (stbi__tga_test(s)) return stbi__tga_load(s, x, y, comp, req_comp, ri); #endif @@ -725,13 +721,12 @@ static stbi_uc* stbi__convert_16_to_8(stbi__uint16* orig, int w, int h, int chan int img_len = w * h * channels; stbi_uc* reduced; - reduced = ( stbi_uc* )stbi__malloc(img_len); - if(reduced == NULL) + reduced = (stbi_uc*)stbi__malloc(img_len); + if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory"); - for(i = 0; i < img_len; ++i) - reduced[i] = - (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling + for (i = 0; i < img_len; ++i) + reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling STBI_FREE(orig); return reduced; @@ -743,13 +738,12 @@ static stbi__uint16* stbi__convert_8_to_16(stbi_uc* orig, int w, int h, int chan int img_len = w * h * channels; stbi__uint16* enlarged; - enlarged = ( stbi__uint16* )stbi__malloc(img_len * 2); - if(enlarged == NULL) - return ( stbi__uint16* )stbi__errpuc("outofmem", "Out of memory"); + enlarged = (stbi__uint16*)stbi__malloc(img_len * 2); + if (enlarged == NULL) + return (stbi__uint16*)stbi__errpuc("outofmem", "Out of memory"); - for(i = 0; i < img_len; ++i) - enlarged[i] = - (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff + for (i = 0; i < img_len; ++i) + enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff STBI_FREE(orig); return enlarged; @@ -758,17 +752,17 @@ static stbi__uint16* stbi__convert_8_to_16(stbi_uc* orig, int w, int h, int chan static void stbi__vertical_flip(void* image, int w, int h, int bytes_per_pixel) { int row; - size_t bytes_per_row = ( size_t )w * bytes_per_pixel; + size_t bytes_per_row = (size_t)w * bytes_per_pixel; stbi_uc temp[2048]; - stbi_uc* bytes = ( stbi_uc* )image; + stbi_uc* bytes = (stbi_uc*)image; - for(row = 0; row < (h >> 1); row++) + for (row = 0; row < (h >> 1); row++) { stbi_uc* row0 = bytes + row * bytes_per_row; stbi_uc* row1 = bytes + (h - row - 1) * bytes_per_row; // swap row0 with row1 size_t bytes_left = bytes_per_row; - while(bytes_left) + while (bytes_left) { size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp); memcpy(temp, row0, bytes_copy); @@ -786,8 +780,8 @@ static void stbi__vertical_flip_slices(void* image, int w, int h, int z, int byt int slice; int slice_size = w * h * bytes_per_pixel; - stbi_uc* bytes = ( stbi_uc* )image; - for(slice = 0; slice < z; ++slice) + stbi_uc* bytes = (stbi_uc*)image; + for (slice = 0; slice < z; ++slice) { stbi__vertical_flip(bytes, w, h, bytes_per_pixel); bytes += slice_size; @@ -799,25 +793,25 @@ static unsigned char* stbi__load_and_postprocess_8bit(stbi__context* s, int* x, stbi__result_info ri; void* result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8); - if(result == NULL) + if (result == NULL) return NULL; - if(ri.bits_per_channel != 8) + if (ri.bits_per_channel != 8) { STBI_ASSERT(ri.bits_per_channel == 16); - result = stbi__convert_16_to_8(( stbi__uint16* )result, *x, *y, req_comp == 0 ? *comp : req_comp); + result = stbi__convert_16_to_8((stbi__uint16*)result, *x, *y, req_comp == 0 ? *comp : req_comp); ri.bits_per_channel = 8; } // @TODO: move stbi__convert_format to here - if(stbi__vertically_flip_on_load) + if (stbi__vertically_flip_on_load) { int channels = req_comp ? req_comp : *comp; stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc)); } - return ( unsigned char* )result; + return (unsigned char*)result; } static stbi__uint16* stbi__load_and_postprocess_16bit(stbi__context* s, int* x, int* y, int* comp, int req_comp) @@ -825,32 +819,32 @@ static stbi__uint16* stbi__load_and_postprocess_16bit(stbi__context* s, int* x, stbi__result_info ri; void* result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16); - if(result == NULL) + if (result == NULL) return NULL; - if(ri.bits_per_channel != 16) + if (ri.bits_per_channel != 16) { STBI_ASSERT(ri.bits_per_channel == 8); - result = stbi__convert_8_to_16(( stbi_uc* )result, *x, *y, req_comp == 0 ? *comp : req_comp); + result = stbi__convert_8_to_16((stbi_uc*)result, *x, *y, req_comp == 0 ? *comp : req_comp); ri.bits_per_channel = 16; } // @TODO: move stbi__convert_format16 to here // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision - if(stbi__vertically_flip_on_load) + if (stbi__vertically_flip_on_load) { int channels = req_comp ? req_comp : *comp; stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16)); } - return ( stbi__uint16* )result; + return (stbi__uint16*)result; } #if !defined(STBI_NO_HDR) || !defined(STBI_NO_LINEAR) static void stbi__float_postprocess(float* result, int* x, int* y, int* comp, int req_comp) { - if(stbi__vertically_flip_on_load && result != NULL) + if (stbi__vertically_flip_on_load && result != NULL) { int channels = req_comp ? req_comp : *comp; stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); @@ -864,7 +858,7 @@ static FILE* stbi__fopen(char const* filename, char const* mode) { FILE* f; #if defined(_MSC_VER) && _MSC_VER >= 1400 - if(0 != fopen_s(&f, filename, mode)) + if (0 != fopen_s(&f, filename, mode)) f = 0; #else f = fopen(filename, mode); @@ -876,7 +870,7 @@ extern stbi_uc* stbi_load(const char* filename, int* x, int* y, int* comp, int r { FILE* f = stbi__fopen(filename, "rb"); unsigned char* result; - if(!f) + if (!f) return stbi__errpuc("can't fopen", "Unable to open file"); result = stbi_load_from_file(f, x, y, comp, req_comp); fclose(f); @@ -889,10 +883,10 @@ extern stbi_uc* stbi_load_from_file(FILE* f, int* x, int* y, int* comp, int req_ stbi__context s; stbi__start_file(&s, f); result = stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp); - if(result) + if (result) { // need to 'unget' all the characters in the IO buffer - fseek(f, -( int )(s.img_buffer_end - s.img_buffer), SEEK_CUR); + fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR); } return result; } @@ -903,10 +897,10 @@ extern stbi__uint16* stbi_load_from_file_16(FILE* f, int* x, int* y, int* comp, stbi__context s; stbi__start_file(&s, f); result = stbi__load_and_postprocess_16bit(&s, x, y, comp, req_comp); - if(result) + if (result) { // need to 'unget' all the characters in the IO buffer - fseek(f, -( int )(s.img_buffer_end - s.img_buffer), SEEK_CUR); + fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR); } return result; } @@ -915,14 +909,14 @@ extern stbi_us* stbi_load_16(char const* filename, int* x, int* y, int* comp, in { FILE* f = stbi__fopen(filename, "rb"); stbi__uint16* result; - if(!f) - return ( stbi_us* )stbi__errpuc("can't fopen", "Unable to open file"); + if (!f) + return (stbi_us*)stbi__errpuc("can't fopen", "Unable to open file"); result = stbi_load_from_file_16(f, x, y, comp, req_comp); fclose(f); return result; } -#endif //! STBI_NO_STDIO +#endif //! STBI_NO_STDIO extern stbi_us* stbi_load_16_from_memory(stbi_uc const* buffer, int len, int* x, int* y, int* channels_in_file, int desired_channels) @@ -936,7 +930,7 @@ extern stbi_us* stbi_load_16_from_callbacks(stbi_io_callbacks const* clbk, void* int* channels_in_file, int desired_channels) { stbi__context s; - stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user); + stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user); return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file, desired_channels); } @@ -951,7 +945,7 @@ extern stbi_uc* stbi_load_from_callbacks(stbi_io_callbacks const* clbk, void* us int req_comp) { stbi__context s; - stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user); + stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user); return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp); } @@ -963,8 +957,8 @@ extern stbi_uc* stbi_load_gif_from_memory(stbi_uc const* buffer, int len, int** stbi__context s; stbi__start_mem(&s, buffer, len); - result = ( unsigned char* )stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp); - if(stbi__vertically_flip_on_load) + result = (unsigned char*)stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp); + if (stbi__vertically_flip_on_load) { stbi__vertical_flip_slices(result, *x, *y, *z, *comp); } @@ -978,17 +972,17 @@ static float* stbi__loadf_main(stbi__context* s, int* x, int* y, int* comp, int { unsigned char* data; #ifndef STBI_NO_HDR - if(stbi__hdr_test(s)) + if (stbi__hdr_test(s)) { stbi__result_info ri; float* hdr_data = stbi__hdr_load(s, x, y, comp, req_comp, &ri); - if(hdr_data) + if (hdr_data) stbi__float_postprocess(hdr_data, x, y, comp, req_comp); return hdr_data; } #endif data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp); - if(data) + if (data) return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp); return stbi__errpf("unknown image type", "Image not of any known type, or corrupt"); } @@ -1004,7 +998,7 @@ extern float* stbi_loadf_from_callbacks(stbi_io_callbacks const* clbk, void* use int req_comp) { stbi__context s; - stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user); + stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user); return stbi__loadf_main(&s, x, y, comp, req_comp); } @@ -1013,7 +1007,7 @@ extern float* stbi_loadf(char const* filename, int* x, int* y, int* comp, int re { float* result; FILE* f = stbi__fopen(filename, "rb"); - if(!f) + if (!f) return stbi__errpf("can't fopen", "Unable to open file"); result = stbi_loadf_from_file(f, x, y, comp, req_comp); fclose(f); @@ -1026,9 +1020,9 @@ extern float* stbi_loadf_from_file(FILE* f, int* x, int* y, int* comp, int req_c stbi__start_file(&s, f); return stbi__loadf_main(&s, x, y, comp, req_comp); } -#endif // !STBI_NO_STDIO +#endif // !STBI_NO_STDIO -#endif // !STBI_NO_LINEAR +#endif // !STBI_NO_LINEAR // these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is // defined, for API simplicity; if STBI_NO_LINEAR is defined, it always @@ -1052,7 +1046,7 @@ extern int stbi_is_hdr(char const* filename) { FILE* f = stbi__fopen(filename, "rb"); int result = 0; - if(f) + if (f) { result = stbi_is_hdr_from_file(f); fclose(f); @@ -1075,13 +1069,13 @@ extern int stbi_is_hdr_from_file(FILE* f) return 0; #endif } -#endif // !STBI_NO_STDIO +#endif // !STBI_NO_STDIO extern int stbi_is_hdr_from_callbacks(stbi_io_callbacks const* clbk, void* user) { #ifndef STBI_NO_HDR stbi__context s; - stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user); + stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user); return stbi__hdr_test(&s); #else STBI_NOTUSED(clbk); @@ -1128,8 +1122,8 @@ enum static void stbi__refill_buffer(stbi__context* s) { - int n = (s->io.read)(s->io_user_data, ( char* )s->buffer_start, s->buflen); - if(n == 0) + int n = (s->io.read)(s->io_user_data, (char*)s->buffer_start, s->buflen); + if (n == 0) { // at end of file, treat same as if from memory, but need to handle case // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file @@ -1147,9 +1141,9 @@ static void stbi__refill_buffer(stbi__context* s) stbi_inline static stbi_uc stbi__get8(stbi__context* s) { - if(s->img_buffer < s->img_buffer_end) + if (s->img_buffer < s->img_buffer_end) return *s->img_buffer++; - if(s->read_from_callbacks) + if (s->read_from_callbacks) { stbi__refill_buffer(s); return *s->img_buffer++; @@ -1159,13 +1153,13 @@ stbi_inline static stbi_uc stbi__get8(stbi__context* s) stbi_inline static int stbi__at_eof(stbi__context* s) { - if(s->io.read) + if (s->io.read) { - if(!(s->io.eof)(s->io_user_data)) + if (!(s->io.eof)(s->io_user_data)) return 0; // if feof() is true, check if buffer = end // special case: we've only got the special 0 character at the end - if(s->read_from_callbacks == 0) + if (s->read_from_callbacks == 0) return 1; } @@ -1174,15 +1168,15 @@ stbi_inline static int stbi__at_eof(stbi__context* s) static void stbi__skip(stbi__context* s, int n) { - if(n < 0) + if (n < 0) { s->img_buffer = s->img_buffer_end; return; } - if(s->io.read) + if (s->io.read) { - int blen = ( int )(s->img_buffer_end - s->img_buffer); - if(blen < n) + int blen = (int)(s->img_buffer_end - s->img_buffer); + if (blen < n) { s->img_buffer = s->img_buffer_end; (s->io.skip)(s->io_user_data, n - blen); @@ -1194,23 +1188,23 @@ static void stbi__skip(stbi__context* s, int n) static int stbi__getn(stbi__context* s, stbi_uc* buffer, int n) { - if(s->io.read) + if (s->io.read) { - int blen = ( int )(s->img_buffer_end - s->img_buffer); - if(blen < n) + int blen = (int)(s->img_buffer_end - s->img_buffer); + if (blen < n) { int res, count; memcpy(buffer, s->img_buffer, blen); - count = (s->io.read)(s->io_user_data, ( char* )buffer + blen, n - blen); + count = (s->io.read)(s->io_user_data, (char*)buffer + blen, n - blen); res = (count == (n - blen)); s->img_buffer = s->img_buffer_end; return res; } } - if(s->img_buffer + n <= s->img_buffer_end) + if (s->img_buffer + n <= s->img_buffer_end) { memcpy(buffer, s->img_buffer, n); s->img_buffer += n; @@ -1250,7 +1244,7 @@ static stbi__uint32 stbi__get32le(stbi__context* s) } #endif -#define STBI__BYTECAST(x) ((stbi_uc)(( x )&255)) // truncate int to byte without warnings +#define STBI__BYTECAST(x) ((stbi_uc)((x)&255)) // truncate int to byte without warnings ////////////////////////////////////////////////////////////////////////////// // @@ -1273,29 +1267,29 @@ static unsigned char* stbi__convert_format(unsigned char* data, int img_n, int r int i, j; unsigned char* good; - if(req_comp == img_n) + if (req_comp == img_n) return data; STBI_ASSERT(req_comp >= 1 && req_comp <= 4); - good = ( unsigned char* )stbi__malloc_mad3(req_comp, x, y, 0); - if(good == NULL) + good = (unsigned char*)stbi__malloc_mad3(req_comp, x, y, 0); + if (good == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); } - for(j = 0; j < ( int )y; ++j) + for (j = 0; j < (int)y; ++j) { unsigned char* src = data + j * x * img_n; unsigned char* dest = good + j * x * req_comp; -#define STBI__COMBO(a, b) (( a )*8 + (b)) +#define STBI__COMBO(a, b) ((a)*8 + (b)) #define STBI__CASE(a, b) \ case STBI__COMBO(a, b): \ - for(i = x - 1; i >= 0; --i, src += a, dest += b) + for (i = x - 1; i >= 0; --i, src += a, dest += b) // convert source image with img_n components to one with req_comp components; // avoid switch per pixel, so use switch per scanline and massive macros - switch(STBI__COMBO(img_n, req_comp)) + switch (STBI__COMBO(img_n, req_comp)) { STBI__CASE(1, 2) { @@ -1357,8 +1351,8 @@ static unsigned char* stbi__convert_format(unsigned char* data, int img_n, int r dest[0] = src[0], dest[1] = src[1], dest[2] = src[2]; } break; - default: - STBI_ASSERT(0); + default: + STBI_ASSERT(0); } #undef STBI__CASE } @@ -1377,29 +1371,29 @@ static stbi__uint16* stbi__convert_format16(stbi__uint16* data, int img_n, int r int i, j; stbi__uint16* good; - if(req_comp == img_n) + if (req_comp == img_n) return data; STBI_ASSERT(req_comp >= 1 && req_comp <= 4); - good = ( stbi__uint16* )stbi__malloc((size_t)req_comp * x * y * 2); - if(good == NULL) + good = (stbi__uint16*)stbi__malloc((size_t)req_comp * x * y * 2); + if (good == NULL) { STBI_FREE(data); - return ( stbi__uint16* )stbi__errpuc("outofmem", "Out of memory"); + return (stbi__uint16*)stbi__errpuc("outofmem", "Out of memory"); } - for(j = 0; j < ( int )y; ++j) + for (j = 0; j < (int)y; ++j) { stbi__uint16* src = data + j * x * img_n; stbi__uint16* dest = good + j * x * req_comp; -#define STBI__COMBO(a, b) (( a )*8 + (b)) +#define STBI__COMBO(a, b) ((a)*8 + (b)) #define STBI__CASE(a, b) \ case STBI__COMBO(a, b): \ - for(i = x - 1; i >= 0; --i, src += a, dest += b) + for (i = x - 1; i >= 0; --i, src += a, dest += b) // convert source image with img_n components to one with req_comp components; // avoid switch per pixel, so use switch per scanline and massive macros - switch(STBI__COMBO(img_n, req_comp)) + switch (STBI__COMBO(img_n, req_comp)) { STBI__CASE(1, 2) { @@ -1461,8 +1455,8 @@ static stbi__uint16* stbi__convert_format16(stbi__uint16* data, int img_n, int r dest[0] = src[0], dest[1] = src[1], dest[2] = src[2]; } break; - default: - STBI_ASSERT(0); + default: + STBI_ASSERT(0); } #undef STBI__CASE } @@ -1476,26 +1470,26 @@ static float* stbi__ldr_to_hdr(stbi_uc* data, int x, int y, int comp) { int i, k, n; float* output; - if(!data) + if (!data) return NULL; - output = ( float* )stbi__malloc_mad4(x, y, comp, sizeof(float), 0); - if(output == NULL) + output = (float*)stbi__malloc_mad4(x, y, comp, sizeof(float), 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); } // compute number of non-alpha components - if(comp & 1) + if (comp & 1) n = comp; else n = comp - 1; - for(i = 0; i < x * y; ++i) + for (i = 0; i < x * y; ++i) { - for(k = 0; k < n; ++k) + for (k = 0; k < n; ++k) { - output[i * comp + k] = ( float )(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) * stbi__l2h_scale); + output[i * comp + k] = (float)(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) * stbi__l2h_scale); } - if(k < comp) + if (k < comp) output[i * comp + k] = data[i * comp + k] / 255.0f; } STBI_FREE(data); @@ -1504,43 +1498,43 @@ static float* stbi__ldr_to_hdr(stbi_uc* data, int x, int y, int comp) #endif #ifndef STBI_NO_HDR -#define stbi__float2int(x) (( int )(x)) +#define stbi__float2int(x) ((int)(x)) static stbi_uc* stbi__hdr_to_ldr(float* data, int x, int y, int comp) { int i, k, n; stbi_uc* output; - if(!data) + if (!data) return NULL; - output = ( stbi_uc* )stbi__malloc_mad3(x, y, comp, 0); - if(output == NULL) + output = (stbi_uc*)stbi__malloc_mad3(x, y, comp, 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); } // compute number of non-alpha components - if(comp & 1) + if (comp & 1) n = comp; else n = comp - 1; - for(i = 0; i < x * y; ++i) + for (i = 0; i < x * y; ++i) { - for(k = 0; k < n; ++k) + for (k = 0; k < n; ++k) { - float z = ( float )pow((double)data[i * comp + k] * stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f; - if(z < 0) + float z = (float)pow((double)data[i * comp + k] * stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f; + if (z < 0) z = 0; - if(z > 255) + if (z > 255) z = 255; - output[i * comp + k] = ( stbi_uc )stbi__float2int(z); + output[i * comp + k] = (stbi_uc)stbi__float2int(z); } - if(k < comp) + if (k < comp) { float z = data[i * comp + k] * 255 + 0.5f; - if(z < 0) + if (z < 0) z = 0; - if(z > 255) + if (z > 255) z = 255; - output[i * comp + k] = ( stbi_uc )stbi__float2int(z); + output[i * comp + k] = (stbi_uc)stbi__float2int(z); } } STBI_FREE(data); @@ -1572,7 +1566,7 @@ static stbi_uc* stbi__hdr_to_ldr(float* data, int x, int y, int comp) #ifndef STBI_NO_JPEG // huffman decoding acceleration -#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache +#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache typedef struct { @@ -1582,7 +1576,7 @@ typedef struct stbi_uc values[256]; stbi_uc size[257]; unsigned int maxcode[18]; - int delta[17]; // old 'firstsymbol' - old 'firstcode' + int delta[17]; // old 'firstsymbol' - old 'firstcode' } stbi__huffman; typedef struct @@ -1611,14 +1605,14 @@ typedef struct stbi_uc* data; void *raw_data, *raw_coeff; stbi_uc* linebuf; - short* coeff; // progressive only - int coeff_w, coeff_h; // number of 8x8 coefficient blocks + short* coeff; // progressive only + int coeff_w, coeff_h; // number of 8x8 coefficient blocks } img_comp[4]; - stbi__uint32 code_buffer; // jpeg entropy-coded buffer - int code_bits; // number of valid bits - unsigned char marker; // marker seen while filling entropy buffer - int nomore; // flag if we saw a marker so must stop + stbi__uint32 code_buffer; // jpeg entropy-coded buffer + int code_bits; // number of valid bits + unsigned char marker; // marker seen while filling entropy buffer + int nomore; // flag if we saw a marker so must stop int progressive; int spec_start; @@ -1627,7 +1621,7 @@ typedef struct int succ_low; int eob_run; int jfif; - int app14_color_transform; // Adobe APP14 tag + int app14_color_transform; // Adobe APP14 tag int rgb; int scan_n, order[4]; @@ -1645,23 +1639,23 @@ static int stbi__build_huffman(stbi__huffman* h, int* count) int i, j, k = 0; unsigned int code; // build size list for each symbol (from JPEG spec) - for(i = 0; i < 16; ++i) - for(j = 0; j < count[i]; ++j) + for (i = 0; i < 16; ++i) + for (j = 0; j < count[i]; ++j) h->size[k++] = (stbi_uc)(i + 1); h->size[k] = 0; // compute actual symbols (from jpeg spec) code = 0; k = 0; - for(j = 1; j <= 16; ++j) + for (j = 1; j <= 16; ++j) { // compute delta to add to code to compute symbol id h->delta[j] = k - code; - if(h->size[k] == j) + if (h->size[k] == j) { - while(h->size[k] == j) + while (h->size[k] == j) h->code[k++] = (stbi__uint16)(code++); - if(code - 1 >= (1u << j)) + if (code - 1 >= (1u << j)) return stbi__err("bad code lengths", "Corrupt JPEG"); } // compute largest code + 1 for this size, preshifted as needed later @@ -1672,16 +1666,16 @@ static int stbi__build_huffman(stbi__huffman* h, int* count) // build non-spec acceleration table; 255 is flag for not-accelerated memset(h->fast, 255, 1 << FAST_BITS); - for(i = 0; i < k; ++i) + for (i = 0; i < k; ++i) { int s = h->size[i]; - if(s <= FAST_BITS) + if (s <= FAST_BITS) { int c = h->code[i] << (FAST_BITS - s); int m = 1 << (FAST_BITS - s); - for(j = 0; j < m; ++j) + for (j = 0; j < m; ++j) { - h->fast[c + j] = ( stbi_uc )i; + h->fast[c + j] = (stbi_uc)i; } } } @@ -1693,26 +1687,26 @@ static int stbi__build_huffman(stbi__huffman* h, int* count) static void stbi__build_fast_ac(stbi__int16* fast_ac, stbi__huffman* h) { int i; - for(i = 0; i < (1 << FAST_BITS); ++i) + for (i = 0; i < (1 << FAST_BITS); ++i) { stbi_uc fast = h->fast[i]; fast_ac[i] = 0; - if(fast < 255) + if (fast < 255) { int rs = h->values[fast]; int run = (rs >> 4) & 15; int magbits = rs & 15; int len = h->size[fast]; - if(magbits && len + magbits <= FAST_BITS) + if (magbits && len + magbits <= FAST_BITS) { // magnitude code followed by receive_extend code int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); int m = 1 << (magbits - 1); - if(k < m) + if (k < m) k += (~0U << magbits) + 1; // if the result is small enough, we can fit it in fast_ac table - if(k >= -128 && k <= 127) + if (k >= -128 && k <= 127) fast_ac[i] = (stbi__int16)((k * 256) + (run * 16) + (len + magbits)); } } @@ -1724,25 +1718,25 @@ static void stbi__grow_buffer_unsafe(stbi__jpeg* j) do { unsigned int b = j->nomore ? 0 : stbi__get8(j->s); - if(b == 0xff) + if (b == 0xff) { int c = stbi__get8(j->s); - while(c == 0xff) - c = stbi__get8(j->s); // consume fill bytes - if(c != 0) + while (c == 0xff) + c = stbi__get8(j->s); // consume fill bytes + if (c != 0) { - j->marker = ( unsigned char )c; + j->marker = (unsigned char)c; j->nomore = 1; return; } } j->code_buffer |= b << (24 - j->code_bits); j->code_bits += 8; - } while(j->code_bits <= 24); + } while (j->code_bits <= 24); } // (1 << n) - 1 -static const stbi__uint32 stbi__bmask[17] = {0, 1, 3, 7, 15, 31, 63, 127, 255, +static const stbi__uint32 stbi__bmask[17] = {0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767, 65535}; // decode a jpeg huffman value from the bitstream @@ -1751,17 +1745,17 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg* j, stbi__huffman* h) unsigned int temp; int c, k; - if(j->code_bits < 16) + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); // look at the top FAST_BITS and determine what symbol ID it is, // if the code is <= FAST_BITS c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); k = h->fast[c]; - if(k < 255) + if (k < 255) { int s = h->size[k]; - if(s > j->code_bits) + if (s > j->code_bits) return -1; j->code_buffer <<= s; j->code_bits -= s; @@ -1775,17 +1769,17 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg* j, stbi__huffman* h) // wants to be compared against something shifted to have 16; // that way we don't need to shift inside the loop. temp = j->code_buffer >> 16; - for(k = FAST_BITS + 1;; ++k) - if(temp < h->maxcode[k]) + for (k = FAST_BITS + 1;; ++k) + if (temp < h->maxcode[k]) break; - if(k == 17) + if (k == 17) { // error! code not found j->code_bits -= 16; return -1; } - if(k > j->code_bits) + if (k > j->code_bits) return -1; // convert the huffman code to the symbol id @@ -1799,7 +1793,7 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg* j, stbi__huffman* h) } // bias[n] = (-1<code_bits < n) + if (j->code_bits < n) stbi__grow_buffer_unsafe(j); - sgn = ( stbi__int32 )j->code_buffer >> 31; // sign bit is always in MSB + sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB k = stbi_lrot(j->code_buffer, n); - STBI_ASSERT(n >= 0 && n < ( int )(sizeof(stbi__bmask) / sizeof(*stbi__bmask))); + STBI_ASSERT(n >= 0 && n < (int)(sizeof(stbi__bmask) / sizeof(*stbi__bmask))); j->code_buffer = k & ~stbi__bmask[n]; k &= stbi__bmask[n]; j->code_bits -= n; @@ -1824,7 +1818,7 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg* j, int n) stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg* j, int n) { unsigned int k; - if(j->code_bits < n) + if (j->code_bits < n) stbi__grow_buffer_unsafe(j); k = stbi_lrot(j->code_buffer, n); j->code_buffer = k & ~stbi__bmask[n]; @@ -1836,7 +1830,7 @@ stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg* j, int n) stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg* j) { unsigned int k; - if(j->code_bits < 1) + if (j->code_bits < 1) stbi__grow_buffer_unsafe(j); k = j->code_buffer; j->code_buffer <<= 1; @@ -1860,10 +1854,10 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman* int diff, dc, k; int t; - if(j->code_bits < 16) + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); t = stbi__jpeg_huff_decode(j, hdc); - if(t < 0) + if (t < 0) return stbi__err("bad huffman code", "Corrupt JPEG"); // 0 all the ac values now so we can do it 32-bits at a time @@ -1872,7 +1866,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman* diff = t ? stbi__extend_receive(j, t) : 0; dc = j->img_comp[b].dc_pred + diff; j->img_comp[b].dc_pred = dc; - data[0] = ( short )(dc * dequant[0]); + data[0] = (short)(dc * dequant[0]); // decode AC components, see JPEG spec k = 1; @@ -1880,31 +1874,31 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman* { unsigned int zig; int c, r, s; - if(j->code_bits < 16) + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); r = fac[c]; - if(r) - { // fast-AC path - k += (r >> 4) & 15; // run - s = r & 15; // combined length + if (r) + { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length j->code_buffer <<= s; j->code_bits -= s; // decode into unzigzag'd location zig = stbi__jpeg_dezigzag[k++]; - data[zig] = ( short )((r >> 8) * dequant[zig]); + data[zig] = (short)((r >> 8) * dequant[zig]); } else { int rs = stbi__jpeg_huff_decode(j, hac); - if(rs < 0) + if (rs < 0) return stbi__err("bad huffman code", "Corrupt JPEG"); s = rs & 15; r = rs >> 4; - if(s == 0) + if (s == 0) { - if(rs != 0xf0) - break; // end block + if (rs != 0xf0) + break; // end block k += 16; } else @@ -1912,10 +1906,10 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman* k += r; // decode into unzigzag'd location zig = stbi__jpeg_dezigzag[k++]; - data[zig] = ( short )(stbi__extend_receive(j, s) * dequant[zig]); + data[zig] = (short)(stbi__extend_receive(j, s) * dequant[zig]); } } - } while(k < 64); + } while (k < 64); return 1; } @@ -1923,28 +1917,28 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg* j, short data[64], stbi__ { int diff, dc; int t; - if(j->spec_end != 0) + if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); - if(j->code_bits < 16) + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); - if(j->succ_high == 0) + if (j->succ_high == 0) { // first scan for DC coefficient, must be first - memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now + memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now t = stbi__jpeg_huff_decode(j, hdc); diff = t ? stbi__extend_receive(j, t) : 0; dc = j->img_comp[b].dc_pred + diff; j->img_comp[b].dc_pred = dc; - data[0] = ( short )(dc << j->succ_low); + data[0] = (short)(dc << j->succ_low); } else { // refinement scan for DC coefficient - if(stbi__jpeg_get_bit(j)) - data[0] += ( short )(1 << j->succ_low); + if (stbi__jpeg_get_bit(j)) + data[0] += (short)(1 << j->succ_low); } return 1; } @@ -1954,14 +1948,14 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg* j, short data[64], stbi__ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__huffman* hac, stbi__int16* fac) { int k; - if(j->spec_start == 0) + if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); - if(j->succ_high == 0) + if (j->succ_high == 0) { int shift = j->succ_low; - if(j->eob_run) + if (j->eob_run) { --j->eob_run; return 1; @@ -1972,32 +1966,32 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__ { unsigned int zig; int c, r, s; - if(j->code_bits < 16) + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); r = fac[c]; - if(r) - { // fast-AC path - k += (r >> 4) & 15; // run - s = r & 15; // combined length + if (r) + { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length j->code_buffer <<= s; j->code_bits -= s; zig = stbi__jpeg_dezigzag[k++]; - data[zig] = ( short )((r >> 8) << shift); + data[zig] = (short)((r >> 8) << shift); } else { int rs = stbi__jpeg_huff_decode(j, hac); - if(rs < 0) + if (rs < 0) return stbi__err("bad huffman code", "Corrupt JPEG"); s = rs & 15; r = rs >> 4; - if(s == 0) + if (s == 0) { - if(r < 15) + if (r < 15) { j->eob_run = (1 << r); - if(r) + if (r) j->eob_run += stbi__jpeg_get_bits(j, r); --j->eob_run; break; @@ -2008,28 +2002,28 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__ { k += r; zig = stbi__jpeg_dezigzag[k++]; - data[zig] = ( short )(stbi__extend_receive(j, s) << shift); + data[zig] = (short)(stbi__extend_receive(j, s) << shift); } } - } while(k <= j->spec_end); + } while (k <= j->spec_end); } else { // refinement scan for these AC coefficients - short bit = ( short )(1 << j->succ_low); + short bit = (short)(1 << j->succ_low); - if(j->eob_run) + if (j->eob_run) { --j->eob_run; - for(k = j->spec_start; k <= j->spec_end; ++k) + for (k = j->spec_start; k <= j->spec_end; ++k) { short* p = &data[stbi__jpeg_dezigzag[k]]; - if(*p != 0) - if(stbi__jpeg_get_bit(j)) - if((*p & bit) == 0) + if (*p != 0) + if (stbi__jpeg_get_bit(j)) + if ((*p & bit) == 0) { - if(*p > 0) + if (*p > 0) *p += bit; else *p -= bit; @@ -2043,19 +2037,19 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__ { int r, s; int rs = stbi__jpeg_huff_decode( - j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh - if(rs < 0) + j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh + if (rs < 0) return stbi__err("bad huffman code", "Corrupt JPEG"); s = rs & 15; r = rs >> 4; - if(s == 0) + if (s == 0) { - if(r < 15) + if (r < 15) { j->eob_run = (1 << r) - 1; - if(r) + if (r) j->eob_run += stbi__jpeg_get_bits(j, r); - r = 64; // force end of block + r = 64; // force end of block } else { @@ -2066,25 +2060,25 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__ } else { - if(s != 1) + if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG"); // sign bit - if(stbi__jpeg_get_bit(j)) + if (stbi__jpeg_get_bit(j)) s = bit; else s = -bit; } // advance by r - while(k <= j->spec_end) + while (k <= j->spec_end) { short* p = &data[stbi__jpeg_dezigzag[k++]]; - if(*p != 0) + if (*p != 0) { - if(stbi__jpeg_get_bit(j)) - if((*p & bit) == 0) + if (stbi__jpeg_get_bit(j)) + if ((*p & bit) == 0) { - if(*p > 0) + if (*p > 0) *p += bit; else *p -= bit; @@ -2092,15 +2086,15 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__ } else { - if(r == 0) + if (r == 0) { - *p = ( short )s; + *p = (short)s; break; } --r; } } - } while(k <= j->spec_end); + } while (k <= j->spec_end); } } return 1; @@ -2110,18 +2104,18 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__ stbi_inline static stbi_uc stbi__clamp(int x) { // trick to use a single test to catch both cases - if(( unsigned int )x > 255) + if ((unsigned int)x > 255) { - if(x < 0) + if (x < 0) return 0; - if(x > 255) + if (x > 255) return 255; } - return ( stbi_uc )x; + return (stbi_uc)x; } -#define stbi__f2f(x) (( int )((( x )*4096 + 0.5))) -#define stbi__fsh(x) (( x )*4096) +#define stbi__f2f(x) ((int)(((x)*4096 + 0.5))) +#define stbi__fsh(x) ((x)*4096) // derived from jidctint -- DCT_ISLOW #define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7) \ @@ -2168,10 +2162,10 @@ static void stbi__idct_block(stbi_uc* out, int out_stride, short data[64]) short* d = data; // columns - for(i = 0; i < 8; ++i, ++d, ++v) + for (i = 0; i < 8; ++i, ++d, ++v) { // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing - if(d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 && d[48] == 0 && d[56] == 0) + if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 && d[48] == 0 && d[56] == 0) { // no shortcut 0 seconds // (1|2|3|4|5|6|7)==0 0 seconds @@ -2200,7 +2194,7 @@ static void stbi__idct_block(stbi_uc* out, int out_stride, short data[64]) } } - for(i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) + for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) { // no fast case since the first 1D IDCT spread components out STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]) @@ -2330,14 +2324,14 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64]) __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17)); // load - row0 = _mm_load_si128(( const __m128i* )(data + 0 * 8)); - row1 = _mm_load_si128(( const __m128i* )(data + 1 * 8)); - row2 = _mm_load_si128(( const __m128i* )(data + 2 * 8)); - row3 = _mm_load_si128(( const __m128i* )(data + 3 * 8)); - row4 = _mm_load_si128(( const __m128i* )(data + 4 * 8)); - row5 = _mm_load_si128(( const __m128i* )(data + 5 * 8)); - row6 = _mm_load_si128(( const __m128i* )(data + 6 * 8)); - row7 = _mm_load_si128(( const __m128i* )(data + 7 * 8)); + row0 = _mm_load_si128((const __m128i*)(data + 0 * 8)); + row1 = _mm_load_si128((const __m128i*)(data + 1 * 8)); + row2 = _mm_load_si128((const __m128i*)(data + 2 * 8)); + row3 = _mm_load_si128((const __m128i*)(data + 3 * 8)); + row4 = _mm_load_si128((const __m128i*)(data + 4 * 8)); + row5 = _mm_load_si128((const __m128i*)(data + 5 * 8)); + row6 = _mm_load_si128((const __m128i*)(data + 6 * 8)); + row7 = _mm_load_si128((const __m128i*)(data + 7 * 8)); // column pass dct_pass(bias_0, 10); @@ -2367,39 +2361,39 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64]) { // pack - __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 + __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 __m128i p1 = _mm_packus_epi16(row2, row3); __m128i p2 = _mm_packus_epi16(row4, row5); __m128i p3 = _mm_packus_epi16(row6, row7); // 8bit 8x8 transpose pass 1 - dct_interleave8(p0, p2); // a0e0a1e1... - dct_interleave8(p1, p3); // c0g0c1g1... + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... // transpose pass 2 - dct_interleave8(p0, p1); // a0c0e0g0... - dct_interleave8(p2, p3); // b0d0f0h0... + dct_interleave8(p0, p1); // a0c0e0g0... + dct_interleave8(p2, p3); // b0d0f0h0... // transpose pass 3 - dct_interleave8(p0, p2); // a0b0c0d0... - dct_interleave8(p1, p3); // a4b4c4d4... + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... // store - _mm_storel_epi64(( __m128i* )out, p0); + _mm_storel_epi64((__m128i*)out, p0); out += out_stride; - _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p0, 0x4e)); + _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride; - _mm_storel_epi64(( __m128i* )out, p2); + _mm_storel_epi64((__m128i*)out, p2); out += out_stride; - _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p2, 0x4e)); + _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride; - _mm_storel_epi64(( __m128i* )out, p1); + _mm_storel_epi64((__m128i*)out, p1); out += out_stride; - _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p1, 0x4e)); + _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride; - _mm_storel_epi64(( __m128i* )out, p3); + _mm_storel_epi64((__m128i*)out, p3); out += out_stride; - _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p3, 0x4e)); + _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p3, 0x4e)); } #undef dct_const @@ -2413,7 +2407,7 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64]) #undef dct_pass } -#endif // STBI_SSE2 +#endif // STBI_SSE2 #ifdef STBI_NEON @@ -2548,19 +2542,19 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64]) } // pass 1 - dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 + dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 dct_trn16(row2, row3); dct_trn16(row4, row5); dct_trn16(row6, row7); // pass 2 - dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 + dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 dct_trn32(row1, row3); dct_trn32(row4, row6); dct_trn32(row5, row7); // pass 3 - dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 + dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 dct_trn64(row1, row5); dct_trn64(row2, row6); dct_trn64(row3, row7); @@ -2659,7 +2653,7 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64]) #undef dct_pass } -#endif // STBI_NEON +#endif // STBI_NEON #define STBI__MARKER_none 0xff // if there's a pending marker from the entropy stream, return that @@ -2668,17 +2662,17 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64]) static stbi_uc stbi__get_marker(stbi__jpeg* j) { stbi_uc x; - if(j->marker != STBI__MARKER_none) + if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; } x = stbi__get8(j->s); - if(x != 0xff) + if (x != 0xff) return STBI__MARKER_none; - while(x == 0xff) - x = stbi__get8(j->s); // consume repeated 0xff fill bytes + while (x == 0xff) + x = stbi__get8(j->s); // consume repeated 0xff fill bytes return x; } @@ -2704,9 +2698,9 @@ static void stbi__jpeg_reset(stbi__jpeg* j) static int stbi__parse_entropy_coded_data(stbi__jpeg* z) { stbi__jpeg_reset(z); - if(!z->progressive) + if (!z->progressive) { - if(z->scan_n == 1) + if (z->scan_n == 1) { int i, j; STBI_SIMD_ALIGN(short, data[64]); @@ -2717,24 +2711,24 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) // component has, independent of interleaved MCU blocking and such int w = (z->img_comp[n].x + 7) >> 3; int h = (z->img_comp[n].y + 7) >> 3; - for(j = 0; j < h; ++j) + for (j = 0; j < h; ++j) { - for(i = 0; i < w; ++i) + for (i = 0; i < w; ++i) { int ha = z->img_comp[n].ha; - if(!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, - z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) + if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, + z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data); // every data block is an MCU, so countdown the restart interval - if(--z->todo <= 0) + if (--z->todo <= 0) { - if(z->code_bits < 24) + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); // if it's NOT a restart, then just bail, so we get corrupt data // rather than no data - if(!STBI__RESTART(z->marker)) + if (!STBI__RESTART(z->marker)) return 1; stbi__jpeg_reset(z); } @@ -2743,28 +2737,28 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) return 1; } else - { // interleaved + { // interleaved int i, j, k, x, y; STBI_SIMD_ALIGN(short, data[64]); - for(j = 0; j < z->img_mcu_y; ++j) + for (j = 0; j < z->img_mcu_y; ++j) { - for(i = 0; i < z->img_mcu_x; ++i) + for (i = 0; i < z->img_mcu_x; ++i) { // scan an interleaved mcu... process scan_n components in order - for(k = 0; k < z->scan_n; ++k) + for (k = 0; k < z->scan_n; ++k) { int n = z->order[k]; // scan out an mcu's worth of this component; that's just determined // by the basic H and V specified for the component - for(y = 0; y < z->img_comp[n].v; ++y) + for (y = 0; y < z->img_comp[n].v; ++y) { - for(x = 0; x < z->img_comp[n].h; ++x) + for (x = 0; x < z->img_comp[n].h; ++x) { int x2 = (i * z->img_comp[n].h + x) * 8; int y2 = (j * z->img_comp[n].v + y) * 8; int ha = z->img_comp[n].ha; - if(!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, - z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) + if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, + z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2, z->img_comp[n].w2, data); @@ -2773,11 +2767,11 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) } // after all interleaved components, that's an interleaved MCU, // so now count down the restart interval - if(--z->todo <= 0) + if (--z->todo <= 0) { - if(z->code_bits < 24) + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); - if(!STBI__RESTART(z->marker)) + if (!STBI__RESTART(z->marker)) return 1; stbi__jpeg_reset(z); } @@ -2788,7 +2782,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) } else { - if(z->scan_n == 1) + if (z->scan_n == 1) { int i, j; int n = z->order[0]; @@ -2798,28 +2792,28 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) // component has, independent of interleaved MCU blocking and such int w = (z->img_comp[n].x + 7) >> 3; int h = (z->img_comp[n].y + 7) >> 3; - for(j = 0; j < h; ++j) + for (j = 0; j < h; ++j) { - for(i = 0; i < w; ++i) + for (i = 0; i < w; ++i) { short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); - if(z->spec_start == 0) + if (z->spec_start == 0) { - if(!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) return 0; } else { int ha = z->img_comp[n].ha; - if(!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) + if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) return 0; } // every data block is an MCU, so countdown the restart interval - if(--z->todo <= 0) + if (--z->todo <= 0) { - if(z->code_bits < 24) + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); - if(!STBI__RESTART(z->marker)) + if (!STBI__RESTART(z->marker)) return 1; stbi__jpeg_reset(z); } @@ -2828,37 +2822,37 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) return 1; } else - { // interleaved + { // interleaved int i, j, k, x, y; - for(j = 0; j < z->img_mcu_y; ++j) + for (j = 0; j < z->img_mcu_y; ++j) { - for(i = 0; i < z->img_mcu_x; ++i) + for (i = 0; i < z->img_mcu_x; ++i) { // scan an interleaved mcu... process scan_n components in order - for(k = 0; k < z->scan_n; ++k) + for (k = 0; k < z->scan_n; ++k) { int n = z->order[k]; // scan out an mcu's worth of this component; that's just determined // by the basic H and V specified for the component - for(y = 0; y < z->img_comp[n].v; ++y) + for (y = 0; y < z->img_comp[n].v; ++y) { - for(x = 0; x < z->img_comp[n].h; ++x) + for (x = 0; x < z->img_comp[n].h; ++x) { int x2 = (i * z->img_comp[n].h + x); int y2 = (j * z->img_comp[n].v + y); short* data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); - if(!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) return 0; } } } // after all interleaved components, that's an interleaved MCU, // so now count down the restart interval - if(--z->todo <= 0) + if (--z->todo <= 0) { - if(z->code_bits < 24) + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); - if(!STBI__RESTART(z->marker)) + if (!STBI__RESTART(z->marker)) return 1; stbi__jpeg_reset(z); } @@ -2872,23 +2866,23 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) static void stbi__jpeg_dequantize(short* data, stbi__uint16* dequant) { int i; - for(i = 0; i < 64; ++i) + for (i = 0; i < 64; ++i) data[i] *= dequant[i]; } static void stbi__jpeg_finish(stbi__jpeg* z) { - if(z->progressive) + if (z->progressive) { // dequantize and idct the data int i, j, n; - for(n = 0; n < z->s->img_n; ++n) + for (n = 0; n < z->s->img_n; ++n) { int w = (z->img_comp[n].x + 7) >> 3; int h = (z->img_comp[n].y + 7) >> 3; - for(j = 0; j < h; ++j) + for (j = 0; j < h; ++j) { - for(i = 0; i < w; ++i) + for (i = 0; i < w; ++i) { short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); @@ -2903,114 +2897,113 @@ static void stbi__jpeg_finish(stbi__jpeg* z) static int stbi__process_marker(stbi__jpeg* z, int m) { int L; - switch(m) + switch (m) { - case STBI__MARKER_none: // no marker found - return stbi__err("expected marker", "Corrupt JPEG"); + case STBI__MARKER_none: // no marker found + return stbi__err("expected marker", "Corrupt JPEG"); - case 0xDD: // DRI - specify restart interval - if(stbi__get16be(z->s) != 4) - return stbi__err("bad DRI len", "Corrupt JPEG"); - z->restart_interval = stbi__get16be(z->s); - return 1; + case 0xDD: // DRI - specify restart interval + if (stbi__get16be(z->s) != 4) + return stbi__err("bad DRI len", "Corrupt JPEG"); + z->restart_interval = stbi__get16be(z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = stbi__get16be(z->s) - 2; + while (L > 0) + { + int q = stbi__get8(z->s); + int p = q >> 4, sixteen = (p != 0); + int t = q & 15, i; + if (p != 0 && p != 1) + return stbi__err("bad DQT type", "Corrupt JPEG"); + if (t > 3) + return stbi__err("bad DQT table", "Corrupt JPEG"); + + for (i = 0; i < 64; ++i) + z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); + L -= (sixteen ? 129 : 65); + } + return L == 0; - case 0xDB: // DQT - define quantization table - L = stbi__get16be(z->s) - 2; - while(L > 0) + case 0xC4: // DHT - define huffman table + L = stbi__get16be(z->s) - 2; + while (L > 0) + { + stbi_uc* v; + int sizes[16], i, n = 0; + int q = stbi__get8(z->s); + int tc = q >> 4; + int th = q & 15; + if (tc > 1 || th > 3) + return stbi__err("bad DHT header", "Corrupt JPEG"); + for (i = 0; i < 16; ++i) { - int q = stbi__get8(z->s); - int p = q >> 4, sixteen = (p != 0); - int t = q & 15, i; - if(p != 0 && p != 1) - return stbi__err("bad DQT type", "Corrupt JPEG"); - if(t > 3) - return stbi__err("bad DQT table", "Corrupt JPEG"); - - for(i = 0; i < 64; ++i) - z->dequant[t][stbi__jpeg_dezigzag[i]] = - (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); - L -= (sixteen ? 129 : 65); + sizes[i] = stbi__get8(z->s); + n += sizes[i]; } - return L == 0; - - case 0xC4: // DHT - define huffman table - L = stbi__get16be(z->s) - 2; - while(L > 0) + L -= 17; + if (tc == 0) { - stbi_uc* v; - int sizes[16], i, n = 0; - int q = stbi__get8(z->s); - int tc = q >> 4; - int th = q & 15; - if(tc > 1 || th > 3) - return stbi__err("bad DHT header", "Corrupt JPEG"); - for(i = 0; i < 16; ++i) - { - sizes[i] = stbi__get8(z->s); - n += sizes[i]; - } - L -= 17; - if(tc == 0) - { - if(!stbi__build_huffman(z->huff_dc + th, sizes)) - return 0; - v = z->huff_dc[th].values; - } - else - { - if(!stbi__build_huffman(z->huff_ac + th, sizes)) - return 0; - v = z->huff_ac[th].values; - } - for(i = 0; i < n; ++i) - v[i] = stbi__get8(z->s); - if(tc != 0) - stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); - L -= n; + if (!stbi__build_huffman(z->huff_dc + th, sizes)) + return 0; + v = z->huff_dc[th].values; } - return L == 0; + else + { + if (!stbi__build_huffman(z->huff_ac + th, sizes)) + return 0; + v = z->huff_ac[th].values; + } + for (i = 0; i < n; ++i) + v[i] = stbi__get8(z->s); + if (tc != 0) + stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); + L -= n; + } + return L == 0; } // check for comment block or APP blocks - if((m >= 0xE0 && m <= 0xEF) || m == 0xFE) + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { L = stbi__get16be(z->s); - if(L < 2) + if (L < 2) { - if(m == 0xFE) + if (m == 0xFE) return stbi__err("bad COM len", "Corrupt JPEG"); else return stbi__err("bad APP len", "Corrupt JPEG"); } L -= 2; - if(m == 0xE0 && L >= 5) - { // JFIF APP0 segment + if (m == 0xE0 && L >= 5) + { // JFIF APP0 segment static const unsigned char tag[5] = {'J', 'F', 'I', 'F', '\0'}; int ok = 1; int i; - for(i = 0; i < 5; ++i) - if(stbi__get8(z->s) != tag[i]) + for (i = 0; i < 5; ++i) + if (stbi__get8(z->s) != tag[i]) ok = 0; L -= 5; - if(ok) + if (ok) z->jfif = 1; } - else if(m == 0xEE && L >= 12) - { // Adobe APP14 segment + else if (m == 0xEE && L >= 12) + { // Adobe APP14 segment static const unsigned char tag[6] = {'A', 'd', 'o', 'b', 'e', '\0'}; int ok = 1; int i; - for(i = 0; i < 6; ++i) - if(stbi__get8(z->s) != tag[i]) + for (i = 0; i < 6; ++i) + if (stbi__get8(z->s) != tag[i]) ok = 0; L -= 6; - if(ok) + if (ok) { - stbi__get8(z->s); // version - stbi__get16be(z->s); // flags0 - stbi__get16be(z->s); // flags1 - z->app14_color_transform = stbi__get8(z->s); // color transform + stbi__get8(z->s); // version + stbi__get16be(z->s); // flags0 + stbi__get16be(z->s); // flags1 + z->app14_color_transform = stbi__get8(z->s); // color transform L -= 6; } } @@ -3028,24 +3021,24 @@ static int stbi__process_scan_header(stbi__jpeg* z) int i; int Ls = stbi__get16be(z->s); z->scan_n = stbi__get8(z->s); - if(z->scan_n < 1 || z->scan_n > 4 || z->scan_n > ( int )z->s->img_n) + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n) return stbi__err("bad SOS component count", "Corrupt JPEG"); - if(Ls != 6 + 2 * z->scan_n) + if (Ls != 6 + 2 * z->scan_n) return stbi__err("bad SOS len", "Corrupt JPEG"); - for(i = 0; i < z->scan_n; ++i) + for (i = 0; i < z->scan_n; ++i) { int id = stbi__get8(z->s), which; int q = stbi__get8(z->s); - for(which = 0; which < z->s->img_n; ++which) - if(z->img_comp[which].id == id) + for (which = 0; which < z->s->img_n; ++which) + if (z->img_comp[which].id == id) break; - if(which == z->s->img_n) - return 0; // no match + if (which == z->s->img_n) + return 0; // no match z->img_comp[which].hd = q >> 4; - if(z->img_comp[which].hd > 3) + if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff", "Corrupt JPEG"); z->img_comp[which].ha = q & 15; - if(z->img_comp[which].ha > 3) + if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff", "Corrupt JPEG"); z->order[i] = which; } @@ -3053,21 +3046,20 @@ static int stbi__process_scan_header(stbi__jpeg* z) { int aa; z->spec_start = stbi__get8(z->s); - z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 + z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 aa = stbi__get8(z->s); z->succ_high = (aa >> 4); z->succ_low = (aa & 15); - if(z->progressive) + if (z->progressive) { - if(z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || - z->succ_low > 13) + if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) return stbi__err("bad SOS", "Corrupt JPEG"); } else { - if(z->spec_start != 0) + if (z->spec_start != 0) return stbi__err("bad SOS", "Corrupt JPEG"); - if(z->succ_high != 0 || z->succ_low != 0) + if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS", "Corrupt JPEG"); z->spec_end = 63; } @@ -3079,21 +3071,21 @@ static int stbi__process_scan_header(stbi__jpeg* z) static int stbi__free_jpeg_components(stbi__jpeg* z, int ncomp, int why) { int i; - for(i = 0; i < ncomp; ++i) + for (i = 0; i < ncomp; ++i) { - if(z->img_comp[i].raw_data) + if (z->img_comp[i].raw_data) { STBI_FREE(z->img_comp[i].raw_data); z->img_comp[i].raw_data = NULL; z->img_comp[i].data = NULL; } - if(z->img_comp[i].raw_coeff) + if (z->img_comp[i].raw_coeff) { STBI_FREE(z->img_comp[i].raw_coeff); z->img_comp[i].raw_coeff = 0; z->img_comp[i].coeff = 0; } - if(z->img_comp[i].linebuf) + if (z->img_comp[i].linebuf) { STBI_FREE(z->img_comp[i].linebuf); z->img_comp[i].linebuf = NULL; @@ -3107,62 +3099,62 @@ static int stbi__process_frame_header(stbi__jpeg* z, int scan) stbi__context* s = z->s; int Lf, p, i, q, h_max = 1, v_max = 1, c; Lf = stbi__get16be(s); - if(Lf < 11) - return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG + if (Lf < 11) + return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG p = stbi__get8(s); - if(p != 8) - return stbi__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline + if (p != 8) + return stbi__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline s->img_y = stbi__get16be(s); - if(s->img_y == 0) + if (s->img_y == 0) return stbi__err( "no header height", - "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG s->img_x = stbi__get16be(s); - if(s->img_x == 0) - return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires + if (s->img_x == 0) + return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires c = stbi__get8(s); - if(c != 3 && c != 1 && c != 4) + if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count", "Corrupt JPEG"); s->img_n = c; - for(i = 0; i < c; ++i) + for (i = 0; i < c; ++i) { z->img_comp[i].data = NULL; z->img_comp[i].linebuf = NULL; } - if(Lf != 8 + 3 * s->img_n) + if (Lf != 8 + 3 * s->img_n) return stbi__err("bad SOF len", "Corrupt JPEG"); z->rgb = 0; - for(i = 0; i < s->img_n; ++i) + for (i = 0; i < s->img_n; ++i) { static const unsigned char rgb[3] = {'R', 'G', 'B'}; z->img_comp[i].id = stbi__get8(s); - if(s->img_n == 3 && z->img_comp[i].id == rgb[i]) + if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) ++z->rgb; q = stbi__get8(s); z->img_comp[i].h = (q >> 4); - if(!z->img_comp[i].h || z->img_comp[i].h > 4) + if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H", "Corrupt JPEG"); z->img_comp[i].v = q & 15; - if(!z->img_comp[i].v || z->img_comp[i].v > 4) + if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V", "Corrupt JPEG"); z->img_comp[i].tq = stbi__get8(s); - if(z->img_comp[i].tq > 3) + if (z->img_comp[i].tq > 3) return stbi__err("bad TQ", "Corrupt JPEG"); } - if(scan != STBI__SCAN_load) + if (scan != STBI__SCAN_load) return 1; - if(!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) + if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode"); - for(i = 0; i < s->img_n; ++i) + for (i = 0; i < s->img_n; ++i) { - if(z->img_comp[i].h > h_max) + if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h; - if(z->img_comp[i].v > v_max) + if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; } @@ -3175,7 +3167,7 @@ static int stbi__process_frame_header(stbi__jpeg* z, int scan) z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w; z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h; - for(i = 0; i < s->img_n; ++i) + for (i = 0; i < s->img_n; ++i) { // number of effective pixels (e.g. for non-interleaved MCU) z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max; @@ -3193,19 +3185,19 @@ static int stbi__process_frame_header(stbi__jpeg* z, int scan) z->img_comp[i].raw_coeff = 0; z->img_comp[i].linebuf = NULL; z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); - if(z->img_comp[i].raw_data == NULL) + if (z->img_comp[i].raw_data == NULL) return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory")); // align blocks for idct using mmx/sse - z->img_comp[i].data = ( stbi_uc* )((( size_t )z->img_comp[i].raw_data + 15) & ~15); - if(z->progressive) + z->img_comp[i].data = (stbi_uc*)(((size_t)z->img_comp[i].raw_data + 15) & ~15); + if (z->progressive) { // w2, h2 are multiples of 8 (see above) z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); - if(z->img_comp[i].raw_coeff == NULL) + if (z->img_comp[i].raw_coeff == NULL) return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory")); - z->img_comp[i].coeff = ( short* )((( size_t )z->img_comp[i].raw_coeff + 15) & ~15); + z->img_comp[i].coeff = (short*)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15); } } @@ -3225,29 +3217,29 @@ static int stbi__decode_jpeg_header(stbi__jpeg* z, int scan) { int m; z->jfif = 0; - z->app14_color_transform = -1; // valid values are 0,1,2 - z->marker = STBI__MARKER_none; // initialize cached marker to empty + z->app14_color_transform = -1; // valid values are 0,1,2 + z->marker = STBI__MARKER_none; // initialize cached marker to empty m = stbi__get_marker(z); - if(!stbi__SOI(m)) + if (!stbi__SOI(m)) return stbi__err("no SOI", "Corrupt JPEG"); - if(scan == STBI__SCAN_type) + if (scan == STBI__SCAN_type) return 1; m = stbi__get_marker(z); - while(!stbi__SOF(m)) + while (!stbi__SOF(m)) { - if(!stbi__process_marker(z, m)) + if (!stbi__process_marker(z, m)) return 0; m = stbi__get_marker(z); - while(m == STBI__MARKER_none) + while (m == STBI__MARKER_none) { // some files have extra padding after their blocks, so ok, we'll scan - if(stbi__at_eof(z->s)) + if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG"); m = stbi__get_marker(z); } } z->progressive = stbi__SOF_progressive(m); - if(!stbi__process_frame_header(z, scan)) + if (!stbi__process_frame_header(z, scan)) return 0; return 1; } @@ -3256,30 +3248,30 @@ static int stbi__decode_jpeg_header(stbi__jpeg* z, int scan) static int stbi__decode_jpeg_image(stbi__jpeg* j) { int m; - for(m = 0; m < 4; m++) + for (m = 0; m < 4; m++) { j->img_comp[m].raw_data = NULL; j->img_comp[m].raw_coeff = NULL; } j->restart_interval = 0; - if(!stbi__decode_jpeg_header(j, STBI__SCAN_load)) + if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0; m = stbi__get_marker(j); - while(!stbi__EOI(m)) + while (!stbi__EOI(m)) { - if(stbi__SOS(m)) + if (stbi__SOS(m)) { - if(!stbi__process_scan_header(j)) + if (!stbi__process_scan_header(j)) return 0; - if(!stbi__parse_entropy_coded_data(j)) + if (!stbi__parse_entropy_coded_data(j)) return 0; - if(j->marker == STBI__MARKER_none) + if (j->marker == STBI__MARKER_none) { // handle 0s at the end of image data from IP Kamera 9060 - while(!stbi__at_eof(j->s)) + while (!stbi__at_eof(j->s)) { int x = stbi__get8(j->s); - if(x == 255) + if (x == 255) { j->marker = stbi__get8(j->s); break; @@ -3289,23 +3281,23 @@ static int stbi__decode_jpeg_image(stbi__jpeg* j) // return 0 } } - else if(stbi__DNL(m)) + else if (stbi__DNL(m)) { int Ld = stbi__get16be(j->s); stbi__uint32 NL = stbi__get16be(j->s); - if(Ld != 4) + if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG"); - if(NL != j->s->img_y) + if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG"); } else { - if(!stbi__process_marker(j, m)) + if (!stbi__process_marker(j, m)) return 0; } m = stbi__get_marker(j); } - if(j->progressive) + if (j->progressive) stbi__jpeg_finish(j); return 1; } @@ -3330,7 +3322,7 @@ static stbi_uc* stbi__resample_row_v_2(stbi_uc* out, stbi_uc* in_near, stbi_uc* // need to generate two samples vertically for every one in input int i; STBI_NOTUSED(hs); - for(i = 0; i < w; ++i) + for (i = 0; i < w; ++i) out[i] = stbi__div4(3 * in_near[i] + in_far[i] + 2); return out; } @@ -3341,7 +3333,7 @@ static stbi_uc* stbi__resample_row_h_2(stbi_uc* out, stbi_uc* in_near, stbi_uc* int i; stbi_uc* input = in_near; - if(w == 1) + if (w == 1) { // if only one sample, can't do any interpolation out[0] = out[1] = input[0]; @@ -3350,7 +3342,7 @@ static stbi_uc* stbi__resample_row_h_2(stbi_uc* out, stbi_uc* in_near, stbi_uc* out[0] = input[0]; out[1] = stbi__div4(input[0] * 3 + input[1] + 2); - for(i = 1; i < w - 1; ++i) + for (i = 1; i < w - 1; ++i) { int n = 3 * input[i] + 2; out[i * 2 + 0] = stbi__div4(n + input[i - 1]); @@ -3371,7 +3363,7 @@ static stbi_uc* stbi__resample_row_hv_2(stbi_uc* out, stbi_uc* in_near, stbi_uc* { // need to generate 2x2 samples for every one in input int i, t0, t1; - if(w == 1) + if (w == 1) { out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2); return out; @@ -3379,7 +3371,7 @@ static stbi_uc* stbi__resample_row_hv_2(stbi_uc* out, stbi_uc* in_near, stbi_uc* t1 = 3 * in_near[0] + in_far[0]; out[0] = stbi__div4(t1 + 2); - for(i = 1; i < w; ++i) + for (i = 1; i < w; ++i) { t0 = t1; t1 = 3 * in_near[i] + in_far[i]; @@ -3399,7 +3391,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb // need to generate 2x2 samples for every one in input int i = 0, t0, t1; - if(w == 1) + if (w == 1) { out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2); return out; @@ -3409,19 +3401,19 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb // process groups of 8 pixels for as long as we can. // note we can't handle the last pixel in a row in this loop // because we need to handle the filter boundary conditions. - for(; i < ((w - 1) & ~7); i += 8) + for (; i < ((w - 1) & ~7); i += 8) { #if defined(STBI_SSE2) // load and perform the vertical filtering pass // this uses 3*x + y = 4*x + (y - x) __m128i zero = _mm_setzero_si128(); - __m128i farb = _mm_loadl_epi64(( __m128i* )(in_far + i)); - __m128i nearb = _mm_loadl_epi64(( __m128i* )(in_near + i)); + __m128i farb = _mm_loadl_epi64((__m128i*)(in_far + i)); + __m128i nearb = _mm_loadl_epi64((__m128i*)(in_near + i)); __m128i farw = _mm_unpacklo_epi8(farb, zero); __m128i nearw = _mm_unpacklo_epi8(nearb, zero); __m128i diff = _mm_sub_epi16(farw, nearw); __m128i nears = _mm_slli_epi16(nearw, 2); - __m128i curr = _mm_add_epi16(nears, diff); // current row + __m128i curr = _mm_add_epi16(nears, diff); // current row // horizontal filter works the same based on shifted vers of current // row. "prev" is current row shifted right by 1 pixel; we need to @@ -3453,7 +3445,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb // pack and write output __m128i outv = _mm_packus_epi16(de0, de1); - _mm_storeu_si128(( __m128i* )(out + i * 2), outv); + _mm_storeu_si128((__m128i*)(out + i * 2), outv); #elif defined(STBI_NEON) // load and perform the vertical filtering pass // this uses 3*x + y = 4*x + (y - x) @@ -3461,7 +3453,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb uint8x8_t nearb = vld1_u8(in_near + i); int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); - int16x8_t curr = vaddq_s16(nears, diff); // current row + int16x8_t curr = vaddq_s16(nears, diff); // current row // horizontal filter works the same based on shifted vers of current // row. "prev" is current row shifted right by 1 pixel; we need to @@ -3498,7 +3490,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb t1 = 3 * in_near[i] + in_far[i]; out[i * 2] = stbi__div16(3 * t1 + t0 + 8); - for(++i; i < w; ++i) + for (++i; i < w; ++i) { t0 = t1; t1 = 3 * in_near[i] + in_far[i]; @@ -3518,22 +3510,22 @@ static stbi_uc* stbi__resample_row_generic(stbi_uc* out, stbi_uc* in_near, stbi_ // resample with nearest-neighbor int i, j; STBI_NOTUSED(in_far); - for(i = 0; i < w; ++i) - for(j = 0; j < hs; ++j) + for (i = 0; i < w; ++i) + for (j = 0; j < hs; ++j) out[i * hs + j] = in_near[i]; return out; } // this is a reduced-precision calculation of YCbCr-to-RGB introduced // to make sure the code produces the same results in both SIMD and scalar -#define stbi__float2fixed(x) ((( int )(( x )*4096.0f + 0.5f)) << 8) +#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8) static void stbi__YCbCr_to_RGB_row(stbi_uc* out, const stbi_uc* y, const stbi_uc* pcb, const stbi_uc* pcr, int count, int step) { int i; - for(i = 0; i < count; ++i) + for (i = 0; i < count; ++i) { - int y_fixed = (y[i] << 20) + (1 << 19); // rounding + int y_fixed = (y[i] << 20) + (1 << 19); // rounding int r, g, b; int cr = pcr[i] - 128; int cb = pcb[i] - 128; @@ -3543,30 +3535,30 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc* out, const stbi_uc* y, const stbi_uc r >>= 20; g >>= 20; b >>= 20; - if(( unsigned )r > 255) + if ((unsigned)r > 255) { - if(r < 0) + if (r < 0) r = 0; else r = 255; } - if(( unsigned )g > 255) + if ((unsigned)g > 255) { - if(g < 0) + if (g < 0) g = 0; else g = 255; } - if(( unsigned )b > 255) + if ((unsigned)b > 255) { - if(b < 0) + if (b < 0) b = 0; else b = 255; } - out[0] = ( stbi_uc )r; - out[1] = ( stbi_uc )g; - out[2] = ( stbi_uc )b; + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; out[3] = 255; out += step; } @@ -3582,25 +3574,25 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons // step == 3 is pretty ugly on the final interleave, and i'm not convinced // it's useful in practice (you wouldn't use it for textures, for example). // so just accelerate step == 4 case. - if(step == 4) + if (step == 4) { // this is a fairly straightforward implementation and not super-optimized. __m128i signflip = _mm_set1_epi8(-0x80); - __m128i cr_const0 = _mm_set1_epi16(( short )(1.40200f * 4096.0f + 0.5f)); - __m128i cr_const1 = _mm_set1_epi16(-( short )(0.71414f * 4096.0f + 0.5f)); - __m128i cb_const0 = _mm_set1_epi16(-( short )(0.34414f * 4096.0f + 0.5f)); - __m128i cb_const1 = _mm_set1_epi16(( short )(1.77200f * 4096.0f + 0.5f)); - __m128i y_bias = _mm_set1_epi8(( char )( unsigned char )128); - __m128i xw = _mm_set1_epi16(255); // alpha channel - - for(; i + 7 < count; i += 8) + __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f)); + __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f)); + __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f)); + __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f)); + __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128); + __m128i xw = _mm_set1_epi16(255); // alpha channel + + for (; i + 7 < count; i += 8) { // load - __m128i y_bytes = _mm_loadl_epi64(( __m128i* )(y + i)); - __m128i cr_bytes = _mm_loadl_epi64(( __m128i* )(pcr + i)); - __m128i cb_bytes = _mm_loadl_epi64(( __m128i* )(pcb + i)); - __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 - __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 + __m128i y_bytes = _mm_loadl_epi64((__m128i*)(y + i)); + __m128i cr_bytes = _mm_loadl_epi64((__m128i*)(pcr + i)); + __m128i cb_bytes = _mm_loadl_epi64((__m128i*)(pcb + i)); + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 // unpack to short (and left-shift cr, cb by 8) __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); @@ -3634,8 +3626,8 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons __m128i o1 = _mm_unpackhi_epi16(t0, t1); // store - _mm_storeu_si128(( __m128i* )(out + 0), o0); - _mm_storeu_si128(( __m128i* )(out + 16), o1); + _mm_storeu_si128((__m128i*)(out + 0), o0); + _mm_storeu_si128((__m128i*)(out + 16), o1); out += 32; } } @@ -3643,16 +3635,16 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons #ifdef STBI_NEON // in this version, step=3 support would be easy to add. but is there demand? - if(step == 4) + if (step == 4) { // this is a fairly straightforward implementation and not super-optimized. uint8x8_t signflip = vdup_n_u8(0x80); - int16x8_t cr_const0 = vdupq_n_s16(( short )(1.40200f * 4096.0f + 0.5f)); - int16x8_t cr_const1 = vdupq_n_s16(-( short )(0.71414f * 4096.0f + 0.5f)); - int16x8_t cb_const0 = vdupq_n_s16(-( short )(0.34414f * 4096.0f + 0.5f)); - int16x8_t cb_const1 = vdupq_n_s16(( short )(1.77200f * 4096.0f + 0.5f)); + int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f)); + int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f)); + int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f)); + int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f)); - for(; i + 7 < count; i += 8) + for (; i + 7 < count; i += 8) { // load uint8x8_t y_bytes = vld1_u8(y + i); @@ -3689,9 +3681,9 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons } #endif - for(; i < count; ++i) + for (; i < count; ++i) { - int y_fixed = (y[i] << 20) + (1 << 19); // rounding + int y_fixed = (y[i] << 20) + (1 << 19); // rounding int r, g, b; int cr = pcr[i] - 128; int cb = pcb[i] - 128; @@ -3701,30 +3693,30 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons r >>= 20; g >>= 20; b >>= 20; - if(( unsigned )r > 255) + if ((unsigned)r > 255) { - if(r < 0) + if (r < 0) r = 0; else r = 255; } - if(( unsigned )g > 255) + if ((unsigned)g > 255) { - if(g < 0) + if (g < 0) g = 0; else g = 255; } - if(( unsigned )b > 255) + if ((unsigned)b > 255) { - if(b < 0) + if (b < 0) b = 0; else b = 255; } - out[0] = ( stbi_uc )r; - out[1] = ( stbi_uc )g; - out[2] = ( stbi_uc )b; + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; out[3] = 255; out += step; } @@ -3739,7 +3731,7 @@ static void stbi__setup_jpeg(stbi__jpeg* j) j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; #ifdef STBI_SSE2 - if(stbi__sse2_available()) + if (stbi__sse2_available()) { j->idct_block_kernel = stbi__idct_simd; j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; @@ -3764,9 +3756,9 @@ typedef struct { resample_row_func resample; stbi_uc *line0, *line1; - int hs, vs; // expansion factor in each axis - int w_lores; // horizontal pixels pre-expansion - int ystep; // how far through vertical expansion we are + int hs, vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int ystep; // how far through vertical expansion we are int ypos; // which pre-expansion row we're on } stbi__resample; @@ -3780,25 +3772,26 @@ static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp, int req_comp) { int n, decode_n, is_rgb; - z->s->img_n = 0; // make stbi__cleanup_jpeg safe + z->s->img_n = 0; // make stbi__cleanup_jpeg safe // validate req_comp - if(req_comp < 0 || req_comp > 4) + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); // load a jpeg image from whichever source, but leave in YCbCr format - if(!stbi__decode_jpeg_image(z)) + if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; } // determine actual number of components to generate - n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1; + n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 + : 1; is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); - if(z->s->img_n == 3 && n < 3 && !is_rgb) + if (z->s->img_n == 3 && n < 3 && !is_rgb) decode_n = 1; else decode_n = z->s->img_n; @@ -3812,14 +3805,14 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp stbi__resample res_comp[4]; - for(k = 0; k < decode_n; ++k) + for (k = 0; k < decode_n; ++k) { stbi__resample* r = &res_comp[k]; // allocate line buffer big enough for upsampling off the edges // with upsample factor of 4 - z->img_comp[k].linebuf = ( stbi_uc* )stbi__malloc(z->s->img_x + 3); - if(!z->img_comp[k].linebuf) + z->img_comp[k].linebuf = (stbi_uc*)stbi__malloc(z->s->img_x + 3); + if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); @@ -3832,52 +3825,52 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp r->ypos = 0; r->line0 = r->line1 = z->img_comp[k].data; - if(r->hs == 1 && r->vs == 1) + if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1; - else if(r->hs == 1 && r->vs == 2) + else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2; - else if(r->hs == 2 && r->vs == 1) + else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2; - else if(r->hs == 2 && r->vs == 2) + else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel; else r->resample = stbi__resample_row_generic; } // can't error after this so, this is safe - output = ( stbi_uc* )stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); - if(!output) + output = (stbi_uc*)stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); + if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } // now go ahead and resample - for(j = 0; j < z->s->img_y; ++j) + for (j = 0; j < z->s->img_y; ++j) { stbi_uc* out = output + n * z->s->img_x * j; - for(k = 0; k < decode_n; ++k) + for (k = 0; k < decode_n; ++k) { stbi__resample* r = &res_comp[k]; int y_bot = r->ystep >= (r->vs >> 1); coutput[k] = r->resample(z->img_comp[k].linebuf, y_bot ? r->line1 : r->line0, y_bot ? r->line0 : r->line1, r->w_lores, r->hs); - if(++r->ystep >= r->vs) + if (++r->ystep >= r->vs) { r->ystep = 0; r->line0 = r->line1; - if(++r->ypos < z->img_comp[k].y) + if (++r->ypos < z->img_comp[k].y) r->line1 += z->img_comp[k].w2; } } - if(n >= 3) + if (n >= 3) { stbi_uc* y = coutput[0]; - if(z->s->img_n == 3) + if (z->s->img_n == 3) { - if(is_rgb) + if (is_rgb) { - for(i = 0; i < z->s->img_x; ++i) + for (i = 0; i < z->s->img_x; ++i) { out[0] = y[i]; out[1] = coutput[1][i]; @@ -3891,11 +3884,11 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); } } - else if(z->s->img_n == 4) + else if (z->s->img_n == 4) { - if(z->app14_color_transform == 0) - { // CMYK - for(i = 0; i < z->s->img_x; ++i) + if (z->app14_color_transform == 0) + { // CMYK + for (i = 0; i < z->s->img_x; ++i) { stbi_uc m = coutput[3][i]; out[0] = stbi__blinn_8x8(coutput[0][i], m); @@ -3905,10 +3898,10 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp out += n; } } - else if(z->app14_color_transform == 2) - { // YCCK + else if (z->app14_color_transform == 2) + { // YCCK z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); - for(i = 0; i < z->s->img_x; ++i) + for (i = 0; i < z->s->img_x; ++i) { stbi_uc m = coutput[3][i]; out[0] = stbi__blinn_8x8(255 - out[0], m); @@ -3918,37 +3911,37 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp } } else - { // YCbCr + alpha? Ignore the fourth channel for now + { // YCbCr + alpha? Ignore the fourth channel for now z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); } } else - for(i = 0; i < z->s->img_x; ++i) + for (i = 0; i < z->s->img_x; ++i) { out[0] = out[1] = out[2] = y[i]; - out[3] = 255; // not used if n==3 + out[3] = 255; // not used if n==3 out += n; } } else { - if(is_rgb) + if (is_rgb) { - if(n == 1) - for(i = 0; i < z->s->img_x; ++i) + if (n == 1) + for (i = 0; i < z->s->img_x; ++i) *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); else { - for(i = 0; i < z->s->img_x; ++i, out += 2) + for (i = 0; i < z->s->img_x; ++i, out += 2) { out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); out[1] = 255; } } } - else if(z->s->img_n == 4 && z->app14_color_transform == 0) + else if (z->s->img_n == 4 && z->app14_color_transform == 0) { - for(i = 0; i < z->s->img_x; ++i) + for (i = 0; i < z->s->img_x; ++i) { stbi_uc m = coutput[3][i]; stbi_uc r = stbi__blinn_8x8(coutput[0][i], m); @@ -3959,9 +3952,9 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp out += n; } } - else if(z->s->img_n == 4 && z->app14_color_transform == 2) + else if (z->s->img_n == 4 && z->app14_color_transform == 2) { - for(i = 0; i < z->s->img_x; ++i) + for (i = 0; i < z->s->img_x; ++i) { out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]); out[1] = 255; @@ -3971,11 +3964,11 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp else { stbi_uc* y = coutput[0]; - if(n == 1) - for(i = 0; i < z->s->img_x; ++i) + if (n == 1) + for (i = 0; i < z->s->img_x; ++i) out[i] = y[i]; else - for(i = 0; i < z->s->img_x; ++i) + for (i = 0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255; } } @@ -3983,8 +3976,8 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp stbi__cleanup_jpeg(z); *out_x = z->s->img_x; *out_y = z->s->img_y; - if(comp) - *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output + if (comp) + *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output return output; } } @@ -3992,7 +3985,7 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp static void* stbi__jpeg_load(stbi__context* s, int* x, int* y, int* comp, int req_comp, stbi__result_info* ri) { unsigned char* result; - stbi__jpeg* j = ( stbi__jpeg* )stbi__malloc(sizeof(stbi__jpeg)); + stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg)); STBI_NOTUSED(ri); j->s = s; stbi__setup_jpeg(j); @@ -4004,7 +3997,7 @@ static void* stbi__jpeg_load(stbi__context* s, int* x, int* y, int* comp, int re static int stbi__jpeg_test(stbi__context* s) { int r; - stbi__jpeg* j = ( stbi__jpeg* )stbi__malloc(sizeof(stbi__jpeg)); + stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg)); j->s = s; stbi__setup_jpeg(j); r = stbi__decode_jpeg_header(j, STBI__SCAN_type); @@ -4015,16 +4008,16 @@ static int stbi__jpeg_test(stbi__context* s) static int stbi__jpeg_info_raw(stbi__jpeg* j, int* x, int* y, int* comp) { - if(!stbi__decode_jpeg_header(j, STBI__SCAN_header)) + if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) { stbi__rewind(j->s); return 0; } - if(x) + if (x) *x = j->s->img_x; - if(y) + if (y) *y = j->s->img_y; - if(comp) + if (comp) *comp = j->s->img_n >= 3 ? 3 : 1; return 1; } @@ -4032,7 +4025,7 @@ static int stbi__jpeg_info_raw(stbi__jpeg* j, int* x, int* y, int* comp) static int stbi__jpeg_info(stbi__context* s, int* x, int* y, int* comp) { int result; - stbi__jpeg* j = ( stbi__jpeg* )(stbi__malloc(sizeof(stbi__jpeg))); + stbi__jpeg* j = (stbi__jpeg*)(stbi__malloc(sizeof(stbi__jpeg))); j->s = s; result = stbi__jpeg_info_raw(j, x, y, comp); STBI_FREE(j); @@ -4050,7 +4043,7 @@ static int stbi__jpeg_info(stbi__context* s, int* x, int* y, int* comp) #ifndef STBI_NO_ZLIB // fast-way is faster to check than jpeg huffman, but slow way is slower -#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables +#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables #define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) // zlib-style huffman encoding @@ -4090,40 +4083,40 @@ static int stbi__zbuild_huffman(stbi__zhuffman* z, const stbi_uc* sizelist, int // DEFLATE spec for generating codes memset(sizes, 0, sizeof(sizes)); memset(z->fast, 0, sizeof(z->fast)); - for(i = 0; i < num; ++i) + for (i = 0; i < num; ++i) ++sizes[sizelist[i]]; sizes[0] = 0; - for(i = 1; i < 16; ++i) - if(sizes[i] > (1 << i)) + for (i = 1; i < 16; ++i) + if (sizes[i] > (1 << i)) return stbi__err("bad sizes", "Corrupt PNG"); code = 0; - for(i = 1; i < 16; ++i) + for (i = 1; i < 16; ++i) { next_code[i] = code; - z->firstcode[i] = ( stbi__uint16 )code; - z->firstsymbol[i] = ( stbi__uint16 )k; + z->firstcode[i] = (stbi__uint16)code; + z->firstsymbol[i] = (stbi__uint16)k; code = (code + sizes[i]); - if(sizes[i]) - if(code - 1 >= (1 << i)) + if (sizes[i]) + if (code - 1 >= (1 << i)) return stbi__err("bad codelengths", "Corrupt PNG"); - z->maxcode[i] = code << (16 - i); // preshift for inner loop + z->maxcode[i] = code << (16 - i); // preshift for inner loop code <<= 1; k += sizes[i]; } - z->maxcode[16] = 0x10000; // sentinel - for(i = 0; i < num; ++i) + z->maxcode[16] = 0x10000; // sentinel + for (i = 0; i < num; ++i) { int s = sizelist[i]; - if(s) + if (s) { int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; stbi__uint16 fastv = (stbi__uint16)((s << 9) | i); - z->size[c] = ( stbi_uc )s; - z->value[c] = ( stbi__uint16 )i; - if(s <= STBI__ZFAST_BITS) + z->size[c] = (stbi_uc)s; + z->value[c] = (stbi__uint16)i; + if (s <= STBI__ZFAST_BITS) { int j = stbi__bit_reverse(next_code[s], s); - while(j < (1 << STBI__ZFAST_BITS)) + while (j < (1 << STBI__ZFAST_BITS)) { z->fast[j] = fastv; j += (1 << s); @@ -4157,7 +4150,7 @@ typedef struct stbi_inline static stbi_uc stbi__zget8(stbi__zbuf* z) { - if(z->zbuffer >= z->zbuffer_end) + if (z->zbuffer >= z->zbuffer_end) return 0; return *z->zbuffer++; } @@ -4167,15 +4160,15 @@ static void stbi__fill_bits(stbi__zbuf* z) do { STBI_ASSERT(z->code_buffer < (1U << z->num_bits)); - z->code_buffer |= ( unsigned int )stbi__zget8(z) << z->num_bits; + z->code_buffer |= (unsigned int)stbi__zget8(z) << z->num_bits; z->num_bits += 8; - } while(z->num_bits <= 24); + } while (z->num_bits <= 24); } stbi_inline static unsigned int stbi__zreceive(stbi__zbuf* z, int n) { unsigned int k; - if(z->num_bits < n) + if (z->num_bits < n) stbi__fill_bits(z); k = z->code_buffer & ((1 << n) - 1); z->code_buffer >>= n; @@ -4189,11 +4182,11 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf* a, stbi__zhuffman* z) // not resolved by fast table, so compute it the slow way // use jpeg approach, which requires MSbits at top k = stbi__bit_reverse(a->code_buffer, 16); - for(s = STBI__ZFAST_BITS + 1;; ++s) - if(k < z->maxcode[s]) + for (s = STBI__ZFAST_BITS + 1;; ++s) + if (k < z->maxcode[s]) break; - if(s == 16) - return -1; // invalid code! + if (s == 16) + return -1; // invalid code! // code size is s, so: b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s]; STBI_ASSERT(z->size[b] == s); @@ -4205,10 +4198,10 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf* a, stbi__zhuffman* z) stbi_inline static int stbi__zhuffman_decode(stbi__zbuf* a, stbi__zhuffman* z) { int b, s; - if(a->num_bits < 16) + if (a->num_bits < 16) stbi__fill_bits(a); b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; - if(b) + if (b) { s = b >> 9; a->code_buffer >>= s; @@ -4218,20 +4211,20 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf* a, stbi__zhuffman* z) return stbi__zhuffman_decode_slowpath(a, z); } -static int stbi__zexpand(stbi__zbuf* z, char* zout, int n) // need to make room for n bytes +static int stbi__zexpand(stbi__zbuf* z, char* zout, int n) // need to make room for n bytes { char* q; int cur, limit, old_limit; z->zout = zout; - if(!z->z_expandable) + if (!z->z_expandable) return stbi__err("output buffer limit", "Corrupt PNG"); - cur = ( int )(z->zout - z->zout_start); - limit = old_limit = ( int )(z->zout_end - z->zout_start); - while(cur + n > limit) + cur = (int)(z->zout - z->zout_start); + limit = old_limit = (int)(z->zout_end - z->zout_start); + while (cur + n > limit) limit *= 2; - q = ( char* )STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); + q = (char*)STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); STBI_NOTUSED(old_limit); - if(q == NULL) + if (q == NULL) return stbi__err("outofmem", "Out of memory"); z->zout_start = q; z->zout = q + cur; @@ -4239,82 +4232,82 @@ static int stbi__zexpand(stbi__zbuf* z, char* zout, int n) // need to make ro return 1; } -static const int stbi__zlength_base[31] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, - 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; +static const int stbi__zlength_base[31] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; static const int stbi__zlength_extra[31] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 0, 0}; -static const int stbi__zdist_base[32] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, - 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, - 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0, 0}; +static const int stbi__zdist_base[32] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, + 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, + 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0, 0}; -static const int stbi__zdist_extra[32] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, +static const int stbi__zdist_extra[32] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; static int stbi__parse_huffman_block(stbi__zbuf* a) { char* zout = a->zout; - for(;;) + for (;;) { int z = stbi__zhuffman_decode(a, &a->z_length); - if(z < 256) + if (z < 256) { - if(z < 0) - return stbi__err("bad huffman code", "Corrupt PNG"); // error in huffman codes - if(zout >= a->zout_end) + if (z < 0) + return stbi__err("bad huffman code", "Corrupt PNG"); // error in huffman codes + if (zout >= a->zout_end) { - if(!stbi__zexpand(a, zout, 1)) + if (!stbi__zexpand(a, zout, 1)) return 0; zout = a->zout; } - *zout++ = ( char )z; + *zout++ = (char)z; } else { stbi_uc* p; int len, dist; - if(z == 256) + if (z == 256) { a->zout = zout; return 1; } z -= 257; len = stbi__zlength_base[z]; - if(stbi__zlength_extra[z]) + if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]); z = stbi__zhuffman_decode(a, &a->z_distance); - if(z < 0) + if (z < 0) return stbi__err("bad huffman code", "Corrupt PNG"); dist = stbi__zdist_base[z]; - if(stbi__zdist_extra[z]) + if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]); - if(zout - a->zout_start < dist) + if (zout - a->zout_start < dist) return stbi__err("bad dist", "Corrupt PNG"); - if(zout + len > a->zout_end) + if (zout + len > a->zout_end) { - if(!stbi__zexpand(a, zout, len)) + if (!stbi__zexpand(a, zout, len)) return 0; zout = a->zout; } - p = ( stbi_uc* )(zout - dist); - if(dist == 1) - { // run of one byte; common in images. + p = (stbi_uc*)(zout - dist); + if (dist == 1) + { // run of one byte; common in images. stbi_uc v = *p; - if(len) + if (len) { do *zout++ = v; - while(--len); + while (--len); } } else { - if(len) + if (len) { do *zout++ = *p++; - while(--len); + while (--len); } } } @@ -4325,7 +4318,7 @@ static int stbi__compute_huffman_codes(stbi__zbuf* a) { static const stbi_uc length_dezigzag[19] = {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; stbi__zhuffman z_codelength; - stbi_uc lencodes[286 + 32 + 137]; // padding for maximum single op + stbi_uc lencodes[286 + 32 + 137]; // padding for maximum single op stbi_uc codelength_sizes[19]; int i, n; @@ -4335,50 +4328,50 @@ static int stbi__compute_huffman_codes(stbi__zbuf* a) int ntot = hlit + hdist; memset(codelength_sizes, 0, sizeof(codelength_sizes)); - for(i = 0; i < hclen; ++i) + for (i = 0; i < hclen; ++i) { int s = stbi__zreceive(a, 3); - codelength_sizes[length_dezigzag[i]] = ( stbi_uc )s; + codelength_sizes[length_dezigzag[i]] = (stbi_uc)s; } - if(!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) + if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0; n = 0; - while(n < ntot) + while (n < ntot) { int c = stbi__zhuffman_decode(a, &z_codelength); - if(c < 0 || c >= 19) + if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG"); - if(c < 16) - lencodes[n++] = ( stbi_uc )c; + if (c < 16) + lencodes[n++] = (stbi_uc)c; else { stbi_uc fill = 0; - if(c == 16) + if (c == 16) { c = stbi__zreceive(a, 2) + 3; - if(n == 0) + if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG"); fill = lencodes[n - 1]; } - else if(c == 17) + else if (c == 17) c = stbi__zreceive(a, 3) + 3; else { STBI_ASSERT(c == 18); c = stbi__zreceive(a, 7) + 11; } - if(ntot - n < c) + if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG"); memset(lencodes + n, fill, c); n += c; } } - if(n != ntot) + if (n != ntot) return stbi__err("bad codelengths", "Corrupt PNG"); - if(!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) + if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; - if(!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist)) + if (!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist)) return 0; return 1; } @@ -4387,28 +4380,28 @@ static int stbi__parse_uncompressed_block(stbi__zbuf* a) { stbi_uc header[4]; int len, nlen, k; - if(a->num_bits & 7) - stbi__zreceive(a, a->num_bits & 7); // discard + if (a->num_bits & 7) + stbi__zreceive(a, a->num_bits & 7); // discard // drain the bit-packed data into header k = 0; - while(a->num_bits > 0) + while (a->num_bits > 0) { - header[k++] = (stbi_uc)(a->code_buffer & 255); // suppress MSVC run-time check + header[k++] = (stbi_uc)(a->code_buffer & 255); // suppress MSVC run-time check a->code_buffer >>= 8; a->num_bits -= 8; } STBI_ASSERT(a->num_bits == 0); // now fill header the normal way - while(k < 4) + while (k < 4) header[k++] = stbi__zget8(a); len = header[1] * 256 + header[0]; nlen = header[3] * 256 + header[2]; - if(nlen != (len ^ 0xffff)) + if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt", "Corrupt PNG"); - if(a->zbuffer + len > a->zbuffer_end) + if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer", "Corrupt PNG"); - if(a->zout + len > a->zout_end) - if(!stbi__zexpand(a, a->zout, len)) + if (a->zout + len > a->zout_end) + if (!stbi__zexpand(a, a->zout, len)) return 0; memcpy(a->zout, a->zbuffer, len); a->zbuffer += len; @@ -4422,12 +4415,12 @@ static int stbi__parse_zlib_header(stbi__zbuf* a) int cm = cmf & 15; /* int cinfo = cmf >> 4; */ int flg = stbi__zget8(a); - if((cmf * 256 + flg) % 31 != 0) - return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec - if(flg & 32) - return stbi__err("no preset dict", "Corrupt PNG"); // preset dictionary not allowed in png - if(cm != 8) - return stbi__err("bad compression", "Corrupt PNG"); // DEFLATE required for png + if ((cmf * 256 + flg) % 31 != 0) + return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec + if (flg & 32) + return stbi__err("no preset dict", "Corrupt PNG"); // preset dictionary not allowed in png + if (cm != 8) + return stbi__err("bad compression", "Corrupt PNG"); // DEFLATE required for png // window = 1 << (8 + cinfo)... but who cares, we fully buffer output return 1; } @@ -4459,8 +4452,8 @@ Init algorithm: static int stbi__parse_zlib(stbi__zbuf* a, int parse_header) { int final, type; - if(parse_header) - if(!stbi__parse_zlib_header(a)) + if (parse_header) + if (!stbi__parse_zlib_header(a)) return 0; a->num_bits = 0; a->code_buffer = 0; @@ -4468,34 +4461,34 @@ static int stbi__parse_zlib(stbi__zbuf* a, int parse_header) { final = stbi__zreceive(a, 1); type = stbi__zreceive(a, 2); - if(type == 0) + if (type == 0) { - if(!stbi__parse_uncompressed_block(a)) + if (!stbi__parse_uncompressed_block(a)) return 0; } - else if(type == 3) + else if (type == 3) { return 0; } else { - if(type == 1) + if (type == 1) { // use fixed code lengths - if(!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, 288)) + if (!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, 288)) return 0; - if(!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) + if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) return 0; } else { - if(!stbi__compute_huffman_codes(a)) + if (!stbi__compute_huffman_codes(a)) return 0; } - if(!stbi__parse_huffman_block(a)) + if (!stbi__parse_huffman_block(a)) return 0; } - } while(!final); + } while (!final); return 1; } @@ -4512,15 +4505,15 @@ static int stbi__do_zlib(stbi__zbuf* a, char* obuf, int olen, int exp, int parse extern char* stbi_zlib_decode_malloc_guesssize(const char* buffer, int len, int initial_size, int* outlen) { stbi__zbuf a; - char* p = ( char* )stbi__malloc(initial_size); - if(p == NULL) + char* p = (char*)stbi__malloc(initial_size); + if (p == NULL) return NULL; - a.zbuffer = ( stbi_uc* )buffer; - a.zbuffer_end = ( stbi_uc* )buffer + len; - if(stbi__do_zlib(&a, p, initial_size, 1, 1)) + a.zbuffer = (stbi_uc*)buffer; + a.zbuffer_end = (stbi_uc*)buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, 1)) { - if(outlen) - *outlen = ( int )(a.zout - a.zout_start); + if (outlen) + *outlen = (int)(a.zout - a.zout_start); return a.zout_start; } else @@ -4539,15 +4532,15 @@ extern char* stbi_zlib_decode_malloc_guesssize_headerflag(const char* buffer, in int parse_header) { stbi__zbuf a; - char* p = ( char* )stbi__malloc(initial_size); - if(p == NULL) + char* p = (char*)stbi__malloc(initial_size); + if (p == NULL) return NULL; - a.zbuffer = ( stbi_uc* )buffer; - a.zbuffer_end = ( stbi_uc* )buffer + len; - if(stbi__do_zlib(&a, p, initial_size, 1, parse_header)) + a.zbuffer = (stbi_uc*)buffer; + a.zbuffer_end = (stbi_uc*)buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) { - if(outlen) - *outlen = ( int )(a.zout - a.zout_start); + if (outlen) + *outlen = (int)(a.zout - a.zout_start); return a.zout_start; } else @@ -4560,10 +4553,10 @@ extern char* stbi_zlib_decode_malloc_guesssize_headerflag(const char* buffer, in extern int stbi_zlib_decode_buffer(char* obuffer, int olen, char const* ibuffer, int ilen) { stbi__zbuf a; - a.zbuffer = ( stbi_uc* )ibuffer; - a.zbuffer_end = ( stbi_uc* )ibuffer + ilen; - if(stbi__do_zlib(&a, obuffer, olen, 0, 1)) - return ( int )(a.zout - a.zout_start); + a.zbuffer = (stbi_uc*)ibuffer; + a.zbuffer_end = (stbi_uc*)ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 1)) + return (int)(a.zout - a.zout_start); else return -1; } @@ -4571,15 +4564,15 @@ extern int stbi_zlib_decode_buffer(char* obuffer, int olen, char const* ibuffer, extern char* stbi_zlib_decode_noheader_malloc(char const* buffer, int len, int* outlen) { stbi__zbuf a; - char* p = ( char* )stbi__malloc(16384); - if(p == NULL) + char* p = (char*)stbi__malloc(16384); + if (p == NULL) return NULL; - a.zbuffer = ( stbi_uc* )buffer; - a.zbuffer_end = ( stbi_uc* )buffer + len; - if(stbi__do_zlib(&a, p, 16384, 1, 0)) + a.zbuffer = (stbi_uc*)buffer; + a.zbuffer_end = (stbi_uc*)buffer + len; + if (stbi__do_zlib(&a, p, 16384, 1, 0)) { - if(outlen) - *outlen = ( int )(a.zout - a.zout_start); + if (outlen) + *outlen = (int)(a.zout - a.zout_start); return a.zout_start; } else @@ -4592,10 +4585,10 @@ extern char* stbi_zlib_decode_noheader_malloc(char const* buffer, int len, int* extern int stbi_zlib_decode_noheader_buffer(char* obuffer, int olen, const char* ibuffer, int ilen) { stbi__zbuf a; - a.zbuffer = ( stbi_uc* )ibuffer; - a.zbuffer_end = ( stbi_uc* )ibuffer + ilen; - if(stbi__do_zlib(&a, obuffer, olen, 0, 0)) - return ( int )(a.zout - a.zout_start); + a.zbuffer = (stbi_uc*)ibuffer; + a.zbuffer_end = (stbi_uc*)ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 0)) + return (int)(a.zout - a.zout_start); else return -1; } @@ -4630,8 +4623,8 @@ static int stbi__check_png_header(stbi__context* s) { static const stbi_uc png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10}; int i; - for(i = 0; i < 8; ++i) - if(stbi__get8(s) != png_sig[i]) + for (i = 0; i < 8; ++i) + if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig", "Not a PNG"); return 1; } @@ -4663,9 +4656,9 @@ static int stbi__paeth(int a, int b, int c) int pa = abs(p - a); int pb = abs(p - b); int pc = abs(p - c); - if(pa <= pb && pa <= pc) + if (pa <= pb && pa <= pc) return a; - if(pb <= pc) + if (pb <= pc) return b; return c; } @@ -4681,18 +4674,18 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r stbi__uint32 i, j, stride = x * out_n * bytes; stbi__uint32 img_len, img_width_bytes; int k; - int img_n = s->img_n; // copy it into a local for later + int img_n = s->img_n; // copy it into a local for later int output_bytes = out_n * bytes; int filter_bytes = img_n * bytes; int width = x; STBI_ASSERT(out_n == s->img_n || out_n == s->img_n + 1); - a->out = ( stbi_uc* )stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into - if(!a->out) + a->out = (stbi_uc*)stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) return stbi__err("outofmem", "Out of memory"); - if(!stbi__mad3sizes_valid(img_n, x, depth, 7)) + if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG"); img_width_bytes = (((img_n * x * depth) + 7) >> 3); img_len = (img_width_bytes + 1) * y; @@ -4700,75 +4693,74 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), // so just check for raw_len < img_len always. - if(raw_len < img_len) + if (raw_len < img_len) return stbi__err("not enough pixels", "Corrupt PNG"); - for(j = 0; j < y; ++j) + for (j = 0; j < y; ++j) { stbi_uc* cur = a->out + stride * j; stbi_uc* prior; int filter = *raw++; - if(filter > 4) + if (filter > 4) return stbi__err("invalid filter", "Corrupt PNG"); - if(depth < 8) + if (depth < 8) { STBI_ASSERT(img_width_bytes <= x); - cur += x * out_n - - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place + cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place filter_bytes = 1; width = img_width_bytes; } - prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above + prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above // if first row, use special filter that doesn't sample previous row - if(j == 0) + if (j == 0) filter = first_row_filter[filter]; // handle first byte explicitly - for(k = 0; k < filter_bytes; ++k) + for (k = 0; k < filter_bytes; ++k) { - switch(filter) + switch (filter) { - case STBI__F_none: - cur[k] = raw[k]; - break; - case STBI__F_sub: - cur[k] = raw[k]; - break; - case STBI__F_up: - cur[k] = STBI__BYTECAST(raw[k] + prior[k]); - break; - case STBI__F_avg: - cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1)); - break; - case STBI__F_paeth: - cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0)); - break; - case STBI__F_avg_first: - cur[k] = raw[k]; - break; - case STBI__F_paeth_first: - cur[k] = raw[k]; - break; + case STBI__F_none: + cur[k] = raw[k]; + break; + case STBI__F_sub: + cur[k] = raw[k]; + break; + case STBI__F_up: + cur[k] = STBI__BYTECAST(raw[k] + prior[k]); + break; + case STBI__F_avg: + cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1)); + break; + case STBI__F_paeth: + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0)); + break; + case STBI__F_avg_first: + cur[k] = raw[k]; + break; + case STBI__F_paeth_first: + cur[k] = raw[k]; + break; } } - if(depth == 8) + if (depth == 8) { - if(img_n != out_n) - cur[img_n] = 255; // first pixel + if (img_n != out_n) + cur[img_n] = 255; // first pixel raw += img_n; cur += out_n; prior += out_n; } - else if(depth == 16) + else if (depth == 16) { - if(img_n != out_n) + if (img_n != out_n) { - cur[filter_bytes] = 255; // first pixel top byte - cur[filter_bytes + 1] = 255; // first pixel bottom byte + cur[filter_bytes] = 255; // first pixel top byte + cur[filter_bytes + 1] = 255; // first pixel bottom byte } raw += filter_bytes; cur += output_bytes; @@ -4782,49 +4774,48 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r } // this is a little gross, so that we don't switch per-pixel or per-component - if(depth < 8 || img_n == out_n) + if (depth < 8 || img_n == out_n) { int nk = (width - 1) * filter_bytes; #define STBI__CASE(f) \ case f: \ - for(k = 0; k < nk; ++k) - switch(filter) + for (k = 0; k < nk; ++k) + switch (filter) { - // "none" filter turns into a memcpy here; make that explicit. - case STBI__F_none: - memcpy(cur, raw, nk); - break; - STBI__CASE(STBI__F_sub) - { - cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]); - } - break; - STBI__CASE(STBI__F_up) - { - cur[k] = STBI__BYTECAST(raw[k] + prior[k]); - } - break; - STBI__CASE(STBI__F_avg) - { - cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); - } - break; - STBI__CASE(STBI__F_paeth) - { - cur[k] = STBI__BYTECAST(raw[k] + - stbi__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); - } - break; - STBI__CASE(STBI__F_avg_first) - { - cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); - } - break; - STBI__CASE(STBI__F_paeth_first) - { - cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0)); - } - break; + // "none" filter turns into a memcpy here; make that explicit. + case STBI__F_none: + memcpy(cur, raw, nk); + break; + STBI__CASE(STBI__F_sub) + { + cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]); + } + break; + STBI__CASE(STBI__F_up) + { + cur[k] = STBI__BYTECAST(raw[k] + prior[k]); + } + break; + STBI__CASE(STBI__F_avg) + { + cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); + } + break; + STBI__CASE(STBI__F_paeth) + { + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); + } + break; + STBI__CASE(STBI__F_avg_first) + { + cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); + } + break; + STBI__CASE(STBI__F_paeth_first) + { + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0)); + } + break; } #undef STBI__CASE raw += nk; @@ -4832,12 +4823,12 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r else { STBI_ASSERT(img_n + 1 == out_n); -#define STBI__CASE(f) \ - case f: \ - for(i = x - 1; i >= 1; \ - --i, cur[filter_bytes] = 255, raw += filter_bytes, cur += output_bytes, prior += output_bytes) \ - for(k = 0; k < filter_bytes; ++k) - switch(filter) +#define STBI__CASE(f) \ + case f: \ + for (i = x - 1; i >= 1; \ + --i, cur[filter_bytes] = 255, raw += filter_bytes, cur += output_bytes, prior += output_bytes) \ + for (k = 0; k < filter_bytes; ++k) + switch (filter) { STBI__CASE(STBI__F_none) { @@ -4861,8 +4852,7 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r break; STBI__CASE(STBI__F_paeth) { - cur[k] = - STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); } break; STBI__CASE(STBI__F_avg_first) @@ -4880,10 +4870,10 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r // the loop above sets the high byte of the pixels' alpha, but for // 16 bit png files we also need the low byte set. we'll do that here. - if(depth == 16) + if (depth == 16) { - cur = a->out + stride * j; // start at the beginning of the row again - for(i = 0; i < x; ++i, cur += output_bytes) + cur = a->out + stride * j; // start at the beginning of the row again + for (i = 0; i < x; ++i, cur += output_bytes) { cur[filter_bytes + 1] = 255; } @@ -4894,17 +4884,16 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r // we make a separate pass to expand bits to pixels; for performance, // this could run two scanlines behind the above code, so it won't // intefere with filtering but will still be in the cache. - if(depth < 8) + if (depth < 8) { - for(j = 0; j < y; ++j) + for (j = 0; j < y; ++j) { stbi_uc* cur = a->out + stride * j; stbi_uc* in = a->out + stride * j + x * out_n - img_width_bytes; // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for // 1/2/4-bit png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data // that will be skipped in the later loop - stbi_uc scale = - (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range // note that the final byte might overshoot and write more data than desired. // we can allocate enough data that this never writes out of memory, but it @@ -4912,35 +4901,35 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel. // so we need to explicitly clamp the final ones - if(depth == 4) + if (depth == 4) { - for(k = x * img_n; k >= 2; k -= 2, ++in) + for (k = x * img_n; k >= 2; k -= 2, ++in) { *cur++ = scale * ((*in >> 4)); *cur++ = scale * ((*in) & 0x0f); } - if(k > 0) + if (k > 0) *cur++ = scale * ((*in >> 4)); } - else if(depth == 2) + else if (depth == 2) { - for(k = x * img_n; k >= 4; k -= 4, ++in) + for (k = x * img_n; k >= 4; k -= 4, ++in) { *cur++ = scale * ((*in >> 6)); *cur++ = scale * ((*in >> 4) & 0x03); *cur++ = scale * ((*in >> 2) & 0x03); *cur++ = scale * ((*in) & 0x03); } - if(k > 0) + if (k > 0) *cur++ = scale * ((*in >> 6)); - if(k > 1) + if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03); - if(k > 2) + if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03); } - else if(depth == 1) + else if (depth == 1) { - for(k = x * img_n; k >= 8; k -= 8, ++in) + for (k = x * img_n; k >= 8; k -= 8, ++in) { *cur++ = scale * ((*in >> 7)); *cur++ = scale * ((*in >> 6) & 0x01); @@ -4951,29 +4940,29 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r *cur++ = scale * ((*in >> 1) & 0x01); *cur++ = scale * ((*in) & 0x01); } - if(k > 0) + if (k > 0) *cur++ = scale * ((*in >> 7)); - if(k > 1) + if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); - if(k > 2) + if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); - if(k > 3) + if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); - if(k > 4) + if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); - if(k > 5) + if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); - if(k > 6) + if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); } - if(img_n != out_n) + if (img_n != out_n) { int q; // insert alpha = 255 cur = a->out + stride * j; - if(img_n == 1) + if (img_n == 1) { - for(q = x - 1; q >= 0; --q) + for (q = x - 1; q >= 0; --q) { cur[q * 2 + 1] = 255; cur[q * 2 + 0] = cur[q]; @@ -4982,7 +4971,7 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r else { STBI_ASSERT(img_n == 3); - for(q = x - 1; q >= 0; --q) + for (q = x - 1; q >= 0; --q) { cur[q * 4 + 3] = 255; cur[q * 4 + 2] = cur[q * 3 + 2]; @@ -4993,16 +4982,16 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r } } } - else if(depth == 16) + else if (depth == 16) { // force the image data from big-endian to platform-native. // this is done in a separate pass due to the decoding relying // on the data being untouched, but could probably be done // per-line during decode if care is taken. stbi_uc* cur = a->out; - stbi__uint16* cur16 = ( stbi__uint16* )cur; + stbi__uint16* cur16 = (stbi__uint16*)cur; - for(i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) + for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) { *cur16 = (cur[0] << 8) | cur[1]; } @@ -5018,12 +5007,12 @@ static int stbi__create_png_image(stbi__png* a, stbi_uc* image_data, stbi__uint3 int out_bytes = out_n * bytes; stbi_uc* final; int p; - if(!interlaced) + if (!interlaced) return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); // de-interlacing - final = ( stbi_uc* )stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); - for(p = 0; p < 7; ++p) + final = (stbi_uc*)stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + for (p = 0; p < 7; ++p) { int xorig[] = {0, 4, 0, 2, 0, 1, 0}; int yorig[] = {0, 0, 4, 0, 2, 0, 1}; @@ -5033,17 +5022,17 @@ static int stbi__create_png_image(stbi__png* a, stbi_uc* image_data, stbi__uint3 // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p]; y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p]; - if(x && y) + if (x && y) { stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; - if(!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) + if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { STBI_FREE(final); return 0; } - for(j = 0; j < y; ++j) + for (j = 0; j < y; ++j) { - for(i = 0; i < x; ++i) + for (i = 0; i < x; ++i) { int out_y = j * yspc[p] + yorig[p]; int out_x = i * xspc[p] + xorig[p]; @@ -5071,9 +5060,9 @@ static int stbi__compute_transparency(stbi__png* z, stbi_uc tc[3], int out_n) // already got 255 as the alpha value in the output STBI_ASSERT(out_n == 2 || out_n == 4); - if(out_n == 2) + if (out_n == 2) { - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { p[1] = (p[0] == tc[0] ? 0 : 255); p += 2; @@ -5081,9 +5070,9 @@ static int stbi__compute_transparency(stbi__png* z, stbi_uc tc[3], int out_n) } else { - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { - if(p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) p[3] = 0; p += 4; } @@ -5095,15 +5084,15 @@ static int stbi__compute_transparency16(stbi__png* z, stbi__uint16 tc[3], int ou { stbi__context* s = z->s; stbi__uint32 i, pixel_count = s->img_x * s->img_y; - stbi__uint16* p = ( stbi__uint16* )z->out; + stbi__uint16* p = (stbi__uint16*)z->out; // compute color-based transparency, assuming we've // already got 65535 as the alpha value in the output STBI_ASSERT(out_n == 2 || out_n == 4); - if(out_n == 2) + if (out_n == 2) { - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { p[1] = (p[0] == tc[0] ? 0 : 65535); p += 2; @@ -5111,9 +5100,9 @@ static int stbi__compute_transparency16(stbi__png* z, stbi__uint16 tc[3], int ou } else { - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { - if(p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) p[3] = 0; p += 4; } @@ -5126,16 +5115,16 @@ static int stbi__expand_png_palette(stbi__png* a, stbi_uc* palette, int len, int stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y; stbi_uc *p, *temp_out, *orig = a->out; - p = ( stbi_uc* )stbi__malloc_mad2(pixel_count, pal_img_n, 0); - if(p == NULL) + p = (stbi_uc*)stbi__malloc_mad2(pixel_count, pal_img_n, 0); + if (p == NULL) return stbi__err("outofmem", "Out of memory"); // between here and free(out) below, exitting would leak temp_out = p; - if(pal_img_n == 3) + if (pal_img_n == 3) { - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { int n = orig[i] * 4; p[0] = palette[n]; @@ -5146,7 +5135,7 @@ static int stbi__expand_png_palette(stbi__png* a, stbi_uc* palette, int len, int } else { - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { int n = orig[i] * 4; p[0] = palette[n]; @@ -5183,9 +5172,9 @@ static void stbi__de_iphone(stbi__png* z) stbi__uint32 i, pixel_count = s->img_x * s->img_y; stbi_uc* p = z->out; - if(s->img_out_n == 3) - { // convert bgr to rgb - for(i = 0; i < pixel_count; ++i) + if (s->img_out_n == 3) + { // convert bgr to rgb + for (i = 0; i < pixel_count; ++i) { stbi_uc t = p[0]; p[0] = p[2]; @@ -5196,14 +5185,14 @@ static void stbi__de_iphone(stbi__png* z) else { STBI_ASSERT(s->img_out_n == 4); - if(stbi__unpremultiply_on_load) + if (stbi__unpremultiply_on_load) { // convert bgr to rgb and unpremultiply - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { stbi_uc a = p[3]; stbi_uc t = p[0]; - if(a) + if (a) { stbi_uc half = a / 2; p[0] = (p[2] * 255 + half) / a; @@ -5221,7 +5210,7 @@ static void stbi__de_iphone(stbi__png* z) else { // convert bgr to rgb - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { stbi_uc t = p[0]; p[0] = p[2]; @@ -5233,7 +5222,7 @@ static void stbi__de_iphone(stbi__png* z) } #define STBI__PNG_TYPE(a, b, c, d) \ - ((( unsigned )(a) << 24) + (( unsigned )(b) << 16) + (( unsigned )(c) << 8) + ( unsigned )(d)) + (((unsigned)(a) << 24) + ((unsigned)(b) << 16) + ((unsigned)(c) << 8) + (unsigned)(d)) static int stbi__parse_png_file(stbi__png* z, int scan, int req_comp) { @@ -5248,250 +5237,249 @@ static int stbi__parse_png_file(stbi__png* z, int scan, int req_comp) z->idata = NULL; z->out = NULL; - if(!stbi__check_png_header(s)) + if (!stbi__check_png_header(s)) return 0; - if(scan == STBI__SCAN_type) + if (scan == STBI__SCAN_type) return 1; - for(;;) + for (;;) { stbi__pngchunk c = stbi__get_chunk_header(s); - switch(c.type) + switch (c.type) { - case STBI__PNG_TYPE('C', 'g', 'B', 'I'): - is_iphone = 1; - stbi__skip(s, c.length); - break; - case STBI__PNG_TYPE('I', 'H', 'D', 'R'): + case STBI__PNG_TYPE('C', 'g', 'B', 'I'): + is_iphone = 1; + stbi__skip(s, c.length); + break; + case STBI__PNG_TYPE('I', 'H', 'D', 'R'): + { + int comp, filter; + if (!first) + return stbi__err("multiple IHDR", "Corrupt PNG"); + first = 0; + if (c.length != 13) + return stbi__err("bad IHDR len", "Corrupt PNG"); + s->img_x = stbi__get32be(s); + if (s->img_x > (1 << 24)) + return stbi__err("too large", "Very large image (corrupt?)"); + s->img_y = stbi__get32be(s); + if (s->img_y > (1 << 24)) + return stbi__err("too large", "Very large image (corrupt?)"); + z->depth = stbi__get8(s); + if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) + return stbi__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only"); + color = stbi__get8(s); + if (color > 6) + return stbi__err("bad ctype", "Corrupt PNG"); + if (color == 3 && z->depth == 16) + return stbi__err("bad ctype", "Corrupt PNG"); + if (color == 3) + pal_img_n = 3; + else if (color & 1) + return stbi__err("bad ctype", "Corrupt PNG"); + comp = stbi__get8(s); + if (comp) + return stbi__err("bad comp method", "Corrupt PNG"); + filter = stbi__get8(s); + if (filter) + return stbi__err("bad filter method", "Corrupt PNG"); + interlace = stbi__get8(s); + if (interlace > 1) + return stbi__err("bad interlace method", "Corrupt PNG"); + if (!s->img_x || !s->img_y) + return stbi__err("0-pixel image", "Corrupt PNG"); + if (!pal_img_n) { - int comp, filter; - if(!first) - return stbi__err("multiple IHDR", "Corrupt PNG"); - first = 0; - if(c.length != 13) - return stbi__err("bad IHDR len", "Corrupt PNG"); - s->img_x = stbi__get32be(s); - if(s->img_x > (1 << 24)) - return stbi__err("too large", "Very large image (corrupt?)"); - s->img_y = stbi__get32be(s); - if(s->img_y > (1 << 24)) - return stbi__err("too large", "Very large image (corrupt?)"); - z->depth = stbi__get8(s); - if(z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) - return stbi__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only"); - color = stbi__get8(s); - if(color > 6) - return stbi__err("bad ctype", "Corrupt PNG"); - if(color == 3 && z->depth == 16) - return stbi__err("bad ctype", "Corrupt PNG"); - if(color == 3) - pal_img_n = 3; - else if(color & 1) - return stbi__err("bad ctype", "Corrupt PNG"); - comp = stbi__get8(s); - if(comp) - return stbi__err("bad comp method", "Corrupt PNG"); - filter = stbi__get8(s); - if(filter) - return stbi__err("bad filter method", "Corrupt PNG"); - interlace = stbi__get8(s); - if(interlace > 1) - return stbi__err("bad interlace method", "Corrupt PNG"); - if(!s->img_x || !s->img_y) - return stbi__err("0-pixel image", "Corrupt PNG"); - if(!pal_img_n) - { - s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); - if((1 << 30) / s->img_x / s->img_n < s->img_y) - return stbi__err("too large", "Image too large to decode"); - if(scan == STBI__SCAN_header) - return 1; - } - else - { - // if paletted, then pal_n is our final components, and - // img_n is # components to decompress/filter. - s->img_n = 1; - if((1 << 30) / s->img_x / 4 < s->img_y) - return stbi__err("too large", "Corrupt PNG"); - // if SCAN_header, have to scan to see if we have a tRNS - } - break; + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) + return stbi__err("too large", "Image too large to decode"); + if (scan == STBI__SCAN_header) + return 1; + } + else + { + // if paletted, then pal_n is our final components, and + // img_n is # components to decompress/filter. + s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) + return stbi__err("too large", "Corrupt PNG"); + // if SCAN_header, have to scan to see if we have a tRNS } + break; + } - case STBI__PNG_TYPE('P', 'L', 'T', 'E'): + case STBI__PNG_TYPE('P', 'L', 'T', 'E'): + { + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (c.length > 256 * 3) + return stbi__err("invalid PLTE", "Corrupt PNG"); + pal_len = c.length / 3; + if (pal_len * 3 != c.length) + return stbi__err("invalid PLTE", "Corrupt PNG"); + for (i = 0; i < pal_len; ++i) + { + palette[i * 4 + 0] = stbi__get8(s); + palette[i * 4 + 1] = stbi__get8(s); + palette[i * 4 + 2] = stbi__get8(s); + palette[i * 4 + 3] = 255; + } + break; + } + + case STBI__PNG_TYPE('t', 'R', 'N', 'S'): + { + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (z->idata) + return stbi__err("tRNS after IDAT", "Corrupt PNG"); + if (pal_img_n) { - if(first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if(c.length > 256 * 3) - return stbi__err("invalid PLTE", "Corrupt PNG"); - pal_len = c.length / 3; - if(pal_len * 3 != c.length) - return stbi__err("invalid PLTE", "Corrupt PNG"); - for(i = 0; i < pal_len; ++i) + if (scan == STBI__SCAN_header) { - palette[i * 4 + 0] = stbi__get8(s); - palette[i * 4 + 1] = stbi__get8(s); - palette[i * 4 + 2] = stbi__get8(s); - palette[i * 4 + 3] = 255; + s->img_n = 4; + return 1; } - break; + if (pal_len == 0) + return stbi__err("tRNS before PLTE", "Corrupt PNG"); + if (c.length > pal_len) + return stbi__err("bad tRNS len", "Corrupt PNG"); + pal_img_n = 4; + for (i = 0; i < c.length; ++i) + palette[i * 4 + 3] = stbi__get8(s); } - - case STBI__PNG_TYPE('t', 'R', 'N', 'S'): + else { - if(first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if(z->idata) - return stbi__err("tRNS after IDAT", "Corrupt PNG"); - if(pal_img_n) + if (!(s->img_n & 1)) + return stbi__err("tRNS with alpha", "Corrupt PNG"); + if (c.length != (stbi__uint32)s->img_n * 2) + return stbi__err("bad tRNS len", "Corrupt PNG"); + has_trans = 1; + if (z->depth == 16) { - if(scan == STBI__SCAN_header) - { - s->img_n = 4; - return 1; - } - if(pal_len == 0) - return stbi__err("tRNS before PLTE", "Corrupt PNG"); - if(c.length > pal_len) - return stbi__err("bad tRNS len", "Corrupt PNG"); - pal_img_n = 4; - for(i = 0; i < c.length; ++i) - palette[i * 4 + 3] = stbi__get8(s); + for (k = 0; k < s->img_n; ++k) + tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is } else { - if(!(s->img_n & 1)) - return stbi__err("tRNS with alpha", "Corrupt PNG"); - if(c.length != ( stbi__uint32 )s->img_n * 2) - return stbi__err("bad tRNS len", "Corrupt PNG"); - has_trans = 1; - if(z->depth == 16) - { - for(k = 0; k < s->img_n; ++k) - tc16[k] = ( stbi__uint16 )stbi__get16be(s); // copy the values as-is - } - else - { - for(k = 0; k < s->img_n; ++k) - tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * - stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger - } + for (k = 0; k < s->img_n; ++k) + tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger } - break; } + break; + } - case STBI__PNG_TYPE('I', 'D', 'A', 'T'): + case STBI__PNG_TYPE('I', 'D', 'A', 'T'): + { + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) + return stbi__err("no PLTE", "Corrupt PNG"); + if (scan == STBI__SCAN_header) { - if(first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if(pal_img_n && !pal_len) - return stbi__err("no PLTE", "Corrupt PNG"); - if(scan == STBI__SCAN_header) - { - s->img_n = pal_img_n; - return 1; - } - if(( int )(ioff + c.length) < ( int )ioff) - return 0; - if(ioff + c.length > idata_limit) - { - stbi__uint32 idata_limit_old = idata_limit; - stbi_uc* p; - if(idata_limit == 0) - idata_limit = c.length > 4096 ? c.length : 4096; - while(ioff + c.length > idata_limit) - idata_limit *= 2; - STBI_NOTUSED(idata_limit_old); - p = ( stbi_uc* )STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); - if(p == NULL) - return stbi__err("outofmem", "Out of memory"); - z->idata = p; - } - if(!stbi__getn(s, z->idata + ioff, c.length)) - return stbi__err("outofdata", "Corrupt PNG"); - ioff += c.length; - break; + s->img_n = pal_img_n; + return 1; + } + if ((int)(ioff + c.length) < (int)ioff) + return 0; + if (ioff + c.length > idata_limit) + { + stbi__uint32 idata_limit_old = idata_limit; + stbi_uc* p; + if (idata_limit == 0) + idata_limit = c.length > 4096 ? c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + STBI_NOTUSED(idata_limit_old); + p = (stbi_uc*)STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); + if (p == NULL) + return stbi__err("outofmem", "Out of memory"); + z->idata = p; } + if (!stbi__getn(s, z->idata + ioff, c.length)) + return stbi__err("outofdata", "Corrupt PNG"); + ioff += c.length; + break; + } - case STBI__PNG_TYPE('I', 'E', 'N', 'D'): + case STBI__PNG_TYPE('I', 'E', 'N', 'D'): + { + stbi__uint32 raw_len, bpl; + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (scan != STBI__SCAN_load) + return 1; + if (z->idata == NULL) + return stbi__err("no IDAT", "Corrupt PNG"); + // initial guess for decoded data size to avoid unnecessary reallocs + bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component + raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; + z->expanded = (stbi_uc*)stbi_zlib_decode_malloc_guesssize_headerflag((char*)z->idata, ioff, raw_len, + (int*)&raw_len, !is_iphone); + if (z->expanded == NULL) + return 0; // zlib should set error + STBI_FREE(z->idata); + z->idata = NULL; + if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n + 1; + else + s->img_out_n = s->img_n; + if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) + return 0; + if (has_trans) { - stbi__uint32 raw_len, bpl; - if(first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if(scan != STBI__SCAN_load) - return 1; - if(z->idata == NULL) - return stbi__err("no IDAT", "Corrupt PNG"); - // initial guess for decoded data size to avoid unnecessary reallocs - bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component - raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; - z->expanded = ( stbi_uc* )stbi_zlib_decode_malloc_guesssize_headerflag(( char* )z->idata, ioff, raw_len, - ( int* )&raw_len, !is_iphone); - if(z->expanded == NULL) - return 0; // zlib should set error - STBI_FREE(z->idata); - z->idata = NULL; - if((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans) - s->img_out_n = s->img_n + 1; - else - s->img_out_n = s->img_n; - if(!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) - return 0; - if(has_trans) + if (z->depth == 16) { - if(z->depth == 16) - { - if(!stbi__compute_transparency16(z, tc16, s->img_out_n)) - return 0; - } - else - { - if(!stbi__compute_transparency(z, tc, s->img_out_n)) - return 0; - } - } - if(is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) - stbi__de_iphone(z); - if(pal_img_n) - { - // pal_img_n == 3 or 4 - s->img_n = pal_img_n; // record the actual colors we had - s->img_out_n = pal_img_n; - if(req_comp >= 3) - s->img_out_n = req_comp; - if(!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) + if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0; } - else if(has_trans) + else { - // non-paletted image with tRNS -> source image has (constant) alpha - ++s->img_n; + if (!stbi__compute_transparency(z, tc, s->img_out_n)) + return 0; } - STBI_FREE(z->expanded); - z->expanded = NULL; - return 1; } + if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) + stbi__de_iphone(z); + if (pal_img_n) + { + // pal_img_n == 3 or 4 + s->img_n = pal_img_n; // record the actual colors we had + s->img_out_n = pal_img_n; + if (req_comp >= 3) + s->img_out_n = req_comp; + if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) + return 0; + } + else if (has_trans) + { + // non-paletted image with tRNS -> source image has (constant) alpha + ++s->img_n; + } + STBI_FREE(z->expanded); + z->expanded = NULL; + return 1; + } - default: - // if critical, fail - if(first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if((c.type & (1 << 29)) == 0) - { + default: + // if critical, fail + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if ((c.type & (1 << 29)) == 0) + { #ifndef STBI_NO_FAILURE_STRINGS - // not threadsafe - static char invalid_chunk[] = "XXXX PNG chunk not known"; - invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); - invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); - invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); - invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); + // not threadsafe + static char invalid_chunk[] = "XXXX PNG chunk not known"; + invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); + invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); + invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); + invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); #endif - return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); - } - stbi__skip(s, c.length); - break; + return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); + } + stbi__skip(s, c.length); + break; } // end of PNG chunk, read and skip CRC stbi__get32be(s); @@ -5501,31 +5489,30 @@ static int stbi__parse_png_file(stbi__png* z, int scan, int req_comp) static void* stbi__do_png(stbi__png* p, int* x, int* y, int* n, int req_comp, stbi__result_info* ri) { void* result = NULL; - if(req_comp < 0 || req_comp > 4) + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); - if(stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) + if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { - if(p->depth < 8) + if (p->depth < 8) ri->bits_per_channel = 8; else ri->bits_per_channel = p->depth; result = p->out; p->out = NULL; - if(req_comp && req_comp != p->s->img_out_n) + if (req_comp && req_comp != p->s->img_out_n) { - if(ri->bits_per_channel == 8) - result = - stbi__convert_format(( unsigned char* )result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + if (ri->bits_per_channel == 8) + result = stbi__convert_format((unsigned char*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); else - result = stbi__convert_format16(( stbi__uint16* )result, p->s->img_out_n, req_comp, p->s->img_x, + result = stbi__convert_format16((stbi__uint16*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); p->s->img_out_n = req_comp; - if(result == NULL) + if (result == NULL) return result; } *x = p->s->img_x; *y = p->s->img_y; - if(n) + if (n) *n = p->s->img_n; } STBI_FREE(p->out); @@ -5555,16 +5542,16 @@ static int stbi__png_test(stbi__context* s) static int stbi__png_info_raw(stbi__png* p, int* x, int* y, int* comp) { - if(!stbi__parse_png_file(p, STBI__SCAN_header, 0)) + if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) { stbi__rewind(p->s); return 0; } - if(x) + if (x) *x = p->s->img_x; - if(y) + if (y) *y = p->s->img_y; - if(comp) + if (comp) *comp = p->s->img_n; return 1; } @@ -5580,9 +5567,9 @@ static int stbi__png_is16(stbi__context* s) { stbi__png p; p.s = s; - if(!stbi__png_info_raw(&p, NULL, NULL, NULL)) + if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) return 0; - if(p.depth != 16) + if (p.depth != 16) { stbi__rewind(p.s); return 0; @@ -5598,14 +5585,14 @@ static int stbi__bmp_test_raw(stbi__context* s) { int r; int sz; - if(stbi__get8(s) != 'B') + if (stbi__get8(s) != 'B') return 0; - if(stbi__get8(s) != 'M') + if (stbi__get8(s) != 'M') return 0; - stbi__get32le(s); // discard filesize - stbi__get16le(s); // discard reserved - stbi__get16le(s); // discard reserved - stbi__get32le(s); // discard data offset + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + stbi__get32le(s); // discard data offset sz = stbi__get32le(s); r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124); return r; @@ -5622,28 +5609,28 @@ static int stbi__bmp_test(stbi__context* s) static int stbi__high_bit(unsigned int z) { int n = 0; - if(z == 0) + if (z == 0) return -1; - if(z >= 0x10000) + if (z >= 0x10000) n += 16, z >>= 16; - if(z >= 0x00100) + if (z >= 0x00100) n += 8, z >>= 8; - if(z >= 0x00010) + if (z >= 0x00010) n += 4, z >>= 4; - if(z >= 0x00004) + if (z >= 0x00004) n += 2, z >>= 2; - if(z >= 0x00002) + if (z >= 0x00002) n += 1, z >>= 1; return n; } static int stbi__bitcount(unsigned int a) { - a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2 - a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4 - a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits - a = (a + (a >> 8)); // max 16 per 8 bits - a = (a + (a >> 16)); // max 32 per 8 bits + a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2 + a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4 + a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits + a = (a + (a >> 8)); // max 16 per 8 bits + a = (a + (a >> 16)); // max 32 per 8 bits return a & 0xff; } @@ -5664,16 +5651,24 @@ static int stbi__shiftsigned(int v, int shift, int bits) 0x01 /*0b00000001*/, }; static unsigned int shift_table[9] = { - 0, 0, 0, 1, 0, 2, 4, 6, 0, + 0, + 0, + 0, + 1, + 0, + 2, + 4, + 6, + 0, }; - if(shift < 0) + if (shift < 0) v <<= -shift; else v >>= shift; STBI_ASSERT(v >= 0 && v < 256); v >>= (8 - bits); STBI_ASSERT(bits >= 0 && bits <= 8); - return ( int )(( unsigned )v * mul_table[bits]) >> shift_table[bits]; + return (int)((unsigned)v * mul_table[bits]) >> shift_table[bits]; } typedef struct @@ -5685,18 +5680,18 @@ typedef struct static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info) { int hsz; - if(stbi__get8(s) != 'B' || stbi__get8(s) != 'M') + if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP"); - stbi__get32le(s); // discard filesize - stbi__get16le(s); // discard reserved - stbi__get16le(s); // discard reserved + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved info->offset = stbi__get32le(s); info->hsz = hsz = stbi__get32le(s); info->mr = info->mg = info->mb = info->ma = 0; - if(hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) + if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown"); - if(hsz == 12) + if (hsz == 12) { s->img_x = stbi__get16le(s); s->img_y = stbi__get16le(s); @@ -5706,39 +5701,39 @@ static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info) s->img_x = stbi__get32le(s); s->img_y = stbi__get32le(s); } - if(stbi__get16le(s) != 1) + if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP"); info->bpp = stbi__get16le(s); - if(hsz != 12) + if (hsz != 12) { int compress = stbi__get32le(s); - if(compress == 1 || compress == 2) + if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); - stbi__get32le(s); // discard sizeof - stbi__get32le(s); // discard hres - stbi__get32le(s); // discard vres - stbi__get32le(s); // discard colorsused - stbi__get32le(s); // discard max important - if(hsz == 40 || hsz == 56) - { - if(hsz == 56) + stbi__get32le(s); // discard sizeof + stbi__get32le(s); // discard hres + stbi__get32le(s); // discard vres + stbi__get32le(s); // discard colorsused + stbi__get32le(s); // discard max important + if (hsz == 40 || hsz == 56) + { + if (hsz == 56) { stbi__get32le(s); stbi__get32le(s); stbi__get32le(s); stbi__get32le(s); } - if(info->bpp == 16 || info->bpp == 32) + if (info->bpp == 16 || info->bpp == 32) { - if(compress == 0) + if (compress == 0) { - if(info->bpp == 32) + if (info->bpp == 32) { info->mr = 0xffu << 16; info->mg = 0xffu << 8; info->mb = 0xffu << 0; info->ma = 0xffu << 24; - info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 + info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 } else { @@ -5747,13 +5742,13 @@ static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info) info->mb = 31u << 0; } } - else if(compress == 3) + else if (compress == 3) { info->mr = stbi__get32le(s); info->mg = stbi__get32le(s); info->mb = stbi__get32le(s); // not documented, but generated by photoshop and handled by mspaint - if(info->mr == info->mg && info->mg == info->mb) + if (info->mr == info->mg && info->mg == info->mb) { // ?!?!? return stbi__errpuc("bad BMP", "bad BMP"); @@ -5766,25 +5761,25 @@ static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info) else { int i; - if(hsz != 108 && hsz != 124) + if (hsz != 108 && hsz != 124) return stbi__errpuc("bad BMP", "bad BMP"); info->mr = stbi__get32le(s); info->mg = stbi__get32le(s); info->mb = stbi__get32le(s); info->ma = stbi__get32le(s); - stbi__get32le(s); // discard color space - for(i = 0; i < 12; ++i) - stbi__get32le(s); // discard color space parameters - if(hsz == 124) + stbi__get32le(s); // discard color space + for (i = 0; i < 12; ++i) + stbi__get32le(s); // discard color space parameters + if (hsz == 124) { - stbi__get32le(s); // discard rendering intent - stbi__get32le(s); // discard offset of profile data - stbi__get32le(s); // discard size of profile data - stbi__get32le(s); // discard reserved + stbi__get32le(s); // discard rendering intent + stbi__get32le(s); // discard offset of profile data + stbi__get32le(s); // discard size of profile data + stbi__get32le(s); // discard reserved } } } - return ( void* )1; + return (void*)1; } static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req_comp, stbi__result_info* ri) @@ -5798,11 +5793,11 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req STBI_NOTUSED(ri); info.all_a = 255; - if(stbi__bmp_parse_header(s, &info) == NULL) - return NULL; // error code already set + if (stbi__bmp_parse_header(s, &info) == NULL) + return NULL; // error code already set - flip_vertically = (( int )s->img_y) > 0; - s->img_y = abs(( int )s->img_y); + flip_vertically = ((int)s->img_y) > 0; + s->img_y = abs((int)s->img_y); mr = info.mr; mg = info.mg; @@ -5810,53 +5805,53 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req ma = info.ma; all_a = info.all_a; - if(info.hsz == 12) + if (info.hsz == 12) { - if(info.bpp < 24) + if (info.bpp < 24) psize = (info.offset - 14 - 24) / 3; } else { - if(info.bpp < 16) + if (info.bpp < 16) psize = (info.offset - 14 - info.hsz) >> 2; } s->img_n = ma ? 4 : 3; - if(req_comp && req_comp >= 3) // we can directly decode 3 or 4 + if (req_comp && req_comp >= 3) // we can directly decode 3 or 4 target = req_comp; else - target = s->img_n; // if they want monochrome, we'll post-convert + target = s->img_n; // if they want monochrome, we'll post-convert // sanity-check size - if(!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) + if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) return stbi__errpuc("too large", "Corrupt BMP"); - out = ( stbi_uc* )stbi__malloc_mad3(target, s->img_x, s->img_y, 0); - if(!out) + out = (stbi_uc*)stbi__malloc_mad3(target, s->img_x, s->img_y, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); - if(info.bpp < 16) + if (info.bpp < 16) { int z = 0; - if(psize == 0 || psize > 256) + if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); } - for(i = 0; i < psize; ++i) + for (i = 0; i < psize; ++i) { pal[i][2] = stbi__get8(s); pal[i][1] = stbi__get8(s); pal[i][0] = stbi__get8(s); - if(info.hsz != 12) + if (info.hsz != 12) stbi__get8(s); pal[i][3] = 255; } stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4)); - if(info.bpp == 1) + if (info.bpp == 1) width = (s->img_x + 7) >> 3; - else if(info.bpp == 4) + else if (info.bpp == 4) width = (s->img_x + 1) >> 1; - else if(info.bpp == 8) + else if (info.bpp == 8) width = s->img_x; else { @@ -5864,18 +5859,18 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req return stbi__errpuc("bad bpp", "Corrupt BMP"); } pad = (-width) & 3; - if(info.bpp == 1) + if (info.bpp == 1) { - for(j = 0; j < ( int )s->img_y; ++j) + for (j = 0; j < (int)s->img_y; ++j) { int bit_offset = 7, v = stbi__get8(s); - for(i = 0; i < ( int )s->img_x; ++i) + for (i = 0; i < (int)s->img_x; ++i) { int color = (v >> bit_offset) & 0x1; out[z++] = pal[color][0]; out[z++] = pal[color][1]; out[z++] = pal[color][2]; - if((--bit_offset) < 0) + if ((--bit_offset) < 0) { bit_offset = 7; v = stbi__get8(s); @@ -5886,12 +5881,12 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req } else { - for(j = 0; j < ( int )s->img_y; ++j) + for (j = 0; j < (int)s->img_y; ++j) { - for(i = 0; i < ( int )s->img_x; i += 2) + for (i = 0; i < (int)s->img_x; i += 2) { int v = stbi__get8(s), v2 = 0; - if(info.bpp == 4) + if (info.bpp == 4) { v2 = v & 15; v >>= 4; @@ -5899,15 +5894,15 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req out[z++] = pal[v][0]; out[z++] = pal[v][1]; out[z++] = pal[v][2]; - if(target == 4) + if (target == 4) out[z++] = 255; - if(i + 1 == ( int )s->img_x) + if (i + 1 == (int)s->img_x) break; v = (info.bpp == 8) ? stbi__get8(s) : v2; out[z++] = pal[v][0]; out[z++] = pal[v][1]; out[z++] = pal[v][2]; - if(target == 4) + if (target == 4) out[z++] = 255; } stbi__skip(s, pad); @@ -5920,25 +5915,25 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req int z = 0; int easy = 0; stbi__skip(s, info.offset - 14 - info.hsz); - if(info.bpp == 24) + if (info.bpp == 24) width = 3 * s->img_x; - else if(info.bpp == 16) + else if (info.bpp == 16) width = 2 * s->img_x; else /* bpp = 32 and pad = 0 */ width = 0; pad = (-width) & 3; - if(info.bpp == 24) + if (info.bpp == 24) { easy = 1; } - else if(info.bpp == 32) + else if (info.bpp == 32) { - if(mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) + if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) easy = 2; } - if(!easy) + if (!easy) { - if(!mr || !mg || !mb) + if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); @@ -5953,11 +5948,11 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req ashift = stbi__high_bit(ma) - 7; acount = stbi__bitcount(ma); } - for(j = 0; j < ( int )s->img_y; ++j) + for (j = 0; j < (int)s->img_y; ++j) { - if(easy) + if (easy) { - for(i = 0; i < ( int )s->img_x; ++i) + for (i = 0; i < (int)s->img_x; ++i) { unsigned char a; out[z + 2] = stbi__get8(s); @@ -5966,23 +5961,23 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req z += 3; a = (easy == 2 ? stbi__get8(s) : 255); all_a |= a; - if(target == 4) + if (target == 4) out[z++] = a; } } else { int bpp = info.bpp; - for(i = 0; i < ( int )s->img_x; ++i) + for (i = 0; i < (int)s->img_x; ++i) { - stbi__uint32 v = (bpp == 16 ? ( stbi__uint32 )stbi__get16le(s) : stbi__get32le(s)); + stbi__uint32 v = (bpp == 16 ? (stbi__uint32)stbi__get16le(s) : stbi__get32le(s)); unsigned int a; out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount)); out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount)); out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount)); a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255); all_a |= a; - if(target == 4) + if (target == 4) out[z++] = STBI__BYTECAST(a); } } @@ -5991,34 +5986,34 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req } // if alpha channel is all 0s, replace with all 255s - if(target == 4 && all_a == 0) - for(i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4) + if (target == 4 && all_a == 0) + for (i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4) out[i] = 255; - if(flip_vertically) + if (flip_vertically) { stbi_uc t; - for(j = 0; j<( int )s->img_y>> 1; ++j) + for (j = 0; j < (int)s->img_y >> 1; ++j) { stbi_uc* p1 = out + j * s->img_x * target; stbi_uc* p2 = out + (s->img_y - 1 - j) * s->img_x * target; - for(i = 0; i < ( int )s->img_x * target; ++i) + for (i = 0; i < (int)s->img_x * target; ++i) { t = p1[i], p1[i] = p2[i], p2[i] = t; } } } - if(req_comp && req_comp != target) + if (req_comp && req_comp != target) { out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y); - if(out == NULL) - return out; // stbi__convert_format frees input on failure + if (out == NULL) + return out; // stbi__convert_format frees input on failure } *x = s->img_x; *y = s->img_y; - if(comp) + if (comp) *comp = s->img_n; return out; } @@ -6031,25 +6026,25 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16) { // only RGB or RGBA (incl. 16bit) or grey allowed - if(is_rgb16) + if (is_rgb16) *is_rgb16 = 0; - switch(bits_per_pixel) - { - case 8: - return STBI_grey; - case 16: - if(is_grey) - return STBI_grey_alpha; - // fallthrough - case 15: - if(is_rgb16) - *is_rgb16 = 1; - return STBI_rgb; - case 24: // fallthrough - case 32: - return bits_per_pixel / 8; - default: - return 0; + switch (bits_per_pixel) + { + case 8: + return STBI_grey; + case 16: + if (is_grey) + return STBI_grey_alpha; + // fallthrough + case 15: + if (is_rgb16) + *is_rgb16 = 1; + return STBI_rgb; + case 24: // fallthrough + case 32: + return bits_per_pixel / 8; + default: + return 0; } } @@ -6057,58 +6052,58 @@ static int stbi__tga_info(stbi__context* s, int* x, int* y, int* comp) { int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp; int sz, tga_colormap_type; - stbi__get8(s); // discard Offset - tga_colormap_type = stbi__get8(s); // colormap type - if(tga_colormap_type > 1) + stbi__get8(s); // discard Offset + tga_colormap_type = stbi__get8(s); // colormap type + if (tga_colormap_type > 1) { stbi__rewind(s); - return 0; // only RGB or indexed allowed + return 0; // only RGB or indexed allowed } - tga_image_type = stbi__get8(s); // image type - if(tga_colormap_type == 1) - { // colormapped (paletted) image - if(tga_image_type != 1 && tga_image_type != 9) + tga_image_type = stbi__get8(s); // image type + if (tga_colormap_type == 1) + { // colormapped (paletted) image + if (tga_image_type != 1 && tga_image_type != 9) { stbi__rewind(s); return 0; } - stbi__skip(s, 4); // skip index of first colormap entry and number of entries - sz = stbi__get8(s); // check bits per palette color entry - if((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) + stbi__skip(s, 4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) { stbi__rewind(s); return 0; } - stbi__skip(s, 4); // skip image x and y origin + stbi__skip(s, 4); // skip image x and y origin tga_colormap_bpp = sz; } else - { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE - if((tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11)) + { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE + if ((tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11)) { stbi__rewind(s); - return 0; // only RGB or grey allowed, +/- RLE + return 0; // only RGB or grey allowed, +/- RLE } - stbi__skip(s, 9); // skip colormap specification and image x/y origin + stbi__skip(s, 9); // skip colormap specification and image x/y origin tga_colormap_bpp = 0; } tga_w = stbi__get16le(s); - if(tga_w < 1) + if (tga_w < 1) { stbi__rewind(s); - return 0; // test width + return 0; // test width } tga_h = stbi__get16le(s); - if(tga_h < 1) + if (tga_h < 1) { stbi__rewind(s); - return 0; // test height + return 0; // test height } - tga_bits_per_pixel = stbi__get8(s); // bits per pixel - stbi__get8(s); // ignore alpha bits - if(tga_colormap_bpp != 0) + tga_bits_per_pixel = stbi__get8(s); // bits per pixel + stbi__get8(s); // ignore alpha bits + if (tga_colormap_bpp != 0) { - if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) + if ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) { // when using a colormap, tga_bits_per_pixel is the size of the indexes // I don't think anything but 8 or 16bit indexes makes sense @@ -6121,56 +6116,56 @@ static int stbi__tga_info(stbi__context* s, int* x, int* y, int* comp) { tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL); } - if(!tga_comp) + if (!tga_comp) { stbi__rewind(s); return 0; } - if(x) + if (x) *x = tga_w; - if(y) + if (y) *y = tga_h; - if(comp) + if (comp) *comp = tga_comp; - return 1; // seems to have passed everything + return 1; // seems to have passed everything } static int stbi__tga_test(stbi__context* s) { int res = 0; int sz, tga_color_type; - stbi__get8(s); // discard Offset - tga_color_type = stbi__get8(s); // color type - if(tga_color_type > 1) - goto errorEnd; // only RGB or indexed allowed - sz = stbi__get8(s); // image type - if(tga_color_type == 1) - { // colormapped (paletted) image - if(sz != 1 && sz != 9) - goto errorEnd; // colortype 1 demands image type 1 or 9 - stbi__skip(s, 4); // skip index of first colormap entry and number of entries - sz = stbi__get8(s); // check bits per palette color entry - if((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) + stbi__get8(s); // discard Offset + tga_color_type = stbi__get8(s); // color type + if (tga_color_type > 1) + goto errorEnd; // only RGB or indexed allowed + sz = stbi__get8(s); // image type + if (tga_color_type == 1) + { // colormapped (paletted) image + if (sz != 1 && sz != 9) + goto errorEnd; // colortype 1 demands image type 1 or 9 + stbi__skip(s, 4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) goto errorEnd; - stbi__skip(s, 4); // skip image x and y origin + stbi__skip(s, 4); // skip image x and y origin } else - { // "normal" image w/o colormap - if((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11)) - goto errorEnd; // only RGB or grey allowed, +/- RLE - stbi__skip(s, 9); // skip colormap specification and image x/y origin - } - if(stbi__get16le(s) < 1) - goto errorEnd; // test width - if(stbi__get16le(s) < 1) - goto errorEnd; // test height - sz = stbi__get8(s); // bits per pixel - if((tga_color_type == 1) && (sz != 8) && (sz != 16)) - goto errorEnd; // for colormapped images, bpp is size of an index - if((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) + { // "normal" image w/o colormap + if ((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11)) + goto errorEnd; // only RGB or grey allowed, +/- RLE + stbi__skip(s, 9); // skip colormap specification and image x/y origin + } + if (stbi__get16le(s) < 1) + goto errorEnd; // test width + if (stbi__get16le(s) < 1) + goto errorEnd; // test height + sz = stbi__get8(s); // bits per pixel + if ((tga_color_type == 1) && (sz != 8) && (sz != 16)) + goto errorEnd; // for colormapped images, bpp is size of an index + if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) goto errorEnd; - res = 1; // if we got this far, everything's good and we can return 1 instead of 0 + res = 1; // if we got this far, everything's good and we can return 1 instead of 0 errorEnd: stbi__rewind(s); @@ -6180,7 +6175,7 @@ static int stbi__tga_test(stbi__context* s) // read 16bit value and convert to 24bit RGB static void stbi__tga_read_rgb16(stbi__context* s, stbi_uc* out) { - stbi__uint16 px = ( stbi__uint16 )stbi__get16le(s); + stbi__uint16 px = (stbi__uint16)stbi__get16le(s); stbi__uint16 fiveBitMask = 31; // we have 3 channels with 5bits each int r = (px >> 10) & fiveBitMask; @@ -6226,7 +6221,7 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req STBI_NOTUSED(ri); // do a tiny bit of precessing - if(tga_image_type >= 8) + if (tga_image_type >= 8) { tga_image_type -= 8; tga_is_RLE = 1; @@ -6234,33 +6229,33 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req tga_inverted = 1 - ((tga_inverted >> 5) & 1); // If I'm paletted, then I'll use the number of bits from the palette - if(tga_indexed) + if (tga_indexed) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16); else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16); - if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency + if (!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency return stbi__errpuc("bad format", "Can't find out TGA pixelformat"); // tga info *x = tga_width; *y = tga_height; - if(comp) + if (comp) *comp = tga_comp; - if(!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0)) + if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0)) return stbi__errpuc("too large", "Corrupt TGA"); - tga_data = ( unsigned char* )stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0); - if(!tga_data) + tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0); + if (!tga_data) return stbi__errpuc("outofmem", "Out of memory"); // skip to the data's starting position (offset usually = 0) stbi__skip(s, tga_offset); - if(!tga_indexed && !tga_is_RLE && !tga_rgb16) + if (!tga_indexed && !tga_is_RLE && !tga_rgb16) { - for(i = 0; i < tga_height; ++i) + for (i = 0; i < tga_height; ++i) { int row = tga_inverted ? tga_height - i - 1 : i; stbi_uc* tga_row = tga_data + row * tga_width * tga_comp; @@ -6270,28 +6265,28 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req else { // do I need to load a palette? - if(tga_indexed) + if (tga_indexed) { // any data to skip? (offset usually = 0) stbi__skip(s, tga_palette_start); // load the palette - tga_palette = ( unsigned char* )stbi__malloc_mad2(tga_palette_len, tga_comp, 0); - if(!tga_palette) + tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0); + if (!tga_palette) { STBI_FREE(tga_data); return stbi__errpuc("outofmem", "Out of memory"); } - if(tga_rgb16) + if (tga_rgb16) { stbi_uc* pal_entry = tga_palette; STBI_ASSERT(tga_comp == STBI_rgb); - for(i = 0; i < tga_palette_len; ++i) + for (i = 0; i < tga_palette_len; ++i) { stbi__tga_read_rgb16(s, pal_entry); pal_entry += tga_comp; } } - else if(!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) + else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) { STBI_FREE(tga_data); STBI_FREE(tga_palette); @@ -6299,12 +6294,12 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req } } // load the data - for(i = 0; i < tga_width * tga_height; ++i) + for (i = 0; i < tga_width * tga_height; ++i) { // if I'm in RLE mode, do I need to get a RLE stbi__pngchunk? - if(tga_is_RLE) + if (tga_is_RLE) { - if(RLE_count == 0) + if (RLE_count == 0) { // yep, get the next byte as a RLE command int RLE_cmd = stbi__get8(s); @@ -6312,7 +6307,7 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req RLE_repeating = RLE_cmd >> 7; read_next_pixel = 1; } - else if(!RLE_repeating) + else if (!RLE_repeating) { read_next_pixel = 1; } @@ -6322,25 +6317,25 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req read_next_pixel = 1; } // OK, if I need to read a pixel, do it now - if(read_next_pixel) + if (read_next_pixel) { // load however much data we did have - if(tga_indexed) + if (tga_indexed) { // read in index, then perform the lookup int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s); - if(pal_idx >= tga_palette_len) + if (pal_idx >= tga_palette_len) { // invalid index pal_idx = 0; } pal_idx *= tga_comp; - for(j = 0; j < tga_comp; ++j) + for (j = 0; j < tga_comp; ++j) { raw_data[j] = tga_palette[pal_idx + j]; } } - else if(tga_rgb16) + else if (tga_rgb16) { STBI_ASSERT(tga_comp == STBI_rgb); stbi__tga_read_rgb16(s, raw_data); @@ -6348,30 +6343,30 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req else { // read in the data raw - for(j = 0; j < tga_comp; ++j) + for (j = 0; j < tga_comp; ++j) { raw_data[j] = stbi__get8(s); } } // clear the reading flag for the next pixel read_next_pixel = 0; - } // end of reading a pixel + } // end of reading a pixel // copy data - for(j = 0; j < tga_comp; ++j) + for (j = 0; j < tga_comp; ++j) tga_data[i * tga_comp + j] = raw_data[j]; // in case we're in RLE mode, keep counting down --RLE_count; } // do I need to invert the image? - if(tga_inverted) + if (tga_inverted) { - for(j = 0; j * 2 < tga_height; ++j) + for (j = 0; j * 2 < tga_height; ++j) { int index1 = j * tga_width * tga_comp; int index2 = (tga_height - 1 - j) * tga_width * tga_comp; - for(i = tga_width * tga_comp; i > 0; --i) + for (i = tga_width * tga_comp; i > 0; --i) { unsigned char temp = tga_data[index1]; tga_data[index1] = tga_data[index2]; @@ -6382,17 +6377,17 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req } } // clear my palette, if I had one - if(tga_palette != NULL) + if (tga_palette != NULL) { STBI_FREE(tga_palette); } } // swap RGB - if the source data was RGB16, it already is in the right order - if(tga_comp >= 3 && !tga_rgb16) + if (tga_comp >= 3 && !tga_rgb16) { unsigned char* tga_pixel = tga_data; - for(i = 0; i < tga_width * tga_height; ++i) + for (i = 0; i < tga_width * tga_height; ++i) { unsigned char temp = tga_pixel[0]; tga_pixel[0] = tga_pixel[2]; @@ -6402,7 +6397,7 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req } // convert to target component count - if(req_comp && req_comp != tga_comp) + if (req_comp && req_comp != tga_comp) tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height); // the things I do to get rid of an error message, and yet keep @@ -6429,38 +6424,38 @@ static int stbi__psd_decode_rle(stbi__context* s, stbi_uc* p, int pixelCount) int count, nleft, len; count = 0; - while((nleft = pixelCount - count) > 0) + while ((nleft = pixelCount - count) > 0) { len = stbi__get8(s); - if(len == 128) + if (len == 128) { // No-op. } - else if(len < 128) + else if (len < 128) { // Copy next len+1 bytes literally. len++; - if(len > nleft) - return 0; // corrupt data + if (len > nleft) + return 0; // corrupt data count += len; - while(len) + while (len) { *p = stbi__get8(s); p += 4; len--; } } - else if(len > 128) + else if (len > 128) { stbi_uc val; // Next -len+1 bytes in the dest are replicated from next source byte. // (Interpret len as a negative 8-bit int.) len = 257 - len; - if(len > nleft) - return 0; // corrupt data + if (len > nleft) + return 0; // corrupt data val = stbi__get8(s); count += len; - while(len) + while (len) { *p = val; p += 4; @@ -6483,11 +6478,11 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req STBI_NOTUSED(ri); // Check identifier - if(stbi__get32be(s) != 0x38425053) // "8BPS" + if (stbi__get32be(s) != 0x38425053) // "8BPS" return stbi__errpuc("not PSD", "Corrupt PSD image"); // Check file type version. - if(stbi__get16be(s) != 1) + if (stbi__get16be(s) != 1) return stbi__errpuc("wrong version", "Unsupported version of PSD image"); // Skip 6 reserved bytes. @@ -6495,7 +6490,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req // Read the number of channels (R, G, B, A, etc). channelCount = stbi__get16be(s); - if(channelCount < 0 || channelCount > 16) + if (channelCount < 0 || channelCount > 16) return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image"); // Read the rows and columns of the image. @@ -6504,7 +6499,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req // Make sure the depth is 8 bits. bitdepth = stbi__get16be(s); - if(bitdepth != 8 && bitdepth != 16) + if (bitdepth != 8 && bitdepth != 16) return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit"); // Make sure the color mode is RGB. @@ -6517,7 +6512,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req // 7: Multichannel // 8: Duotone // 9: Lab color - if(stbi__get16be(s) != 3) + if (stbi__get16be(s) != 3) return stbi__errpuc("wrong color format", "PSD is not in RGB color format"); // Skip the Mode Data. (It's the palette for indexed color; other info for other modes.) @@ -6534,24 +6529,24 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req // 0: no compression // 1: RLE compressed compression = stbi__get16be(s); - if(compression > 1) + if (compression > 1) return stbi__errpuc("bad compression", "PSD has an unknown compression format"); // Check size - if(!stbi__mad3sizes_valid(4, w, h, 0)) + if (!stbi__mad3sizes_valid(4, w, h, 0)) return stbi__errpuc("too large", "Corrupt PSD"); // Create the destination image. - if(!compression && bitdepth == 16 && bpc == 16) + if (!compression && bitdepth == 16 && bpc == 16) { - out = ( stbi_uc* )stbi__malloc_mad3(8, w, h, 0); + out = (stbi_uc*)stbi__malloc_mad3(8, w, h, 0); ri->bits_per_channel = 16; } else - out = ( stbi_uc* )stbi__malloc(4 * (size_t)w * h); + out = (stbi_uc*)stbi__malloc(4 * (size_t)w * h); - if(!out) + if (!out) return stbi__errpuc("outofmem", "Out of memory"); pixelCount = w * h; @@ -6559,7 +6554,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req // memset( out, 0, pixelCount * 4 ); // Finally, the image data. - if(compression) + if (compression) { // RLE as used by .PSD and .TIFF // Loop until you get the number of unpacked bytes you are expecting: @@ -6574,21 +6569,21 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req stbi__skip(s, h * channelCount * 2); // Read the RLE data by channel. - for(channel = 0; channel < 4; channel++) + for (channel = 0; channel < 4; channel++) { stbi_uc* p; p = out + channel; - if(channel >= channelCount) + if (channel >= channelCount) { // Fill this channel with default data. - for(i = 0; i < pixelCount; i++, p += 4) + for (i = 0; i < pixelCount; i++, p += 4) *p = (channel == 3 ? 255 : 0); } else { // Read the RLE data. - if(!stbi__psd_decode_rle(s, p, pixelCount)) + if (!stbi__psd_decode_rle(s, p, pixelCount)) { STBI_FREE(out); return stbi__errpuc("corrupt", "bad RLE data"); @@ -6602,45 +6597,45 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image. // Read the data by channel. - for(channel = 0; channel < 4; channel++) + for (channel = 0; channel < 4; channel++) { - if(channel >= channelCount) + if (channel >= channelCount) { // Fill this channel with default data. - if(bitdepth == 16 && bpc == 16) + if (bitdepth == 16 && bpc == 16) { - stbi__uint16* q = (( stbi__uint16* )out) + channel; + stbi__uint16* q = ((stbi__uint16*)out) + channel; stbi__uint16 val = channel == 3 ? 65535 : 0; - for(i = 0; i < pixelCount; i++, q += 4) + for (i = 0; i < pixelCount; i++, q += 4) *q = val; } else { stbi_uc* p = out + channel; stbi_uc val = channel == 3 ? 255 : 0; - for(i = 0; i < pixelCount; i++, p += 4) + for (i = 0; i < pixelCount; i++, p += 4) *p = val; } } else { - if(ri->bits_per_channel == 16) - { // output bpc - stbi__uint16* q = (( stbi__uint16* )out) + channel; - for(i = 0; i < pixelCount; i++, q += 4) - *q = ( stbi__uint16 )stbi__get16be(s); + if (ri->bits_per_channel == 16) + { // output bpc + stbi__uint16* q = ((stbi__uint16*)out) + channel; + for (i = 0; i < pixelCount; i++, q += 4) + *q = (stbi__uint16)stbi__get16be(s); } else { stbi_uc* p = out + channel; - if(bitdepth == 16) - { // input bpc - for(i = 0; i < pixelCount; i++, p += 4) + if (bitdepth == 16) + { // input bpc + for (i = 0; i < pixelCount; i++, p += 4) *p = (stbi_uc)(stbi__get16be(s) >> 8); } else { - for(i = 0; i < pixelCount; i++, p += 4) + for (i = 0; i < pixelCount; i++, p += 4) *p = stbi__get8(s); } } @@ -6649,14 +6644,14 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req } // remove weird white matte from PSD - if(channelCount >= 4) + if (channelCount >= 4) { - if(ri->bits_per_channel == 16) + if (ri->bits_per_channel == 16) { - for(i = 0; i < w * h; ++i) + for (i = 0; i < w * h; ++i) { - stbi__uint16* pixel = ( stbi__uint16* )out + 4 * i; - if(pixel[3] != 0 && pixel[3] != 65535) + stbi__uint16* pixel = (stbi__uint16*)out + 4 * i; + if (pixel[3] != 0 && pixel[3] != 65535) { float a = pixel[3] / 65535.0f; float ra = 1.0f / a; @@ -6669,34 +6664,34 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req } else { - for(i = 0; i < w * h; ++i) + for (i = 0; i < w * h; ++i) { unsigned char* pixel = out + 4 * i; - if(pixel[3] != 0 && pixel[3] != 255) + if (pixel[3] != 0 && pixel[3] != 255) { float a = pixel[3] / 255.0f; float ra = 1.0f / a; float inv_a = 255.0f * (1 - ra); - pixel[0] = ( unsigned char )(pixel[0] * ra + inv_a); - pixel[1] = ( unsigned char )(pixel[1] * ra + inv_a); - pixel[2] = ( unsigned char )(pixel[2] * ra + inv_a); + pixel[0] = (unsigned char)(pixel[0] * ra + inv_a); + pixel[1] = (unsigned char)(pixel[1] * ra + inv_a); + pixel[2] = (unsigned char)(pixel[2] * ra + inv_a); } } } } // convert to desired output format - if(req_comp && req_comp != 4) + if (req_comp && req_comp != 4) { - if(ri->bits_per_channel == 16) - out = ( stbi_uc* )stbi__convert_format16(( stbi__uint16* )out, 4, req_comp, w, h); + if (ri->bits_per_channel == 16) + out = (stbi_uc*)stbi__convert_format16((stbi__uint16*)out, 4, req_comp, w, h); else out = stbi__convert_format(out, 4, req_comp, w, h); - if(out == NULL) - return out; // stbi__convert_format frees input on failure + if (out == NULL) + return out; // stbi__convert_format frees input on failure } - if(comp) + if (comp) *comp = 4; *y = h; *x = w; @@ -6716,8 +6711,8 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req static int stbi__pic_is4(stbi__context* s, const char* str) { int i; - for(i = 0; i < 4; ++i) - if(stbi__get8(s) != ( stbi_uc )str[i]) + for (i = 0; i < 4; ++i) + if (stbi__get8(s) != (stbi_uc)str[i]) return 0; return 1; @@ -6727,13 +6722,13 @@ static int stbi__pic_test_core(stbi__context* s) { int i; - if(!stbi__pic_is4(s, "\x53\x80\xF6\x34")) + if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) return 0; - for(i = 0; i < 84; ++i) + for (i = 0; i < 84; ++i) stbi__get8(s); - if(!stbi__pic_is4(s, "PICT")) + if (!stbi__pic_is4(s, "PICT")) return 0; return 1; @@ -6748,11 +6743,11 @@ static stbi_uc* stbi__readval(stbi__context* s, int channel, stbi_uc* dest) { int mask = 0x80, i; - for(i = 0; i < 4; ++i, mask >>= 1) + for (i = 0; i < 4; ++i, mask >>= 1) { - if(channel & mask) + if (channel & mask) { - if(stbi__at_eof(s)) + if (stbi__at_eof(s)) return stbi__errpuc("bad file", "PIC file too short"); dest[i] = stbi__get8(s); } @@ -6765,8 +6760,8 @@ static void stbi__copyval(int channel, stbi_uc* dest, const stbi_uc* src) { int mask = 0x80, i; - for(i = 0; i < 4; ++i, mask >>= 1) - if(channel & mask) + for (i = 0; i < 4; ++i, mask >>= 1) + if (channel & mask) dest[i] = src[i]; } @@ -6781,7 +6776,7 @@ static stbi_uc* stbi__pic_load_core(stbi__context* s, int width, int height, int { stbi__pic_packet* packet; - if(num_packets == sizeof(packets) / sizeof(packets[0])) + if (num_packets == sizeof(packets) / sizeof(packets[0])) return stbi__errpuc("bad format", "too many packets"); packet = &packets[num_packets++]; @@ -6793,103 +6788,103 @@ static stbi_uc* stbi__pic_load_core(stbi__context* s, int width, int height, int act_comp |= packet->channel; - if(stbi__at_eof(s)) + if (stbi__at_eof(s)) return stbi__errpuc("bad file", "file too short (reading packets)"); - if(packet->size != 8) + if (packet->size != 8) return stbi__errpuc("bad format", "packet isn't 8bpp"); - } while(chained); + } while (chained); - *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel? + *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel? - for(y = 0; y < height; ++y) + for (y = 0; y < height; ++y) { int packet_idx; - for(packet_idx = 0; packet_idx < num_packets; ++packet_idx) + for (packet_idx = 0; packet_idx < num_packets; ++packet_idx) { stbi__pic_packet* packet = &packets[packet_idx]; stbi_uc* dest = result + y * width * 4; - switch(packet->type) + switch (packet->type) { - default: - return stbi__errpuc("bad format", "packet has bad compression type"); + default: + return stbi__errpuc("bad format", "packet has bad compression type"); - case 0: - { // uncompressed - int x; + case 0: + { // uncompressed + int x; - for(x = 0; x < width; ++x, dest += 4) - if(!stbi__readval(s, packet->channel, dest)) - return 0; - break; - } + for (x = 0; x < width; ++x, dest += 4) + if (!stbi__readval(s, packet->channel, dest)) + return 0; + break; + } - case 1: // Pure RLE - { - int left = width, i; + case 1: // Pure RLE + { + int left = width, i; - while(left > 0) - { - stbi_uc count, value[4]; + while (left > 0) + { + stbi_uc count, value[4]; - count = stbi__get8(s); - if(stbi__at_eof(s)) - return stbi__errpuc("bad file", "file too short (pure read count)"); + count = stbi__get8(s); + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "file too short (pure read count)"); - if(count > left) - count = ( stbi_uc )left; + if (count > left) + count = (stbi_uc)left; - if(!stbi__readval(s, packet->channel, value)) - return 0; + if (!stbi__readval(s, packet->channel, value)) + return 0; - for(i = 0; i < count; ++i, dest += 4) - stbi__copyval(packet->channel, dest, value); - left -= count; - } + for (i = 0; i < count; ++i, dest += 4) + stbi__copyval(packet->channel, dest, value); + left -= count; } - break; + } + break; - case 2: - { // Mixed RLE - int left = width; - while(left > 0) - { - int count = stbi__get8(s), i; - if(stbi__at_eof(s)) - return stbi__errpuc("bad file", "file too short (mixed read count)"); + case 2: + { // Mixed RLE + int left = width; + while (left > 0) + { + int count = stbi__get8(s), i; + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "file too short (mixed read count)"); - if(count >= 128) - { // Repeated - stbi_uc value[4]; + if (count >= 128) + { // Repeated + stbi_uc value[4]; - if(count == 128) - count = stbi__get16be(s); - else - count -= 127; - if(count > left) - return stbi__errpuc("bad file", "scanline overrun"); + if (count == 128) + count = stbi__get16be(s); + else + count -= 127; + if (count > left) + return stbi__errpuc("bad file", "scanline overrun"); - if(!stbi__readval(s, packet->channel, value)) - return 0; + if (!stbi__readval(s, packet->channel, value)) + return 0; - for(i = 0; i < count; ++i, dest += 4) - stbi__copyval(packet->channel, dest, value); - } - else - { // Raw - ++count; - if(count > left) - return stbi__errpuc("bad file", "scanline overrun"); + for (i = 0; i < count; ++i, dest += 4) + stbi__copyval(packet->channel, dest, value); + } + else + { // Raw + ++count; + if (count > left) + return stbi__errpuc("bad file", "scanline overrun"); - for(i = 0; i < count; ++i, dest += 4) - if(!stbi__readval(s, packet->channel, dest)) - return 0; - } - left -= count; + for (i = 0; i < count; ++i, dest += 4) + if (!stbi__readval(s, packet->channel, dest)) + return 0; } - break; + left -= count; } + break; + } } } } @@ -6903,35 +6898,35 @@ static void* stbi__pic_load(stbi__context* s, int* px, int* py, int* comp, int r int i, x, y, internal_comp; STBI_NOTUSED(ri); - if(!comp) + if (!comp) comp = &internal_comp; - for(i = 0; i < 92; ++i) + for (i = 0; i < 92; ++i) stbi__get8(s); x = stbi__get16be(s); y = stbi__get16be(s); - if(stbi__at_eof(s)) + if (stbi__at_eof(s)) return stbi__errpuc("bad file", "file too short (pic header)"); - if(!stbi__mad3sizes_valid(x, y, 4, 0)) + if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode"); - stbi__get32be(s); // skip `ratio' - stbi__get16be(s); // skip `fields' - stbi__get16be(s); // skip `pad' + stbi__get32be(s); // skip `ratio' + stbi__get16be(s); // skip `fields' + stbi__get16be(s); // skip `pad' // intermediate buffer is RGBA - result = ( stbi_uc* )stbi__malloc_mad3(x, y, 4, 0); + result = (stbi_uc*)stbi__malloc_mad3(x, y, 4, 0); memset(result, 0xff, (size_t)x * y * 4); - if(!stbi__pic_load_core(s, x, y, comp, result)) + if (!stbi__pic_load_core(s, x, y, comp, result)) { STBI_FREE(result); result = 0; } *px = x; *py = y; - if(req_comp == 0) + if (req_comp == 0) req_comp = *comp; result = stbi__convert_format(result, 4, req_comp, x, y); @@ -6960,8 +6955,8 @@ typedef struct typedef struct { int w, h; - stbi_uc* out; // output buffer (always 4 components) - stbi_uc* background; // The current "background" as far as a gif is concerned + stbi_uc* out; // output buffer (always 4 components) + stbi_uc* background; // The current "background" as far as a gif is concerned stbi_uc* history; int flags, bgindex, ratio, transparent, eflags; stbi_uc pal[256][4]; @@ -6980,12 +6975,12 @@ typedef struct static int stbi__gif_test_raw(stbi__context* s) { int sz; - if(stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0; sz = stbi__get8(s); - if(sz != '9' && sz != '7') + if (sz != '9' && sz != '7') return 0; - if(stbi__get8(s) != 'a') + if (stbi__get8(s) != 'a') return 0; return 1; } @@ -7000,7 +6995,7 @@ static int stbi__gif_test(stbi__context* s) static void stbi__gif_parse_colortable(stbi__context* s, stbi_uc pal[256][4], int num_entries, int transp) { int i; - for(i = 0; i < num_entries; ++i) + for (i = 0; i < num_entries; ++i) { pal[i][2] = stbi__get8(s); pal[i][1] = stbi__get8(s); @@ -7012,13 +7007,13 @@ static void stbi__gif_parse_colortable(stbi__context* s, stbi_uc pal[256][4], in static int stbi__gif_header(stbi__context* s, stbi__gif* g, int* comp, int is_info) { stbi_uc version; - if(stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return stbi__err("not GIF", "Corrupt GIF"); version = stbi__get8(s); - if(version != '7' && version != '9') + if (version != '7' && version != '9') return stbi__err("not GIF", "Corrupt GIF"); - if(stbi__get8(s) != 'a') + if (stbi__get8(s) != 'a') return stbi__err("not GIF", "Corrupt GIF"); stbi__g_failure_reason = ""; @@ -7029,13 +7024,13 @@ static int stbi__gif_header(stbi__context* s, stbi__gif* g, int* comp, int is_in g->ratio = stbi__get8(s); g->transparent = -1; - if(comp != 0) - *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments + if (comp != 0) + *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments - if(is_info) + if (is_info) return 1; - if(g->flags & 0x80) + if (g->flags & 0x80) stbi__gif_parse_colortable(s, g->pal, 2 << (g->flags & 7), -1); return 1; @@ -7043,16 +7038,16 @@ static int stbi__gif_header(stbi__context* s, stbi__gif* g, int* comp, int is_in static int stbi__gif_info_raw(stbi__context* s, int* x, int* y, int* comp) { - stbi__gif* g = ( stbi__gif* )stbi__malloc(sizeof(stbi__gif)); - if(!stbi__gif_header(s, g, comp, 1)) + stbi__gif* g = (stbi__gif*)stbi__malloc(sizeof(stbi__gif)); + if (!stbi__gif_header(s, g, comp, 1)) { STBI_FREE(g); stbi__rewind(s); return 0; } - if(x) + if (x) *x = g->w; - if(y) + if (y) *y = g->h; STBI_FREE(g); return 1; @@ -7065,10 +7060,10 @@ static void stbi__out_gif_code(stbi__gif* g, stbi__uint16 code) // recurse to decode the prefixes, since the linked-list is backwards, // and working backwards through an interleaved image would be nasty - if(g->codes[code].prefix >= 0) + if (g->codes[code].prefix >= 0) stbi__out_gif_code(g, g->codes[code].prefix); - if(g->cur_y >= g->max_y) + if (g->cur_y >= g->max_y) return; idx = g->cur_x + g->cur_y; @@ -7076,8 +7071,8 @@ static void stbi__out_gif_code(stbi__gif* g, stbi__uint16 code) g->history[idx / 4] = 1; c = &g->color_table[g->codes[code].suffix * 4]; - if(c[3] > 128) - { // don't render transparent pixels; + if (c[3] > 128) + { // don't render transparent pixels; p[0] = c[2]; p[1] = c[1]; p[2] = c[0]; @@ -7085,12 +7080,12 @@ static void stbi__out_gif_code(stbi__gif* g, stbi__uint16 code) } g->cur_x += 4; - if(g->cur_x >= g->max_x) + if (g->cur_x >= g->max_x) { g->cur_x = g->start_x; g->cur_y += g->step; - while(g->cur_y >= g->max_y && g->parse > 0) + while (g->cur_y >= g->max_y && g->parse > 0) { g->step = (1 << g->parse) * g->line_size; g->cur_y = g->start_y + (g->step >> 1); @@ -7108,7 +7103,7 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g) stbi__gif_lzw* p; lzw_cs = stbi__get8(s); - if(lzw_cs > 12) + if (lzw_cs > 12) return NULL; clear = 1 << lzw_cs; first = 1; @@ -7116,11 +7111,11 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g) codemask = (1 << codesize) - 1; bits = 0; valid_bits = 0; - for(init_code = 0; init_code < clear; init_code++) + for (init_code = 0; init_code < clear; init_code++) { g->codes[init_code].prefix = -1; - g->codes[init_code].first = ( stbi_uc )init_code; - g->codes[init_code].suffix = ( stbi_uc )init_code; + g->codes[init_code].first = (stbi_uc)init_code; + g->codes[init_code].suffix = (stbi_uc)init_code; } // support no starting clear code @@ -7128,18 +7123,18 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g) oldcode = -1; len = 0; - for(;;) + for (;;) { - if(valid_bits < codesize) + if (valid_bits < codesize) { - if(len == 0) + if (len == 0) { - len = stbi__get8(s); // start new block - if(len == 0) + len = stbi__get8(s); // start new block + if (len == 0) return g->out; } --len; - bits |= ( stbi__int32 )stbi__get8(s) << valid_bits; + bits |= (stbi__int32)stbi__get8(s) << valid_bits; valid_bits += 8; } else @@ -7148,46 +7143,46 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g) bits >>= codesize; valid_bits -= codesize; // @OPTIMIZE: is there some way we can accelerate the non-clear path? - if(code == clear) - { // clear code + if (code == clear) + { // clear code codesize = lzw_cs + 1; codemask = (1 << codesize) - 1; avail = clear + 2; oldcode = -1; first = 0; } - else if(code == clear + 1) - { // end of stream code + else if (code == clear + 1) + { // end of stream code stbi__skip(s, len); - while((len = stbi__get8(s)) > 0) + while ((len = stbi__get8(s)) > 0) stbi__skip(s, len); return g->out; } - else if(code <= avail) + else if (code <= avail) { - if(first) + if (first) { return stbi__errpuc("no clear code", "Corrupt GIF"); } - if(oldcode >= 0) + if (oldcode >= 0) { p = &g->codes[avail++]; - if(avail > 8192) + if (avail > 8192) { return stbi__errpuc("too many codes", "Corrupt GIF"); } - p->prefix = ( stbi__int16 )oldcode; + p->prefix = (stbi__int16)oldcode; p->first = g->codes[oldcode].first; p->suffix = (code == avail) ? p->first : g->codes[code].first; } - else if(code == avail) + else if (code == avail) return stbi__errpuc("illegal code in raster", "Corrupt GIF"); - stbi__out_gif_code(g, ( stbi__uint16 )code); + stbi__out_gif_code(g, (stbi__uint16)code); - if((avail & codemask) == 0 && avail <= 0x0FFF) + if ((avail & codemask) == 0 && avail <= 0x0FFF) { codesize++; codemask = (1 << codesize) - 1; @@ -7214,22 +7209,22 @@ static stbi_uc* stbi__gif_load_next(stbi__context* s, stbi__gif* g, int* comp, i // on first frame, any non-written pixels get the background colour (non-transparent) first_frame = 0; - if(g->out == 0) - { - if(!stbi__gif_header(s, g, comp, 0)) - return 0; // stbi__g_failure_reason set by stbi__gif_header - g->out = ( stbi_uc* )stbi__malloc(4 * (size_t)(g->w) * g->h); - g->background = ( stbi_uc* )stbi__malloc(4 * (size_t)(g->w) * g->h); - g->history = ( stbi_uc* )stbi__malloc((size_t)(g->w) * g->h); - if(g->out == 0) + if (g->out == 0) + { + if (!stbi__gif_header(s, g, comp, 0)) + return 0; // stbi__g_failure_reason set by stbi__gif_header + g->out = (stbi_uc*)stbi__malloc(4 * (size_t)(g->w) * g->h); + g->background = (stbi_uc*)stbi__malloc(4 * (size_t)(g->w) * g->h); + g->history = (stbi_uc*)stbi__malloc((size_t)(g->w) * g->h); + if (g->out == 0) return stbi__errpuc("outofmem", "Out of memory"); // image is treated as "tranparent" at the start - ie, nothing overwrites the current background; // background colour is only used for pixels that are not rendered first frame, after that "background" // color refers to teh color that was there the previous frame. memset(g->out, 0x00, 4 * (size_t)(g->w) * g->h); - memset(g->background, 0x00, 4 * (size_t)(g->w) * g->h); // state of the background (starts transparent) - memset(g->history, 0x00, (size_t)(g->w) * g->h); // pixels that were affected previous frame + memset(g->background, 0x00, 4 * (size_t)(g->w) * g->h); // state of the background (starts transparent) + memset(g->history, 0x00, (size_t)(g->w) * g->h); // pixels that were affected previous frame first_frame = 1; } else @@ -7238,27 +7233,27 @@ static stbi_uc* stbi__gif_load_next(stbi__context* s, stbi__gif* g, int* comp, i dispose = (g->eflags & 0x1C) >> 2; pcount = g->w * g->h; - if((dispose == 3) && (two_back == 0)) + if ((dispose == 3) && (two_back == 0)) { - dispose = 2; // if I don't have an image to revert back to, default to the old background + dispose = 2; // if I don't have an image to revert back to, default to the old background } - if(dispose == 3) - { // use previous graphic - for(pi = 0; pi < pcount; ++pi) + if (dispose == 3) + { // use previous graphic + for (pi = 0; pi < pcount; ++pi) { - if(g->history[pi]) + if (g->history[pi]) { memcpy(&g->out[pi * 4], &two_back[pi * 4], 4); } } } - else if(dispose == 2) + else if (dispose == 2) { // restore what was changed last frame to background before that frame; - for(pi = 0; pi < pcount; ++pi) + for (pi = 0; pi < pcount; ++pi) { - if(g->history[pi]) + if (g->history[pi]) { memcpy(&g->out[pi * 4], &g->background[pi * 4], 4); } @@ -7277,139 +7272,139 @@ static stbi_uc* stbi__gif_load_next(stbi__context* s, stbi__gif* g, int* comp, i } // clear my history; - memset(g->history, 0x00, (size_t)(g->w) * g->h); // pixels that were affected previous frame + memset(g->history, 0x00, (size_t)(g->w) * g->h); // pixels that were affected previous frame - for(;;) + for (;;) { int tag = stbi__get8(s); - switch(tag) + switch (tag) + { + case 0x2C: /* Image Descriptor */ { - case 0x2C: /* Image Descriptor */ + stbi__int32 x, y, w, h; + stbi_uc* o; + + x = stbi__get16le(s); + y = stbi__get16le(s); + w = stbi__get16le(s); + h = stbi__get16le(s); + if (((x + w) > (g->w)) || ((y + h) > (g->h))) + return stbi__errpuc("bad Image Descriptor", "Corrupt GIF"); + + g->line_size = g->w * 4; + g->start_x = x * 4; + g->start_y = y * g->line_size; + g->max_x = g->start_x + w * 4; + g->max_y = g->start_y + h * g->line_size; + g->cur_x = g->start_x; + g->cur_y = g->start_y; + + g->lflags = stbi__get8(s); + + if (g->lflags & 0x40) { - stbi__int32 x, y, w, h; - stbi_uc* o; - - x = stbi__get16le(s); - y = stbi__get16le(s); - w = stbi__get16le(s); - h = stbi__get16le(s); - if(((x + w) > (g->w)) || ((y + h) > (g->h))) - return stbi__errpuc("bad Image Descriptor", "Corrupt GIF"); - - g->line_size = g->w * 4; - g->start_x = x * 4; - g->start_y = y * g->line_size; - g->max_x = g->start_x + w * 4; - g->max_y = g->start_y + h * g->line_size; - g->cur_x = g->start_x; - g->cur_y = g->start_y; - - g->lflags = stbi__get8(s); - - if(g->lflags & 0x40) - { - g->step = 8 * g->line_size; // first interlaced spacing - g->parse = 3; - } - else - { - g->step = g->line_size; - g->parse = 0; - } + g->step = 8 * g->line_size; // first interlaced spacing + g->parse = 3; + } + else + { + g->step = g->line_size; + g->parse = 0; + } - if(g->lflags & 0x80) - { - stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7), - g->eflags & 0x01 ? g->transparent : -1); - g->color_table = ( stbi_uc* )g->lpal; - } - else if(g->flags & 0x80) - { - g->color_table = ( stbi_uc* )g->pal; - } - else - return stbi__errpuc("missing color table", "Corrupt GIF"); + if (g->lflags & 0x80) + { + stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7), + g->eflags & 0x01 ? g->transparent : -1); + g->color_table = (stbi_uc*)g->lpal; + } + else if (g->flags & 0x80) + { + g->color_table = (stbi_uc*)g->pal; + } + else + return stbi__errpuc("missing color table", "Corrupt GIF"); - o = stbi__process_gif_raster(s, g); - if(o == NULL) - return NULL; + o = stbi__process_gif_raster(s, g); + if (o == NULL) + return NULL; - // if this was the first frame, - pcount = g->w * g->h; - if(first_frame && (g->bgindex > 0)) + // if this was the first frame, + pcount = g->w * g->h; + if (first_frame && (g->bgindex > 0)) + { + // if first frame, any pixel not drawn to gets the background color + for (pi = 0; pi < pcount; ++pi) { - // if first frame, any pixel not drawn to gets the background color - for(pi = 0; pi < pcount; ++pi) + if (g->history[pi] == 0) { - if(g->history[pi] == 0) - { - g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will - // be reset next frame if need be; - memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4); - } + g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will + // be reset next frame if need be; + memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4); } } - - return o; } - case 0x21: // Comment Extension. - { - int len; - int ext = stbi__get8(s); - if(ext == 0xF9) - { // Graphic Control Extension. - len = stbi__get8(s); - if(len == 4) - { - g->eflags = stbi__get8(s); - g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths. + return o; + } - // unset old transparent - if(g->transparent >= 0) - { - g->pal[g->transparent][3] = 255; - } - if(g->eflags & 0x01) - { - g->transparent = stbi__get8(s); - if(g->transparent >= 0) - { - g->pal[g->transparent][3] = 0; - } - } - else + case 0x21: // Comment Extension. + { + int len; + int ext = stbi__get8(s); + if (ext == 0xF9) + { // Graphic Control Extension. + len = stbi__get8(s); + if (len == 4) + { + g->eflags = stbi__get8(s); + g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths. + + // unset old transparent + if (g->transparent >= 0) + { + g->pal[g->transparent][3] = 255; + } + if (g->eflags & 0x01) + { + g->transparent = stbi__get8(s); + if (g->transparent >= 0) { - // don't need transparent - stbi__skip(s, 1); - g->transparent = -1; + g->pal[g->transparent][3] = 0; } } else { - stbi__skip(s, len); - break; + // don't need transparent + stbi__skip(s, 1); + g->transparent = -1; } } - while((len = stbi__get8(s)) != 0) + else { stbi__skip(s, len); + break; } - break; } + while ((len = stbi__get8(s)) != 0) + { + stbi__skip(s, len); + } + break; + } - case 0x3B: // gif stream termination code - return ( stbi_uc* )s; // using '1' causes warning on some compilers + case 0x3B: // gif stream termination code + return (stbi_uc*)s; // using '1' causes warning on some compilers - default: - return stbi__errpuc("unknown code", "Corrupt GIF"); + default: + return stbi__errpuc("unknown code", "Corrupt GIF"); } } } static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y, int* z, int* comp, int req_comp) { - if(stbi__gif_test(s)) + if (stbi__gif_test(s)) { int layers = 0; stbi_uc* u = 0; @@ -7418,7 +7413,7 @@ static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y, stbi__gif g; int stride; memset(&g, 0, sizeof(g)); - if(delays) + if (delays) { *delays = 0; } @@ -7426,44 +7421,44 @@ static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y, do { u = stbi__gif_load_next(s, &g, comp, req_comp, two_back); - if(u == ( stbi_uc* )s) - u = 0; // end of animated gif marker + if (u == (stbi_uc*)s) + u = 0; // end of animated gif marker - if(u) + if (u) { *x = g.w; *y = g.h; ++layers; stride = g.w * g.h * 4; - if(out) + if (out) { - out = ( stbi_uc* )STBI_REALLOC(out, (size_t)layers * stride); - if(delays) + out = (stbi_uc*)STBI_REALLOC(out, (size_t)layers * stride); + if (delays) { - *delays = ( int* )STBI_REALLOC(*delays, sizeof(int) * layers); + *delays = (int*)STBI_REALLOC(*delays, sizeof(int) * layers); } } else { - out = ( stbi_uc* )stbi__malloc((size_t)layers * stride); - if(delays) + out = (stbi_uc*)stbi__malloc((size_t)layers * stride); + if (delays) { - *delays = ( int* )stbi__malloc(layers * sizeof(int)); + *delays = (int*)stbi__malloc(layers * sizeof(int)); } } memcpy(out + ((layers - 1) * stride), u, stride); - if(layers >= 2) + if (layers >= 2) { two_back = out - 2 * stride; } - if(delays) + if (delays) { (*delays)[layers - 1U] = g.delay; } } - } while(u != 0); + } while (u != 0); // free temp buffer; STBI_FREE(g.out); @@ -7471,7 +7466,7 @@ static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y, STBI_FREE(g.background); // do the final conversion after loading everything; - if(req_comp && req_comp != 4) + if (req_comp && req_comp != 4) out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h); *z = layers; @@ -7490,16 +7485,16 @@ static void* stbi__gif_load(stbi__context* s, int* x, int* y, int* comp, int req memset(&g, 0, sizeof(g)); u = stbi__gif_load_next(s, &g, comp, req_comp, 0); - if(u == ( stbi_uc* )s) - u = 0; // end of animated gif marker - if(u) + if (u == (stbi_uc*)s) + u = 0; // end of animated gif marker + if (u) { *x = g.w; *y = g.h; // moved conversion to after successful load so that the same // can be done for multiple frames. - if(req_comp && req_comp != 4) + if (req_comp && req_comp != 4) u = stbi__convert_format(u, 4, req_comp, g.w, g.h); } @@ -7523,8 +7518,8 @@ static int stbi__gif_info(stbi__context* s, int* x, int* y, int* comp) static int stbi__hdr_test_core(stbi__context* s, const char* signature) { int i; - for(i = 0; signature[i]; ++i) - if(stbi__get8(s) != signature[i]) + for (i = 0; signature[i]; ++i) + if (stbi__get8(s) != signature[i]) return 0; stbi__rewind(s); return 1; @@ -7534,7 +7529,7 @@ static int stbi__hdr_test(stbi__context* s) { int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); stbi__rewind(s); - if(!r) + if (!r) { r = stbi__hdr_test_core(s, "#?RGBE\n"); stbi__rewind(s); @@ -7548,19 +7543,19 @@ static char* stbi__hdr_gettoken(stbi__context* z, char* buffer) int len = 0; char c = '\0'; - c = ( char )stbi__get8(z); + c = (char)stbi__get8(z); - while(!stbi__at_eof(z) && c != '\n') + while (!stbi__at_eof(z) && c != '\n') { buffer[len++] = c; - if(len == STBI__HDR_BUFLEN - 1) + if (len == STBI__HDR_BUFLEN - 1) { // flush to end of line - while(!stbi__at_eof(z) && stbi__get8(z) != '\n') + while (!stbi__at_eof(z) && stbi__get8(z) != '\n') ; break; } - c = ( char )stbi__get8(z); + c = (char)stbi__get8(z); } buffer[len] = 0; @@ -7569,12 +7564,12 @@ static char* stbi__hdr_gettoken(stbi__context* z, char* buffer) static void stbi__hdr_convert(float* output, stbi_uc* input, int req_comp) { - if(input[3] != 0) + if (input[3] != 0) { float f1; // Exponent - f1 = ( float )ldexp(1.0f, input[3] - ( int )(128 + 8)); - if(req_comp <= 2) + f1 = (float)ldexp(1.0f, input[3] - (int)(128 + 8)); + if (req_comp <= 2) output[0] = (input[0] + input[1] + input[2]) * f1 / 3; else { @@ -7582,25 +7577,25 @@ static void stbi__hdr_convert(float* output, stbi_uc* input, int req_comp) output[1] = input[1] * f1; output[2] = input[2] * f1; } - if(req_comp == 2) + if (req_comp == 2) output[1] = 1; - if(req_comp == 4) + if (req_comp == 4) output[3] = 1; } else { - switch(req_comp) + switch (req_comp) { - case 4: - output[3] = 1; /* fallthrough */ - case 3: - output[0] = output[1] = output[2] = 0; - break; - case 2: - output[1] = 1; /* fallthrough */ - case 1: - output[0] = 0; - break; + case 4: + output[3] = 1; /* fallthrough */ + case 3: + output[0] = output[1] = output[2] = 0; + break; + case 2: + output[1] = 1; /* fallthrough */ + case 1: + output[0] = 0; + break; } } } @@ -7621,63 +7616,63 @@ static float* stbi__hdr_load(stbi__context* s, int* x, int* y, int* comp, int re // Check identifier headerToken = stbi__hdr_gettoken(s, buffer); - if(strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0) + if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0) return stbi__errpf("not HDR", "Corrupt HDR image"); // Parse header - for(;;) + for (;;) { token = stbi__hdr_gettoken(s, buffer); - if(token[0] == 0) + if (token[0] == 0) break; - if(strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; } - if(!valid) + if (!valid) return stbi__errpf("unsupported format", "Unsupported HDR format"); // Parse width and height // can't use sscanf() if we're not using stdio! token = stbi__hdr_gettoken(s, buffer); - if(strncmp(token, "-Y ", 3)) + if (strncmp(token, "-Y ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format"); token += 3; - height = ( int )strtol(token, &token, 10); - while(*token == ' ') + height = (int)strtol(token, &token, 10); + while (*token == ' ') ++token; - if(strncmp(token, "+X ", 3)) + if (strncmp(token, "+X ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format"); token += 3; - width = ( int )strtol(token, NULL, 10); + width = (int)strtol(token, NULL, 10); *x = width; *y = height; - if(comp) + if (comp) *comp = 3; - if(req_comp == 0) + if (req_comp == 0) req_comp = 3; - if(!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0)) + if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0)) return stbi__errpf("too large", "HDR image is too large"); // Read data - hdr_data = ( float* )stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0); - if(!hdr_data) + hdr_data = (float*)stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0); + if (!hdr_data) return stbi__errpf("outofmem", "Out of memory"); // Load image data // image data is stored as some number of sca - if(width < 8 || width >= 32768) + if (width < 8 || width >= 32768) { // Read flat data - for(j = 0; j < height; ++j) + for (j = 0; j < height; ++j) { - for(i = 0; i < width; ++i) + for (i = 0; i < width; ++i) { stbi_uc rgbe[4]; - main_decode_loop: +main_decode_loop: stbi__getn(s, rgbe, 4); stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp); } @@ -7688,83 +7683,83 @@ static float* stbi__hdr_load(stbi__context* s, int* x, int* y, int* comp, int re // Read RLE-encoded data scanline = NULL; - for(j = 0; j < height; ++j) + for (j = 0; j < height; ++j) { c1 = stbi__get8(s); c2 = stbi__get8(s); len = stbi__get8(s); - if(c1 != 2 || c2 != 2 || (len & 0x80)) + if (c1 != 2 || c2 != 2 || (len & 0x80)) { // not run-length encoded, so we have to actually use THIS data as a decoded // pixel (note this can't be a valid pixel--one of RGB must be >= 128) stbi_uc rgbe[4]; - rgbe[0] = ( stbi_uc )c1; - rgbe[1] = ( stbi_uc )c2; - rgbe[2] = ( stbi_uc )len; - rgbe[3] = ( stbi_uc )stbi__get8(s); + rgbe[0] = (stbi_uc)c1; + rgbe[1] = (stbi_uc)c2; + rgbe[2] = (stbi_uc)len; + rgbe[3] = (stbi_uc)stbi__get8(s); stbi__hdr_convert(hdr_data, rgbe, req_comp); i = 1; j = 0; STBI_FREE(scanline); - goto main_decode_loop; // yes, this makes no sense + goto main_decode_loop; // yes, this makes no sense } len <<= 8; len |= stbi__get8(s); - if(len != width) + if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); } - if(scanline == NULL) + if (scanline == NULL) { - scanline = ( stbi_uc* )stbi__malloc_mad2(width, 4, 0); - if(!scanline) + scanline = (stbi_uc*)stbi__malloc_mad2(width, 4, 0); + if (!scanline) { STBI_FREE(hdr_data); return stbi__errpf("outofmem", "Out of memory"); } } - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { int nleft; i = 0; - while((nleft = width - i) > 0) + while ((nleft = width - i) > 0) { count = stbi__get8(s); - if(count > 128) + if (count > 128) { // Run value = stbi__get8(s); count -= 128; - if(count > nleft) + if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } - for(z = 0; z < count; ++z) + for (z = 0; z < count; ++z) scanline[i++ * 4 + k] = value; } else { // Dump - if(count > nleft) + if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } - for(z = 0; z < count; ++z) + for (z = 0; z < count; ++z) scanline[i++ * 4 + k] = stbi__get8(s); } } } - for(i = 0; i < width; ++i) + for (i = 0; i < width; ++i) stbi__hdr_convert(hdr_data + (j * width + i) * req_comp, scanline + i * 4, req_comp); } - if(scanline) + if (scanline) STBI_FREE(scanline); } @@ -7778,54 +7773,54 @@ static int stbi__hdr_info(stbi__context* s, int* x, int* y, int* comp) int valid = 0; int dummy; - if(!x) + if (!x) x = &dummy; - if(!y) + if (!y) y = &dummy; - if(!comp) + if (!comp) comp = &dummy; - if(stbi__hdr_test(s) == 0) + if (stbi__hdr_test(s) == 0) { stbi__rewind(s); return 0; } - for(;;) + for (;;) { token = stbi__hdr_gettoken(s, buffer); - if(token[0] == 0) + if (token[0] == 0) break; - if(strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; } - if(!valid) + if (!valid) { stbi__rewind(s); return 0; } token = stbi__hdr_gettoken(s, buffer); - if(strncmp(token, "-Y ", 3)) + if (strncmp(token, "-Y ", 3)) { stbi__rewind(s); return 0; } token += 3; - *y = ( int )strtol(token, &token, 10); - while(*token == ' ') + *y = (int)strtol(token, &token, 10); + while (*token == ' ') ++token; - if(strncmp(token, "+X ", 3)) + if (strncmp(token, "+X ", 3)) { stbi__rewind(s); return 0; } token += 3; - *x = ( int )strtol(token, NULL, 10); + *x = (int)strtol(token, NULL, 10); *comp = 3; return 1; } -#endif // STBI_NO_HDR +#endif // STBI_NO_HDR #ifndef STBI_NO_BMP static int stbi__bmp_info(stbi__context* s, int* x, int* y, int* comp) @@ -7836,13 +7831,13 @@ static int stbi__bmp_info(stbi__context* s, int* x, int* y, int* comp) info.all_a = 255; p = stbi__bmp_parse_header(s, &info); stbi__rewind(s); - if(p == NULL) + if (p == NULL) return 0; - if(x) + if (x) *x = s->img_x; - if(y) + if (y) *y = s->img_y; - if(comp) + if (comp) *comp = info.ma ? 4 : 3; return 1; } @@ -7852,25 +7847,25 @@ static int stbi__bmp_info(stbi__context* s, int* x, int* y, int* comp) static int stbi__psd_info(stbi__context* s, int* x, int* y, int* comp) { int channelCount, dummy, depth; - if(!x) + if (!x) x = &dummy; - if(!y) + if (!y) y = &dummy; - if(!comp) + if (!comp) comp = &dummy; - if(stbi__get32be(s) != 0x38425053) + if (stbi__get32be(s) != 0x38425053) { stbi__rewind(s); return 0; } - if(stbi__get16be(s) != 1) + if (stbi__get16be(s) != 1) { stbi__rewind(s); return 0; } stbi__skip(s, 6); channelCount = stbi__get16be(s); - if(channelCount < 0 || channelCount > 16) + if (channelCount < 0 || channelCount > 16) { stbi__rewind(s); return 0; @@ -7878,12 +7873,12 @@ static int stbi__psd_info(stbi__context* s, int* x, int* y, int* comp) *y = stbi__get32be(s); *x = stbi__get32be(s); depth = stbi__get16be(s); - if(depth != 8 && depth != 16) + if (depth != 8 && depth != 16) { stbi__rewind(s); return 0; } - if(stbi__get16be(s) != 3) + if (stbi__get16be(s) != 3) { stbi__rewind(s); return 0; @@ -7895,27 +7890,27 @@ static int stbi__psd_info(stbi__context* s, int* x, int* y, int* comp) static int stbi__psd_is16(stbi__context* s) { int channelCount, depth; - if(stbi__get32be(s) != 0x38425053) + if (stbi__get32be(s) != 0x38425053) { stbi__rewind(s); return 0; } - if(stbi__get16be(s) != 1) + if (stbi__get16be(s) != 1) { stbi__rewind(s); return 0; } stbi__skip(s, 6); channelCount = stbi__get16be(s); - if(channelCount < 0 || channelCount > 16) + if (channelCount < 0 || channelCount > 16) { stbi__rewind(s); return 0; } - ( void )stbi__get32be(s); - ( void )stbi__get32be(s); + (void)stbi__get32be(s); + (void)stbi__get32be(s); depth = stbi__get16be(s); - if(depth != 16) + if (depth != 16) { stbi__rewind(s); return 0; @@ -7930,14 +7925,14 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp) int act_comp = 0, num_packets = 0, chained, dummy; stbi__pic_packet packets[10]; - if(!x) + if (!x) x = &dummy; - if(!y) + if (!y) y = &dummy; - if(!comp) + if (!comp) comp = &dummy; - if(!stbi__pic_is4(s, "\x53\x80\xF6\x34")) + if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) { stbi__rewind(s); return 0; @@ -7947,12 +7942,12 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp) *x = stbi__get16be(s); *y = stbi__get16be(s); - if(stbi__at_eof(s)) + if (stbi__at_eof(s)) { stbi__rewind(s); return 0; } - if((*x) != 0 && (1 << 28) / (*x) < (*y)) + if ((*x) != 0 && (1 << 28) / (*x) < (*y)) { stbi__rewind(s); return 0; @@ -7964,7 +7959,7 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp) { stbi__pic_packet* packet; - if(num_packets == sizeof(packets) / sizeof(packets[0])) + if (num_packets == sizeof(packets) / sizeof(packets[0])) return 0; packet = &packets[num_packets++]; @@ -7974,17 +7969,17 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp) packet->channel = stbi__get8(s); act_comp |= packet->channel; - if(stbi__at_eof(s)) + if (stbi__at_eof(s)) { stbi__rewind(s); return 0; } - if(packet->size != 8) + if (packet->size != 8) { stbi__rewind(s); return 0; } - } while(chained); + } while (chained); *comp = (act_comp & 0x10 ? 4 : 3); @@ -8009,9 +8004,9 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp) static int stbi__pnm_test(stbi__context* s) { char p, t; - p = ( char )stbi__get8(s); - t = ( char )stbi__get8(s); - if(p != 'P' || (t != '5' && t != '6')) + p = (char)stbi__get8(s); + t = (char)stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { stbi__rewind(s); return 0; @@ -8024,27 +8019,27 @@ static void* stbi__pnm_load(stbi__context* s, int* x, int* y, int* comp, int req stbi_uc* out; STBI_NOTUSED(ri); - if(!stbi__pnm_info(s, ( int* )&s->img_x, ( int* )&s->img_y, ( int* )&s->img_n)) + if (!stbi__pnm_info(s, (int*)&s->img_x, (int*)&s->img_y, (int*)&s->img_n)) return 0; *x = s->img_x; *y = s->img_y; - if(comp) + if (comp) *comp = s->img_n; - if(!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0)) + if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0)) return stbi__errpuc("too large", "PNM too large"); - out = ( stbi_uc* )stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0); - if(!out) + out = (stbi_uc*)stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); stbi__getn(s, out, s->img_n * s->img_x * s->img_y); - if(req_comp && req_comp != s->img_n) + if (req_comp && req_comp != s->img_n) { out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); - if(out == NULL) - return out; // stbi__convert_format frees input on failure + if (out == NULL) + return out; // stbi__convert_format frees input on failure } return out; } @@ -8056,16 +8051,16 @@ static int stbi__pnm_isspace(char c) static void stbi__pnm_skip_whitespace(stbi__context* s, char* c) { - for(;;) + for (;;) { - while(!stbi__at_eof(s) && stbi__pnm_isspace(*c)) - *c = ( char )stbi__get8(s); + while (!stbi__at_eof(s) && stbi__pnm_isspace(*c)) + *c = (char)stbi__get8(s); - if(stbi__at_eof(s) || *c != '#') + if (stbi__at_eof(s) || *c != '#') break; - while(!stbi__at_eof(s) && *c != '\n' && *c != '\r') - *c = ( char )stbi__get8(s); + while (!stbi__at_eof(s) && *c != '\n' && *c != '\r') + *c = (char)stbi__get8(s); } } @@ -8078,10 +8073,10 @@ static int stbi__pnm_getinteger(stbi__context* s, char* c) { int value = 0; - while(!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) + while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) { value = value * 10 + (*c - '0'); - *c = ( char )stbi__get8(s); + *c = (char)stbi__get8(s); } return value; @@ -8092,38 +8087,38 @@ static int stbi__pnm_info(stbi__context* s, int* x, int* y, int* comp) int maxv, dummy; char c, p, t; - if(!x) + if (!x) x = &dummy; - if(!y) + if (!y) y = &dummy; - if(!comp) + if (!comp) comp = &dummy; stbi__rewind(s); // Get identifier - p = ( char )stbi__get8(s); - t = ( char )stbi__get8(s); - if(p != 'P' || (t != '5' && t != '6')) + p = (char)stbi__get8(s); + t = (char)stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { stbi__rewind(s); return 0; } - *comp = (t == '6') ? 3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm + *comp = (t == '6') ? 3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm - c = ( char )stbi__get8(s); + c = (char)stbi__get8(s); stbi__pnm_skip_whitespace(s, &c); - *x = stbi__pnm_getinteger(s, &c); // read width + *x = stbi__pnm_getinteger(s, &c); // read width stbi__pnm_skip_whitespace(s, &c); - *y = stbi__pnm_getinteger(s, &c); // read height + *y = stbi__pnm_getinteger(s, &c); // read height stbi__pnm_skip_whitespace(s, &c); - maxv = stbi__pnm_getinteger(s, &c); // read max value + maxv = stbi__pnm_getinteger(s, &c); // read max value - if(maxv > 255) + if (maxv > 255) return stbi__err("max value > 255", "PPM image not 8-bit"); else return 1; @@ -8133,48 +8128,48 @@ static int stbi__pnm_info(stbi__context* s, int* x, int* y, int* comp) static int stbi__info_main(stbi__context* s, int* x, int* y, int* comp) { #ifndef STBI_NO_JPEG - if(stbi__jpeg_info(s, x, y, comp)) + if (stbi__jpeg_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_PNG - if(stbi__png_info(s, x, y, comp)) + if (stbi__png_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_GIF - if(stbi__gif_info(s, x, y, comp)) + if (stbi__gif_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_BMP - if(stbi__bmp_info(s, x, y, comp)) + if (stbi__bmp_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_PSD - if(stbi__psd_info(s, x, y, comp)) + if (stbi__psd_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_PIC - if(stbi__pic_info(s, x, y, comp)) + if (stbi__pic_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_PNM - if(stbi__pnm_info(s, x, y, comp)) + if (stbi__pnm_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_HDR - if(stbi__hdr_info(s, x, y, comp)) + if (stbi__hdr_info(s, x, y, comp)) return 1; #endif // test tga last because it's a crappy test! #ifndef STBI_NO_TGA - if(stbi__tga_info(s, x, y, comp)) + if (stbi__tga_info(s, x, y, comp)) return 1; #endif return stbi__err("unknown image type", "Image not of any known type, or corrupt"); @@ -8183,12 +8178,12 @@ static int stbi__info_main(stbi__context* s, int* x, int* y, int* comp) static int stbi__is_16_main(stbi__context* s) { #ifndef STBI_NO_PNG - if(stbi__png_is16(s)) + if (stbi__png_is16(s)) return 1; #endif #ifndef STBI_NO_PSD - if(stbi__psd_is16(s)) + if (stbi__psd_is16(s)) return 1; #endif @@ -8200,7 +8195,7 @@ extern int stbi_info(char const* filename, int* x, int* y, int* comp) { FILE* f = stbi__fopen(filename, "rb"); int result; - if(!f) + if (!f) return stbi__err("can't fopen", "Unable to open file"); result = stbi_info_from_file(f, x, y, comp); fclose(f); @@ -8222,7 +8217,7 @@ extern int stbi_is_16_bit(char const* filename) { FILE* f = stbi__fopen(filename, "rb"); int result; - if(!f) + if (!f) return stbi__err("can't fopen", "Unable to open file"); result = stbi_is_16_bit_from_file(f); fclose(f); @@ -8239,7 +8234,7 @@ extern int stbi_is_16_bit_from_file(FILE* f) fseek(f, pos, SEEK_SET); return r; } -#endif // !STBI_NO_STDIO +#endif // !STBI_NO_STDIO extern int stbi_info_from_memory(stbi_uc const* buffer, int len, int* x, int* y, int* comp) { @@ -8251,7 +8246,7 @@ extern int stbi_info_from_memory(stbi_uc const* buffer, int len, int* x, int* y, extern int stbi_info_from_callbacks(stbi_io_callbacks const* c, void* user, int* x, int* y, int* comp) { stbi__context s; - stbi__start_callbacks(&s, ( stbi_io_callbacks* )c, user); + stbi__start_callbacks(&s, (stbi_io_callbacks*)c, user); return stbi__info_main(&s, x, y, comp); } @@ -8265,11 +8260,11 @@ extern int stbi_is_16_bit_from_memory(stbi_uc const* buffer, int len) extern int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const* c, void* user) { stbi__context s; - stbi__start_callbacks(&s, ( stbi_io_callbacks* )c, user); + stbi__start_callbacks(&s, (stbi_io_callbacks*)c, user); return stbi__is_16_main(&s); } -#endif // STB_IMAGE_IMPLEMENTATION +#endif // STB_IMAGE_IMPLEMENTATION /* revision history: diff --git a/examples/common/stb_image_write.h b/examples/common/stb_image_write.h index 42b7c1796..fe585cf94 100644 --- a/examples/common/stb_image_write.h +++ b/examples/common/stb_image_write.h @@ -14,7 +14,7 @@ #endif #endif -#ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations +#ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations extern int stbi_write_tga_with_rle; extern int stbi_write_png_compression_level; extern int stbi_write_force_png_filter; @@ -40,7 +40,7 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func* func, void* context, int x, STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean); -#endif // INCLUDE_STB_IMAGE_WRITE_H +#endif // INCLUDE_STB_IMAGE_WRITE_H #define STB_IMAGE_WRITE_IMPLEMENTATION #ifdef STB_IMAGE_WRITE_IMPLEMENTATION @@ -56,7 +56,7 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean); #ifndef STBI_WRITE_NO_STDIO #include -#endif // STBI_WRITE_NO_STDIO +#endif // STBI_WRITE_NO_STDIO #include #include @@ -72,9 +72,9 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean); #endif #ifndef STBIW_MALLOC -#define STBIW_MALLOC(sz) malloc(sz) +#define STBIW_MALLOC(sz) malloc(sz) #define STBIW_REALLOC(p, newsz) realloc(p, newsz) -#define STBIW_FREE(p) free(p) +#define STBIW_FREE(p) free(p) #endif #ifndef STBIW_REALLOC_SIZED @@ -90,7 +90,7 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean); #define STBIW_ASSERT(x) assert(x) #endif -#define STBIW_UCHAR(x) ( unsigned char )(( x )&0xff) +#define STBIW_UCHAR(x) (unsigned char)((x)&0xff) #ifdef STB_IMAGE_WRITE_STATIC static int stbi__flip_vertically_on_write = 0; @@ -126,69 +126,69 @@ static void stbi__start_write_callbacks(stbi__write_context* s, stbi_write_func* static void stbi__stdio_write(void* context, void* data, int size) { - fwrite(data, 1, size, ( FILE* )context); + fwrite(data, 1, size, (FILE*)context); } static int stbi__start_write_file(stbi__write_context* s, const char* filename) { FILE* f; #ifdef STBI_MSC_SECURE_CRT - if(fopen_s(&f, filename, "wb")) + if (fopen_s(&f, filename, "wb")) f = NULL; #else f = fopen(filename, "wb"); #endif - stbi__start_write_callbacks(s, stbi__stdio_write, ( void* )f); + stbi__start_write_callbacks(s, stbi__stdio_write, (void*)f); return f != NULL; } static void stbi__end_write_file(stbi__write_context* s) { - fclose(( FILE* )s->context); + fclose((FILE*)s->context); } -#endif // !STBI_WRITE_NO_STDIO +#endif // !STBI_WRITE_NO_STDIO typedef unsigned int stbiw_uint32; typedef int stb_image_write_test[sizeof(stbiw_uint32) == 4 ? 1 : -1]; static void stbiw__writefv(stbi__write_context* s, const char* fmt, va_list v) { - while(*fmt) + while (*fmt) { - switch(*fmt++) + switch (*fmt++) { - case ' ': - break; - case '1': - { - unsigned char x = STBIW_UCHAR(va_arg(v, int)); - s->func(s->context, &x, 1); - break; - } - case '2': - { - int x = va_arg(v, int); - unsigned char b[2]; - b[0] = STBIW_UCHAR(x); - b[1] = STBIW_UCHAR(x >> 8); - s->func(s->context, b, 2); - break; - } - case '4': - { - stbiw_uint32 x = va_arg(v, int); - unsigned char b[4]; - b[0] = STBIW_UCHAR(x); - b[1] = STBIW_UCHAR(x >> 8); - b[2] = STBIW_UCHAR(x >> 16); - b[3] = STBIW_UCHAR(x >> 24); - s->func(s->context, b, 4); - break; - } - default: - STBIW_ASSERT(0); - return; + case ' ': + break; + case '1': + { + unsigned char x = STBIW_UCHAR(va_arg(v, int)); + s->func(s->context, &x, 1); + break; + } + case '2': + { + int x = va_arg(v, int); + unsigned char b[2]; + b[0] = STBIW_UCHAR(x); + b[1] = STBIW_UCHAR(x >> 8); + s->func(s->context, b, 2); + break; + } + case '4': + { + stbiw_uint32 x = va_arg(v, int); + unsigned char b[4]; + b[0] = STBIW_UCHAR(x); + b[1] = STBIW_UCHAR(x >> 8); + b[2] = STBIW_UCHAR(x >> 16); + b[3] = STBIW_UCHAR(x >> 24); + s->func(s->context, b, 4); + break; + } + default: + STBIW_ASSERT(0); + return; } } } @@ -219,33 +219,33 @@ static void stbiw__write_pixel(stbi__write_context* s, int rgb_dir, int comp, in unsigned char bg[3] = {255, 0, 255}, px[3]; int k; - if(write_alpha < 0) + if (write_alpha < 0) s->func(s->context, &d[comp - 1], 1); - switch(comp) + switch (comp) { - case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case - case 1: - if(expand_mono) - stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp - else - s->func(s->context, d, 1); // monochrome TGA - break; - case 4: - if(!write_alpha) - { - // composite against pink background - for(k = 0; k < 3; ++k) - px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255; - stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]); - break; - } - /* FALLTHROUGH */ - case 3: - stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); + case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case + case 1: + if (expand_mono) + stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp + else + s->func(s->context, d, 1); // monochrome TGA + break; + case 4: + if (!write_alpha) + { + // composite against pink background + for (k = 0; k < 3; ++k) + px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255; + stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]); break; + } + /* FALLTHROUGH */ + case 3: + stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); + break; } - if(write_alpha > 0) + if (write_alpha > 0) s->func(s->context, &d[comp - 1], 1); } @@ -255,22 +255,22 @@ static void stbiw__write_pixels(stbi__write_context* s, int rgb_dir, int vdir, i stbiw_uint32 zero = 0; int i, j, j_end; - if(y <= 0) + if (y <= 0) return; - if(stbi__flip_vertically_on_write) + if (stbi__flip_vertically_on_write) vdir *= -1; - if(vdir < 0) + if (vdir < 0) j_end = -1, j = y - 1; else j_end = y, j = 0; - for(; j != j_end; j += vdir) + for (; j != j_end; j += vdir) { - for(i = 0; i < x; ++i) + for (i = 0; i < x; ++i) { - unsigned char* d = ( unsigned char* )data + (j * x + i) * comp; + unsigned char* d = (unsigned char*)data + (j * x + i) * comp; stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d); } s->func(s->context, &zero, scanline_pad); @@ -280,7 +280,7 @@ static void stbiw__write_pixels(stbi__write_context* s, int rgb_dir, int vdir, i static int stbiw__outfile(stbi__write_context* s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void* data, int alpha, int pad, const char* fmt, ...) { - if(y < 0 || x < 0) + if (y < 0 || x < 0) { return 0; } @@ -298,11 +298,11 @@ static int stbiw__outfile(stbi__write_context* s, int rgb_dir, int vdir, int x, static int stbi_write_bmp_core(stbi__write_context* s, int x, int y, int comp, const void* data) { int pad = (-x * 3) & 3; - return stbiw__outfile(s, -1, -1, x, y, comp, 1, ( void* )data, 0, pad, + return stbiw__outfile(s, -1, -1, x, y, comp, 1, (void*)data, 0, pad, "11 4 22 4" "4 44 22 444444", - 'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0, 14 + 40, // file header - 40, x, y, 1, 24, 0, 0, 0, 0, 0, 0); // bitmap header + 'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0, 14 + 40, // file header + 40, x, y, 1, 24, 0, 0, 0, 0, 0, 0); // bitmap header } STBIWDEF int stbi_write_bmp_to_func(stbi_write_func* func, void* context, int x, int y, int comp, const void* data) @@ -316,7 +316,7 @@ STBIWDEF int stbi_write_bmp_to_func(stbi_write_func* func, void* context, int x, STBIWDEF int stbi_write_bmp(char const* filename, int x, int y, int comp, const void* data) { stbi__write_context s; - if(stbi__start_write_file(&s, filename)) + if (stbi__start_write_file(&s, filename)) { int r = stbi_write_bmp_core(&s, x, y, comp, data); stbi__end_write_file(&s); @@ -325,20 +325,20 @@ STBIWDEF int stbi_write_bmp(char const* filename, int x, int y, int comp, const else return 0; } -#endif //! STBI_WRITE_NO_STDIO +#endif //! STBI_WRITE_NO_STDIO static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, void* data) { int has_alpha = (comp == 2 || comp == 4); int colorbytes = has_alpha ? comp - 1 : comp; - int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3 + int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3 - if(y < 0 || x < 0) + if (y < 0 || x < 0) return 0; - if(!stbi_write_tga_with_rle) + if (!stbi_write_tga_with_rle) { - return stbiw__outfile(s, -1, -1, x, y, comp, 0, ( void* )data, has_alpha, 0, "111 221 2222 11", 0, 0, format, 0, + return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void*)data, has_alpha, 0, "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8); } else @@ -349,7 +349,7 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v stbiw__writef(s, "111 221 2222 11", 0, 0, format + 8, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8); - if(stbi__flip_vertically_on_write) + if (stbi__flip_vertically_on_write) { j = 0; jend = y; @@ -361,27 +361,27 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v jend = -1; jdir = -1; } - for(; j != jend; j += jdir) + for (; j != jend; j += jdir) { - unsigned char* row = ( unsigned char* )data + j * x * comp; + unsigned char* row = (unsigned char*)data + j * x * comp; int len; - for(i = 0; i < x; i += len) + for (i = 0; i < x; i += len) { unsigned char* begin = row + i * comp; int diff = 1; len = 1; - if(i < x - 1) + if (i < x - 1) { ++len; diff = memcmp(begin, row + (i + 1) * comp, comp); - if(diff) + if (diff) { const unsigned char* prev = begin; - for(k = i + 2; k < x && len < 128; ++k) + for (k = i + 2; k < x && len < 128; ++k) { - if(memcmp(prev, row + k * comp, comp)) + if (memcmp(prev, row + k * comp, comp)) { prev += comp; ++len; @@ -395,9 +395,9 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v } else { - for(k = i + 2; k < x && len < 128; ++k) + for (k = i + 2; k < x && len < 128; ++k) { - if(!memcmp(begin, row + k * comp, comp)) + if (!memcmp(begin, row + k * comp, comp)) { ++len; } @@ -409,11 +409,11 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v } } - if(diff) + if (diff) { unsigned char header = STBIW_UCHAR(len - 1); s->func(s->context, &header, 1); - for(k = 0; k < len; ++k) + for (k = 0; k < len; ++k) { stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp); } @@ -434,16 +434,16 @@ STBIWDEF int stbi_write_tga_to_func(stbi_write_func* func, void* context, int x, { stbi__write_context s; stbi__start_write_callbacks(&s, func, context); - return stbi_write_tga_core(&s, x, y, comp, ( void* )data); + return stbi_write_tga_core(&s, x, y, comp, (void*)data); } #ifndef STBI_WRITE_NO_STDIO STBIWDEF int stbi_write_tga(char const* filename, int x, int y, int comp, const void* data) { stbi__write_context s; - if(stbi__start_write_file(&s, filename)) + if (stbi__start_write_file(&s, filename)) { - int r = stbi_write_tga_core(&s, x, y, comp, ( void* )data); + int r = stbi_write_tga_core(&s, x, y, comp, (void*)data); stbi__end_write_file(&s); return r; } @@ -463,18 +463,18 @@ void stbiw__linear_to_rgbe(unsigned char* rgbe, float* linear) int exponent; float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2])); - if(maxcomp < 1e-32f) + if (maxcomp < 1e-32f) { rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0; } else { - float normalize = ( float )frexp(maxcomp, &exponent) * 256.0f / maxcomp; + float normalize = (float)frexp(maxcomp, &exponent) * 256.0f / maxcomp; - rgbe[0] = ( unsigned char )(linear[0] * normalize); - rgbe[1] = ( unsigned char )(linear[1] * normalize); - rgbe[2] = ( unsigned char )(linear[2] * normalize); - rgbe[3] = ( unsigned char )(exponent + 128); + rgbe[0] = (unsigned char)(linear[0] * normalize); + rgbe[1] = (unsigned char)(linear[1] * normalize); + rgbe[2] = (unsigned char)(linear[2] * normalize); + rgbe[3] = (unsigned char)(exponent + 128); } } @@ -489,7 +489,7 @@ void stbiw__write_run_data(stbi__write_context* s, int length, unsigned char dat void stbiw__write_dump_data(stbi__write_context* s, int length, unsigned char* data) { unsigned char lengthbyte = STBIW_UCHAR(length); - STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code + STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code s->func(s->context, &lengthbyte, 1); s->func(s->context, data, length); } @@ -505,21 +505,21 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns scanlineheader[3] = (width & 0x00ff); /* skip RLE for images too small or large */ - if(width < 8 || width >= 32768) + if (width < 8 || width >= 32768) { - for(x = 0; x < width; x++) + for (x = 0; x < width; x++) { - switch(ncomp) + switch (ncomp) { - case 4: /* fallthrough */ - case 3: - linear[2] = scanline[x * ncomp + 2]; - linear[1] = scanline[x * ncomp + 1]; - linear[0] = scanline[x * ncomp + 0]; - break; - default: - linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0]; - break; + case 4: /* fallthrough */ + case 3: + linear[2] = scanline[x * ncomp + 2]; + linear[1] = scanline[x * ncomp + 1]; + linear[0] = scanline[x * ncomp + 0]; + break; + default: + linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0]; + break; } stbiw__linear_to_rgbe(rgbe, linear); s->func(s->context, rgbe, 4); @@ -529,19 +529,19 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns { int c, r; /* encode into scratch buffer */ - for(x = 0; x < width; x++) + for (x = 0; x < width; x++) { - switch(ncomp) + switch (ncomp) { - case 4: /* fallthrough */ - case 3: - linear[2] = scanline[x * ncomp + 2]; - linear[1] = scanline[x * ncomp + 1]; - linear[0] = scanline[x * ncomp + 0]; - break; - default: - linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0]; - break; + case 4: /* fallthrough */ + case 3: + linear[2] = scanline[x * ncomp + 2]; + linear[1] = scanline[x * ncomp + 1]; + linear[0] = scanline[x * ncomp + 0]; + break; + default: + linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0]; + break; } stbiw__linear_to_rgbe(rgbe, linear); scratch[x + width * 0] = rgbe[0]; @@ -553,43 +553,43 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns s->func(s->context, scanlineheader, 4); /* RLE each component separately */ - for(c = 0; c < 4; c++) + for (c = 0; c < 4; c++) { unsigned char* comp = &scratch[width * c]; x = 0; - while(x < width) + while (x < width) { // find first run r = x; - while(r + 2 < width) + while (r + 2 < width) { - if(comp[r] == comp[r + 1] && comp[r] == comp[r + 2]) + if (comp[r] == comp[r + 1] && comp[r] == comp[r + 2]) break; ++r; } - if(r + 2 >= width) + if (r + 2 >= width) r = width; // dump up to first run - while(x < r) + while (x < r) { int len = r - x; - if(len > 128) + if (len > 128) len = 128; stbiw__write_dump_data(s, len, &comp[x]); x += len; } // if there's a run, output it - if(r + 2 < width) - { // same test as what we break out of in search loop, so only true if we break'd + if (r + 2 < width) + { // same test as what we break out of in search loop, so only true if we break'd // find next byte after run - while(r < width && comp[r] == comp[x]) + while (r < width && comp[r] == comp[x]) ++r; // output run up to r - while(x < r) + while (x < r) { int len = r - x; - if(len > 127) + if (len > 127) len = 127; stbiw__write_run_data(s, len, comp[x]); x += len; @@ -602,12 +602,12 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns static int stbi_write_hdr_core(stbi__write_context* s, int x, int y, int comp, float* data) { - if(y <= 0 || x <= 0 || data == NULL) + if (y <= 0 || x <= 0 || data == NULL) return 0; else { // Each component is stored separately. Allocate scratch space for full output scanline. - unsigned char* scratch = ( unsigned char* )STBIW_MALLOC(x * 4); + unsigned char* scratch = (unsigned char*)STBIW_MALLOC(x * 4); int i, len; char buffer[128]; char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n"; @@ -620,7 +620,7 @@ static int stbi_write_hdr_core(stbi__write_context* s, int x, int y, int comp, f #endif s->func(s->context, buffer, len); - for(i = 0; i < y; i++) + for (i = 0; i < y; i++) stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp * x * (stbi__flip_vertically_on_write ? y - 1 - i : i) * x); STBIW_FREE(scratch); @@ -632,23 +632,23 @@ STBIWDEF int stbi_write_hdr_to_func(stbi_write_func* func, void* context, int x, { stbi__write_context s; stbi__start_write_callbacks(&s, func, context); - return stbi_write_hdr_core(&s, x, y, comp, ( float* )data); + return stbi_write_hdr_core(&s, x, y, comp, (float*)data); } #ifndef STBI_WRITE_NO_STDIO STBIWDEF int stbi_write_hdr(char const* filename, int x, int y, int comp, const float* data) { stbi__write_context s; - if(stbi__start_write_file(&s, filename)) + if (stbi__start_write_file(&s, filename)) { - int r = stbi_write_hdr_core(&s, x, y, comp, ( float* )data); + int r = stbi_write_hdr_core(&s, x, y, comp, (float*)data); stbi__end_write_file(&s); return r; } else return 0; } -#endif // STBI_WRITE_NO_STDIO +#endif // STBI_WRITE_NO_STDIO ////////////////////////////////////////////////////////////////////////////// // @@ -657,30 +657,29 @@ STBIWDEF int stbi_write_hdr(char const* filename, int x, int y, int comp, const #ifndef STBIW_ZLIB_COMPRESS // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size() -#define stbiw__sbraw(a) (( int* )( a )-2) -#define stbiw__sbm(a) stbiw__sbraw(a)[0] -#define stbiw__sbn(a) stbiw__sbraw(a)[1] +#define stbiw__sbraw(a) ((int*)(a)-2) +#define stbiw__sbm(a) stbiw__sbraw(a)[0] +#define stbiw__sbn(a) stbiw__sbraw(a)[1] -#define stbiw__sbneedgrow(a, n) ((a) == 0 || stbiw__sbn(a) + n >= stbiw__sbm(a)) +#define stbiw__sbneedgrow(a, n) ((a) == 0 || stbiw__sbn(a) + n >= stbiw__sbm(a)) #define stbiw__sbmaybegrow(a, n) (stbiw__sbneedgrow(a, (n)) ? stbiw__sbgrow(a, n) : 0) -#define stbiw__sbgrow(a, n) stbiw__sbgrowf(( void** )&(a), (n), sizeof(*(a))) +#define stbiw__sbgrow(a, n) stbiw__sbgrowf((void**)&(a), (n), sizeof(*(a))) #define stbiw__sbpush(a, v) (stbiw__sbmaybegrow(a, 1), (a)[stbiw__sbn(a)++] = (v)) -#define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0) -#define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)), 0 : 0) +#define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0) +#define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)), 0 : 0) static void* stbiw__sbgrowf(void** arr, int increment, int itemsize) { int m = *arr ? 2 * stbiw__sbm(*arr) + increment : increment + 1; - void* p = - STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr) * itemsize + sizeof(int) * 2) : 0, - (unsigned long)itemsize * m + sizeof(int) * 2); + void* p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr) * itemsize + sizeof(int) * 2) : 0, + (unsigned long)itemsize * m + sizeof(int) * 2); STBIW_ASSERT(p); - if(p) + if (p) { - if(!*arr) - (( int* )p)[1] = 0; - *arr = ( void* )(( int* )p + 2); + if (!*arr) + ((int*)p)[1] = 0; + *arr = (void*)((int*)p + 2); stbiw__sbm(*arr) = m; } return *arr; @@ -688,7 +687,7 @@ static void* stbiw__sbgrowf(void** arr, int increment, int itemsize) static unsigned char* stbiw__zlib_flushf(unsigned char* data, unsigned int* bitbuffer, int* bitcount) { - while(*bitcount >= 8) + while (*bitcount >= 8) { stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer)); *bitbuffer >>= 8; @@ -700,7 +699,7 @@ static unsigned char* stbiw__zlib_flushf(unsigned char* data, unsigned int* bitb static int stbiw__zlib_bitrev(int code, int codebits) { int res = 0; - while(codebits--) + while (codebits--) { res = (res << 1) | (code & 1); code >>= 1; @@ -711,8 +710,8 @@ static int stbiw__zlib_bitrev(int code, int codebits) static unsigned int stbiw__zlib_countm(unsigned char* a, unsigned char* b, int limit) { int i; - for(i = 0; i < limit && i < 258; ++i) - if(a[i] != b[i]) + for (i = 0; i < limit && i < 258; ++i) + if (a[i] != b[i]) break; return i; } @@ -729,93 +728,94 @@ static unsigned int stbiw__zhash(unsigned char* data) return hash; } -#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount)) +#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount)) #define stbiw__zlib_add(code, codebits) (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush()) -#define stbiw__zlib_huffa(b, c) stbiw__zlib_add(stbiw__zlib_bitrev(b, c), c) +#define stbiw__zlib_huffa(b, c) stbiw__zlib_add(stbiw__zlib_bitrev(b, c), c) // default huffman tables #define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 8) -#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + ( n )-144, 9) -#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + ( n )-256, 7) -#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + ( n )-280, 8) -#define stbiw__zlib_huff(n) \ - ((n) <= 143 ? stbiw__zlib_huff1(n) : \ - (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n)) +#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n)-144, 9) +#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n)-256, 7) +#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n)-280, 8) +#define stbiw__zlib_huff(n) \ + ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) \ + : (n) <= 279 ? stbiw__zlib_huff3(n) \ + : stbiw__zlib_huff4(n)) #define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n)) #define stbiw__ZHASH 16384 -#endif // STBIW_ZLIB_COMPRESS +#endif // STBIW_ZLIB_COMPRESS unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_len, int quality) { #ifdef STBIW_ZLIB_COMPRESS // user provided a zlib compress implementation, use that return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality); -#else // use builtin - static unsigned short lengthc[] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, +#else // use builtin + static unsigned short lengthc[] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 259}; static unsigned char lengtheb[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0}; - static unsigned short distc[] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, - 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, + static unsigned short distc[] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, + 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 32768}; - static unsigned char disteb[] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, + static unsigned char disteb[] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; unsigned int bitbuf = 0; int i, j, bitcount = 0; unsigned char* out = NULL; - unsigned char*** hash_table = ( unsigned char*** )STBIW_MALLOC(stbiw__ZHASH * sizeof(char**)); - if(hash_table == NULL) + unsigned char*** hash_table = (unsigned char***)STBIW_MALLOC(stbiw__ZHASH * sizeof(char**)); + if (hash_table == NULL) return NULL; - if(quality < 5) + if (quality < 5) quality = 5; - stbiw__sbpush(out, 0x78); // DEFLATE 32K window - stbiw__sbpush(out, 0x5e); // FLEVEL = 1 + stbiw__sbpush(out, 0x78); // DEFLATE 32K window + stbiw__sbpush(out, 0x5e); // FLEVEL = 1 stbiw__zlib_add(1, 1); // BFINAL = 1 stbiw__zlib_add(1, 2); // BTYPE = 1 -- fixed huffman - for(i = 0; i < stbiw__ZHASH; ++i) + for (i = 0; i < stbiw__ZHASH; ++i) hash_table[i] = NULL; i = 0; - while(i < data_len - 3) + while (i < data_len - 3) { // hash next 3 bytes of data to be compressed int h = stbiw__zhash(data + i) & (stbiw__ZHASH - 1), best = 3; unsigned char* bestloc = 0; unsigned char** hlist = hash_table[h]; int n = stbiw__sbcount(hlist); - for(j = 0; j < n; ++j) + for (j = 0; j < n; ++j) { - if(hlist[j] - data > i - 32768) - { // if entry lies within window + if (hlist[j] - data > i - 32768) + { // if entry lies within window int d = stbiw__zlib_countm(hlist[j], data + i, data_len - i); - if(d >= best) + if (d >= best) best = d, bestloc = hlist[j]; } } // when hash table entry is too long, delete half the entries - if(hash_table[h] && stbiw__sbn(hash_table[h]) == 2 * quality) + if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2 * quality) { STBIW_MEMMOVE(hash_table[h], hash_table[h] + quality, sizeof(hash_table[h][0]) * quality); stbiw__sbn(hash_table[h]) = quality; } stbiw__sbpush(hash_table[h], data + i); - if(bestloc) + if (bestloc) { // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal h = stbiw__zhash(data + i + 1) & (stbiw__ZHASH - 1); hlist = hash_table[h]; n = stbiw__sbcount(hlist); - for(j = 0; j < n; ++j) + for (j = 0; j < n; ++j) { - if(hlist[j] - data > i - 32767) + if (hlist[j] - data > i - 32767) { int e = stbiw__zlib_countm(hlist[j], data + i + 1, data_len - i - 1); - if(e > best) - { // if next match is better, bail on current match + if (e > best) + { // if next match is better, bail on current match bestloc = NULL; break; } @@ -823,19 +823,19 @@ unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_le } } - if(bestloc) + if (bestloc) { - int d = ( int )(data + i - bestloc); // distance back + int d = (int)(data + i - bestloc); // distance back STBIW_ASSERT(d <= 32767 && best <= 258); - for(j = 0; best > lengthc[j + 1] - 1; ++j) + for (j = 0; best > lengthc[j + 1] - 1; ++j) ; stbiw__zlib_huff(j + 257); - if(lengtheb[j]) + if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]); - for(j = 0; d > distc[j + 1] - 1; ++j) + for (j = 0; d > distc[j + 1] - 1; ++j) ; stbiw__zlib_add(stbiw__zlib_bitrev(j, 5), 5); - if(disteb[j]) + if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]); i += best; } @@ -846,25 +846,25 @@ unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_le } } // write out final bytes - for(; i < data_len; ++i) + for (; i < data_len; ++i) stbiw__zlib_huffb(data[i]); - stbiw__zlib_huff(256); // end of block + stbiw__zlib_huff(256); // end of block // pad with 0 bits to byte boundary - while(bitcount) + while (bitcount) stbiw__zlib_add(0, 1); - for(i = 0; i < stbiw__ZHASH; ++i) - ( void )stbiw__sbfree(hash_table[i]); + for (i = 0; i < stbiw__ZHASH; ++i) + (void)stbiw__sbfree(hash_table[i]); STBIW_FREE(hash_table); { // compute adler32 on input unsigned int s1 = 1, s2 = 0; - int blocklen = ( int )(data_len % 5552); + int blocklen = (int)(data_len % 5552); j = 0; - while(j < data_len) + while (j < data_len) { - for(i = 0; i < blocklen; ++i) + for (i = 0; i < blocklen; ++i) s1 += data[j + i], s2 += s1; s1 %= 65521, s2 %= 65521; j += blocklen; @@ -878,8 +878,8 @@ unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_le *out_len = stbiw__sbn(out); // make returned pointer freeable STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len); - return ( unsigned char* )stbiw__sbraw(out); -#endif // STBIW_ZLIB_COMPRESS + return (unsigned char*)stbiw__sbraw(out); +#endif // STBIW_ZLIB_COMPRESS } static unsigned int stbiw__crc32(unsigned char* buffer, int len) @@ -917,14 +917,14 @@ static unsigned int stbiw__crc32(unsigned char* buffer, int len) unsigned int crc = ~0u; int i; - for(i = 0; i < len; ++i) + for (i = 0; i < len; ++i) crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)]; return ~crc; } #define stbiw__wpng4(o, a, b, c, d) \ ((o)[0] = STBIW_UCHAR(a), (o)[1] = STBIW_UCHAR(b), (o)[2] = STBIW_UCHAR(c), (o)[3] = STBIW_UCHAR(d), (o) += 4) -#define stbiw__wp32(data, v) stbiw__wpng4(data, (v) >> 24, (v) >> 16, (v) >> 8, (v)); +#define stbiw__wp32(data, v) stbiw__wpng4(data, (v) >> 24, (v) >> 16, (v) >> 8, (v)); #define stbiw__wptag(data, s) stbiw__wpng4(data, s[0], s[1], s[2], s[3]) static void stbiw__wpcrc(unsigned char** data, int len) @@ -936,9 +936,9 @@ static void stbiw__wpcrc(unsigned char** data, int len) static unsigned char stbiw__paeth(int a, int b, int c) { int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c); - if(pa <= pb && pa <= pc) + if (pa <= pb && pa <= pc) return STBIW_UCHAR(a); - if(pb <= pc) + if (pb <= pc) return STBIW_UCHAR(b); return STBIW_UCHAR(c); } @@ -954,58 +954,58 @@ static void stbiw__encode_png_line(unsigned char* pixels, int stride_bytes, int int type = mymap[filter_type]; unsigned char* z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height - 1 - y : y); int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes; - for(i = 0; i < n; ++i) + for (i = 0; i < n; ++i) { - switch(type) + switch (type) { - case 0: - line_buffer[i] = z[i]; - break; - case 1: - line_buffer[i] = z[i]; - break; - case 2: - line_buffer[i] = z[i] - z[i - signed_stride]; - break; - case 3: - line_buffer[i] = z[i] - (z[i - signed_stride] >> 1); - break; - case 4: - line_buffer[i] = ( signed char )(z[i] - stbiw__paeth(0, z[i - signed_stride], 0)); - break; - case 5: - line_buffer[i] = z[i]; - break; - case 6: - line_buffer[i] = z[i]; - break; + case 0: + line_buffer[i] = z[i]; + break; + case 1: + line_buffer[i] = z[i]; + break; + case 2: + line_buffer[i] = z[i] - z[i - signed_stride]; + break; + case 3: + line_buffer[i] = z[i] - (z[i - signed_stride] >> 1); + break; + case 4: + line_buffer[i] = (signed char)(z[i] - stbiw__paeth(0, z[i - signed_stride], 0)); + break; + case 5: + line_buffer[i] = z[i]; + break; + case 6: + line_buffer[i] = z[i]; + break; } } - for(i = n; i < width * n; ++i) + for (i = n; i < width * n; ++i) { - switch(type) + switch (type) { - case 0: - line_buffer[i] = z[i]; - break; - case 1: - line_buffer[i] = z[i] - z[i - n]; - break; - case 2: - line_buffer[i] = z[i] - z[i - signed_stride]; - break; - case 3: - line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1); - break; - case 4: - line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride], z[i - signed_stride - n]); - break; - case 5: - line_buffer[i] = z[i] - (z[i - n] >> 1); - break; - case 6: - line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0); - break; + case 0: + line_buffer[i] = z[i]; + break; + case 1: + line_buffer[i] = z[i] - z[i - n]; + break; + case 2: + line_buffer[i] = z[i] - z[i - signed_stride]; + break; + case 3: + line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1); + break; + case 4: + line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride], z[i - signed_stride - n]); + break; + case 5: + line_buffer[i] = z[i] - (z[i - n] >> 1); + break; + case 6: + line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0); + break; } } } @@ -1019,76 +1019,76 @@ unsigned char* stbi_write_png_to_mem(unsigned char* pixels, int stride_bytes, in signed char* line_buffer; int j, zlen; - if(stride_bytes == 0) + if (stride_bytes == 0) stride_bytes = x * n; - if(force_filter >= 5) + if (force_filter >= 5) { force_filter = -1; } - filt = ( unsigned char* )STBIW_MALLOC((x * n + 1) * (size_t)y); - if(!filt) + filt = (unsigned char*)STBIW_MALLOC((x * n + 1) * (size_t)y); + if (!filt) return 0; - line_buffer = ( signed char* )STBIW_MALLOC((size_t)x * n); - if(!line_buffer) + line_buffer = (signed char*)STBIW_MALLOC((size_t)x * n); + if (!line_buffer) { STBIW_FREE(filt); return 0; } - for(j = 0; j < y; ++j) + for (j = 0; j < y; ++j) { int filter_type; - if(force_filter > -1) + if (force_filter > -1) { filter_type = force_filter; stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, force_filter, line_buffer); } else - { // Estimate the best filter by running through all of them: + { // Estimate the best filter by running through all of them: int best_filter = 0, best_filter_val = 0x7fffffff, est, i; - for(filter_type = 0; filter_type < 5; filter_type++) + for (filter_type = 0; filter_type < 5; filter_type++) { stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, filter_type, line_buffer); // Estimate the entropy of the line using this filter; the less, the better. est = 0; - for(i = 0; i < x * n; ++i) + for (i = 0; i < x * n; ++i) { - est += abs(( signed char )line_buffer[i]); + est += abs((signed char)line_buffer[i]); } - if(est < best_filter_val) + if (est < best_filter_val) { best_filter_val = est; best_filter = filter_type; } } - if(filter_type != best_filter) - { // If the last iteration already got us the best filter, don't redo it + if (filter_type != best_filter) + { // If the last iteration already got us the best filter, don't redo it stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, best_filter, line_buffer); filter_type = best_filter; } } // when we get here, filter_type contains the filter type, and line_buffer contains the data - filt[j * (x * n + 1)] = ( unsigned char )filter_type; + filt[j * (x * n + 1)] = (unsigned char)filter_type; STBIW_MEMMOVE(filt + j * (x * n + 1) + 1, line_buffer, (size_t)x * n); } STBIW_FREE(line_buffer); zlib = stbi_zlib_compress(filt, y * (x * n + 1), &zlen, stbi_write_png_compression_level); STBIW_FREE(filt); - if(!zlib) + if (!zlib) return 0; // each tag requires 12 bytes of overhead - out = ( unsigned char* )STBIW_MALLOC(8 + 12 + 13 + 12 + zlen + 12); - if(!out) + out = (unsigned char*)STBIW_MALLOC(8 + 12 + 13 + 12 + zlen + 12); + if (!out) return 0; *out_len = 8 + 12 + 13 + 12 + zlen + 12; o = out; STBIW_MEMMOVE(o, sig, 8); o += 8; - stbiw__wp32(o, 13); // header length + stbiw__wp32(o, 13); // header length stbiw__wptag(o, "IHDR"); stbiw__wp32(o, x); stbiw__wp32(o, y); @@ -1120,16 +1120,16 @@ STBIWDEF int stbi_write_png(char const* filename, int x, int y, int comp, const { FILE* f; int len; - unsigned char* png = stbi_write_png_to_mem(( unsigned char* )data, stride_bytes, x, y, comp, &len); - if(png == NULL) + unsigned char* png = stbi_write_png_to_mem((unsigned char*)data, stride_bytes, x, y, comp, &len); + if (png == NULL) return 0; #ifdef STBI_MSC_SECURE_CRT - if(fopen_s(&f, filename, "wb")) + if (fopen_s(&f, filename, "wb")) f = NULL; #else f = fopen(filename, "wb"); #endif - if(!f) + if (!f) { STBIW_FREE(png); return 0; @@ -1145,8 +1145,8 @@ STBIWDEF int stbi_write_png_to_func(stbi_write_func* func, void* context, int x, int stride_bytes) { int len; - unsigned char* png = stbi_write_png_to_mem(( unsigned char* )data, stride_bytes, x, y, comp, &len); - if(png == NULL) + unsigned char* png = stbi_write_png_to_mem((unsigned char*)data, stride_bytes, x, y, comp, &len); + if (png == NULL) return 0; func(context, png, len); STBIW_FREE(png); @@ -1161,8 +1161,8 @@ STBIWDEF int stbi_write_png_to_func(stbi_write_func* func, void* context, int x, * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html */ -static const unsigned char stbiw__jpg_ZigZag[] = {0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, - 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, +static const unsigned char stbiw__jpg_ZigZag[] = {0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, + 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63}; @@ -1171,11 +1171,11 @@ static void stbiw__jpg_writeBits(stbi__write_context* s, int* bitBufP, int* bitC int bitBuf = *bitBufP, bitCnt = *bitCntP; bitCnt += bs[1]; bitBuf |= bs[0] << (24 - bitCnt); - while(bitCnt >= 8) + while (bitCnt >= 8) { unsigned char c = (bitBuf >> 16) & 255; stbiw__putc(s, c); - if(c == 255) + if (c == 255) { stbiw__putc(s, 0); } @@ -1202,33 +1202,33 @@ static void stbiw__jpg_DCT(float* d0p, float* d1p, float* d2p, float* d3p, float float tmp4 = d3 - d4; // Even part - float tmp10 = tmp0 + tmp3; // phase 2 + float tmp10 = tmp0 + tmp3; // phase 2 float tmp13 = tmp0 - tmp3; float tmp11 = tmp1 + tmp2; float tmp12 = tmp1 - tmp2; - d0 = tmp10 + tmp11; // phase 3 + d0 = tmp10 + tmp11; // phase 3 d4 = tmp10 - tmp11; - z1 = (tmp12 + tmp13) * 0.707106781f; // c4 - d2 = tmp13 + z1; // phase 5 + z1 = (tmp12 + tmp13) * 0.707106781f; // c4 + d2 = tmp13 + z1; // phase 5 d6 = tmp13 - z1; // Odd part - tmp10 = tmp4 + tmp5; // phase 2 + tmp10 = tmp4 + tmp5; // phase 2 tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7; // The rotator is modified from fig 4-8 to avoid extra negations. - z5 = (tmp10 - tmp12) * 0.382683433f; // c6 - z2 = tmp10 * 0.541196100f + z5; // c2-c6 - z4 = tmp12 * 1.306562965f + z5; // c2+c6 - z3 = tmp11 * 0.707106781f; // c4 + z5 = (tmp10 - tmp12) * 0.382683433f; // c6 + z2 = tmp10 * 0.541196100f + z5; // c2-c6 + z4 = tmp12 * 1.306562965f + z5; // c2+c6 + z3 = tmp11 * 0.707106781f; // c4 - z11 = tmp7 + z3; // phase 5 + z11 = tmp7 + z3; // phase 5 z13 = tmp7 - z3; - *d5p = z13 + z2; // phase 6 + *d5p = z13 + z2; // phase 6 *d3p = z13 - z2; *d1p = z11 + z4; *d7p = z11 - z4; @@ -1244,7 +1244,7 @@ static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) int tmp1 = val < 0 ? -val : val; val = val < 0 ? val - 1 : val; bits[1] = 1; - while(tmp1 >>= 1) + while (tmp1 >>= 1) { ++bits[1]; } @@ -1260,29 +1260,29 @@ static int stbiw__jpg_processDU(stbi__write_context* s, int* bitBuf, int* bitCnt int DU[64]; // DCT rows - for(dataOff = 0; dataOff < 64; dataOff += 8) + for (dataOff = 0; dataOff < 64; dataOff += 8) { stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 1], &CDU[dataOff + 2], &CDU[dataOff + 3], &CDU[dataOff + 4], &CDU[dataOff + 5], &CDU[dataOff + 6], &CDU[dataOff + 7]); } // DCT columns - for(dataOff = 0; dataOff < 8; ++dataOff) + for (dataOff = 0; dataOff < 8; ++dataOff) { stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 8], &CDU[dataOff + 16], &CDU[dataOff + 24], &CDU[dataOff + 32], &CDU[dataOff + 40], &CDU[dataOff + 48], &CDU[dataOff + 56]); } // Quantize/descale/zigzag the coefficients - for(i = 0; i < 64; ++i) + for (i = 0; i < 64; ++i) { float v = CDU[i] * fdtbl[i]; // DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f)); // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway? - DU[stbiw__jpg_ZigZag[i]] = ( int )(v < 0 ? v - 0.5f : v + 0.5f); + DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? v - 0.5f : v + 0.5f); } // Encode DC diff = DU[0] - DC; - if(diff == 0) + if (diff == 0) { stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]); } @@ -1295,29 +1295,29 @@ static int stbiw__jpg_processDU(stbi__write_context* s, int* bitBuf, int* bitCnt } // Encode ACs end0pos = 63; - for(; (end0pos > 0) && (DU[end0pos] == 0); --end0pos) + for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos) { } // end0pos = first element in reverse order !=0 - if(end0pos == 0) + if (end0pos == 0) { stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB); return DU[0]; } - for(i = 1; i <= end0pos; ++i) + for (i = 1; i <= end0pos; ++i) { int startpos = i; int nrzeroes; unsigned short bits[2]; - for(; DU[i] == 0 && i <= end0pos; ++i) + for (; DU[i] == 0 && i <= end0pos; ++i) { } nrzeroes = i - startpos; - if(nrzeroes >= 16) + if (nrzeroes >= 16) { int lng = nrzeroes >> 4; int nrmarker; - for(nrmarker = 1; nrmarker <= lng; ++nrmarker) + for (nrmarker = 1; nrmarker <= lng; ++nrmarker) stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes); nrzeroes &= 15; } @@ -1325,7 +1325,7 @@ static int stbiw__jpg_processDU(stbi__write_context* s, int* bitBuf, int* bitCnt stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes << 4) + bits[1]]); stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits); } - if(end0pos != 63) + if (end0pos != 63) { stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB); } @@ -1362,111 +1362,50 @@ static int stbi_write_jpg_core(stbi__write_context* s, int width, int height, in 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa}; // Huffman tables - static const unsigned short YDC_HT[256][2] = {{0, 2}, {2, 3}, {3, 3}, {4, 3}, {5, 3}, {6, 3}, - {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}}; - static const unsigned short UVDC_HT[256][2] = {{0, 2}, {1, 2}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, - {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11}}; + static const unsigned short YDC_HT[256][2] = {{0, 2}, {2, 3}, {3, 3}, {4, 3}, {5, 3}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}}; + static const unsigned short UVDC_HT[256][2] = {{0, 2}, {1, 2}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11}}; static const unsigned short YAC_HT[256][2] = { - {10, 4}, {0, 2}, {1, 2}, {4, 3}, {11, 4}, {26, 5}, {120, 7}, {248, 8}, - {1014, 10}, {65410, 16}, {65411, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {12, 4}, {27, 5}, {121, 7}, {502, 9}, {2038, 11}, {65412, 16}, {65413, 16}, - {65414, 16}, {65415, 16}, {65416, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {28, 5}, {249, 8}, {1015, 10}, {4084, 12}, {65417, 16}, {65418, 16}, {65419, 16}, - {65420, 16}, {65421, 16}, {65422, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {58, 6}, {503, 9}, {4085, 12}, {65423, 16}, {65424, 16}, {65425, 16}, {65426, 16}, - {65427, 16}, {65428, 16}, {65429, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {59, 6}, {1016, 10}, {65430, 16}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, - {65435, 16}, {65436, 16}, {65437, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {122, 7}, {2039, 11}, {65438, 16}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, - {65443, 16}, {65444, 16}, {65445, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {123, 7}, {4086, 12}, {65446, 16}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, - {65451, 16}, {65452, 16}, {65453, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {250, 8}, {4087, 12}, {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, - {65459, 16}, {65460, 16}, {65461, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {504, 9}, {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, - {65467, 16}, {65468, 16}, {65469, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {505, 9}, {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, - {65476, 16}, {65477, 16}, {65478, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {506, 9}, {65479, 16}, {65480, 16}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, - {65485, 16}, {65486, 16}, {65487, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {1017, 10}, {65488, 16}, {65489, 16}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, - {65494, 16}, {65495, 16}, {65496, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {1018, 10}, {65497, 16}, {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, - {65503, 16}, {65504, 16}, {65505, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {2040, 11}, {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, - {65512, 16}, {65513, 16}, {65514, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, - {65522, 16}, {65523, 16}, {65524, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {2041, 11}, {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, - {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}; + {10, 4}, {0, 2}, {1, 2}, {4, 3}, {11, 4}, {26, 5}, {120, 7}, {248, 8}, {1014, 10}, {65410, 16}, {65411, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {12, 4}, {27, 5}, {121, 7}, {502, 9}, {2038, 11}, {65412, 16}, {65413, 16}, {65414, 16}, {65415, 16}, {65416, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {28, 5}, {249, 8}, {1015, 10}, {4084, 12}, {65417, 16}, {65418, 16}, {65419, 16}, {65420, 16}, {65421, 16}, {65422, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {58, 6}, {503, 9}, {4085, 12}, {65423, 16}, {65424, 16}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {59, 6}, {1016, 10}, {65430, 16}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {122, 7}, {2039, 11}, {65438, 16}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {123, 7}, {4086, 12}, {65446, 16}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {250, 8}, {4087, 12}, {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {504, 9}, {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {505, 9}, {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {506, 9}, {65479, 16}, {65480, 16}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {1017, 10}, {65488, 16}, {65489, 16}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {1018, 10}, {65497, 16}, {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2040, 11}, {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2041, 11}, {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}; static const unsigned short UVAC_HT[256][2] = { - {0, 2}, {1, 2}, {4, 3}, {10, 4}, {24, 5}, {25, 5}, {56, 6}, {120, 7}, - {500, 9}, {1014, 10}, {4084, 12}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {11, 4}, {57, 6}, {246, 8}, {501, 9}, {2038, 11}, {4085, 12}, {65416, 16}, - {65417, 16}, {65418, 16}, {65419, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {26, 5}, {247, 8}, {1015, 10}, {4086, 12}, {32706, 15}, {65420, 16}, {65421, 16}, - {65422, 16}, {65423, 16}, {65424, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {27, 5}, {248, 8}, {1016, 10}, {4087, 12}, {65425, 16}, {65426, 16}, {65427, 16}, - {65428, 16}, {65429, 16}, {65430, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {58, 6}, {502, 9}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, - {65436, 16}, {65437, 16}, {65438, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {59, 6}, {1017, 10}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, - {65444, 16}, {65445, 16}, {65446, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {121, 7}, {2039, 11}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, - {65452, 16}, {65453, 16}, {65454, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {122, 7}, {2040, 11}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, - {65460, 16}, {65461, 16}, {65462, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {249, 8}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, - {65469, 16}, {65470, 16}, {65471, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {503, 9}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, - {65478, 16}, {65479, 16}, {65480, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {504, 9}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, - {65487, 16}, {65488, 16}, {65489, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {505, 9}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, - {65496, 16}, {65497, 16}, {65498, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {506, 9}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, - {65505, 16}, {65506, 16}, {65507, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {2041, 11}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, - {65514, 16}, {65515, 16}, {65516, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, - {65523, 16}, {65524, 16}, {65525, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {1018, 10}, {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, - {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}; - static const int YQT[] = {16, 11, 10, 16, 24, 40, 51, 61, 12, 12, 14, 19, 26, 58, 60, 55, - 14, 13, 16, 24, 40, 57, 69, 56, 14, 17, 22, 29, 51, 87, 80, 62, - 18, 22, 37, 56, 68, 109, 103, 77, 24, 35, 55, 64, 81, 104, 113, 92, + {0, 2}, {1, 2}, {4, 3}, {10, 4}, {24, 5}, {25, 5}, {56, 6}, {120, 7}, {500, 9}, {1014, 10}, {4084, 12}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {11, 4}, {57, 6}, {246, 8}, {501, 9}, {2038, 11}, {4085, 12}, {65416, 16}, {65417, 16}, {65418, 16}, {65419, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {26, 5}, {247, 8}, {1015, 10}, {4086, 12}, {32706, 15}, {65420, 16}, {65421, 16}, {65422, 16}, {65423, 16}, {65424, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {27, 5}, {248, 8}, {1016, 10}, {4087, 12}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {65430, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {58, 6}, {502, 9}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {65438, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {59, 6}, {1017, 10}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {65446, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {121, 7}, {2039, 11}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {65454, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {122, 7}, {2040, 11}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {65462, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {249, 8}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {65470, 16}, {65471, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {503, 9}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {65479, 16}, {65480, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {504, 9}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {65488, 16}, {65489, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {505, 9}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {65497, 16}, {65498, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {506, 9}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {65506, 16}, {65507, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2041, 11}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {65515, 16}, {65516, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {65525, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {1018, 10}, {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}; + static const int YQT[] = {16, 11, 10, 16, 24, 40, 51, 61, 12, 12, 14, 19, 26, 58, 60, 55, + 14, 13, 16, 24, 40, 57, 69, 56, 14, 17, 22, 29, 51, 87, 80, 62, + 18, 22, 37, 56, 68, 109, 103, 77, 24, 35, 55, 64, 81, 104, 113, 92, 49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99}; static const int UVQT[] = {17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99, 99, 99, 99, 24, 26, 56, 99, 99, 99, 99, 99, 47, 66, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}; - static const float aasf[] = {1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, - 1.175875602f * 2.828427125f, 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, + static const float aasf[] = {1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, + 1.175875602f * 2.828427125f, 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f}; int row, col, i, k; float fdtbl_Y[64], fdtbl_UV[64]; unsigned char YTable[64], UVTable[64]; - if(!data || !width || !height || comp > 4 || comp < 1) + if (!data || !width || !height || comp > 4 || comp < 1) { return 0; } quality = quality ? quality : 90; - quality = quality < 1 ? 1 : quality > 100 ? 100 : quality; + quality = quality < 1 ? 1 : quality > 100 ? 100 + : quality; quality = quality < 50 ? 5000 / quality : 200 - quality * 2; - for(i = 0; i < 64; ++i) + for (i = 0; i < 64; ++i) { int uvti, yti = (YQT[i] * quality + 50) / 100; - YTable[stbiw__jpg_ZigZag[i]] = ( unsigned char )(yti < 1 ? 1 : yti > 255 ? 255 : yti); + YTable[stbiw__jpg_ZigZag[i]] = (unsigned char)(yti < 1 ? 1 : yti > 255 ? 255 + : yti); uvti = (UVQT[i] * quality + 50) / 100; - UVTable[stbiw__jpg_ZigZag[i]] = ( unsigned char )(uvti < 1 ? 1 : uvti > 255 ? 255 : uvti); + UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char)(uvti < 1 ? 1 : uvti > 255 ? 255 + : uvti); } - for(row = 0, k = 0; row < 8; ++row) + for (row = 0, k = 0; row < 8; ++row) { - for(col = 0; col < 8; ++col, ++k) + for (col = 0; col < 8; ++col, ++k) { fdtbl_Y[k] = 1 / (YTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]); fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]); @@ -1475,17 +1414,17 @@ static int stbi_write_jpg_core(stbi__write_context* s, int width, int height, in // Write Headers { - static const unsigned char head0[] = {0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F', 'I', 'F', 0, 1, 1, - 0, 0, 1, 0, 1, 0, 0, 0xFF, 0xDB, 0, 0x84, 0}; + static const unsigned char head0[] = {0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F', 'I', 'F', 0, 1, 1, + 0, 0, 1, 0, 1, 0, 0, 0xFF, 0xDB, 0, 0x84, 0}; static const unsigned char head2[] = {0xFF, 0xDA, 0, 0xC, 3, 1, 0, 2, 0x11, 3, 0x11, 0, 0x3F, 0}; const unsigned char head1[] = {0xFF, 0xC0, 0, 0x11, 8, - ( unsigned char )(height >> 8), + (unsigned char)(height >> 8), STBIW_UCHAR(height), - ( unsigned char )(width >> 8), + (unsigned char)(width >> 8), STBIW_UCHAR(width), 3, 1, @@ -1502,50 +1441,50 @@ static int stbi_write_jpg_core(stbi__write_context* s, int width, int height, in 0x01, 0xA2, 0}; - s->func(s->context, ( void* )head0, sizeof(head0)); - s->func(s->context, ( void* )YTable, sizeof(YTable)); + s->func(s->context, (void*)head0, sizeof(head0)); + s->func(s->context, (void*)YTable, sizeof(YTable)); stbiw__putc(s, 1); s->func(s->context, UVTable, sizeof(UVTable)); - s->func(s->context, ( void* )head1, sizeof(head1)); - s->func(s->context, ( void* )(std_dc_luminance_nrcodes + 1), sizeof(std_dc_luminance_nrcodes) - 1); - s->func(s->context, ( void* )std_dc_luminance_values, sizeof(std_dc_luminance_values)); - stbiw__putc(s, 0x10); // HTYACinfo - s->func(s->context, ( void* )(std_ac_luminance_nrcodes + 1), sizeof(std_ac_luminance_nrcodes) - 1); - s->func(s->context, ( void* )std_ac_luminance_values, sizeof(std_ac_luminance_values)); - stbiw__putc(s, 1); // HTUDCinfo - s->func(s->context, ( void* )(std_dc_chrominance_nrcodes + 1), sizeof(std_dc_chrominance_nrcodes) - 1); - s->func(s->context, ( void* )std_dc_chrominance_values, sizeof(std_dc_chrominance_values)); - stbiw__putc(s, 0x11); // HTUACinfo - s->func(s->context, ( void* )(std_ac_chrominance_nrcodes + 1), sizeof(std_ac_chrominance_nrcodes) - 1); - s->func(s->context, ( void* )std_ac_chrominance_values, sizeof(std_ac_chrominance_values)); - s->func(s->context, ( void* )head2, sizeof(head2)); + s->func(s->context, (void*)head1, sizeof(head1)); + s->func(s->context, (void*)(std_dc_luminance_nrcodes + 1), sizeof(std_dc_luminance_nrcodes) - 1); + s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values)); + stbiw__putc(s, 0x10); // HTYACinfo + s->func(s->context, (void*)(std_ac_luminance_nrcodes + 1), sizeof(std_ac_luminance_nrcodes) - 1); + s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values)); + stbiw__putc(s, 1); // HTUDCinfo + s->func(s->context, (void*)(std_dc_chrominance_nrcodes + 1), sizeof(std_dc_chrominance_nrcodes) - 1); + s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values)); + stbiw__putc(s, 0x11); // HTUACinfo + s->func(s->context, (void*)(std_ac_chrominance_nrcodes + 1), sizeof(std_ac_chrominance_nrcodes) - 1); + s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values)); + s->func(s->context, (void*)head2, sizeof(head2)); } // Encode 8x8 macroblocks { static const unsigned short fillBits[] = {0x7F, 7}; - const unsigned char* imageData = ( const unsigned char* )data; + const unsigned char* imageData = (const unsigned char*)data; int DCY = 0, DCU = 0, DCV = 0; int bitBuf = 0, bitCnt = 0; // comp == 2 is grey+alpha (alpha is ignored) int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0; int x, y, pos; - for(y = 0; y < height; y += 8) + for (y = 0; y < height; y += 8) { - for(x = 0; x < width; x += 8) + for (x = 0; x < width; x += 8) { float YDU[64], UDU[64], VDU[64]; - for(row = y, pos = 0; row < y + 8; ++row) + for (row = y, pos = 0; row < y + 8; ++row) { - for(col = x; col < x + 8; ++col, ++pos) + for (col = x; col < x + 8; ++col, ++pos) { int p = (stbi__flip_vertically_on_write ? height - 1 - row : row) * width * comp + col * comp; float r, g, b; - if(row >= height) + if (row >= height) { p -= width * comp * (row + 1 - height); } - if(col >= width) + if (col >= width) { p -= comp * (col + 1 - width); } @@ -1581,14 +1520,14 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func* func, void* context, int x, { stbi__write_context s; stbi__start_write_callbacks(&s, func, context); - return stbi_write_jpg_core(&s, x, y, comp, ( void* )data, quality); + return stbi_write_jpg_core(&s, x, y, comp, (void*)data, quality); } #ifndef STBI_WRITE_NO_STDIO STBIWDEF int stbi_write_jpg(char const* filename, int x, int y, int comp, const void* data, int quality) { stbi__write_context s; - if(stbi__start_write_file(&s, filename)) + if (stbi__start_write_file(&s, filename)) { int r = stbi_write_jpg_core(&s, x, y, comp, data, quality); stbi__end_write_file(&s); @@ -1599,7 +1538,7 @@ STBIWDEF int stbi_write_jpg(char const* filename, int x, int y, int comp, const } #endif -#endif // STB_IMAGE_WRITE_IMPLEMENTATION +#endif // STB_IMAGE_WRITE_IMPLEMENTATION /* Revision history 1.09 (2018-02-11) diff --git a/examples/common/tengine_operations.c b/examples/common/tengine_operations.c index 3ee24716c..2ce0d8354 100644 --- a/examples/common/tengine_operations.c +++ b/examples/common/tengine_operations.c @@ -71,7 +71,7 @@ image load_image_stb(const char* filename, int channels) { int dst_index = i + w * j + w * h * k; int src_index = k + src_c * i + src_c * w * j; - im.data[dst_index] = ( float )data[src_index]; + im.data[dst_index] = (float)data[src_index]; } } } @@ -83,7 +83,7 @@ image load_image_stb(const char* filename, int channels) image make_image(int w, int h, int c) { image out = make_empty_image(w, h, c); - out.data = ( float* )calloc((size_t)h * w * c, sizeof(float)); + out.data = (float*)calloc((size_t)h * w * c, sizeof(float)); return out; } @@ -125,17 +125,17 @@ image imread_process(const char* filename, int img_w, int img_h, float* means, f switch (choice) { - case 0: - out = gray2bgr(out); - break; - case 1: - out = rgb2gray(out); - break; - case 2: - out = rgb2bgr_permute(out); - break; - default: - break; + case 0: + out = gray2bgr(out); + break; + case 1: + out = rgb2gray(out); + break; + case 2: + out = rgb2bgr_permute(out); + break; + default: + break; } image resImg = make_image(img_w, img_h, out.c); @@ -171,8 +171,8 @@ image resize_image(image im, int ow, int oh) int h = im.h; int w = im.w; float shift = 0.f; - float _scale_x = ( float )((w - shift) / (ow - shift)); - float _scale_y = ( float )((h - shift) / (oh - shift)); + float _scale_x = (float)((w - shift) / (ow - shift)); + float _scale_y = (float)((h - shift) / (oh - shift)); float32x4_t scale_x = vdupq_n_f32(_scale_x); float offset = 0.5; int in_hw = h * w; @@ -215,8 +215,7 @@ image resize_image(image im, int ow, int oh) float32x4_t fx_0 = vsubq_f32(offset_1, fx); - const int32x4_t in_idx = - vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0)); + const int32x4_t in_idx = vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0)); int32x4_t in_index0 = in_idx; int32x4_t in_index2 = vaddq_s32(in_idx, vcvtq_s32_f32(offset_1)); int32x4_t in_index1 = vaddq_s32(in_idx, w_0); @@ -290,8 +289,8 @@ image resize_image(image im, int ow, int oh) int h = im.h; int w = im.w; float shift = 0.f; - float _scale_x = ( float )((w - shift) / (ow - shift)); - float _scale_y = ( float )((h - shift) / (oh - shift)); + float _scale_x = (float)((w - shift) / (ow - shift)); + float _scale_y = (float)((h - shift) / (oh - shift)); float32x4_t scale_x = vdupq_n_f32(_scale_x); float offset = 0.5; @@ -335,8 +334,7 @@ image resize_image(image im, int ow, int oh) float32x4_t fx_0 = vsubq_f32(offset_1, fx); - const int32x4_t in_idx = - vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0)); + const int32x4_t in_idx = vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0)); int32x4_t in_index0 = in_idx; int32x4_t in_index2 = vaddq_s32(in_idx, vcvtq_s32_f32(offset_1)); @@ -408,8 +406,8 @@ image resize_image(image im, int ow, int oh) #endif #else - float scale_x = ( float )(im.w) / (ow); - float scale_y = ( float )(im.h) / (oh); + float scale_x = (float)(im.w) / (ow); + float scale_y = (float)(im.h) / (oh); int w = im.w; int h = im.h; int in_hw = h * w; @@ -480,14 +478,14 @@ image copyMaker(image im, int top, int bottom, int left, int right, float value) void save_image(image im, const char* name) { - char buff[256] = { 0 }; - unsigned char* data = ( unsigned char* )calloc((size_t)im.w * im.h * im.c, sizeof(char)); + char buff[256] = {0}; + unsigned char* data = (unsigned char*)calloc((size_t)im.w * im.h * im.c, sizeof(char)); int i, k; for (k = 0; k < im.c; ++k) { for (i = 0; i < im.w * im.h; ++i) { - data[i * im.c + k] = ( unsigned char )(im.data[i + k * im.w * im.h]); + data[i * im.c + k] = (unsigned char)(im.data[i + k * im.w * im.h]); } } @@ -506,22 +504,22 @@ void save_image(image im, const char* name) switch (f) { - case 0: - strcat(buff, ".jpg"); - case 1: - success = stbi_write_jpg(buff, im.w, im.h, im.c, data, 80); - break; - case 2: - success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w * im.c); - break; - case 3: - success = stbi_write_tga(buff, im.w, im.h, im.c, data); - break; - case 4: - success = stbi_write_bmp(buff, im.w, im.h, im.c, data); - break; - default: - return; + case 0: + strcat(buff, ".jpg"); + case 1: + success = stbi_write_jpg(buff, im.w, im.h, im.c, data, 80); + break; + case 2: + success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w * im.c); + break; + case 3: + success = stbi_write_tga(buff, im.w, im.h, im.c, data); + break; + case 4: + success = stbi_write_bmp(buff, im.w, im.h, im.c, data); + break; + default: + return; } free(data); if (!success) @@ -586,7 +584,7 @@ static float get_pixelBychannel(image m, int x, int y, int c) image copy_image(image p) { image copy = p; - copy.data = ( float* )calloc((size_t)p.h * p.w * p.c, sizeof(float)); + copy.data = (float*)calloc((size_t)p.h * p.w * p.c, sizeof(float)); memcpy(copy.data, p.data, (unsigned long)p.h * p.w * p.c * sizeof(float)); return copy; } @@ -642,7 +640,8 @@ image imread2post(const char* filename) { image im = load_image_stb(filename, 0); const int len = im.c * im.h * im.w; - for (int i = 0; i < len; ++i) { + for (int i = 0; i < len; ++i) + { im.data[i] *= 255; } return im; @@ -651,20 +650,21 @@ image imread2post(const char* filename) image rgb2bgr_permute(image src) { const int len = src.c * src.h * src.w; - float* GRB = ( float* )malloc(sizeof(float) * len); + float* GRB = (float*)malloc(sizeof(float) * len); for (int c = 0; c < src.c; c++) { for (int h = 0; h < src.h; h++) { for (int w = 0; w < src.w; w++) { - int newIndex = ( c )*src.h * src.w + h * src.w + w; + int newIndex = (c)*src.h * src.w + h * src.w + w; int grbIndex = (2 - c) * src.h * src.w + h * src.w + w; GRB[grbIndex] = src.data[newIndex]; } } } - for (int i = 0; i < len; ++i) { + for (int i = 0; i < len; ++i) + { src.data[i] = GRB[i]; } free(GRB); @@ -673,14 +673,14 @@ image rgb2bgr_permute(image src) image image_permute(image src) { - float* GRB = ( float* )malloc(sizeof(float) * src.c * src.h * src.w); + float* GRB = (float*)malloc(sizeof(float) * src.c * src.h * src.w); for (int c = 0; c < src.c; c++) { for (int h = 0; h < src.h; h++) { for (int w = 0; w < src.w; w++) { - int newIndex = ( c )*src.h * src.w + h * src.w + w; + int newIndex = (c)*src.h * src.w + h * src.w + w; int grbIndex = (2 - c) * src.h * src.w + h * src.w + w; GRB[grbIndex] = src.data[newIndex]; } @@ -696,7 +696,7 @@ image gray2bgr(image src) res.c = 3; res.h = src.h; res.w = src.w; - res.data = ( float* )malloc(sizeof(float) * 3 * src.h * src.w); + res.data = (float*)malloc(sizeof(float) * 3 * src.h * src.w); for (int x = 0; x < src.h; x++) { for (int y = 0; y < src.w; y++) @@ -714,7 +714,7 @@ image gray2bgr(image src) image tranpose(image src) { int size = src.c * src.h * src.w; - float* tempData = ( float* )malloc(sizeof(float) * size); + float* tempData = (float*)malloc(sizeof(float) * size); int index = 0; for (int c = 0; c < src.c; c++) @@ -811,7 +811,7 @@ image rgb2gray(image src) res.h = src.h; res.w = src.w; res.c = 1; - res.data = ( float* )malloc(sizeof(float) * res.h * res.w); + res.data = (float*)malloc(sizeof(float) * res.h * res.w); for (int i = 0; i < res.h; i++) { for (int j = 0; j < res.w; j++) @@ -838,7 +838,7 @@ image letterbox(image im, int w, int h) { int ow = im.w; int oh = im.h; - if ((( float )w / im.w) < (( float )h / im.h)) + if (((float)w / im.w) < ((float)h / im.h)) { ow = w; oh = (im.h * w) / im.w; @@ -853,7 +853,7 @@ image letterbox(image im, int w, int h) boxed.w = w; boxed.h = h; boxed.c = im.c; - boxed.data = ( float* )malloc(sizeof(float) * im.c * h * w); + boxed.data = (float*)malloc(sizeof(float) * im.c * h * w); for (int i = 0; i < boxed.c * boxed.h * boxed.w; i++) { @@ -868,20 +868,20 @@ image letterbox(image im, int w, int h) void tengine_resize_f32(float* data, float* res, int ow, int oh, int c, int h, int w) { - float _scale_x = ( float )(w) / ( float )(ow); - float _scale_y = ( float )(h) / ( float )(oh); + float _scale_x = (float)(w) / (float)(ow); + float _scale_y = (float)(h) / (float)(oh); float offset = 0.5f; - int16_t* buf = ( int16_t* )malloc((ow + ow + ow + oh + oh + oh) * sizeof(int16_t)); - int16_t* xCoef = ( int16_t* )(buf); - int16_t* xPos = ( int16_t* )(buf + ow + ow); - int16_t* yCoef = ( int16_t* )(buf + ow + ow + ow); - int16_t* yPos = ( int16_t* )(buf + ow + ow + ow + oh + oh); + int16_t* buf = (int16_t*)malloc((ow + ow + ow + oh + oh + oh) * sizeof(int16_t)); + int16_t* xCoef = (int16_t*)(buf); + int16_t* xPos = (int16_t*)(buf + ow + ow); + int16_t* yCoef = (int16_t*)(buf + ow + ow + ow); + int16_t* yPos = (int16_t*)(buf + ow + ow + ow + oh + oh); for (int i = 0; i < ow; i++) { - float fx = ( float )((( float )i + offset) * _scale_x - offset); - int sx = ( int )fx; + float fx = (float)(((float)i + offset) * _scale_x - offset); + int sx = (int)fx; fx -= sx; if (sx < 0) { @@ -900,8 +900,8 @@ void tengine_resize_f32(float* data, float* res, int ow, int oh, int c, int h, i for (int j = 0; j < oh; j++) { - float fy = ( float )((( float )j + offset) * _scale_y - offset); - int sy = ( int )fy; + float fy = (float)(((float)j + offset) * _scale_y - offset); + int sy = (int)fy; fy -= sy; if (sy < 0) { @@ -919,7 +919,7 @@ void tengine_resize_f32(float* data, float* res, int ow, int oh, int c, int h, i } // int32_t* row = new int32_t[ow + ow]; - int32_t* row = ( int32_t* )malloc((ow + ow) * sizeof(int32_t)); + int32_t* row = (int32_t*)malloc((ow + ow) * sizeof(int32_t)); for (int k = 0; k < c; k++) { @@ -1019,7 +1019,7 @@ static void sort_cls_score(cls_score* array, int left, int right) void print_topk(float* data, int total_num, int topk) { - cls_score* cls_scores = ( cls_score* )malloc(total_num * sizeof(cls_score)); + cls_score* cls_scores = (cls_score*)malloc(total_num * sizeof(cls_score)); for (int i = 0; i < total_num; i++) { cls_scores[i].id = i; diff --git a/examples/common/test_nnie_all.hpp b/examples/common/test_nnie_all.hpp index e10151554..4c55dbe73 100644 --- a/examples/common/test_nnie_all.hpp +++ b/examples/common/test_nnie_all.hpp @@ -33,29 +33,29 @@ #include "mpi_nnie.h" /*16Byte align*/ -#define TEST_NNIE_ALIGN_16 16 -#define TEST_NNIE_ALIGN16(u32Num) ((u32Num + TEST_NNIE_ALIGN_16 - 1) / TEST_NNIE_ALIGN_16 * TEST_NNIE_ALIGN_16) -#define TEST_NNIE_COORDI_NUM 4 /*coordinate numbers*/ -#define TEST_NNIE_QUANT_BASE 4096 /*the base value*/ -#define TEST_NNIE_PROPOSAL_WIDTH 6 /*the number of proposal values*/ -#define TEST_NNIE_SSD_REPORT_NODE_NUM 12 -#define TEST_NNIE_MAX_SOFTWARE_MEM_NUM 4 -#define TEST_NNIE_SSD_REPORT_NODE_NUM 12 -#define TEST_NNIE_SSD_PRIORBOX_NUM 6 -#define TEST_NNIE_SSD_SOFTMAX_NUM 6 -#define TEST_NNIE_SSD_ASPECT_RATIO_NUM 6 -#define TEST_NNIE_YOLOV3_REPORT_BLOB_NUM 3 /*yolov3 report blob num*/ +#define TEST_NNIE_ALIGN_16 16 +#define TEST_NNIE_ALIGN16(u32Num) ((u32Num + TEST_NNIE_ALIGN_16 - 1) / TEST_NNIE_ALIGN_16 * TEST_NNIE_ALIGN_16) +#define TEST_NNIE_COORDI_NUM 4 /*coordinate numbers*/ +#define TEST_NNIE_QUANT_BASE 4096 /*the base value*/ +#define TEST_NNIE_PROPOSAL_WIDTH 6 /*the number of proposal values*/ +#define TEST_NNIE_SSD_REPORT_NODE_NUM 12 +#define TEST_NNIE_MAX_SOFTWARE_MEM_NUM 4 +#define TEST_NNIE_SSD_REPORT_NODE_NUM 12 +#define TEST_NNIE_SSD_PRIORBOX_NUM 6 +#define TEST_NNIE_SSD_SOFTMAX_NUM 6 +#define TEST_NNIE_SSD_ASPECT_RATIO_NUM 6 +#define TEST_NNIE_YOLOV3_REPORT_BLOB_NUM 3 /*yolov3 report blob num*/ #define TEST_NNIE_YOLOV3_EACH_BBOX_INFER_RESULT_NUM 85 /*yolov3 inference result num of each bbox*/ -#define TEST_NNIE_YOLOV3_EACH_GRID_BIAS_NUM 6 /*yolov3 bias num of each grid*/ -#define TEST_NNIE_SCORE_NUM 2 /*the num of RPN scores*/ +#define TEST_NNIE_YOLOV3_EACH_GRID_BIAS_NUM 6 /*yolov3 bias num of each grid*/ +#define TEST_NNIE_SCORE_NUM 2 /*the num of RPN scores*/ -#define TEST_NNIE_COORDI_NUM 4 /*coordinate numbers*/ -#define TEST_COORDI_NUM 4 /*num of coordinates*/ -#define TEST_NNIE_HALF 0.5f /*the half value*/ -#define TEST_NNIE_MAX(a, b) (((a) > (b)) ? (a) : (b)) -#define TEST_NNIE_MIN(a, b) (((a) < (b)) ? (a) : (b)) +#define TEST_NNIE_COORDI_NUM 4 /*coordinate numbers*/ +#define TEST_COORDI_NUM 4 /*num of coordinates*/ +#define TEST_NNIE_HALF 0.5f /*the half value*/ +#define TEST_NNIE_MAX(a, b) (((a) > (b)) ? (a) : (b)) +#define TEST_NNIE_MIN(a, b) (((a) < (b)) ? (a) : (b)) -#define TEST_NNIE_SIGMOID(x) (HI_FLOAT)(1.0f / (1 + fast_exp(-x))) +#define TEST_NNIE_SIGMOID(x) (HI_FLOAT)(1.0f / (1 + fast_exp(-x))) #define TEST_NNIE_SIGMOID_NOEXP(x) (HI_FLOAT)(1.0f / (1 + x)) inline float32x4_t vexpq10_f32(float32x4_t x) @@ -74,7 +74,7 @@ inline float32x4_t vexpq10_f32(float32x4_t x) return x; } -void fast_exp_4f(const float *a, float *xx) +void fast_exp_4f(const float* a, float* xx) { float32x4_t x = vld1q_f32(a); x = vexpq10_f32(x); @@ -120,14 +120,14 @@ typedef struct hiTEST_NNIE_FASTERRCNN_SOFTWARE_PARAM_S HI_U32 u32ClassNum; HI_U32 au32ConfThresh[21]; HI_U32 u32ValidNmsThresh; - HI_S32 *aps32Conv[2]; + HI_S32* aps32Conv[2]; SVP_MEM_INFO_S stRpnTmpBuf; SVP_DST_BLOB_S stRpnBbox; SVP_DST_BLOB_S stClassRoiNum; SVP_DST_BLOB_S stDstRoi; SVP_DST_BLOB_S stDstScore; SVP_MEM_INFO_S stGetResultTmpBuf; - HI_CHAR *apcRpnDataLayerName[2]; + HI_CHAR* apcRpnDataLayerName[2]; } TEST_NNIE_FASTERRCNN_SOFTWARE_PARAM_S; typedef struct hiTEST_NNIE_CNN_GETTOPN_UNIT_S @@ -270,7 +270,7 @@ typedef struct hiTEST_NNIE_STACK HI_S32 s32Max; } TEST_NNIE_STACK_S; -HI_S32 SAMPLE_COMM_SVP_MallocMem(const HI_CHAR *pszMmb, const HI_CHAR *pszZone, HI_U64 *pu64PhyAddr, HI_VOID **ppvVirAddr, HI_U32 u32Size) +HI_S32 SAMPLE_COMM_SVP_MallocMem(const HI_CHAR* pszMmb, const HI_CHAR* pszZone, HI_U64* pu64PhyAddr, HI_VOID** ppvVirAddr, HI_U32 u32Size) { HI_S32 s32Ret = HI_SUCCESS; diff --git a/examples/cpp_tm_classification.cpp b/examples/cpp_tm_classification.cpp index f5cb0d3a7..d4451f1cb 100644 --- a/examples/cpp_tm_classification.cpp +++ b/examples/cpp_tm_classification.cpp @@ -35,15 +35,15 @@ #include "tengine_cpp_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 227 -#define DEFAULT_IMG_W 227 -#define DEFAULT_SCALE1 1.f -#define DEFAULT_SCALE2 1.f -#define DEFAULT_SCALE3 1.f -#define DEFAULT_MEAN1 104.007 -#define DEFAULT_MEAN2 116.669 -#define DEFAULT_MEAN3 122.679 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 227 +#define DEFAULT_IMG_W 227 +#define DEFAULT_SCALE1 1.f +#define DEFAULT_SCALE2 1.f +#define DEFAULT_SCALE3 1.f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 using namespace std; @@ -54,7 +54,8 @@ void show_usage() << " [-m model_file] [-l label_file] [-i image_file]\n" << " [-g img_h,img_w] [-s scale] [-w mean[0],mean[1],mean[2]] [-r repeat_count]\n"; - std::cout << "\nmobilenet example: \n" << " ./classification -m /path/to/mobilenet.tmfile -l /path/to/labels.txt -i /path/to/img.jpg -g 224,224 -s 0.017 -w 104.007,116.669,122.679" << std::endl; + std::cout << "\nmobilenet example: \n" + << " ./classification -m /path/to/mobilenet.tmfile -l /path/to/labels.txt -i /path/to/img.jpg -g 224,224 -s 0.017 -w 104.007,116.669,122.679" << std::endl; } int main(int argc, char* argv[]) @@ -74,34 +75,34 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -112,7 +113,7 @@ int main(int argc, char* argv[]) return -1; } - if(image_file.empty()) + if (image_file.empty()) { std::cerr << "Error: Image file not specified!" << std::endl; show_usage(); @@ -120,15 +121,15 @@ int main(int argc, char* argv[]) } // check input files - if(!check_file_exist(model_file.c_str()) || !check_file_exist(image_file.c_str())) + if (!check_file_exist(model_file.c_str()) || !check_file_exist(image_file.c_str())) return -1; - if(img_h == 0) + if (img_h == 0) { img_h = DEFAULT_IMG_H; std::cout << "Image height not specified, use default [" << DEFAULT_IMG_H << "]" << std::endl; } - if(img_w == 0) + if (img_w == 0) { img_w = DEFAULT_IMG_W; std::cout << "Image width not specified, use default [" << DEFAULT_IMG_W << "]" << std::endl; @@ -140,7 +141,7 @@ int main(int argc, char* argv[]) scale[2] = DEFAULT_SCALE3; std::cout << "Scale value not specified, use default [" << scale[0] << ", " << scale[1] << ", " << scale[2] << "]" << std::endl; } - if(mean[0] == -1.0 || mean[1] == -1.0 || mean[2] == -1.0) + if (mean[0] == -1.0 || mean[1] == -1.0 || mean[2] == -1.0) { mean[0] = DEFAULT_MEAN1; mean[1] = DEFAULT_MEAN2; @@ -169,7 +170,7 @@ int main(int argc, char* argv[]) /* prepare input data */ input_tensor.create(1, 3, img_h, img_w); - get_input_data(image_file.c_str(), ( float* )input_tensor.data, img_h, img_w, mean, scale); + get_input_data(image_file.c_str(), (float*)input_tensor.data, img_h, img_w, mean, scale); /* forward */ somenet.input_tensor("data", input_tensor); @@ -196,7 +197,7 @@ int main(int argc, char* argv[]) somenet.extract_tensor("prob", output_tensor); /* after process */ - print_topk(( float* )output_tensor.data, output_tensor.elem_num, 5); + print_topk((float*)output_tensor.data, output_tensor.elem_num, 5); std::cout << "--------------------------------------\n"; std::cout << "ALL TEST DONE\n"; } diff --git a/examples/cpp_tm_mobilenet_ssd.cpp b/examples/cpp_tm_mobilenet_ssd.cpp index b85fe348b..28ad5ceaf 100644 --- a/examples/cpp_tm_mobilenet_ssd.cpp +++ b/examples/cpp_tm_mobilenet_ssd.cpp @@ -55,16 +55,16 @@ typedef struct Box void post_process_ssd(const string image_file, float threshold, const float* outdata, int num) { - const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", - "bus", "car", "cat", "chair", "cow", "diningtable", - "dog", "horse", "motorbike", "person", "pottedplant", "sheep", - "sofa", "train", "tvmonitor"}; + const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", + "bus", "car", "cat", "chair", "cow", "diningtable", + "dog", "horse", "motorbike", "person", "pottedplant", "sheep", + "sofa", "train", "tvmonitor"}; image im = imread(image_file.c_str()); int raw_h = im.h; int raw_w = im.w; -// struct vector* boxes = create_vector(sizeof(Box_t), nullptr); + // struct vector* boxes = create_vector(sizeof(Box_t), nullptr); std::vector boxes; fprintf(stderr, "detect result num: %d \n", num); @@ -122,23 +122,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -150,7 +150,7 @@ int main(int argc, char* argv[]) return -1; } - if(image_file.empty()) + if (image_file.empty()) { std::cerr << "Error: Image file not specified!" << std::endl; show_usage(); @@ -184,7 +184,7 @@ int main(int argc, char* argv[]) /* prepare input data */ input_tensor.create(1, 3, img_h, img_w); - get_input_data(image_file.c_str(), ( float* )input_tensor.data, img_h, img_w, mean, scale); + get_input_data(image_file.c_str(), (float*)input_tensor.data, img_h, img_w, mean, scale); /* forward */ somenet.input_tensor("data", input_tensor); @@ -211,7 +211,7 @@ int main(int argc, char* argv[]) somenet.extract_tensor("detection_out", output_tensor); /* SSD process */ - post_process_ssd(image_file, show_threshold, ( float* )output_tensor.data, output_tensor.h); + post_process_ssd(image_file, show_threshold, (float*)output_tensor.data, output_tensor.h); } /* release */ diff --git a/examples/tm_alphapose.cpp b/examples/tm_alphapose.cpp index 4be2296a4..5776993b5 100644 --- a/examples/tm_alphapose.cpp +++ b/examples/tm_alphapose.cpp @@ -34,14 +34,14 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 320 -#define DEFAULT_IMG_W 256 -#define DEFAULT_SCALE1 (0.0039216) -#define DEFAULT_SCALE2 (0.0039215) -#define DEFAULT_SCALE3 (0.0039215) -#define DEFAULT_MEAN1 0.406 -#define DEFAULT_MEAN2 0.457 -#define DEFAULT_MEAN3 0.480 +#define DEFAULT_IMG_H 320 +#define DEFAULT_IMG_W 256 +#define DEFAULT_SCALE1 (0.0039216) +#define DEFAULT_SCALE2 (0.0039215) +#define DEFAULT_SCALE3 (0.0039215) +#define DEFAULT_MEAN1 0.406 +#define DEFAULT_MEAN2 0.457 +#define DEFAULT_MEAN3 0.480 #define DEFAULT_REPEAT_COUNT 1 #define DEFAULT_THREAD_COUNT 1 @@ -51,7 +51,7 @@ using predict_t = std::tuple; const float s_keypoint_thresh = 0.2; -cv::Mat get_3rd_point(const cv::Mat & a, const cv::Mat & b) +cv::Mat get_3rd_point(const cv::Mat& a, const cv::Mat& b) { auto direct = a - b; cv::Mat result(direct.size(), direct.type()); @@ -60,13 +60,13 @@ cv::Mat get_3rd_point(const cv::Mat & a, const cv::Mat & b) return result; } -cv::Mat get_input_data_pose(const char * img_file_path) +cv::Mat get_input_data_pose(const char* img_file_path) { cv::Mat img = cv::imread(img_file_path); cv::cvtColor(img, img, cv::COLOR_BGR2RGB); img.convertTo(img, CV_32FC3); - float* img_data = ( float* )img.data; + float* img_data = (float*)img.data; float means[3]{DEFAULT_MEAN1, DEFAULT_MEAN2, DEFAULT_MEAN3}; float scales[3]{DEFAULT_SCALE1, DEFAULT_SCALE2, DEFAULT_SCALE3}; @@ -85,11 +85,11 @@ cv::Mat get_input_data_pose(const char * img_file_path) return std::move(img); } -cv::Mat crop_box(const cv::Mat & org_img, - const pt_t & up_left, - const pt_t & bottom_right, - const int & input_res_h, - const int & input_res_w) +cv::Mat crop_box(const cv::Mat& org_img, + const pt_t& up_left, + const pt_t& bottom_right, + const int& input_res_h, + const int& input_res_w) { auto img = org_img.clone(); @@ -144,16 +144,16 @@ cv::Mat crop_box(const cv::Mat & org_img, return std::move(dst_img); } -float * pre_process_pose(cv::Mat & img, - const std::vector & boxes, - std::vector & pt1, - std::vector & pt2) +float* pre_process_pose(cv::Mat& img, + const std::vector& boxes, + std::vector& pt1, + std::vector& pt2) { const int img_height = img.rows; const int img_width = img.cols; - float * predict_data = (float *) malloc (boxes.size() * DEFAULT_IMG_H * DEFAULT_IMG_W * 3 * sizeof(float)); - float * p_data = predict_data; + float* predict_data = (float*)malloc(boxes.size() * DEFAULT_IMG_H * DEFAULT_IMG_W * 3 * sizeof(float)); + float* p_data = predict_data; for (size_t i = 0; i < boxes.size(); i++) { @@ -167,10 +167,8 @@ float * pre_process_pose(cv::Mat & img, up_left[0] = std::max(0.f, up_left[0] - box_wt * scale_rate / 2); up_left[1] = std::max(0.f, up_left[1] - box_ht * scale_rate / 2); - bottom_right[0] = - std::max(std::min(img_width - 1.f, bottom_right[0] + box_wt * scale_rate / 2), up_left[0] + 5); - bottom_right[1] = - std::max(std::min(img_height - 1.f, bottom_right[1] + box_ht * scale_rate / 2), up_left[1] + 5); + bottom_right[0] = std::max(std::min(img_width - 1.f, bottom_right[0] + box_wt * scale_rate / 2), up_left[0] + 5); + bottom_right[1] = std::max(std::min(img_height - 1.f, bottom_right[1] + box_ht * scale_rate / 2), up_left[1] + 5); auto inp = crop_box(img, up_left, bottom_right, DEFAULT_IMG_H, DEFAULT_IMG_W); //HWC -> CHW @@ -192,8 +190,8 @@ float * pre_process_pose(cv::Mat & img, return predict_data; } -cv::Mat transform_box_invert_batch(cv::Mat & pt, - const std::vector & ul, const std::vector & br, +cv::Mat transform_box_invert_batch(cv::Mat& pt, + const std::vector& ul, const std::vector& br, const int& input_res_h, const int& input_res_w, const int& output_res_h, const int& output_res_w) { @@ -204,8 +202,8 @@ cv::Mat transform_box_invert_batch(cv::Mat & pt, for (size_t i = 0; i < center.size(); i++) { - auto & len_h_element = len_h[i]; - auto & len_w_element = len_w[i]; + auto& len_h_element = len_h[i]; + auto& len_w_element = len_w[i]; len_h_element = std::numeric_limits::min(); for (size_t j = 0; j < std::tuple_size::value; j++) { @@ -221,10 +219,9 @@ cv::Mat transform_box_invert_batch(cv::Mat & pt, len_h_element = size[i][j]; } } - len_w_element = len_h_element * (input_res_w * 1.f / input_res_h); + len_w_element = len_h_element * (input_res_w * 1.f / input_res_h); } - auto clamp_min_func = [](float v, float min = 0.f) - { + auto clamp_min_func = [](float v, float min = 0.f) { if (v < min) return min; return v; }; @@ -248,16 +245,15 @@ cv::Mat transform_box_invert_batch(cv::Mat & pt, return std::move(new_point); } -predict_t get_predict(float * hm_data, +predict_t get_predict(float* hm_data, const int hm_dims[4], - const std::vector & pt1, - const std::vector & pt2, - const int & input_res_h, - const int & input_res_w) + const std::vector& pt1, + const std::vector& pt2, + const int& input_res_h, + const int& input_res_w) { // Get Keypoint location from heatmap - auto get_hm_data = [](float * data, const int data_dims[4], const std::array ele_dims) - { + auto get_hm_data = [](float* data, const int data_dims[4], const std::array ele_dims) { return *(data + ele_dims[0] * data_dims[1] * data_dims[2] * data_dims[3] + ele_dims[1] * data_dims[2] * data_dims[3] @@ -265,14 +261,14 @@ predict_t get_predict(float * hm_data, + ele_dims[3]); }; - cv::Mat preds(hm_dims[0], hm_dims[1], CV_32FC2); + cv::Mat preds(hm_dims[0], hm_dims[1], CV_32FC2); cv::Mat maxval(hm_dims[0], hm_dims[1], CV_32FC1); for (int i = 0; i < hm_dims[0]; i++) { for (int j = 0; j < hm_dims[1]; j++) { - float * start_iter = hm_data + i * hm_dims[1] * hm_dims[2] * hm_dims[3] + j * hm_dims[2] * hm_dims[3]; + float* start_iter = hm_data + i * hm_dims[1] * hm_dims[2] * hm_dims[3] + j * hm_dims[2] * hm_dims[3]; auto max_element = std::max_element(start_iter, start_iter + hm_dims[2] * hm_dims[3]); preds.ptr(i, j)->val[0] = preds.ptr(i, j)->val[1] = std::distance(start_iter, max_element) + 1; maxval.at(i, j) = *max_element; @@ -301,10 +297,11 @@ predict_t get_predict(float * hm_data, && (0 < pY) && (pY < (hm_dims[3] - 1))) { - auto sign_func = [](float x) - { - if (x > 0.) x = 1.f; - else if (x < 0.) x = -1.f; + auto sign_func = [](float x) { + if (x > 0.) + x = 1.f; + else if (x < 0.) + x = -1.f; return x; }; @@ -322,8 +319,8 @@ predict_t get_predict(float * hm_data, return std::make_tuple(preds, preds_tf, maxval); } -void post_process_pose(const char * image_file, - float * heatmap_data, int heatmap_dims[4], +void post_process_pose(const char* image_file, + float* heatmap_data, int heatmap_dims[4], const std::vector& pt1, const std::vector& pt2) { cv::Mat preds_hm, preds_scores; @@ -348,7 +345,7 @@ void show_usage() fprintf(stderr, "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); } -bool tengine_predict(float * input_data, graph_t graph, const int input_dims[4], const int & num_thread, const int & loop_count) +bool tengine_predict(float* input_data, graph_t graph, const int input_dims[4], const int& num_thread, const int& loop_count) { /* set runtime options */ struct options opt; @@ -423,23 +420,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -482,13 +479,13 @@ int main(int argc, char* argv[]) int img_width = input_tensor.cols; // support multi-roi boxes later - std::vector boxes {{0,0, static_cast(img_width - 1), static_cast(img_height - 1)}}; + std::vector boxes{{0, 0, static_cast(img_width - 1), static_cast(img_height - 1)}}; std::vector pt1, pt2; pt1.resize(boxes.size()); pt2.resize(boxes.size()); // pre-process - float * input_data = pre_process_pose(input_tensor, boxes, pt1, pt2); + float* input_data = pre_process_pose(input_tensor, boxes, pt1, pt2); int input_dims[] = {static_cast(boxes.size()), 3, DEFAULT_IMG_H, DEFAULT_IMG_W}; // nchw // run prediction @@ -503,7 +500,7 @@ int main(int argc, char* argv[]) int heatmap_dims[MAX_SHAPE_DIM_NUM] = {0}; get_tensor_shape(output_tensor, heatmap_dims, MAX_SHAPE_DIM_NUM); - post_process_pose(image_file, (float *)get_tensor_buffer(output_tensor), heatmap_dims, pt1, pt2); + post_process_pose(image_file, (float*)get_tensor_buffer(output_tensor), heatmap_dims, pt1, pt2); if (input_data) { diff --git a/examples/tm_classification.c b/examples/tm_classification.c index ceda33270..f4d33d878 100644 --- a/examples/tm_classification.c +++ b/examples/tm_classification.c @@ -29,15 +29,15 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 224 -#define DEFAULT_IMG_W 224 -#define DEFAULT_SCALE1 0.017f -#define DEFAULT_SCALE2 0.017f -#define DEFAULT_SCALE3 0.017f -#define DEFAULT_MEAN1 104.007 -#define DEFAULT_MEAN2 116.669 -#define DEFAULT_MEAN3 122.679 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 224 +#define DEFAULT_IMG_W 224 +#define DEFAULT_SCALE1 0.017f +#define DEFAULT_SCALE2 0.017f +#define DEFAULT_SCALE3 0.017f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 #define DEFAULT_CPU_AFFINITY 255 @@ -69,8 +69,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* set the shape, data buffer of input_tensor of the graph */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -89,7 +89,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -131,7 +131,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* get the result of classification */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); print_topk(output_data, output_size, 5); @@ -176,37 +176,37 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'a': - cpu_affinity = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'a': + cpu_affinity = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_classification_acl.c b/examples/tm_classification_acl.c index f6a1cbff5..0d1fbc4c4 100644 --- a/examples/tm_classification_acl.c +++ b/examples/tm_classification_acl.c @@ -29,15 +29,15 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 227 -#define DEFAULT_IMG_W 227 -#define DEFAULT_SCALE1 1.f -#define DEFAULT_SCALE2 1.f -#define DEFAULT_SCALE3 1.f -#define DEFAULT_MEAN1 104.007 -#define DEFAULT_MEAN2 116.669 -#define DEFAULT_MEAN3 122.679 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 227 +#define DEFAULT_IMG_W 227 +#define DEFAULT_SCALE1 1.f +#define DEFAULT_SCALE2 1.f +#define DEFAULT_SCALE3 1.f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 #define DEFAULT_CPU_AFFINITY 255 @@ -78,8 +78,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* set the shape, data buffer of input_tensor of the graph */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -98,7 +98,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -140,7 +140,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* get the result of classification */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); print_topk(output_data, output_size, 5); @@ -185,37 +185,37 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'a': - cpu_affinity = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'a': + cpu_affinity = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_classification_cuda.cpp b/examples/tm_classification_cuda.cpp index 7166305ab..943643564 100644 --- a/examples/tm_classification_cuda.cpp +++ b/examples/tm_classification_cuda.cpp @@ -29,15 +29,15 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 227 -#define DEFAULT_IMG_W 227 -#define DEFAULT_SCALE1 1.f -#define DEFAULT_SCALE2 1.f -#define DEFAULT_SCALE3 1.f -#define DEFAULT_MEAN1 104.007 -#define DEFAULT_MEAN2 116.669 -#define DEFAULT_MEAN3 122.679 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 227 +#define DEFAULT_IMG_W 227 +#define DEFAULT_SCALE1 1.f +#define DEFAULT_SCALE2 1.f +#define DEFAULT_SCALE3 1.f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 #define DEFAULT_CPU_AFFINITY 255 @@ -58,7 +58,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, return -1; } fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version()); - + /* create NVIDIA CUDA backend */ context_t cuda_context = create_context("cuda", 1); int rtt = add_context_device(cuda_context, "CUDA"); @@ -66,7 +66,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, { fprintf(stderr, " add_context_device NV CUDA DEVICE failed.\n"); return -1; - } + } /* create graph, load tengine model xxx.tmfile */ graph_t graph = create_graph(cuda_context, "tengine", model_file); @@ -78,8 +78,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* set the shape, data buffer of input_tensor of the graph */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -98,7 +98,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -140,7 +140,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* get the result of classification */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); print_topk(output_data, output_size, 5); @@ -185,37 +185,37 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'a': - cpu_affinity = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'a': + cpu_affinity = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_classification_fp16.c b/examples/tm_classification_fp16.c index 3108679e6..870fb5756 100644 --- a/examples/tm_classification_fp16.c +++ b/examples/tm_classification_fp16.c @@ -30,22 +30,22 @@ #include "tengine_operations.h" #include "compiler_fp16.h" -#define DEFAULT_IMG_H 227 -#define DEFAULT_IMG_W 227 -#define DEFAULT_SCALE1 1.f -#define DEFAULT_SCALE2 1.f -#define DEFAULT_SCALE3 1.f -#define DEFAULT_MEAN1 104.007 -#define DEFAULT_MEAN2 116.669 -#define DEFAULT_MEAN3 122.679 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 227 +#define DEFAULT_IMG_W 227 +#define DEFAULT_SCALE1 1.f +#define DEFAULT_SCALE2 1.f +#define DEFAULT_SCALE3 1.f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 void get_input_fp16_data(const char* image_file, __fp16* input_data, int img_h, int img_w, float* mean, float* scale) { image img = imread_process(image_file, img_w, img_h, mean, scale); - float* image_data = ( float* )img.data; + float* image_data = (float*)img.data; for (int i = 0; i < img_w * img_h * 3; i++) input_data[i] = fp32_to_fp16(image_data[i]); @@ -81,8 +81,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - __fp16* input_data = ( __fp16* )malloc(img_size * sizeof(__fp16)); + int dims[] = {1, 3, img_h, img_w}; // nchw + __fp16* input_data = (__fp16*)malloc(img_size * sizeof(__fp16)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -141,11 +141,11 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* get the result of classification */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - __fp16* output_fp16 = ( __fp16* )get_tensor_buffer(output_tensor); + __fp16* output_fp16 = (__fp16*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(__fp16); /* cast fp16 to fp32 */ - float* output_data = ( float* )malloc(output_size * sizeof(float)); + float* output_data = (float*)malloc(output_size * sizeof(float)); for (int i = 0; i < output_size; i++) output_data[i] = fp16_to_fp32(output_fp16[i]); @@ -193,34 +193,34 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_classification_int8.c b/examples/tm_classification_int8.c index 734b68a09..098a2fa3e 100644 --- a/examples/tm_classification_int8.c +++ b/examples/tm_classification_int8.c @@ -29,23 +29,23 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 224 -#define DEFAULT_IMG_W 224 -#define DEFAULT_SCALE1 0.017f -#define DEFAULT_SCALE2 0.017f -#define DEFAULT_SCALE3 0.017f -#define DEFAULT_MEAN1 104.007 -#define DEFAULT_MEAN2 116.669 -#define DEFAULT_MEAN3 122.679 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 224 +#define DEFAULT_IMG_W 224 +#define DEFAULT_SCALE1 0.017f +#define DEFAULT_SCALE2 0.017f +#define DEFAULT_SCALE3 0.017f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 void get_input_int8_data(const char* image_file, int8_t* input_data, int img_h, int img_w, float* mean, float* scale, - float input_scale) + float input_scale) { image img = imread_process(image_file, img_w, img_h, mean, scale); - float* image_data = ( float* )img.data; + float* image_data = (float*)img.data; for (int i = 0; i < img_w * img_h * 3; i++) { @@ -89,8 +89,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - int8_t* input_data = ( int8_t* )malloc(img_size); + int dims[] = {1, 3, img_h, img_w}; // nchw + int8_t* input_data = (int8_t*)malloc(img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -152,16 +152,16 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* get the result of classification */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - int8_t* output_i8 = ( int8_t* )get_tensor_buffer(output_tensor); + int8_t* output_i8 = (int8_t*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor); /* dequant */ float output_scale = 0.f; int output_zero_point = 0; get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); + float* output_data = (float*)malloc(output_size * sizeof(float)); for (int i = 0; i < output_size; i++) - output_data[i] = ( float )output_i8[i]* output_scale; + output_data[i] = (float)output_i8[i] * output_scale; print_topk(output_data, output_size, 5); fprintf(stderr, "--------------------------------------\n"); @@ -207,34 +207,34 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_classification_timvx.c b/examples/tm_classification_timvx.c index b759c8ab0..4d81c25d4 100644 --- a/examples/tm_classification_timvx.c +++ b/examples/tm_classification_timvx.c @@ -29,15 +29,15 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 227 -#define DEFAULT_IMG_W 227 -#define DEFAULT_SCALE1 1.f -#define DEFAULT_SCALE2 1.f -#define DEFAULT_SCALE3 1.f -#define DEFAULT_MEAN1 104.007 -#define DEFAULT_MEAN2 116.669 -#define DEFAULT_MEAN3 122.679 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 227 +#define DEFAULT_IMG_W 227 +#define DEFAULT_SCALE1 1.f +#define DEFAULT_SCALE2 1.f +#define DEFAULT_SCALE3 1.f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 #define DEFAULT_CPU_AFFINITY 255 @@ -46,7 +46,7 @@ void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h { image img = imread_process(image_file, img_w, img_h, mean, scale); - float* image_data = ( float* )img.data; + float* image_data = (float*)img.data; for (int i = 0; i < img_w * img_h * 3; i++) { @@ -63,7 +63,7 @@ void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h } int tengine_classify(const char* model_file, const char* image_file, int img_h, int img_w, float* mean, float* scale, - int loop_count, int num_thread, int affinity) + int loop_count, int num_thread, int affinity) { /* set runtime options */ struct options opt; @@ -99,8 +99,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - uint8_t* input_data = ( uint8_t* )malloc(img_size); + int dims[] = {1, 3, img_h, img_w}; // nchw + uint8_t* input_data = (uint8_t*)malloc(img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -119,7 +119,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -164,16 +164,16 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* get the result of classification */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )get_tensor_buffer(output_tensor); + uint8_t* output_u8 = (uint8_t*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor); /* dequant */ float output_scale = 0.f; int output_zero_point = 0; get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); + float* output_data = (float*)malloc(output_size * sizeof(float)); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; print_topk(output_data, output_size, 5); fprintf(stderr, "--------------------------------------\n"); @@ -218,37 +218,37 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'a': - cpu_affinity = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'a': + cpu_affinity = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_classification_trt.cpp b/examples/tm_classification_trt.cpp index 64d0cd861..4ebed2402 100644 --- a/examples/tm_classification_trt.cpp +++ b/examples/tm_classification_trt.cpp @@ -29,19 +29,18 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 227 -#define DEFAULT_IMG_W 227 -#define DEFAULT_SCALE1 1.f -#define DEFAULT_SCALE2 1.f -#define DEFAULT_SCALE3 1.f -#define DEFAULT_MEAN1 104.007 -#define DEFAULT_MEAN2 116.669 -#define DEFAULT_MEAN3 122.679 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 227 +#define DEFAULT_IMG_W 227 +#define DEFAULT_SCALE1 1.f +#define DEFAULT_SCALE2 1.f +#define DEFAULT_SCALE3 1.f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 #define DEFAULT_CPU_AFFINITY 255 - int tengine_classify(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean, const float* scale, int loop_count, int num_thread, int affinity) { @@ -59,7 +58,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, return -1; } fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version()); - + /* create NVIDIA TensorRT backend */ context_t trt_context = create_context("trt", 1); int rtt = add_context_device(trt_context, "TensorRT"); @@ -79,8 +78,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* set the shape, data buffer of input_tensor of the graph */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -99,7 +98,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -141,7 +140,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* get the result of classification */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); print_topk(output_data, output_size, 5); @@ -186,37 +185,37 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'a': - cpu_affinity = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'a': + cpu_affinity = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_classification_uint8.c b/examples/tm_classification_uint8.c index 1a59bfdc7..dbf11c32e 100644 --- a/examples/tm_classification_uint8.c +++ b/examples/tm_classification_uint8.c @@ -29,15 +29,15 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 224 -#define DEFAULT_IMG_W 224 -#define DEFAULT_SCALE1 0.017f -#define DEFAULT_SCALE2 0.017f -#define DEFAULT_SCALE3 0.017f -#define DEFAULT_MEAN1 104.007 -#define DEFAULT_MEAN2 116.669 -#define DEFAULT_MEAN3 122.679 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 224 +#define DEFAULT_IMG_W 224 +#define DEFAULT_SCALE1 0.017f +#define DEFAULT_SCALE2 0.017f +#define DEFAULT_SCALE3 0.017f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h, int img_w, float* mean, float* scale, @@ -45,7 +45,7 @@ void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h { image img = imread_process(image_file, img_w, img_h, mean, scale); - float* image_data = ( float* )img.data; + float* image_data = (float*)img.data; for (int i = 0; i < img_w * img_h * 3; i++) { @@ -89,8 +89,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - uint8_t* input_data = ( uint8_t* )malloc(img_size); + int dims[] = {1, 3, img_h, img_w}; // nchw + uint8_t* input_data = (uint8_t*)malloc(img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -109,7 +109,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -154,16 +154,16 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* get the result of classification */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )get_tensor_buffer(output_tensor); + uint8_t* output_u8 = (uint8_t*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor); /* dequant */ float output_scale = 0.f; int output_zero_point = 0; get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); + float* output_data = (float*)malloc(output_size * sizeof(float)); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; print_topk(output_data, output_size, 5); fprintf(stderr, "--------------------------------------\n"); @@ -207,34 +207,34 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_classification_vulkan.c b/examples/tm_classification_vulkan.c index 97960681e..792e3ac32 100644 --- a/examples/tm_classification_vulkan.c +++ b/examples/tm_classification_vulkan.c @@ -29,15 +29,15 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 227 -#define DEFAULT_IMG_W 227 -#define DEFAULT_SCALE1 1.f -#define DEFAULT_SCALE2 1.f -#define DEFAULT_SCALE3 1.f -#define DEFAULT_MEAN1 104.007 -#define DEFAULT_MEAN2 116.669 -#define DEFAULT_MEAN3 122.679 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 227 +#define DEFAULT_IMG_W 227 +#define DEFAULT_SCALE1 1.f +#define DEFAULT_SCALE2 1.f +#define DEFAULT_SCALE3 1.f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 int tengine_classify(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean, @@ -72,8 +72,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -132,7 +132,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h, /* get the result of classification */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); print_topk(output_data, output_size, 5); @@ -178,34 +178,34 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_crnn.cpp b/examples/tm_crnn.cpp index c7c1ed0b7..ee3e2b1ed 100644 --- a/examples/tm_crnn.cpp +++ b/examples/tm_crnn.cpp @@ -57,7 +57,7 @@ void get_input_data_cv(const cv::Mat& sample, float* input_data, int img_h, int { cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB); } - else if (sample.channels() == 3 && img_c == 3 && swapRB == 1) + else if (sample.channels() == 3 && img_c == 3 && swapRB == 1) { cv::cvtColor(sample, img, cv::COLOR_BGR2RGB); } @@ -75,7 +75,7 @@ void get_input_data_cv(const cv::Mat& sample, float* input_data, int img_h, int img.convertTo(img, CV_32FC3); else if (img_c == 1) img.convertTo(img, CV_32FC1); - float* img_data = ( float* )img.data; + float* img_data = (float*)img.data; int hw = img_h * img_w; for (int h = 0; h < img_h; h++) { @@ -158,26 +158,26 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'l': - label_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'l': + label_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -238,7 +238,7 @@ int main(int argc, char* argv[]) int img_size = img_h * img_w * 1; int dims[] = {1, 1, img_h, img_w}; - float* input_data = ( float* )malloc(img_size * sizeof(float)); + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == nullptr) @@ -293,7 +293,7 @@ int main(int argc, char* argv[]) /* process the crnn result */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* ocr_data = ( float* )get_tensor_buffer(output_tensor); + float* ocr_data = (float*)get_tensor_buffer(output_tensor); process_crnn_result(ocr_data, label_file); free(input_data); diff --git a/examples/tm_efficientdet.c b/examples/tm_efficientdet.c index 73daeeeb0..26ed21f55 100644 --- a/examples/tm_efficientdet.c +++ b/examples/tm_efficientdet.c @@ -30,19 +30,18 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 512 -#define DEFAULT_IMG_W 512 -#define DEFAULT_SCALE1 0.017124754f -#define DEFAULT_SCALE2 0.017507003f -#define DEFAULT_SCALE3 0.017429194f -#define DEFAULT_MEAN1 123.675 -#define DEFAULT_MEAN2 116.280 -#define DEFAULT_MEAN3 103.530 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 512 +#define DEFAULT_IMG_W 512 +#define DEFAULT_SCALE1 0.017124754f +#define DEFAULT_SCALE2 0.017507003f +#define DEFAULT_SCALE3 0.017429194f +#define DEFAULT_MEAN1 123.675 +#define DEFAULT_MEAN2 116.280 +#define DEFAULT_MEAN3 103.530 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 #define DEFAULT_CPU_AFFINITY 255 - typedef struct Box { int x0; @@ -53,20 +52,22 @@ typedef struct Box float score; } Box_t; - -void qsort_descent_inplace(Box_t* boxes, int left, int right) { +void qsort_descent_inplace(Box_t* boxes, int left, int right) +{ int i = left; int j = right; float p = boxes[(left + right) / 2].score; - while (i <= j) { + while (i <= j) + { while (boxes[i].score > p) i++; while (boxes[j].score < p) j--; - if (i <= j) { + if (i <= j) + { // swap Box_t tmp = boxes[i]; boxes[i] = boxes[j]; @@ -90,23 +91,26 @@ void qsort_descent_inplace(Box_t* boxes, int left, int right) { } } - -int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_threshold) { +int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_threshold) +{ int num_outputs = num_boxes; float* areas = malloc(num_boxes * sizeof(float)); - for (int i = 0; i < num_boxes; i++) { - areas[i] = (float) ((boxes[i].x1 - boxes[i].x0) * (boxes[i].y1 - boxes[i].y0)); + for (int i = 0; i < num_boxes; i++) + { + areas[i] = (float)((boxes[i].x1 - boxes[i].x0) * (boxes[i].y1 - boxes[i].y0)); } - for (int i = 0; i < num_boxes; i++) { + for (int i = 0; i < num_boxes; i++) + { const Box_t a = boxes[i]; if (suppressed[i] == 1) continue; - for (int j = i + 1; j < num_boxes; j++) { + for (int j = i + 1; j < num_boxes; j++) + { const Box_t b = boxes[j]; if (suppressed[j] == 1) @@ -117,10 +121,13 @@ int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_thre float total_area = (a.x1 - a.x0) * (a.y1 - a.y0) + (b.x1 - b.x0) * (b.y1 - b.y0) - intersection; float iou = fmaxf(intersection / total_area, 0); - if (iou > nms_threshold){ + if (iou > nms_threshold) + { suppressed[j] = 1; num_outputs--; - } else{ + } + else + { suppressed[j] = 0; } } @@ -130,54 +137,62 @@ int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_thre return num_outputs; } - -float* arange(int start, int end, float stride) { - int length = (int) ((float) ceilf((float) (end - start) / stride)); +float* arange(int start, int end, float stride) +{ + int length = (int)((float)ceilf((float)(end - start) / stride)); float* result = malloc(length * sizeof(float)); - result[0] = (float) start; - for (int i = 1; i < length; i++) { + result[0] = (float)start; + for (int i = 1; i < length; i++) + { result[i] = result[i - 1] + stride; } return result; } - void tile(const float* arr, int arr_length, int times, float offset, - float* result, int arr_starts_from, int arr_stride) { + float* result, int arr_starts_from, int arr_stride) +{ int length = arr_length * times; - if (result == NULL) { + if (result == NULL) + { result = malloc(length * sizeof(float)); arr_starts_from = 0; } - for (int i = 0, j = 0; i < length; i++, j += arr_stride) { + for (int i = 0, j = 0; i < length; i++, j += arr_stride) + { result[j + arr_starts_from] = arr[i % arr_length] + offset; } } void repeat(const float* arr, int arr_length, int times, float offset, - float* result, int arr_starts_from, int arr_stride) { + float* result, int arr_starts_from, int arr_stride) +{ int length = arr_length * times; - if (result == NULL) { + if (result == NULL) + { result = malloc(length * sizeof(float)); arr_starts_from = 0; } - for (int i = 0, j = 0; i < length; i++, j += arr_stride) { + for (int i = 0, j = 0; i < length; i++, j += arr_stride) + { result[j + arr_starts_from] = arr[i / times] + offset; } } - -int argmax(const float* arr, int arr_starts_from, int arr_length) { +int argmax(const float* arr, int arr_starts_from, int arr_length) +{ float max_value = arr[arr_starts_from]; int max_idx = 0; - for (int i = 1; i < arr_length; i++) { + for (int i = 1; i < arr_length; i++) + { float this_value = arr[arr_starts_from + i]; - if (this_value > max_value) { + if (this_value > max_value) + { max_value = this_value; max_idx = i; } @@ -185,28 +200,27 @@ int argmax(const float* arr, int arr_starts_from, int arr_length) { return max_idx; } - int tengine_detect(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean, - const float* scale, int loop_count, int num_thread, int affinity) + const float* scale, int loop_count, int num_thread, int affinity) { /* setup network */ const char* CLASSES_NAME[] = {"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", - "fire hydrant", "", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", - "cow", "elephant", "bear", "zebra", "giraffe", "", "backpack", "umbrella", "", "", "handbag", "tie", - "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", - "skateboard", "surfboard", "tennis racket", "bottle", "", "wine glass", "cup", "fork", "knife", "spoon", - "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", - "cake", "chair", "couch", "potted plant", "bed", "", "dining table", "", "", "toilet", "", "tv", - "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", - "refrigerator", "", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", - "toothbrush"}; + "fire hydrant", "", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", + "cow", "elephant", "bear", "zebra", "giraffe", "", "backpack", "umbrella", "", "", "handbag", "tie", + "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", "bottle", "", "wine glass", "cup", "fork", "knife", "spoon", + "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", + "cake", "chair", "couch", "potted plant", "bed", "", "dining table", "", "", "toilet", "", "tv", + "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", + "refrigerator", "", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", + "toothbrush"}; int PYRAMID_LEVELS[] = {3, 4, 5, 6, 7}; int STRIDES[] = {8, 16, 32, 64, 128}; float SCALES[] = { - (float) pow(2, 0.), - (float) pow(2, 1. / 3.), - (float) pow(2, 2. / 3.), + (float)pow(2, 0.), + (float)pow(2, 1. / 3.), + (float)pow(2, 2. / 3.), }; float RATIOS_X[] = {1.f, 1.4f, 0.7f}; float RATIOS_Y[] = {1.f, 0.7f, 1.4f}; @@ -243,8 +257,8 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in /* set the shape, data buffer of input_tensor of the graph */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -285,16 +299,19 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in int resized_h, resized_w; float resize_scale; image resImg; - if (raw_h > raw_w){ + if (raw_h > raw_w) + { resized_h = img_h; - resized_w = (int) ((float) img_h / raw_h * raw_w); + resized_w = (int)((float)img_h / raw_h * raw_w); resImg = resize_image(im, resized_w, img_h); - resize_scale = (float) raw_h / img_h; - } else{ + resize_scale = (float)raw_h / img_h; + } + else + { resized_w = img_w; - resized_h = (int) ((float) img_w / raw_w * raw_h); + resized_h = (int)((float)img_w / raw_w * raw_h); resImg = resize_image(im, img_w, resized_h); - resize_scale = (float) raw_w / img_w; + resize_scale = (float)raw_w / img_w; } free_image(im); @@ -334,11 +351,11 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in /* get the result of classification */ tensor_t output_tensor_regression = get_graph_output_tensor(graph, 0, 0); - float* output_data_regression = ( float* )get_tensor_buffer(output_tensor_regression); + float* output_data_regression = (float*)get_tensor_buffer(output_tensor_regression); int num_anchors = get_tensor_buffer_size(output_tensor_regression) / sizeof(float) / 4; tensor_t output_tensor_classification = get_graph_output_tensor(graph, 1, 0); - float* output_data_classification = ( float* )get_tensor_buffer(output_tensor_classification); + float* output_data_classification = (float*)get_tensor_buffer(output_tensor_classification); int num_classes = get_tensor_buffer_size(output_tensor_classification) / sizeof(float) / num_anchors; // postprocess @@ -349,21 +366,24 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in float* anchors_y1 = malloc(num_anchors * sizeof(float)); int anchor_idx = 0; - for (int stride_idx = 0; stride_idx < num_levels; stride_idx++) { + for (int stride_idx = 0; stride_idx < num_levels; stride_idx++) + { int stride = STRIDES[stride_idx]; - float arange_stride = powf(2, (float) PYRAMID_LEVELS[stride_idx]); - int length_x = (int) ceilf(((float) img_w - (float) stride / 2) / (float) arange_stride); - int length_y = (int) ceilf(((float) img_h - (float) stride / 2) / (float) arange_stride); + float arange_stride = powf(2, (float)PYRAMID_LEVELS[stride_idx]); + int length_x = (int)ceilf(((float)img_w - (float)stride / 2) / (float)arange_stride); + int length_y = (int)ceilf(((float)img_h - (float)stride / 2) / (float)arange_stride); float* x = arange(stride / 2, img_w, arange_stride); float* y = arange(stride / 2, img_h, arange_stride); int start_idx = anchor_idx; int num_anchor_types = num_scales * num_ratios; - for (int i = 0; i < num_scales; i++) { + for (int i = 0; i < num_scales; i++) + { float anchor_scale = SCALES[i]; - float base_anchor_size = ANCHOR_SCALE * (float) stride * anchor_scale; + float base_anchor_size = ANCHOR_SCALE * (float)stride * anchor_scale; - for (int j = 0; j < num_ratios; j++) { + for (int j = 0; j < num_ratios; j++) + { float ratio_x = RATIOS_X[j]; float ratio_y = RATIOS_Y[j]; @@ -391,14 +411,16 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in int num_proposals_over_threshold = 0; #pragma omp parallel for num_threads(opt.num_thread) - for (int i = 0; i < num_anchors; i++) { + for (int i = 0; i < num_anchors; i++) + { // loop over anchors // confidence int max_idx = argmax(output_data_classification, i * num_classes, num_classes); float max_score = output_data_classification[i * num_classes + max_idx]; - if (isinf(max_score) || max_score < CONFIDENCE_THRESHOLD){ + if (isinf(max_score) || max_score < CONFIDENCE_THRESHOLD) + { proposals[i].class_idx = -1; continue; } @@ -429,24 +451,25 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in xmax *= resize_scale; // clipping - xmin = fmaxf(fminf(xmin, (float) (raw_w - 1)), 0.f); - xmax = fmaxf(fminf(xmax, (float) (raw_w - 1)), 0.f); - ymin = fmaxf(fminf(ymin, (float) (raw_h - 1)), 0.f); - ymax = fmaxf(fminf(ymax, (float) (raw_h - 1)), 0.f); + xmin = fmaxf(fminf(xmin, (float)(raw_w - 1)), 0.f); + xmax = fmaxf(fminf(xmax, (float)(raw_w - 1)), 0.f); + ymin = fmaxf(fminf(ymin, (float)(raw_h - 1)), 0.f); + ymax = fmaxf(fminf(ymax, (float)(raw_h - 1)), 0.f); // area filtering float area = (xmax - xmin) * (ymax - ymin); - if (area < 4){ + if (area < 4) + { proposals[i].class_idx = -1; continue; } num_proposals_over_threshold++; - proposals[i].x0 = (int) xmin; - proposals[i].x1 = (int) xmax; - proposals[i].y0 = (int) ymin; - proposals[i].y1 = (int) ymax; + proposals[i].x0 = (int)xmin; + proposals[i].x1 = (int)xmax; + proposals[i].y0 = (int)ymin; + proposals[i].y1 = (int)ymax; } free(anchors_x0); free(anchors_x1); @@ -456,16 +479,18 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in // filter boxes with confidence threshold Box_t* proposals_over_threshold = malloc(sizeof(Box_t) * num_proposals_over_threshold); int proposals_over_threshold_idx = 0; - for (int i = 0; i < num_anchors; i++) { + for (int i = 0; i < num_anchors; i++) + { Box_t box = proposals[i]; - if(box.class_idx == -1) + if (box.class_idx == -1) continue; proposals_over_threshold[proposals_over_threshold_idx] = box; proposals_over_threshold_idx++; } free(proposals); - if (num_proposals_over_threshold > 0){ + if (num_proposals_over_threshold > 0) + { // sort boxes qsort_descent_inplace(proposals_over_threshold, 0, num_proposals_over_threshold - 1); @@ -474,9 +499,10 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in int num_outputs = nms(proposals_over_threshold, num_proposals_over_threshold, suppressed, NMS_THRESHOLD); Box_t* proposals_after_nms = malloc(num_outputs * sizeof(Box_t)); int proposals_after_nms_idx = 0; - for(int i = 0; i < num_proposals_over_threshold; i++){ + for (int i = 0; i < num_proposals_over_threshold; i++) + { Box_t box = proposals_over_threshold[i]; - if(suppressed[i] == 1) + if (suppressed[i] == 1) continue; proposals_after_nms[proposals_after_nms_idx] = box; proposals_after_nms_idx++; @@ -536,37 +562,37 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'a': - cpu_affinity = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'a': + cpu_affinity = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_efficientdet_uint8.c b/examples/tm_efficientdet_uint8.c index f25aa64c5..cc61bfea0 100644 --- a/examples/tm_efficientdet_uint8.c +++ b/examples/tm_efficientdet_uint8.c @@ -28,19 +28,18 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 512 -#define DEFAULT_IMG_W 512 -#define DEFAULT_SCALE1 0.017124754f -#define DEFAULT_SCALE2 0.017507003f -#define DEFAULT_SCALE3 0.017429194f -#define DEFAULT_MEAN1 123.675 -#define DEFAULT_MEAN2 116.280 -#define DEFAULT_MEAN3 103.530 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 512 +#define DEFAULT_IMG_W 512 +#define DEFAULT_SCALE1 0.017124754f +#define DEFAULT_SCALE2 0.017507003f +#define DEFAULT_SCALE3 0.017429194f +#define DEFAULT_MEAN1 123.675 +#define DEFAULT_MEAN2 116.280 +#define DEFAULT_MEAN3 103.530 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 #define DEFAULT_CPU_AFFINITY 255 - typedef struct Box { int x0; @@ -51,20 +50,22 @@ typedef struct Box float score; } Box_t; - -void qsort_descent_inplace(Box_t* boxes, int left, int right) { +void qsort_descent_inplace(Box_t* boxes, int left, int right) +{ int i = left; int j = right; float p = boxes[(left + right) / 2].score; - while (i <= j) { + while (i <= j) + { while (boxes[i].score > p) i++; while (boxes[j].score < p) j--; - if (i <= j) { + if (i <= j) + { // swap Box_t tmp = boxes[i]; boxes[i] = boxes[j]; @@ -88,23 +89,26 @@ void qsort_descent_inplace(Box_t* boxes, int left, int right) { } } - -int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_threshold) { +int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_threshold) +{ int num_outputs = num_boxes; float* areas = malloc(num_boxes * sizeof(float)); - for (int i = 0; i < num_boxes; i++) { - areas[i] = (float) ((boxes[i].x1 - boxes[i].x0) * (boxes[i].y1 - boxes[i].y0)); + for (int i = 0; i < num_boxes; i++) + { + areas[i] = (float)((boxes[i].x1 - boxes[i].x0) * (boxes[i].y1 - boxes[i].y0)); } - for (int i = 0; i < num_boxes; i++) { + for (int i = 0; i < num_boxes; i++) + { const Box_t a = boxes[i]; if (suppressed[i] == 1) continue; - for (int j = i + 1; j < num_boxes; j++) { + for (int j = i + 1; j < num_boxes; j++) + { const Box_t b = boxes[j]; if (suppressed[j] == 1) @@ -115,10 +119,13 @@ int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_thre float total_area = (a.x1 - a.x0) * (a.y1 - a.y0) + (b.x1 - b.x0) * (b.y1 - b.y0) - intersection; float iou = fmaxf(intersection / total_area, 0); - if (iou > nms_threshold){ + if (iou > nms_threshold) + { suppressed[j] = 1; num_outputs--; - } else{ + } + else + { suppressed[j] = 0; } } @@ -128,54 +135,62 @@ int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_thre return num_outputs; } - -float* arange(int start, int end, float stride) { - int length = (int) ((float) ceilf((float) (end - start) / stride)); +float* arange(int start, int end, float stride) +{ + int length = (int)((float)ceilf((float)(end - start) / stride)); float* result = malloc(length * sizeof(float)); - result[0] = (float) start; - for (int i = 1; i < length; i++) { + result[0] = (float)start; + for (int i = 1; i < length; i++) + { result[i] = result[i - 1] + stride; } return result; } - void tile(const float* arr, int arr_length, int times, float offset, - float* result, int arr_starts_from, int arr_stride) { + float* result, int arr_starts_from, int arr_stride) +{ int length = arr_length * times; - if (result == NULL) { + if (result == NULL) + { result = malloc(length * sizeof(float)); arr_starts_from = 0; } - for (int i = 0, j = 0; i < length; i++, j += arr_stride) { + for (int i = 0, j = 0; i < length; i++, j += arr_stride) + { result[j + arr_starts_from] = arr[i % arr_length] + offset; } } void repeat(const float* arr, int arr_length, int times, float offset, - float* result, int arr_starts_from, int arr_stride) { + float* result, int arr_starts_from, int arr_stride) +{ int length = arr_length * times; - if (result == NULL) { + if (result == NULL) + { result = malloc(length * sizeof(float)); arr_starts_from = 0; } - for (int i = 0, j = 0; i < length; i++, j += arr_stride) { + for (int i = 0, j = 0; i < length; i++, j += arr_stride) + { result[j + arr_starts_from] = arr[i / times] + offset; } } - -int argmax(const float* arr, int arr_starts_from, int arr_length) { +int argmax(const float* arr, int arr_starts_from, int arr_length) +{ float max_value = arr[arr_starts_from]; int max_idx = 0; - for (int i = 1; i < arr_length; i++) { + for (int i = 1; i < arr_length; i++) + { float this_value = arr[arr_starts_from + i]; - if (this_value > max_value) { + if (this_value > max_value) + { max_value = this_value; max_idx = i; } @@ -183,28 +198,27 @@ int argmax(const float* arr, int arr_starts_from, int arr_length) { return max_idx; } - int tengine_detect(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean, - const float* scale, int loop_count, int num_thread, int affinity) + const float* scale, int loop_count, int num_thread, int affinity) { /* setup network */ const char* CLASSES_NAME[] = {"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", - "fire hydrant", "", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", - "cow", "elephant", "bear", "zebra", "giraffe", "", "backpack", "umbrella", "", "", "handbag", "tie", - "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", - "skateboard", "surfboard", "tennis racket", "bottle", "", "wine glass", "cup", "fork", "knife", "spoon", - "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", - "cake", "chair", "couch", "potted plant", "bed", "", "dining table", "", "", "toilet", "", "tv", - "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", - "refrigerator", "", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", - "toothbrush"}; + "fire hydrant", "", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", + "cow", "elephant", "bear", "zebra", "giraffe", "", "backpack", "umbrella", "", "", "handbag", "tie", + "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", "bottle", "", "wine glass", "cup", "fork", "knife", "spoon", + "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", + "cake", "chair", "couch", "potted plant", "bed", "", "dining table", "", "", "toilet", "", "tv", + "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", + "refrigerator", "", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", + "toothbrush"}; int PYRAMID_LEVELS[] = {3, 4, 5, 6, 7}; int STRIDES[] = {8, 16, 32, 64, 128}; float SCALES[] = { - (float) pow(2, 0.), - (float) pow(2, 1. / 3.), - (float) pow(2, 2. / 3.), + (float)pow(2, 0.), + (float)pow(2, 1. / 3.), + (float)pow(2, 2. / 3.), }; float RATIOS_X[] = {1.f, 1.4f, 0.7f}; float RATIOS_Y[] = {1.f, 0.7f, 1.4f}; @@ -241,8 +255,8 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in /* set the shape, data buffer of input_tensor of the graph */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - uint8_t* input_data = ( uint8_t* )malloc(img_size * sizeof(uint8_t)); + int dims[] = {1, 3, img_h, img_w}; // nchw + uint8_t* input_data = (uint8_t*)malloc(img_size * sizeof(uint8_t)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -283,16 +297,19 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in int resized_h, resized_w; float resize_scale; image resImg; - if (raw_h > raw_w){ + if (raw_h > raw_w) + { resized_h = img_h; - resized_w = (int) ((float) img_h / raw_h * raw_w); + resized_w = (int)((float)img_h / raw_h * raw_w); resImg = resize_image(im, resized_w, img_h); - resize_scale = (float) raw_h / img_h; - } else{ + resize_scale = (float)raw_h / img_h; + } + else + { resized_w = img_w; - resized_h = (int) ((float) img_w / raw_w * raw_h); + resized_h = (int)((float)img_w / raw_w * raw_h); resImg = resize_image(im, img_w, resized_h); - resize_scale = (float) raw_w / img_w; + resize_scale = (float)raw_w / img_w; } free_image(im); @@ -347,12 +364,12 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in /* get the result of classification */ tensor_t output_tensor_regression = get_graph_output_tensor(graph, 0, 0); - uint8_t* output_data_regression_u8 = ( uint8_t* )get_tensor_buffer(output_tensor_regression); + uint8_t* output_data_regression_u8 = (uint8_t*)get_tensor_buffer(output_tensor_regression); int num_anchors_data = get_tensor_buffer_size(output_tensor_regression); int num_anchors = get_tensor_buffer_size(output_tensor_regression) / 4; tensor_t output_tensor_classification = get_graph_output_tensor(graph, 1, 0); - uint8_t* output_data_classification_u8 = ( uint8_t* )get_tensor_buffer(output_tensor_classification); + uint8_t* output_data_classification_u8 = (uint8_t*)get_tensor_buffer(output_tensor_classification); int num_classes_data = get_tensor_buffer_size(output_tensor_classification); int num_classes = get_tensor_buffer_size(output_tensor_classification) / num_anchors; @@ -360,16 +377,16 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in float output_scale_regression = 0.f; int output_zero_point_regression = 0; get_tensor_quant_param(output_tensor_regression, &output_scale_regression, &output_zero_point_regression, 1); - float* output_data_regression = ( float* )malloc(num_anchors_data * sizeof(float)); + float* output_data_regression = (float*)malloc(num_anchors_data * sizeof(float)); for (int i = 0; i < num_anchors_data; i++) - output_data_regression[i] = (( float )output_data_regression_u8[i] - ( float )output_zero_point_regression) * output_scale_regression; + output_data_regression[i] = ((float)output_data_regression_u8[i] - (float)output_zero_point_regression) * output_scale_regression; float output_scale_classification = 0.f; int output_zero_point_classification = 0; get_tensor_quant_param(output_tensor_classification, &output_scale_classification, &output_zero_point_classification, 1); - float* output_data_classification = ( float* )malloc(num_classes_data * sizeof(float)); + float* output_data_classification = (float*)malloc(num_classes_data * sizeof(float)); for (int i = 0; i < num_classes_data; i++) - output_data_classification[i] = (( float )output_data_classification_u8[i] - ( float )output_zero_point_classification) * output_scale_classification; + output_data_classification[i] = ((float)output_data_classification_u8[i] - (float)output_zero_point_classification) * output_scale_classification; // postprocess // generate anchors @@ -379,21 +396,24 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in float* anchors_y1 = malloc(num_anchors * sizeof(float)); int anchor_idx = 0; - for (int stride_idx = 0; stride_idx < num_levels; stride_idx++) { + for (int stride_idx = 0; stride_idx < num_levels; stride_idx++) + { int stride = STRIDES[stride_idx]; - float arange_stride = powf(2, (float) PYRAMID_LEVELS[stride_idx]); - int length_x = (int) ceilf(((float) img_w - (float) stride / 2) / (float) arange_stride); - int length_y = (int) ceilf(((float) img_h - (float) stride / 2) / (float) arange_stride); + float arange_stride = powf(2, (float)PYRAMID_LEVELS[stride_idx]); + int length_x = (int)ceilf(((float)img_w - (float)stride / 2) / (float)arange_stride); + int length_y = (int)ceilf(((float)img_h - (float)stride / 2) / (float)arange_stride); float* x = arange(stride / 2, img_w, arange_stride); float* y = arange(stride / 2, img_h, arange_stride); int start_idx = anchor_idx; int num_anchor_types = num_scales * num_ratios; - for (int i = 0; i < num_scales; i++) { + for (int i = 0; i < num_scales; i++) + { float anchor_scale = SCALES[i]; - float base_anchor_size = ANCHOR_SCALE * (float) stride * anchor_scale; + float base_anchor_size = ANCHOR_SCALE * (float)stride * anchor_scale; - for (int j = 0; j < num_ratios; j++) { + for (int j = 0; j < num_ratios; j++) + { float ratio_x = RATIOS_X[j]; float ratio_y = RATIOS_Y[j]; @@ -421,14 +441,16 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in int num_proposals_over_threshold = 0; #pragma omp parallel for num_threads(opt.num_thread) - for (int i = 0; i < num_anchors; i++) { + for (int i = 0; i < num_anchors; i++) + { // loop over anchors // confidence int max_idx = argmax(output_data_classification, i * num_classes, num_classes); float max_score = output_data_classification[i * num_classes + max_idx]; - if (isinf(max_score) || max_score < CONFIDENCE_THRESHOLD){ + if (isinf(max_score) || max_score < CONFIDENCE_THRESHOLD) + { proposals[i].class_idx = -1; continue; } @@ -459,24 +481,25 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in xmax *= resize_scale; // clipping - xmin = fmaxf(fminf(xmin, (float) (raw_w - 1)), 0.f); - xmax = fmaxf(fminf(xmax, (float) (raw_w - 1)), 0.f); - ymin = fmaxf(fminf(ymin, (float) (raw_h - 1)), 0.f); - ymax = fmaxf(fminf(ymax, (float) (raw_h - 1)), 0.f); + xmin = fmaxf(fminf(xmin, (float)(raw_w - 1)), 0.f); + xmax = fmaxf(fminf(xmax, (float)(raw_w - 1)), 0.f); + ymin = fmaxf(fminf(ymin, (float)(raw_h - 1)), 0.f); + ymax = fmaxf(fminf(ymax, (float)(raw_h - 1)), 0.f); // area filtering float area = (xmax - xmin) * (ymax - ymin); - if (area < 4){ + if (area < 4) + { proposals[i].class_idx = -1; continue; } num_proposals_over_threshold++; - proposals[i].x0 = (int) xmin; - proposals[i].x1 = (int) xmax; - proposals[i].y0 = (int) ymin; - proposals[i].y1 = (int) ymax; + proposals[i].x0 = (int)xmin; + proposals[i].x1 = (int)xmax; + proposals[i].y0 = (int)ymin; + proposals[i].y1 = (int)ymax; } free(anchors_x0); free(anchors_x1); @@ -486,16 +509,18 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in // filter boxes wiht confidence threshold Box_t* proposals_over_threshold = malloc(sizeof(Box_t) * num_proposals_over_threshold); int proposals_over_threshold_idx = 0; - for (int i = 0; i < num_anchors; i++) { + for (int i = 0; i < num_anchors; i++) + { Box_t box = proposals[i]; - if(box.class_idx == -1) + if (box.class_idx == -1) continue; proposals_over_threshold[proposals_over_threshold_idx] = box; proposals_over_threshold_idx++; } free(proposals); - if (num_proposals_over_threshold > 0){ + if (num_proposals_over_threshold > 0) + { // sort boxes qsort_descent_inplace(proposals_over_threshold, 0, num_proposals_over_threshold - 1); @@ -504,9 +529,10 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in int num_outputs = nms(proposals_over_threshold, num_proposals_over_threshold, suppressed, NMS_THRESHOLD); Box_t* proposals_after_nms = malloc(num_outputs * sizeof(Box_t)); int proposals_after_nms_idx = 0; - for(int i = 0; i < num_proposals_over_threshold; i++){ + for (int i = 0; i < num_proposals_over_threshold; i++) + { Box_t box = proposals_over_threshold[i]; - if(suppressed[i] == 1) + if (suppressed[i] == 1) continue; proposals_after_nms[proposals_after_nms_idx] = box; proposals_after_nms_idx++; @@ -568,37 +594,37 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'a': - cpu_affinity = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'a': + cpu_affinity = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_hrnet.cpp b/examples/tm_hrnet.cpp index c81b0b3d4..bb26f404b 100644 --- a/examples/tm_hrnet.cpp +++ b/examples/tm_hrnet.cpp @@ -39,58 +39,60 @@ #define DEFAULT_REPEAT_COUNT 1 #define DEFAULT_THREAD_COUNT 1 -#define LETTERBOX_ROWS 256 -#define LETTERBOX_COLS 256 -#define MODEL_CHANNELS 3 -#define HEATMAP_CHANNEL 16 +#define LETTERBOX_ROWS 256 +#define LETTERBOX_COLS 256 +#define MODEL_CHANNELS 3 +#define HEATMAP_CHANNEL 16 -typedef struct { +typedef struct +{ float x; float y; float score; } ai_point_t; -struct skeleton { +struct skeleton +{ int connection[2]; int left_right_neutral; }; -std::vector pairs = {{0, 1, 0}, - {1, 2, 0}, - {3, 4, 1}, - {4, 5, 1}, - {2, 6, 0}, - {3, 6, 1}, - {6, 7, 2}, - {7, 8, 2}, - {8, 9, 2}, - {13, 7, 1}, +std::vector pairs = {{0, 1, 0}, + {1, 2, 0}, + {3, 4, 1}, + {4, 5, 1}, + {2, 6, 0}, + {3, 6, 1}, + {6, 7, 2}, + {7, 8, 2}, + {8, 9, 2}, + {13, 7, 1}, {10, 11, 0}, - {7, 12, 0}, + {7, 12, 0}, {12, 11, 0}, {13, 14, 1}, {14, 15, 1}}; - -typedef struct { +typedef struct +{ std::vector keypoints; int32_t img_width = 0; int32_t img_heigh = 0; uint64_t timestamp = 0; } ai_body_parts_s; -void FindMax2D(float *buf, int width, int height, int *max_idx_width, int *max_idx_height, float *max_value, int c) +void FindMax2D(float* buf, int width, int height, int* max_idx_width, int* max_idx_height, float* max_value, int c) { - float *ptr = buf; + float* ptr = buf; *max_value = -10.f; *max_idx_width = 0; *max_idx_height = 0; - for (int h = 0; h < height; h++) + for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) + for (int w = 0; w < width; w++) { float score = ptr[c * height * width + h * height + w]; - if (score > *max_value) + if (score > *max_value) { *max_value = score; *max_idx_height = h; @@ -100,7 +102,7 @@ void FindMax2D(float *buf, int width, int height, int *max_idx_width, int *max_i } } -void PostProcess(float *data, ai_body_parts_s &pose, int img_h, int img_w) +void PostProcess(float* data, ai_body_parts_s& pose, int img_h, int img_w) { int heatmap_width = img_w / 4; int heatmap_height = img_h / 4; @@ -108,21 +110,20 @@ void PostProcess(float *data, ai_body_parts_s &pose, int img_h, int img_w) float max_score; ai_point_t kp; - for (int c = 0; c < HEATMAP_CHANNEL; ++c) + for (int c = 0; c < HEATMAP_CHANNEL; ++c) { FindMax2D(data, heatmap_width, heatmap_height, &max_idx_width, &max_idx_height, &max_score, c); - kp.x = (float) max_idx_width / (float) heatmap_width; - kp.y = (float) max_idx_height / (float) heatmap_height; + kp.x = (float)max_idx_width / (float)heatmap_width; + kp.y = (float)max_idx_height / (float)heatmap_height; kp.score = max_score; pose.keypoints.push_back(kp); std::cout << "x: " << pose.keypoints[c].x * 64 << ", y: " << pose.keypoints[c].y * 64 << ", score: " << pose.keypoints[c].score << std::endl; - } } -void draw_result(cv::Mat img, ai_body_parts_s &pose) +void draw_result(cv::Mat img, ai_body_parts_s& pose) { /* recover process to draw */ float scale_letterbox; @@ -145,8 +146,8 @@ void draw_result(cv::Mat img, ai_body_parts_s &pose) for (int i = 0; i < HEATMAP_CHANNEL; i++) { - int x = (int) ((pose.keypoints[i].x * LETTERBOX_COLS - tmp_w) * ratio_x); - int y = (int) ((pose.keypoints[i].y * LETTERBOX_ROWS - tmp_h) * ratio_y); + int x = (int)((pose.keypoints[i].x * LETTERBOX_COLS - tmp_w) * ratio_x); + int y = (int)((pose.keypoints[i].y * LETTERBOX_ROWS - tmp_h) * ratio_y); x = std::max(std::min(x, (img.cols - 1)), 0); y = std::max(std::min(y, (img.rows - 1)), 0); @@ -157,24 +158,24 @@ void draw_result(cv::Mat img, ai_body_parts_s &pose) cv::Scalar color; cv::Point pt1; cv::Point pt2; - for (auto &element: pairs) + for (auto& element : pairs) { - switch(element.left_right_neutral) + switch (element.left_right_neutral) { - case 0: - color = cv::Scalar(255, 0, 0); - break; - case 1: - color = cv::Scalar(0, 0, 255); - break; - default: - color = cv::Scalar(0, 255, 0); + case 0: + color = cv::Scalar(255, 0, 0); + break; + case 1: + color = cv::Scalar(0, 0, 255); + break; + default: + color = cv::Scalar(0, 255, 0); } - int x1 = (int) ((pose.keypoints[element.connection[0]].x * LETTERBOX_COLS - tmp_w) * ratio_x); - int y1 = (int) ((pose.keypoints[element.connection[0]].y * LETTERBOX_ROWS - tmp_h) * ratio_y); - int x2 = (int) ((pose.keypoints[element.connection[1]].x * LETTERBOX_COLS - tmp_w) * ratio_x); - int y2 = (int) ((pose.keypoints[element.connection[1]].y * LETTERBOX_ROWS - tmp_h) * ratio_y); + int x1 = (int)((pose.keypoints[element.connection[0]].x * LETTERBOX_COLS - tmp_w) * ratio_x); + int y1 = (int)((pose.keypoints[element.connection[0]].y * LETTERBOX_ROWS - tmp_h) * ratio_y); + int x2 = (int)((pose.keypoints[element.connection[1]].x * LETTERBOX_COLS - tmp_w) * ratio_x); + int y2 = (int)((pose.keypoints[element.connection[1]].y * LETTERBOX_ROWS - tmp_h) * ratio_y); x1 = std::max(std::min(x1, (img.cols - 1)), 0); y1 = std::max(std::min(y1, (img.rows - 1)), 0); @@ -187,7 +188,7 @@ void draw_result(cv::Mat img, ai_body_parts_s &pose) } } -void get_input_fp32_data_square(const char *image_file, float *input_data, float *mean, float *scale) +void get_input_fp32_data_square(const char* image_file, float* input_data, float* mean, float* scale) { cv::Mat img = cv::imread(image_file); @@ -215,15 +216,15 @@ void get_input_fp32_data_square(const char *image_file, float *input_data, float int right = (LETTERBOX_COLS - resize_cols + 1) / 2; // Letterbox filling cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); -// cv::imwrite("hrnet_lb_image.jpg", img_new); // for letterbox test - float *img_data = (float *) img_new.data; + // cv::imwrite("hrnet_lb_image.jpg", img_new); // for letterbox test + float* img_data = (float*)img_new.data; /* nhwc to nchw */ - for (int h = 0; h < LETTERBOX_ROWS; h++) + for (int h = 0; h < LETTERBOX_ROWS; h++) { - for (int w = 0; w < LETTERBOX_COLS; w++) + for (int w = 0; w < LETTERBOX_COLS; w++) { - for (int c = 0; c < MODEL_CHANNELS; c++) + for (int c = 0; c < MODEL_CHANNELS; c++) { int in_index = h * LETTERBOX_COLS * MODEL_CHANNELS + w * MODEL_CHANNELS + c; int out_index = c * LETTERBOX_ROWS * LETTERBOX_COLS + h * LETTERBOX_COLS + w; @@ -233,17 +234,17 @@ void get_input_fp32_data_square(const char *image_file, float *input_data, float } } -void show_usage() +void show_usage() { fprintf(stderr, "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { int repeat_count = DEFAULT_REPEAT_COUNT; int num_thread = DEFAULT_THREAD_COUNT; - char *model_file = nullptr; - char *image_file = nullptr; + char* model_file = nullptr; + char* image_file = nullptr; int img_h = LETTERBOX_COLS; int img_w = LETTERBOX_ROWS; ai_body_parts_s pose; @@ -252,39 +253,39 @@ int main(int argc, char *argv[]) float scale[3] = {0.017125f, 0.017507f, 0.017429f}; int res; - while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1) + while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1) { - switch (res) + switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } /* check files */ - if (model_file == nullptr) + if (model_file == nullptr) { fprintf(stderr, "Error: Tengine model file not specified!\n"); show_usage(); return -1; } - if (image_file == nullptr) + if (image_file == nullptr) { fprintf(stderr, "Error: Image file not specified!\n"); show_usage(); @@ -311,7 +312,7 @@ int main(int argc, char *argv[]) /* create graph, load tengine model xxx.tmfile */ graph_t graph = create_graph(nullptr, "tengine", model_file); - if (graph == nullptr) + if (graph == nullptr) { fprintf(stderr, "Create graph failed.\n"); return -1; @@ -319,30 +320,30 @@ int main(int argc, char *argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw + int dims[] = {1, 3, img_h, img_w}; // nchw std::vector input_data(img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); - if (input_tensor == nullptr) + if (input_tensor == nullptr) { fprintf(stderr, "Get input tensor failed\n"); return -1; } - if (set_tensor_shape(input_tensor, dims, 4) < 0) + if (set_tensor_shape(input_tensor, dims, 4) < 0) { fprintf(stderr, "Set input tensor shape failed\n"); return -1; } - if (set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float)) < 0) + if (set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float)) < 0) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; } /* prerun graph, set work options(num_thread, cluster, precision) */ - if (prerun_graph_multithread(graph, opt) < 0) + if (prerun_graph_multithread(graph, opt) < 0) { fprintf(stderr, "Prerun multithread graph failed.\n"); return -1; @@ -355,10 +356,10 @@ int main(int argc, char *argv[]) double min_time = DBL_MAX; double max_time = DBL_MIN; double total_time = 0.; - for (int i = 0; i < repeat_count; i++) + for (int i = 0; i < repeat_count; i++) { double start = get_current_time(); - if (run_graph(graph, 1) < 0) + if (run_graph(graph, 1) < 0) { fprintf(stderr, "Run graph failed\n"); return -1; @@ -370,11 +371,11 @@ int main(int argc, char *argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat [%d] min %.3f ms, max %.3f ms, avg %.3f ms\n", repeat_count, min_time, max_time, - total_time / repeat_count); + total_time / repeat_count); /* get output tensor */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float *data = (float *) (get_tensor_buffer(output_tensor)); + float* data = (float*)(get_tensor_buffer(output_tensor)); PostProcess(data, pose, img_h, img_w); diff --git a/examples/tm_hrnet_timvx.cpp b/examples/tm_hrnet_timvx.cpp index a677b05ea..296ce5b91 100644 --- a/examples/tm_hrnet_timvx.cpp +++ b/examples/tm_hrnet_timvx.cpp @@ -37,58 +37,60 @@ #define DEFAULT_REPEAT_COUNT 1 #define DEFAULT_THREAD_COUNT 1 -#define LETTERBOX_ROWS 256 -#define LETTERBOX_COLS 256 -#define MODEL_CHANNELS 3 -#define HEATMAP_CHANNEL 16 +#define LETTERBOX_ROWS 256 +#define LETTERBOX_COLS 256 +#define MODEL_CHANNELS 3 +#define HEATMAP_CHANNEL 16 -typedef struct { +typedef struct +{ float x; float y; float score; } ai_point_t; -struct skeleton { +struct skeleton +{ int connection[2]; int left_right_neutral; }; -std::vector pairs = {{0, 1, 0}, - {1, 2, 0}, - {3, 4, 1}, - {4, 5, 1}, - {2, 6, 0}, - {3, 6, 1}, - {6, 7, 2}, - {7, 8, 2}, - {8, 9, 2}, - {13, 7, 1}, +std::vector pairs = {{0, 1, 0}, + {1, 2, 0}, + {3, 4, 1}, + {4, 5, 1}, + {2, 6, 0}, + {3, 6, 1}, + {6, 7, 2}, + {7, 8, 2}, + {8, 9, 2}, + {13, 7, 1}, {10, 11, 0}, - {7, 12, 0}, + {7, 12, 0}, {12, 11, 0}, {13, 14, 1}, {14, 15, 1}}; - -typedef struct { +typedef struct +{ std::vector keypoints; int32_t img_width = 0; int32_t img_heigh = 0; uint64_t timestamp = 0; } ai_body_parts_s; -void FindMax2D(float *buf, int width, int height, int *max_idx_width, int *max_idx_height, float *max_value, int c) +void FindMax2D(float* buf, int width, int height, int* max_idx_width, int* max_idx_height, float* max_value, int c) { - float *ptr = buf; + float* ptr = buf; *max_value = -10.f; *max_idx_width = 0; *max_idx_height = 0; - for (int h = 0; h < height; h++) + for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) + for (int w = 0; w < width; w++) { float score = ptr[c * height * width + h * height + w]; - if (score > *max_value) + if (score > *max_value) { *max_value = score; *max_idx_height = h; @@ -98,7 +100,7 @@ void FindMax2D(float *buf, int width, int height, int *max_idx_width, int *max_i } } -void PostProcess(float *data, ai_body_parts_s &pose, int img_h, int img_w) +void PostProcess(float* data, ai_body_parts_s& pose, int img_h, int img_w) { int heatmap_width = img_w / 4; int heatmap_height = img_h / 4; @@ -106,21 +108,20 @@ void PostProcess(float *data, ai_body_parts_s &pose, int img_h, int img_w) float max_score; ai_point_t kp; - for (int c = 0; c < HEATMAP_CHANNEL; ++c) + for (int c = 0; c < HEATMAP_CHANNEL; ++c) { FindMax2D(data, heatmap_width, heatmap_height, &max_idx_width, &max_idx_height, &max_score, c); - kp.x = (float) max_idx_width / (float) heatmap_width; - kp.y = (float) max_idx_height / (float) heatmap_height; + kp.x = (float)max_idx_width / (float)heatmap_width; + kp.y = (float)max_idx_height / (float)heatmap_height; kp.score = max_score; pose.keypoints.push_back(kp); std::cout << "x: " << pose.keypoints[c].x * 64 << ", y: " << pose.keypoints[c].y * 64 << ", score: " << pose.keypoints[c].score << std::endl; - } } -void draw_result(cv::Mat img, ai_body_parts_s &pose) +void draw_result(cv::Mat img, ai_body_parts_s& pose) { /* recover process to draw */ float scale_letterbox; @@ -143,8 +144,8 @@ void draw_result(cv::Mat img, ai_body_parts_s &pose) for (int i = 0; i < HEATMAP_CHANNEL; i++) { - int x = (int) ((pose.keypoints[i].x * LETTERBOX_COLS - tmp_w) * ratio_x); - int y = (int) ((pose.keypoints[i].y * LETTERBOX_ROWS - tmp_h) * ratio_y); + int x = (int)((pose.keypoints[i].x * LETTERBOX_COLS - tmp_w) * ratio_x); + int y = (int)((pose.keypoints[i].y * LETTERBOX_ROWS - tmp_h) * ratio_y); x = std::max(std::min(x, (img.cols - 1)), 0); y = std::max(std::min(y, (img.rows - 1)), 0); @@ -155,24 +156,24 @@ void draw_result(cv::Mat img, ai_body_parts_s &pose) cv::Scalar color; cv::Point pt1; cv::Point pt2; - for (auto &element: pairs) + for (auto& element : pairs) { - switch(element.left_right_neutral) + switch (element.left_right_neutral) { - case 0: - color = cv::Scalar(255, 0, 0); - break; - case 1: - color = cv::Scalar(0, 0, 255); - break; - default: - color = cv::Scalar(0, 255, 0); + case 0: + color = cv::Scalar(255, 0, 0); + break; + case 1: + color = cv::Scalar(0, 0, 255); + break; + default: + color = cv::Scalar(0, 255, 0); } - int x1 = (int) ((pose.keypoints[element.connection[0]].x * LETTERBOX_COLS - tmp_w) * ratio_x); - int y1 = (int) ((pose.keypoints[element.connection[0]].y * LETTERBOX_ROWS - tmp_h) * ratio_y); - int x2 = (int) ((pose.keypoints[element.connection[1]].x * LETTERBOX_COLS - tmp_w) * ratio_x); - int y2 = (int) ((pose.keypoints[element.connection[1]].y * LETTERBOX_ROWS - tmp_h) * ratio_y); + int x1 = (int)((pose.keypoints[element.connection[0]].x * LETTERBOX_COLS - tmp_w) * ratio_x); + int y1 = (int)((pose.keypoints[element.connection[0]].y * LETTERBOX_ROWS - tmp_h) * ratio_y); + int x2 = (int)((pose.keypoints[element.connection[1]].x * LETTERBOX_COLS - tmp_w) * ratio_x); + int y2 = (int)((pose.keypoints[element.connection[1]].y * LETTERBOX_ROWS - tmp_h) * ratio_y); x1 = std::max(std::min(x1, (img.cols - 1)), 0); y1 = std::max(std::min(y1, (img.rows - 1)), 0); @@ -185,8 +186,8 @@ void draw_result(cv::Mat img, ai_body_parts_s &pose) } } -void get_input_uint8_data_square(const char *image_file, uint8_t *input_data, float *mean, float *scale, - float input_scale, int zero_point) +void get_input_uint8_data_square(const char* image_file, uint8_t* input_data, float* mean, float* scale, + float input_scale, int zero_point) { cv::Mat img = cv::imread(image_file); @@ -214,21 +215,21 @@ void get_input_uint8_data_square(const char *image_file, uint8_t *input_data, fl int right = (LETTERBOX_COLS - resize_cols + 1) / 2; // Letterbox filling cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); -// cv::imwrite("hrnet_lb_image.jpg", img_new); // for letterbox test - float *img_data = (float *) img_new.data; + // cv::imwrite("hrnet_lb_image.jpg", img_new); // for letterbox test + float* img_data = (float*)img_new.data; /* nhwc to nchw */ - for (int h = 0; h < LETTERBOX_ROWS; h++) - { - for (int w = 0; w < LETTERBOX_COLS; w++) - { - for (int c = 0; c < MODEL_CHANNELS; c++) - { + for (int h = 0; h < LETTERBOX_ROWS; h++) + { + for (int w = 0; w < LETTERBOX_COLS; w++) + { + for (int c = 0; c < MODEL_CHANNELS; c++) + { int in_index = h * LETTERBOX_COLS * MODEL_CHANNELS + w * MODEL_CHANNELS + c; int out_index = c * LETTERBOX_ROWS * LETTERBOX_COLS + h * LETTERBOX_COLS + w; float input_temp = (img_data[in_index] - mean[c]) * scale[c]; /* quant to uint8 */ - int udata = (round)(input_temp / input_scale + ( float )zero_point); + int udata = (round)(input_temp / input_scale + (float)zero_point); if (udata > 255) udata = 255; else if (udata < 0) @@ -239,17 +240,17 @@ void get_input_uint8_data_square(const char *image_file, uint8_t *input_data, fl } } -void show_usage() +void show_usage() { fprintf(stderr, "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { int repeat_count = DEFAULT_REPEAT_COUNT; int num_thread = DEFAULT_THREAD_COUNT; - char *model_file = nullptr; - char *image_file = nullptr; + char* model_file = nullptr; + char* image_file = nullptr; int img_h = LETTERBOX_COLS; int img_w = LETTERBOX_ROWS; ai_body_parts_s pose; @@ -258,40 +259,40 @@ int main(int argc, char *argv[]) float scale[3] = {0.017125f, 0.017507f, 0.017429f}; int res; - while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1) + while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1) { - switch (res) + switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } /* check files */ - if (model_file == nullptr) - { + if (model_file == nullptr) + { fprintf(stderr, "Error: Tengine model file not specified!\n"); show_usage(); return -1; } - if (image_file == nullptr) - { + if (image_file == nullptr) + { fprintf(stderr, "Error: Image file not specified!\n"); show_usage(); return -1; @@ -332,31 +333,31 @@ int main(int argc, char *argv[]) } /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw + int dims[] = {1, 3, img_h, img_w}; // nchw std::vector input_data(img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); - if (input_tensor == nullptr) - { + if (input_tensor == nullptr) + { fprintf(stderr, "Get input tensor failed\n"); return -1; } - if (set_tensor_shape(input_tensor, dims, 4) < 0) - { + if (set_tensor_shape(input_tensor, dims, 4) < 0) + { fprintf(stderr, "Set input tensor shape failed\n"); return -1; } if (set_tensor_buffer(input_tensor, input_data.data(), img_size) < 0) - { + { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; } /* prerun graph, set work options(num_thread, cluster, precision) */ - if (prerun_graph_multithread(graph, opt) < 0) - { + if (prerun_graph_multithread(graph, opt) < 0) + { fprintf(stderr, "Prerun multithread graph failed.\n"); return -1; } @@ -371,11 +372,11 @@ int main(int argc, char *argv[]) double min_time = DBL_MAX; double max_time = DBL_MIN; double total_time = 0.; - for (int i = 0; i < repeat_count; i++) - { + for (int i = 0; i < repeat_count; i++) + { double start = get_current_time(); - if (run_graph(graph, 1) < 0) - { + if (run_graph(graph, 1) < 0) + { fprintf(stderr, "Run graph failed\n"); return -1; } @@ -386,20 +387,21 @@ int main(int argc, char *argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat [%d] min %.3f ms, max %.3f ms, avg %.3f ms\n", repeat_count, min_time, max_time, - total_time / repeat_count); + total_time / repeat_count); /* get output tensor */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )get_tensor_buffer(output_tensor); - int output_size = get_tensor_buffer_size(output_tensor)/ sizeof(uint8_t); + uint8_t* output_u8 = (uint8_t*)get_tensor_buffer(output_tensor); + int output_size = get_tensor_buffer_size(output_tensor) / sizeof(uint8_t); /* dequant */ float output_scale = 0.f; int output_zero_point = 0; get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); // float* output_data = ( float* )malloc(output_size * sizeof(float)); std::vector output_data(output_size); - for (int i = 0; i < output_size; i++) { - output_data[i] = ((float) output_u8[i] - (float) output_zero_point) * output_scale; + for (int i = 0; i < output_size; i++) + { + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; } PostProcess(output_data.data(), pose, img_h, img_w); diff --git a/examples/tm_landmark.cpp b/examples/tm_landmark.cpp index 74fc95c9a..081a17a43 100644 --- a/examples/tm_landmark.cpp +++ b/examples/tm_landmark.cpp @@ -36,7 +36,7 @@ void get_input_fp32_data(const char* image_file, float* input_data, int img_h, i { image img = imread_process(image_file, img_w, img_h, mean, scale); - float* image_data = ( float* )img.data; + float* image_data = (float*)img.data; for (int i = 0; i < img_w * img_h * 3; i++) input_data[i] = image_data[i]; @@ -65,23 +65,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -124,8 +124,8 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = (float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == nullptr) @@ -182,13 +182,13 @@ int main(int argc, char* argv[]) /* get output tensor */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* data = ( float* )(get_tensor_buffer(output_tensor)); - int data_size = get_tensor_buffer_size(output_tensor) / sizeof(float ); + float* data = (float*)(get_tensor_buffer(output_tensor)); + int data_size = get_tensor_buffer_size(output_tensor) / sizeof(float); image img_out = imread(image_file); for (int i = 0; i < data_size / 2; i++) { - int x = (int)(data[2 * i ] * (float)img_out.w / 144.f); + int x = (int)(data[2 * i] * (float)img_out.w / 144.f); int y = (int)(data[2 * i + 1] * (float)img_out.h / 144.f); draw_circle(img_out, x, y, 2, 0, 255, 0); } diff --git a/examples/tm_landmark_timvx.cpp b/examples/tm_landmark_timvx.cpp index 3062f4d8e..08c3901f5 100644 --- a/examples/tm_landmark_timvx.cpp +++ b/examples/tm_landmark_timvx.cpp @@ -37,11 +37,11 @@ void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h { image img = imread_process(image_file, img_w, img_h, mean, scale); - float* image_data = ( float* )img.data; + float* image_data = (float*)img.data; for (int i = 0; i < img_w * img_h * 3; i++) { - int udata = (round)(image_data[i] / input_scale + (float )zero_point); + int udata = (round)(image_data[i] / input_scale + (float)zero_point); if (udata > 255) udata = 255; else if (udata < 0) @@ -74,23 +74,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -146,8 +146,8 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - uint8_t* input_data = ( uint8_t* )malloc(img_size); + int dims[] = {1, 3, img_h, img_w}; // nchw + uint8_t* input_data = (uint8_t*)malloc(img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == nullptr) @@ -210,13 +210,13 @@ int main(int argc, char* argv[]) float output_scale = 0.f; int output_zp = 0; get_tensor_quant_param(output_tensor, &output_scale, &output_zp, 1); - uint8_t* data = ( uint8_t* )(get_tensor_buffer(output_tensor)); + uint8_t* data = (uint8_t*)(get_tensor_buffer(output_tensor)); int data_size = get_tensor_buffer_size(output_tensor) / sizeof(uint8_t); image img_out = imread(image_file); for (int i = 0; i < data_size / 2; i++) { - int x = (int)(((float)data[2 * i ] - (float)output_zp) * output_scale * (float)img_out.w / 144.f); + int x = (int)(((float)data[2 * i] - (float)output_zp) * output_scale * (float)img_out.w / 144.f); int y = (int)(((float)data[2 * i + 1] - (float)output_zp) * output_scale * (float)img_out.h / 144.f); draw_circle(img_out, x, y, 2, 0, 255, 0); } diff --git a/examples/tm_landmark_uint8.cpp b/examples/tm_landmark_uint8.cpp index 4010cc034..af825d9e4 100644 --- a/examples/tm_landmark_uint8.cpp +++ b/examples/tm_landmark_uint8.cpp @@ -37,11 +37,11 @@ void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h { image img = imread_process(image_file, img_w, img_h, mean, scale); - float* image_data = ( float* )img.data; + float* image_data = (float*)img.data; for (int i = 0; i < img_w * img_h * 3; i++) { - int udata = (round)(image_data[i] / input_scale + (float )zero_point); + int udata = (round)(image_data[i] / input_scale + (float)zero_point); if (udata > 255) udata = 255; else if (udata < 0) @@ -74,23 +74,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -133,8 +133,8 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - uint8_t* input_data = ( uint8_t* )malloc(img_size); + int dims[] = {1, 3, img_h, img_w}; // nchw + uint8_t* input_data = (uint8_t*)malloc(img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == nullptr) @@ -197,13 +197,13 @@ int main(int argc, char* argv[]) float output_scale = 0.f; int output_zp = 0; get_tensor_quant_param(output_tensor, &output_scale, &output_zp, 1); - uint8_t* data = ( uint8_t* )(get_tensor_buffer(output_tensor)); + uint8_t* data = (uint8_t*)(get_tensor_buffer(output_tensor)); int data_size = get_tensor_buffer_size(output_tensor) / sizeof(uint8_t); image img_out = imread(image_file); for (int i = 0; i < data_size / 2; i++) { - int x = (int)(((float)data[2 * i ] - (float)output_zp) * output_scale * (float)img_out.w / 144.f); + int x = (int)(((float)data[2 * i] - (float)output_zp) * output_scale * (float)img_out.w / 144.f); int y = (int)(((float)data[2 * i + 1] - (float)output_zp) * output_scale * (float)img_out.h / 144.f); draw_circle(img_out, x, y, 2, 0, 255, 0); } diff --git a/examples/tm_mobilefacenet.cpp b/examples/tm_mobilefacenet.cpp index 5b6b8f841..d7e3020b6 100644 --- a/examples/tm_mobilefacenet.cpp +++ b/examples/tm_mobilefacenet.cpp @@ -35,7 +35,7 @@ #define DEFAULT_MEAN3 122.679 #define MOBILE_FACE_HEIGHT 110 -#define MOBILE_FACE_WIDTH 110 +#define MOBILE_FACE_WIDTH 110 graph_t graph; tensor_t input_tensor; @@ -81,7 +81,7 @@ int getFeature(const char* imagefile, float* feature) fprintf(stderr, "run_graph fail"); return -1; } - float* data = ( float* )get_tensor_buffer(output_tensor); + float* data = (float*)get_tensor_buffer(output_tensor); int outsize; outsize = get_tensor_buffer_size(output_tensor) / sizeof(float); for (int i = 0; i < outsize; i++) @@ -127,20 +127,20 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'a': - person_a = optarg; - break; - case 'b': - person_b = optarg; - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'a': + person_a = optarg; + break; + case 'b': + person_b = optarg; + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_mobilefacenet_uint8.cpp b/examples/tm_mobilefacenet_uint8.cpp index e34e09098..f5756dd3f 100644 --- a/examples/tm_mobilefacenet_uint8.cpp +++ b/examples/tm_mobilefacenet_uint8.cpp @@ -30,15 +30,15 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_MEAN1 127.5 -#define DEFAULT_MEAN2 127.5 -#define DEFAULT_MEAN3 127.5 +#define DEFAULT_MEAN1 127.5 +#define DEFAULT_MEAN2 127.5 +#define DEFAULT_MEAN3 127.5 #define DEFAULT_SCALE1 0.0078 #define DEFAULT_SCALE2 0.0078 #define DEFAULT_SCALE3 0.0078 #define MOBILE_FACE_HEIGHT 112 -#define MOBILE_FACE_WIDTH 112 +#define MOBILE_FACE_WIDTH 112 graph_t graph; tensor_t input_tensor; @@ -52,7 +52,7 @@ void init(const char* modelfile) opt.num_thread = 1; opt.cluster = TENGINE_CLUSTER_ALL; opt.precision = TENGINE_MODE_UINT8; - opt.affinity = 0x01; + opt.affinity = 0x01; int dims[4] = {1, 3, MOBILE_FACE_HEIGHT, MOBILE_FACE_WIDTH}; init_tengine(); @@ -83,7 +83,7 @@ void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h { image img = imread_process(image_file, img_w, img_h, mean, scale); - float* image_data = ( float* )img.data; + float* image_data = (float*)img.data; for (int i = 0; i < img_w * img_h * 3; i++) { @@ -111,7 +111,7 @@ int getFeature(const char* imagefile, float* feature) float input_scale = 0.f; int input_zero_point = 0; - get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); get_input_uint8_data(imagefile, input_data.data(), height, width, means, scales, input_scale, input_zero_point); set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(uint8_t)); @@ -123,7 +123,7 @@ int getFeature(const char* imagefile, float* feature) /* get the result of classification */ output_tensor = get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )get_tensor_buffer(output_tensor); + uint8_t* output_u8 = (uint8_t*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor); /* dequant */ @@ -131,7 +131,7 @@ int getFeature(const char* imagefile, float* feature) int output_zero_point = 0; get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); for (int i = 0; i < output_size; i++) - feature[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + feature[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; return output_size; } @@ -174,20 +174,20 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'a': - person_a = optarg; - break; - case 'b': - person_b = optarg; - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'a': + person_a = optarg; + break; + case 'b': + person_b = optarg; + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_mobilenet_ssd.c b/examples/tm_mobilenet_ssd.c index 873cb1ddf..f49bc11af 100644 --- a/examples/tm_mobilenet_ssd.c +++ b/examples/tm_mobilenet_ssd.c @@ -29,8 +29,8 @@ #include "tengine_operations.h" #define DEFAULT_MAX_BOX_COUNT 100 -#define DEFAULT_REPEAT_COUNT 1 -#define DEFAULT_THREAD_COUNT 1 +#define DEFAULT_REPEAT_COUNT 1 +#define DEFAULT_THREAD_COUNT 1 typedef struct Box { @@ -44,10 +44,10 @@ typedef struct Box void post_process_ssd(const char* image_file, float threshold, const float* outdata, int num) { - const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", - "bus", "car", "cat", "chair", "cow", "diningtable", - "dog", "horse", "motorbike", "person", "pottedplant", "sheep", - "sofa", "train", "tvmonitor"}; + const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", + "bus", "car", "cat", "chair", "cow", "diningtable", + "dog", "horse", "motorbike", "person", "pottedplant", "sheep", + "sofa", "train", "tvmonitor"}; image im = imread(image_file); @@ -117,23 +117,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -176,8 +176,8 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -196,7 +196,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -233,10 +233,10 @@ int main(int argc, char* argv[]) fprintf(stderr, "--------------------------------------\n"); /* process the detection result */ - tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out" + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out" int out_dim[4]; get_tensor_shape(output_tensor, out_dim, 4); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); post_process_ssd(image_file, show_threshold, output_data, out_dim[1]); /* release tengine */ diff --git a/examples/tm_mobilenet_ssd_acl.c b/examples/tm_mobilenet_ssd_acl.c index e8ade622c..35e8a5868 100644 --- a/examples/tm_mobilenet_ssd_acl.c +++ b/examples/tm_mobilenet_ssd_acl.c @@ -27,8 +27,8 @@ #include "tengine_operations.h" #define DEFAULT_MAX_BOX_COUNT 100 -#define DEFAULT_REPEAT_COUNT 1 -#define DEFAULT_THREAD_COUNT 1 +#define DEFAULT_REPEAT_COUNT 1 +#define DEFAULT_THREAD_COUNT 1 typedef struct Box { @@ -42,10 +42,10 @@ typedef struct Box void post_process_ssd(const char* image_file, float threshold, const float* outdata, int num) { - const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", - "bus", "car", "cat", "chair", "cow", "diningtable", - "dog", "horse", "motorbike", "person", "pottedplant", "sheep", - "sofa", "train", "tvmonitor"}; + const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", + "bus", "car", "cat", "chair", "cow", "diningtable", + "dog", "horse", "motorbike", "person", "pottedplant", "sheep", + "sofa", "train", "tvmonitor"}; image im = imread(image_file); @@ -115,23 +115,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -187,8 +187,8 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -207,7 +207,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -244,10 +244,10 @@ int main(int argc, char* argv[]) fprintf(stderr, "--------------------------------------\n"); /* process the detection result */ - tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out" + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out" int out_dim[4]; get_tensor_shape(output_tensor, out_dim, 4); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); post_process_ssd(image_file, show_threshold, output_data, out_dim[1]); /* release tengine */ diff --git a/examples/tm_mobilenet_ssd_uint8.cpp b/examples/tm_mobilenet_ssd_uint8.cpp index ced0319f7..6420b4a9a 100644 --- a/examples/tm_mobilenet_ssd_uint8.cpp +++ b/examples/tm_mobilenet_ssd_uint8.cpp @@ -49,7 +49,7 @@ void get_input_uint_data_ssd(const char* image_file, uint8_t* input_data, int im float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f}; image img = imread_process(image_file, img_w, img_h, mean, scales); - float* image_data = ( float* )img.data; + float* image_data = (float*)img.data; for (int i = 0; i < img_w * img_h * 3; i++) { @@ -67,10 +67,10 @@ void get_input_uint_data_ssd(const char* image_file, uint8_t* input_data, int im void post_process_ssd(const char* image_file, float threshold, float* outdata, int num) { - const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", - "bus", "car", "cat", "chair", "cow", "diningtable", - "dog", "horse", "motorbike", "person", "pottedplant", "sheep", - "sofa", "train", "tvmonitor"}; + const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", + "bus", "car", "cat", "chair", "cow", "diningtable", + "dog", "horse", "motorbike", "person", "pottedplant", "sheep", + "sofa", "train", "tvmonitor"}; image im = imread(image_file); @@ -91,11 +91,11 @@ void post_process_ssd(const char* image_file, float threshold, float* outdata, i box.y1 = outdata[5] * raw_h; boxes.push_back(box); printf("%s\t:%.2f\n", class_names[box.class_idx], box.score * 100.f); - printf("BOX:( %d , %d ),( %d , %d )\n", ( int )box.x0, ( int )box.y0, ( int )box.x1, ( int )box.y1); + printf("BOX:( %d , %d ),( %d , %d )\n", (int)box.x0, (int)box.y0, (int)box.x1, (int)box.y1); } outdata += 6; } - for (int i = 0; i < ( int )boxes.size(); i++) + for (int i = 0; i < (int)boxes.size(); i++) { Box box = boxes[i]; draw_box(im, box.x0, box.y0, box.x1, box.y1, 2, 125, 0, 125); @@ -131,23 +131,23 @@ int main(int argc, char* argv[]) { switch (ret) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -193,8 +193,8 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - uint8_t* input_data = ( uint8_t* )malloc(img_size * sizeof(uint8_t)); + int dims[] = {1, 3, img_h, img_w}; // nchw + uint8_t* input_data = (uint8_t*)malloc(img_size * sizeof(uint8_t)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -213,7 +213,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -225,7 +225,7 @@ int main(int argc, char* argv[]) /* prepare process input data, set the data mem to input tensor */ float input_scale = 0.f; int input_zero_point = 0; - get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); get_input_uint_data_ssd(image_file, input_data, img_h, img_w, input_scale, input_zero_point); /* run graph */ @@ -253,19 +253,19 @@ int main(int argc, char* argv[]) fprintf(stderr, "--------------------------------------\n"); /* process the detection result */ - tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out" + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out" int out_dim[4]; get_tensor_shape(output_tensor, out_dim, 4); int output_size = get_tensor_buffer_size(output_tensor); - uint8_t* output_u8 = ( uint8_t* )get_tensor_buffer(output_tensor); - float* output_data = ( float* )malloc(output_size * sizeof(float)); + uint8_t* output_u8 = (uint8_t*)get_tensor_buffer(output_tensor); + float* output_data = (float*)malloc(output_size * sizeof(float)); /* dequant */ float output_scale = 0.f; int output_zero_point = 0; - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - (float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; /* post_process_ssd */ post_process_ssd(image_file, show_threshold, output_data, out_dim[1]); diff --git a/examples/tm_nanodet_m.cpp b/examples/tm_nanodet_m.cpp index 5c614e7d5..16aa0a3ba 100644 --- a/examples/tm_nanodet_m.cpp +++ b/examples/tm_nanodet_m.cpp @@ -42,13 +42,12 @@ #include "tengine_operations.h" // tengine output tensor names -const char *cls_pred_name[] = { - "cls_pred_stride_8", "cls_pred_stride_16", "cls_pred_stride_32" -}; -const char *dis_pred_name[] = { +const char* cls_pred_name[] = { + "cls_pred_stride_8", "cls_pred_stride_16", "cls_pred_stride_32"}; +const char* dis_pred_name[] = { #ifdef TRY_POST_SOFTMAX "dis_pred_stride_8", "dis_pred_stride_16", "dis_pred_stride_32" -#else /* !TRY_POST_SOFTMAX */ +#else /* !TRY_POST_SOFTMAX */ "dis_sm_stride_8", "dis_sm_stride_16", "dis_sm_stride_32" #endif /* TRY_POST_SOFTMAX */ }; @@ -60,8 +59,10 @@ struct Object float prob; }; -static __inline float fast_exp(float x) { - union { +static __inline float fast_exp(float x) +{ + union + { uint32_t i; float f; } v{}; @@ -70,16 +71,19 @@ static __inline float fast_exp(float x) { } template -static int softmax(const _Tp* src, _Tp* dst, int length) { +static int softmax(const _Tp* src, _Tp* dst, int length) +{ const _Tp max_value = *std::max_element(src, src + length); - _Tp denominator{ 0 }; - - for (int i = 0; i < length; ++i) { - dst[i] = std::exp/*fast_exp*/(src[i] - max_value); + _Tp denominator{0}; + + for (int i = 0; i < length; ++i) + { + dst[i] = std::exp /*fast_exp*/ (src[i] - max_value); denominator += dst[i]; } - - for (int i = 0; i < length; ++i) { + + for (int i = 0; i < length; ++i) + { dst[i] /= denominator; } @@ -178,8 +182,9 @@ static void nms_sorted_bboxes(const std::vector& faceobjects, std::vecto // @param: in_pad[in] as letter box's shape // @param: prob_threshold[in] // @param: objects[out] output detected objects -static void generate_proposals(const float *cls_pred, const float *dis_pred, int stride, - const image &in_pad, float prob_threshold, std::vector& objects) { +static void generate_proposals(const float* cls_pred, const float* dis_pred, int stride, + const image& in_pad, float prob_threshold, std::vector& objects) +{ const int num_grid_x = in_pad.w / stride; const int num_grid_y = in_pad.h / stride; // Note: Here, we hard coded some model parameters for simplicity. @@ -188,37 +193,44 @@ static void generate_proposals(const float *cls_pred, const float *dis_pred, int // Discrete distribution parameter, see the following resources for more details: // [nanodet-m.yml](https://github.com/RangiLyu/nanodet/blob/main/config/nanodet-m.yml) // [GFL](https://arxiv.org/pdf/2006.04388.pdf) - const int reg_max_1 = 8; // 32 / 4; + const int reg_max_1 = 8; // 32 / 4; - for (int i = 0; i < num_grid_y; i++) { - for (int j = 0; j < num_grid_x; j++) { + for (int i = 0; i < num_grid_y; i++) + { + for (int j = 0; j < num_grid_x; j++) + { const int idx = i * num_grid_x + j; - const float *scores = cls_pred + idx * num_class; + const float* scores = cls_pred + idx * num_class; // find label with max score int label = -1; float score = -FLT_MAX; - for (int k = 0; k < num_class; k++) { - if (scores[k] > score) { + for (int k = 0; k < num_class; k++) + { + if (scores[k] > score) + { label = k; score = scores[k]; } } - if (score >= prob_threshold) { + if (score >= prob_threshold) + { float pred_ltrb[4]; - for (int k = 0; k < 4; k++) { + for (int k = 0; k < 4; k++) + { float dis = 0.f; // predicted distance distribution after softmax #ifdef TRY_POST_SOFTMAX - float dis_after_sm[8] = { 0. }; + float dis_after_sm[8] = {0.}; softmax(dis_pred + idx * reg_max_1 * 4 + k * reg_max_1, dis_after_sm, 8); -#else /* !TRY_POST_SOFTMAX */ - const float *dis_after_sm = dis_pred + idx * reg_max_1 * 4 + k * reg_max_1; +#else /* !TRY_POST_SOFTMAX */ + const float* dis_after_sm = dis_pred + idx * reg_max_1 * 4 + k * reg_max_1; #endif /* TRY_POST_SOFTMAX */ // integral on predicted discrete distribution - for (int l = 0; l < reg_max_1; l++) { + for (int l = 0; l < reg_max_1; l++) + { dis += l * dis_after_sm[l]; //printf("%2.6f ", dis_after_sm[l]); } @@ -250,19 +262,18 @@ static void generate_proposals(const float *cls_pred, const float *dis_pred, int } } -static void draw_objects(const cv::Mat& bgr, const std::vector& objects, const char *path) +static void draw_objects(const cv::Mat& bgr, const std::vector& objects, const char* path) { static const char* class_names[] = { - "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", - "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", - "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", - "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", - "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", - "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", - "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", - "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -271,8 +282,8 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects, const Object& obj = objects[i]; fprintf(stderr, "%2d: %3.3f%%, [%7.3f, %7.3f, %7.3f, %7.3f], %s\n", - obj.label, obj.prob * 100, obj.rect.x, obj.rect.y, - obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, class_names[obj.label]); + obj.label, obj.prob * 100, obj.rect.x, obj.rect.y, + obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, class_names[obj.label]); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); @@ -311,10 +322,13 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects, /// @param norm norm values per channel static void nhwc_to_nchw(float* src, float* dst, int h_limit, int w_limit, int c_limit, const float* mean, const float* norm) { - for (int h = 0; h < h_limit; h++) { - for (int w = 0; w < w_limit; w++) { - for (int c = 0; c < 3; c++) { - int in_index = h * w_limit * 3 + w * 3 + c; + for (int h = 0; h < h_limit; h++) + { + for (int w = 0; w < w_limit; w++) + { + for (int c = 0; c < 3; c++) + { + int in_index = h * w_limit * 3 + w * 3 + c; int out_index = c * h_limit * w_limit + h * w_limit + w; dst[out_index] = (src[in_index] - mean[c]) * norm[c]; } @@ -323,17 +337,19 @@ static void nhwc_to_nchw(float* src, float* dst, int h_limit, int w_limit, int c } // @brief: get input data and resize to model input shape directly -static int get_input_data(const char *path, const float *mean, const float *norm, image &lb) { +static int get_input_data(const char* path, const float* mean, const float* norm, image& lb) +{ // load input image cv::Mat img = cv::imread(path, 1); - if (img.empty()) { + if (img.empty()) + { fprintf(stderr, "cv::imread %s failed\n", path); return -1; } if (img.cols != lb.w || img.rows != lb.h) cv::resize(img, img, cv::Size(lb.w, lb.h)); img.convertTo(img, CV_32FC3); - float *_data = (float *)img.data; + float* _data = (float*)img.data; nhwc_to_nchw(_data, lb.data, lb.h, lb.w, 3, mean, norm); return 0; @@ -343,10 +359,12 @@ static int get_input_data(const char *path, const float *mean, const float *norm // @param: lb[in/out] letter box image inst // @param: pad[out] top and left pad size // @return: resize scale from origin image to letter box -static float get_input_data(const char *path, const float *mean, const float *norm, image &lb, image &pad) { +static float get_input_data(const char* path, const float* mean, const float* norm, image& lb, image& pad) +{ // load input image cv::Mat img = cv::imread(path, 1); - if (img.empty()) { + if (img.empty()) + { fprintf(stderr, "cv::imread %s failed\n", path); return -1.; } @@ -365,29 +383,31 @@ static float get_input_data(const char *path, const float *mean, const float *no pad.h = lb.h - h; //(h + 31) / 32 * 32 - h; // Generate a gray image using opencv cv::Mat img_pad(lb.w, lb.h, CV_32FC3, //cv::Scalar(0)); - cv::Scalar(0.5/norm[0] + mean[0], 0.5/norm[1] + mean[1], 0.5/norm[2] + mean[2])); + cv::Scalar(0.5 / norm[0] + mean[0], 0.5 / norm[1] + mean[1], 0.5 / norm[2] + mean[2])); // Letterbox filling - cv::copyMakeBorder(img, img_pad, pad.h/2, pad.h - pad.h/2, pad.w/2, pad.w - pad.w/2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); + cv::copyMakeBorder(img, img_pad, pad.h / 2, pad.h - pad.h / 2, pad.w / 2, pad.w - pad.w / 2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); img_pad.convertTo(img_pad, CV_32FC3); - float *_data = (float *)img_pad.data; + float* _data = (float*)img_pad.data; nhwc_to_nchw(_data, lb.data, lb.h, lb.w, 3, mean, norm); return lb_scale; } -static void show_usage() { +static void show_usage() +{ fprintf(stderr, "[Usage]: [-h]\n"); fprintf(stderr, " [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count] [-o output_file]\n"); } -int main(int argc, char* argv[]) { +int main(int argc, char* argv[]) +{ const char* model_file = nullptr; const char* image_file = nullptr; const char* output_file = "nanodet_m_out.jpg"; - const float mean[3] = { 103.53f, 116.28f, 123.675f }; // bgr - const float norm[3] = { 0.017429f, 0.017507f, 0.017125f }; + const float mean[3] = {103.53f, 116.28f, 123.675f}; // bgr + const float norm[3] = {0.017429f, 0.017507f, 0.017125f}; int repeat_count = 1; int num_thread = 1; @@ -396,38 +416,42 @@ int main(int argc, char* argv[]) { const float nms_threshold = 0.5f; int res; - while ((res = getopt(argc, argv, "m:i:o:r:t:h:")) != -1) { - switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'o': - output_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + while ((res = getopt(argc, argv, "m:i:o:r:t:h:")) != -1) + { + switch (res) + { + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'o': + output_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } /* check files */ - if (nullptr == model_file || nullptr == image_file) { + if (nullptr == model_file || nullptr == image_file) + { fprintf(stderr, "Error: Tengine model file not specified!\n"); show_usage(); return -1; } - if (!check_file_exist(model_file) || !check_file_exist(image_file)) { + if (!check_file_exist(model_file) || !check_file_exist(image_file)) + { return -1; } @@ -439,7 +463,8 @@ int main(int argc, char* argv[]) { opt.affinity = 0; /* inital tengine */ - if (0 != init_tengine()) { + if (0 != init_tengine()) + { fprintf(stderr, "Initial tengine failed.\n"); return -1; } @@ -447,14 +472,16 @@ int main(int argc, char* argv[]) { /* create graph, load tengine model xxx.tmfile */ graph_t graph = create_graph(nullptr, "tengine", model_file); - if (nullptr == graph) { + if (nullptr == graph) + { fprintf(stderr, "Create graph failed.\n"); return -1; } /* get input tensor of graph */ tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); - if (nullptr == input_tensor) { + if (nullptr == input_tensor) + { fprintf(stderr, "Get input tensor failed\n"); return -1; } @@ -462,7 +489,8 @@ int main(int argc, char* argv[]) { /* get shape of input tensor */ int i, dims[4]; // nchw int dim_num = get_tensor_shape(input_tensor, dims, 4); - if (4 != dim_num) { + if (4 != dim_num) + { fprintf(stderr, "Get input tensor shape error\n"); return -1; } @@ -473,18 +501,20 @@ int main(int argc, char* argv[]) { #ifdef TRY_LETTER_BOX image pad = make_empty_image(lb.w, lb.h, lb.c); float lb_scale = get_input_data(image_file, mean, norm, lb, pad); -#else /* !TRY_LETTER_BOX */ +#else /* !TRY_LETTER_BOX */ get_input_data(image_file, mean, norm, lb); #endif /* TRY_LETTER_BOX */ /* set the data mem to input tensor */ - if (set_tensor_buffer(input_tensor, lb.data, img_size * sizeof(float)) < 0) { + if (set_tensor_buffer(input_tensor, lb.data, img_size * sizeof(float)) < 0) + { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; } /* prerun graph to infer shape, and set work options(num_thread, cluster, precision) */ - if (prerun_graph_multithread(graph, opt) < 0) { + if (prerun_graph_multithread(graph, opt) < 0) + { fprintf(stderr, "Prerun multithread graph failed.\n"); return -1; } @@ -493,9 +523,11 @@ int main(int argc, char* argv[]) { double min_time = DBL_MAX; double max_time = DBL_MIN; double total_time = 0.; - for (i = 0; i < repeat_count; i++) { + for (i = 0; i < repeat_count; i++) + { double start = get_current_time(); - if (run_graph(graph, 1) < 0) { + if (run_graph(graph, 1) < 0) + { fprintf(stderr, "Run graph failed\n"); return -1; } @@ -506,22 +538,24 @@ int main(int argc, char* argv[]) { max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* nanodet_m postprocess */ std::vector proposals, objects; - for (int stride_index = 0; stride_index < 3; stride_index++) { + for (int stride_index = 0; stride_index < 3; stride_index++) + { tensor_t cls_tensor = get_graph_tensor(graph, cls_pred_name[stride_index]); tensor_t dis_tensor = get_graph_tensor(graph, dis_pred_name[stride_index]); - if (NULL == cls_tensor || NULL ==dis_tensor) { + if (NULL == cls_tensor || NULL == dis_tensor) + { fprintf(stderr, "get graph tensor failed\n"); return -1; } - const float *cls_pred = (const float *)get_tensor_buffer(cls_tensor); - const float *dis_pred = (const float *)get_tensor_buffer(dis_tensor); + const float* cls_pred = (const float*)get_tensor_buffer(cls_tensor); + const float* dis_pred = (const float*)get_tensor_buffer(dis_tensor); generate_proposals(cls_pred, dis_pred, 1 << (stride_index + 3), - lb, prob_threshold, objects); + lb, prob_threshold, objects); proposals.insert(proposals.end(), objects.begin(), objects.end()); } @@ -534,10 +568,11 @@ int main(int argc, char* argv[]) { cv::Mat img = cv::imread(image_file); int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); - for (i = 0; i < count; i++) { + for (i = 0; i < count; i++) + { objects[i] = proposals[picked[i]]; #ifdef TRY_LETTER_BOX @@ -546,7 +581,7 @@ int main(int argc, char* argv[]) { float y0 = (objects[i].rect.y - (pad.h / 2)) / lb_scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (pad.w / 2)) / lb_scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (pad.h / 2)) / lb_scale; -#else /* !TRY_LETTER_BOX */ +#else /* !TRY_LETTER_BOX */ // adjust offset to original unresized static float lb_scale_w = 1. * lb.w / img.cols; static float lb_scale_h = 1. * lb.h / img.rows; @@ -576,4 +611,3 @@ int main(int argc, char* argv[]) { release_tengine(); return 0; } - diff --git a/examples/tm_nanodet_m_timvx.cpp b/examples/tm_nanodet_m_timvx.cpp index ed9471d75..dc1edeb81 100644 --- a/examples/tm_nanodet_m_timvx.cpp +++ b/examples/tm_nanodet_m_timvx.cpp @@ -42,13 +42,12 @@ #include "tengine_operations.h" // tengine output tensor names -const char *cls_pred_name[] = { - "cls_pred_stride_8", "cls_pred_stride_16", "cls_pred_stride_32" -}; -const char *dis_pred_name[] = { +const char* cls_pred_name[] = { + "cls_pred_stride_8", "cls_pred_stride_16", "cls_pred_stride_32"}; +const char* dis_pred_name[] = { #ifdef TRY_POST_SOFTMAX "dis_pred_stride_8", "dis_pred_stride_16", "dis_pred_stride_32" -#else /* !TRY_POST_SOFTMAX */ +#else /* !TRY_POST_SOFTMAX */ "dis_sm_stride_8", "dis_sm_stride_16", "dis_sm_stride_32" #endif /* TRY_POST_SOFTMAX */ }; @@ -60,8 +59,10 @@ struct Object float prob; }; -static __inline float fast_exp(float x) { - union { +static __inline float fast_exp(float x) +{ + union + { uint32_t i; float f; } v{}; @@ -70,16 +71,19 @@ static __inline float fast_exp(float x) { } template -static int softmax(const _Tp* src, _Tp* dst, int length) { +static int softmax(const _Tp* src, _Tp* dst, int length) +{ const _Tp max_value = *std::max_element(src, src + length); - _Tp denominator{ 0 }; - - for (int i = 0; i < length; ++i) { - dst[i] = std::exp/*fast_exp*/(src[i] - max_value); + _Tp denominator{0}; + + for (int i = 0; i < length; ++i) + { + dst[i] = std::exp /*fast_exp*/ (src[i] - max_value); denominator += dst[i]; } - - for (int i = 0; i < length; ++i) { + + for (int i = 0; i < length; ++i) + { dst[i] /= denominator; } @@ -178,8 +182,9 @@ static void nms_sorted_bboxes(const std::vector& faceobjects, std::vecto // @param: in_pad[in] as letter box's shape // @param: prob_threshold[in] // @param: objects[out] output detected objects -static void generate_proposals(const float *cls_pred, const float *dis_pred, int stride, - const image &in_pad, float prob_threshold, std::vector& objects) { +static void generate_proposals(const float* cls_pred, const float* dis_pred, int stride, + const image& in_pad, float prob_threshold, std::vector& objects) +{ const int num_grid_x = in_pad.w / stride; const int num_grid_y = in_pad.h / stride; // Note: Here, we hard coded some model parameters for simplicity. @@ -188,37 +193,44 @@ static void generate_proposals(const float *cls_pred, const float *dis_pred, int // Discrete distribution parameter, see the following resources for more details: // [nanodet-m.yml](https://github.com/RangiLyu/nanodet/blob/main/config/nanodet-m.yml) // [GFL](https://arxiv.org/pdf/2006.04388.pdf) - const int reg_max_1 = 8; // 32 / 4; + const int reg_max_1 = 8; // 32 / 4; - for (int i = 0; i < num_grid_y; i++) { - for (int j = 0; j < num_grid_x; j++) { + for (int i = 0; i < num_grid_y; i++) + { + for (int j = 0; j < num_grid_x; j++) + { const int idx = i * num_grid_x + j; - const float *scores = cls_pred + idx * num_class; + const float* scores = cls_pred + idx * num_class; // find label with max score int label = -1; float score = -FLT_MAX; - for (int k = 0; k < num_class; k++) { - if (scores[k] > score) { + for (int k = 0; k < num_class; k++) + { + if (scores[k] > score) + { label = k; score = scores[k]; } } - if (score >= prob_threshold) { + if (score >= prob_threshold) + { float pred_ltrb[4]; - for (int k = 0; k < 4; k++) { + for (int k = 0; k < 4; k++) + { float dis = 0.f; // predicted distance distribution after softmax #ifdef TRY_POST_SOFTMAX - float dis_after_sm[8] = { 0. }; + float dis_after_sm[8] = {0.}; softmax(dis_pred + idx * reg_max_1 * 4 + k * reg_max_1, dis_after_sm, 8); -#else /* !TRY_POST_SOFTMAX */ - const float *dis_after_sm = dis_pred + idx * reg_max_1 * 4 + k * reg_max_1; +#else /* !TRY_POST_SOFTMAX */ + const float* dis_after_sm = dis_pred + idx * reg_max_1 * 4 + k * reg_max_1; #endif /* TRY_POST_SOFTMAX */ // integral on predicted discrete distribution - for (int l = 0; l < reg_max_1; l++) { + for (int l = 0; l < reg_max_1; l++) + { dis += l * dis_after_sm[l]; //printf("%2.6f ", dis_after_sm[l]); } @@ -250,19 +262,18 @@ static void generate_proposals(const float *cls_pred, const float *dis_pred, int } } -static void draw_objects(const cv::Mat& bgr, const std::vector& objects, const char *path) +static void draw_objects(const cv::Mat& bgr, const std::vector& objects, const char* path) { static const char* class_names[] = { - "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", - "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", - "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", - "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", - "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", - "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", - "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", - "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -271,8 +282,8 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects, const Object& obj = objects[i]; fprintf(stderr, "%2d: %3.3f%%, [%7.3f, %7.3f, %7.3f, %7.3f], %s\n", - obj.label, obj.prob * 100, obj.rect.x, obj.rect.y, - obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, class_names[obj.label]); + obj.label, obj.prob * 100, obj.rect.x, obj.rect.y, + obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, class_names[obj.label]); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); @@ -302,10 +313,12 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects, } // @brief: get input data and resize to model input shape directly -static int get_input_data(const char *path, const float *mean, const float *norm, image &lb) { +static int get_input_data(const char* path, const float* mean, const float* norm, image& lb) +{ // load input image cv::Mat img = cv::imread(path, 1); - if (img.empty()) { + if (img.empty()) + { fprintf(stderr, "cv::imread %s failed\n", path); return -1; } @@ -314,11 +327,14 @@ static int get_input_data(const char *path, const float *mean, const float *norm img.convertTo(img, CV_32FC3); /* nhwc to nchw */ - float *_data = (float *)img.data; - for (int h = 0; h < lb.h; h++) { - for (int w = 0; w < lb.w; w++) { - for (int c = 0; c < 3; c++) { - int in_index = h * lb.w * 3 + w * 3 + c; + float* _data = (float*)img.data; + for (int h = 0; h < lb.h; h++) + { + for (int w = 0; w < lb.w; w++) + { + for (int c = 0; c < 3; c++) + { + int in_index = h * lb.w * 3 + w * 3 + c; int out_index = c * lb.h * lb.w + h * lb.w + w; lb.data[out_index] = (_data[in_index] - mean[c]) * norm[c]; } @@ -331,10 +347,12 @@ static int get_input_data(const char *path, const float *mean, const float *norm // @param: lb[in/out] letter box image inst // @param: pad[out] top and left pad size // @return: resize scale from origin image to letter box -static float get_input_data(const char *path, const float *mean, const float *norm, image &lb, image &pad) { +static float get_input_data(const char* path, const float* mean, const float* norm, image& lb, image& pad) +{ // load input image cv::Mat img = cv::imread(path, 1); - if (img.empty()) { + if (img.empty()) + { fprintf(stderr, "cv::imread %s failed\n", path); return -1.; } @@ -353,17 +371,20 @@ static float get_input_data(const char *path, const float *mean, const float *no pad.h = lb.h - h; //(h + 31) / 32 * 32 - h; // Generate a gray image using opencv cv::Mat img_pad(lb.w, lb.h, CV_32FC3, //cv::Scalar(0)); - cv::Scalar(0.5/norm[0] + mean[0], 0.5/norm[0] + mean[0], 0.5/norm[2] + mean[2])); + cv::Scalar(0.5 / norm[0] + mean[0], 0.5 / norm[0] + mean[0], 0.5 / norm[2] + mean[2])); // Letterbox filling - cv::copyMakeBorder(img, img_pad, pad.h/2, pad.h - pad.h/2, pad.w/2, pad.w - pad.w/2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); + cv::copyMakeBorder(img, img_pad, pad.h / 2, pad.h - pad.h / 2, pad.w / 2, pad.w - pad.w / 2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); img_pad.convertTo(img_pad, CV_32FC3); - float *_data = (float *)img_pad.data; + float* _data = (float*)img_pad.data; /* nhwc to nchw */ - for (int h = 0; h < lb.h; h++) { - for (int w = 0; w < lb.w; w++) { - for (int c = 0; c < 3; c++) { - int in_index = h * lb.w * 3 + w * 3 + c; + for (int h = 0; h < lb.h; h++) + { + for (int w = 0; w < lb.w; w++) + { + for (int c = 0; c < 3; c++) + { + int in_index = h * lb.w * 3 + w * 3 + c; int out_index = c * lb.h * lb.w + h * lb.w + w; lb.data[out_index] = (_data[in_index] - mean[c]) * norm[c]; } @@ -373,7 +394,8 @@ static float get_input_data(const char *path, const float *mean, const float *no return lb_scale; } -static void show_usage() { +static void show_usage() +{ fprintf(stderr, "[Usage]: [-h]\n"); fprintf(stderr, " [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count] [-o output_file]\n"); } @@ -392,13 +414,14 @@ void get_input_uint8_data(float* input_fp32, uint8_t* input_data, int size, floa } } -int main(int argc, char* argv[]) { +int main(int argc, char* argv[]) +{ const char* model_file = nullptr; const char* image_file = nullptr; const char* output_file = "nanodet_m_uint8_out.jpg"; - const float mean[3] = { 103.53f, 116.28f, 123.675f }; // bgr - const float norm[3] = { 0.017429f, 0.017507f, 0.017125f }; + const float mean[3] = {103.53f, 116.28f, 123.675f}; // bgr + const float norm[3] = {0.017429f, 0.017507f, 0.017125f}; int repeat_count = 1; int num_thread = 1; @@ -407,38 +430,42 @@ int main(int argc, char* argv[]) { const float nms_threshold = 0.5f; int res; - while ((res = getopt(argc, argv, "m:i:o:r:t:h:")) != -1) { - switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'o': - output_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + while ((res = getopt(argc, argv, "m:i:o:r:t:h:")) != -1) + { + switch (res) + { + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'o': + output_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } /* check files */ - if (nullptr == model_file || nullptr == image_file) { + if (nullptr == model_file || nullptr == image_file) + { fprintf(stderr, "Error: Tengine model file not specified!\n"); show_usage(); return -1; } - if (!check_file_exist(model_file) || !check_file_exist(image_file)) { + if (!check_file_exist(model_file) || !check_file_exist(image_file)) + { return -1; } @@ -450,7 +477,8 @@ int main(int argc, char* argv[]) { opt.affinity = 0; /* inital tengine */ - if (0 != init_tengine()) { + if (0 != init_tengine()) + { fprintf(stderr, "Initial tengine failed.\n"); return -1; } @@ -466,14 +494,16 @@ int main(int argc, char* argv[]) { } /* create graph, load tengine model xxx.tmfile */ graph_t graph = create_graph(timvx_context, "tengine", model_file); - if (nullptr == graph) { + if (nullptr == graph) + { fprintf(stderr, "Create graph failed.\n"); return -1; } /* get input tensor of graph */ tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); - if (nullptr == input_tensor) { + if (nullptr == input_tensor) + { fprintf(stderr, "Get input tensor failed\n"); return -1; } @@ -481,7 +511,8 @@ int main(int argc, char* argv[]) { /* get shape of input tensor */ int i, dims[4]; // nchw int dim_num = get_tensor_shape(input_tensor, dims, 4); - if (4 != dim_num) { + if (4 != dim_num) + { fprintf(stderr, "Get input tensor shape error\n"); return -1; } @@ -492,7 +523,7 @@ int main(int argc, char* argv[]) { #ifdef TRY_LETTER_BOX image pad = make_empty_image(lb.w, lb.h, lb.c); float lb_scale = get_input_data(image_file, mean, norm, lb, pad); -#else /* !TRY_LETTER_BOX */ +#else /* !TRY_LETTER_BOX */ get_input_data(image_file, mean, norm, lb); #endif /* TRY_LETTER_BOX */ @@ -504,13 +535,15 @@ int main(int argc, char* argv[]) { get_input_uint8_data(lb.data, input_data.data(), img_size, input_scale, input_zero_point); /* set the data mem to input tensor */ - if (set_tensor_buffer(input_tensor, input_data.data(), img_size) < 0) { + if (set_tensor_buffer(input_tensor, input_data.data(), img_size) < 0) + { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; } /* prerun graph to infer shape, and set work options(num_thread, cluster, precision) */ - if (prerun_graph_multithread(graph, opt) < 0) { + if (prerun_graph_multithread(graph, opt) < 0) + { fprintf(stderr, "Prerun multithread graph failed.\n"); return -1; } @@ -519,9 +552,11 @@ int main(int argc, char* argv[]) { double min_time = DBL_MAX; double max_time = DBL_MIN; double total_time = 0.; - for (int i = 0; i < repeat_count; i++) { + for (int i = 0; i < repeat_count; i++) + { double start = get_current_time(); - if (run_graph(graph, 1) < 0) { + if (run_graph(graph, 1) < 0) + { fprintf(stderr, "Run graph failed\n"); return -1; } @@ -532,39 +567,39 @@ int main(int argc, char* argv[]) { max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* nanodet_m postprocess */ std::vector proposals, objects; - for (int stride_index = 0; stride_index < 3; stride_index++) + for (int stride_index = 0; stride_index < 3; stride_index++) { tensor_t cls_tensor = get_graph_tensor(graph, cls_pred_name[stride_index]); tensor_t dis_tensor = get_graph_tensor(graph, dis_pred_name[stride_index]); - int cls_count = get_tensor_buffer_size(cls_tensor) / sizeof(uint8_t); - int dis_count = get_tensor_buffer_size(dis_tensor) / sizeof(uint8_t); + int cls_count = get_tensor_buffer_size(cls_tensor) / sizeof(uint8_t); + int dis_count = get_tensor_buffer_size(dis_tensor) / sizeof(uint8_t); - float cls_scale = 0.f; - float dis_scale = 0.f; - int cls_zero_point = 0; - int dis_zero_point = 0; + float cls_scale = 0.f; + float dis_scale = 0.f; + int cls_zero_point = 0; + int dis_zero_point = 0; get_tensor_quant_param(cls_tensor, &cls_scale, &cls_zero_point, 1); get_tensor_quant_param(dis_tensor, &dis_scale, &dis_zero_point, 1); - - const uint8_t *cls_pred_u8 = (const uint8_t *)get_tensor_buffer(cls_tensor); - const uint8_t *dis_pred_u8 = (const uint8_t *)get_tensor_buffer(dis_tensor); + + const uint8_t* cls_pred_u8 = (const uint8_t*)get_tensor_buffer(cls_tensor); + const uint8_t* dis_pred_u8 = (const uint8_t*)get_tensor_buffer(dis_tensor); std::vector cls_pred(cls_count); std::vector dis_pred(dis_count); for (int c = 0; c < cls_count; c++) - cls_pred[c] = (( float )cls_pred_u8[c] - ( float )cls_zero_point) * cls_scale; + cls_pred[c] = ((float)cls_pred_u8[c] - (float)cls_zero_point) * cls_scale; for (int c = 0; c < dis_count; c++) - dis_pred[c] = (( float )dis_pred_u8[c] - ( float )dis_zero_point) * dis_scale; - + dis_pred[c] = ((float)dis_pred_u8[c] - (float)dis_zero_point) * dis_scale; + generate_proposals(cls_pred.data(), dis_pred.data(), 1 << (stride_index + 3), lb, prob_threshold, objects); proposals.insert(proposals.end(), objects.begin(), objects.end()); } @@ -578,10 +613,11 @@ int main(int argc, char* argv[]) { cv::Mat img = cv::imread(image_file); int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); - for (int i = 0; i < count; i++) { + for (int i = 0; i < count; i++) + { objects[i] = proposals[picked[i]]; #ifdef TRY_LETTER_BOX @@ -590,7 +626,7 @@ int main(int argc, char* argv[]) { float y0 = (objects[i].rect.y - (pad.h / 2)) / lb_scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (pad.w / 2)) / lb_scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (pad.h / 2)) / lb_scale; -#else /* !TRY_LETTER_BOX */ +#else /* !TRY_LETTER_BOX */ // adjust offset to original unresized static float lb_scale_w = 1. * lb.w / img.cols; static float lb_scale_h = 1. * lb.h / img.rows; @@ -620,4 +656,3 @@ int main(int argc, char* argv[]) { release_tengine(); return 0; } - diff --git a/examples/tm_openpose.cpp b/examples/tm_openpose.cpp index a8a9b88d8..04eb4acf5 100644 --- a/examples/tm_openpose.cpp +++ b/examples/tm_openpose.cpp @@ -17,23 +17,19 @@ #define DEFAULT_THREAD_COUNT 1 #ifdef MPI -const int POSE_PAIRS[14][2] = {{0, 1}, {1, 2}, {2, 3}, {3, 4}, {1, 5}, {5, 6}, {6, 7}, - {1, 14}, {14, 8}, {8, 9}, {9, 10}, {14, 11}, {11, 12}, {12, 13}}; +const int POSE_PAIRS[14][2] = {{0, 1}, {1, 2}, {2, 3}, {3, 4}, {1, 5}, {5, 6}, {6, 7}, {1, 14}, {14, 8}, {8, 9}, {9, 10}, {14, 11}, {11, 12}, {12, 13}}; // std::string model_file = "models/openpose_mpi.tmfile"; int nPoints = 15; #endif #ifdef COCO -const int POSE_PAIRS[17][2] = {{1, 2}, {1, 5}, {2, 3}, {3, 4}, {5, 6}, {6, 7}, {1, 8}, {8, 9}, {9, 10}, - {1, 11}, {11, 12}, {12, 13}, {1, 0}, {0, 14}, {14, 16}, {0, 15}, {15, 17}}; +const int POSE_PAIRS[17][2] = {{1, 2}, {1, 5}, {2, 3}, {3, 4}, {5, 6}, {6, 7}, {1, 8}, {8, 9}, {9, 10}, {1, 11}, {11, 12}, {12, 13}, {1, 0}, {0, 14}, {14, 16}, {0, 15}, {15, 17}}; // std::string model_file = "models/openpose_coco.tmfile"; int nPoints = 18; #endif #ifdef BODY25 -const int POSE_PAIRS[24][2] = {{1, 2}, {1, 5}, {2, 3}, {3, 4}, {5, 6}, {6, 7}, {1, 8}, {8, 9}, - {9, 10}, {10, 11}, {11, 24}, {11, 22}, {22, 23}, {8, 12}, {12, 13}, {13, 14}, - {14, 21}, {14, 19}, {19, 20}, {1, 0}, {0, 15}, {16, 18}, {0, 16}, {15, 17}}; +const int POSE_PAIRS[24][2] = {{1, 2}, {1, 5}, {2, 3}, {3, 4}, {5, 6}, {6, 7}, {1, 8}, {8, 9}, {9, 10}, {10, 11}, {11, 24}, {11, 22}, {22, 23}, {8, 12}, {12, 13}, {13, 14}, {14, 21}, {14, 19}, {19, 20}, {1, 0}, {0, 15}, {16, 18}, {0, 16}, {15, 17}}; // std::string model_file = "models/openpose_body25.tmfile" int nPoints = 25; #endif @@ -43,7 +39,7 @@ void get_input_data_pose(cv::Mat img, float* input_data, int img_h, int img_w) cv::resize(img, img, cv::Size(img_h, img_w)); img.convertTo(img, CV_32FC3); - float* img_data = ( float* )img.data; + float* img_data = (float*)img.data; int hw = img_h * img_w; double scalefactor = 1.0 / 255; float mean[3] = {0, 0, 0}; @@ -78,19 +74,19 @@ void post_process_pose(cv::Mat img, cv::Mat frameCopy, float threshold, float* o if (outdata[piexl] > prob) { prob = outdata[piexl]; - maxloc.y = ( int )piexl / H; - maxloc.x = ( int )piexl % W; + maxloc.y = (int)piexl / H; + maxloc.x = (int)piexl % W; } } cv::Point2f p(-1, -1); if (prob > threshold) { p = maxloc; - p.y *= ( float )frameWidth / W; - p.x *= ( float )frameHeight / H; + p.y *= (float)frameWidth / W; + p.x *= (float)frameHeight / H; - cv::circle(frameCopy, cv::Point(( int )p.x, ( int )p.y), 4, cv::Scalar(255, 255, 0), -1); - cv::putText(frameCopy, cv::format("%d", n), cv::Point(( int )p.x, ( int )p.y), cv::FONT_HERSHEY_PLAIN, 2, + cv::circle(frameCopy, cv::Point((int)p.x, (int)p.y), 4, cv::Scalar(255, 255, 0), -1); + cv::putText(frameCopy, cv::format("%d", n), cv::Point((int)p.x, (int)p.y), cv::FONT_HERSHEY_PLAIN, 2, cv::Scalar(0, 255, 255), 2); } points[n] = p; @@ -133,23 +129,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -193,9 +189,9 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int channel = 3; int img_size = img_h * img_w * channel; - int dims[] = {1, channel, img_h, img_w}; // nchw + int dims[] = {1, channel, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == nullptr) @@ -214,7 +210,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -258,7 +254,7 @@ int main(int argc, char* argv[]) return -1; } - float* outdata = ( float* )get_tensor_buffer(out_tensor); + float* outdata = (float*)get_tensor_buffer(out_tensor); int num = nPoints; int H = out_dim[2]; int W = out_dim[3]; @@ -278,4 +274,3 @@ int main(int argc, char* argv[]) return 0; } - diff --git a/examples/tm_retinaface.cpp b/examples/tm_retinaface.cpp index 1d2346b79..0a4eccef8 100644 --- a/examples/tm_retinaface.cpp +++ b/examples/tm_retinaface.cpp @@ -116,7 +116,7 @@ void draw_target(const std::vector& all_pred_boxes, image img) const char* class_names[] = {"faces"}; fprintf(stdout, "detected face num: %zu\n", all_pred_boxes.size()); - for (int b = 0; b < ( int )all_pred_boxes.size(); b++) + for (int b = 0; b < (int)all_pred_boxes.size(); b++) { Face2f box = all_pred_boxes[b]; @@ -167,7 +167,7 @@ void nms_sorted_boxes(const std::vector& face_objects, std::vector& const Face2f& a = face_objects[i]; int keep = 1; - for (int j = 0; j < ( int )picked.size(); j++) + for (int j = 0; j < (int)picked.size(); j++) { const Face2f& b = face_objects[picked[j]]; @@ -228,22 +228,22 @@ std::vector generate_anchors(int base_size, const std::vector& rat std::vector anchors(num_ratio * num_scale); - const float cx = ( float )base_size * 0.5f; - const float cy = ( float )base_size * 0.5f; + const float cx = (float)base_size * 0.5f; + const float cy = (float)base_size * 0.5f; for (int i = 0; i < num_ratio; i++) { float ar = ratios[i]; - int r_w = ( int )round(( float )base_size / sqrt(ar)); - int r_h = ( int )round(( float )r_w * ar); // round(base_size * sqrt(ar)); + int r_w = (int)round((float)base_size / sqrt(ar)); + int r_h = (int)round((float)r_w * ar); // round(base_size * sqrt(ar)); for (int j = 0; j < num_scale; j++) { float scale = scales[j]; - float rs_w = ( float )r_w * scale; - float rs_h = ( float )r_h * scale; + float rs_w = (float)r_w * scale; + float rs_h = (float)r_h * scale; Box2f& anchor = anchors[i * num_scale + j]; @@ -337,10 +337,10 @@ static void generate_proposals(std::vector& anchors, int feat_stride, con faces.push_back(obj); } - anchor_x += ( float )feat_stride; + anchor_x += (float)feat_stride; } - anchor_y += ( float )feat_stride; + anchor_y += (float)feat_stride; } } } @@ -360,11 +360,11 @@ int get_input_data(const char* image_file, const int& max_size, const int& targe scale = float(target_size) / float(im_size_min); - if (scale * ( float )im_size_max > ( float )max_size) + if (scale * (float)im_size_max > (float)max_size) scale = float(max_size) / float(im_size_max); - dst_size.width = ( int )round(( float )img.w * scale); - dst_size.height = ( int )round(( float )img.h * scale); + dst_size.width = (int)round((float)img.w * scale); + dst_size.height = (int)round((float)img.h * scale); image resImg = resize_image(img, dst_size.width, dst_size.height); int img_size = dst_size.height * dst_size.width * 3; @@ -418,26 +418,26 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'n': - device_name = optarg; - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'n': + device_name = optarg; + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -464,7 +464,7 @@ int main(int argc, char* argv[]) opt.num_thread = num_thread; opt.cluster = TENGINE_CLUSTER_ALL; opt.precision = TENGINE_MODE_FP32; - opt.affinity = 0; + opt.affinity = 0; /* inital tengine */ int ret = init_tengine(); @@ -518,7 +518,7 @@ int main(int argc, char* argv[]) { printf("Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (0 != prerun_graph_multithread(graph, opt)) @@ -547,7 +547,7 @@ int main(int argc, char* argv[]) } printf("img_h, img_w : %d, %d\n", image_size.height, image_size.width); printf("Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, - num_thread, total_time / ( float )repeat_count, max_time, min_time); + num_thread, total_time / (float)repeat_count, max_time, min_time); printf("--------------------------------------\n"); /* process the detection result */ @@ -570,9 +570,9 @@ int main(int argc, char* argv[]) get_tensor_shape(bbox_blob_tensor, bbox_blob_dims, MAX_SHAPE_DIM_NUM); get_tensor_shape(landmark_blob_tensor, landmark_blob_dims, MAX_SHAPE_DIM_NUM); - float* score_blob = ( float* )get_tensor_buffer(score_blob_tensor); - float* bbox_blob = ( float* )get_tensor_buffer(bbox_blob_tensor); - float* landmark_blob = ( float* )get_tensor_buffer(landmark_blob_tensor); + float* score_blob = (float*)get_tensor_buffer(score_blob_tensor); + float* bbox_blob = (float*)get_tensor_buffer(bbox_blob_tensor); + float* landmark_blob = (float*)get_tensor_buffer(landmark_blob_tensor); const int base_size = 16; const int feat_stride = stride[stride_index]; @@ -615,10 +615,10 @@ int main(int argc, char* argv[]) float x1 = x0 + face_objects[i].rect.w; float y1 = y0 + face_objects[i].rect.h; - x0 = std::max(std::min(x0, ( float )image_size.width - 1), 0.f); - y0 = std::max(std::min(y0, ( float )image_size.height - 1), 0.f); - x1 = std::max(std::min(x1, ( float )image_size.width - 1), 0.f); - y1 = std::max(std::min(y1, ( float )image_size.height - 1), 0.f); + x0 = std::max(std::min(x0, (float)image_size.width - 1), 0.f); + y0 = std::max(std::min(y0, (float)image_size.height - 1), 0.f); + x1 = std::max(std::min(x1, (float)image_size.width - 1), 0.f); + y1 = std::max(std::min(y1, (float)image_size.height - 1), 0.f); face_objects[i].rect.x = x0; face_objects[i].rect.y = y0; diff --git a/examples/tm_ultraface.cpp b/examples/tm_ultraface.cpp index ac426a36f..abf1f8f43 100644 --- a/examples/tm_ultraface.cpp +++ b/examples/tm_ultraface.cpp @@ -31,12 +31,12 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_REPEAT_COUNT 1 -#define DEFAULT_THREAD_COUNT 1 -#define num_featuremap 4 -#define hard_nms 1 -#define blending_nms 2 /* mix nms was been proposaled in paper blaze face, aims to minimize the temporal jitter*/ -#define clip(x, y) (x < 0 ? 0 : (x > y ? y : x)) +#define DEFAULT_REPEAT_COUNT 1 +#define DEFAULT_THREAD_COUNT 1 +#define num_featuremap 4 +#define hard_nms 1 +#define blending_nms 2 /* mix nms was been proposaled in paper blaze face, aims to minimize the temporal jitter*/ +#define clip(x, y) (x < 0 ? 0 : (x > y ? y : x)) typedef struct FaceInfo { @@ -114,49 +114,52 @@ static void nms(std::vector& input, std::vector& output, int } switch (type) { - case hard_nms: { - output.push_back(buf[0]); - break; - } - case blending_nms: { - float total = 0; - for (int i = 0; i < buf.size(); i++) - { - total += exp(buf[i].score); - } - FaceInfo rects; - memset(&rects, 0, sizeof(rects)); - for (int i = 0; i < buf.size(); i++) - { - float rate = exp(buf[i].score) / total; - rects.x1 += buf[i].x1 * rate; - rects.y1 += buf[i].y1 * rate; - rects.x2 += buf[i].x2 * rate; - rects.y2 += buf[i].y2 * rate; - rects.score += buf[i].score * rate; - } - output.push_back(rects); - break; + case hard_nms: + { + output.push_back(buf[0]); + break; + } + case blending_nms: + { + float total = 0; + for (int i = 0; i < buf.size(); i++) + { + total += exp(buf[i].score); } - default: { - fprintf(stderr, "wrong type of nms."); - exit(-1); + FaceInfo rects; + memset(&rects, 0, sizeof(rects)); + for (int i = 0; i < buf.size(); i++) + { + float rate = exp(buf[i].score) / total; + rects.x1 += buf[i].x1 * rate; + rects.y1 += buf[i].y1 * rate; + rects.x2 += buf[i].x2 * rate; + rects.y2 += buf[i].y2 * rate; + rects.score += buf[i].score * rate; } + output.push_back(rects); + break; + } + default: + { + fprintf(stderr, "wrong type of nms."); + exit(-1); + } } } } -static void post_process_ultraface(const char* image_file, float *boxs_data, float *scores_data) +static void post_process_ultraface(const char* image_file, float* boxs_data, float* scores_data) { image im = imread(image_file); int image_h = im.h; int image_w = im.w; - const std::vector> min_boxes = { + const std::vector > min_boxes = { {10.0f, 16.0f, 24.0f}, {32.0f, 48.0f}, {64.0f, 96.0f}, {128.0f, 192.0f, 256.0f}}; - std::vector> shrinkage_size; - std::vector> priors = {}; - std::vector> featuremap_size; + std::vector > shrinkage_size; + std::vector > priors = {}; + std::vector > featuremap_size; const std::vector strides = {8.0, 16.0, 32.0, 64.0}; std::vector w_h_list = {g_tensor_in_w, g_tensor_in_h}; @@ -256,23 +259,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -315,8 +318,8 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = g_tensor_in_h * g_tensor_in_w * 3; - int dims[] = {1, 3, g_tensor_in_h, g_tensor_in_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, g_tensor_in_h, g_tensor_in_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -335,7 +338,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -382,8 +385,8 @@ int main(int argc, char* argv[]) tensor_t boxs_tensor = get_graph_output_tensor(graph, 0, 0); tensor_t scores_tensor = get_graph_output_tensor(graph, 1, 0); - float* boxs_data = (float* )get_tensor_buffer(boxs_tensor); - float* scores_data = (float* )get_tensor_buffer(scores_tensor); + float* boxs_data = (float*)get_tensor_buffer(boxs_tensor); + float* scores_data = (float*)get_tensor_buffer(scores_tensor); post_process_ultraface(image_file, boxs_data, scores_data); diff --git a/examples/tm_unet.cpp b/examples/tm_unet.cpp index 4d1929dba..3070846ae 100644 --- a/examples/tm_unet.cpp +++ b/examples/tm_unet.cpp @@ -35,17 +35,17 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 512 -#define DEFAULT_IMG_W 512 -#define DEFAULT_SCALE1 (1.f/255.f) -#define DEFAULT_SCALE2 (1.f/255.f) -#define DEFAULT_SCALE3 (1.f/255.f) -#define DEFAULT_MEAN1 0 -#define DEFAULT_MEAN2 0 -#define DEFAULT_MEAN3 0 -#define DEFAULT_LOOP_COUNT 1 -#define DEFAULT_THREAD_COUNT 1 -#define DEFAULT_CPU_AFFINITY 255 +#define DEFAULT_IMG_H 512 +#define DEFAULT_IMG_W 512 +#define DEFAULT_SCALE1 (1.f / 255.f) +#define DEFAULT_SCALE2 (1.f / 255.f) +#define DEFAULT_SCALE3 (1.f / 255.f) +#define DEFAULT_MEAN1 0 +#define DEFAULT_MEAN2 0 +#define DEFAULT_MEAN3 0 +#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_THREAD_COUNT 1 +#define DEFAULT_CPU_AFFINITY 255 #define DEFAULT_CONF_THRESHOLD 0.5f /** @@ -55,22 +55,29 @@ * because of the onnx->tmfile convertion problem, keep the network input size dividable by 16 (256,512) */ -int draw_segmentation(const int32_t* data, int h, int w) { - static std::map color_table = {{0, cv::Vec3b(0,0,0)}, - {1, cv::Vec3b(20,59,255)}, - {2, cv::Vec3b(120,59,200)}, - {3, cv::Vec3b(80,29,129)}, - {4, cv::Vec3b(210,99,12)}, // add more color if needed - {-1, cv::Vec3b(255,255,255)} // other type - }; +int draw_segmentation(const int32_t* data, int h, int w) +{ + static std::map color_table = { + {0, cv::Vec3b(0, 0, 0)}, + {1, cv::Vec3b(20, 59, 255)}, + {2, cv::Vec3b(120, 59, 200)}, + {3, cv::Vec3b(80, 29, 129)}, + {4, cv::Vec3b(210, 99, 12)}, // add more color if needed + {-1, cv::Vec3b(255, 255, 255)} // other type + }; cv::Mat img = cv::Mat::zeros(h, w, CV_8UC3); - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { + for (int i = 0; i < h; ++i) + { + for (int j = 0; j < w; ++j) + { cv::Vec3b color; int32_t value = data[i * w + j]; - if (color_table.count(value) > 0) { + if (color_table.count(value) > 0) + { color = color_table.at(value); - } else { + } + else + { color = color_table.at(-1); } img.at(i, j) = color; @@ -81,7 +88,7 @@ int draw_segmentation(const int32_t* data, int h, int w) { } int tengine_segment(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean, - const float* scale, int loop_count, int num_thread, int affinity, float conf_thresh) + const float* scale, int loop_count, int num_thread, int affinity, float conf_thresh) { /* set runtime options */ struct options opt; @@ -108,8 +115,8 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i /* set the shape, data buffer of input_tensor of the graph */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -128,7 +135,7 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -170,45 +177,56 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i /* get the result of classification */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); int channel = output_size / img_h / img_w; int res = output_size % (img_h * img_w); - if (res != 0) { - fprintf(stderr, "output shape is not supported.\n"); - } else { - int* label_data = new int[img_h * img_w]; - /* single class segmentation */ - if (channel == 1) { - for (int i=0; i < img_h; ++i) { - for (int j=0; j < img_w; ++j) { - float conf = 1/(1+std::exp(-output_data[i*img_w + j])); - label_data[i*img_w + j] = conf > conf_thresh ? 1 : 0; - } + if (res != 0) + { + fprintf(stderr, "output shape is not supported.\n"); + } + else + { + int* label_data = new int[img_h * img_w]; + /* single class segmentation */ + if (channel == 1) + { + for (int i = 0; i < img_h; ++i) + { + for (int j = 0; j < img_w; ++j) + { + float conf = 1 / (1 + std::exp(-output_data[i * img_w + j])); + label_data[i * img_w + j] = conf > conf_thresh ? 1 : 0; + } + } } - } - /* multi-class segmentation */ - else { - for (int i=0; i < img_h; ++i) { - for (int j=0; j < img_w; ++j) { - int argmax_id = -1; - float max_conf = std::numeric_limits::min(); - for (int k=0; k < channel; ++k) { - float out_value = output_data[k * img_w * img_h + i * img_w + j]; - if (out_value > max_conf) { - argmax_id = k; - max_conf = out_value; - } - } - label_data[i*img_w + j] = argmax_id; - } + /* multi-class segmentation */ + else + { + for (int i = 0; i < img_h; ++i) + { + for (int j = 0; j < img_w; ++j) + { + int argmax_id = -1; + float max_conf = std::numeric_limits::min(); + for (int k = 0; k < channel; ++k) + { + float out_value = output_data[k * img_w * img_h + i * img_w + j]; + if (out_value > max_conf) + { + argmax_id = k; + max_conf = out_value; + } + } + label_data[i * img_w + j] = argmax_id; + } + } } - } - /* visualization */ - draw_segmentation(label_data, img_h, img_w); - fprintf(stderr, "segmentatation result is save as unet_out.png\n"); - delete[] label_data; + /* visualization */ + draw_segmentation(label_data, img_h, img_w); + fprintf(stderr, "segmentatation result is save as unet_out.png\n"); + delete[] label_data; } /* release tengine */ @@ -246,40 +264,40 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'a': - cpu_affinity = atoi(optarg); - break; - case 'c': - conf_thresh = atof(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'a': + cpu_affinity = atoi(optarg); + break; + case 'c': + conf_thresh = atof(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_yolact.cpp b/examples/tm_yolact.cpp index 1d2e2d499..8e0aaa5e4 100644 --- a/examples/tm_yolact.cpp +++ b/examples/tm_yolact.cpp @@ -86,7 +86,7 @@ void get_input_data_cv(const cv::Mat& sample, float* input_data, int img_h, int cv::resize(img, img, cv::Size(img_h, img_w)); img.convertTo(img, CV_32FC3); - float* img_data = ( float* )img.data; + float* img_data = (float*)img.data; int hw = img_h * img_w; for (int h = 0; h < img_h; h++) { @@ -166,10 +166,10 @@ static inline float intersection_area(const Object& a, const Object& b) return inter.area(); } -static void fast_nms(std::vector>& class_candidates, std::vector& objects, +static void fast_nms(std::vector >& class_candidates, std::vector& objects, const float iou_thresh, const int nms_top_k, const int keep_top_k) { - for (int i = 0; i < ( int )class_candidates.size(); i++) + for (int i = 0; i < (int)class_candidates.size(); i++) { std::vector& candidate = class_candidates[i]; std::sort(candidate.begin(), candidate.end(), [](const Object& a, const Object& b) { return a.prob > b.prob; }); @@ -189,7 +189,7 @@ static void fast_nms(std::vector>& class_candidates, std::ve { areas[j] = candidate[j].rect.area(); } - std::vector> iou_matrix; + std::vector > iou_matrix; for (int j = 0; j < n; j++) { std::vector iou_row(n); @@ -252,7 +252,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = target_size * target_size * 3; - int dims[] = {1, 3, target_size, target_size}; // nchw + int dims[] = {1, 3, target_size, target_size}; // nchw std::vector input_data(img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); @@ -272,7 +272,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -311,10 +311,10 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const tensor_t location_tensor = get_graph_output_tensor(graph, 2, 0); tensor_t mask_tensor = get_graph_output_tensor(graph, 3, 0); tensor_t confidence_tensor = get_graph_output_tensor(graph, 4, 0); - float* maskmaps = ( float* )get_tensor_buffer(maskmaps_tensor); - float* location = ( float* )get_tensor_buffer(location_tensor); - float* mask = ( float* )get_tensor_buffer(mask_tensor); - float* confidence = ( float* )get_tensor_buffer(confidence_tensor); + float* maskmaps = (float*)get_tensor_buffer(maskmaps_tensor); + float* location = (float*)get_tensor_buffer(location_tensor); + float* mask = (float*)get_tensor_buffer(mask_tensor); + float* confidence = (float*)get_tensor_buffer(confidence_tensor); int num_class = 81; int num_priors = 19248; @@ -323,7 +323,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const const float nms_thresh = 0.5f; const int keep_top_k = 200; - std::vector> class_candidates; + std::vector > class_candidates; class_candidates.resize(num_class); for (int i = 0; i < num_priors; i++) @@ -352,18 +352,18 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const float bbox_cx = var[0] * loc[0] * priorbox.w + priorbox.cx; float bbox_cy = var[1] * loc[1] * priorbox.h + priorbox.cy; - float bbox_w = ( float )(exp(var[2] * loc[2]) * priorbox.w); - float bbox_h = ( float )(exp(var[3] * loc[3]) * priorbox.h); + float bbox_w = (float)(exp(var[2] * loc[2]) * priorbox.w); + float bbox_h = (float)(exp(var[3] * loc[3]) * priorbox.h); float obj_x1 = bbox_cx - bbox_w * 0.5f; float obj_y1 = bbox_cy - bbox_h * 0.5f; float obj_x2 = bbox_cx + bbox_w * 0.5f; float obj_y2 = bbox_cy + bbox_h * 0.5f; - obj_x1 = std::max(std::min(obj_x1 * bgr.cols, ( float )(bgr.cols - 1)), 0.f); - obj_y1 = std::max(std::min(obj_y1 * bgr.rows, ( float )(bgr.rows - 1)), 0.f); - obj_x2 = std::max(std::min(obj_x2 * bgr.cols, ( float )(bgr.cols - 1)), 0.f); - obj_y2 = std::max(std::min(obj_y2 * bgr.rows, ( float )(bgr.rows - 1)), 0.f); + obj_x1 = std::max(std::min(obj_x1 * bgr.cols, (float)(bgr.cols - 1)), 0.f); + obj_y1 = std::max(std::min(obj_y1 * bgr.rows, (float)(bgr.rows - 1)), 0.f); + obj_x2 = std::max(std::min(obj_x2 * bgr.cols, (float)(bgr.cols - 1)), 0.f); + obj_y2 = std::max(std::min(obj_y2 * bgr.rows, (float)(bgr.rows - 1)), 0.f); Object obj; obj.rect = cv::Rect_(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1); @@ -390,7 +390,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const { const float* maskmap = maskmaps + p; float coeff = obj.maskdata[p]; - float* mp = ( float* )mask1.data; + float* mp = (float*)mask1.data; // mask += m * coeff for (int j = 0; j < 138 * 138; j++) @@ -534,8 +534,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) {0, 255, 75}, {0, 255, 151}, {255, 56, 0}, - {245, 255, 0} - }; + {245, 255, 0}}; cv::Mat image = bgr.clone(); @@ -613,23 +612,23 @@ int main(int argc, char** argv) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_yolact_uint8.cpp b/examples/tm_yolact_uint8.cpp index e344d39c2..642b5acf8 100644 --- a/examples/tm_yolact_uint8.cpp +++ b/examples/tm_yolact_uint8.cpp @@ -61,8 +61,8 @@ struct Object cv::Mat mask; }; -void get_input_data_cv_uint8(const cv::Mat& sample, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale, - float input_scale, int zero_point) +void get_input_data_cv_uint8(const cv::Mat& sample, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale, + float input_scale, int zero_point) { cv::Mat img; if (sample.channels() == 4) @@ -84,20 +84,21 @@ void get_input_data_cv_uint8(const cv::Mat& sample, uint8_t* input_data, int img cv::resize(img, img, cv::Size(img_h, img_w)); img.convertTo(img, CV_32FC3); - float* img_data = (float* )img.data; + float* img_data = (float*)img.data; /* nhwc to nchw */ for (int h = 0; h < img_h; h++) - { for (int w = 0; w < img_w; w++) + { + for (int w = 0; w < img_w; w++) { for (int c = 0; c < 3; c++) { - int in_index = h * img_w * 3 + w * 3 + c; + int in_index = h * img_w * 3 + w * 3 + c; int out_index = c * img_h * img_w + h * img_w + w; float input_fp32 = (img_data[in_index] - mean[c]) * scale[c]; /* quant to uint8 */ - int udata = (round)(input_fp32 / input_scale + ( float )zero_point); + int udata = (round)(input_fp32 / input_scale + (float)zero_point); if (udata > 255) udata = 255; else if (udata < 0) @@ -174,10 +175,10 @@ static inline float intersection_area(const Object& a, const Object& b) return inter.area(); } -static void fast_nms(std::vector>& class_candidates, std::vector& objects, +static void fast_nms(std::vector >& class_candidates, std::vector& objects, const float iou_thresh, const int nms_top_k, const int keep_top_k) { - for (int i = 0; i < ( int )class_candidates.size(); i++) + for (int i = 0; i < (int)class_candidates.size(); i++) { std::vector& candidate = class_candidates[i]; std::sort(candidate.begin(), candidate.end(), [](const Object& a, const Object& b) { return a.prob > b.prob; }); @@ -197,7 +198,7 @@ static void fast_nms(std::vector>& class_candidates, std::ve { areas[j] = candidate[j].rect.area(); } - std::vector> iou_matrix; + std::vector > iou_matrix; for (int j = 0; j < n; j++) { std::vector iou_row(n); @@ -260,7 +261,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = target_size * target_size * 3; - int dims[] = {1, 3, target_size, target_size}; // nchw + int dims[] = {1, 3, target_size, target_size}; // nchw std::vector input_data(img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); @@ -280,7 +281,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -292,7 +293,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const /* prepare process input data, set the data mem to input tensor */ float input_scale = 0.f; int input_zero_point = 0; - get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); get_input_data_cv_uint8(bgr, input_data.data(), target_size, target_size, mean_vals, norm_vals, input_scale, input_zero_point); /* run graph */ @@ -318,19 +319,19 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const fprintf(stderr, "--------------------------------------\n"); /* dequant output data */ - tensor_t maskmaps_tensor = get_graph_output_tensor(graph, 1, 0); - tensor_t location_tensor = get_graph_output_tensor(graph, 2, 0); - tensor_t mask_tensor = get_graph_output_tensor(graph, 3, 0); + tensor_t maskmaps_tensor = get_graph_output_tensor(graph, 1, 0); + tensor_t location_tensor = get_graph_output_tensor(graph, 2, 0); + tensor_t mask_tensor = get_graph_output_tensor(graph, 3, 0); tensor_t confidence_tensor = get_graph_output_tensor(graph, 4, 0); float maskmaps_scale = 0.f; float location_scale = 0.f; - float mask_scale = 0.f; + float mask_scale = 0.f; float confidence_scale = 0.f; int maskmaps_zero_point = 0; int location_zero_point = 0; - int mask_zero_point = 0; + int mask_zero_point = 0; int confidence_zero_point = 0; get_tensor_quant_param(maskmaps_tensor, &maskmaps_scale, &maskmaps_zero_point, 1); @@ -338,15 +339,15 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const get_tensor_quant_param(mask_tensor, &mask_scale, &mask_zero_point, 1); get_tensor_quant_param(confidence_tensor, &confidence_scale, &confidence_zero_point, 1); - int maskmaps_count = get_tensor_buffer_size(maskmaps_tensor) / sizeof(uint8_t); - int location_count = get_tensor_buffer_size(location_tensor) / sizeof(uint8_t); - int mask_count = get_tensor_buffer_size(mask_tensor) / sizeof(uint8_t); + int maskmaps_count = get_tensor_buffer_size(maskmaps_tensor) / sizeof(uint8_t); + int location_count = get_tensor_buffer_size(location_tensor) / sizeof(uint8_t); + int mask_count = get_tensor_buffer_size(mask_tensor) / sizeof(uint8_t); int confidence_count = get_tensor_buffer_size(confidence_tensor) / sizeof(uint8_t); - uint8_t* maskmaps_u8 = ( uint8_t* )get_tensor_buffer(maskmaps_tensor); - uint8_t* location_u8 = ( uint8_t* )get_tensor_buffer(location_tensor); - uint8_t* mask_u8 = ( uint8_t* )get_tensor_buffer(mask_tensor); - uint8_t* confidence_u8 = ( uint8_t* )get_tensor_buffer(confidence_tensor); + uint8_t* maskmaps_u8 = (uint8_t*)get_tensor_buffer(maskmaps_tensor); + uint8_t* location_u8 = (uint8_t*)get_tensor_buffer(location_tensor); + uint8_t* mask_u8 = (uint8_t*)get_tensor_buffer(mask_tensor); + uint8_t* confidence_u8 = (uint8_t*)get_tensor_buffer(confidence_tensor); std::vector maskmaps(maskmaps_count); std::vector location(location_count); @@ -355,23 +356,23 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const for (int c = 0; c < maskmaps_count; c++) { - maskmaps[c] = (( float )maskmaps_u8[c] - ( float )maskmaps_zero_point) * maskmaps_scale; + maskmaps[c] = ((float)maskmaps_u8[c] - (float)maskmaps_zero_point) * maskmaps_scale; } for (int c = 0; c < location_count; c++) { - location[c] = (( float )location_u8[c] - ( float )location_zero_point) * location_scale; + location[c] = ((float)location_u8[c] - (float)location_zero_point) * location_scale; } for (int c = 0; c < mask_count; c++) { - mask[c] = (( float )mask_u8[c] - ( float )mask_zero_point) * mask_scale; + mask[c] = ((float)mask_u8[c] - (float)mask_zero_point) * mask_scale; } for (int c = 0; c < confidence_count; c++) { - confidence[c] = (( float )confidence_u8[c] - ( float )confidence_zero_point) * confidence_scale; - } + confidence[c] = ((float)confidence_u8[c] - (float)confidence_zero_point) * confidence_scale; + } /* postprocess */ int num_class = 81; @@ -381,7 +382,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const const float nms_thresh = 0.5f; const int keep_top_k = 200; - std::vector> class_candidates; + std::vector > class_candidates; class_candidates.resize(num_class); for (int i = 0; i < num_priors; i++) @@ -410,18 +411,18 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const float bbox_cx = var[0] * loc[0] * priorbox.w + priorbox.cx; float bbox_cy = var[1] * loc[1] * priorbox.h + priorbox.cy; - float bbox_w = ( float )(exp(var[2] * loc[2]) * priorbox.w); - float bbox_h = ( float )(exp(var[3] * loc[3]) * priorbox.h); + float bbox_w = (float)(exp(var[2] * loc[2]) * priorbox.w); + float bbox_h = (float)(exp(var[3] * loc[3]) * priorbox.h); float obj_x1 = bbox_cx - bbox_w * 0.5f; float obj_y1 = bbox_cy - bbox_h * 0.5f; float obj_x2 = bbox_cx + bbox_w * 0.5f; float obj_y2 = bbox_cy + bbox_h * 0.5f; - obj_x1 = std::max(std::min(obj_x1 * bgr.cols, ( float )(bgr.cols - 1)), 0.f); - obj_y1 = std::max(std::min(obj_y1 * bgr.rows, ( float )(bgr.rows - 1)), 0.f); - obj_x2 = std::max(std::min(obj_x2 * bgr.cols, ( float )(bgr.cols - 1)), 0.f); - obj_y2 = std::max(std::min(obj_y2 * bgr.rows, ( float )(bgr.rows - 1)), 0.f); + obj_x1 = std::max(std::min(obj_x1 * bgr.cols, (float)(bgr.cols - 1)), 0.f); + obj_y1 = std::max(std::min(obj_y1 * bgr.rows, (float)(bgr.rows - 1)), 0.f); + obj_x2 = std::max(std::min(obj_x2 * bgr.cols, (float)(bgr.cols - 1)), 0.f); + obj_y2 = std::max(std::min(obj_y2 * bgr.rows, (float)(bgr.rows - 1)), 0.f); Object obj; obj.rect = cv::Rect_(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1); @@ -448,7 +449,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector& objects, const { const float* maskmap = maskmaps.data() + p; float coeff = obj.maskdata[p]; - float* mp = ( float* )mask1.data; + float* mp = (float*)mask1.data; // mask += m * coeff for (int j = 0; j < 138 * 138; j++) @@ -592,8 +593,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) {0, 255, 75}, {0, 255, 151}, {255, 56, 0}, - {245, 255, 0} - }; + {245, 255, 0}}; cv::Mat image = bgr.clone(); @@ -671,23 +671,23 @@ int main(int argc, char** argv) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/examples/tm_yolofastest.cpp b/examples/tm_yolofastest.cpp index acaa42cdc..69824e9c6 100644 --- a/examples/tm_yolofastest.cpp +++ b/examples/tm_yolofastest.cpp @@ -23,7 +23,7 @@ * * original model: https://github.com/dog-qiuqiu/Yolo-Fastest/tree/master/ModelZoo/yolo-fastest-1.1_coco */ - + #include #include #include @@ -70,20 +70,20 @@ struct TMat return (const float*)data; } - float *row(int row) const + float* row(int row) const { - return (float *)data + w * row; + return (float*)data + w * row; } - TMat channel_range(int start, int chn_num) const + TMat channel_range(int start, int chn_num) const { - TMat mat = { 0 }; + TMat mat = {0}; mat.batch = 1; mat.c = chn_num; mat.h = h; mat.w = w; - mat.data = (float *)data + start * h * w; + mat.data = (float*)data + start * h * w; return mat; } @@ -94,7 +94,7 @@ struct TMat } int batch, c, h, w; - void *data; + void* data; }; class Yolov3DetectionOutput @@ -102,8 +102,8 @@ class Yolov3DetectionOutput public: int init(int version); int forward(const std::vector& bottom_blobs, std::vector& top_blobs); -private: +private: int m_num_box; int m_num_class; float m_anchors_scale[32]; @@ -122,24 +122,23 @@ static const char* class_names[] = { "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" -}; + "hair drier", "toothbrush"}; int Yolov3DetectionOutput::init(int version) { memset(this, 0, sizeof(*this)); m_num_box = 3; m_num_class = 80; - - fprintf(stderr, "Yolov3DetectionOutput init param[%d]\n", version); - + + fprintf(stderr, "Yolov3DetectionOutput init param[%d]\n", version); + if (version == YOLOV3) { m_anchors_scale[0] = 32; m_anchors_scale[1] = 16; m_anchors_scale[2] = 8; - float bias[] = { 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 }; + float bias[] = {10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326}; memcpy(m_biases, bias, sizeof(bias)); m_mask[0] = 6; @@ -159,7 +158,7 @@ int Yolov3DetectionOutput::init(int version) m_anchors_scale[0] = 32; m_anchors_scale[1] = 16; - float bias[] = { 12, 18, 37, 49, 52,132, 115, 73, 119,199, 242,238 }; + float bias[] = {12, 18, 37, 49, 52, 132, 115, 73, 119, 199, 242, 238}; memcpy(m_biases, bias, sizeof(bias)); m_mask[0] = 3; @@ -345,7 +344,7 @@ int Yolov3DetectionOutput::forward(const std::vector& bottom_blobs, std::v float area = bbox_w * bbox_h; - BBoxRect c = { confidence, bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax, area, class_index }; + BBoxRect c = {confidence, bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax, area, class_index}; all_box_bbox_rects[pp].push_back(c); } @@ -410,13 +409,13 @@ int Yolov3DetectionOutput::forward(const std::vector& bottom_blobs, std::v static void get_input_data_darknet(const char* image_file, float* input_data, int net_h, int net_w) { - float mean[3] = { 0.f, 0.f, 0.f }; - float scale[3] = { 1.0f / 255, 1.0f / 255, 1.0f / 255 }; + float mean[3] = {0.f, 0.f, 0.f}; + float scale[3] = {1.0f / 255, 1.0f / 255, 1.0f / 255}; //no letter box by default get_input_data(image_file, input_data, net_h, net_w, mean, scale); // input rgb - image swaprgb_img = { 0 }; + image swaprgb_img = {0}; swaprgb_img.c = 3; swaprgb_img.w = net_w; swaprgb_img.h = net_h; @@ -429,11 +428,11 @@ static void show_usage() fprintf(stderr, "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); } -static void run_yolo(graph_t graph, std::vector &boxes, int img_width, int img_height) +static void run_yolo(graph_t graph, std::vector& boxes, int img_width, int img_height) { Yolov3DetectionOutput yolo; std::vector yolo_inputs, yolo_outputs; - + yolo.init(YOLO_FASTEST); int output_node_num = get_graph_output_node_number(graph); @@ -442,8 +441,8 @@ static void run_yolo(graph_t graph, std::vector &boxes, int img_width, for (int i = 0; i < output_node_num; ++i) { - tensor_t out_tensor = get_graph_output_tensor(graph, i, 0); //"detection_out" - int out_dim[4] = { 0 }; + tensor_t out_tensor = get_graph_output_tensor(graph, i, 0); //"detection_out" + int out_dim[4] = {0}; get_tensor_shape(out_tensor, out_dim, 4); yolo_inputs[i].batch = out_dim[0]; @@ -485,9 +484,9 @@ static void run_yolo(graph_t graph, std::vector &boxes, int img_width, //rect correct for (int i = 0; i < yolo_outputs[0].h; i++) { - float *data_row = yolo_outputs[0].row(i); + float* data_row = yolo_outputs[0].row(i); - BBoxRect box = { 0 }; + BBoxRect box = {0}; box.score = data_row[1]; box.label = data_row[0]; box.xmin = (data_row[2] - roi_left) / roi_width * img_width; @@ -584,7 +583,7 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = net_h * net_w * 3; - int dims[] = { 1, 3, net_h, net_w }; // nchw + int dims[] = {1, 3, net_h, net_w}; // nchw std::vector input_data(img_size); @@ -636,7 +635,7 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, - num_thread, total_time / repeat_count, max_time, min_time); + num_thread, total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* process the detection result */ diff --git a/examples/tm_yolov3.cpp b/examples/tm_yolov3.cpp index 5127480c0..d2e5e7bad 100644 --- a/examples/tm_yolov3.cpp +++ b/examples/tm_yolov3.cpp @@ -146,15 +146,16 @@ void get_input_data_yolov3(const char* image_file, float* input_data, int img_h, /* resize process */ cv::resize(img, img, cv::Size(img_w, img_h)); img.convertTo(img, CV_32FC3); - float* img_data = (float* )img.data; + float* img_data = (float*)img.data; /* nhwc to nchw */ for (int h = 0; h < img_h; h++) - { for (int w = 0; w < img_w; w++) + { + for (int w = 0; w < img_w; w++) { for (int c = 0; c < 3; c++) { - int in_index = h * img_w * 3 + w * 3 + c; + int in_index = h * img_w * 3 + w * 3 + c; int out_index = c * img_h * img_w + h * img_w + w; input_data[out_index] = (img_data[in_index] - mean[c]) * scale[c]; } @@ -171,11 +172,11 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho int feat_h = 416.0 / stride; int cls_num = 80; int anchor_group = 0; - if(stride == 8) + if (stride == 8) anchor_group = 1; - if(stride == 16) + if (stride == 16) anchor_group = 2; - if(stride == 32) + if (stride == 32) anchor_group = 3; //printf("anchor_group:%d\n",anchor_group); for (int h = 0; h <= feat_h - 1; h++) @@ -191,7 +192,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho { int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size; float score = feat[score_index]; - if(score > class_score) + if (score > class_score) { class_index = s; class_score = score; @@ -199,7 +200,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho } float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size]; float final_score = sigmoid(box_score) * sigmoid(class_score); - if(final_score >= prob_threshold) + if (final_score >= prob_threshold) { int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size; int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size; @@ -219,7 +220,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho float pred_y = (h + dy) * stride; float pred_w = exp(dw) * anchor_w; float pred_h = exp(dh) * anchor_h; - + float x0 = (pred_x - pred_w * 0.5f); float y0 = (pred_y - pred_h * 0.5f); float x1 = (pred_x + pred_w * 0.5f); @@ -232,7 +233,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho obj.rect.height = y1 - y0; obj.label = class_index; obj.prob = final_score; - objects.push_back(obj); + objects.push_back(obj); } } } @@ -250,8 +251,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -312,23 +312,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -355,7 +355,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "cv::imread %s failed\n", image_file); return -1; - } + } /* set runtime options */ struct options opt; @@ -432,16 +432,16 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); - tensor_t p8_output = get_graph_output_tensor(graph, 2, 0); + tensor_t p8_output = get_graph_output_tensor(graph, 2, 0); tensor_t p16_output = get_graph_output_tensor(graph, 1, 0); tensor_t p32_output = get_graph_output_tensor(graph, 0, 0); - - float* p8_data = ( float*)get_tensor_buffer(p8_output); - float* p16_data = ( float*)get_tensor_buffer(p16_output); - float* p32_data = ( float*)get_tensor_buffer(p32_output); + + float* p8_data = (float*)get_tensor_buffer(p8_output); + float* p16_data = (float*)get_tensor_buffer(p16_output); + float* p32_data = (float*)get_tensor_buffer(p32_output); /* postprocess */ const float prob_threshold = 0.4f; @@ -473,7 +473,7 @@ int main(int argc, char* argv[]) float ratio_y = (float)raw_h / img_h; int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); for (int i = 0; i < count; i++) diff --git a/examples/tm_yolov3_tiny.cpp b/examples/tm_yolov3_tiny.cpp index d77a3d823..3516e2dea 100644 --- a/examples/tm_yolov3_tiny.cpp +++ b/examples/tm_yolov3_tiny.cpp @@ -146,15 +146,16 @@ void get_input_data_yolov3(const char* image_file, float* input_data, int img_h, /* resize process */ cv::resize(img, img, cv::Size(img_w, img_h)); img.convertTo(img, CV_32FC3); - float* img_data = (float* )img.data; + float* img_data = (float*)img.data; /* nhwc to nchw */ for (int h = 0; h < img_h; h++) - { for (int w = 0; w < img_w; w++) + { + for (int w = 0; w < img_w; w++) { for (int c = 0; c < 3; c++) { - int in_index = h * img_w * 3 + w * 3 + c; + int in_index = h * img_w * 3 + w * 3 + c; int out_index = c * img_h * img_w + h * img_w + w; input_data[out_index] = (img_data[in_index] - mean[c]) * scale[c]; } @@ -172,9 +173,9 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho int cls_num = 80; int anchor_group = 0; - if(stride == 16) + if (stride == 16) anchor_group = 1; - if(stride == 32) + if (stride == 32) anchor_group = 2; //printf("anchor_group:%d\n",anchor_group); for (int h = 0; h <= feat_h - 1; h++) @@ -190,7 +191,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho { int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size; float score = feat[score_index]; - if(score > class_score) + if (score > class_score) { class_index = s; class_score = score; @@ -198,7 +199,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho } float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size]; float final_score = sigmoid(box_score) * sigmoid(class_score); - if(final_score >= prob_threshold) + if (final_score >= prob_threshold) { int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size; int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size; @@ -218,7 +219,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho float pred_y = (h + dy) * stride; float pred_w = exp(dw) * anchor_w; float pred_h = exp(dh) * anchor_h; - + float x0 = (pred_x - pred_w * 0.5f); float y0 = (pred_y - pred_h * 0.5f); float x1 = (pred_x + pred_w * 0.5f); @@ -231,7 +232,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho obj.rect.height = y1 - y0; obj.label = class_index; obj.prob = final_score; - objects.push_back(obj); + objects.push_back(obj); } } } @@ -249,8 +250,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -311,23 +311,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -354,7 +354,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "cv::imread %s failed\n", image_file); return -1; - } + } /* set runtime options */ struct options opt; @@ -381,7 +381,7 @@ int main(int argc, char* argv[]) int img_size = img_h * img_w * img_c; int dims[] = {1, 3, img_h, img_w}; - float* input_data = ( float* )malloc(img_size * sizeof(float)); + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == nullptr) @@ -431,14 +431,14 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); tensor_t p16_output = get_graph_output_tensor(graph, 1, 0); tensor_t p32_output = get_graph_output_tensor(graph, 0, 0); - - float* p16_data = ( float*)get_tensor_buffer(p16_output); - float* p32_data = ( float*)get_tensor_buffer(p32_output); + + float* p16_data = (float*)get_tensor_buffer(p16_output); + float* p32_data = (float*)get_tensor_buffer(p32_output); /* postprocess */ const float prob_threshold = 0.4f; @@ -468,7 +468,7 @@ int main(int argc, char* argv[]) float ratio_y = (float)raw_h / img_h; int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); for (int i = 0; i < count; i++) diff --git a/examples/tm_yolov3_tiny_uint8.cpp b/examples/tm_yolov3_tiny_uint8.cpp index ed80e3b6e..54d701ee9 100644 --- a/examples/tm_yolov3_tiny_uint8.cpp +++ b/examples/tm_yolov3_tiny_uint8.cpp @@ -133,7 +133,7 @@ static void nms_sorted_bboxes(const std::vector& faceobjects, std::vecto } } -void get_input_data_yolov3_uint8(const char* image_file, uint8_t * input_data, int img_h, int img_w, const float* mean, const float* scale, +void get_input_data_yolov3_uint8(const char* image_file, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale, float input_scale, int zero_point) { cv::Mat sample = cv::imread(image_file, 1); @@ -147,20 +147,21 @@ void get_input_data_yolov3_uint8(const char* image_file, uint8_t * input_data, i /* resize process */ cv::resize(img, img, cv::Size(img_w, img_h)); img.convertTo(img, CV_32FC3); - float* img_data = (float* )img.data; + float* img_data = (float*)img.data; /* nhwc to nchw */ for (int h = 0; h < img_h; h++) - { for (int w = 0; w < img_w; w++) + { + for (int w = 0; w < img_w; w++) { for (int c = 0; c < 3; c++) { - int in_index = h * img_w * 3 + w * 3 + c; + int in_index = h * img_w * 3 + w * 3 + c; int out_index = c * img_h * img_w + h * img_w + w; float input_fp32 = (img_data[in_index] - mean[c]) * scale[c]; /* quant to uint8 */ - int udata = (round)(input_fp32 / input_scale + ( float )zero_point); + int udata = (round)(input_fp32 / input_scale + (float)zero_point); if (udata > 255) udata = 255; else if (udata < 0) @@ -182,9 +183,9 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho int cls_num = 80; int anchor_group = 0; - if(stride == 16) + if (stride == 16) anchor_group = 1; - if(stride == 32) + if (stride == 32) anchor_group = 2; //printf("anchor_group:%d\n",anchor_group); for (int h = 0; h <= feat_h - 1; h++) @@ -200,7 +201,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho { int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size; float score = feat[score_index]; - if(score > class_score) + if (score > class_score) { class_index = s; class_score = score; @@ -208,7 +209,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho } float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size]; float final_score = sigmoid(box_score) * sigmoid(class_score); - if(final_score >= prob_threshold) + if (final_score >= prob_threshold) { int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size; int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size; @@ -228,7 +229,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho float pred_y = (h + dy) * stride; float pred_w = exp(dw) * anchor_w; float pred_h = exp(dh) * anchor_h; - + float x0 = (pred_x - pred_w * 0.5f); float y0 = (pred_y - pred_h * 0.5f); float x1 = (pred_x + pred_w * 0.5f); @@ -241,7 +242,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho obj.rect.height = y1 - y0; obj.label = class_index; obj.prob = final_score; - objects.push_back(obj); + objects.push_back(obj); } } } @@ -259,8 +260,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -321,23 +321,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -364,7 +364,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "cv::imread %s failed\n", image_file); return -1; - } + } /* set runtime options */ struct options opt; @@ -444,7 +444,7 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* dequant output data */ @@ -462,21 +462,21 @@ int main(int argc, char* argv[]) int p16_count = get_tensor_buffer_size(p16_output) / sizeof(uint8_t); int p32_count = get_tensor_buffer_size(p32_output) / sizeof(uint8_t); - uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output); - uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output); + uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output); + uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output); std::vector p16_data(p16_count); std::vector p32_data(p32_count); for (int c = 0; c < p16_count; c++) { - p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale; + p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale; } for (int c = 0; c < p32_count; c++) { - p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale; - } + p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale; + } /* postprocess */ const float prob_threshold = 0.4f; @@ -505,7 +505,7 @@ int main(int argc, char* argv[]) float ratio_y = (float)raw_h / img_h; int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); for (int i = 0; i < count; i++) diff --git a/examples/tm_yolov3_uint8.cpp b/examples/tm_yolov3_uint8.cpp index 35bfca921..93d509ab1 100644 --- a/examples/tm_yolov3_uint8.cpp +++ b/examples/tm_yolov3_uint8.cpp @@ -133,7 +133,7 @@ static void nms_sorted_bboxes(const std::vector& faceobjects, std::vecto } } -void get_input_data_yolov3_uint8(const char* image_file, uint8_t * input_data, int img_h, int img_w, const float* mean, const float* scale, +void get_input_data_yolov3_uint8(const char* image_file, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale, float input_scale, int zero_point) { cv::Mat sample = cv::imread(image_file, 1); @@ -147,20 +147,21 @@ void get_input_data_yolov3_uint8(const char* image_file, uint8_t * input_data, i /* resize process */ cv::resize(img, img, cv::Size(img_w, img_h)); img.convertTo(img, CV_32FC3); - float* img_data = (float* )img.data; + float* img_data = (float*)img.data; /* nhwc to nchw */ for (int h = 0; h < img_h; h++) - { for (int w = 0; w < img_w; w++) + { + for (int w = 0; w < img_w; w++) { for (int c = 0; c < 3; c++) { - int in_index = h * img_w * 3 + w * 3 + c; + int in_index = h * img_w * 3 + w * 3 + c; int out_index = c * img_h * img_w + h * img_w + w; float input_fp32 = (img_data[in_index] - mean[c]) * scale[c]; /* quant to uint8 */ - int udata = (round)(input_fp32 / input_scale + ( float )zero_point); + int udata = (round)(input_fp32 / input_scale + (float)zero_point); if (udata > 255) udata = 255; else if (udata < 0) @@ -181,11 +182,11 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho int feat_h = 416.0 / stride; int cls_num = 80; int anchor_group = 0; - if(stride == 8) + if (stride == 8) anchor_group = 1; - if(stride == 16) + if (stride == 16) anchor_group = 2; - if(stride == 32) + if (stride == 32) anchor_group = 3; //printf("anchor_group:%d\n",anchor_group); for (int h = 0; h <= feat_h - 1; h++) @@ -201,7 +202,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho { int score_index = anchor * 85 * channel_size + feat_w * h + w + (s + 5) * channel_size; float score = feat[score_index]; - if(score > class_score) + if (score > class_score) { class_index = s; class_score = score; @@ -209,7 +210,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho } float box_score = feat[anchor * 85 * channel_size + feat_w * h + w + 4 * channel_size]; float final_score = sigmoid(box_score) * sigmoid(class_score); - if(final_score >= prob_threshold) + if (final_score >= prob_threshold) { int dx_index = anchor * 85 * channel_size + feat_w * h + w + 0 * channel_size; int dy_index = anchor * 85 * channel_size + feat_w * h + w + 1 * channel_size; @@ -217,7 +218,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho int dh_index = anchor * 85 * channel_size + feat_w * h + w + 3 * channel_size; float dx = sigmoid(feat[dx_index]); - + float dy = sigmoid(feat[dy_index]); float dw = feat[dw_index]; @@ -230,8 +231,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho float pred_y = (h + dy) * stride; float pred_w = exp(dw) * anchor_w; float pred_h = exp(dh) * anchor_h; - - + float x0 = (pred_x - pred_w * 0.5f); float y0 = (pred_y - pred_h * 0.5f); float x1 = (pred_x + pred_w * 0.5f); @@ -244,7 +244,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho obj.rect.height = y1 - y0; obj.label = class_index; obj.prob = final_score; - objects.push_back(obj); + objects.push_back(obj); } } } @@ -262,8 +262,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -324,23 +323,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -367,7 +366,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "cv::imread %s failed\n", image_file); return -1; - } + } /* set runtime options */ struct options opt; @@ -447,18 +446,18 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* dequant output data */ - tensor_t p8_output = get_graph_output_tensor(graph, 2, 0); + tensor_t p8_output = get_graph_output_tensor(graph, 2, 0); tensor_t p16_output = get_graph_output_tensor(graph, 1, 0); tensor_t p32_output = get_graph_output_tensor(graph, 0, 0); - float p8_scale = 0.f; + float p8_scale = 0.f; float p16_scale = 0.f; float p32_scale = 0.f; - int p8_zero_point = 0; + int p8_zero_point = 0; int p16_zero_point = 0; int p32_zero_point = 0; @@ -466,13 +465,13 @@ int main(int argc, char* argv[]) get_tensor_quant_param(p16_output, &p16_scale, &p16_zero_point, 1); get_tensor_quant_param(p32_output, &p32_scale, &p32_zero_point, 1); - int p8_count = get_tensor_buffer_size(p8_output) / sizeof(uint8_t); + int p8_count = get_tensor_buffer_size(p8_output) / sizeof(uint8_t); int p16_count = get_tensor_buffer_size(p16_output) / sizeof(uint8_t); int p32_count = get_tensor_buffer_size(p32_output) / sizeof(uint8_t); - uint8_t* p8_data_u8 = ( uint8_t* )get_tensor_buffer(p8_output); - uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output); - uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output); + uint8_t* p8_data_u8 = (uint8_t*)get_tensor_buffer(p8_output); + uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output); + uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output); std::vector p8_data(p8_count); std::vector p16_data(p16_count); @@ -480,17 +479,17 @@ int main(int argc, char* argv[]) for (int c = 0; c < p8_count; c++) { - p8_data[c] = (( float )p8_data_u8[c] - ( float )p8_zero_point) * p8_scale; + p8_data[c] = ((float)p8_data_u8[c] - (float)p8_zero_point) * p8_scale; } for (int c = 0; c < p16_count; c++) { - p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale; + p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale; } for (int c = 0; c < p32_count; c++) { - p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale; + p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale; } /* postprocess */ @@ -523,7 +522,7 @@ int main(int argc, char* argv[]) float ratio_y = (float)raw_h / img_h; int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); for (int i = 0; i < count; i++) diff --git a/examples/tm_yolov4.cpp b/examples/tm_yolov4.cpp index b09624624..3dea741c0 100644 --- a/examples/tm_yolov4.cpp +++ b/examples/tm_yolov4.cpp @@ -148,15 +148,16 @@ void get_input_data_yolov4(const char* image_file, float* input_data, int img_h, /* resize process */ cv::resize(img, img, cv::Size(img_w, img_h)); img.convertTo(img, CV_32FC3); - float* img_data = (float* )img.data; + float* img_data = (float*)img.data; /* nhwc to nchw */ for (int h = 0; h < img_h; h++) - { for (int w = 0; w < img_w; w++) + { + for (int w = 0; w < img_w; w++) { for (int c = 0; c < 3; c++) { - int in_index = h * img_w * 3 + w * 3 + c; + int in_index = h * img_w * 3 + w * 3 + c; int out_index = c * img_h * img_w + h * img_w + w; input_data[out_index] = (img_data[in_index] - mean[c]) * scale[c]; } @@ -164,7 +165,7 @@ void get_input_data_yolov4(const char* image_file, float* input_data, int img_h, } } -static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects) +static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects) { static float anchors[18] = {12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401}; int anchor_num = 3; @@ -172,11 +173,11 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh int feat_h = 416 / stride; int cls_num = 80; int anchor_group = 0; - if(stride == 8) + if (stride == 8) anchor_group = 1; - if(stride == 16) + if (stride == 16) anchor_group = 2; - if(stride == 32) + if (stride == 32) anchor_group = 3; for (int h = 0; h <= feat_h - 1; h++) @@ -192,7 +193,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh { int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size; float score = feat[score_index]; - if(score > class_score) + if (score > class_score) { class_index = s; class_score = score; @@ -200,7 +201,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh } float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size]; float final_score = sigmoid(box_score) * sigmoid(class_score); - if(final_score >= prob_threshold) + if (final_score >= prob_threshold) { int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size; int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size; @@ -218,8 +219,8 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh float pred_x = (w + dx) * stride; float pred_y = (h + dy) * stride; - float pred_w = exp(dw) * anchor_w ; - float pred_h = exp(dh) * anchor_h ; + float pred_w = exp(dw) * anchor_w; + float pred_h = exp(dh) * anchor_h; float x0 = (pred_x - pred_w * 0.5f); float y0 = (pred_y - pred_h * 0.5f); @@ -233,7 +234,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh obj.rect.height = y1 - y0; obj.label = class_index; obj.prob = final_score; - objects.push_back(obj); + objects.push_back(obj); } } } @@ -251,8 +252,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -313,23 +313,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -356,7 +356,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "cv::imread %s failed\n", image_file); return -1; - } + } /* set runtime options */ struct options opt; @@ -433,18 +433,18 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); tensor_t p8_output = get_graph_output_tensor(graph, 0, 0); tensor_t p16_output = get_graph_output_tensor(graph, 1, 0); tensor_t p32_output = get_graph_output_tensor(graph, 2, 0); - - float* p8_data = ( float*)get_tensor_buffer(p8_output); - float* p16_data = ( float*)get_tensor_buffer(p16_output); - float* p32_data = ( float*)get_tensor_buffer(p32_output); - /* postprocess */ + float* p8_data = (float*)get_tensor_buffer(p8_output); + float* p16_data = (float*)get_tensor_buffer(p16_output); + float* p32_data = (float*)get_tensor_buffer(p32_output); + + /* postprocess */ const float prob_threshold = 0.45f; const float nms_threshold = 0.25f; @@ -473,7 +473,7 @@ int main(int argc, char* argv[]) float ratio_y = (float)raw_h / img_h; int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); for (int i = 0; i < count; i++) diff --git a/examples/tm_yolov4_tiny.cpp b/examples/tm_yolov4_tiny.cpp index cd06f604a..512baed34 100644 --- a/examples/tm_yolov4_tiny.cpp +++ b/examples/tm_yolov4_tiny.cpp @@ -149,15 +149,16 @@ void get_input_data_yolov4(const char* image_file, float* input_data, int img_h, /* resize process */ cv::resize(img, img, cv::Size(img_w, img_h)); img.convertTo(img, CV_32FC3); - float* img_data = (float* )img.data; + float* img_data = (float*)img.data; /* nhwc to nchw */ for (int h = 0; h < img_h; h++) - { for (int w = 0; w < img_w; w++) + { + for (int w = 0; w < img_w; w++) { for (int c = 0; c < 3; c++) { - int in_index = h * img_w * 3 + w * 3 + c; + int in_index = h * img_w * 3 + w * 3 + c; int out_index = c * img_h * img_w + h * img_w + w; input_data[out_index] = (img_data[in_index] - mean[c]) * scale[c]; } @@ -165,7 +166,7 @@ void get_input_data_yolov4(const char* image_file, float* input_data, int img_h, } } -static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects) +static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects) { static float anchors[12] = {10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319}; int anchor_num = 3; @@ -173,9 +174,9 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh int feat_h = 416 / stride; int cls_num = 80; int anchor_group = 0; - if(stride == 16) + if (stride == 16) anchor_group = 1; - if(stride == 32) + if (stride == 32) anchor_group = 2; for (int h = 0; h <= feat_h - 1; h++) @@ -191,7 +192,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh { int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size; float score = feat[score_index]; - if(score > class_score) + if (score > class_score) { class_index = s; class_score = score; @@ -199,7 +200,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh } float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size]; float final_score = sigmoid(box_score) * sigmoid(class_score); - if(final_score >= prob_threshold) + if (final_score >= prob_threshold) { int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size; int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size; @@ -217,8 +218,8 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh float pred_x = (w + dx) * stride; float pred_y = (h + dy) * stride; - float pred_w = exp(dw) * anchor_w ; - float pred_h = exp(dh) * anchor_h ; + float pred_w = exp(dw) * anchor_w; + float pred_h = exp(dh) * anchor_h; float x0 = (pred_x - pred_w * 0.5f); float y0 = (pred_y - pred_h * 0.5f); @@ -232,7 +233,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh obj.rect.height = y1 - y0; obj.label = class_index; obj.prob = final_score; - objects.push_back(obj); + objects.push_back(obj); } } } @@ -250,8 +251,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -312,23 +312,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -355,7 +355,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "cv::imread %s failed\n", image_file); return -1; - } + } /* set runtime options */ struct options opt; @@ -432,17 +432,16 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); - tensor_t p16_output = get_graph_output_tensor(graph, 1, 0); tensor_t p32_output = get_graph_output_tensor(graph, 0, 0); - float* p16_data = ( float*)get_tensor_buffer(p16_output); - float* p32_data = ( float*)get_tensor_buffer(p32_output); + float* p16_data = (float*)get_tensor_buffer(p16_output); + float* p32_data = (float*)get_tensor_buffer(p32_output); - /* postprocess */ + /* postprocess */ const float prob_threshold = 0.45f; const float nms_threshold = 0.25f; @@ -469,7 +468,7 @@ int main(int argc, char* argv[]) float ratio_y = (float)raw_h / img_h; int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); for (int i = 0; i < count; i++) diff --git a/examples/tm_yolov4_tiny_timvx.cpp b/examples/tm_yolov4_tiny_timvx.cpp index 0478d0d55..52f362fec 100644 --- a/examples/tm_yolov4_tiny_timvx.cpp +++ b/examples/tm_yolov4_tiny_timvx.cpp @@ -134,7 +134,7 @@ static void nms_sorted_bboxes(const std::vector& faceobjects, std::vecto } } -void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, int img_h, int img_w, const float* mean, const float* scale, +void get_input_data_yolov4_uint8(const char* image_file, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale, float input_scale, int zero_point) { cv::Mat sample = cv::imread(image_file, 1); @@ -148,20 +148,21 @@ void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, i /* resize process */ cv::resize(img, img, cv::Size(img_w, img_h)); img.convertTo(img, CV_32FC3); - float* img_data = (float* )img.data; + float* img_data = (float*)img.data; /* nhwc to nchw */ for (int h = 0; h < img_h; h++) - { for (int w = 0; w < img_w; w++) + { + for (int w = 0; w < img_w; w++) { for (int c = 0; c < 3; c++) { - int in_index = h * img_w * 3 + w * 3 + c; + int in_index = h * img_w * 3 + w * 3 + c; int out_index = c * img_h * img_w + h * img_w + w; float input_fp32 = (img_data[in_index] - mean[c]) * scale[c]; /* quant to uint8 */ - int udata = (round)(input_fp32 / input_scale + ( float )zero_point); + int udata = (round)(input_fp32 / input_scale + (float)zero_point); if (udata > 255) udata = 255; else if (udata < 0) @@ -173,7 +174,7 @@ void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, i } } -static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects) +static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects) { static float anchors[12] = {10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319}; int anchor_num = 3; @@ -181,9 +182,9 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh int feat_h = 416 / stride; int cls_num = 80; int anchor_group = 0; - if(stride == 16) + if (stride == 16) anchor_group = 1; - if(stride == 32) + if (stride == 32) anchor_group = 2; for (int h = 0; h <= feat_h - 1; h++) @@ -199,7 +200,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh { int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size; float score = feat[score_index]; - if(score > class_score) + if (score > class_score) { class_index = s; class_score = score; @@ -207,7 +208,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh } float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size]; float final_score = sigmoid(box_score) * sigmoid(class_score); - if(final_score >= prob_threshold) + if (final_score >= prob_threshold) { int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size; int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size; @@ -225,8 +226,8 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh float pred_x = (w + dx) * stride; float pred_y = (h + dy) * stride; - float pred_w = exp(dw) * anchor_w ; - float pred_h = exp(dh) * anchor_h ; + float pred_w = exp(dw) * anchor_w; + float pred_h = exp(dh) * anchor_h; float x0 = (pred_x - pred_w * 0.5f); float y0 = (pred_y - pred_h * 0.5f); @@ -240,7 +241,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh obj.rect.height = y1 - y0; obj.label = class_index; obj.prob = final_score; - objects.push_back(obj); + objects.push_back(obj); } } } @@ -258,8 +259,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -320,23 +320,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -363,7 +363,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "cv::imread %s failed\n", image_file); return -1; - } + } /* set runtime options */ struct options opt; @@ -452,7 +452,7 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* dequant output data */ @@ -470,23 +470,23 @@ int main(int argc, char* argv[]) int p16_count = get_tensor_buffer_size(p16_output) / sizeof(uint8_t); int p32_count = get_tensor_buffer_size(p32_output) / sizeof(uint8_t); - uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output); - uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output); + uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output); + uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output); std::vector p16_data(p16_count); std::vector p32_data(p32_count); for (int c = 0; c < p16_count; c++) { - p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale; + p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale; } for (int c = 0; c < p32_count; c++) { - p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale; + p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale; } - /* postprocess */ + /* postprocess */ const float prob_threshold = 0.45f; const float nms_threshold = 0.25f; @@ -513,7 +513,7 @@ int main(int argc, char* argv[]) float ratio_y = (float)raw_h / img_h; int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); for (int i = 0; i < count; i++) diff --git a/examples/tm_yolov4_tiny_uint8.cpp b/examples/tm_yolov4_tiny_uint8.cpp index 2c3c995ac..4ea318c56 100644 --- a/examples/tm_yolov4_tiny_uint8.cpp +++ b/examples/tm_yolov4_tiny_uint8.cpp @@ -134,7 +134,7 @@ static void nms_sorted_bboxes(const std::vector& faceobjects, std::vecto } } -void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, int img_h, int img_w, const float* mean, const float* scale, +void get_input_data_yolov4_uint8(const char* image_file, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale, float input_scale, int zero_point) { cv::Mat sample = cv::imread(image_file, 1); @@ -148,20 +148,21 @@ void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, i /* resize process */ cv::resize(img, img, cv::Size(img_w, img_h)); img.convertTo(img, CV_32FC3); - float* img_data = (float* )img.data; + float* img_data = (float*)img.data; /* nhwc to nchw */ for (int h = 0; h < img_h; h++) - { for (int w = 0; w < img_w; w++) + { + for (int w = 0; w < img_w; w++) { for (int c = 0; c < 3; c++) { - int in_index = h * img_w * 3 + w * 3 + c; + int in_index = h * img_w * 3 + w * 3 + c; int out_index = c * img_h * img_w + h * img_w + w; float input_fp32 = (img_data[in_index] - mean[c]) * scale[c]; /* quant to uint8 */ - int udata = (round)(input_fp32 / input_scale + ( float )zero_point); + int udata = (round)(input_fp32 / input_scale + (float)zero_point); if (udata > 255) udata = 255; else if (udata < 0) @@ -173,7 +174,7 @@ void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, i } } -static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects) +static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects) { static float anchors[12] = {10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319}; int anchor_num = 3; @@ -181,9 +182,9 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh int feat_h = 416 / stride; int cls_num = 80; int anchor_group = 0; - if(stride == 16) + if (stride == 16) anchor_group = 1; - if(stride == 32) + if (stride == 32) anchor_group = 2; for (int h = 0; h <= feat_h - 1; h++) @@ -199,7 +200,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh { int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size; float score = feat[score_index]; - if(score > class_score) + if (score > class_score) { class_index = s; class_score = score; @@ -207,7 +208,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh } float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size]; float final_score = sigmoid(box_score) * sigmoid(class_score); - if(final_score >= prob_threshold) + if (final_score >= prob_threshold) { int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size; int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size; @@ -225,8 +226,8 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh float pred_x = (w + dx) * stride; float pred_y = (h + dy) * stride; - float pred_w = exp(dw) * anchor_w ; - float pred_h = exp(dh) * anchor_h ; + float pred_w = exp(dw) * anchor_w; + float pred_h = exp(dh) * anchor_h; float x0 = (pred_x - pred_w * 0.5f); float y0 = (pred_y - pred_h * 0.5f); @@ -240,7 +241,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh obj.rect.height = y1 - y0; obj.label = class_index; obj.prob = final_score; - objects.push_back(obj); + objects.push_back(obj); } } } @@ -258,8 +259,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -320,23 +320,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -363,7 +363,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "cv::imread %s failed\n", image_file); return -1; - } + } /* set runtime options */ struct options opt; @@ -443,7 +443,7 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* dequant output data */ @@ -461,23 +461,23 @@ int main(int argc, char* argv[]) int p16_count = get_tensor_buffer_size(p16_output) / sizeof(uint8_t); int p32_count = get_tensor_buffer_size(p32_output) / sizeof(uint8_t); - uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output); - uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output); + uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output); + uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output); std::vector p16_data(p16_count); std::vector p32_data(p32_count); for (int c = 0; c < p16_count; c++) { - p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale; + p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale; } for (int c = 0; c < p32_count; c++) { - p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale; + p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale; } - /* postprocess */ + /* postprocess */ const float prob_threshold = 0.45f; const float nms_threshold = 0.25f; @@ -504,7 +504,7 @@ int main(int argc, char* argv[]) float ratio_y = (float)raw_h / img_h; int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); for (int i = 0; i < count; i++) diff --git a/examples/tm_yolov4_uint8.cpp b/examples/tm_yolov4_uint8.cpp index 78e867979..463ea9d7e 100644 --- a/examples/tm_yolov4_uint8.cpp +++ b/examples/tm_yolov4_uint8.cpp @@ -134,7 +134,7 @@ static void nms_sorted_bboxes(const std::vector& faceobjects, std::vecto } } -void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, int img_h, int img_w, const float* mean, const float* scale, +void get_input_data_yolov4_uint8(const char* image_file, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale, float input_scale, int zero_point) { cv::Mat sample = cv::imread(image_file, 1); @@ -148,20 +148,21 @@ void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, i /* resize process */ cv::resize(img, img, cv::Size(img_w, img_h)); img.convertTo(img, CV_32FC3); - float* img_data = (float* )img.data; + float* img_data = (float*)img.data; /* nhwc to nchw */ for (int h = 0; h < img_h; h++) - { for (int w = 0; w < img_w; w++) + { + for (int w = 0; w < img_w; w++) { for (int c = 0; c < 3; c++) { - int in_index = h * img_w * 3 + w * 3 + c; + int in_index = h * img_w * 3 + w * 3 + c; int out_index = c * img_h * img_w + h * img_w + w; float input_fp32 = (img_data[in_index] - mean[c]) * scale[c]; /* quant to uint8 */ - int udata = (round)(input_fp32 / input_scale + ( float )zero_point); + int udata = (round)(input_fp32 / input_scale + (float)zero_point); if (udata > 255) udata = 255; else if (udata < 0) @@ -173,7 +174,7 @@ void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, i } } -static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects) +static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects) { static float anchors[18] = {12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401}; int anchor_num = 3; @@ -181,11 +182,11 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh int feat_h = 416 / stride; int cls_num = 80; int anchor_group = 0; - if(stride == 8) + if (stride == 8) anchor_group = 1; - if(stride == 16) + if (stride == 16) anchor_group = 2; - if(stride == 32) + if (stride == 32) anchor_group = 3; for (int h = 0; h <= feat_h - 1; h++) @@ -201,7 +202,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh { int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size; float score = feat[score_index]; - if(score > class_score) + if (score > class_score) { class_index = s; class_score = score; @@ -209,7 +210,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh } float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size]; float final_score = sigmoid(box_score) * sigmoid(class_score); - if(final_score >= prob_threshold) + if (final_score >= prob_threshold) { int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size; int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size; @@ -227,8 +228,8 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh float pred_x = (w + dx) * stride; float pred_y = (h + dy) * stride; - float pred_w = exp(dw) * anchor_w ; - float pred_h = exp(dh) * anchor_h ; + float pred_w = exp(dw) * anchor_w; + float pred_h = exp(dh) * anchor_h; float x0 = (pred_x - pred_w * 0.5f); float y0 = (pred_y - pred_h * 0.5f); @@ -242,7 +243,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh obj.rect.height = y1 - y0; obj.label = class_index; obj.prob = final_score; - objects.push_back(obj); + objects.push_back(obj); } } } @@ -260,8 +261,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -322,23 +322,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -365,7 +365,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "cv::imread %s failed\n", image_file); return -1; - } + } /* set runtime options */ struct options opt; @@ -445,18 +445,18 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* dequant output data */ - tensor_t p8_output = get_graph_output_tensor(graph, 2, 0); + tensor_t p8_output = get_graph_output_tensor(graph, 2, 0); tensor_t p16_output = get_graph_output_tensor(graph, 1, 0); tensor_t p32_output = get_graph_output_tensor(graph, 0, 0); - float p8_scale = 0.f; + float p8_scale = 0.f; float p16_scale = 0.f; float p32_scale = 0.f; - int p8_zero_point = 0; + int p8_zero_point = 0; int p16_zero_point = 0; int p32_zero_point = 0; @@ -464,13 +464,13 @@ int main(int argc, char* argv[]) get_tensor_quant_param(p16_output, &p16_scale, &p16_zero_point, 1); get_tensor_quant_param(p32_output, &p32_scale, &p32_zero_point, 1); - int p8_count = get_tensor_buffer_size(p8_output) / sizeof(uint8_t); + int p8_count = get_tensor_buffer_size(p8_output) / sizeof(uint8_t); int p16_count = get_tensor_buffer_size(p16_output) / sizeof(uint8_t); int p32_count = get_tensor_buffer_size(p32_output) / sizeof(uint8_t); - uint8_t* p8_data_u8 = ( uint8_t* )get_tensor_buffer(p8_output); - uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output); - uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output); + uint8_t* p8_data_u8 = (uint8_t*)get_tensor_buffer(p8_output); + uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output); + uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output); std::vector p8_data(p8_count); std::vector p16_data(p16_count); @@ -478,20 +478,20 @@ int main(int argc, char* argv[]) for (int c = 0; c < p8_count; c++) { - p8_data[c] = (( float )p8_data_u8[c] - ( float )p8_zero_point) * p8_scale; + p8_data[c] = ((float)p8_data_u8[c] - (float)p8_zero_point) * p8_scale; } for (int c = 0; c < p16_count; c++) { - p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale; + p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale; } for (int c = 0; c < p32_count; c++) { - p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale; + p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale; } - /* postprocess */ + /* postprocess */ const float prob_threshold = 0.45f; const float nms_threshold = 0.25f; @@ -520,7 +520,7 @@ int main(int argc, char* argv[]) float ratio_y = (float)raw_h / img_h; int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); for (int i = 0; i < count; i++) diff --git a/examples/tm_yolov5.cpp b/examples/tm_yolov5.cpp index acf3e4c56..2debc8ece 100644 --- a/examples/tm_yolov5.cpp +++ b/examples/tm_yolov5.cpp @@ -31,7 +31,6 @@ #include "tengine/c_api.h" #include "tengine_operations.h" - static constexpr int kAnchorNum = 3; static constexpr int kClassNum = 80; static constexpr float kIgnoreThresh = 0.5f; @@ -71,7 +70,7 @@ void correct_yolo_boxes(std::vector& dets, int w, int h, int netw, in int i; int new_w = 0; int new_h = 0; - if ((( float )netw / w) < (( float )neth / h)) + if (((float)netw / w) < ((float)neth / h)) { new_w = netw; new_h = (h * netw) / w; @@ -85,10 +84,10 @@ void correct_yolo_boxes(std::vector& dets, int w, int h, int netw, in for (i = 0; i < dets.size(); ++i) { box b = dets[i].bbox; - b.x = (b.x - (netw - new_w) / 2.) / (( float )new_w / w); - b.y = (b.y - (neth - new_h) / 2.) / (( float )new_h / h); - b.w /= (( float )new_w / w); - b.h /= (( float )new_h / h); + b.x = (b.x - (netw - new_w) / 2.) / ((float)new_w / w); + b.y = (b.y - (neth - new_h) / 2.) / ((float)new_h / h); + b.w /= ((float)new_w / w); + b.h /= ((float)new_h / h); dets[i].bbox = b; } @@ -123,8 +122,7 @@ std::vector forward_darknet_layer_cpu(const float* input, int img_w, { for (int channel = 0; channel < 3; channel++) { - const float* pdata = input + channel * out_h * out_w * (kClassNum + 5) + - shift_y * out_w * (kClassNum + 5) + shift_x * (kClassNum + 5); + const float* pdata = input + channel * out_h * out_w * (kClassNum + 5) + shift_y * out_w * (kClassNum + 5) + shift_x * (kClassNum + 5); float box_prob = logistic_cpu(*(pdata + 4)); if (box_prob < kIgnoreThresh) continue; @@ -212,7 +210,7 @@ std::vector do_nms_sort(std::vector& dets, int classes, fl for (int k = 0; k < classes; ++k) { std::vector class_detection; - for (auto & det : dets) + for (auto& det : dets) { if (det.classes == k) { @@ -220,7 +218,7 @@ std::vector do_nms_sort(std::vector& dets, int classes, fl } } - std::sort(class_detection.begin(), class_detection.end(), [](const detection & a, const detection & b) { + std::sort(class_detection.begin(), class_detection.end(), [](const detection& a, const detection& b) { return a.prob > b.prob; }); @@ -282,28 +280,28 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 's': - net_w = std::strtoul(optarg, nullptr, 10); - net_h = net_w; - fprintf(stderr, "set net input size: %d %d\n", net_h, net_w); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 's': + net_w = std::strtoul(optarg, nullptr, 10); + net_h = net_w; + fprintf(stderr, "set net input size: %d %d\n", net_h, net_w); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -350,7 +348,7 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = net_h * net_w * 3; - int dims[] = {1, 3, net_h, net_w}; // nchw + int dims[] = {1, 3, net_h, net_w}; // nchw std::vector input_data(img_size); @@ -418,7 +416,7 @@ int main(int argc, char* argv[]) int out_dim[5]; get_tensor_shape(out_tensor, out_dim, 5); - float* out_data = ( float* )get_tensor_buffer(out_tensor); + float* out_data = (float*)get_tensor_buffer(out_tensor); int out_w = out_dim[3]; int out_h = out_dim[2]; auto node_detection = forward_darknet_layer_cpu(out_data, img.w, img.h, net_w, net_h, out_w, out_h); diff --git a/examples/tm_yolov5s.cpp b/examples/tm_yolov5s.cpp index bef5d476b..b8f277a4a 100644 --- a/examples/tm_yolov5s.cpp +++ b/examples/tm_yolov5s.cpp @@ -135,9 +135,9 @@ static void nms_sorted_bboxes(const std::vector& faceobjects, std::vecto } } - -static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects, - int letterbox_cols, int letterbox_rows){ +static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects, + int letterbox_cols, int letterbox_rows) +{ static float anchors[18] = {10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326}; int anchor_num = 3; @@ -145,11 +145,11 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh int feat_h = letterbox_rows / stride; int cls_num = 80; int anchor_group; - if(stride == 8) + if (stride == 8) anchor_group = 1; - if(stride == 16) + if (stride == 16) anchor_group = 2; - if(stride == 32) + if (stride == 32) anchor_group = 3; for (int h = 0; h <= feat_h - 1; h++) { @@ -163,7 +163,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh for (int s = 0; s <= cls_num - 1; s++) { float score = feat[a * feat_w * feat_h * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5) + s + 5]; - if(score > class_score) + if (score > class_score) { class_index = s; class_score = score; @@ -171,7 +171,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh } //process box score float box_score = feat[a * feat_w * feat_h * (cls_num + 5) + (h * feat_w) * (cls_num + 5) + w * (cls_num + 5) + 4]; - float final_score = sigmoid(box_score ) * sigmoid(class_score); + float final_score = sigmoid(box_score) * sigmoid(class_score); if (final_score >= prob_threshold) { int loc_idx = a * feat_h * feat_w * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5); @@ -207,16 +207,15 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { - "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", - "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", - "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", - "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", - "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", - "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", - "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", - "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -255,8 +254,8 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) void show_usage() { fprintf( - stderr, - "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); + stderr, + "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); } void get_input_data_focus(const char* image_file, float* input_data, int letterbox_rows, int letterbox_cols, const float* mean, const float* scale) @@ -273,9 +272,12 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb float scale_letterbox; int resize_rows; int resize_cols; - if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) { + if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) + { scale_letterbox = letterbox_rows * 1.0 / img.rows; - } else { + } + else + { scale_letterbox = letterbox_cols * 1.0 / img.cols; } resize_cols = int(scale_letterbox * img.cols); @@ -284,7 +286,7 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb cv::resize(img, img, cv::Size(resize_cols, resize_rows)); img.convertTo(img, CV_32FC3); // Generate a gray image for letterbox using opencv - cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3,cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])); + cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0.5 / scale[0] + mean[0], 0.5 / scale[1] + mean[1], 0.5 / scale[2] + mean[2])); int top = (letterbox_rows - resize_rows) / 2; int bot = (letterbox_rows - resize_rows + 1) / 2; int left = (letterbox_cols - resize_cols) / 2; @@ -293,7 +295,7 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); img_new.convertTo(img_new, CV_32FC3); - float* img_data = (float* )img_new.data; + float* img_data = (float*)img_new.data; std::vector input_temp(3 * letterbox_cols * letterbox_rows); /* nhwc to nchw */ @@ -303,7 +305,7 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb { for (int c = 0; c < 3; c++) { - int in_index = h * letterbox_cols * 3 + w * 3 + c; + int in_index = h * letterbox_cols * 3 + w * 3 + c; int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w; input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c]; } @@ -317,17 +319,12 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb { for (int c = 0; c < 3; c++) { - for (int h = 0; h < letterbox_rows/2; h++) + for (int h = 0; h < letterbox_rows / 2; h++) { - for (int w = 0; w < letterbox_cols/2; w++) + for (int w = 0; w < letterbox_cols / 2; w++) { - int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + - h * 2 * letterbox_cols + w * 2; - int out_index = i * 2 * 3 * (letterbox_cols/2) * (letterbox_rows/2) + - g * 3 * (letterbox_cols/2) * (letterbox_rows/2) + - c * (letterbox_cols/2) * (letterbox_rows/2) + - h * (letterbox_cols/2) + - w; + int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + h * 2 * letterbox_cols + w * 2; + int out_index = i * 2 * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + g * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + c * (letterbox_cols / 2) * (letterbox_rows / 2) + h * (letterbox_cols / 2) + w; input_data[out_index] = input_temp[in_index]; } @@ -337,7 +334,6 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb } } - int main(int argc, char* argv[]) { const char* model_file = nullptr; @@ -359,23 +355,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -479,7 +475,7 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* yolov5 postprocess */ @@ -490,9 +486,9 @@ int main(int argc, char* argv[]) tensor_t p16_output = get_graph_output_tensor(graph, 1, 0); tensor_t p32_output = get_graph_output_tensor(graph, 2, 0); - float* p8_data = ( float*)get_tensor_buffer(p8_output); - float* p16_data = ( float*)get_tensor_buffer(p16_output); - float* p32_data = ( float*)get_tensor_buffer(p32_output); + float* p8_data = (float*)get_tensor_buffer(p8_output); + float* p16_data = (float*)get_tensor_buffer(p16_output); + float* p32_data = (float*)get_tensor_buffer(p32_output); /* postprocess */ const float prob_threshold = 0.25f; @@ -508,7 +504,7 @@ int main(int argc, char* argv[]) proposals.insert(proposals.end(), objects32.begin(), objects32.end()); generate_proposals(16, p16_data, prob_threshold, objects16, letterbox_cols, letterbox_rows); proposals.insert(proposals.end(), objects16.begin(), objects16.end()); - generate_proposals( 8, p8_data, prob_threshold, objects8, letterbox_cols, letterbox_rows); + generate_proposals(8, p8_data, prob_threshold, objects8, letterbox_cols, letterbox_rows); proposals.insert(proposals.end(), objects8.begin(), objects8.end()); qsort_descent_inplace(proposals); @@ -520,9 +516,12 @@ int main(int argc, char* argv[]) float scale_letterbox; int resize_rows; int resize_cols; - if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) { + if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) + { scale_letterbox = letterbox_rows * 1.0 / img.rows; - } else { + } + else + { scale_letterbox = letterbox_cols * 1.0 / img.cols; } resize_cols = int(scale_letterbox * img.cols); @@ -535,7 +534,7 @@ int main(int argc, char* argv[]) float ratio_y = (float)img.cols / resize_cols; int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); for (int i = 0; i < count; i++) @@ -569,4 +568,3 @@ int main(int argc, char* argv[]) destroy_graph(graph); release_tengine(); } - diff --git a/examples/tm_yolov5s_timvx.cpp b/examples/tm_yolov5s_timvx.cpp index 0152ee175..7f5198951 100644 --- a/examples/tm_yolov5s_timvx.cpp +++ b/examples/tm_yolov5s_timvx.cpp @@ -37,7 +37,6 @@ #include "tengine/c_api.h" #include "tengine_operations.h" - struct Object { cv::Rect_ rect; @@ -135,8 +134,7 @@ static void nms_sorted_bboxes(const std::vector& faceobjects, std::vecto } } - -static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects, +static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects, int letterbox_cols, int letterbox_rows) { static float anchors[18] = {10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326}; @@ -146,11 +144,11 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh int feat_h = letterbox_rows / stride; int cls_num = 80; int anchor_group; - if(stride == 8) + if (stride == 8) anchor_group = 1; - if(stride == 16) + if (stride == 16) anchor_group = 2; - if(stride == 32) + if (stride == 32) anchor_group = 3; for (int h = 0; h <= feat_h - 1; h++) { @@ -164,7 +162,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh for (int s = 0; s <= cls_num - 1; s++) { float score = feat[a * feat_w * feat_h * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5) + s + 5]; - if(score > class_score) + if (score > class_score) { class_index = s; class_score = score; @@ -172,7 +170,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh } //process box score float box_score = feat[a * feat_w * feat_h * (cls_num + 5) + (h * feat_w) * (cls_num + 5) + w * (cls_num + 5) + 4]; - float final_score = sigmoid(box_score ) * sigmoid(class_score); + float final_score = sigmoid(box_score) * sigmoid(class_score); if (final_score >= prob_threshold) { int loc_idx = a * feat_h * feat_w * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5); @@ -208,16 +206,15 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { - "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", - "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", - "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", - "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", - "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", - "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", - "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", - "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -256,8 +253,8 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) void show_usage() { fprintf( - stderr, - "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); + stderr, + "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); } void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int letterbox_rows, int letterbox_cols, const float* mean, @@ -275,9 +272,12 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int float scale_letterbox; int resize_rows; int resize_cols; - if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) { + if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) + { scale_letterbox = letterbox_rows * 1.0 / img.rows; - } else { + } + else + { scale_letterbox = letterbox_cols * 1.0 / img.cols; } resize_cols = int(scale_letterbox * img.cols); @@ -286,7 +286,7 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int cv::resize(img, img, cv::Size(resize_cols, resize_rows)); img.convertTo(img, CV_32FC3); // Generate a gray image for letterbox using opencv - cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3,cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])); + cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0.5 / scale[0] + mean[0], 0.5 / scale[1] + mean[1], 0.5 / scale[2] + mean[2])); int top = (letterbox_rows - resize_rows) / 2; int bot = (letterbox_rows - resize_rows + 1) / 2; int left = (letterbox_cols - resize_cols) / 2; @@ -295,7 +295,7 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); img_new.convertTo(img_new, CV_32FC3); - float* img_data = (float* )img_new.data; + float* img_data = (float*)img_new.data; std::vector input_temp(3 * letterbox_cols * letterbox_rows); /* nhwc to nchw */ @@ -305,7 +305,7 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int { for (int c = 0; c < 3; c++) { - int in_index = h * letterbox_cols * 3 + w * 3 + c; + int in_index = h * letterbox_cols * 3 + w * 3 + c; int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w; input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c]; } @@ -319,20 +319,15 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int { for (int c = 0; c < 3; c++) { - for (int h = 0; h < letterbox_rows/2; h++) + for (int h = 0; h < letterbox_rows / 2; h++) { - for (int w = 0; w < letterbox_cols/2; w++) + for (int w = 0; w < letterbox_cols / 2; w++) { - int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + - h * 2 * letterbox_cols + w * 2; - int out_index = i * 2 * 3 * (letterbox_cols/2) * (letterbox_rows/2) + - g * 3 * (letterbox_cols/2) * (letterbox_rows/2) + - c * (letterbox_cols/2) * (letterbox_rows/2) + - h * (letterbox_cols/2) + - w; + int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + h * 2 * letterbox_cols + w * 2; + int out_index = i * 2 * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + g * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + c * (letterbox_cols / 2) * (letterbox_rows / 2) + h * (letterbox_cols / 2) + w; /* quant to uint8 */ - int udata = (round)(input_temp[in_index] / input_scale + ( float )zero_point); + int udata = (round)(input_temp[in_index] / input_scale + (float)zero_point); if (udata > 255) udata = 255; else if (udata < 0) @@ -366,23 +361,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -498,7 +493,7 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* yolov5 postprocess */ @@ -529,23 +524,23 @@ int main(int argc, char* argv[]) std::vector p16_data(p16_count); std::vector p32_data(p32_count); - uint8_t* p8_data_u8 = ( uint8_t* )get_tensor_buffer(p8_output); - uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output); - uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output); + uint8_t* p8_data_u8 = (uint8_t*)get_tensor_buffer(p8_output); + uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output); + uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output); for (int c = 0; c < p8_count; c++) { - p8_data[c] = (( float )p8_data_u8[c] - ( float )p8_zero_point) * p8_scale; + p8_data[c] = ((float)p8_data_u8[c] - (float)p8_zero_point) * p8_scale; } for (int c = 0; c < p16_count; c++) { - p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale; + p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale; } for (int c = 0; c < p32_count; c++) { - p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale; + p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale; } /* postprocess */ @@ -562,7 +557,7 @@ int main(int argc, char* argv[]) proposals.insert(proposals.end(), objects32.begin(), objects32.end()); generate_proposals(16, p16_data.data(), prob_threshold, objects16, letterbox_cols, letterbox_rows); proposals.insert(proposals.end(), objects16.begin(), objects16.end()); - generate_proposals( 8, p8_data.data(), prob_threshold, objects8, letterbox_cols, letterbox_rows); + generate_proposals(8, p8_data.data(), prob_threshold, objects8, letterbox_cols, letterbox_rows); proposals.insert(proposals.end(), objects8.begin(), objects8.end()); qsort_descent_inplace(proposals); @@ -574,9 +569,12 @@ int main(int argc, char* argv[]) float scale_letterbox; int resize_rows; int resize_cols; - if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) { + if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) + { scale_letterbox = letterbox_rows * 1.0 / img.rows; - } else { + } + else + { scale_letterbox = letterbox_cols * 1.0 / img.cols; } resize_cols = int(scale_letterbox * img.cols); @@ -589,7 +587,7 @@ int main(int argc, char* argv[]) float ratio_y = (float)img.cols / resize_cols; int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); for (int i = 0; i < count; i++) @@ -623,4 +621,3 @@ int main(int argc, char* argv[]) destroy_graph(graph); release_tengine(); } - diff --git a/examples/tm_yolox.cpp b/examples/tm_yolox.cpp index 9f54b5806..e9da0dd5e 100644 --- a/examples/tm_yolox.cpp +++ b/examples/tm_yolox.cpp @@ -45,7 +45,6 @@ struct Object float prob; }; - static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; @@ -134,16 +133,15 @@ static void nms_sorted_bboxes(const std::vector& faceobjects, std::vecto static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { - "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", - "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", - "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", - "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", - "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", - "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", - "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", - "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -175,7 +173,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } - + cv::imwrite("yolox_out.jpg", image); } struct GridAndStride @@ -211,10 +209,10 @@ static void generate_yolox_proposals(std::vector grid_strides, fl const int num_grid = 3549; const int num_class = 80; const int num_anchors = grid_strides.size(); - + for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++) { - // printf("%d,%d\n",num_anchors,anchor_idx); + // printf("%d,%d\n",num_anchors,anchor_idx); const int grid0 = grid_strides[anchor_idx].grid0; const int grid1 = grid_strides[anchor_idx].grid1; const int stride = grid_strides[anchor_idx].stride; @@ -228,7 +226,7 @@ static void generate_yolox_proposals(std::vector grid_strides, fl float h = exp(feat_ptr[3]) * stride; float x0 = x_center - w * 0.5f; float y0 = y_center - h * 0.5f; - + float box_objectness = feat_ptr[4]; for (int class_idx = 0; class_idx < num_class; class_idx++) @@ -252,13 +250,13 @@ static void generate_yolox_proposals(std::vector grid_strides, fl feat_ptr += 85; } // point anchor loop -} +} void show_usage() { fprintf( - stderr, - "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); + stderr, + "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); } void get_input_data_focus(const char* image_file, float* input_data, int letterbox_rows, int letterbox_cols, const float* mean, const float* scale) @@ -275,9 +273,12 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb float scale_letterbox; int resize_rows; int resize_cols; - if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) { + if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) + { scale_letterbox = letterbox_rows * 1.0 / img.rows; - } else { + } + else + { scale_letterbox = letterbox_cols * 1.0 / img.cols; } resize_cols = int(scale_letterbox * img.cols); @@ -287,7 +288,7 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb img.convertTo(img, CV_32FC3); // Generate a gray image for letterbox using opencv - cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0, 0, 0)/*cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])*/); + cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0, 0, 0) /*cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])*/); int top = 0; int bot = letterbox_rows - resize_rows; int left = 0; @@ -296,7 +297,7 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(114.f, 114.f, 114.f)); img_new.convertTo(img_new, CV_32FC3); - float* img_data = (float* )img_new.data; + float* img_data = (float*)img_new.data; std::vector input_temp(3 * letterbox_cols * letterbox_rows); /* nhwc to nchw */ @@ -306,7 +307,7 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb { for (int c = 0; c < 3; c++) { - int in_index = h * letterbox_cols * 3 + w * 3 + c; + int in_index = h * letterbox_cols * 3 + w * 3 + c; int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w; input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c]; } @@ -320,17 +321,12 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb { for (int c = 0; c < 3; c++) { - for (int h = 0; h < letterbox_rows/2; h++) + for (int h = 0; h < letterbox_rows / 2; h++) { - for (int w = 0; w < letterbox_cols/2; w++) + for (int w = 0; w < letterbox_cols / 2; w++) { - int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + - h * 2 * letterbox_cols + w * 2; - int out_index = i * 2 * 3 * (letterbox_cols/2) * (letterbox_rows/2) + - g * 3 * (letterbox_cols/2) * (letterbox_rows/2) + - c * (letterbox_cols/2) * (letterbox_rows/2) + - h * (letterbox_cols/2) + - w; + int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + h * 2 * letterbox_cols + w * 2; + int out_index = i * 2 * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + g * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + c * (letterbox_cols / 2) * (letterbox_rows / 2) + h * (letterbox_cols / 2) + w; input_data[out_index] = input_temp[in_index]; } @@ -340,7 +336,6 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb } } - int main(int argc, char* argv[]) { const char* model_file = nullptr; @@ -362,23 +357,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -435,7 +430,7 @@ int main(int argc, char* argv[]) std::vector input_data(img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); - + if (input_tensor == nullptr) { fprintf(stderr, "Get input tensor failed\n"); @@ -483,16 +478,16 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* yolox postprocess */ tensor_t p8_output = get_graph_output_tensor(graph, 0, 0); - float* p8_data = ( float*)get_tensor_buffer(p8_output); + float* p8_data = (float*)get_tensor_buffer(p8_output); /* postprocess */ - const float prob_threshold = 0.3f; - const float nms_threshold = 0.65f; + const float prob_threshold = 0.3f; + const float nms_threshold = 0.65f; std::vector proposals; std::vector objects; @@ -509,14 +504,17 @@ int main(int argc, char* argv[]) float scale_letterbox; int resize_rows; int resize_cols; - if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) { + if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) + { scale_letterbox = letterbox_rows * 1.0 / img.rows; - } else { + } + else + { scale_letterbox = letterbox_cols * 1.0 / img.cols; } int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); for (int i = 0; i < count; i++) diff --git a/examples/tm_yolox_timvx.cpp b/examples/tm_yolox_timvx.cpp index da778080d..2aced6f2e 100644 --- a/examples/tm_yolox_timvx.cpp +++ b/examples/tm_yolox_timvx.cpp @@ -1,578 +1,575 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2021, OPEN AI LAB - * Author: xwwang@openailab.com - * Author: 774074168@qq.com - * Author: honghao@openailab.com - * original model: https://github.com/Megvii-BaseDetection/YOLOX - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.h" -#include "tengine/c_api.h" -#include "tengine_operations.h" - -struct Object -{ - cv::Rect_ rect; - int label; - float prob; -}; - - -static inline float intersection_area(const Object& a, const Object& b) -{ - cv::Rect_ inter = a.rect & b.rect; - return inter.area(); -} - -static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) -{ - int i = left; - int j = right; - float p = faceobjects[(left + right) / 2].prob; - - while (i <= j) - { - while (faceobjects[i].prob > p) - i++; - - while (faceobjects[j].prob < p) - j--; - - if (i <= j) - { - // swap - std::swap(faceobjects[i], faceobjects[j]); - - i++; - j--; - } - } - -#pragma omp parallel sections - { -#pragma omp section - { - if (left < j) qsort_descent_inplace(faceobjects, left, j); - } -#pragma omp section - { - if (i < right) qsort_descent_inplace(faceobjects, i, right); - } - } -} - -static void qsort_descent_inplace(std::vector& faceobjects) -{ - if (faceobjects.empty()) - return; - - qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1); -} - -static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold) -{ - picked.clear(); - - const int n = faceobjects.size(); - - std::vector areas(n); - for (int i = 0; i < n; i++) - { - areas[i] = faceobjects[i].rect.area(); - } - - for (int i = 0; i < n; i++) - { - const Object& a = faceobjects[i]; - - int keep = 1; - for (int j = 0; j < (int)picked.size(); j++) - { - const Object& b = faceobjects[picked[j]]; - - // intersection over union - float inter_area = intersection_area(a, b); - float union_area = areas[i] + areas[picked[j]] - inter_area; - // float IoU = inter_area / union_area - if (inter_area / union_area > nms_threshold) - keep = 0; - } - - if (keep) - picked.push_back(i); - } -} - - -static void draw_objects(const cv::Mat& bgr, const std::vector& objects) -{ - static const char* class_names[] = { - "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", - "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", - "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", - "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", - "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", - "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", - "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", - "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; - - cv::Mat image = bgr.clone(); - - for (size_t i = 0; i < objects.size(); i++) - { - const Object& obj = objects[i]; - - fprintf(stderr, "%2d: %3.0f%%, [%4.0f, %4.0f, %4.0f, %4.0f], %s\n", obj.label, obj.prob * 100, obj.rect.x, - obj.rect.y, obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, class_names[obj.label]); - - cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); - - char text[256]; - sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); - - int baseLine = 0; - cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); - - int x = obj.rect.x; - int y = obj.rect.y - label_size.height - baseLine; - if (y < 0) - y = 0; - if (x + label_size.width > image.cols) - x = image.cols - label_size.width; - - cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), - cv::Scalar(255, 255, 255), -1); - - cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, - cv::Scalar(0, 0, 0)); - } - - cv::imwrite("yolox_timvx_out.jpg", image); -} - -struct GridAndStride -{ - int grid0; - int grid1; - int stride; -}; - -static int generate_grids_and_stride(const int target_size, std::vector& strides, std::vector& grid_strides) -{ - for (auto stride : strides) - { - int num_grid = target_size / stride; - for (int g1 = 0; g1 < num_grid; g1++) - { - for (int g0 = 0; g0 < num_grid; g0++) - { - GridAndStride ss; - ss.grid0 = g0; - ss.grid1 = g1; - ss.stride = stride; - grid_strides.push_back(ss); - } - } - } - - return 0; -} - -static void generate_yolox_proposals(std::vector grid_strides, float* feat_ptr, float prob_threshold, std::vector& objects) -{ - const int num_grid = 3549; - const int num_class = 80; - const int num_anchors = grid_strides.size(); - - //const float* feat_ptr = feat_blob; - for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++) - { - // printf("%d,%d\n",num_anchors,anchor_idx); - const int grid0 = grid_strides[anchor_idx].grid0; - const int grid1 = grid_strides[anchor_idx].grid1; - const int stride = grid_strides[anchor_idx].stride; - - // yolox/models/yolo_head.py decode logic - // outputs[..., :2] = (outputs[..., :2] + grids) * strides - // outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides - float x_center = (feat_ptr[0] + grid0) * stride; - float y_center = (feat_ptr[1] + grid1) * stride; - float w = exp(feat_ptr[2]) * stride; - float h = exp(feat_ptr[3]) * stride; - float x0 = x_center - w * 0.5f; - float y0 = y_center - h * 0.5f; - - float box_objectness = feat_ptr[4]; - - for (int class_idx = 0; class_idx < num_class; class_idx++) - { - float box_cls_score = feat_ptr[5 + class_idx]; - float box_prob = box_objectness * box_cls_score; - if (box_prob > prob_threshold) - { - Object obj; - obj.rect.x = x0; - obj.rect.y = y0; - obj.rect.width = w; - obj.rect.height = h; - obj.label = class_idx; - obj.prob = box_prob; - - objects.push_back(obj); - } - - } // class loop - feat_ptr += 85; - - } // point anchor loop -} - -void show_usage() -{ - fprintf( - stderr, - "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); -} - -void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int letterbox_rows, int letterbox_cols, const float* mean, - const float* scale, float input_scale, int zero_point) -{ - cv::Mat sample = cv::imread(image_file, 1); - cv::Mat img; - - if (sample.channels() == 1) - cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB); - else - cv::cvtColor(sample, img, cv::COLOR_BGR2RGB); - - /* letterbox process to support different letterbox size */ - float scale_letterbox; - int resize_rows; - int resize_cols; - if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) { - scale_letterbox = letterbox_rows * 1.0 / img.rows; - } else { - scale_letterbox = letterbox_cols * 1.0 / img.cols; - } - resize_cols = int(scale_letterbox * img.cols); - resize_rows = int(scale_letterbox * img.rows); - - cv::resize(img, img, cv::Size(resize_cols, resize_rows)); - - img.convertTo(img, CV_32FC3); - // Generate a gray image for letterbox using opencv - cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0, 0, 0)/*cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])*/); - int top = 0; - int bot = letterbox_rows - resize_rows; - int left = 0; - int right = letterbox_cols - resize_cols; - // Letterbox filling - cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(114.f, 114.f, 114.f)); - - img_new.convertTo(img_new, CV_32FC3); - float* img_data = (float* )img_new.data; - std::vector input_temp(3 * letterbox_cols * letterbox_rows); - - /* nhwc to nchw */ - for (int h = 0; h < letterbox_rows; h++) - { - for (int w = 0; w < letterbox_cols; w++) - { - for (int c = 0; c < 3; c++) - { - int in_index = h * letterbox_cols * 3 + w * 3 + c; - int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w; - input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c]; - } - } - } - - /* focus process */ - for (int i = 0; i < 2; i++) // corresponding to rows - { - for (int g = 0; g < 2; g++) // corresponding to cols - { - for (int c = 0; c < 3; c++) - { - for (int h = 0; h < letterbox_rows/2; h++) - { - for (int w = 0; w < letterbox_cols/2; w++) - { - int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + - h * 2 * letterbox_cols + w * 2; - int out_index = i * 2 * 3 * (letterbox_cols/2) * (letterbox_rows/2) + - g * 3 * (letterbox_cols/2) * (letterbox_rows/2) + - c * (letterbox_cols/2) * (letterbox_rows/2) + - h * (letterbox_cols/2) + - w; - - /* quant to uint8 */ - int udata = (round)(input_temp[in_index] / input_scale + ( float )zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - input_data[out_index] = udata; - } - } - } - } - } -} - - -int main(int argc, char* argv[]) -{ - const char* model_file = nullptr; - const char* image_file = nullptr; - - int img_c = 3; - const float mean[3] = {255.f * 0.485f, 255.f * 0.456, 255.f * 0.406f}; - const float scale[3] = {1 / (255.f * 0.229f), 1 / (255.f * 0.224f), 1 / (255.f * 0.225f)}; - - // allow none square letterbox, set default letterbox size - int letterbox_rows = 640; - int letterbox_cols = 640; - - int repeat_count = 1; - int num_thread = 1; - - int res; - while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1) - { - switch (res) - { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; - } - } - - /* check files */ - if (nullptr == model_file) - { - fprintf(stderr, "Error: Tengine model file not specified!\n"); - show_usage(); - return -1; - } - - if (nullptr == image_file) - { - fprintf(stderr, "Error: Image file not specified!\n"); - show_usage(); - return -1; - } - - if (!check_file_exist(model_file) || !check_file_exist(image_file)) - return -1; - - cv::Mat img = cv::imread(image_file, 1); - if (img.empty()) - { - fprintf(stderr, "cv::imread %s failed\n", image_file); - return -1; - } - - /* set runtime options */ - struct options opt; - opt.num_thread = num_thread; - opt.cluster = TENGINE_CLUSTER_ALL; - opt.precision = TENGINE_MODE_UINT8; - opt.affinity = 0; - - /* inital tengine */ - if (init_tengine() != 0) - { - fprintf(stderr, "Initial tengine failed.\n"); - return -1; - } - fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version()); - - /* create VeriSilicon TIM-VX backend */ - context_t timvx_context = create_context("timvx", 1); - int rtt = set_context_device(timvx_context, "TIMVX", NULL, 0); - if (0 > rtt) - { - fprintf(stderr, " add_context_device VSI DEVICE failed.\n"); - return -1; - } - - /* create graph, load tengine model xxx.tmfile */ - graph_t graph = create_graph(timvx_context, "tengine", model_file); - if (graph == nullptr) - { - fprintf(stderr, "Create graph failed.\n"); - return -1; - } - - int img_size = letterbox_rows * letterbox_cols * img_c; - int dims[] = {1, 12, int(letterbox_rows / 2), int(letterbox_cols / 2)}; - std::vector input_data(img_size); - - tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); - - if (input_tensor == nullptr) - { - fprintf(stderr, "Get input tensor failed\n"); - return -1; - } - - if (set_tensor_shape(input_tensor, dims, 4) < 0) - { - fprintf(stderr, "Set input tensor shape failed\n"); - return -1; - } - - if (set_tensor_buffer(input_tensor, input_data.data(), img_size) < 0) - { - fprintf(stderr, "Set input tensor buffer failed\n"); - return -1; - } - - /* prerun graph, set work options(num_thread, cluster, precision) */ - if (prerun_graph_multithread(graph, opt) < 0) - { - fprintf(stderr, "Prerun multithread graph failed.\n"); - return -1; - } - - /* prepare process input data, set the data mem to input tensor */ - float input_scale = 0.f; - int input_zero_point = 0; - get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - get_input_data_focus_uint8(image_file, input_data.data(), letterbox_rows, letterbox_cols, mean, scale, input_scale, input_zero_point); - - /* run graph */ - double min_time = DBL_MAX; - double max_time = DBL_MIN; - double total_time = 0.; - for (int i = 0; i < repeat_count; i++) - { - double start = get_current_time(); - if (run_graph(graph, 1) < 0) - { - fprintf(stderr, "Run graph failed\n"); - return -1; - } - double end = get_current_time(); - double cur = end - start; - total_time += cur; - min_time = std::min(min_time, cur); - max_time = std::max(max_time, cur); - } - fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); - fprintf(stderr, "--------------------------------------\n"); - - /* yolox postprocess */ - tensor_t p8_output = get_graph_output_tensor(graph, 0, 0); - uint8_t * output_u8 = ( uint8_t*)get_tensor_buffer(p8_output); - int output_size = get_tensor_buffer_size(p8_output); - - /* dequant */ - float output_scale = 0.f; - int output_zero_point = 0; - get_tensor_quant_param(p8_output, &output_scale, &output_zero_point, 1); - std::vector p8_data(output_size); - for (int i = 0; i < output_size; i++) - p8_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - /* postprocess */ - const float prob_threshold = 0.3f; - const float nms_threshold = 0.65f; - - std::vector proposals; - std::vector objects; - - std::vector strides = {8, 16, 32}; // might have stride=64 - std::vector grid_strides; - generate_grids_and_stride(letterbox_rows, strides, grid_strides); - generate_yolox_proposals(grid_strides, p8_data.data(), prob_threshold, proposals); - qsort_descent_inplace(proposals); - std::vector picked; - nms_sorted_bboxes(proposals, picked, nms_threshold); - - /* yolox draw the result */ - float scale_letterbox; - int resize_rows; - int resize_cols; - if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) { - scale_letterbox = letterbox_rows * 1.0 / img.rows; - } else { - scale_letterbox = letterbox_cols * 1.0 / img.cols; - } - - int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); - - objects.resize(count); - for (int i = 0; i < count; i++) - { - objects[i] = proposals[picked[i]]; - float x0 = (objects[i].rect.x) / scale_letterbox; - float y0 = (objects[i].rect.y) / scale_letterbox; - float x1 = (objects[i].rect.x + objects[i].rect.width) / scale_letterbox; - float y1 = (objects[i].rect.y + objects[i].rect.height) / scale_letterbox; - x0 = std::max(std::min(x0, (float)(img.cols - 1)), 0.f); - y0 = std::max(std::min(y0, (float)(img.rows - 1)), 0.f); - x1 = std::max(std::min(x1, (float)(img.cols - 1)), 0.f); - y1 = std::max(std::min(y1, (float)(img.rows - 1)), 0.f); - - objects[i].rect.x = x0; - objects[i].rect.y = y0; - objects[i].rect.width = x1 - x0; - objects[i].rect.height = y1 - y0; - } - - draw_objects(img, objects); - - /* release tengine */ - postrun_graph(graph); - destroy_graph(graph); - release_tengine(); -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: xwwang@openailab.com + * Author: 774074168@qq.com + * Author: honghao@openailab.com + * original model: https://github.com/Megvii-BaseDetection/YOLOX + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "tengine/c_api.h" +#include "tengine_operations.h" + +struct Object +{ + cv::Rect_ rect; + int label; + float prob; +}; + +static inline float intersection_area(const Object& a, const Object& b) +{ + cv::Rect_ inter = a.rect & b.rect; + return inter.area(); +} + +static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) +{ + int i = left; + int j = right; + float p = faceobjects[(left + right) / 2].prob; + + while (i <= j) + { + while (faceobjects[i].prob > p) + i++; + + while (faceobjects[j].prob < p) + j--; + + if (i <= j) + { + // swap + std::swap(faceobjects[i], faceobjects[j]); + + i++; + j--; + } + } + +#pragma omp parallel sections + { +#pragma omp section + { + if (left < j) qsort_descent_inplace(faceobjects, left, j); + } +#pragma omp section + { + if (i < right) qsort_descent_inplace(faceobjects, i, right); + } + } +} + +static void qsort_descent_inplace(std::vector& faceobjects) +{ + if (faceobjects.empty()) + return; + + qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1); +} + +static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold) +{ + picked.clear(); + + const int n = faceobjects.size(); + + std::vector areas(n); + for (int i = 0; i < n; i++) + { + areas[i] = faceobjects[i].rect.area(); + } + + for (int i = 0; i < n; i++) + { + const Object& a = faceobjects[i]; + + int keep = 1; + for (int j = 0; j < (int)picked.size(); j++) + { + const Object& b = faceobjects[picked[j]]; + + // intersection over union + float inter_area = intersection_area(a, b); + float union_area = areas[i] + areas[picked[j]] - inter_area; + // float IoU = inter_area / union_area + if (inter_area / union_area > nms_threshold) + keep = 0; + } + + if (keep) + picked.push_back(i); + } +} + +static void draw_objects(const cv::Mat& bgr, const std::vector& objects) +{ + static const char* class_names[] = { + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; + + cv::Mat image = bgr.clone(); + + for (size_t i = 0; i < objects.size(); i++) + { + const Object& obj = objects[i]; + + fprintf(stderr, "%2d: %3.0f%%, [%4.0f, %4.0f, %4.0f, %4.0f], %s\n", obj.label, obj.prob * 100, obj.rect.x, + obj.rect.y, obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, class_names[obj.label]); + + cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); + + char text[256]; + sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); + + int baseLine = 0; + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + + int x = obj.rect.x; + int y = obj.rect.y - label_size.height - baseLine; + if (y < 0) + y = 0; + if (x + label_size.width > image.cols) + x = image.cols - label_size.width; + + cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), + cv::Scalar(255, 255, 255), -1); + + cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, + cv::Scalar(0, 0, 0)); + } + + cv::imwrite("yolox_timvx_out.jpg", image); +} + +struct GridAndStride +{ + int grid0; + int grid1; + int stride; +}; + +static int generate_grids_and_stride(const int target_size, std::vector& strides, std::vector& grid_strides) +{ + for (auto stride : strides) + { + int num_grid = target_size / stride; + for (int g1 = 0; g1 < num_grid; g1++) + { + for (int g0 = 0; g0 < num_grid; g0++) + { + GridAndStride ss; + ss.grid0 = g0; + ss.grid1 = g1; + ss.stride = stride; + grid_strides.push_back(ss); + } + } + } + + return 0; +} + +static void generate_yolox_proposals(std::vector grid_strides, float* feat_ptr, float prob_threshold, std::vector& objects) +{ + const int num_grid = 3549; + const int num_class = 80; + const int num_anchors = grid_strides.size(); + + //const float* feat_ptr = feat_blob; + for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++) + { + // printf("%d,%d\n",num_anchors,anchor_idx); + const int grid0 = grid_strides[anchor_idx].grid0; + const int grid1 = grid_strides[anchor_idx].grid1; + const int stride = grid_strides[anchor_idx].stride; + + // yolox/models/yolo_head.py decode logic + // outputs[..., :2] = (outputs[..., :2] + grids) * strides + // outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides + float x_center = (feat_ptr[0] + grid0) * stride; + float y_center = (feat_ptr[1] + grid1) * stride; + float w = exp(feat_ptr[2]) * stride; + float h = exp(feat_ptr[3]) * stride; + float x0 = x_center - w * 0.5f; + float y0 = y_center - h * 0.5f; + + float box_objectness = feat_ptr[4]; + + for (int class_idx = 0; class_idx < num_class; class_idx++) + { + float box_cls_score = feat_ptr[5 + class_idx]; + float box_prob = box_objectness * box_cls_score; + if (box_prob > prob_threshold) + { + Object obj; + obj.rect.x = x0; + obj.rect.y = y0; + obj.rect.width = w; + obj.rect.height = h; + obj.label = class_idx; + obj.prob = box_prob; + + objects.push_back(obj); + } + + } // class loop + feat_ptr += 85; + + } // point anchor loop +} + +void show_usage() +{ + fprintf( + stderr, + "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); +} + +void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int letterbox_rows, int letterbox_cols, const float* mean, + const float* scale, float input_scale, int zero_point) +{ + cv::Mat sample = cv::imread(image_file, 1); + cv::Mat img; + + if (sample.channels() == 1) + cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB); + else + cv::cvtColor(sample, img, cv::COLOR_BGR2RGB); + + /* letterbox process to support different letterbox size */ + float scale_letterbox; + int resize_rows; + int resize_cols; + if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) + { + scale_letterbox = letterbox_rows * 1.0 / img.rows; + } + else + { + scale_letterbox = letterbox_cols * 1.0 / img.cols; + } + resize_cols = int(scale_letterbox * img.cols); + resize_rows = int(scale_letterbox * img.rows); + + cv::resize(img, img, cv::Size(resize_cols, resize_rows)); + + img.convertTo(img, CV_32FC3); + // Generate a gray image for letterbox using opencv + cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0, 0, 0) /*cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])*/); + int top = 0; + int bot = letterbox_rows - resize_rows; + int left = 0; + int right = letterbox_cols - resize_cols; + // Letterbox filling + cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(114.f, 114.f, 114.f)); + + img_new.convertTo(img_new, CV_32FC3); + float* img_data = (float*)img_new.data; + std::vector input_temp(3 * letterbox_cols * letterbox_rows); + + /* nhwc to nchw */ + for (int h = 0; h < letterbox_rows; h++) + { + for (int w = 0; w < letterbox_cols; w++) + { + for (int c = 0; c < 3; c++) + { + int in_index = h * letterbox_cols * 3 + w * 3 + c; + int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w; + input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c]; + } + } + } + + /* focus process */ + for (int i = 0; i < 2; i++) // corresponding to rows + { + for (int g = 0; g < 2; g++) // corresponding to cols + { + for (int c = 0; c < 3; c++) + { + for (int h = 0; h < letterbox_rows / 2; h++) + { + for (int w = 0; w < letterbox_cols / 2; w++) + { + int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + h * 2 * letterbox_cols + w * 2; + int out_index = i * 2 * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + g * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + c * (letterbox_cols / 2) * (letterbox_rows / 2) + h * (letterbox_cols / 2) + w; + + /* quant to uint8 */ + int udata = (round)(input_temp[in_index] / input_scale + (float)zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + input_data[out_index] = udata; + } + } + } + } + } +} + +int main(int argc, char* argv[]) +{ + const char* model_file = nullptr; + const char* image_file = nullptr; + + int img_c = 3; + const float mean[3] = {255.f * 0.485f, 255.f * 0.456, 255.f * 0.406f}; + const float scale[3] = {1 / (255.f * 0.229f), 1 / (255.f * 0.224f), 1 / (255.f * 0.225f)}; + + // allow none square letterbox, set default letterbox size + int letterbox_rows = 640; + int letterbox_cols = 640; + + int repeat_count = 1; + int num_thread = 1; + + int res; + while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1) + { + switch (res) + { + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; + } + } + + /* check files */ + if (nullptr == model_file) + { + fprintf(stderr, "Error: Tengine model file not specified!\n"); + show_usage(); + return -1; + } + + if (nullptr == image_file) + { + fprintf(stderr, "Error: Image file not specified!\n"); + show_usage(); + return -1; + } + + if (!check_file_exist(model_file) || !check_file_exist(image_file)) + return -1; + + cv::Mat img = cv::imread(image_file, 1); + if (img.empty()) + { + fprintf(stderr, "cv::imread %s failed\n", image_file); + return -1; + } + + /* set runtime options */ + struct options opt; + opt.num_thread = num_thread; + opt.cluster = TENGINE_CLUSTER_ALL; + opt.precision = TENGINE_MODE_UINT8; + opt.affinity = 0; + + /* inital tengine */ + if (init_tengine() != 0) + { + fprintf(stderr, "Initial tengine failed.\n"); + return -1; + } + fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version()); + + /* create VeriSilicon TIM-VX backend */ + context_t timvx_context = create_context("timvx", 1); + int rtt = set_context_device(timvx_context, "TIMVX", NULL, 0); + if (0 > rtt) + { + fprintf(stderr, " add_context_device VSI DEVICE failed.\n"); + return -1; + } + + /* create graph, load tengine model xxx.tmfile */ + graph_t graph = create_graph(timvx_context, "tengine", model_file); + if (graph == nullptr) + { + fprintf(stderr, "Create graph failed.\n"); + return -1; + } + + int img_size = letterbox_rows * letterbox_cols * img_c; + int dims[] = {1, 12, int(letterbox_rows / 2), int(letterbox_cols / 2)}; + std::vector input_data(img_size); + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + + if (input_tensor == nullptr) + { + fprintf(stderr, "Get input tensor failed\n"); + return -1; + } + + if (set_tensor_shape(input_tensor, dims, 4) < 0) + { + fprintf(stderr, "Set input tensor shape failed\n"); + return -1; + } + + if (set_tensor_buffer(input_tensor, input_data.data(), img_size) < 0) + { + fprintf(stderr, "Set input tensor buffer failed\n"); + return -1; + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (prerun_graph_multithread(graph, opt) < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + + /* prepare process input data, set the data mem to input tensor */ + float input_scale = 0.f; + int input_zero_point = 0; + get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + get_input_data_focus_uint8(image_file, input_data.data(), letterbox_rows, letterbox_cols, mean, scale, input_scale, input_zero_point); + + /* run graph */ + double min_time = DBL_MAX; + double max_time = DBL_MIN; + double total_time = 0.; + for (int i = 0; i < repeat_count; i++) + { + double start = get_current_time(); + if (run_graph(graph, 1) < 0) + { + fprintf(stderr, "Run graph failed\n"); + return -1; + } + double end = get_current_time(); + double cur = end - start; + total_time += cur; + min_time = std::min(min_time, cur); + max_time = std::max(max_time, cur); + } + fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, + total_time / repeat_count, max_time, min_time); + fprintf(stderr, "--------------------------------------\n"); + + /* yolox postprocess */ + tensor_t p8_output = get_graph_output_tensor(graph, 0, 0); + uint8_t* output_u8 = (uint8_t*)get_tensor_buffer(p8_output); + int output_size = get_tensor_buffer_size(p8_output); + + /* dequant */ + float output_scale = 0.f; + int output_zero_point = 0; + get_tensor_quant_param(p8_output, &output_scale, &output_zero_point, 1); + std::vector p8_data(output_size); + for (int i = 0; i < output_size; i++) + p8_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + /* postprocess */ + const float prob_threshold = 0.3f; + const float nms_threshold = 0.65f; + + std::vector proposals; + std::vector objects; + + std::vector strides = {8, 16, 32}; // might have stride=64 + std::vector grid_strides; + generate_grids_and_stride(letterbox_rows, strides, grid_strides); + generate_yolox_proposals(grid_strides, p8_data.data(), prob_threshold, proposals); + qsort_descent_inplace(proposals); + std::vector picked; + nms_sorted_bboxes(proposals, picked, nms_threshold); + + /* yolox draw the result */ + float scale_letterbox; + int resize_rows; + int resize_cols; + if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) + { + scale_letterbox = letterbox_rows * 1.0 / img.rows; + } + else + { + scale_letterbox = letterbox_cols * 1.0 / img.cols; + } + + int count = picked.size(); + fprintf(stderr, "detection num: %d\n", count); + + objects.resize(count); + for (int i = 0; i < count; i++) + { + objects[i] = proposals[picked[i]]; + float x0 = (objects[i].rect.x) / scale_letterbox; + float y0 = (objects[i].rect.y) / scale_letterbox; + float x1 = (objects[i].rect.x + objects[i].rect.width) / scale_letterbox; + float y1 = (objects[i].rect.y + objects[i].rect.height) / scale_letterbox; + x0 = std::max(std::min(x0, (float)(img.cols - 1)), 0.f); + y0 = std::max(std::min(y0, (float)(img.rows - 1)), 0.f); + x1 = std::max(std::min(x1, (float)(img.cols - 1)), 0.f); + y1 = std::max(std::min(y1, (float)(img.rows - 1)), 0.f); + + objects[i].rect.x = x0; + objects[i].rect.y = y0; + objects[i].rect.width = x1 - x0; + objects[i].rect.height = y1 - y0; + } + + draw_objects(img, objects); + + /* release tengine */ + postrun_graph(graph); + destroy_graph(graph); + release_tengine(); +} diff --git a/source/api/c_api.c b/source/api/c_api.c index e79d42f99..4bff60859 100644 --- a/source/api/c_api.c +++ b/source/api/c_api.c @@ -54,7 +54,7 @@ #include #define STR_VERSION2(a) #a -#define STR_VERSION(a) STR_VERSION2(a) +#define STR_VERSION(a) STR_VERSION2(a) #ifdef TENGINE_LITE_VERSION static const char* tengine_lite_version = STR_VERSION(TENGINE_LITE_VERSION); @@ -70,13 +70,10 @@ static const char* ver_postfix = "dev"; static char* hcl_version = NULL; - static int init_flag = 0; - //////////////////////////////////////////////////// context about //////////////////////////////////////////////////// - context_t create_context(const char* context_name, int empty_context) { struct context* context = (struct context*)sys_malloc(sizeof(struct context)); @@ -92,7 +89,6 @@ context_t create_context(const char* context_name, int empty_context) return context; } - void destroy_context(context_t context) { struct context* ctx = (struct context*)context; @@ -120,13 +116,11 @@ void destroy_context(context_t context) sys_free(ctx); } - struct context* get_ir_graph_context(struct graph* ir_graph) { return ir_graph->attribute->context; } - int get_context_device_number(context_t context) { struct context* ctx = (struct context*)context; @@ -138,7 +132,6 @@ int get_context_device_number(context_t context) return 0; } - struct device* get_context_device(context_t context, int index) { struct context* ctx = (struct context*)context; @@ -156,7 +149,6 @@ struct device* get_context_device(context_t context, int index) return NULL; } - int add_context_device(context_t context, const char* dev_name) { struct context* ctx = (struct context*)context; @@ -184,7 +176,6 @@ int add_context_device(context_t context, const char* dev_name) return 0; } - int set_context_device(context_t context, const char* dev_name, const void* dev_option, size_t dev_opt_size) { struct context* ctx = (struct context*)context; @@ -218,7 +209,6 @@ int set_context_device(context_t context, const char* dev_name, const void* dev_ return 0; } - int remove_context_device(context_t context, const char* dev_name) { struct context* ctx = (struct context*)context; @@ -250,22 +240,18 @@ int remove_context_device(context_t context, const char* dev_name) return -1; } - int set_context_attr(context_t context, const char* attr_name, const void* val, int val_size) { return -1; } - int get_context_attr(context_t context, const char* attr_name, void* val, int val_size) { return -1; } - //////////////////////////////////////////////////// engine about //////////////////////////////////////////////////// - const char* get_tengine_version(void) { static char buf[128]; @@ -277,13 +263,11 @@ const char* get_tengine_version(void) return buf; } - int request_tengine_version(const char* version) { return 1; } - int init_tengine(void) { if (0 != init_flag) @@ -319,7 +303,6 @@ int init_tengine(void) return ret; } - void release_tengine(void) { if (0 == init_flag) @@ -360,17 +343,14 @@ void release_tengine(void) init_flag = 0; } - //////////////////////////////////////////////////// graph about //////////////////////////////////////////////////// - graph_t create_graph_error(ir_graph_t* graph) { destroy_graph(graph); return NULL; } - graph_t create_graph(context_t context, const char* model_format, const char* file_name, ...) { int is_new_context = 0; @@ -447,19 +427,17 @@ graph_t create_graph(context_t context, const char* model_format, const char* fi return ir_graph; } - int prerun_graph(graph_t graph) { struct options option; - option.num_thread = 1; - option.precision = -1; - option.affinity = -1; - option.cluster = TENGINE_CLUSTER_BIG; + option.num_thread = 1; + option.precision = -1; + option.affinity = -1; + option.cluster = TENGINE_CLUSTER_BIG; return prerun_graph_multithread(graph, option); } - int prerun_graph_multithread(graph_t graph, struct options option) { struct graph* ir_graph = (struct graph*)graph; @@ -519,9 +497,7 @@ int prerun_graph_multithread(graph_t graph, struct options option) } int precision = TENGINE_MODE_FP32; - if (0 <= option.precision && (TENGINE_MODE_FP32 == option.precision || TENGINE_MODE_FP16 == option.precision - || TENGINE_MODE_HYBRID_INT8== option.precision || TENGINE_MODE_UINT8 == option.precision - || TENGINE_MODE_INT8== option.precision)) + if (0 <= option.precision && (TENGINE_MODE_FP32 == option.precision || TENGINE_MODE_FP16 == option.precision || TENGINE_MODE_HYBRID_INT8 == option.precision || TENGINE_MODE_UINT8 == option.precision || TENGINE_MODE_INT8 == option.precision)) { precision = option.precision; } @@ -529,11 +505,11 @@ int prerun_graph_multithread(graph_t graph, struct options option) ctx->default_options = sys_malloc(sizeof(struct cpu_option)); struct cpu_option* opt = (struct cpu_option*)ctx->default_options; - opt->dev_name = CPU_DEVICE_NAME; - opt->num_thread = count; - opt->cluster = TENGINE_CLUSTER_BIG; - opt->precision = precision; - opt->affinity = option.affinity; + opt->dev_name = CPU_DEVICE_NAME; + opt->num_thread = count; + opt->cluster = TENGINE_CLUSTER_BIG; + opt->precision = precision; + opt->affinity = option.affinity; struct scheduler* scheduler = ctx->scheduler; ret = scheduler->prerun(scheduler, ir_graph); @@ -566,7 +542,6 @@ int prerun_graph_multithread(graph_t graph, struct options option) return 0; } - int run_graph(graph_t graph, int block) { struct graph* ir_graph = (struct graph*)graph; @@ -589,7 +564,6 @@ int run_graph(graph_t graph, int block) return 0; } - int wait_graph(graph_t graph, int try_wait) { struct graph* ir_graph = (struct graph*)graph; @@ -614,7 +588,6 @@ int wait_graph(graph_t graph, int try_wait) return scheduler->wait(scheduler, ir_graph); } - int postrun_graph(graph_t graph) { struct graph* ir_graph = (struct graph*)graph; @@ -637,7 +610,6 @@ int postrun_graph(graph_t graph) return 0; } - int set_graph_layout(graph_t graph, int layout_type) { struct graph* ir_graph = (struct graph*)graph; @@ -652,31 +624,26 @@ int set_graph_layout(graph_t graph, int layout_type) return 0; } - int set_graph_attr(graph_t graph, const char* attr_name, const void* buf, int size) { return -1; } - int get_graph_attr(graph_t graph, const char* attr_name, void* buf, int size) { return -1; } - int set_graph_thread(graph_t graph, int cluster, int threads) { return -1; } - int set_graph_thread_mask(graph_t graph, size_t cpu_mask) { return -1; } - int destroy_graph(graph_t graph) { struct graph* ir_graph = (struct graph*)graph; @@ -689,13 +656,11 @@ int destroy_graph(graph_t graph) return 0; } - void dump_graph(graph_t graph) { dump_ir_graph((ir_graph_t*)graph); } - int set_graph_device(graph_t graph, const char* dev_name) { struct graph* ir_graph = (struct graph*)graph; @@ -710,16 +675,14 @@ int set_graph_device(graph_t graph, const char* dev_name) return 0; } - //////////////////////////////////////////////////// node about //////////////////////////////////////////////////// - int set_graph_input_node(graph_t graph, const char* input_nodes[], int input_number) { struct graph* ir_graph = (struct graph*)graph; int16_t* input_node_indexes; - input_node_indexes = ( int16_t* )sys_malloc(sizeof(int16_t) * input_number); + input_node_indexes = (int16_t*)sys_malloc(sizeof(int16_t) * input_number); if (input_node_indexes == NULL) { @@ -746,14 +709,13 @@ int set_graph_input_node(graph_t graph, const char* input_nodes[], int input_num return ret; } - int set_graph_output_node(graph_t graph, const char* output_nodes[], int output_number) { struct graph* ir_graph = (struct graph*)graph; int16_t* output_node_indexes; - output_node_indexes = ( int16_t* )sys_malloc(sizeof(int16_t) * output_number); + output_node_indexes = (int16_t*)sys_malloc(sizeof(int16_t) * output_number); if (output_node_indexes == NULL) { @@ -780,18 +742,16 @@ int set_graph_output_node(graph_t graph, const char* output_nodes[], int output_ return ret; } - int get_graph_input_node_number(graph_t graph) { - struct graph* ir_graph = ( struct graph* )graph; + struct graph* ir_graph = (struct graph*)graph; return ir_graph->input_num; } - node_t get_graph_input_node(graph_t graph, int idx) { - struct graph* ir_graph = ( struct graph* )graph; + struct graph* ir_graph = (struct graph*)graph; if (idx < 0 || idx >= ir_graph->input_num) { @@ -801,18 +761,16 @@ node_t get_graph_input_node(graph_t graph, int idx) return get_ir_graph_node(ir_graph, ir_graph->input_nodes[idx]); } - int get_graph_output_node_number(graph_t graph) { - struct graph* ir_graph = ( struct graph* )graph; + struct graph* ir_graph = (struct graph*)graph; return ir_graph->output_num; } - node_t get_graph_output_node(graph_t graph, int idx) { - struct graph* ir_graph = ( struct graph* )graph; + struct graph* ir_graph = (struct graph*)graph; if (idx < 0 || idx >= ir_graph->output_num) { @@ -822,10 +780,9 @@ node_t get_graph_output_node(graph_t graph, int idx) return get_ir_graph_node(ir_graph, ir_graph->output_nodes[idx]); } - tensor_t get_graph_input_tensor(graph_t graph, int input_idx, int tensor_idx) { - struct graph* ir_graph = ( struct graph* )graph; + struct graph* ir_graph = (struct graph*)graph; if (input_idx < 0 || input_idx >= ir_graph->input_num) { @@ -844,10 +801,9 @@ tensor_t get_graph_input_tensor(graph_t graph, int input_idx, int tensor_idx) return get_ir_graph_tensor(ir_node->graph, ir_node->output_tensors[tensor_idx]); } - tensor_t get_graph_output_tensor(graph_t graph, int output_idx, int tensor_idx) { - struct graph* ir_graph = ( struct graph* )graph; + struct graph* ir_graph = (struct graph*)graph; if (output_idx < 0 || output_idx >= ir_graph->output_num) { @@ -866,10 +822,9 @@ tensor_t get_graph_output_tensor(graph_t graph, int output_idx, int tensor_idx) return get_ir_graph_tensor(ir_node->graph, ir_node->output_tensors[tensor_idx]); } - node_t create_graph_node(graph_t graph, const char* node_name, const char* op_name) { - struct graph* ir_graph = ( struct graph* )graph; + struct graph* ir_graph = (struct graph*)graph; int node_idx = get_ir_node_index_from_name(ir_graph, node_name); @@ -888,10 +843,9 @@ node_t create_graph_node(graph_t graph, const char* node_name, const char* op_na return create_ir_node(ir_graph, node_name, op_type, 1); } - node_t get_graph_node(graph_t graph, const char* node_name) { - struct graph* ir_graph = ( struct graph* )graph; + struct graph* ir_graph = (struct graph*)graph; int node_idx = get_ir_node_index_from_name(ir_graph, node_name); @@ -903,10 +857,9 @@ node_t get_graph_node(graph_t graph, const char* node_name) return ir_graph->node_list[node_idx]; } - node_t get_graph_node_by_idx(graph_t graph, int idx) { - struct graph* ir_graph = ( struct graph* )graph; + struct graph* ir_graph = (struct graph*)graph; if (idx < 0 || idx >= ir_graph->node_num) return NULL; @@ -914,34 +867,30 @@ node_t get_graph_node_by_idx(graph_t graph, int idx) return ir_graph->node_list[idx]; } - int get_graph_node_num(graph_t graph) { - struct graph* ir_graph = ( struct graph* )graph; + struct graph* ir_graph = (struct graph*)graph; return ir_graph->node_num; } - int get_node_output_number(node_t node) { - struct node* ir_node = ( struct node* )node; + struct node* ir_node = (struct node*)node; return ir_node->output_num; } - int get_node_input_number(node_t node) { - struct node* ir_node = ( struct node* )node; + struct node* ir_node = (struct node*)node; return ir_node->input_num; } - const char* get_node_name(node_t node) { - struct node* ir_node = ( struct node* )node; + struct node* ir_node = (struct node*)node; if (ir_node->name) { @@ -953,17 +902,15 @@ const char* get_node_name(node_t node) return ir_node->name; } - const char* get_node_op(node_t node) { - struct node* ir_node = ( struct node* )node; + struct node* ir_node = (struct node*)node; int op_type = ir_node->op.type; return get_op_name_from_type(op_type); } - const char* get_node_device(node_t node) { struct node* ir_node = (struct node*)node; @@ -989,74 +936,62 @@ const char* get_node_device(node_t node) return NULL; } - int get_node_attr_int(node_t node, const char* attr_name, int* attr_val) { return -1; } - int get_node_attr_float(node_t node, const char* attr_name, float* attr_val) { return -1; } - int get_node_attr_pointer(node_t node, const char* attr_name, void* attr_val) { return -1; } - int get_node_attr_generic(node_t node, const char* attr_name, const char* type_name, void* buf, int size) { return -1; } - int set_node_attr_int(node_t node, const char* attr_name, const int* attr_val) { return -1; } - int set_node_attr_float(node_t node, const char* attr_name, const float* attr_val) { return -1; } - int set_node_attr_pointer(node_t node, const char* attr_name, const void* attr_val) { return -1; } - int set_node_attr_generic(node_t node, const char* attr_name, const char* type_name, const void* buf, int size) { return -1; } - int add_node_attr(node_t node, const char* attr_name, const char* type_name, int size) { return -1; } - void release_graph_node(node_t node) { - ( void )node; + (void)node; // NOTHING NEEDS TO DO } - //////////////////////////////////////////////////// tensor about //////////////////////////////////////////////////// - tensor_t get_node_input_tensor(node_t node, int input_idx) { - struct node* ir_node = ( struct node* )node; + struct node* ir_node = (struct node*)node; if (input_idx < 0 || input_idx >= ir_node->input_num) { @@ -1068,7 +1003,7 @@ tensor_t get_node_input_tensor(node_t node, int input_idx) tensor_t get_node_output_tensor(node_t node, int output_idx) { - struct node* ir_node = ( struct node* )node; + struct node* ir_node = (struct node*)node; if (output_idx < 0 || output_idx >= ir_node->output_num) { @@ -1078,19 +1013,17 @@ tensor_t get_node_output_tensor(node_t node, int output_idx) return get_ir_graph_tensor(ir_node->graph, ir_node->output_tensors[output_idx]); } - int set_node_input_tensor(node_t node, int input_idx, tensor_t tensor) { - struct node* ir_node = ( struct node* )node; + struct node* ir_node = (struct node*)node; struct tensor* ir_tensor = (struct tensor*)tensor; return set_ir_node_input_tensor(ir_node, input_idx, ir_tensor); } - int set_node_output_tensor(node_t node, int output_idx, tensor_t tensor, int tensor_type) { - struct node* ir_node = ( struct node* )node; + struct node* ir_node = (struct node*)node; struct tensor* ir_tensor = (struct tensor*)tensor; ir_tensor->tensor_type = tensor_type; @@ -1098,18 +1031,16 @@ int set_node_output_tensor(node_t node, int output_idx, tensor_t tensor, int ten return set_ir_node_output_tensor(ir_node, output_idx, ir_tensor); } - tensor_t create_graph_tensor(graph_t graph, const char* tensor_name, int data_type) { - struct graph* ir_graph = ( struct graph* )graph; + struct graph* ir_graph = (struct graph*)graph; return create_ir_tensor(ir_graph, tensor_name, data_type); } - tensor_t get_graph_tensor(graph_t graph, const char* tensor_name) { - struct graph* ir_graph = ( struct graph* )graph; + struct graph* ir_graph = (struct graph*)graph; for (int i = 0; i < ir_graph->node_num; i++) { @@ -1124,14 +1055,14 @@ tensor_t get_graph_tensor(graph_t graph, const char* tensor_name) { struct tensor* ir_tensor = get_ir_graph_tensor(ir_node->graph, ir_node->input_tensors[j]); if (ir_tensor && ir_tensor->name && !strcmp(ir_tensor->name, tensor_name)) - return ( tensor_t )ir_tensor; + return (tensor_t)ir_tensor; } for (int j = 0; j < ir_node->output_num; j++) { struct tensor* ir_tensor = get_ir_graph_tensor(ir_node->graph, ir_node->output_tensors[j]); if (ir_tensor && ir_tensor->name && !strcmp(ir_tensor->name, tensor_name)) - return ( tensor_t )ir_tensor; + return (tensor_t)ir_tensor; } } } @@ -1139,7 +1070,6 @@ tensor_t get_graph_tensor(graph_t graph, const char* tensor_name) return NULL; } - const char* get_tensor_name(tensor_t tensor) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1150,13 +1080,11 @@ const char* get_tensor_name(tensor_t tensor) return ir_tensor->name; } - void release_graph_tensor(tensor_t tensor) { // NOTHING NEEDS TO DO } - int set_tensor_shape(tensor_t tensor, const int dims[], int dim_number) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1164,7 +1092,6 @@ int set_tensor_shape(tensor_t tensor, const int dims[], int dim_number) return set_ir_tensor_shape(ir_tensor, dims, dim_number); } - int get_tensor_shape(tensor_t tensor, int dims[], int dim_number) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1180,7 +1107,6 @@ int get_tensor_shape(tensor_t tensor, int dims[], int dim_number) return ir_tensor->dim_num; } - int get_tensor_buffer_size(tensor_t tensor) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1188,7 +1114,6 @@ int get_tensor_buffer_size(tensor_t tensor) return (int)(ir_tensor->elem_size * ir_tensor->elem_num); } - void* get_tensor_buffer(tensor_t tensor) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1198,7 +1123,6 @@ void* get_tensor_buffer(tensor_t tensor) return ir_tensor->data; } - int set_tensor_buffer(tensor_t tensor, void* buffer, int buffer_size) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1220,7 +1144,6 @@ int set_tensor_buffer(tensor_t tensor, void* buffer, int buffer_size) return 0; } - int get_tensor_data(tensor_t tensor, void* output_data, int data_size) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1247,7 +1170,6 @@ int get_tensor_data(tensor_t tensor, void* output_data, int data_size) return -1; } - int set_tensor_data(tensor_t tensor, const void* input_data, int data_size) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1268,7 +1190,6 @@ int set_tensor_data(tensor_t tensor, const void* input_data, int data_size) return -1; } - int get_tensor_data_type(tensor_t tensor) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1276,7 +1197,6 @@ int get_tensor_data_type(tensor_t tensor) return ir_tensor->data_type; } - int set_tensor_data_type(tensor_t tensor, int data_type) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1286,7 +1206,6 @@ int set_tensor_data_type(tensor_t tensor, int data_type) return 0; } - int get_tensor_layout(tensor_t tensor) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1294,7 +1213,6 @@ int get_tensor_layout(tensor_t tensor) return ir_tensor->layout; } - int set_tensor_layout(tensor_t tensor, int layout) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1304,7 +1222,6 @@ int set_tensor_layout(tensor_t tensor, int layout) return 0; } - int set_tensor_quant_param(tensor_t tensor, const float* scale, const int* zero_point, int number) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1312,7 +1229,6 @@ int set_tensor_quant_param(tensor_t tensor, const float* scale, const int* zero_ return set_ir_tensor_quantization_parameter(ir_tensor, scale, zero_point, number); } - int get_tensor_quant_param(tensor_t tensor, float* scale, int* zero_point, int number) { struct tensor* ir_tensor = (struct tensor*)tensor; @@ -1320,63 +1236,52 @@ int get_tensor_quant_param(tensor_t tensor, float* scale, int* zero_point, int n return get_ir_tensor_quantization_parameter(ir_tensor, scale, zero_point, number); } - //////////////////////////////////////////////////// misc about //////////////////////////////////////////////////// - const char* get_tengine_hcl_version() { return hcl_version; } - int set_default_device(const char* device) { return -1; } - void set_log_level(enum log_level level) { SET_LOG_LEVEL(level); } - void set_log_output(log_print_t func) { SET_LOG_OUTPUT(func); } - int get_tengine_errno(void) { return -1; } - int clr_tengine_errno(void) { return -1; } - size_t get_cluster_affinity_mask(int cluster) { check_cpu(); return get_cpu_cluster_mask(cluster); } - //////////////////////////////////////////////////// custom about //////////////////////////////////////////////////// - int set_custom_kernel(node_t node, const char* dev_name, struct custom_kernel_ops* kernel_ops) { // TODO: set custom kernel return -1; } - int remove_custom_kernel(node_t node, const char* dev_name) { // TODO: remove custom kernel diff --git a/source/api/c_api.h b/source/api/c_api.h index c15093191..4e67ef6a1 100644 --- a/source/api/c_api.h +++ b/source/api/c_api.h @@ -28,7 +28,6 @@ #include #include - #if defined __GNUC__ #define DLLEXPORT __attribute((visibility("default"))) #elif defined(_MSC_VER) @@ -37,10 +36,9 @@ #define DLLEXPORT #endif - #if defined __GNUC__ #define DEPRECATED_BEFORE -#define DEPRECATED_AFTER __attribute__ ((deprecated)) +#define DEPRECATED_AFTER __attribute__((deprecated)) #elif defined(_MSC_VER) #pragma deprecated() #define DEPRECATED_BEFORE __declspec(deprecated) @@ -50,59 +48,57 @@ #define DEPRECATED_AFTER #endif - #ifdef __cplusplus extern "C" { #endif -#define MAX_SHAPE_DIM_NUM 8 +#define MAX_SHAPE_DIM_NUM 8 /* the data type of the tensor */ -#define TENGINE_DT_FP32 0 -#define TENGINE_DT_FP16 1 -#define TENGINE_DT_INT8 2 -#define TENGINE_DT_UINT8 3 -#define TENGINE_DT_INT32 4 -#define TENGINE_DT_INT16 5 +#define TENGINE_DT_FP32 0 +#define TENGINE_DT_FP16 1 +#define TENGINE_DT_INT8 2 +#define TENGINE_DT_UINT8 3 +#define TENGINE_DT_INT32 4 +#define TENGINE_DT_INT16 5 /* layout type, not real layout */ -#define TENGINE_LAYOUT_NCHW 0 -#define TENGINE_LAYOUT_NHWC 1 +#define TENGINE_LAYOUT_NCHW 0 +#define TENGINE_LAYOUT_NHWC 1 /* tensor type: the content changed or not during inference */ -#define TENSOR_TYPE_UNKNOWN 0 -#define TENSOR_TYPE_VAR 1 -#define TENSOR_TYPE_CONST 2 -#define TENSOR_TYPE_INPUT 3 -#define TENSOR_TYPE_DEP 4 +#define TENSOR_TYPE_UNKNOWN 0 +#define TENSOR_TYPE_VAR 1 +#define TENSOR_TYPE_CONST 2 +#define TENSOR_TYPE_INPUT 3 +#define TENSOR_TYPE_DEP 4 /* cluster type: big-LITTLE and DynamIQ defined */ -#define TENGINE_CLUSTER_ALL 0 -#define TENGINE_CLUSTER_BIG 1 -#define TENGINE_CLUSTER_MEDIUM 2 -#define TENGINE_CLUSTER_LITTLE 3 +#define TENGINE_CLUSTER_ALL 0 +#define TENGINE_CLUSTER_BIG 1 +#define TENGINE_CLUSTER_MEDIUM 2 +#define TENGINE_CLUSTER_LITTLE 3 -#define TENGINE_MODE_FP32 0 -#define TENGINE_MODE_FP16 1 -#define TENGINE_MODE_HYBRID_INT8 2 -#define TENGINE_MODE_UINT8 3 -#define TENGINE_MODE_INT8 4 +#define TENGINE_MODE_FP32 0 +#define TENGINE_MODE_FP16 1 +#define TENGINE_MODE_HYBRID_INT8 2 +#define TENGINE_MODE_UINT8 3 +#define TENGINE_MODE_INT8 4 /* node dump action definition */ -#define NODE_DUMP_ACTION_DISABLE 0 -#define NODE_DUMP_ACTION_ENABLE 1 -#define NODE_DUMP_ACTION_START 2 -#define NODE_DUMP_ACTION_STOP 3 -#define NODE_DUMP_ACTION_GET 4 +#define NODE_DUMP_ACTION_DISABLE 0 +#define NODE_DUMP_ACTION_ENABLE 1 +#define NODE_DUMP_ACTION_START 2 +#define NODE_DUMP_ACTION_STOP 3 +#define NODE_DUMP_ACTION_GET 4 /* graph perf action definition */ -#define GRAPH_PERF_STAT_DISABLE 0 -#define GRAPH_PERF_STAT_ENABLE 1 -#define GRAPH_PERF_STAT_STOP 2 -#define GRAPH_PERF_STAT_START 3 -#define GRAPH_PERF_STAT_RESET 4 -#define GRAPH_PERF_STAT_GET 5 - +#define GRAPH_PERF_STAT_DISABLE 0 +#define GRAPH_PERF_STAT_ENABLE 1 +#define GRAPH_PERF_STAT_STOP 2 +#define GRAPH_PERF_STAT_START 3 +#define GRAPH_PERF_STAT_RESET 4 +#define GRAPH_PERF_STAT_GET 5 /* follow the std. UNIX log level definition */ enum log_level @@ -117,7 +113,6 @@ enum log_level LOG_DEBUG }; - /* note: Android NN only define one event */ enum graph_exec_event { @@ -128,7 +123,6 @@ enum graph_exec_event GRAPH_EXEC_DONE }; - /* TODO: should add suspend? */ enum graph_exec_stat { @@ -139,7 +133,6 @@ enum graph_exec_stat GRAPH_STAT_ERROR }; - enum device_policy { DEFAULT_POLICY, @@ -147,19 +140,15 @@ enum device_policy LOW_POWER_POLICY }; - typedef void* context_t; typedef void* graph_t; typedef void* tensor_t; typedef void* node_t; - typedef int (*event_handler_t)(graph_t, int, void* arg); - typedef void (*log_print_t)(const char*); - /* graph exec options */ typedef struct options { @@ -169,15 +158,14 @@ typedef struct options uint64_t affinity; } options_t; - struct custom_kernel_tensor { int dim[MAX_SHAPE_DIM_NUM]; /* the shape dim array */ - int dim_num; /* valid entry number */ + int dim_num; /* valid entry number */ int element_num; int element_size; /* determined by data_type */ int data_type; - int dev_type; /* indicate the tensor belongs to CPU/GPU ... */ + int dev_type; /* indicate the tensor belongs to CPU/GPU ... */ int layout_type; /* NCHW type or NHWC type*/ /* quant info */ @@ -186,20 +174,19 @@ struct custom_kernel_tensor int* zero_point; int* quant_number; - void* data; /* pointer to host memory (virtual address) */ - void* dev_mem; /* refers to device memory block */ + void* data; /* pointer to host memory (virtual address) */ + void* dev_mem; /* refers to device memory block */ void* mapped_mem; /* the mapped address for device memory block */ }; - /* For user to add user defined kernel*/ struct custom_kernel_ops { const char* kernel_name; /* name of the kernel */ - const char* op; /* name of the op to be implemented */ - int force; /* if not set, when bind() failed, + const char* op; /* name of the op to be implemented */ + int force; /* if not set, when bind() failed, try to use other kernel implementations*/ - void* kernel_param; /* used for kernel impl functions */ + void* kernel_param; /* used for kernel impl functions */ int kernel_param_size; /*! @@ -230,7 +217,7 @@ struct custom_kernel_ops * @return the inplace input tensor index for an output tensor. * if the output tensor is not an inplace one, return -1. */ - int (*inplace_info)(struct custom_kernel_ops* ops, int output_idx); // optional + int (*inplace_info)(struct custom_kernel_ops* ops, int output_idx); // optional /*! * @brief Check if the kernel can work on the input and output shapes. @@ -321,7 +308,6 @@ struct custom_kernel_ops void (*release)(struct custom_kernel_ops* ops); }; - /************** Library intialization and version checking *******************/ /*! @@ -1122,7 +1108,6 @@ DLLEXPORT DEPRECATED_BEFORE const char* get_node_device(node_t node) DEPRECATED_ */ DLLEXPORT const char* get_default_device(void); - /******************** execution context *****************************/ /*! diff --git a/source/api/plugin.c b/source/api/plugin.c index c39c4bfae..019ceb1c0 100644 --- a/source/api/plugin.c +++ b/source/api/plugin.c @@ -39,13 +39,12 @@ #endif #ifdef _MSC_VER -typedef int(*fun_ptr)(void); +typedef int (*fun_ptr)(void); typedef HINSTANCE so_handle_t; #else typedef void* so_handle_t; #endif - struct plugin_header { char* name; @@ -55,7 +54,6 @@ struct plugin_header static struct vector* plugin_list = NULL; - static int exec_so_func(so_handle_t handle, const char* func_name) { #ifdef _MSC_VER @@ -87,7 +85,6 @@ static int exec_so_func(so_handle_t handle, const char* func_name) return 0; } - int load_tengine_plugin(const char* plugin_name, const char* file_name, const char* init_func_name) { struct plugin_header header; @@ -138,7 +135,6 @@ int load_tengine_plugin(const char* plugin_name, const char* file_name, const ch /* execute the init function */ if (init_func_name && exec_so_func(header.handle, init_func_name) < 0) { - #ifdef _MSC_VER FreeLibrary(header.handle); #else @@ -162,7 +158,6 @@ int load_tengine_plugin(const char* plugin_name, const char* file_name, const ch return 0; } - int unload_tengine_plugin(const char* plugin_name, const char* rel_func_name) { if (plugin_list == NULL) @@ -206,7 +201,6 @@ int unload_tengine_plugin(const char* plugin_name, const char* rel_func_name) return 0; } - int get_tengine_plugin_number(void) { int plugin_num = 0; @@ -217,7 +211,6 @@ int get_tengine_plugin_number(void) return plugin_num; } - const char* get_tengine_plugin_name(int idx) { int plugin_num = get_tengine_plugin_number(); diff --git a/source/device/acl/acl_define.h b/source/device/acl/acl_define.h index 60f70e2e9..184dc0f68 100644 --- a/source/device/acl/acl_define.h +++ b/source/device/acl/acl_define.h @@ -26,9 +26,8 @@ #define ACL_DEV_NAME "ACL" - typedef struct acl_option { char* dev_name; - int precision; //!< precision of calculation + int precision; //!< precision of calculation } acl_opt_t; diff --git a/source/device/acl/acl_device.hpp b/source/device/acl/acl_device.hpp index 247b36db1..d8900de8e 100644 --- a/source/device/acl/acl_device.hpp +++ b/source/device/acl/acl_device.hpp @@ -26,8 +26,7 @@ #include "acl_define.h" -extern "C" -{ +extern "C" { #include "api/c_api.h" #include "device/device.h" diff --git a/source/device/acl/acl_executor.hpp b/source/device/acl/acl_executor.hpp index ea5f504f8..0574b9e38 100644 --- a/source/device/acl/acl_executor.hpp +++ b/source/device/acl/acl_executor.hpp @@ -38,8 +38,7 @@ #include -extern "C" -{ +extern "C" { #include "api/c_api.h" #include "device/device.h" #include "graph/tensor.h" @@ -53,7 +52,6 @@ extern "C" #include "utility/log.h" } - #define MAX_TENGINE_DATA_TYPE_NUM 6 static const int gs32TengineDataElemetSize[MAX_TENGINE_DATA_TYPE_NUM] = {4, 2, 1, 1, 4, 2}; @@ -65,7 +63,7 @@ using namespace arm_compute; #define dynamic_cast static_cast #endif -template +template inline void _PermuteDataLayoutNCHWToNHWCInter(T* pvData, int n, int c, int h, int w, T* pvOutputData); void _PermuteDataLayoutNCHWToNHWC(void* pvData, int n, int c, int h, int w, void* pvOutputData, int DataEleSize); void copy_buffer(void* dest, const void* src, const int src_len, DataType dest_type, DataType src_type); @@ -77,10 +75,9 @@ class CLGraph ~CLGraph(); void init(std::string name, DataType type); - int prerun(struct subgraph *subgraph, struct acl_option* option); - int run(struct subgraph *subgraph); - int postrun(struct subgraph *subgraph); - + int prerun(struct subgraph* subgraph, struct acl_option* option); + int run(struct subgraph* subgraph); + int postrun(struct subgraph* subgraph); private: bool CreateACLGraph(struct subgraph* subgraph, DataType type, bool bDataLayoutOpFlag = false); @@ -106,7 +103,7 @@ class CLGraph public: std::string name_; - std::vector> functions_map_; + std::vector > functions_map_; std::unordered_map tensors_map_; DataType data_type_; diff --git a/source/device/acl/acl_graph.hpp b/source/device/acl/acl_graph.hpp index dc070509e..be40c0fa6 100644 --- a/source/device/acl/acl_graph.hpp +++ b/source/device/acl/acl_graph.hpp @@ -24,15 +24,13 @@ #pragma once -extern "C" -{ +extern "C" { #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" #include "graph/subgraph.h" #include "device/device.h" - int acl_dev_init(struct device* dev); int acl_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options); int acl_dev_run(struct device* dev, struct subgraph* subgraph); diff --git a/source/device/acl/acl_limit.hpp b/source/device/acl/acl_limit.hpp index c09b1f6b0..0b08d3268 100644 --- a/source/device/acl/acl_limit.hpp +++ b/source/device/acl/acl_limit.hpp @@ -22,35 +22,33 @@ * Author: hhchen@openailab.com */ - #pragma once -extern "C" -{ +extern "C" { #include "operator/op.h" } const int acl_supported_ops[] = { - OP_BATCHNORM, - OP_CAST, - OP_CLIP, - OP_CONCAT, - OP_CONST, - OP_CONV, - OP_CROP, - OP_DECONV, - OP_DROPOUT, - OP_ELTWISE, - OP_FC, - //OP_FLATTEN, - OP_INPUT, - OP_INTERP, - //OP_PERMUTE, - OP_POOL, - OP_RELU, - OP_RESHAPE, - OP_RESIZE, - //OP_SLICE, - OP_SOFTMAX - //OP_BIAS, + OP_BATCHNORM, + OP_CAST, + OP_CLIP, + OP_CONCAT, + OP_CONST, + OP_CONV, + OP_CROP, + OP_DECONV, + OP_DROPOUT, + OP_ELTWISE, + OP_FC, + //OP_FLATTEN, + OP_INPUT, + OP_INTERP, + //OP_PERMUTE, + OP_POOL, + OP_RELU, + OP_RESHAPE, + OP_RESIZE, + //OP_SLICE, + OP_SOFTMAX + //OP_BIAS, }; diff --git a/source/device/cpu/cpu_define.h b/source/device/cpu/cpu_define.h index dfc4ac09f..39ea017fe 100644 --- a/source/device/cpu/cpu_define.h +++ b/source/device/cpu/cpu_define.h @@ -26,29 +26,28 @@ #include -#define OPS_SCORE_STATIC 10000 -#define OPS_SCORE_BEST 8000 -#define OPS_SCORE_PREFER 6000 -#define OPS_SCORE_CANDO 4000 -#define OPS_SCORE_NOTSUP 2000 +#define OPS_SCORE_STATIC 10000 +#define OPS_SCORE_BEST 8000 +#define OPS_SCORE_PREFER 6000 +#define OPS_SCORE_CANDO 4000 +#define OPS_SCORE_NOTSUP 2000 -#define MEM_POOL_ALLOCATED 8 -#define INPLACE_BLOCK_FLAG 0x40 +#define MEM_POOL_ALLOCATED 8 +#define INPLACE_BLOCK_FLAG 0x40 -#define CPU_DEVICE_NAME "CPU" - -#define TENGINE_DUMP_DIR "TG_DEBUG_DUMP_DIR" -#define TENGINE_DUMP_LAYER "TG_DEBUG_DATA" -#define TENGINE_DUMP_GRAPH "TG_DEBUG_GRAPH" -#define TENGINE_PRINT_LAYER_COST "TG_DEBUG_TIME" -#define TENGINE_FORCE_USE_REF_OP "TG_DEBUG_REF" +#define CPU_DEVICE_NAME "CPU" +#define TENGINE_DUMP_DIR "TG_DEBUG_DUMP_DIR" +#define TENGINE_DUMP_LAYER "TG_DEBUG_DATA" +#define TENGINE_DUMP_GRAPH "TG_DEBUG_GRAPH" +#define TENGINE_PRINT_LAYER_COST "TG_DEBUG_TIME" +#define TENGINE_FORCE_USE_REF_OP "TG_DEBUG_REF" typedef struct cpu_option { const char* dev_name; - int num_thread; //!< how many threads to run - int cluster; //!< cpu cluster - int precision; //!< precision of calculation - size_t affinity; //!< affinity of cpu core, max 64 cpus + int num_thread; //!< how many threads to run + int cluster; //!< cpu cluster + int precision; //!< precision of calculation + size_t affinity; //!< affinity of cpu core, max 64 cpus } cpu_opt_t; diff --git a/source/device/cpu/cpu_device.c b/source/device/cpu/cpu_device.c index ad00395e1..1c8270fa6 100644 --- a/source/device/cpu/cpu_device.c +++ b/source/device/cpu/cpu_device.c @@ -47,21 +47,18 @@ #include - int init_cpu(struct device* device) { (void)device; return register_all_cpu_ops(); } - int release_cpu(struct device* device) { (void)device; return unregister_all_cpu_ops(); } - static int prerun(struct device* dev, struct subgraph* subgraph, void* option) { struct exec_graph* exec_graph; @@ -92,13 +89,11 @@ static int prerun(struct device* dev, struct subgraph* subgraph, void* option) exec_graph->timer = NULL; } - subgraph->device_graph = exec_graph; return 0; } - static int run(struct device* dev, struct subgraph* subgraph) { struct exec_graph* exec_graph = (struct exec_graph*)subgraph->device_graph; @@ -113,7 +108,7 @@ static int run(struct device* dev, struct subgraph* subgraph) for (int i = 0; i < node_num; i++) { - struct exec_node* node = ( struct exec_node* )get_vector_data(exec_graph->exec_node_list, i); + struct exec_node* node = (struct exec_node*)get_vector_data(exec_graph->exec_node_list, i); struct node_ops* node_ops = node->node_ops; /* TODO: handle the shape changed and dynamic shape case */ @@ -167,7 +162,7 @@ static int run(struct device* dev, struct subgraph* subgraph) struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, node->ir_node->input_tensors[j]); if (input_tensor->dim_num <= 5) { - char dir_str[32] = { 0 }; + char dir_str[32] = {0}; sprintf(dir_str, "in[%d]", j); if (NULL != input_tensor->data) @@ -183,7 +178,7 @@ static int run(struct device* dev, struct subgraph* subgraph) /* debug */ if (output_tensor->dim_num <= 5) { - char dir_str[32] = { 0 }; + char dir_str[32] = {0}; sprintf(dir_str, "out[%d]", j); extract_feature_from_tensor(dir_str, name, output_tensor); @@ -225,7 +220,6 @@ static int run(struct device* dev, struct subgraph* subgraph) return 0; } - static int postrun(struct device* dev, struct subgraph* subgraph) { struct exec_graph* exec_graph = (struct exec_graph*)subgraph->device_graph; @@ -234,7 +228,7 @@ static int postrun(struct device* dev, struct subgraph* subgraph) for (int i = 0; i < node_num; i++) { - struct exec_node* node = ( struct exec_node* )get_vector_data(exec_graph->exec_node_list, i); + struct exec_node* node = (struct exec_node*)get_vector_data(exec_graph->exec_node_list, i); struct node_ops* node_ops = node->node_ops; if (exec_graph->timer) @@ -255,7 +249,6 @@ static int postrun(struct device* dev, struct subgraph* subgraph) return 0; } - static int cpu_dev_release_exec_graph(struct device* dev, void* exec_graph) { if (NULL != exec_graph) @@ -266,7 +259,6 @@ static int cpu_dev_release_exec_graph(struct device* dev, void* exec_graph) return 0; } - static int cpu_allocate(struct device* device, struct subgraph* sub_graph) { /* set the correct input wait count: INPUT tensor is always ready */ @@ -283,7 +275,6 @@ static int cpu_allocate(struct device* device, struct subgraph* sub_graph) return 0; } - static int cpu_describe(struct device* device, struct vector* allowed_ops, struct vector* blocked_ops, struct vector* precision) { if (NULL == device) @@ -317,7 +308,6 @@ static int cpu_describe(struct device* device, struct vector* allowed_ops, struc return 0; } - static int cpu_evaluation(struct device* device, struct subgraph* sub_graph, struct vector* tensor, struct vector* node) { if (NULL == device) @@ -332,7 +322,6 @@ static int cpu_evaluation(struct device* device, struct subgraph* sub_graph, str return 0; } - static int cpu_release(struct device* device, struct subgraph* sub_graph) { if (NULL == device) @@ -345,7 +334,6 @@ static int cpu_release(struct device* device, struct subgraph* sub_graph) return 0; } - int cpu_split_graph(struct graph* ir_graph) { struct device* default_device = find_default_device(); @@ -390,47 +378,42 @@ int cpu_split_graph(struct graph* ir_graph) return 0; } - static struct interface cpu_interface = { - .init = init_cpu, - .pre_run = prerun, - .run = run, - .post_run = postrun, - .async_run = NULL, - .async_wait = NULL, - .release_graph = cpu_dev_release_exec_graph, - .release_device = release_cpu, + .init = init_cpu, + .pre_run = prerun, + .run = run, + .post_run = postrun, + .async_run = NULL, + .async_wait = NULL, + .release_graph = cpu_dev_release_exec_graph, + .release_device = release_cpu, }; - static struct allocator cpu_allocator = { - .describe = cpu_describe, - .evaluation = cpu_evaluation, - .allocate = cpu_allocate, - .release = cpu_release, + .describe = cpu_describe, + .evaluation = cpu_evaluation, + .allocate = cpu_allocate, + .release = cpu_release, }; - static struct optimizer cpu_optimizer = { - .split_graph = cpu_split_graph, - .optimize_graph = NULL, + .split_graph = cpu_split_graph, + .optimize_graph = NULL, }; - static struct cpu_device cpu_dev = { - .base = { - .name = CPU_DEVICE_NAME, - .interface = &cpu_interface, - .allocator = &cpu_allocator, - .optimizer = &cpu_optimizer, - .scheduler = NULL, - .privacy = NULL, - }, - .master_cpu = 0, - .cpu_model = 0, + .base = { + .name = CPU_DEVICE_NAME, + .interface = &cpu_interface, + .allocator = &cpu_allocator, + .optimizer = &cpu_optimizer, + .scheduler = NULL, + .privacy = NULL, + }, + .master_cpu = 0, + .cpu_model = 0, }; - int register_cpu_device(void) { #ifdef TENGINE_AUTO_LOAD_HCL @@ -448,7 +431,6 @@ int register_cpu_device(void) return 0; } - int unregister_cpu_device(void) { int ret = unregister_device(&cpu_dev.base); diff --git a/source/device/cpu/cpu_device.h b/source/device/cpu/cpu_device.h index 4f717e98e..d39a44dd9 100644 --- a/source/device/cpu/cpu_device.h +++ b/source/device/cpu/cpu_device.h @@ -28,11 +28,9 @@ #include "device/device.h" - struct node_ops; struct node; - struct cpu_device { struct device base; @@ -40,5 +38,4 @@ struct cpu_device uint8_t cpu_model; }; - int register_cpu_device(void); diff --git a/source/device/cpu/cpu_dump.c b/source/device/cpu/cpu_dump.c index 2cce834af..c29a7ca83 100644 --- a/source/device/cpu/cpu_dump.c +++ b/source/device/cpu/cpu_dump.c @@ -52,7 +52,6 @@ #include #endif - char* replace_string_character(const char* src_str, char* dst_str, const char* target_char, const char* replaced_char) { const char* p; @@ -82,40 +81,41 @@ char* replace_string_character(const char* src_str, char* dst_str, const char* t return dst_str; } - int get_tensor_cv_shape(const struct tensor* tensor, int* n, int* c, int* h, int* w) { - if (NULL == tensor || NULL == n || NULL == c || NULL == h || NULL ==w) + if (NULL == tensor || NULL == n || NULL == c || NULL == h || NULL == w) { return -1; } - *n = 0; *c = 0; *h = 0; *w = 0; + *n = 0; + *c = 0; + *h = 0; + *w = 0; const int* dims = tensor->dims; switch (tensor->dim_num) { - case 4: - *n = dims[0]; - *c = dims[1]; - *h = dims[2]; - *w = dims[3]; - break; - case 3: - *n = dims[0]; - *h = dims[1]; - *w = dims[2]; - case 2: - *n = dims[0]; - *w = dims[1]; - default: - return -1; + case 4: + *n = dims[0]; + *c = dims[1]; + *h = dims[2]; + *w = dims[3]; + break; + case 3: + *n = dims[0]; + *h = dims[1]; + *w = dims[2]; + case 2: + *n = dims[0]; + *w = dims[1]; + default: + return -1; } return 0; } - float get_node_total_flops(struct node* node) { float flops = 0.f; @@ -151,214 +151,132 @@ float get_node_total_flops(struct node* node) return flops; } - int print_tensor_data_value(FILE* file, const struct tensor* tensor, int offset) { switch (tensor->data_type) { - case TENGINE_DT_FP32: - { - float* base_ptr = (float*)tensor->data; - float val = base_ptr[offset]; - if (val < 0) - fprintf(file, "%.4f ", val); - else - fprintf(file, " %.4f ", val); - break; - } - case TENGINE_DT_FP16: - { - fp16_t* base_ptr = (fp16_t*)tensor->data; - fp16_t val = base_ptr[offset]; - - float val_fp32 = fp16_to_fp32(val); - - if (val_fp32 < 0) - fprintf(file, "%.4f ", val_fp32); - else - fprintf(file, " %.4f ", val_fp32); - break; - } - case TENGINE_DT_UINT8: - { - uint8_t* base_ptr = (uint8_t*)tensor->data; - uint8_t val = base_ptr[offset]; + case TENGINE_DT_FP32: + { + float* base_ptr = (float*)tensor->data; + float val = base_ptr[offset]; + if (val < 0) + fprintf(file, "%.4f ", val); + else + fprintf(file, " %.4f ", val); + break; + } + case TENGINE_DT_FP16: + { + fp16_t* base_ptr = (fp16_t*)tensor->data; + fp16_t val = base_ptr[offset]; - float scale = tensor->scale; - int32_t zero_point = tensor->zero_point; + float val_fp32 = fp16_to_fp32(val); - float val_fp32 = (float)((int)val - (int)zero_point) * scale; - if (val_fp32 < 0) - fprintf(file, "%.4f ", val_fp32); - else - fprintf(file, " %.4f ", val_fp32); - break; - } - case TENGINE_DT_INT8: - { - int8_t * base_ptr = (int8_t*)tensor->data; - int8_t val = base_ptr[offset]; + if (val_fp32 < 0) + fprintf(file, "%.4f ", val_fp32); + else + fprintf(file, " %.4f ", val_fp32); + break; + } + case TENGINE_DT_UINT8: + { + uint8_t* base_ptr = (uint8_t*)tensor->data; + uint8_t val = base_ptr[offset]; + + float scale = tensor->scale; + int32_t zero_point = tensor->zero_point; + + float val_fp32 = (float)((int)val - (int)zero_point) * scale; + if (val_fp32 < 0) + fprintf(file, "%.4f ", val_fp32); + else + fprintf(file, " %.4f ", val_fp32); + break; + } + case TENGINE_DT_INT8: + { + int8_t* base_ptr = (int8_t*)tensor->data; + int8_t val = base_ptr[offset]; - float scale = tensor->scale; + float scale = tensor->scale; - float val_fp32 = (float)val * scale; - if (val_fp32 < 0) - fprintf(file, "%.4f ", val_fp32); - else - fprintf(file, " %.4f ", val_fp32); - } - case TENGINE_DT_INT32: - { - int32_t* base_ptr = (int32_t*)tensor->data; - int8_t val = base_ptr[offset]; + float val_fp32 = (float)val * scale; + if (val_fp32 < 0) + fprintf(file, "%.4f ", val_fp32); + else + fprintf(file, " %.4f ", val_fp32); + } + case TENGINE_DT_INT32: + { + int32_t* base_ptr = (int32_t*)tensor->data; + int8_t val = base_ptr[offset]; - float scale = tensor->scale; - float val_fp32 = (float)val * scale; + float scale = tensor->scale; + float val_fp32 = (float)val * scale; - if (val_fp32 < 0) - fprintf(file, "%.6f ", val_fp32); - else - fprintf(file, " %.6f ", val_fp32); - } + if (val_fp32 < 0) + fprintf(file, "%.6f ", val_fp32); + else + fprintf(file, " %.6f ", val_fp32); + } } return 0; } - void print_tensor_data_to_file(FILE* file, const struct tensor* tensor) { switch (tensor->dim_num) { - case 5: - { - int dim5 = tensor->dims[0], batch = tensor->dims[1], channel = 0, height = 0, width = 0; - - if (TENGINE_LAYOUT_NCHW == tensor->layout) - { - channel = tensor->dims[2]; - height = tensor->dims[3]; - width = tensor->dims[4]; - } - if (TENGINE_LAYOUT_NHWC == tensor->layout) - { - height = tensor->dims[2]; - width = tensor->dims[3]; - channel = tensor->dims[4]; - } - - if (TENGINE_DT_FP32 == tensor->data_type) - { - fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp32\n", dim5, batch, channel, height, width); - } - else - { - if (TENGINE_DT_FP16 == tensor->data_type) - { - fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp16, cast to fp32\n", dim5, batch, channel, height, width); - } - else - { - const char* type_name = get_tensor_data_type_string(tensor->data_type); - fprintf(file, "Shape is {%d %d %d %d %d}, data type is %s, inverse quantization to fp32\n", dim5, batch, channel, height, width, type_name); - } - } - - for (int d5 = 0; d5 < dim5; d5++) - { - fprintf(file, "Dim5 %d:\n", d5); - - for (int n = 0; n < batch; n++) - { - fprintf(file, "\tBatch %d:\n", n); - - for (int ch = 0; ch < channel; ch++) - { - fprintf(file, "\t\tChannel %d:\n", ch); - - for (int h = 0; h < height; h++) - { - fprintf(file, "\t\t\t"); - - for (int w = 0; w < width; w++) - { - int offset = 0; - - if (TENGINE_LAYOUT_NCHW == tensor->layout) - { - offset += d5 * batch * channel * height * width; - offset += n * channel * height * width; - offset += ch * height * width; - offset += h * width; - offset += w; - } - if (TENGINE_LAYOUT_NHWC == tensor->layout) - { - offset += d5 * batch * channel * height * width; - offset += n * channel * height * width; - offset += ch; - offset += h * width * channel; - offset += w * channel; - } - - print_tensor_data_value(file, tensor, offset); - } - fprintf(file, "\n"); - } - fprintf(file, "\n"); - } - fprintf(file, "\n"); - } - fprintf(file, "\n"); - } + case 5: + { + int dim5 = tensor->dims[0], batch = tensor->dims[1], channel = 0, height = 0, width = 0; - break; + if (TENGINE_LAYOUT_NCHW == tensor->layout) + { + channel = tensor->dims[2]; + height = tensor->dims[3]; + width = tensor->dims[4]; } - case 4: + if (TENGINE_LAYOUT_NHWC == tensor->layout) { - int batch = tensor->dims[0], channel = 0, height = 0, width = 0; - - if (TENGINE_LAYOUT_NCHW == tensor->layout) - { - channel = tensor->dims[1]; - height = tensor->dims[2]; - width = tensor->dims[3]; - } - if (TENGINE_LAYOUT_NHWC == tensor->layout) - { - height = tensor->dims[1]; - width = tensor->dims[2]; - channel = tensor->dims[3]; - } + height = tensor->dims[2]; + width = tensor->dims[3]; + channel = tensor->dims[4]; + } - if (TENGINE_DT_FP32 == tensor->data_type) + if (TENGINE_DT_FP32 == tensor->data_type) + { + fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp32\n", dim5, batch, channel, height, width); + } + else + { + if (TENGINE_DT_FP16 == tensor->data_type) { - fprintf(file, "Shape is {%d %d %d %d}, data type is fp32\n", batch, channel, height, width); + fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp16, cast to fp32\n", dim5, batch, channel, height, width); } else { - if (TENGINE_DT_FP16 == tensor->data_type) - { - fprintf(file, "Shape is {%d %d %d %d}, data type is fp16, cast to fp32\n", batch, channel, height, width); - } - else - { - const char* type_name = get_tensor_data_type_string(tensor->data_type); - fprintf(file, "Shape is {%d %d %d %d}, data type is %s, inverse quantization to fp32\n", batch, channel, height, width, type_name); - } + const char* type_name = get_tensor_data_type_string(tensor->data_type); + fprintf(file, "Shape is {%d %d %d %d %d}, data type is %s, inverse quantization to fp32\n", dim5, batch, channel, height, width, type_name); } + } + + for (int d5 = 0; d5 < dim5; d5++) + { + fprintf(file, "Dim5 %d:\n", d5); for (int n = 0; n < batch; n++) { - fprintf(file, "Batch %d:\n", n); + fprintf(file, "\tBatch %d:\n", n); for (int ch = 0; ch < channel; ch++) { - fprintf(file, "\tChannel %d:\n", ch); + fprintf(file, "\t\tChannel %d:\n", ch); for (int h = 0; h < height; h++) { - fprintf(file, "\t\t"); + fprintf(file, "\t\t\t"); for (int w = 0; w < width; w++) { @@ -366,6 +284,7 @@ void print_tensor_data_to_file(FILE* file, const struct tensor* tensor) if (TENGINE_LAYOUT_NCHW == tensor->layout) { + offset += d5 * batch * channel * height * width; offset += n * channel * height * width; offset += ch * height * width; offset += h * width; @@ -373,6 +292,7 @@ void print_tensor_data_to_file(FILE* file, const struct tensor* tensor) } if (TENGINE_LAYOUT_NHWC == tensor->layout) { + offset += d5 * batch * channel * height * width; offset += n * channel * height * width; offset += ch; offset += h * width * channel; @@ -387,49 +307,56 @@ void print_tensor_data_to_file(FILE* file, const struct tensor* tensor) } fprintf(file, "\n"); } + fprintf(file, "\n"); + } + + break; + } + case 4: + { + int batch = tensor->dims[0], channel = 0, height = 0, width = 0; - break; + if (TENGINE_LAYOUT_NCHW == tensor->layout) + { + channel = tensor->dims[1]; + height = tensor->dims[2]; + width = tensor->dims[3]; } - case 3: + if (TENGINE_LAYOUT_NHWC == tensor->layout) { - int batch = 0, height = 0, width = 0; - - if (TENGINE_LAYOUT_NCHW == tensor->layout) - { - batch = tensor->dims[0]; - height = tensor->dims[1]; - width = tensor->dims[2]; - } - if (TENGINE_LAYOUT_NHWC == tensor->layout) - { - height = tensor->dims[0]; - width = tensor->dims[1]; - batch = tensor->dims[2]; - } + height = tensor->dims[1]; + width = tensor->dims[2]; + channel = tensor->dims[3]; + } - if (TENGINE_DT_FP32 == tensor->data_type) + if (TENGINE_DT_FP32 == tensor->data_type) + { + fprintf(file, "Shape is {%d %d %d %d}, data type is fp32\n", batch, channel, height, width); + } + else + { + if (TENGINE_DT_FP16 == tensor->data_type) { - fprintf(file, "Shape is {%d %d %d}, data type is fp32\n", batch, height, width); + fprintf(file, "Shape is {%d %d %d %d}, data type is fp16, cast to fp32\n", batch, channel, height, width); } else { - if (TENGINE_DT_FP16 == tensor->data_type) - { - fprintf(file, "Shape is {%d %d %d}, data type is fp16, cast to fp32\n", batch, height, width); - } - else - { - const char* type_name = get_tensor_data_type_string(tensor->data_type); - fprintf(file, "Shape is {%d %d %d}, data type is %s, inverse quantization to fp32\n", batch, height, width, type_name); - } + const char* type_name = get_tensor_data_type_string(tensor->data_type); + fprintf(file, "Shape is {%d %d %d %d}, data type is %s, inverse quantization to fp32\n", batch, channel, height, width, type_name); } + } - for (int n = 0; n < batch; n++) + for (int n = 0; n < batch; n++) + { + fprintf(file, "Batch %d:\n", n); + + for (int ch = 0; ch < channel; ch++) { + fprintf(file, "\tChannel %d:\n", ch); + for (int h = 0; h < height; h++) { - fprintf(file, "Channel %d:\n", h); - fprintf(file, "\t"); + fprintf(file, "\t\t"); for (int w = 0; w < width; w++) { @@ -437,15 +364,17 @@ void print_tensor_data_to_file(FILE* file, const struct tensor* tensor) if (TENGINE_LAYOUT_NCHW == tensor->layout) { - offset += n * height * width; + offset += n * channel * height * width; + offset += ch * height * width; offset += h * width; offset += w; } if (TENGINE_LAYOUT_NHWC == tensor->layout) { - offset += h; - offset += n * width * height; - offset += w * height; + offset += n * channel * height * width; + offset += ch; + offset += h * width * channel; + offset += w * channel; } print_tensor_data_value(file, tensor, offset); @@ -454,86 +383,153 @@ void print_tensor_data_to_file(FILE* file, const struct tensor* tensor) } fprintf(file, "\n"); } + fprintf(file, "\n"); + } - break; + break; + } + case 3: + { + int batch = 0, height = 0, width = 0; + + if (TENGINE_LAYOUT_NCHW == tensor->layout) + { + batch = tensor->dims[0]; + height = tensor->dims[1]; + width = tensor->dims[2]; } - case 2: + if (TENGINE_LAYOUT_NHWC == tensor->layout) { - int batch = 0, width = 0; - - if (TENGINE_LAYOUT_NCHW == tensor->layout) - { - batch = tensor->dims[0]; - width = tensor->dims[1]; - } - if (TENGINE_LAYOUT_NHWC == tensor->layout) - { - batch = tensor->dims[0]; - width = tensor->dims[1]; - } + height = tensor->dims[0]; + width = tensor->dims[1]; + batch = tensor->dims[2]; + } - if (TENGINE_DT_FP32 == tensor->data_type) + if (TENGINE_DT_FP32 == tensor->data_type) + { + fprintf(file, "Shape is {%d %d %d}, data type is fp32\n", batch, height, width); + } + else + { + if (TENGINE_DT_FP16 == tensor->data_type) { - fprintf(file, "Shape is {%d %d}, data type is fp32\n", batch, width); + fprintf(file, "Shape is {%d %d %d}, data type is fp16, cast to fp32\n", batch, height, width); } else { - if (TENGINE_DT_FP16 == tensor->data_type) - { - fprintf(file, "Shape is {%d %d}, data type is fp16, cast to fp32\n", batch, width); - } - else - { - const char* type_name = get_tensor_data_type_string(tensor->data_type); - fprintf(file, "Shape is {%d %d}, data type is %s, inverse quantization to fp32\n", batch, width, type_name); - } + const char* type_name = get_tensor_data_type_string(tensor->data_type); + fprintf(file, "Shape is {%d %d %d}, data type is %s, inverse quantization to fp32\n", batch, height, width, type_name); } + } - for (int n = 0; n < batch; n++) + for (int n = 0; n < batch; n++) + { + for (int h = 0; h < height; h++) { + fprintf(file, "Channel %d:\n", h); + fprintf(file, "\t"); + for (int w = 0; w < width; w++) { int offset = 0; - offset += n * width; - offset += w; + if (TENGINE_LAYOUT_NCHW == tensor->layout) + { + offset += n * height * width; + offset += h * width; + offset += w; + } + if (TENGINE_LAYOUT_NHWC == tensor->layout) + { + offset += h; + offset += n * width * height; + offset += w * height; + } print_tensor_data_value(file, tensor, offset); } fprintf(file, "\n"); } + fprintf(file, "\n"); + } + + break; + } + case 2: + { + int batch = 0, width = 0; - break; + if (TENGINE_LAYOUT_NCHW == tensor->layout) + { + batch = tensor->dims[0]; + width = tensor->dims[1]; } - case 1: + if (TENGINE_LAYOUT_NHWC == tensor->layout) { - int width = tensor->dims[0]; - - fprintf(file, "Shape is {%d}, data type is fp32\n", width); + batch = tensor->dims[0]; + width = tensor->dims[1]; + } + if (TENGINE_DT_FP32 == tensor->data_type) + { + fprintf(file, "Shape is {%d %d}, data type is fp32\n", batch, width); + } + else + { + if (TENGINE_DT_FP16 == tensor->data_type) + { + fprintf(file, "Shape is {%d %d}, data type is fp16, cast to fp32\n", batch, width); + } + else + { + const char* type_name = get_tensor_data_type_string(tensor->data_type); + fprintf(file, "Shape is {%d %d}, data type is %s, inverse quantization to fp32\n", batch, width, type_name); + } + } + for (int n = 0; n < batch; n++) + { for (int w = 0; w < width; w++) { - print_tensor_data_value(file, tensor, w); + int offset = 0; + + offset += n * width; + offset += w; + + print_tensor_data_value(file, tensor, offset); } + fprintf(file, "\n"); + } + + break; + } + case 1: + { + int width = tensor->dims[0]; - break; + fprintf(file, "Shape is {%d}, data type is fp32\n", width); + + for (int w = 0; w < width; w++) + { + print_tensor_data_value(file, tensor, w); } - default: - printf("Input dimension %d not to be supported.\n", tensor->dim_num); + + break; + } + default: + printf("Input dimension %d not to be supported.\n", tensor->dim_num); } } - /* * Extract the blob feature map */ void extract_feature_from_tensor(const char* comment, const char* layer_name, const struct tensor* tensor) { // 1. deal with saving path - char save_dir[256] = { '0' }; + char save_dir[256] = {'0'}; - const char *env_path = getenv(TENGINE_DUMP_DIR); + const char* env_path = getenv(TENGINE_DUMP_DIR); if (NULL != env_path && (256 - 2) > strlen(env_path)) { @@ -552,7 +548,7 @@ void extract_feature_from_tensor(const char* comment, const char* layer_name, co } else { -// TLOG_WARNING("Tengine: Env var \"TENGINE_DUMP_DIR\" is too long(%d vs. 254). Using default path.\n", strlen(env_path)); + // TLOG_WARNING("Tengine: Env var \"TENGINE_DUMP_DIR\" is too long(%d vs. 254). Using default path.\n", strlen(env_path)); sprintf(save_dir, "./output/"); #ifdef _MSC_VER CreateDirectoryA(save_dir, NULL); @@ -582,7 +578,7 @@ void extract_feature_from_tensor(const char* comment, const char* layer_name, co replace_string_character(layer_short_name, layer_legal_name, "/", "-"); // 3. join path - char output_file_path[512] = { '0' }; + char output_file_path[512] = {'0'}; if (strlen(layer_legal_name) + strlen(save_dir) + strlen(comment) > 256 - 16) { @@ -606,13 +602,12 @@ void extract_feature_from_tensor(const char* comment, const char* layer_name, co file = NULL; } - void extract_node_executed_time(struct subgraph* subgraph, int node_id) { struct exec_graph* exec_graph = (struct exec_graph*)subgraph->device_graph; int node_num = get_vector_num(exec_graph->exec_node_list); int i = node_id; - struct exec_node* node = ( struct exec_node* )get_vector_data(exec_graph->exec_node_list, i); + struct exec_node* node = (struct exec_node*)get_vector_data(exec_graph->exec_node_list, i); double* timer = (double*)exec_graph->timer; @@ -638,51 +633,51 @@ void extract_node_executed_time(struct subgraph* subgraph, int node_id) switch (node->ir_node->op.type) { - case OP_CONV: + case OP_CONV: + { + struct conv_param* param = (struct conv_param*)node->ir_node->op.param_mem; + fprintf(stdout, "K: %dx%d | S: %dx%d | P: %d %d %d %d", param->kernel_h, param->kernel_w, param->stride_h, param->stride_w, + param->pad_h0, param->pad_h1, param->pad_w0, param->pad_w1); + if (param->group != 1) { - struct conv_param* param = (struct conv_param*)node->ir_node->op.param_mem; - fprintf(stdout, "K: %dx%d | S: %dx%d | P: %d %d %d %d", param->kernel_h, param->kernel_w, param->stride_h, param->stride_w, - param->pad_h0, param->pad_h1, param->pad_w0, param->pad_w1); - if(param->group != 1) - { - fprintf(stdout, " DW(%3d) ", param->group); - } - else - { - fprintf(stdout, " "); - } - break; + fprintf(stdout, " DW(%3d) ", param->group); } - case OP_DECONV: + else { - struct deconv_param* param = (struct deconv_param*)node->ir_node->op.param_mem; - fprintf(stdout, "K: %dx%d | S: %dx%d | P: %d %d %d %d", param->kernel_h, param->kernel_w, param->stride_h, param->stride_w, - param->pad_h0, param->pad_h1, param->pad_w0, param->pad_w1); - if(param->group != 1) - { - fprintf(stdout, " DW(%3d) ", param->group); - } - else - { - fprintf(stdout, " "); - } - break; + fprintf(stdout, " "); } - case OP_POOL: + break; + } + case OP_DECONV: + { + struct deconv_param* param = (struct deconv_param*)node->ir_node->op.param_mem; + fprintf(stdout, "K: %dx%d | S: %dx%d | P: %d %d %d %d", param->kernel_h, param->kernel_w, param->stride_h, param->stride_w, + param->pad_h0, param->pad_h1, param->pad_w0, param->pad_w1); + if (param->group != 1) { - struct pool_param* param = (struct pool_param*)node->ir_node->op.param_mem; - fprintf(stdout, "K: %dx%d | S: %dx%d | P: %d %d %d %d", param->kernel_h, param->kernel_w, param->stride_h, param->stride_w, - param->pad_h0, param->pad_h1, param->pad_w0, param->pad_w1); - if(param->pool_method == 0) - { - fprintf(stdout, " Max"); - } - else - { - fprintf(stdout, " Avg"); - } - break; + fprintf(stdout, " DW(%3d) ", param->group); } + else + { + fprintf(stdout, " "); + } + break; + } + case OP_POOL: + { + struct pool_param* param = (struct pool_param*)node->ir_node->op.param_mem; + fprintf(stdout, "K: %dx%d | S: %dx%d | P: %d %d %d %d", param->kernel_h, param->kernel_w, param->stride_h, param->stride_w, + param->pad_h0, param->pad_h1, param->pad_w0, param->pad_w1); + if (param->pool_method == 0) + { + fprintf(stdout, " Max"); + } + else + { + fprintf(stdout, " Avg"); + } + break; + } } if (OP_CONV == node->ir_node->op.type || OP_DECONV == node->ir_node->op.type) @@ -699,9 +694,6 @@ void extract_node_executed_time(struct subgraph* subgraph, int node_id) } } - - - double get_current_time(void) { #ifdef _MSC_VER diff --git a/source/device/cpu/cpu_dump.h b/source/device/cpu/cpu_dump.h index 23e9471a3..e916a2078 100644 --- a/source/device/cpu/cpu_dump.h +++ b/source/device/cpu/cpu_dump.h @@ -27,7 +27,6 @@ struct tensor; struct subgraph; - void extract_feature_from_tensor(const char* comment, const char* layer_name, const struct tensor* tensor); void extract_node_executed_time(struct subgraph* subgraph, int node_id); diff --git a/source/device/cpu/cpu_graph.c b/source/device/cpu/cpu_graph.c index 5136a2178..7032ed50c 100644 --- a/source/device/cpu/cpu_graph.c +++ b/source/device/cpu/cpu_graph.c @@ -39,10 +39,9 @@ #include "utility/log.h" #include "serializer/serializer.h" - static struct exec_graph* new_exec_graph(void) { - struct exec_graph* exec_graph = ( struct exec_graph* )sys_malloc(sizeof(struct exec_graph)); + struct exec_graph* exec_graph = (struct exec_graph*)sys_malloc(sizeof(struct exec_graph)); if (exec_graph == NULL) return NULL; @@ -65,16 +64,15 @@ static struct exec_graph* new_exec_graph(void) return exec_graph; } - void release_exec_graph(void* exec_graph) { - struct exec_graph* graph = ( struct exec_graph* )exec_graph; + struct exec_graph* graph = (struct exec_graph*)exec_graph; int node_num = get_vector_num(graph->exec_node_list); for (int i = 0; i < node_num; i++) { - struct exec_node* exec_node = ( struct exec_node* )get_vector_data(graph->exec_node_list, i); + struct exec_node* exec_node = (struct exec_node*)get_vector_data(graph->exec_node_list, i); struct node_ops* node_ops = exec_node->node_ops; release_exec_node(graph, exec_node, node_ops); @@ -87,7 +85,6 @@ void release_exec_graph(void* exec_graph) sys_free(graph); } - struct exec_graph* create_exec_graph(struct subgraph* subgraph, int num_thread, int mode, size_t cpu_affinity) { /* generate exec_graph */ @@ -138,19 +135,18 @@ struct exec_graph* create_exec_graph(struct subgraph* subgraph, int num_thread, return exec_graph; - error: +error: release_exec_graph(exec_graph); return NULL; } - int prerun_exec_graph(struct exec_graph* exec_graph) { int node_num = get_vector_num(exec_graph->exec_node_list); for (int i = 0; i < node_num; i++) { - struct exec_node* exec_node = ( struct exec_node* )get_vector_data(exec_graph->exec_node_list, i); + struct exec_node* exec_node = (struct exec_node*)get_vector_data(exec_graph->exec_node_list, i); struct node_ops* node_ops = exec_node->node_ops; if (node_ops->prerun && node_ops->prerun(node_ops, exec_node, exec_graph) < 0) diff --git a/source/device/cpu/cpu_graph.h b/source/device/cpu/cpu_graph.h index 0e3fc61bd..e40497843 100644 --- a/source/device/cpu/cpu_graph.h +++ b/source/device/cpu/cpu_graph.h @@ -29,24 +29,22 @@ #include - struct exec_graph { - struct vector* exec_node_list; - struct mem_pool* mem_pool; - struct cpu_device* dev; - - void* shared_mem; - int shared_mem_size; - void* shared_pack4_mem; - int shared_pack4_mem_size; - int num_thread; - int mode; - size_t cpu_affinity; - void* timer; + struct vector* exec_node_list; + struct mem_pool* mem_pool; + struct cpu_device* dev; + + void* shared_mem; + int shared_mem_size; + void* shared_pack4_mem; + int shared_pack4_mem_size; + int num_thread; + int mode; + size_t cpu_affinity; + void* timer; }; - struct exec_graph* create_exec_graph(struct subgraph* subgraph, int num_thread, int mode, size_t cpu_affinity); int prerun_exec_graph(struct exec_graph* exec_graph); diff --git a/source/device/cpu/cpu_module.c b/source/device/cpu/cpu_module.c index eda4b21c5..7f024cb09 100644 --- a/source/device/cpu/cpu_module.c +++ b/source/device/cpu/cpu_module.c @@ -45,27 +45,24 @@ #include "utility/log.h" #include "serializer/serializer.h" - static struct vector** cpu_builtin_ops_registry; -static struct vector* cpu_custom_ops_registry; +static struct vector* cpu_custom_ops_registry; #ifdef TENGINE_AUTO_LOAD_HCL void* hcl_handler = NULL; #endif - struct custom_reg_entry { int op_type; struct node_ops* node_ops; }; - static int init_builtin_ops_registry(void) { int alloc_num = 0; - cpu_builtin_ops_registry = ( struct vector** )sys_malloc(sizeof(void*) * OP_BUILTIN_LAST); + cpu_builtin_ops_registry = (struct vector**)sys_malloc(sizeof(void*) * OP_BUILTIN_LAST); if (cpu_builtin_ops_registry == NULL) return -1; @@ -83,7 +80,7 @@ static int init_builtin_ops_registry(void) return 0; - error: +error: for (int i = 0; i < alloc_num; i++) { release_vector(cpu_builtin_ops_registry[i]); @@ -148,7 +145,7 @@ static inline struct node_ops* find_builtin_node_ops(struct exec_graph* exec_gra for (int i = 0; i < num; i++) { - struct node_ops* node_ops = *( struct node_ops** )get_vector_data(ops_vector, i); + struct node_ops* node_ops = *(struct node_ops**)get_vector_data(ops_vector, i); int score = node_ops->score(node_ops, exec_graph, ir_node); @@ -199,7 +196,7 @@ int register_custom_node_ops(int op_type, struct node_ops* node_ops) for (int i = 0; i < n; i++) { - struct custom_reg_entry* entry = ( struct custom_reg_entry* )get_vector_data(cpu_custom_ops_registry, i); + struct custom_reg_entry* entry = (struct custom_reg_entry*)get_vector_data(cpu_custom_ops_registry, i); if (entry->op_type == op_type) { @@ -228,7 +225,7 @@ int unregister_custom_node_ops(int op_type, struct node_ops* node_ops) for (int i = 0; i < n; i++) { - struct custom_reg_entry* entry = ( struct custom_reg_entry* )get_vector_data(cpu_custom_ops_registry, i); + struct custom_reg_entry* entry = (struct custom_reg_entry*)get_vector_data(cpu_custom_ops_registry, i); if (entry->op_type == op_type && entry->node_ops == node_ops) { @@ -247,7 +244,7 @@ static inline struct node_ops* find_custom_node_ops(struct exec_graph* exec_grap for (int i = 0; i < n; i++) { - struct custom_reg_entry* entry = ( struct custom_reg_entry* )get_vector_data(cpu_custom_ops_registry, i); + struct custom_reg_entry* entry = (struct custom_reg_entry*)get_vector_data(cpu_custom_ops_registry, i); if (entry->op_type == op_type) return entry->node_ops; diff --git a/source/device/cpu/cpu_module.h b/source/device/cpu/cpu_module.h index 347c93dcb..50edd5f8e 100644 --- a/source/device/cpu/cpu_module.h +++ b/source/device/cpu/cpu_module.h @@ -24,11 +24,9 @@ #pragma once - struct node_ops; struct exec_graph; - int init_cpu_node_ops_registry(void); void release_cpu_node_ops_registry(void); diff --git a/source/device/cpu/cpu_node.c b/source/device/cpu/cpu_node.c index 7a14b8eac..9ea1aa72e 100644 --- a/source/device/cpu/cpu_node.c +++ b/source/device/cpu/cpu_node.c @@ -28,7 +28,6 @@ #include "graph/node.h" #include "utility/sys_port.h" - int init_exec_node(struct exec_graph* exec_graph, struct exec_node* exec_node, struct node* ir_node, struct node_ops* node_ops) { exec_node->ir_node = ir_node; @@ -44,7 +43,7 @@ int init_exec_node(struct exec_graph* exec_graph, struct exec_node* exec_node, s if (exec_node->output_num > 4) { - exec_node->block_id_ptr = ( int8_t* )sys_malloc(sizeof(int8_t) * exec_node->output_num); + exec_node->block_id_ptr = (int8_t*)sys_malloc(sizeof(int8_t) * exec_node->output_num); block_id = exec_node->block_id_ptr; } @@ -57,7 +56,6 @@ int init_exec_node(struct exec_graph* exec_graph, struct exec_node* exec_node, s return 0; } - void release_exec_node(struct exec_graph* exec_graph, struct exec_node* exec_node, struct node_ops* node_ops) { if (node_ops->release_node) diff --git a/source/device/cpu/cpu_node.h b/source/device/cpu/cpu_node.h index 8787a2929..b0c2fa575 100644 --- a/source/device/cpu/cpu_node.h +++ b/source/device/cpu/cpu_node.h @@ -29,31 +29,29 @@ #include - struct node; struct node_ops; struct exec_node; struct exec_graph; - struct exec_node { - struct node* ir_node; - struct node_ops* node_ops; - void* ops_priv; /* priv data for ops */ + struct node* ir_node; + struct node_ops* node_ops; + void* ops_priv; /* priv data for ops */ - int8_t inplace_map_num; - int8_t output_num; + int8_t inplace_map_num; + int8_t output_num; union { uint8_t* inplace_map_ptr; - uint8_t inplace_map[4]; /* opt for single inplace map, such as relu */ + uint8_t inplace_map[4]; /* opt for single inplace map, such as relu */ }; union { - int8_t block_id[4]; + int8_t block_id[4]; int8_t* block_id_ptr; }; @@ -61,7 +59,6 @@ struct exec_node int shared_pack4_mem_size; }; - struct node_ops { int (*prerun)(struct node_ops*, struct exec_node*, struct exec_graph*); diff --git a/source/device/cpu/cpu_pool.c b/source/device/cpu/cpu_pool.c index 9a848bd53..21a4917e4 100644 --- a/source/device/cpu/cpu_pool.c +++ b/source/device/cpu/cpu_pool.c @@ -36,8 +36,6 @@ #include "utility/vector.h" #include "utility/log.h" - - struct mem_record { struct tensor* ir_tensor; @@ -45,7 +43,6 @@ struct mem_record int block_id; }; - static int find_inplace_input(struct exec_node* exec_node, int output_slot, struct node* ir_node, struct graph* ir_graph) { if (exec_node->inplace_map_num == 0) @@ -79,14 +76,13 @@ static int find_inplace_input(struct exec_node* exec_node, int output_slot, stru return input_slot; } - static int find_tensor_mem_list(struct vector* tensor_mem_list, const struct tensor* ir_tensor) { int rec_number = get_vector_num(tensor_mem_list); for (int i = 0; i < rec_number; i++) { - struct mem_record* rec = ( struct mem_record* )get_vector_data(tensor_mem_list, i); + struct mem_record* rec = (struct mem_record*)get_vector_data(tensor_mem_list, i); if (rec->ir_tensor == ir_tensor) return i; @@ -95,7 +91,6 @@ static int find_tensor_mem_list(struct vector* tensor_mem_list, const struct ten return -1; } - void free_exec_graph_mem(struct exec_graph* graph) { /* free the shared memory */ @@ -121,7 +116,6 @@ void free_exec_graph_mem(struct exec_graph* graph) } } - static void mem_pool_dump(struct mem_pool* mem_pool) { int block_number = get_vector_num(mem_pool->block_list); @@ -130,7 +124,7 @@ static void mem_pool_dump(struct mem_pool* mem_pool) for (int i = 0; i < block_number; i++) { - struct mem_block_entry* entry = ( struct mem_block_entry* )get_vector_data(mem_pool->block_list, i); + struct mem_block_entry* entry = (struct mem_block_entry*)get_vector_data(mem_pool->block_list, i); TLOG_INFO("Tengine: %d: %p (%d) used: %d free: %d\n", i, entry->addr, entry->block_size, entry->alloc_count, entry->free_count); @@ -139,12 +133,12 @@ static void mem_pool_dump(struct mem_pool* mem_pool) static void* mem_pool_get_mem_block(struct mem_pool* mem_pool, int block_id) { - struct mem_block_entry* entry = ( struct mem_block_entry* )get_vector_data(mem_pool->block_list, block_id); + struct mem_block_entry* entry = (struct mem_block_entry*)get_vector_data(mem_pool->block_list, block_id); size_t addr = (size_t)(entry->addr); size_t aligned_addr = (addr + 4 + mem_pool->align_size) & (~(mem_pool->align_size - 1)); - return ( void* )aligned_addr; + return (void*)aligned_addr; } static int mem_pool_get_backend_mem(struct mem_pool* mem_pool) @@ -153,7 +147,7 @@ static int mem_pool_get_backend_mem(struct mem_pool* mem_pool) for (int i = 0; i < block_num; i++) { - struct mem_block_entry* entry = ( struct mem_block_entry* )get_vector_data(mem_pool->block_list, i); + struct mem_block_entry* entry = (struct mem_block_entry*)get_vector_data(mem_pool->block_list, i); entry->block_size = entry->max_req_size + mem_pool->align_size + 128; @@ -173,7 +167,7 @@ static int mem_pool_allocate(struct mem_pool* mem_pool, int size) for (int i = 0; i < block_num; i++) { - struct mem_block_entry* entry = ( struct mem_block_entry* )get_vector_data(mem_pool->block_list, i); + struct mem_block_entry* entry = (struct mem_block_entry*)get_vector_data(mem_pool->block_list, i); if (entry->free_count != entry->alloc_count) continue; @@ -202,15 +196,13 @@ static int mem_pool_allocate(struct mem_pool* mem_pool, int size) return block_num; } - static void mem_pool_free(struct mem_pool* mem_pool, int block_id) { - struct mem_block_entry* block = ( struct mem_block_entry* )get_vector_data(mem_pool->block_list, block_id); + struct mem_block_entry* block = (struct mem_block_entry*)get_vector_data(mem_pool->block_list, block_id); block->free_count++; } - void release_mem_pool(struct mem_pool* mem_pool) { if (mem_pool->block_list != NULL) @@ -219,7 +211,7 @@ void release_mem_pool(struct mem_pool* mem_pool) for (int i = 0; i < block_num; i++) { - struct mem_block_entry* entry = ( struct mem_block_entry* )get_vector_data(mem_pool->block_list, i); + struct mem_block_entry* entry = (struct mem_block_entry*)get_vector_data(mem_pool->block_list, i); sys_free(entry->addr); } @@ -230,10 +222,9 @@ void release_mem_pool(struct mem_pool* mem_pool) sys_free(mem_pool); } - static struct mem_pool* create_mem_pool(void) { - struct mem_pool* mem_pool = ( struct mem_pool* )sys_malloc(sizeof(struct mem_pool)); + struct mem_pool* mem_pool = (struct mem_pool*)sys_malloc(sizeof(struct mem_pool)); if (mem_pool == NULL) return NULL; @@ -252,7 +243,7 @@ static struct mem_pool* create_mem_pool(void) return mem_pool; - error: +error: release_mem_pool(mem_pool); @@ -281,7 +272,7 @@ int alloc_exec_graph_mem(struct exec_graph* exec_graph) for (int i = 0; i < node_num; i++) { - struct exec_node* exec_node = ( struct exec_node* )get_vector_data(exec_graph->exec_node_list, i); + struct exec_node* exec_node = (struct exec_node*)get_vector_data(exec_graph->exec_node_list, i); struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; @@ -311,7 +302,7 @@ int alloc_exec_graph_mem(struct exec_graph* exec_graph) if (idx < 0) continue; - struct mem_record* input_r = ( struct mem_record* )get_vector_data(tensor_mem_list, idx); + struct mem_record* input_r = (struct mem_record*)get_vector_data(tensor_mem_list, idx); input_r->ir_tensor = ir_tensor; input_r->used = ir_tensor->consumer_num; @@ -346,7 +337,7 @@ int alloc_exec_graph_mem(struct exec_graph* exec_graph) if (idx < 0) continue; - struct mem_record* input_r = ( struct mem_record* )get_vector_data(tensor_mem_list, idx); + struct mem_record* input_r = (struct mem_record*)get_vector_data(tensor_mem_list, idx); input_r->used--; @@ -406,7 +397,7 @@ int alloc_exec_graph_mem(struct exec_graph* exec_graph) /* now, the real allocate */ for (int i = 0; i < node_num; i++) { - struct exec_node* exec_node = ( struct exec_node* )get_vector_data(exec_graph->exec_node_list, i); + struct exec_node* exec_node = (struct exec_node*)get_vector_data(exec_graph->exec_node_list, i); struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; struct mem_pool* local_mem_pool = exec_graph->mem_pool; diff --git a/source/device/cpu/cpu_pool.h b/source/device/cpu/cpu_pool.h index 0037fbfed..126a09633 100644 --- a/source/device/cpu/cpu_pool.h +++ b/source/device/cpu/cpu_pool.h @@ -31,7 +31,6 @@ struct exec_graph; - struct mem_block_entry { void* addr; @@ -46,14 +45,13 @@ struct mem_pool uint8_t align_size; /* must be 2^n */ struct vector* block_list; - int (*get_backend_mem)(struct mem_pool*); + int (*get_backend_mem)(struct mem_pool*); void* (*get_mem_block)(struct mem_pool*, int block_id); - int (*allocate)(struct mem_pool*, int size); - void (*free)(struct mem_pool*, int block_id); - void (*dump)(struct mem_pool*); + int (*allocate)(struct mem_pool*, int size); + void (*free)(struct mem_pool*, int block_id); + void (*dump)(struct mem_pool*); }; - void release_mem_pool(struct mem_pool* mem_pool); int alloc_exec_graph_mem(struct exec_graph* exec_graph); void free_exec_graph_mem(struct exec_graph* graph); diff --git a/source/device/cpu/op/absval/absval_ref.c b/source/device/cpu/op/absval/absval_ref.c index 925b0fd82..973bbae6d 100644 --- a/source/device/cpu/op/absval/absval_ref.c +++ b/source/device/cpu/op/absval/absval_ref.c @@ -33,25 +33,21 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; } - static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; } - static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; } - static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -62,8 +58,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - float* idata = ( float* )input_tensor->data; - float* odata = ( float* )output_tensor->data; + float* idata = (float*)input_tensor->data; + float* odata = (float*)output_tensor->data; for (uint32_t i = 0; i < output_tensor->elem_num; i++) { @@ -75,7 +71,6 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex return 0; } - static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) { struct node* ir_node = exec_node; @@ -90,7 +85,6 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } - static struct node_ops hcl_node_ops = {.prerun = prerun, .run = run, .reshape = NULL, @@ -104,7 +98,6 @@ int register_absval_ref_op() return register_builtin_node_ops(OP_ABSVAL, &hcl_node_ops); } - int unregister_absval_ref_op() { return unregister_builtin_node_ops(OP_ABSVAL, &hcl_node_ops); diff --git a/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c b/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c index 960f1f198..c01c37a0c 100644 --- a/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c +++ b/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c @@ -35,25 +35,21 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; } - static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; } - static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; } - static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -64,8 +60,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - float* idata = ( float* )input_tensor->data; - float* odata = ( float* )output_tensor->data; + float* idata = (float*)input_tensor->data; + float* odata = (float*)output_tensor->data; int channel_num = input_tensor->dims[1]; int batch_number = input_tensor->dims[0]; @@ -99,7 +95,6 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex return 0; } - static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) { struct node* ir_node = exec_node; @@ -114,7 +109,6 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } - static struct node_ops hcl_node_ops = {.prerun = prerun, .run = run, .reshape = NULL, @@ -123,13 +117,11 @@ static struct node_ops hcl_node_ops = {.prerun = prerun, .release_node = release_node, .score = score}; - int register_absval_hcl_arm_op() { return register_builtin_node_ops(OP_ABSVAL, &hcl_node_ops); } - int unregister_absval_hcl_arm_op() { return unregister_builtin_node_ops(OP_ABSVAL, &hcl_node_ops); diff --git a/source/device/cpu/op/add_n/add_n_ref.c b/source/device/cpu/op/add_n/add_n_ref.c index 69b9d54dc..559b6cc44 100644 --- a/source/device/cpu/op/add_n/add_n_ref.c +++ b/source/device/cpu/op/add_n/add_n_ref.c @@ -33,7 +33,6 @@ #include - struct add_n_op_param { int in_num; @@ -56,7 +55,7 @@ static int ref_add_n_fp32(const float** input, float* output, int size, const st static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct add_n_op_param* add_n_op_param = ( struct add_n_op_param* )sys_malloc(sizeof(struct add_n_op_param)); + struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)sys_malloc(sizeof(struct add_n_op_param)); exec_node->ops_priv = add_n_op_param; return 0; @@ -72,12 +71,12 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct { struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; - struct add_n_op_param* add_n_op_param = ( struct add_n_op_param* )exec_node->ops_priv; + struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)exec_node->ops_priv; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); int in_num = ir_node->input_num; add_n_op_param->in_num = in_num; - add_n_op_param->input_data = ( void** )sys_malloc(sizeof(void*) * in_num); + add_n_op_param->input_data = (void**)sys_malloc(sizeof(void*) * in_num); return 0; } @@ -90,27 +89,27 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); uint32_t elem_num = input_tensor_a->elem_num; - struct add_n_op_param* add_n_op_param = ( struct add_n_op_param* )exec_node->ops_priv; + struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)exec_node->ops_priv; for (int i = 0; i < add_n_op_param->in_num; i++) { struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[i]); void* data = input_tensor->data; add_n_op_param->input_data[i] = data; } - const void** input = ( const void** )add_n_op_param->input_data; + const void** input = (const void**)add_n_op_param->input_data; float* output = (float*)output_tensor->data; for (uint32_t i = 0; i < elem_num; i++) { output[i] = 0; } - ref_add_n_fp32(( const float** )input, output, elem_num, add_n_op_param); + ref_add_n_fp32((const float**)input, output, elem_num, add_n_op_param); return 0; } static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct add_n_op_param* add_n_op_param = ( struct add_n_op_param* )exec_node->ops_priv; + struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)exec_node->ops_priv; sys_free(add_n_op_param->input_data); return 0; diff --git a/source/device/cpu/op/argmax/argmax_ref.c b/source/device/cpu/op/argmax/argmax_ref.c index 413f982a5..ba8898a38 100644 --- a/source/device/cpu/op/argmax/argmax_ref.c +++ b/source/device/cpu/op/argmax/argmax_ref.c @@ -36,7 +36,6 @@ #include - struct argmax_op_param { int axis; @@ -112,7 +111,7 @@ static int ref_argmax_uint8(uint8_t* input, int* output, const struct argmax_op_ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct argmax_op_param* argmax_op_param = ( struct argmax_op_param* )sys_malloc(sizeof(struct argmax_op_param)); + struct argmax_op_param* argmax_op_param = (struct argmax_op_param*)sys_malloc(sizeof(struct argmax_op_param)); argmax_op_param->axis = 0; argmax_op_param->axis_size = 1; argmax_op_param->inner_size = 1; @@ -137,8 +136,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct argmax_op_param* argmax_op_param = ( struct argmax_op_param* )exec_node->ops_priv; - struct argmax_param* argmax_param = ( struct argmax_param* )ir_node->op.param_mem; + struct argmax_op_param* argmax_op_param = (struct argmax_op_param*)exec_node->ops_priv; + struct argmax_param* argmax_param = (struct argmax_param*)ir_node->op.param_mem; argmax_op_param->axis = argmax_param->axis; argmax_op_param->keepdims = argmax_param->keepdims; argmax_op_param->axis_size = input_tensor->dims[argmax_param->axis]; @@ -174,15 +173,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex void* in_data = input_tensor->data; void* out_data = output_tensor->data; - struct argmax_op_param* argmax_op_param = ( struct argmax_op_param* )exec_node->ops_priv; + struct argmax_op_param* argmax_op_param = (struct argmax_op_param*)exec_node->ops_priv; TLOG_ERR("output_tensor->elem_num:%d\n", output_tensor->elem_num); TLOG_ERR("output_tensor->elem_size:%d\n", output_tensor->elem_size); if (input_tensor->data_type == TENGINE_DT_FP32) - ref_argmax_fp32(( float* )in_data, (int*)out_data, argmax_op_param); - else if(input_tensor->data_type == TENGINE_DT_UINT8) - ref_argmax_uint8(( uint8_t* )in_data, (int*)out_data, argmax_op_param); + ref_argmax_fp32((float*)in_data, (int*)out_data, argmax_op_param); + else if (input_tensor->data_type == TENGINE_DT_UINT8) + ref_argmax_uint8((uint8_t*)in_data, (int*)out_data, argmax_op_param); return 0; } diff --git a/source/device/cpu/op/argmin/argmin_ref.c b/source/device/cpu/op/argmin/argmin_ref.c index 730bd4155..58da946b0 100644 --- a/source/device/cpu/op/argmin/argmin_ref.c +++ b/source/device/cpu/op/argmin/argmin_ref.c @@ -36,7 +36,6 @@ #include - struct argmin_op_param { int axis; @@ -112,7 +111,7 @@ static int ref_argmin_uint8(uint8_t* input, int* output, const struct argmin_op_ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct argmin_op_param* argmin_op_param = ( struct argmin_op_param* )sys_malloc(sizeof(struct argmin_op_param)); + struct argmin_op_param* argmin_op_param = (struct argmin_op_param*)sys_malloc(sizeof(struct argmin_op_param)); argmin_op_param->axis = 0; argmin_op_param->axis_size = 1; argmin_op_param->inner_size = 1; @@ -137,8 +136,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct argmin_op_param* argmin_op_param = ( struct argmin_op_param* )exec_node->ops_priv; - struct argmin_param* argmin_param = ( struct argmin_param* )ir_node->op.param_mem; + struct argmin_op_param* argmin_op_param = (struct argmin_op_param*)exec_node->ops_priv; + struct argmin_param* argmin_param = (struct argmin_param*)ir_node->op.param_mem; argmin_op_param->axis = argmin_param->axis; argmin_op_param->keepdims = argmin_param->keepdims; argmin_op_param->axis_size = input_tensor->dims[argmin_param->axis]; @@ -174,15 +173,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex void* in_data = input_tensor->data; void* out_data = output_tensor->data; - struct argmin_op_param* argmin_op_param = ( struct argmin_op_param* )exec_node->ops_priv; + struct argmin_op_param* argmin_op_param = (struct argmin_op_param*)exec_node->ops_priv; TLOG_ERR("output_tensor->elem_num:%d\n", output_tensor->elem_num); TLOG_ERR("output_tensor->elem_size:%d\n", output_tensor->elem_size); if (input_tensor->data_type == TENGINE_DT_FP32) - ref_argmin_fp32(( float* )in_data, (int*)out_data, argmin_op_param); - else if(input_tensor->data_type == TENGINE_DT_UINT8) - ref_argmin_uint8(( uint8_t* )in_data, (int*)out_data, argmin_op_param); + ref_argmin_fp32((float*)in_data, (int*)out_data, argmin_op_param); + else if (input_tensor->data_type == TENGINE_DT_UINT8) + ref_argmin_uint8((uint8_t*)in_data, (int*)out_data, argmin_op_param); return 0; } diff --git a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref.h b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref.h index 59f499d14..f655e481f 100644 --- a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref.h +++ b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref.h @@ -25,7 +25,6 @@ #ifndef __BATCHNORM_KERNEL_REF_H__ #define __BATCHNORM_KERNEL_REF_H__ - #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" diff --git a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_fp32.c b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_fp32.c index b06390e88..de8c49d84 100644 --- a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_fp32.c +++ b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_fp32.c @@ -36,7 +36,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - int ref_batchnorm_fp32(float* input, float* output, const struct ref_batchnorm_param* param) { float* scale_mean = param->scale_mean; diff --git a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_uint8.c b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_uint8.c index e129ae1ce..76d2414fe 100644 --- a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_uint8.c +++ b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_uint8.c @@ -36,7 +36,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - int ref_batchnorm_uint8(struct tensor* input_tensor, struct tensor* output_tensor, const struct ref_batchnorm_param* param) { float* scale_mean = param->scale_mean; @@ -55,9 +54,9 @@ int ref_batchnorm_uint8(struct tensor* input_tensor, struct tensor* output_tenso int32_t input_zero = input_tensor->zero_point; int32_t output_zero = output_tensor->zero_point; - float* data_fp32 = (float*) sys_malloc(total_size * sizeof(float)); - for(int i = 0; i < total_size; i++) - data_fp32[i] = ((float) input_uint8[i] - (float)input_zero) * input_scale; + float* data_fp32 = (float*)sys_malloc(total_size * sizeof(float)); + for (int i = 0; i < total_size; i++) + data_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; for (int n = 0; n < param->input_n; ++n) { @@ -87,7 +86,7 @@ int ref_batchnorm_uint8(struct tensor* input_tensor, struct tensor* output_tenso } // quant - for(int i=0; i 255) diff --git a/source/device/cpu/op/batchnorm/batchnorm_ref.c b/source/device/cpu/op/batchnorm/batchnorm_ref.c index fd8dade7c..25f381310 100644 --- a/source/device/cpu/op/batchnorm/batchnorm_ref.c +++ b/source/device/cpu/op/batchnorm/batchnorm_ref.c @@ -38,11 +38,9 @@ #include "batchnorm_kernel_ref.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct ref_batchnorm_param* batchnorm_op_param = - ( struct ref_batchnorm_param* )sys_malloc(sizeof(struct ref_batchnorm_param)); + struct ref_batchnorm_param* batchnorm_op_param = (struct ref_batchnorm_param*)sys_malloc(sizeof(struct ref_batchnorm_param)); memset(batchnorm_op_param, 0, sizeof(struct ref_batchnorm_param)); exec_node->ops_priv = batchnorm_op_param; return 0; @@ -63,15 +61,15 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* mean_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[3]); struct tensor* var_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[4]); - struct ref_batchnorm_param* op_param = ( struct ref_batchnorm_param* )exec_node->ops_priv; - struct batchnorm_param* batchnorm_param = ( struct batchnorm_param* )ir_node->op.param_mem; + struct ref_batchnorm_param* op_param = (struct ref_batchnorm_param*)exec_node->ops_priv; + struct batchnorm_param* batchnorm_param = (struct batchnorm_param*)ir_node->op.param_mem; int channel_num = input_tensor->dims[1]; - float* scale_mean = ( float* )sys_malloc(channel_num * sizeof(float)); - float* scale_var_inv = ( float* )sys_malloc(channel_num * sizeof(float)); - const float* mean = ( const float* )mean_tensor->data; - const float* var = ( const float* )var_tensor->data; + float* scale_mean = (float*)sys_malloc(channel_num * sizeof(float)); + float* scale_var_inv = (float*)sys_malloc(channel_num * sizeof(float)); + const float* mean = (const float*)mean_tensor->data; + const float* var = (const float*)var_tensor->data; float rescale_factor; float eps = batchnorm_param->eps; @@ -81,9 +79,9 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct for (int c = 0; c < channel_num; c++) { float tmp = sqrtf(var[c] * rescale_factor + eps); - scale_var_inv[c] = ( float )(1.f / tmp); + scale_var_inv[c] = (float)(1.f / tmp); tmp = rescale_factor * scale_var_inv[c]; - scale_mean[c] = ( float )(-mean[c] * tmp); + scale_mean[c] = (float)(-mean[c] * tmp); } float* gamma = NULL; float* beta = NULL; @@ -91,8 +89,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct { const struct tensor* gamma_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); const struct tensor* beta_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); - gamma = ( float* )gamma_tensor->data; - beta = ( float* )beta_tensor->data; + gamma = (float*)gamma_tensor->data; + beta = (float*)beta_tensor->data; } int layout = ir_graph->graph_layout; op_param->iscaffe = batchnorm_param->caffe_flavor; @@ -112,7 +110,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct ref_batchnorm_param* batchnorm_op_param = ( struct ref_batchnorm_param* )exec_node->ops_priv; + struct ref_batchnorm_param* batchnorm_op_param = (struct ref_batchnorm_param*)exec_node->ops_priv; void* out_data = output_tensor->data; void* input = input_tensor->data; @@ -134,7 +132,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex { return -1; } - + int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_batchnorm_fp32((float*)input, (float*)out_data, batchnorm_op_param); @@ -146,7 +144,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct ref_batchnorm_param* batchnorm_op_param = ( struct ref_batchnorm_param* )exec_node->ops_priv; + struct ref_batchnorm_param* batchnorm_op_param = (struct ref_batchnorm_param*)exec_node->ops_priv; sys_free(batchnorm_op_param->scale_mean); sys_free(batchnorm_op_param->scale_var_inv); diff --git a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c index f455f1f67..359b14ee5 100644 --- a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c +++ b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c @@ -35,11 +35,9 @@ #include #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct hcl_batchnorm_param* op_param = - ( struct hcl_batchnorm_param* )sys_malloc(sizeof(struct hcl_batchnorm_param)); + struct hcl_batchnorm_param* op_param = (struct hcl_batchnorm_param*)sys_malloc(sizeof(struct hcl_batchnorm_param)); memset(op_param, 0, sizeof(struct hcl_batchnorm_param)); exec_node->ops_priv = op_param; return 0; @@ -61,13 +59,13 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct int channel_num = mean_tensor->dims[0]; - float* scale_mean = ( float* )sys_malloc(channel_num * sizeof(float)); - float* scale_var_inv = ( float* )sys_malloc(channel_num * sizeof(float)); + float* scale_mean = (float*)sys_malloc(channel_num * sizeof(float)); + float* scale_var_inv = (float*)sys_malloc(channel_num * sizeof(float)); - const float* mean = ( const float* )mean_tensor->data; - const float* var = ( const float* )var_tensor->data; + const float* mean = (const float*)mean_tensor->data; + const float* var = (const float*)var_tensor->data; - struct batchnorm_param* batchnorm_param = ( struct batchnorm_param* )ir_node->op.param_mem; + struct batchnorm_param* batchnorm_param = (struct batchnorm_param*)ir_node->op.param_mem; float rescale_factor; float eps = batchnorm_param->eps; @@ -76,16 +74,16 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct for (int c = 0; c < channel_num; c++) { float tmp = sqrt(var[c] * rescale_factor + eps); - scale_var_inv[c] = ( float )(1.f / tmp); + scale_var_inv[c] = (float)(1.f / tmp); tmp = rescale_factor * scale_var_inv[c]; - scale_mean[c] = ( float )(-mean[c] * tmp); + scale_mean[c] = (float)(-mean[c] * tmp); } if (!batchnorm_param->caffe_flavor) { const struct tensor* gamma_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); const struct tensor* beta_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); - const float* gamma = ( const float* )gamma_tensor->data; - const float* beta = ( const float* )beta_tensor->data; + const float* gamma = (const float*)gamma_tensor->data; + const float* beta = (const float*)beta_tensor->data; for (int c = 0; c < channel_num; c++) { scale_var_inv[c] *= gamma[c]; @@ -94,7 +92,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct } } - struct hcl_batchnorm_param* op_param = ( struct hcl_batchnorm_param* )exec_node->ops_priv; + struct hcl_batchnorm_param* op_param = (struct hcl_batchnorm_param*)exec_node->ops_priv; op_param->scale_mean = scale_mean; op_param->scale_var_inv = scale_var_inv; @@ -111,7 +109,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct hcl_batchnorm_param* op_param = ( struct hcl_batchnorm_param* )exec_node->ops_priv; + struct hcl_batchnorm_param* op_param = (struct hcl_batchnorm_param*)exec_node->ops_priv; float* scale_mean = op_param->scale_mean; float* scale_var_inv = op_param->scale_var_inv; int num_thread = exec_graph->num_thread; @@ -123,7 +121,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct hcl_batchnorm_param* op_param = ( struct hcl_batchnorm_param* )exec_node->ops_priv; + struct hcl_batchnorm_param* op_param = (struct hcl_batchnorm_param*)exec_node->ops_priv; sys_free(op_param->scale_mean); sys_free(op_param->scale_var_inv); diff --git a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.c b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.c index cd9a5835c..181648a08 100644 --- a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.c +++ b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.c @@ -26,11 +26,10 @@ #include - static void batchnorm_kernel(int i, int id, void* data, const float* input, float* output, float* scale_mean, float* scale_var, int channel_size, int num_thread) { - int step = (( int* )data)[0]; + int step = ((int*)data)[0]; #pragma omp parallel for num_threads(num_thread) for (int c = 0; c < step; c++) @@ -68,11 +67,11 @@ int batchnorm_run(struct tensor* output_tensor, struct tensor* input_tensor, flo int channel_size = (input_tensor->dims[2]) * (input_tensor->dims[3]); int img_size = channel_num * channel_size; - const float* input = ( const float* )input_tensor->data; - float* output = ( float* )output_tensor->data; + const float* input = (const float*)input_tensor->data; + float* output = (float*)output_tensor->data; - float* scale_mean_t = ( float* )scale_mean; - float* scale_var_inv_t = ( float* )scale_var_inv; + float* scale_mean_t = (float*)scale_mean; + float* scale_var_inv_t = (float*)scale_var_inv; /* only use mean and var */ for (int i = 0; i < batch_number; i++) diff --git a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.h b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.h index 9bff1df8b..58a3d3507 100644 --- a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.h +++ b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.h @@ -29,7 +29,6 @@ #include "graph/node.h" #include "graph/graph.h" - struct hcl_batchnorm_param { float* scale_mean; diff --git a/source/device/cpu/op/batchtospacend/batchtospacend_ref.c b/source/device/cpu/op/batchtospacend/batchtospacend_ref.c index 770cabb76..9c9aa6044 100644 --- a/source/device/cpu/op/batchtospacend/batchtospacend_ref.c +++ b/source/device/cpu/op/batchtospacend/batchtospacend_ref.c @@ -36,7 +36,6 @@ #include #include - static int ref_batchtospacend_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct batchtospacend_param* param, int num_thread) { @@ -62,25 +61,21 @@ static int ref_batchtospacend_fp32(struct tensor* input_tensor, struct tensor* o const int spatial_offset = (int)roundf(in_batch / out_dims[0]); for (int in_h = 0; in_h < in_dims[1]; ++in_h) { - const int out_h = - (int)roundf(in_h * (param->dilation_y) + spatial_offset / (param->dilation_x) - param->crop_top); + const int out_h = (int)roundf(in_h * (param->dilation_y) + spatial_offset / (param->dilation_x) - param->crop_top); if (out_h < 0 || out_h >= out_dims[1]) continue; for (int in_w = 0; in_w < in_dims[2]; ++in_w) { - const int out_w = - (int)roundf(in_w * param->dilation_x + spatial_offset % param->dilation_x - param->crop_left); + const int out_w = (int)roundf(in_w * param->dilation_x + spatial_offset % param->dilation_x - param->crop_left); if (out_w < 0 || out_w >= out_dims[2]) continue; - int outOffset = (int)roundf(out_batch * out_dims[1] * out_dims[2] * out_dims[3] + - out_h * out_dims[2] * out_dims[3] + out_w * in_dims[3]); + int outOffset = (int)roundf(out_batch * out_dims[1] * out_dims[2] * out_dims[3] + out_h * out_dims[2] * out_dims[3] + out_w * in_dims[3]); float* out = out_data + outOffset; - int inOffset = (int)roundf(in_batch * in_dims[1] * in_dims[2] * in_dims[3] + in_h * in_dims[2] * in_dims[3] + - in_w * in_dims[3]); + int inOffset = (int)roundf(in_batch * in_dims[1] * in_dims[2] * in_dims[3] + in_h * in_dims[2] * in_dims[3] + in_w * in_dims[3]); const float* in = in_data + inOffset; memcpy(out, in, in_dims[3] * sizeof(float)); } @@ -109,7 +104,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct batchtospacend_param* batchtospacend_param = ( struct batchtospacend_param* )ir_node->op.param_mem; + struct batchtospacend_param* batchtospacend_param = (struct batchtospacend_param*)ir_node->op.param_mem; ref_batchtospacend_fp32(input_tensor, output_tensor, batchtospacend_param, exec_graph->num_thread); diff --git a/source/device/cpu/op/bias/bias_ref.c b/source/device/cpu/op/bias/bias_ref.c index c9653295c..2eb39c085 100644 --- a/source/device/cpu/op/bias/bias_ref.c +++ b/source/device/cpu/op/bias/bias_ref.c @@ -33,7 +33,6 @@ #include - int ref_bias_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* bias_tensor, int num_thread) { diff --git a/source/device/cpu/op/broadmul/broadmul_ref.c b/source/device/cpu/op/broadmul/broadmul_ref.c index 7d91e9f01..92ed72a28 100644 --- a/source/device/cpu/op/broadmul/broadmul_ref.c +++ b/source/device/cpu/op/broadmul/broadmul_ref.c @@ -33,7 +33,6 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -151,4 +150,3 @@ int unregister_broadmul_ref_op() { return unregister_builtin_node_ops(OP_BROADMUL, &hcl_node_ops); } - diff --git a/source/device/cpu/op/cast/cast_ref.c b/source/device/cpu/op/cast/cast_ref.c index 8fcb58772..9eb88fb16 100644 --- a/source/device/cpu/op/cast/cast_ref.c +++ b/source/device/cpu/op/cast/cast_ref.c @@ -37,7 +37,6 @@ #include #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -69,7 +68,6 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int num_thread = exec_graph->num_thread; - if (input_tensor->elem_num != output_tensor->elem_num || input_tensor->dim_num != output_tensor->dim_num) { return -1; diff --git a/source/device/cpu/op/ceil/ceil_ref.c b/source/device/cpu/op/ceil/ceil_ref.c index 0602ec448..95cc44f39 100644 --- a/source/device/cpu/op/ceil/ceil_ref.c +++ b/source/device/cpu/op/ceil/ceil_ref.c @@ -35,7 +35,6 @@ #include - int ref_ceil_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { // dims size = 2 or 3 @@ -84,7 +83,7 @@ int ref_ceil_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int int ref_ceil_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { - /* dequant */ + /* dequant */ uint8_t* input_uint8 = (uint8_t*)input_tensor->data; uint8_t* output_uint8 = (uint8_t*)output_tensor->data; float input_scale = input_tensor->scale; @@ -94,12 +93,12 @@ int ref_ceil_uint8(struct tensor* input_tensor, struct tensor* output_tensor, in int input_size = input_tensor->elem_num; int output_size = output_tensor->elem_num; - float* input_data = ( float* )sys_malloc(input_size * sizeof(float)); - float* out_data = ( float* )sys_malloc(output_size * sizeof(float)); + float* input_data = (float*)sys_malloc(input_size * sizeof(float)); + float* out_data = (float*)sys_malloc(output_size * sizeof(float)); for (int i = 0; i < input_size; i++) { - input_data[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale; + input_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } // dims size = 2 or 3 @@ -180,7 +179,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_ceil_fp32(input_tensor, output_tensor, exec_graph->num_thread); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_ceil_uint8(input_tensor, output_tensor, exec_graph->num_thread); else TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type); diff --git a/source/device/cpu/op/clip/clip_kernel_ref.h b/source/device/cpu/op/clip/clip_kernel_ref.h index 9a2898884..efdd67877 100644 --- a/source/device/cpu/op/clip/clip_kernel_ref.h +++ b/source/device/cpu/op/clip/clip_kernel_ref.h @@ -25,12 +25,10 @@ #ifndef __CLIP_KERNEL_REF_H__ #define __CLIP_KERNEL_REF_H__ - #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" - int ref_clip_fp32(struct tensor* input_tensor, struct tensor* output_tensor, float max, float min); int ref_clip_uint8(struct tensor* input_tensor, struct tensor* output_tensor, float max, float min); diff --git a/source/device/cpu/op/clip/clip_kernel_ref_fp32.c b/source/device/cpu/op/clip/clip_kernel_ref_fp32.c index 53f688e40..ba2b46bad 100644 --- a/source/device/cpu/op/clip/clip_kernel_ref_fp32.c +++ b/source/device/cpu/op/clip/clip_kernel_ref_fp32.c @@ -36,7 +36,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - int ref_clip_fp32(struct tensor* input_tensor, struct tensor* output_tensor, float max, float min) { int total_size = input_tensor->elem_num; diff --git a/source/device/cpu/op/clip/clip_kernel_ref_uint8.c b/source/device/cpu/op/clip/clip_kernel_ref_uint8.c index 064954335..c7f33a538 100644 --- a/source/device/cpu/op/clip/clip_kernel_ref_uint8.c +++ b/source/device/cpu/op/clip/clip_kernel_ref_uint8.c @@ -38,12 +38,11 @@ #include - int ref_clip_uint8(struct tensor* input_tensor, struct tensor* output_tensor, float max, float min) { int total_size = input_tensor->elem_num; - uint8_t* input_uint8 = ( uint8_t* )input_tensor->data; - uint8_t* output_uint8 = ( uint8_t* )output_tensor->data; + uint8_t* input_uint8 = (uint8_t*)input_tensor->data; + uint8_t* output_uint8 = (uint8_t*)output_tensor->data; float input_scale = input_tensor->scale; float output_scale = output_tensor->scale; @@ -51,11 +50,11 @@ int ref_clip_uint8(struct tensor* input_tensor, struct tensor* output_tensor, fl int output_zero = output_tensor->zero_point; /* input dequant */ - float* input_fp32 = ( float* )sys_malloc(total_size * sizeof(float)); - float* output_fp32 = ( float* )sys_malloc(total_size * sizeof(float)); + float* input_fp32 = (float*)sys_malloc(total_size * sizeof(float)); + float* output_fp32 = (float*)sys_malloc(total_size * sizeof(float)); for (uint32_t i = 0; i < input_tensor->elem_num; i++) - input_fp32[i] = ((float )input_uint8[i] - (float )input_zero) * input_scale; + input_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; for (int i = 0; i < total_size; i++) { @@ -75,7 +74,7 @@ int ref_clip_uint8(struct tensor* input_tensor, struct tensor* output_tensor, fl } sys_free(input_fp32); - sys_free(output_fp32); + sys_free(output_fp32); return 0; } diff --git a/source/device/cpu/op/clip/clip_ref.c b/source/device/cpu/op/clip/clip_ref.c index 09c6c1c41..b29962c19 100644 --- a/source/device/cpu/op/clip/clip_ref.c +++ b/source/device/cpu/op/clip/clip_ref.c @@ -36,7 +36,6 @@ #include "clip_kernel_ref.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -59,7 +58,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct clip_param* clip_param = ( struct clip_param* )ir_node->op.param_mem; + struct clip_param* clip_param = (struct clip_param*)ir_node->op.param_mem; float max = clip_param->max; float min = clip_param->min; diff --git a/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c b/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c index 47a5c25f0..bfa3e4b70 100644 --- a/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c +++ b/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c @@ -24,7 +24,6 @@ #include "comparison_kernel_ref.h" - void comp_equal(int input_hw, int input_hw_1, int input_count4, int input1_count4, float* input0, float* input1, p_comparison_param param, float* output) { @@ -418,33 +417,39 @@ int ref_comparison_fp32(float* input0, float* input1, float* output, p_compariso switch (param->type) { - case 0: { - comp_equal(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output); - break; - } - case 1: { - comp_nequal(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output); - break; - } - case 2: { - comp_greater(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output); - break; - } - case 3: { - comp_greatere(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output); - break; - } - case 4: { - comp_less(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output); - break; - } - case 5: { - comp_lesse(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output); - break; - } - default: - return -1; - break; + case 0: + { + comp_equal(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output); + break; + } + case 1: + { + comp_nequal(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output); + break; + } + case 2: + { + comp_greater(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output); + break; + } + case 3: + { + comp_greatere(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output); + break; + } + case 4: + { + comp_less(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output); + break; + } + case 5: + { + comp_lesse(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output); + break; + } + default: + return -1; + break; } return 0; } diff --git a/source/device/cpu/op/comparison/comparison_ref.c b/source/device/cpu/op/comparison/comparison_ref.c index b583e7252..14405732c 100644 --- a/source/device/cpu/op/comparison/comparison_ref.c +++ b/source/device/cpu/op/comparison/comparison_ref.c @@ -38,7 +38,6 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -63,7 +62,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor1 = get_ir_graph_tensor(graph, node->input_tensors[1]); struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct comparison_param* param = ( struct comparison_param* )node->op.param_mem; + struct comparison_param* param = (struct comparison_param*)node->op.param_mem; void* input0 = input_tensor->data; void* input1 = input_tensor1->data; diff --git a/source/device/cpu/op/concat/concat_kernel_ref.h b/source/device/cpu/op/concat/concat_kernel_ref.h index 2e0c71a2b..d078e1bc0 100644 --- a/source/device/cpu/op/concat/concat_kernel_ref.h +++ b/source/device/cpu/op/concat/concat_kernel_ref.h @@ -25,12 +25,10 @@ #ifndef __CONCAT_KERNEL_REF_H__ #define __CONCAT_KERNEL_REF_H__ - #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" - int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis); int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis); diff --git a/source/device/cpu/op/concat/concat_kernel_ref_fp32.c b/source/device/cpu/op/concat/concat_kernel_ref_fp32.c index 1d220cd18..d36733ea7 100644 --- a/source/device/cpu/op/concat/concat_kernel_ref_fp32.c +++ b/source/device/cpu/op/concat/concat_kernel_ref_fp32.c @@ -36,7 +36,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis) { struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); @@ -45,10 +44,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis) { struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - float* input_data = (float*)input_tensor->data; + float* input_data = (float*)input_tensor->data; float* output_data = (float*)output_tensor->data; - for(int i=0; ielem_num; i++) + for (int i = 0; i < input_tensor->elem_num; i++) output_data[i] = input_data[i]; return 0; @@ -67,10 +66,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis) int size = input_tensor->elem_num; - float* input_data = (float*)input_tensor->data; + float* input_data = (float*)input_tensor->data; float* output_data = (float*)output_tensor->data + output_step; - for (int i=0; iinput_num; num++) { - struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); int size = input_tensor->elem_num; - float* input_data = (float*)input_tensor->data; + float* input_data = (float*)input_tensor->data; float* output_data = (float*)output_tensor->data + output_step; - for (int i=0; idims[0]; int out_w = output_tensor->dims[1]; - for (int n=0; ndims[0]; n++) + for (int n = 0; n < output_tensor->dims[0]; n++) { int output_step = 0; for (int num = 0; num < ir_node->input_num; num++) @@ -116,10 +115,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis) int in_n = input_tensor->dims[0]; int in_w = input_tensor->dims[1]; - float* input_data = (float*)input_tensor->data + n * in_w; + float* input_data = (float*)input_tensor->data + n * in_w; float* output_data = (float*)output_tensor->data + n * out_w + output_step; - for (int i=0; iinput_num; num++) { - struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); int size = input_tensor->elem_num; - float* input_data = (float*)input_tensor->data; + float* input_data = (float*)input_tensor->data; float* output_data = (float*)output_tensor->data + output_step; - for (int i=0; idims[2]; int out_nstep = out_h * out_w; - for (int n=0; ninput_num; num++) @@ -170,10 +169,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis) int in_w = input_tensor->dims[2]; int in_nstep = in_h * in_w; - float* input_data = (float*)input_tensor->data + n * in_nstep; + float* input_data = (float*)input_tensor->data + n * in_nstep; float* output_data = (float*)output_tensor->data + n * out_nstep + output_step; - for (int i=0; idims[2]; int out_nstep = out_h * out_w; - for (int n=0; ninput_num; num++) @@ -204,10 +203,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis) int in_w = input_tensor->dims[2]; int in_nstep = in_h * in_w; - float* input_data = (float*)input_tensor->data + n * in_nstep + h * in_w; + float* input_data = (float*)input_tensor->data + n * in_nstep + h * in_w; float* output_data = (float*)output_tensor->data + n * out_nstep + h * out_w + output_step; - for (int i=0; iinput_num; num++) { - struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); int size = input_tensor->elem_num; - float* input_data = (float*)input_tensor->data; + float* input_data = (float*)input_tensor->data; float* output_data = (float*)output_tensor->data + output_step; - for (int i=0; iinput_num; num++) @@ -263,10 +262,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis) int in_cstep = in_h * in_w; int in_nstep = in_c * in_cstep; - float* input_data = (float*)input_tensor->data + n * in_nstep; + float* input_data = (float*)input_tensor->data + n * in_nstep; float* output_data = (float*)output_tensor->data + n * out_nstep + output_step; - for (int i=0; iinput_num; num++) @@ -301,10 +300,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis) int in_cstep = in_h * in_w; int in_nstep = in_c * in_cstep; - float* input_data = (float*)input_tensor->data + n * in_nstep + c * in_cstep; + float* input_data = (float*)input_tensor->data + n * in_nstep + c * in_cstep; float* output_data = (float*)output_tensor->data + n * out_nstep + c * out_cstep + output_step; - for (int i=0; iinput_num; num++) @@ -342,10 +341,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis) int in_cstep = in_h * in_w; int in_nstep = in_c * in_cstep; - float* input_data = (float*)input_tensor->data + n * in_nstep + c * in_cstep + h * in_w; + float* input_data = (float*)input_tensor->data + n * in_nstep + c * in_cstep + h * in_w; float* output_data = (float*)output_tensor->data + n * out_nstep + c * out_cstep + h * out_w + output_step; - for (int i=0; i - int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) { struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); @@ -48,10 +47,10 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) { struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - int8_t* input_data = (int8_t*)input_tensor->data; + int8_t* input_data = (int8_t*)input_tensor->data; int8_t* output_data = (int8_t*)output_tensor->data; - for(int i=0; ielem_num; i++) + for (int i = 0; i < input_tensor->elem_num; i++) output_data[i] = input_data[i]; return 0; @@ -73,12 +72,12 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) int size = input_tensor->elem_num; - int8_t* input_data = (int8_t*)input_tensor->data; + int8_t* input_data = (int8_t*)input_tensor->data; int8_t* output_data = (int8_t*)output_tensor->data + output_step; - for (int i=0; i 127) idata = 127; else if (idata < -127) @@ -96,19 +95,19 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) int output_step = 0; for (int num = 0; num < ir_node->input_num; num++) { - struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); float intput_scale = input_tensor->scale; float rescale = intput_scale / output_scale; int size = input_tensor->elem_num; - int8_t* input_data = (int8_t*)input_tensor->data; + int8_t* input_data = (int8_t*)input_tensor->data; int8_t* output_data = (int8_t*)output_tensor->data + output_step; - for (int i=0; i 127) idata = 127; else if (idata < -127) @@ -125,7 +124,7 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) int out_n = output_tensor->dims[0]; int out_w = output_tensor->dims[1]; - for (int n=0; ndims[0]; n++) + for (int n = 0; n < output_tensor->dims[0]; n++) { int output_step = 0; for (int num = 0; num < ir_node->input_num; num++) @@ -133,17 +132,17 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); float intput_scale = input_tensor->scale; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int in_n = input_tensor->dims[0]; int in_w = input_tensor->dims[1]; - int8_t* input_data = (int8_t*)input_tensor->data + n * in_w; + int8_t* input_data = (int8_t*)input_tensor->data + n * in_w; int8_t* output_data = (int8_t*)output_tensor->data + n * out_w + output_step; - for (int i=0; i 127) idata = 127; else if (idata < -127) @@ -162,19 +161,19 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) int output_step = 0; for (int num = 0; num < ir_node->input_num; num++) { - struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); float intput_scale = input_tensor->scale; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int size = input_tensor->elem_num; - int8_t* input_data = (int8_t*)input_tensor->data; + int8_t* input_data = (int8_t*)input_tensor->data; int8_t* output_data = (int8_t*)output_tensor->data + output_step; - for (int i=0; i 127) idata = 127; else if (idata < -127) @@ -193,7 +192,7 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) int out_w = output_tensor->dims[2]; int out_nstep = out_h * out_w; - for (int n=0; ninput_num; num++) @@ -201,19 +200,19 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); float intput_scale = input_tensor->scale; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int in_n = input_tensor->dims[0]; int in_h = input_tensor->dims[1]; int in_w = input_tensor->dims[2]; int in_nstep = in_h * in_w; - int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep; + int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep; int8_t* output_data = (int8_t*)output_tensor->data + n * out_nstep + output_step; - for (int i=0; i 127) idata = 127; else if (idata < -127) @@ -233,9 +232,9 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) int out_w = output_tensor->dims[2]; int out_nstep = out_h * out_w; - for (int n=0; ninput_num; num++) @@ -243,19 +242,19 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); float intput_scale = input_tensor->scale; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int in_n = input_tensor->dims[0]; int in_h = input_tensor->dims[1]; int in_w = input_tensor->dims[2]; int in_nstep = in_h * in_w; - int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep + h * in_w; + int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep + h * in_w; int8_t* output_data = (int8_t*)output_tensor->data + n * out_nstep + h * out_w + output_step; - for (int i=0; i 127) idata = 127; else if (idata < -127) @@ -275,19 +274,19 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) int output_step = 0; for (int num = 0; num < ir_node->input_num; num++) { - struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); float intput_scale = input_tensor->scale; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int size = input_tensor->elem_num; - int8_t* input_data = (int8_t*)input_tensor->data; + int8_t* input_data = (int8_t*)input_tensor->data; int8_t* output_data = (int8_t*)output_tensor->data + output_step; - for (int i=0; i 127) idata = 127; else if (idata < -127) @@ -308,7 +307,7 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) int out_cstep = out_h * out_w; int out_nstep = out_c * out_cstep; - for (int n=0; ninput_num; num++) @@ -316,7 +315,7 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); float intput_scale = input_tensor->scale; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int in_n = input_tensor->dims[0]; int in_c = input_tensor->dims[1]; @@ -325,12 +324,12 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) int in_cstep = in_h * in_w; int in_nstep = in_c * in_cstep; - int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep; + int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep; int8_t* output_data = (int8_t*)output_tensor->data + n * out_nstep + output_step; - for (int i=0; i 127) idata = 127; else if (idata < -127) @@ -352,9 +351,9 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) int out_cstep = out_h * out_w; int out_nstep = out_c * out_cstep; - for (int n=0; ninput_num; num++) @@ -362,7 +361,7 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); float intput_scale = input_tensor->scale; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int in_n = input_tensor->dims[0]; int in_c = input_tensor->dims[1]; @@ -371,12 +370,12 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) int in_cstep = in_h * in_w; int in_nstep = in_c * in_cstep; - int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep + c * in_cstep; + int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep + c * in_cstep; int8_t* output_data = (int8_t*)output_tensor->data + n * out_nstep + c * out_cstep + output_step; - for (int i=0; i 127) idata = 127; else if (idata < -127) @@ -399,11 +398,11 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) int out_cstep = out_h * out_w; int out_nstep = out_c * out_cstep; - for (int n=0; ninput_num; num++) @@ -420,12 +419,12 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis) int in_cstep = in_h * in_w; int in_nstep = in_c * in_cstep; - int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep + c * in_cstep + h * in_w; + int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep + c * in_cstep + h * in_w; int8_t* output_data = (int8_t*)output_tensor->data + n * out_nstep + c * out_cstep + h * out_w + output_step; - for (int i=0; i 127) idata = 127; else if (idata < -127) diff --git a/source/device/cpu/op/concat/concat_kernel_ref_uint8.c b/source/device/cpu/op/concat/concat_kernel_ref_uint8.c index 80c1b1599..68f13601d 100644 --- a/source/device/cpu/op/concat/concat_kernel_ref_uint8.c +++ b/source/device/cpu/op/concat/concat_kernel_ref_uint8.c @@ -38,23 +38,22 @@ #include - int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) { struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); float output_scale = output_tensor->scale; - int output_zero = output_tensor->zero_point; + int output_zero = output_tensor->zero_point; if (ir_node->input_num == 1) { struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - uint8_t* input_data = (uint8_t*)input_tensor->data; + uint8_t* input_data = (uint8_t*)input_tensor->data; uint8_t* output_data = (uint8_t*)output_tensor->data; - for(int i=0; ielem_num; i++) + for (int i = 0; i < input_tensor->elem_num; i++) output_data[i] = input_data[i]; - + return 0; } @@ -75,12 +74,12 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int size = input_tensor->elem_num; - uint8_t* input_data = (uint8_t*)input_tensor->data; + uint8_t* input_data = (uint8_t*)input_tensor->data; uint8_t* output_data = (uint8_t*)output_tensor->data + output_step; - for (int i=0; i 255) udata = 255; else if (udata < 0) @@ -98,7 +97,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int output_step = 0; for (int num = 0; num < ir_node->input_num; num++) { - struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); float intput_scale = input_tensor->scale; int intput_zero = input_tensor->zero_point; @@ -106,12 +105,12 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int size = input_tensor->elem_num; - uint8_t* input_data = (uint8_t*)input_tensor->data; + uint8_t* input_data = (uint8_t*)input_tensor->data; uint8_t* output_data = (uint8_t*)output_tensor->data + output_step; - for (int i=0; i 255) udata = 255; else if (udata < 0) @@ -128,7 +127,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int out_n = output_tensor->dims[0]; int out_w = output_tensor->dims[1]; - for (int n=0; ndims[0]; n++) + for (int n = 0; n < output_tensor->dims[0]; n++) { int output_step = 0; for (int num = 0; num < ir_node->input_num; num++) @@ -137,17 +136,17 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) float intput_scale = input_tensor->scale; int intput_zero = input_tensor->zero_point; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int in_n = input_tensor->dims[0]; int in_w = input_tensor->dims[1]; - uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_w; + uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_w; uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_w + output_step; - for (int i=0; i 255) udata = 255; else if (udata < 0) @@ -166,20 +165,20 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int output_step = 0; for (int num = 0; num < ir_node->input_num; num++) { - struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); float intput_scale = input_tensor->scale; int intput_zero = input_tensor->zero_point; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int size = input_tensor->elem_num; - uint8_t* input_data = (uint8_t*)input_tensor->data; + uint8_t* input_data = (uint8_t*)input_tensor->data; uint8_t* output_data = (uint8_t*)output_tensor->data + output_step; - for (int i=0; i 255) udata = 255; else if (udata < 0) @@ -198,7 +197,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int out_w = output_tensor->dims[2]; int out_nstep = out_h * out_w; - for (int n=0; ninput_num; num++) @@ -207,19 +206,19 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) float intput_scale = input_tensor->scale; int intput_zero = input_tensor->zero_point; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int in_n = input_tensor->dims[0]; int in_h = input_tensor->dims[1]; int in_w = input_tensor->dims[2]; int in_nstep = in_h * in_w; - uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep; + uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep; uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_nstep + output_step; - for (int i=0; i 255) udata = 255; else if (udata < 0) @@ -239,9 +238,9 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int out_w = output_tensor->dims[2]; int out_nstep = out_h * out_w; - for (int n=0; ninput_num; num++) @@ -250,19 +249,19 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) float intput_scale = input_tensor->scale; int intput_zero = input_tensor->zero_point; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int in_n = input_tensor->dims[0]; int in_h = input_tensor->dims[1]; int in_w = input_tensor->dims[2]; int in_nstep = in_h * in_w; - uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep + h * in_w; + uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep + h * in_w; uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_nstep + h * out_w + output_step; - for (int i=0; i 255) udata = 255; else if (udata < 0) @@ -282,20 +281,20 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int output_step = 0; for (int num = 0; num < ir_node->input_num; num++) { - struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]); float intput_scale = input_tensor->scale; int intput_zero = input_tensor->zero_point; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int size = input_tensor->elem_num; - uint8_t* input_data = (uint8_t*)input_tensor->data; + uint8_t* input_data = (uint8_t*)input_tensor->data; uint8_t* output_data = (uint8_t*)output_tensor->data + output_step; - for (int i=0; i 255) udata = 255; else if (udata < 0) @@ -316,7 +315,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int out_cstep = out_h * out_w; int out_nstep = out_c * out_cstep; - for (int n=0; ninput_num; num++) @@ -325,7 +324,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) float intput_scale = input_tensor->scale; int intput_zero = input_tensor->zero_point; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int in_n = input_tensor->dims[0]; int in_c = input_tensor->dims[1]; @@ -334,12 +333,12 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int in_cstep = in_h * in_w; int in_nstep = in_c * in_cstep; - uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep; + uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep; uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_nstep + output_step; - for (int i=0; i 255) udata = 255; else if (udata < 0) @@ -361,9 +360,9 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int out_cstep = out_h * out_w; int out_nstep = out_c * out_cstep; - for (int n=0; ninput_num; num++) @@ -372,7 +371,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) float intput_scale = input_tensor->scale; int intput_zero = input_tensor->zero_point; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int in_n = input_tensor->dims[0]; int in_c = input_tensor->dims[1]; @@ -381,12 +380,12 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int in_cstep = in_h * in_w; int in_nstep = in_c * in_cstep; - uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep + c * in_cstep; + uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep + c * in_cstep; uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_nstep + c * out_cstep + output_step; - for (int i=0; i 255) udata = 255; else if (udata < 0) @@ -409,11 +408,11 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int out_cstep = out_h * out_w; int out_nstep = out_c * out_cstep; - for (int n=0; ninput_num; num++) @@ -422,7 +421,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) float intput_scale = input_tensor->scale; int intput_zero = input_tensor->zero_point; - float rescale = intput_scale / output_scale; + float rescale = intput_scale / output_scale; int in_n = input_tensor->dims[0]; int in_c = input_tensor->dims[1]; @@ -431,12 +430,12 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis) int in_cstep = in_h * in_w; int in_nstep = in_c * in_cstep; - uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep + c * in_cstep + h * in_w; + uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep + c * in_cstep + h * in_w; uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_nstep + c * out_cstep + h * out_w + output_step; - for (int i=0; i 255) udata = 255; else if (udata < 0) diff --git a/source/device/cpu/op/concat/concat_ref.c b/source/device/cpu/op/concat/concat_ref.c index 7630d4be0..b3b704f5f 100644 --- a/source/device/cpu/op/concat/concat_ref.c +++ b/source/device/cpu/op/concat/concat_ref.c @@ -38,7 +38,6 @@ #include "concat_kernel_ref.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -54,8 +53,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct concat_param* concat_param = ( struct concat_param* )ir_node->op.param_mem; - + struct concat_param* concat_param = (struct concat_param*)ir_node->op.param_mem; + int ret = -1; if (output_tensor->data_type == TENGINE_DT_FP32) { @@ -81,13 +80,13 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc } static struct node_ops hcl_node_ops = { - .prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score}; int register_concat_ref_op() { diff --git a/source/device/cpu/op/conv/conv_kernel_ref.h b/source/device/cpu/op/conv/conv_kernel_ref.h index a3b49607c..c35c156c2 100644 --- a/source/device/cpu/op/conv/conv_kernel_ref.h +++ b/source/device/cpu/op/conv/conv_kernel_ref.h @@ -31,17 +31,16 @@ #include "graph/node.h" #include "graph/graph.h" - int ref_conv_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel, - struct tensor* bias, struct conv_param* conv_param); + struct tensor* bias, struct conv_param* conv_param); int ref_conv_fp16(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel, - struct tensor* bias, struct conv_param* conv_param); + struct tensor* bias, struct conv_param* conv_param); int ref_conv_int8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel, - struct tensor* bias, struct conv_param* conv_param); + struct tensor* bias, struct conv_param* conv_param); int ref_conv_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel, - struct tensor* bias, struct conv_param* conv_param); + struct tensor* bias, struct conv_param* conv_param); #endif diff --git a/source/device/cpu/op/conv/conv_kernel_ref_fp16.c b/source/device/cpu/op/conv/conv_kernel_ref_fp16.c index 26f6aa284..3e284d063 100644 --- a/source/device/cpu/op/conv/conv_kernel_ref_fp16.c +++ b/source/device/cpu/op/conv/conv_kernel_ref_fp16.c @@ -37,9 +37,8 @@ #include "conv_kernel_ref.h" - int ref_conv_fp16(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel, - struct tensor* bias, struct conv_param* conv_param) + struct tensor* bias, struct conv_param* conv_param) { #if MACOS TLOG_ERR("FP16 not support under mac os"); @@ -88,14 +87,11 @@ int ref_conv_fp16(struct tensor* input_tensor, struct tensor* output_tensor, str float total = 0.f; if (input_tensor->layout == 0) { - output_offset = n * group * output_c * output_h * output_w + - g * output_c * output_h * output_w + c * output_h * output_w + - h * output_w + w; + output_offset = n * group * output_c * output_h * output_w + g * output_c * output_h * output_w + c * output_h * output_w + h * output_w + w; } else { - output_offset = n * group * output_c * output_h * output_w + - h * output_w * group * output_c + w * group * output_c + output_c * g + c; + output_offset = n * group * output_c * output_h * output_w + h * output_w * group * output_c + w * group * output_c + output_c * g + c; } for (kc = 0; kc < input_c; ++kc) { @@ -111,25 +107,16 @@ int ref_conv_fp16(struct tensor* input_tensor, struct tensor* output_tensor, str { if (input_tensor->layout == 0) { - input_offset = n * group * input_c * input_h * input_w + - g * input_c * input_h * input_w + kc * input_h * input_w + - cur_y * input_w + cur_x; - kernel_offset = g * output_c * kernel_size + c * kernel_size + - kc * conv_param->kernel_h * conv_param->kernel_w + - kh * conv_param->kernel_w + kw; + input_offset = n * group * input_c * input_h * input_w + g * input_c * input_h * input_w + kc * input_h * input_w + cur_y * input_w + cur_x; + kernel_offset = g * output_c * kernel_size + c * kernel_size + kc * conv_param->kernel_h * conv_param->kernel_w + kh * conv_param->kernel_w + kw; } else { - input_offset = n * group * input_c * input_h * input_w + - cur_y * input_w * input_c * group + cur_x * input_c * group + - g * input_c + kc; - kernel_offset = c * group * kernel_size + - kh * conv_param->kernel_w * input_c * group + - kw * input_c * group + g * input_c + kc; + input_offset = n * group * input_c * input_h * input_w + cur_y * input_w * input_c * group + cur_x * input_c * group + g * input_c + kc; + kernel_offset = c * group * kernel_size + kh * conv_param->kernel_w * input_c * group + kw * input_c * group + g * input_c + kc; } - total += fp16_to_fp32(input_data[input_offset]) * - fp16_to_fp32(kernel_data[kernel_offset]); + total += fp16_to_fp32(input_data[input_offset]) * fp16_to_fp32(kernel_data[kernel_offset]); } } } diff --git a/source/device/cpu/op/conv/conv_kernel_ref_fp32.c b/source/device/cpu/op/conv/conv_kernel_ref_fp32.c index ff331494f..692852f61 100644 --- a/source/device/cpu/op/conv/conv_kernel_ref_fp32.c +++ b/source/device/cpu/op/conv/conv_kernel_ref_fp32.c @@ -37,9 +37,8 @@ #include "conv_kernel_ref.h" - int ref_conv_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel, - struct tensor* bias, struct conv_param* conv_param) + struct tensor* bias, struct conv_param* conv_param) { int batch = input_tensor->dims[0]; int group = conv_param->group; @@ -84,9 +83,7 @@ int ref_conv_fp32(struct tensor* input_tensor, struct tensor* output_tensor, str const int w_start = (w * conv_param->stride_w) - conv_param->pad_w0; float total = 0.f; - output_offset = n * group * output_c * output_h * output_w + - g * output_c * output_h * output_w + c * output_h * output_w + - h * output_w + w; + output_offset = n * group * output_c * output_h * output_w + g * output_c * output_h * output_w + c * output_h * output_w + h * output_w + w; for (kc = 0; kc < input_c; ++kc) { @@ -100,13 +97,8 @@ int ref_conv_fp32(struct tensor* input_tensor, struct tensor* output_tensor, str // use zero as a default value. if ((cur_x >= 0) && (cur_x < input_w) && (cur_y >= 0) && (cur_y < input_h)) { - - input_offset = n * group * input_c * input_h * input_w + - g * input_c * input_h * input_w + kc * input_h * input_w + - cur_y * input_w + cur_x; - kernel_offset = g * output_c * kernel_size + c * kernel_size + - kc * conv_param->kernel_h * conv_param->kernel_w + - kh * conv_param->kernel_w + kw; + input_offset = n * group * input_c * input_h * input_w + g * input_c * input_h * input_w + kc * input_h * input_w + cur_y * input_w + cur_x; + kernel_offset = g * output_c * kernel_size + c * kernel_size + kc * conv_param->kernel_h * conv_param->kernel_w + kh * conv_param->kernel_w + kw; total += input_data[input_offset] * kernel_data[kernel_offset]; } diff --git a/source/device/cpu/op/conv/conv_kernel_ref_int8.c b/source/device/cpu/op/conv/conv_kernel_ref_int8.c index 80d6a7dab..ba27e50d6 100644 --- a/source/device/cpu/op/conv/conv_kernel_ref_int8.c +++ b/source/device/cpu/op/conv/conv_kernel_ref_int8.c @@ -39,9 +39,8 @@ #include - int ref_conv_int8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel, - struct tensor* bias, struct conv_param* conv_param) + struct tensor* bias, struct conv_param* conv_param) { int batch = input_tensor->dims[0]; int group = conv_param->group; @@ -71,9 +70,9 @@ int ref_conv_int8(struct tensor* input_tensor, struct tensor* output_tensor, str /* input and kernel scales */ int dequant_scales_size = group * output_c; - float *dequant_scales = (float*)malloc(sizeof(float) * dequant_scales_size); + float* dequant_scales = (float*)malloc(sizeof(float) * dequant_scales_size); - for(int i = 0; i < dequant_scales_size; i++) + for (int i = 0; i < dequant_scales_size; i++) { dequant_scales[i] = (input_scale * kernel_scales[i]); } @@ -100,14 +99,11 @@ int ref_conv_int8(struct tensor* input_tensor, struct tensor* output_tensor, str int32_t total_i32 = 0; if (input_tensor->layout == 0) { - output_offset = n * group * output_c * output_h * output_w + - g * output_c * output_h * output_w + c * output_h * output_w + - h * output_w + w; + output_offset = n * group * output_c * output_h * output_w + g * output_c * output_h * output_w + c * output_h * output_w + h * output_w + w; } else { - output_offset = n * group * output_c * output_h * output_w + - h * output_w * group * output_c + w * group * output_c + output_c * g + c; + output_offset = n * group * output_c * output_h * output_w + h * output_w * group * output_c + w * group * output_c + output_c * g + c; } for (kc = 0; kc < input_c; ++kc) { @@ -123,21 +119,13 @@ int ref_conv_int8(struct tensor* input_tensor, struct tensor* output_tensor, str { if (input_tensor->layout == 0) { - input_offset = n * group * input_c * input_h * input_w + - g * input_c * input_h * input_w + kc * input_h * input_w + - cur_y * input_w + cur_x; - kernel_offset = g * output_c * kernel_size + c * kernel_size + - kc * conv_param->kernel_h * conv_param->kernel_w + - kh * conv_param->kernel_w + kw; + input_offset = n * group * input_c * input_h * input_w + g * input_c * input_h * input_w + kc * input_h * input_w + cur_y * input_w + cur_x; + kernel_offset = g * output_c * kernel_size + c * kernel_size + kc * conv_param->kernel_h * conv_param->kernel_w + kh * conv_param->kernel_w + kw; } else { - input_offset = n * group * input_c * input_h * input_w + - cur_y * input_w * input_c * group + cur_x * input_c * group + - g * input_c + kc; - kernel_offset = c * group * kernel_size + - kh * conv_param->kernel_w * input_c * group + - kw * input_c * group + g * input_c + kc; + input_offset = n * group * input_c * input_h * input_w + cur_y * input_w * input_c * group + cur_x * input_c * group + g * input_c + kc; + kernel_offset = c * group * kernel_size + kh * conv_param->kernel_w * input_c * group + kw * input_c * group + g * input_c + kc; } total_i32 += (int32_t)input_i8[input_offset] * (int32_t)kernel_i8[kernel_offset]; diff --git a/source/device/cpu/op/conv/conv_kernel_ref_uint8.c b/source/device/cpu/op/conv/conv_kernel_ref_uint8.c index c236fa84a..376f15ad3 100644 --- a/source/device/cpu/op/conv/conv_kernel_ref_uint8.c +++ b/source/device/cpu/op/conv/conv_kernel_ref_uint8.c @@ -39,9 +39,8 @@ #include - int ref_conv_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel, - struct tensor* bias, struct conv_param* conv_param) + struct tensor* bias, struct conv_param* conv_param) { int batch = input_tensor->dims[0]; int group = conv_param->group; @@ -74,15 +73,15 @@ int ref_conv_uint8(struct tensor* input_tensor, struct tensor* output_tensor, st /* dequant input */ int input_size = batch * group * input_c * input_h * input_w; - float* input_fp32 = ( float* )sys_malloc(sizeof(float) * input_size); + float* input_fp32 = (float*)sys_malloc(sizeof(float) * input_size); for (int i = 0; i < input_size; i++) - input_fp32[i] = (( float )input_data[i] - input_zero) * input_scale; + input_fp32[i] = ((float)input_data[i] - input_zero) * input_scale; /* dequant kernel */ int kernel_total = group * output_c * kernel_size; - float* kernel_fp32 = ( float* )sys_malloc(sizeof(float) * kernel_total); + float* kernel_fp32 = (float*)sys_malloc(sizeof(float) * kernel_total); for (int i = 0; i < kernel_total; i++) - kernel_fp32[i] = (( float )kernel_data[i] - kernel_zero) * kernel_scale; + kernel_fp32[i] = ((float)kernel_data[i] - kernel_zero) * kernel_scale; /* dequant biases */ int bias_size = group * output_c; @@ -90,9 +89,9 @@ int ref_conv_uint8(struct tensor* input_tensor, struct tensor* output_tensor, st float* bias_fp32 = NULL; if (bias != NULL) { - bias_fp32 = ( float* )sys_malloc(sizeof(float) * bias_size); + bias_fp32 = (float*)sys_malloc(sizeof(float) * bias_size); for (int i = 0; i < bias_size; i++) - bias_fp32[i] = ( float )bias_data[i] * input_scale * kernel_scale; + bias_fp32[i] = (float)bias_data[i] * input_scale * kernel_scale; } if (conv_param->kernel_h == 0) @@ -117,14 +116,11 @@ int ref_conv_uint8(struct tensor* input_tensor, struct tensor* output_tensor, st float total = 0.f; if (input_tensor->layout == 0) { - output_offset = n * group * output_c * output_h * output_w + - g * output_c * output_h * output_w + c * output_h * output_w + - h * output_w + w; + output_offset = n * group * output_c * output_h * output_w + g * output_c * output_h * output_w + c * output_h * output_w + h * output_w + w; } else { - output_offset = n * group * output_c * output_h * output_w + - h * output_w * group * output_c + w * group * output_c + output_c * g + c; + output_offset = n * group * output_c * output_h * output_w + h * output_w * group * output_c + w * group * output_c + output_c * g + c; } for (kc = 0; kc < input_c; ++kc) { @@ -140,21 +136,13 @@ int ref_conv_uint8(struct tensor* input_tensor, struct tensor* output_tensor, st { if (input_tensor->layout == 0) { - input_offset = n * group * input_c * input_h * input_w + - g * input_c * input_h * input_w + kc * input_h * input_w + - cur_y * input_w + cur_x; - kernel_offset = g * output_c * kernel_size + c * kernel_size + - kc * conv_param->kernel_h * conv_param->kernel_w + - kh * conv_param->kernel_w + kw; + input_offset = n * group * input_c * input_h * input_w + g * input_c * input_h * input_w + kc * input_h * input_w + cur_y * input_w + cur_x; + kernel_offset = g * output_c * kernel_size + c * kernel_size + kc * conv_param->kernel_h * conv_param->kernel_w + kh * conv_param->kernel_w + kw; } else { - input_offset = n * group * input_c * input_h * input_w + - cur_y * input_w * input_c * group + cur_x * input_c * group + - g * input_c + kc; - kernel_offset = c * group * kernel_size + - kh * conv_param->kernel_w * input_c * group + - kw * input_c * group + g * input_c + kc; + input_offset = n * group * input_c * input_h * input_w + cur_y * input_w * input_c * group + cur_x * input_c * group + g * input_c + kc; + kernel_offset = c * group * kernel_size + kh * conv_param->kernel_w * input_c * group + kw * input_c * group + g * input_c + kc; } total += input_fp32[input_offset] * kernel_fp32[kernel_offset]; diff --git a/source/device/cpu/op/conv/conv_ref.c b/source/device/cpu/op/conv/conv_ref.c index 3f403aa1f..8f655f580 100644 --- a/source/device/cpu/op/conv/conv_ref.c +++ b/source/device/cpu/op/conv/conv_ref.c @@ -37,7 +37,6 @@ #include "conv_kernel_ref.h" - // add conv op by wangxinwei for debug conv //======================================================================================================// @@ -57,7 +56,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); } - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; int ret = 0; if (input_tensor->data_type == TENGINE_DT_FP32) @@ -86,12 +85,12 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; /* dynamic get the shape of output tensor */ int n = input_tensor->dims[0]; int h = input_tensor->dims[2]; - int w = input_tensor->dims[3]; + int w = input_tensor->dims[3]; int ret = 0; if (conv_param->kernel_w == 0) @@ -133,10 +132,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc } else { - out_h = - (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) / - conv_param->stride_h + - 1; + out_h = (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) / conv_param->stride_h + 1; } if (conv_param->pad_w0 < 0) @@ -159,10 +155,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc } else { - out_w = - (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) / - conv_param->stride_w + - 1; + out_w = (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) / conv_param->stride_w + 1; } int dims[4]; @@ -207,12 +200,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc } static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score}; int register_conv_ref_op() { diff --git a/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.c b/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.c index 3c3854143..db451322f 100644 --- a/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.c +++ b/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.c @@ -37,7 +37,7 @@ void dw_k3s2_fp16_relu_fused_a76(__fp16* bias, __fp16* input, __fp16* kernel, __ void dw_k3s2_fp16_relu6_fused_a76(__fp16* bias, __fp16* input, __fp16* kernel, __fp16* output, long channel_number, long input_w, long input_h, long pad0); int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_param* param, int num_thread, int cpu_affinity) + struct tensor* output_tensor, struct conv_param* param, int num_thread, int cpu_affinity) { /* param */ int pads[4]; @@ -79,7 +79,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor, if (bias_tensor) bias_buf = bias_tensor->data; - for (int n = 0; n < batch; n++) // batch size + for (int n = 0; n < batch; n++) // batch size { __fp16* input = input_buf + n * input_size * group; __fp16* output = output_buf + n * output_size * group; @@ -92,7 +92,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor, { if (activation == 0) { - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int i = 0; i < group; i++) { __fp16* cur_input = input + i * channel_size; @@ -106,7 +106,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor, } else if (activation > 0) { - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int i = 0; i < group; i++) { __fp16* cur_input = input + i * channel_size; @@ -120,7 +120,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor, } else { - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int i = 0; i < group; i++) { __fp16* cur_input = input + i * channel_size; @@ -137,7 +137,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor, { if (activation == 0) { - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int i = 0; i < group; i++) { __fp16* cur_input = input + i * channel_size; @@ -151,7 +151,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor, } else if (activation > 0) { - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int i = 0; i < group; i++) { __fp16* cur_input = input + i * channel_size; @@ -165,7 +165,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor, } else { - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int i = 0; i < group; i++) { __fp16* cur_input = input + i * channel_size; diff --git a/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.h b/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.h index 052ffcc18..a07995f59 100644 --- a/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.h +++ b/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.h @@ -28,7 +28,6 @@ #include "convolution_param.h" int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_param* param, int num_thread, int cpu_affinity) - ; + struct tensor* output_tensor, struct conv_param* param, int num_thread, int cpu_affinity); #endif diff --git a/source/device/cpu/op/conv/cortex-a/armv8.2/conv_kernel_fp16_arm82.c b/source/device/cpu/op/conv/cortex-a/armv8.2/conv_kernel_fp16_arm82.c index ca2698959..3e9a92944 100644 --- a/source/device/cpu/op/conv/cortex-a/armv8.2/conv_kernel_fp16_arm82.c +++ b/source/device/cpu/op/conv/cortex-a/armv8.2/conv_kernel_fp16_arm82.c @@ -33,12 +33,12 @@ #define PER_OUT_CHAN 16 void hgemm_4x16_a76(__fp16* biases, __fp16* input, __fp16* kernel, long kernel_size, __fp16* output, - long output_xy, long fused_relu); + long output_xy, long fused_relu); void hgemm_4x4_a76(__fp16* biases, __fp16* input, __fp16* kernel, long kernel_size, __fp16* output, - long output_xy, long fused_relu); + long output_xy, long fused_relu); void im2col_fp16_1x1(__fp16* input, long input_xy, __fp16* col, long col_cnt, long input_chan); -void im2col_fp16_3x3(__fp16* input, long input_x, long input_y, long input_chan, __fp16* col, long stride); +void im2col_fp16_3x3(__fp16* input, long input_x, long input_y, long input_chan, __fp16* col, long stride); void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, int kernel_x, int kernel_y, int stride_x, int stride_y, int dilation_x, int dilation_y, int pad_w0, int pad_w1, int pad_h0, int pad_h1, int output_x, @@ -51,7 +51,7 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i __fp16* cur_col = col + col_start * kernel_size; int col_i, col_j, kch, ky, kx, i; - if((kernel_x == 1) && (kernel_y == 1) && (stride_x == 1) && (stride_y == 1)) + if ((kernel_x == 1) && (kernel_y == 1) && (stride_x == 1) && (stride_y == 1)) { { int col_cnt = (col_end & -4) - (col_start & -4); @@ -60,13 +60,13 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i col_i = col_end & -4; } // final 4 input - if(col_end & 0x3) + if (col_end & 0x3) { - for(col_j = 0; col_j < kernel_size; col_j++) + for (col_j = 0; col_j < kernel_size; col_j++) { - for(i = 0; i < 4; i++) + for (i = 0; i < 4; i++) { - if((col_i + i) < col_end) + if ((col_i + i) < col_end) *cur_col++ = *(im + input_xy * col_j + col_i + i); else *cur_col++ = 0.0; @@ -74,22 +74,20 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i } } } - else if((kernel_x == 3) && (kernel_y == 3) && (dilation_x == 1) && (dilation_y == 1)) + else if ((kernel_x == 3) && (kernel_y == 3) && (dilation_x == 1) && (dilation_y == 1)) { - int is_pad0 = (pad_w0 == 0) && (pad_h0 == 0) && (pad_w1 == 0) && (pad_h1 == 0); - for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + for (col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) { cur_col = col + col_i * kernel_size; int imy0 = col_i / output_x; int imy3 = (col_i + 3) / output_x; int imx0 = col_i - imy0 * output_x; int imx3 = (col_i + 3) - imy3 * output_x; - if((imy0 == imy3) && - (is_pad0 || (imy0 != 0 && imx0 != 0 && imy0 != (output_y - 1) && imx3 != (output_x - 1)))) + if ((imy0 == imy3) && (is_pad0 || (imy0 != 0 && imx0 != 0 && imy0 != (output_y - 1) && imx3 != (output_x - 1)))) { __fp16* l0 = im + (imy0 * stride_y - pad_y) * input_x + (imx0 * stride_x - pad_x); - + { im2col_fp16_3x3(l0, input_x, input_y, input_chan, cur_col, stride_x); cur_col += 4 * kernel_size; @@ -103,15 +101,15 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i cnt_x[2] * stride_x - pad_x, cnt_x[3] * stride_x - pad_x}; int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, cnt_y[3] * stride_y - pad_y}; - for(kch = 0; kch < input_chan; kch++) - for(ky = 0; ky < 3; ky++) - for(kx = 0; kx < 3; kx++) + for (kch = 0; kch < input_chan; kch++) + for (ky = 0; ky < 3; ky++) + for (kx = 0; kx < 3; kx++) { int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for(i = 0; i < 4; i++) + for (i = 0; i < 4; i++) { - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0.0; @@ -120,7 +118,7 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i } } // final 4 input - if(col_end & 0x3) + if (col_end & 0x3) { int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, @@ -129,16 +127,15 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i cnt_x[3] * stride_x - pad_x}; int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, cnt_y[3] * stride_y - pad_y}; - for(kch = 0; kch < input_chan; kch++) - for(ky = 0; ky < 3; ky++) - for(kx = 0; kx < 3; kx++) + for (kch = 0; kch < input_chan; kch++) + for (ky = 0; ky < 3; ky++) + for (kx = 0; kx < 3; kx++) { int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for(i = 0; i < 4; i++) + for (i = 0; i < 4; i++) { - if((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && - imy[i] < input_y) + if ((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0.0; @@ -147,8 +144,8 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i } } else - { // for general cases - for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + { // for general cases + for (col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) { int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, @@ -157,15 +154,15 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i cnt_x[3] * stride_x - pad_x}; int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, cnt_y[3] * stride_y - pad_y}; - for(kch = 0; kch < input_chan; kch++) - for(ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y) - for(kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x) + for (kch = 0; kch < input_chan; kch++) + for (ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y) + for (kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x) { int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for(i = 0; i < 4; i++) + for (i = 0; i < 4; i++) { - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0.0; @@ -173,7 +170,7 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i } } // final 4 input - if(col_end & 0x3) + if (col_end & 0x3) { int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, @@ -182,16 +179,15 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i cnt_x[3] * stride_x - pad_x}; int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, cnt_y[3] * stride_y - pad_y}; - for(kch = 0; kch < input_chan; kch++) - for(ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y) - for(kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x) + for (kch = 0; kch < input_chan; kch++) + for (ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y) + for (kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x) { int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for(i = 0; i < 4; i++) + for (i = 0; i < 4; i++) { - if((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && - imy[i] < input_y) + if ((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0.0; @@ -214,7 +210,7 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch __fp16* cur_kernel_interleaved = kernel_interleaved; // interleave 16 kernels - for(i = 0; i < (kernel_chan & -16); i += 16) + for (i = 0; i < (kernel_chan & -16); i += 16) { cur_kernel0 = kernel + kernel_size * i; cur_kernel1 = kernel + kernel_size * (i + 1); @@ -232,7 +228,7 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch cur_kernel13 = kernel + kernel_size * (i + 13); cur_kernel14 = kernel + kernel_size * (i + 14); cur_kernel15 = kernel + kernel_size * (i + 15); - for(j = 0; j < kernel_size; j++) + for (j = 0; j < kernel_size; j++) { *(cur_kernel_interleaved++) = cur_kernel0[j]; *(cur_kernel_interleaved++) = cur_kernel1[j]; @@ -253,13 +249,13 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch } } - for(i = (kernel_chan & -16); i < (kernel_chan & -4); i += 4) + for (i = (kernel_chan & -16); i < (kernel_chan & -4); i += 4) { cur_kernel0 = kernel + kernel_size * i; cur_kernel1 = kernel + kernel_size * (i + 1); cur_kernel2 = kernel + kernel_size * (i + 2); cur_kernel3 = kernel + kernel_size * (i + 3); - for(j = 0; j < kernel_size; j++) + for (j = 0; j < kernel_size; j++) { *(cur_kernel_interleaved++) = cur_kernel0[j]; *(cur_kernel_interleaved++) = cur_kernel1[j]; @@ -271,9 +267,9 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch cur_kernel0 = kernel + kernel_size * i; cur_kernel1 = kernel + kernel_size * (i + 1); cur_kernel2 = kernel + kernel_size * (i + 2); - if((kernel_chan & 0x3) == 3) + if ((kernel_chan & 0x3) == 3) { - for(j = 0; j < kernel_size; j++) + for (j = 0; j < kernel_size; j++) { *(cur_kernel_interleaved++) = cur_kernel0[j]; *(cur_kernel_interleaved++) = cur_kernel1[j]; @@ -281,9 +277,9 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch *(cur_kernel_interleaved++) = 0.0; } } - else if((kernel_chan & 0x3) == 2) + else if ((kernel_chan & 0x3) == 2) { - for(j = 0; j < kernel_size; j++) + for (j = 0; j < kernel_size; j++) { *(cur_kernel_interleaved++) = cur_kernel0[j]; *(cur_kernel_interleaved++) = cur_kernel1[j]; @@ -291,9 +287,9 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch *(cur_kernel_interleaved++) = 0.0; } } - else if((kernel_chan & 0x3) == 1) + else if ((kernel_chan & 0x3) == 1) { - for(j = 0; j < kernel_size; j++) + for (j = 0; j < kernel_size; j++) { *(cur_kernel_interleaved++) = cur_kernel0[j]; *(cur_kernel_interleaved++) = 0.0; @@ -303,7 +299,7 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch } } -static void interleave(struct tensor * filter, struct conv_priv_info* priv_info, struct conv_param* param) +static void interleave(struct tensor* filter, struct conv_priv_info* priv_info, struct conv_param* param) { int group = param->group; int out_chan = filter->dims[0] / group; @@ -313,9 +309,9 @@ static void interleave(struct tensor * filter, struct conv_priv_info* priv_info int kernel_interleaved_size_g = kernel_size * ((out_chan + 3) & -4); __fp16* kernel = (__fp16*)filter->data; - + __fp16* interleave_buf = (__fp16*)priv_info->interleave_buffer; - for(int g = 0; g < group; g++) + for (int g = 0; g < group; g++) { __fp16* cur_kernel = kernel + g * kernel_size_g; __fp16* cur_interleave = interleave_buf + g * kernel_interleaved_size_g; @@ -327,33 +323,33 @@ static void hgemm_set(__fp16* col, __fp16* kernel, __fp16* biases, __fp16* outpu int ch_start, int ch_end, int output_xy, int relu_fused, int num_thread, int cpu_affinity) { int nn_outch = ch_end / PER_OUT_CHAN; - int col_end3 = output_xy & 0x3; + int col_end3 = output_xy & 0x3; if (col_end3) { - #pragma omp parallel for num_threads(num_thread) - for (int pp=0; ppgroup; int input_chan = param->input_channel / group; int kernel_size = input_chan * param->kernel_h * param->kernel_w; - + int output_xy = output->dims[2] * output->dims[3]; int mem_size = sizeof(__fp16) * kernel_size * ((output_xy + 3) & -4) + 128; return mem_size; } -static int get_private_mem_size(struct tensor * filter, struct conv_param* param) +static int get_private_mem_size(struct tensor* filter, struct conv_param* param) { int group = param->group; int out_chan = filter->dims[0] / group; int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3]; - + int mem_size = sizeof(__fp16) * kernel_size * ((out_chan + 3) & -4) * group + 128; return mem_size; } -int fp16_conv_hcl_prerun(struct tensor* input_tensor , \ - struct tensor* filter_tensor , \ - struct tensor* output_tensor , \ - struct conv_priv_info* priv_info , \ - struct conv_param* param) +int fp16_conv_hcl_prerun(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* output_tensor, + struct conv_priv_info* priv_info, + struct conv_param* param) { if (!priv_info->external_im2col_mem) { - int mem_size = fp16_conv_hcl_get_shared_mem_size(input_tensor , output_tensor , param); + int mem_size = fp16_conv_hcl_get_shared_mem_size(input_tensor, output_tensor, param); void* mem = sys_malloc(mem_size); priv_info->im2col_buffer = mem; priv_info->im2col_buffer_size = mem_size; @@ -493,15 +489,15 @@ int fp16_conv_hcl_prerun(struct tensor* input_tensor , \ return 0; } -int fp16_conv_hcl_postrun(struct conv_priv_info* priv_info) +int fp16_conv_hcl_postrun(struct conv_priv_info* priv_info) { - if(!priv_info->external_interleave_mem && priv_info->interleave_buffer != NULL) + if (!priv_info->external_interleave_mem && priv_info->interleave_buffer != NULL) { sys_free(priv_info->interleave_buffer); priv_info->interleave_buffer = NULL; } - if(!priv_info->external_im2col_mem && priv_info->im2col_buffer != NULL) + if (!priv_info->external_im2col_mem && priv_info->im2col_buffer != NULL) { sys_free(priv_info->im2col_buffer); priv_info->im2col_buffer = NULL; @@ -510,13 +506,13 @@ int fp16_conv_hcl_postrun(struct conv_priv_info* priv_info) return 0; } -int fp16_conv_hcl_run(struct tensor* input_tensor , \ - struct tensor* filter_tensor , \ - struct tensor* bias_tensor , \ - struct tensor* output_tensor , \ - struct conv_priv_info* priv_info , \ - struct conv_param* param, \ - int num_thread, int cpu_affinity) +int fp16_conv_hcl_run(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* bias_tensor, + struct tensor* output_tensor, + struct conv_priv_info* priv_info, + struct conv_param* param, + int num_thread, int cpu_affinity) { /* param */ // TLOG_ERR("run into fp16_conv_hcl_run!\n"); @@ -558,23 +554,23 @@ int fp16_conv_hcl_run(struct tensor* input_tensor , \ int sgemm_set_chan = out_c / PER_OUT_CHAN * PER_OUT_CHAN; int sgemm_set_remain = out_c % PER_OUT_CHAN; - for(int n = 0; n < batch; n++) // batch size + for (int n = 0; n < batch; n++) // batch size { - for(int g = 0; g < group; g++) + for (int g = 0; g < group; g++) { /* im2col */ - __fp16* cur_input = input_buf + (n * group + g) *input_size; + __fp16* cur_input = input_buf + (n * group + g) * input_size; im2col(cur_input, col_buf, in_c, in_w, in_h, kernel_w, kernel_h, - stride_w, stride_h, dilation_w, dilation_h, pad_w0, pad_w1, pad_h0, pad_h1, - out_w, out_h, 0, out_hw); + stride_w, stride_h, dilation_w, dilation_h, pad_w0, pad_w1, pad_h0, pad_h1, + out_w, out_h, 0, out_hw); /* gemm */ __fp16* cur_kernel = interleave_buf + g * (kernel_size * ((out_c + 3) & -4)); __fp16* cur_output = output_buf + (n * group + g) * output_size; - __fp16* cur_bias = biases_buf? (biases_buf + g * out_c) : NULL; + __fp16* cur_bias = biases_buf ? (biases_buf + g * out_c) : NULL; hgemm_set(col_buf, cur_kernel, cur_bias, cur_output, kernel_size, 0, sgemm_set_chan, out_hw, fused_relu, num_thread, cpu_affinity); - if(sgemm_set_remain) + if (sgemm_set_remain) { hgemm4x4(col_buf, cur_kernel, cur_bias, cur_output, kernel_size, sgemm_set_chan, out_c, out_hw, fused_relu, num_thread, cpu_affinity); } diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_dilation_kernel_arm.h b/source/device/cpu/op/conv/cortex-a/conv_dw_dilation_kernel_arm.h index dad13a6f8..f4d93b091 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_dw_dilation_kernel_arm.h +++ b/source/device/cpu/op/conv/cortex-a/conv_dw_dilation_kernel_arm.h @@ -65,8 +65,7 @@ int conv_dw_dilation_run(float* input_buf, float* weight_buf, float* bias, float tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[5]), vld1q_f32(input_buf_c + h * input_w + w + pad)); tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[6]), vld1q_f32(input_buf_c + (h + pad) * input_w + w - pad)); - tmp_4 = - vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[7]), vld1q_f32(input_buf_c + (h + pad) * input_w + w)); + tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[7]), vld1q_f32(input_buf_c + (h + pad) * input_w + w)); tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[8]), vld1q_f32(input_buf_c + (h + pad) * input_w + w + pad)); tmp_4 = vector_activation(tmp_4, activation); @@ -115,8 +114,7 @@ int conv_dw_dilation_run(float* input_buf, float* weight_buf, float* bias, float tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[0]), vld1q_f32(input_buf_c + (h - pad) * input_w + w - pad)); - tmp_4 = - vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[1]), vld1q_f32(input_buf_c + (h - pad) * input_w + w)); + tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[1]), vld1q_f32(input_buf_c + (h - pad) * input_w + w)); tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[2]), vld1q_f32(input_buf_c + (h - pad) * input_w + w + pad)); tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[3]), vld1q_f32(input_buf_c + h * input_w + w - pad)); @@ -124,8 +122,7 @@ int conv_dw_dilation_run(float* input_buf, float* weight_buf, float* bias, float tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[5]), vld1q_f32(input_buf_c + h * input_w + w + pad)); tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[6]), vld1q_f32(input_buf_c + (h + pad) * input_w + w - pad)); - tmp_4 = - vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[7]), vld1q_f32(input_buf_c + (h + pad) * input_w + w)); + tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[7]), vld1q_f32(input_buf_c + (h + pad) * input_w + w)); tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[8]), vld1q_f32(input_buf_c + (h + pad) * input_w + w + pad)); tmp_4 = vector_activation(tmp_4, activation); @@ -177,8 +174,7 @@ int conv_dw_dilation_run(float* input_buf, float* weight_buf, float* bias, float tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[0]), vld1q_f32(input_buf_c + (h - pad) * input_w + w - pad)); - tmp_4 = - vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[1]), vld1q_f32(input_buf_c + (h - pad) * input_w + w)); + tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[1]), vld1q_f32(input_buf_c + (h - pad) * input_w + w)); tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[2]), vld1q_f32(input_buf_c + (h - pad) * input_w + w + pad)); tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[3]), vld1q_f32(input_buf_c + h * input_w + w - pad)); diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_hcl_arm.c b/source/device/cpu/op/conv/cortex-a/conv_dw_hcl_arm.c index a59389549..cd18faf5d 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_dw_hcl_arm.c +++ b/source/device/cpu/op/conv/cortex-a/conv_dw_hcl_arm.c @@ -50,8 +50,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* get cpu affinity */ conv_priv_info->cpu_type = exec_graph->cpu_affinity; @@ -67,7 +67,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct return -1; } } - /* int8 prerun */ + /* int8 prerun */ else if (exec_graph->mode == TENGINE_MODE_INT8) { /* do prerun */ @@ -100,8 +100,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* fp32 run */ if (exec_graph->mode == TENGINE_MODE_FP32) @@ -114,7 +114,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex } } #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else if (exec_graph->mode == TENGINE_MODE_FP16) + else if (exec_graph->mode == TENGINE_MODE_FP16) { if (conv_dw_fp16_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_param, num_thread, cpu_affinity) < 0) { @@ -124,7 +124,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex } } #endif - /* int8 run */ + /* int8 run */ else if (exec_graph->mode == TENGINE_MODE_INT8) { if (conv_dw_int8_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, cpu_affinity) < 0) @@ -145,7 +145,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* fp32 postrun */ if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8) @@ -157,7 +157,7 @@ static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struc return -1; } } - /* int8 postrun */ + /* int8 postrun */ else if (exec_graph->mode == TENGINE_MODE_INT8) { if (conv_dw_int8_postrun(conv_priv_info) < 0) @@ -171,17 +171,15 @@ static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struc return 0; } - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; /* init the private info data of convolution op */ - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )sys_malloc(sizeof(struct conv_priv_info)); + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)sys_malloc(sizeof(struct conv_priv_info)); if (conv_priv_info == NULL) { - return -1; } memset(conv_priv_info, 0, sizeof(struct conv_priv_info)); @@ -191,7 +189,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; sys_free(conv_priv_info); exec_node->ops_priv = NULL; return 0; @@ -199,7 +197,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) { - struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem; + struct conv_param* param = (struct conv_param*)exec_node->op.param_mem; struct node* ir_node = exec_node; struct graph* ir_graph = ir_node->graph; @@ -232,10 +230,10 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc if (input_tensor->data_type != TENGINE_DT_FP32 && input_tensor->data_type != TENGINE_DT_INT8) return 0; #endif - if (kernel_h == 7 && kernel_w == 7 && stride_h == 1 && stride_w == 1) // this is a bug, todo fix it. + if (kernel_h == 7 && kernel_w == 7 && stride_h == 1 && stride_w == 1) // this is a bug, todo fix it. return 0; - if (kernel_h == 2 && kernel_w == 2) // this is a bug, todo fix it. + if (kernel_h == 2 && kernel_w == 2) // this is a bug, todo fix it. return 0; if (dilation_h != 1 || dilation_w != 1) @@ -248,13 +246,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc } static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score -}; + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score}; int register_conv_dw_hcl_arm_op() { diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_k5_k7_kernel_arm.h b/source/device/cpu/op/conv/cortex-a/conv_dw_k5_k7_kernel_arm.h index c969d8d1f..7186b089c 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_dw_k5_k7_kernel_arm.h +++ b/source/device/cpu/op/conv/cortex-a/conv_dw_k5_k7_kernel_arm.h @@ -55,7 +55,7 @@ static float32x4_t vector_activation(float32x4_t tmp, int type) tmp = vmaxq_f32(tmp, zero); if (type > 0) { - float32x4_t max = vdupq_n_f32(( float )type); + float32x4_t max = vdupq_n_f32((float)type); tmp = vminq_f32(tmp, max); } } @@ -66,7 +66,7 @@ static float32x4_t vector_activation(float32x4_t tmp, int type) void depthwise_conv_k5s1(float* input, float* weight, float* bias, float* output, int input_h, int input_w, int channel, int output_h, int output_w, int activation, int num_thread) { -// #pragma omp parallel for num_threads(num_thread) + // #pragma omp parallel for num_threads(num_thread) for (int c = 0; c < channel; c++) { float* input_cur = (float*)input + c * input_h * input_w; @@ -91,7 +91,7 @@ void depthwise_conv_k5s2(float* input_buf, float* weight_buf, float* bias, float int mid_w = output_w - 2; int mid_w_block = mid_w & -4; -// #pragma omp parallel for num_threads(num_thread) + // #pragma omp parallel for num_threads(num_thread) for (int c = 0; c < channel; c++) { int w, h; @@ -685,16 +685,14 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line2, kernel_31_34); tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_38_41); tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_45_48); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); float32x4_t tmp_4_1 = vmulq_f32(line1, kernel_17_20); tmp_4_1 = vmlaq_f32(tmp_4_1, line2, kernel_24_27); tmp_4_1 = vmlaq_f32(tmp_4_1, line3, kernel_31_34); tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_38_41); tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_45_48); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; *output_buf_1++ = elem_activation(tmp1, activation); float32x4_t tmp_4_2 = vmulq_f32(line1, kernel_10_13); tmp_4_2 = vmlaq_f32(tmp_4_2, line2, kernel_17_20); @@ -702,8 +700,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_31_34); tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_38_41); tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_45_48); - tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + - vgetq_lane_f32(tmp_4_2, 3) + bias_c; + tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c; *output_buf_2++ = elem_activation(tmp2, activation); } float32x4_t kernel_9_12 = vextq_f32(kernel_8_11, kernel_12_15, 1); @@ -716,8 +713,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line2, kernel_30_33); tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_37_40); tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_44_47); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; tmp0 += weight_buf[27] * input_1[4]; tmp0 += weight_buf[34] * input_2[4]; tmp0 += weight_buf[41] * input_3[4]; @@ -728,8 +724,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_1 = vmlaq_f32(tmp_4_1, line3, kernel_30_33); tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_37_40); tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_44_47); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; tmp1 += weight_buf[20] * input_1[4]; tmp1 += weight_buf[27] * input_2[4]; tmp1 += weight_buf[34] * input_3[4]; @@ -742,8 +737,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_30_33); tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_37_40); tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_44_47); - tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + - vgetq_lane_f32(tmp_4_2, 3) + bias_c; + tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c; tmp2 += weight_buf[13] * input_1[4]; tmp2 += weight_buf[20] * input_2[4]; tmp2 += weight_buf[27] * input_3[4]; @@ -1033,12 +1027,9 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_2 = vmlaq_f32(tmp_4_2, tmp, kernel_38_41); tmp = vextq_f32(zero, line6_1, 3); tmp_4_2 = vmlaq_f32(tmp_4_2, tmp, kernel_45_48); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; - tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + - vgetq_lane_f32(tmp_4_2, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); *output_buf_1++ = elem_activation(tmp1, activation); *output_buf_2++ = elem_activation(tmp2, activation); @@ -1117,8 +1108,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line2, kernel_28_31); tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_35_38); tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_42_45); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; tmp0 += vgetq_lane_f32(line1_1, 0) * weight_buf[25]; tmp0 += vgetq_lane_f32(line2_1, 0) * weight_buf[32]; tmp0 += vgetq_lane_f32(line3_1, 0) * weight_buf[39]; @@ -1130,8 +1120,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_1 = vmlaq_f32(tmp_4_1, line3, kernel_28_31); tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_35_38); tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_42_45); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; tmp1 += vgetq_lane_f32(line1_1, 0) * weight_buf[18]; tmp1 += vgetq_lane_f32(line2_1, 0) * weight_buf[25]; tmp1 += vgetq_lane_f32(line3_1, 0) * weight_buf[32]; @@ -1145,8 +1134,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_28_31); tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_35_38); tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_42_45); - tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + - vgetq_lane_f32(tmp_4_2, 3) + bias_c; + tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c; tmp2 += vgetq_lane_f32(line1_1, 0) * weight_buf[11]; tmp2 += vgetq_lane_f32(line2_1, 0) * weight_buf[18]; tmp2 += vgetq_lane_f32(line3_1, 0) * weight_buf[25]; @@ -1167,8 +1155,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line2, kernel_28_31); tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_35_38); tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_42_45); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); float32x4_t tmp_4_1 = vmulq_f32(line1, kernel_14_17); @@ -1176,8 +1163,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_1 = vmlaq_f32(tmp_4_1, line3, kernel_28_31); tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_35_38); tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_42_45); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; *output_buf_1++ = elem_activation(tmp1, activation); float32x4_t tmp_4_2 = vmulq_f32(line1, kernel_7_10); @@ -1186,8 +1172,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_28_31); tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_35_38); tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_42_45); - tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + - vgetq_lane_f32(tmp_4_2, 3) + bias_c; + tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c; *output_buf_2++ = elem_activation(tmp2, activation); } float32x4_t kernel_1_4 = vextq_f32(kernel_0_3, kernel_4_7, 1); @@ -1222,8 +1207,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_31_34); tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_38_41); tmp_4_0 = vmlaq_f32(tmp_4_0, line7, kernel_45_48); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); } line1_1 = vld1q_f32(input_1 + 4); @@ -1241,8 +1225,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_30_33); tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_37_40); tmp_4_0 = vmlaq_f32(tmp_4_0, line7, kernel_44_47); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; tmp0 += vgetq_lane_f32(line1_1, 0) * weight_buf[6]; tmp0 += vgetq_lane_f32(line2_1, 0) * weight_buf[13]; @@ -1424,8 +1407,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, tmp, kernel_38_41); tmp = vextq_f32(zero, line7_1, 3); tmp_4_0 = vmlaq_f32(tmp_4_0, tmp, kernel_45_48); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); line1 = vextq_f32(line1, line1_1, 1); line2 = vextq_f32(line2, line2_1, 1); @@ -1483,8 +1465,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31); tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_35_38); tmp_4_0 = vmlaq_f32(tmp_4_0, line7, kernel_42_45); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; tmp0 += vgetq_lane_f32(line1_1, 0) * weight_buf[4]; tmp0 += vgetq_lane_f32(line2_1, 0) * weight_buf[11]; tmp0 += vgetq_lane_f32(line3_1, 0) * weight_buf[18]; @@ -1509,8 +1490,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31); tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_35_38); tmp_4_0 = vmlaq_f32(tmp_4_0, line7, kernel_42_45); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); } } @@ -1536,23 +1516,20 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_24_27); tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_31_34); tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_38_41); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); float32x4_t tmp_4_1 = vmulq_f32(line2, kernel_3_6); tmp_4_1 = vmlaq_f32(tmp_4_1, line3, kernel_10_13); tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_17_20); tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_24_27); tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_31_34); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; *output_buf_1++ = elem_activation(tmp1, activation); float32x4_t tmp_4_2 = vmulq_f32(line3, kernel_3_6); tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_10_13); tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_17_20); tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_24_27); - tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + - vgetq_lane_f32(tmp_4_2, 3) + bias_c; + tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c; *output_buf_2++ = elem_activation(tmp2, activation); } line1_1 = vld1q_f32(input_1 + 4); @@ -1569,8 +1546,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_23_26); tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_30_33); tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_37_40); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; tmp0 += vgetq_lane_f32(line1_1, 0) * weight_buf[6]; tmp0 += vgetq_lane_f32(line2_1, 0) * weight_buf[13]; tmp0 += vgetq_lane_f32(line3_1, 0) * weight_buf[20]; @@ -1584,8 +1560,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_16_19); tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_23_26); tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_30_33); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; tmp1 += vgetq_lane_f32(line2_1, 0) * weight_buf[6]; tmp1 += vgetq_lane_f32(line3_1, 0) * weight_buf[13]; tmp1 += vgetq_lane_f32(line4_1, 0) * weight_buf[20]; @@ -1597,8 +1572,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_9_12); tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_16_19); tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_23_26); - tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + - vgetq_lane_f32(tmp_4_2, 3) + bias_c; + tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c; tmp2 += vgetq_lane_f32(line3_1, 0) * weight_buf[6]; tmp2 += vgetq_lane_f32(line4_1, 0) * weight_buf[13]; tmp2 += vgetq_lane_f32(line5_1, 0) * weight_buf[20]; @@ -1871,14 +1845,11 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, tmp, kernel_38_41); tmp_4_1 = vmlaq_f32(tmp_4_1, tmp, kernel_31_34); tmp_4_2 = vmlaq_f32(tmp_4_2, tmp, kernel_24_27); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; *output_buf_1++ = elem_activation(tmp1, activation); - tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + - vgetq_lane_f32(tmp_4_2, 3) + bias_c; + tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c; *output_buf_2++ = elem_activation(tmp2, activation); line1 = vextq_f32(line1, line1_1, 1); line2 = vextq_f32(line2, line2_1, 1); @@ -1955,8 +1926,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_21_24); tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31); tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_35_38); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; tmp0 += vgetq_lane_f32(line1_1, 0) * weight_buf[4]; tmp0 += vgetq_lane_f32(line2_1, 0) * weight_buf[11]; tmp0 += vgetq_lane_f32(line3_1, 0) * weight_buf[18]; @@ -1969,8 +1939,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_14_17); tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_21_24); tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_28_31); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; tmp1 += vgetq_lane_f32(line2_1, 0) * weight_buf[4]; tmp1 += vgetq_lane_f32(line3_1, 0) * weight_buf[11]; tmp1 += vgetq_lane_f32(line4_1, 0) * weight_buf[18]; @@ -1981,8 +1950,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_7_10); tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_14_17); tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_21_24); - tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + - vgetq_lane_f32(tmp_4_2, 3) + bias_c; + tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c; tmp2 += vgetq_lane_f32(line3_1, 0) * weight_buf[4]; tmp2 += vgetq_lane_f32(line4_1, 0) * weight_buf[11]; tmp2 += vgetq_lane_f32(line5_1, 0) * weight_buf[18]; @@ -2003,23 +1971,20 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_21_24); tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31); tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_35_38); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); float32x4_t tmp_4_1 = vmulq_f32(line2, kernel_0_3); tmp_4_1 = vmlaq_f32(tmp_4_1, line3, kernel_7_10); tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_14_17); tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_21_24); tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_28_31); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; *output_buf_1++ = elem_activation(tmp1, activation); float32x4_t tmp_4_2 = vmulq_f32(line3, kernel_0_3); tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_7_10); tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_14_17); tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_21_24); - tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + - vgetq_lane_f32(tmp_4_2, 3) + bias_c; + tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c; *output_buf_2++ = elem_activation(tmp2, activation); } } @@ -2041,7 +2006,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output int mid_block = mid_w >> 2; int w = 0; -//#pragma omp parallel for num_threads(num_thread) + //#pragma omp parallel for num_threads(num_thread) for (int c = 0; c < channel; c++) { float tmp0, tmp1; @@ -2086,8 +2051,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line2, kernel_31_34); tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_38_41); tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_45_48); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); float32x4_t tmp_4_1 = vmulq_f32(line1, kernel_10_13); tmp_4_1 = vmlaq_f32(tmp_4_1, line2, kernel_17_20); @@ -2095,8 +2059,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_31_34); tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_38_41); tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_45_48); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; *output_buf_1++ = elem_activation(tmp1, activation); } @@ -2331,8 +2294,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line2_1, kernel_31_34); tmp_4_0 = vmlaq_f32(tmp_4_0, line3_1, kernel_38_41); tmp_4_0 = vmlaq_f32(tmp_4_0, line4_1, kernel_45_48); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); float32x4_t tmp_4_1 = vmulq_f32(line1, kernel_0789); tmp_4_1 = vmlaq_f32(tmp_4_1, line2, kernel_0141516); @@ -2346,8 +2308,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_1 = vmlaq_f32(tmp_4_1, line4_1, kernel_31_34); tmp_4_1 = vmlaq_f32(tmp_4_1, line5_1, kernel_38_41); tmp_4_1 = vmlaq_f32(tmp_4_1, line6_1, kernel_45_48); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; *output_buf_1++ = elem_activation(tmp1, activation); line1 = vextq_f32(line1, line1_1, 2); @@ -2423,8 +2384,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line2, kernel_28_31); tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_35_38); tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_42_45); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); float32x4_t tmp_4_1 = vmulq_f32(line1, kernel_7_10); tmp_4_1 = vmlaq_f32(tmp_4_1, line2, kernel_14_17); @@ -2432,8 +2392,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_28_31); tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_35_38); tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_42_45); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; *output_buf_1++ = elem_activation(tmp1, activation); } } @@ -2497,8 +2456,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_31_34); tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_38_41); tmp_4_0 = vmlaq_f32(tmp_4_0, line7, kernel_45_48); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); } line1_1 = vld1q_f32(input_1 + 4); @@ -2702,8 +2660,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line5_1, kernel_31_34); tmp_4_0 = vmlaq_f32(tmp_4_0, line6_1, kernel_38_41); tmp_4_0 = vmlaq_f32(tmp_4_0, line7_1, kernel_45_48); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); line1 = vextq_f32(line1, line1_1, 2); @@ -2774,8 +2731,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31); tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_35_38); tmp_4_0 = vmlaq_f32(tmp_4_0, line7, kernel_42_45); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); } else @@ -2824,15 +2780,13 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_24_27); tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_31_34); tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_38_41); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); float32x4_t tmp_4_1 = vmulq_f32(line3, kernel_3_6); tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_10_13); tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_17_20); tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_24_27); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; *output_buf_1++ = elem_activation(tmp1, activation); } line1_1 = vld1q_f32(input_1 + 4); @@ -3052,8 +3006,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line4_1, kernel_24_27); tmp_4_0 = vmlaq_f32(tmp_4_0, line5_1, kernel_31_34); tmp_4_0 = vmlaq_f32(tmp_4_0, line6_1, kernel_38_41); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); float32x4_t tmp_4_1 = vmulq_f32(line3, kernel_0012); @@ -3064,8 +3017,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_1 = vmlaq_f32(tmp_4_1, line4_1, kernel_10_13); tmp_4_1 = vmlaq_f32(tmp_4_1, line5_1, kernel_17_20); tmp_4_1 = vmlaq_f32(tmp_4_1, line6_1, kernel_24_27); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; *output_buf_1++ = elem_activation(tmp1, activation); line1 = vextq_f32(line1, line1_1, 2); @@ -3143,15 +3095,13 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_21_24); tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31); tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_35_38); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); float32x4_t tmp_4_1 = vmulq_f32(line3, kernel_0_3); tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_7_10); tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_14_17); tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_21_24); - tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + - vgetq_lane_f32(tmp_4_1, 3) + bias_c; + tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c; *output_buf_1++ = elem_activation(tmp1, activation); } } @@ -3205,8 +3155,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_17_20); tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_24_27); tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_31_34); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); } line1_1 = vld1q_f32(input_1 + 4); @@ -3356,8 +3305,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line3_1, kernel_17_20); tmp_4_0 = vmlaq_f32(tmp_4_0, line4_1, kernel_24_27); tmp_4_0 = vmlaq_f32(tmp_4_0, line5_1, kernel_31_34); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); line1 = vextq_f32(line1, line1_1, 2); @@ -3414,8 +3362,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_14_17); tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_21_24); tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31); - tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + - vgetq_lane_f32(tmp_4_0, 3) + bias_c; + tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c; *output_buf++ = elem_activation(tmp0, activation); } } diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.c b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.c index 7ed499544..3ee41e0bb 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.c +++ b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.c @@ -36,7 +36,6 @@ #include "utility/log.h" #include "device/cpu/cpu_node.h" - static void pad_0_align_2D(float* dst, float* src, int m, int n, int m_align, int n_align, int pad_h, int pad_w) { int i; @@ -220,7 +219,7 @@ static void DirectConv(float* input_buf, int input_h, int input_w, float* output #endif int conv_dw_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, - struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param) + struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param) { int batch = input_tensor->dims[0]; int input_c = input_tensor->dims[1]; @@ -237,7 +236,7 @@ int conv_dw_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, priv_info->input_pad = sys_malloc(batch * input_c * padded_in_h * padded_in_w * sizeof(float)); memset(priv_info->input_pad, 0, batch * input_c * padded_in_h * padded_in_w * sizeof(float)); - + return 0; } @@ -277,14 +276,14 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, struc int padded_in_w = in_w + param->pad_w0 + param->pad_w1; /* buffer addr */ - float* input_buf = ( float* )input_tensor->data; - float* kernel_buf = ( float* )filter_tensor->data; - float* output_buf = ( float* )output_tensor->data; + float* input_buf = (float*)input_tensor->data; + float* kernel_buf = (float*)filter_tensor->data; + float* output_buf = (float*)output_tensor->data; float* biases_buf = NULL; if (bias_tensor) - biases_buf = ( float* )bias_tensor->data; + biases_buf = (float*)bias_tensor->data; - for (int n = 0; n < batch; n++) // batch size + for (int n = 0; n < batch; n++) // batch size { float* cur_input = input_buf + n * input_size * group; float* cur_output = output_buf + n * output_size * group; @@ -304,7 +303,7 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, struc if (stride_h == 1) { pad_0_align_3D((float*)conv_info->input_pad + n * group * padded_in_h * padded_in_w, cur_input, - in_h, in_w, padded_in_h, padded_in_w, group, param->pad_h0, param->pad_w0); + in_h, in_w, padded_in_h, padded_in_w, group, param->pad_h0, param->pad_w0); depthwise_conv_k5s1((float*)conv_info->input_pad, kernel_buf, biases_buf, cur_output, padded_in_h, padded_in_w, group, out_h, out_w, act_type, num_thread); } diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.h b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.h index 17030f3cd..0e53cb03b 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.h +++ b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.h @@ -30,21 +30,18 @@ #include "graph/node.h" #include "graph/graph.h" - /* float32 */ int conv_dw_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, - struct conv_priv_info* info, struct conv_param* param); + struct conv_priv_info* info, struct conv_param* param); int conv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity) - ; + struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity); int conv_dw_postrun(struct conv_priv_info* priv_info); /* int8 */ int conv_dw_int8_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, - struct conv_priv_info* info, struct conv_param* param); + struct conv_priv_info* info, struct conv_param* param); int conv_dw_int8_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param, - int num_thread, int cpu_affinity) - ; + struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param, + int num_thread, int cpu_affinity); int conv_dw_int8_postrun(struct conv_priv_info* priv_info); #endif diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.c b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.c index ffbb6eb1d..21b7e583e 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.c +++ b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.c @@ -32,21 +32,20 @@ #include "utility/sys_port.h" - #ifdef __aarch64__ void depthwise_k3s1p1_int8_a72(int8_t* input, int8_t* kernel, int8_t* out, int* bias, long out_h, long out_w, - long multi, long shift, long input_w, long act_min, long act_max); + long multi, long shift, long input_w, long act_min, long act_max); void depthwise_k3s2p1_int8_a72(int8_t* input, int8_t* kernel, int8_t* out, int* bias, long out_h, long out_w, - long multi, long shift, long input_w, long act_min, long act_max); + long multi, long shift, long input_w, long act_min, long act_max); #else void depthwise_k3s1_int8(int8_t* input, int8_t* kernel, int8_t* out, int* bias, int out_h, int out_w, - int multi, int shift, int input_w, int act_min, int act_max); + int multi, int shift, int input_w, int act_min, int act_max); void depthwise_k3s2_int8(int8_t* input, int8_t* kernel, int8_t* out, int* bias, int out_h, int out_w, - int multi, int shift, int input_w, int act_min, int act_max); + int multi, int shift, int input_w, int act_min, int act_max); #endif int conv_dw_int8_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, - struct conv_priv_info* priv_info, struct conv_param* param) + struct conv_priv_info* priv_info, struct conv_param* param) { int batch = input_tensor->dims[0]; int in_c = input_tensor->dims[1]; @@ -67,19 +66,19 @@ int conv_dw_int8_prerun(struct tensor* input_tensor, struct tensor* filter_tenso priv_info->activation_min = -127; priv_info->activation_max = 127; /* set activation */ - if(param->activation >= 0) + if (param->activation >= 0) { priv_info->activation_min = 0; - if(param->activation == 1) + if (param->activation == 1) priv_info->activation_max = round(1.0 / output_scale); - if(param->activation == 6) + if (param->activation == 6) priv_info->activation_max = round(6.0 / output_scale); - if(priv_info->activation_max > 127) + if (priv_info->activation_max > 127) priv_info->activation_max = 127; } - for(int i=0; iactivation; pads[0] = param->pad_h0; pads[1] = param->pad_w0; - pads[2] = param->pad_h1; - pads[3] = param->pad_w1; + pads[2] = param->pad_h1; + pads[3] = param->pad_w1; int batch = input_tensor->dims[0]; int in_c = input_tensor->dims[1] / group; @@ -223,9 +222,9 @@ int conv_dw_int8_run(struct tensor* input_tensor, struct tensor* filter_tensor, int activation_max = priv_info->activation_max; /* buffer addr */ - int8_t* input_buf = ( int8_t* )input_tensor->data; - int8_t* kernel_buf = ( int8_t* )filter_tensor->data; - int8_t* output_buf = ( int8_t* )output_tensor->data; + int8_t* input_buf = (int8_t*)input_tensor->data; + int8_t* kernel_buf = (int8_t*)filter_tensor->data; + int8_t* output_buf = (int8_t*)output_tensor->data; int32_t* biases_buf = NULL; if (bias_tensor != NULL) { @@ -234,13 +233,13 @@ int conv_dw_int8_run(struct tensor* input_tensor, struct tensor* filter_tensor, int* multi = priv_info->multi; int* q_shift = priv_info->q_shift; - for (int n = 0; n < batch; n++) // batch size + for (int n = 0; n < batch; n++) // batch size { int8_t* input = input_buf + n * input_size * group; int8_t* kernel = kernel_buf + n * kernel_size * group; int8_t* output = output_buf + n * output_size * group; - conv_dw_int8_direct(input, kernel, output, biases_buf, in_h, in_w, - out_h, out_w, in_c * group, stride_h, pads, multi, q_shift, + conv_dw_int8_direct(input, kernel, output, biases_buf, in_h, in_w, + out_h, out_w, in_c * group, stride_h, pads, multi, q_shift, activation_min, activation_max, num_thread, cpu_affinity); } return 0; diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.h b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.h index 52bdc89aa..d5d685c6d 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.h +++ b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.h @@ -32,12 +32,11 @@ #include "graph/node.h" #include "graph/graph.h" - int conv_dw_int8_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, - struct conv_priv_info* priv_info, struct conv_param* param); + struct conv_priv_info* priv_info, struct conv_param* param); int conv_dw_int8_postrun(struct conv_priv_info* priv_info); int conv_dw_int8_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param, - int num_thread, int cpu_affinity); + struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param, + int num_thread, int cpu_affinity); #endif \ No newline at end of file diff --git a/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c b/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c index 977c8d4e2..5958c7c38 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c +++ b/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c @@ -40,7 +40,6 @@ #include - static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -49,8 +48,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* get cpu affinity */ conv_priv_info->cpu_type = exec_graph->cpu_affinity; @@ -69,7 +68,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct if (conv_hcl_set_shared_pack4_mem && exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size) { if (conv_hcl_set_shared_pack4_mem(conv_priv_info, exec_graph->shared_pack4_mem, - exec_graph->shared_pack4_mem_size) < 0) + exec_graph->shared_pack4_mem_size) + < 0) { TLOG_ERR("hcl conv: set shared pack4 memory failed\n"); return -1; @@ -148,14 +148,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (ir_node->input_num > 2) bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* fp32 run */ if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8) { if (conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, - cpu_affinity) < 0) + cpu_affinity) + < 0) { TLOG_ERR("hcl conv run failed\n"); return -1; @@ -177,7 +178,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex else if (exec_graph->mode == TENGINE_MODE_INT8) { if (int8_conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, - cpu_affinity) < 0) + cpu_affinity) + < 0) { TLOG_ERR("hcl conv int8 run failed\n"); return -1; @@ -201,7 +203,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; /* dynamic get the shape of output tensor */ int n = input_tensor->dims[0]; @@ -263,10 +265,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc } else { - out_h = - (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) / - conv_param->stride_h + - 1; + out_h = (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) / conv_param->stride_h + 1; } if (conv_param->pad_w0 < 0) @@ -289,10 +288,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc } else { - out_w = - (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) / - conv_param->stride_w + - 1; + out_w = (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) / conv_param->stride_w + 1; } int dims[4]; @@ -305,7 +301,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc dims[2] = out_h; dims[3] = out_w; - for (int i=0; i<4; i++) + for (int i = 0; i < 4; i++) { if (dims[i] == 0) dims[i] = 1; @@ -322,7 +318,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc dims[2] = out_w; dims[3] = out_c; - for (int i=0; i<4; i++) + for (int i = 0; i < 4; i++) { if (dims[i] == 0) dims[i] = 1; @@ -337,7 +333,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* fp32 postrun */ if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8) @@ -390,10 +386,10 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; /* init the private info data of convolution op */ - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )sys_malloc(sizeof(struct conv_priv_info)); + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)sys_malloc(sizeof(struct conv_priv_info)); if (conv_priv_info == NULL) { return -1; @@ -429,7 +425,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; sys_free(conv_priv_info); exec_node->ops_priv = NULL; @@ -442,7 +438,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc struct graph* ir_graph = ir_node->graph; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem; + struct conv_param* param = (struct conv_param*)exec_node->op.param_mem; int group = param->group; int kernel_h = param->kernel_h; int kernel_w = param->kernel_w; @@ -450,7 +446,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc int out_c = output_tensor->dims[1] / group; /* todo support int8/fp16 */ -#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC if (input_tensor->data_type != TENGINE_DT_FP32 && input_tensor->data_type != TENGINE_DT_FP16) return 0; @@ -466,14 +462,13 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc } static struct node_ops hcl_node_ops = { - .prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score -}; + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score}; int register_conv_hcl_arm_op() { diff --git a/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.c b/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.c index 9e91564d9..dc10dec4c 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.c +++ b/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.c @@ -118,9 +118,9 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern /* kernel interleave */ static void interleave(struct tensor* filter, struct conv_priv_info* priv_info, struct conv_param* param) { - int group = param->group; + int group = param->group; int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3]; - int out_chan = filter->dims[0] / group; + int out_chan = filter->dims[0] / group; int out_chan_align4 = (out_chan + 3) / 4 * 4; int kernel_size_algin = kernel_size * out_chan_align4; @@ -130,7 +130,7 @@ static void interleave(struct tensor* filter, struct conv_priv_info* priv_info, float* interleave_buf = priv_info->interleave_buffer; for (int g = 0; g < group; g++) { - float* cur_kernel = kernel + g * kernel_size_group; + float* cur_kernel = kernel + g * kernel_size_group; float* cur_interleave = interleave_buf + g * kernel_size_algin; interleave_kernel(cur_kernel, cur_interleave, out_chan, kernel_size); } @@ -145,7 +145,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k int in_xy = in_w * in_h; int out_xy = out_w * out_h; int col_end3 = out_xy & 3; - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int col_i = 0; col_i < out_xy - 3; col_i += 4) { float* cur_col = col + col_i * kernel_size; @@ -179,7 +179,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k int out_xy = out_w * out_h; int col_end3 = out_xy & 3; int is_pad0 = (pad_w0 == 0) && (pad_h0 == 0) && (pad_w1 == 0) && (pad_h1 == 0); - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int col_i = 0; col_i < (out_xy & -4); col_i += 4) { float* cur_col = col + col_i * kernel_size; @@ -255,7 +255,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k else { int out_xy = out_w * out_h; - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int col_i = 0; col_i < out_xy - 3; col_i += 4) { int kernel_size = k_w * k_h * in_c; @@ -318,7 +318,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k } static void sgemm_set(float* col, float* kernel, float* biases, float* output, int kernel_size, int col_start, - int col_end, int kernel_start, int kernel_end, int output_xy, int activation, int num_thread, int cpu_affinity) + int col_end, int kernel_start, int kernel_end, int output_xy, int activation, int num_thread, int cpu_affinity) { int col_end3 = col_end & 0x3; int nn_outch = kernel_end / PER_OUT_CHAN; @@ -327,21 +327,21 @@ static void sgemm_set(float* col, float* kernel, float* biases, float* output, i for (int pp = 0; pp < nn_outch; pp++) { int p = pp * PER_OUT_CHAN; - float* biasptr = biases ? ( float* )(biases + p) : NULL; - float* kernel_tmp = ( float* )(kernel + p * kernel_size); - float* output_tmp = ( float* )(output + p * output_xy); + float* biasptr = biases ? (float*)(biases + p) : NULL; + float* kernel_tmp = (float*)(kernel + p * kernel_size); + float* output_tmp = (float*)(output + p * output_xy); for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) #ifdef __aarch64__ { - float* col_tmp = ( float* )(col + col_line * kernel_size); + float* col_tmp = (float*)(col + col_line * kernel_size); sgemm_4x16_a72(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0); } if (col_end3) { int col_line = col_end & -4; float result[4 * PER_OUT_CHAN]; - float* col_tmp = ( float* )(col + col_line * kernel_size); + float* col_tmp = (float*)(col + col_line * kernel_size); sgemm_4x16_a72(biasptr, col_tmp, kernel_tmp, kernel_size, result, 4, activation, 0); @@ -355,14 +355,14 @@ static void sgemm_set(float* col, float* kernel, float* biases, float* output, i } #else { - float* col_tmp = ( float* )(col + col_line * kernel_size); + float* col_tmp = (float*)(col + col_line * kernel_size); sgemm_4x12_a17(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0); } if (col_end3) { int col_line = col_end & -4; float result[4 * PER_OUT_CHAN]; - float* col_tmp = ( float* )(col + col_line * kernel_size); + float* col_tmp = (float*)(col + col_line * kernel_size); sgemm_4x12_a17(biasptr, col_tmp, kernel_tmp, kernel_size, result, 4, activation, 0); @@ -385,16 +385,16 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in int kernel_end3 = kernel_end & 0x3; #pragma omp parallel for num_threads(num_thread) - for (int kernel_num = (kernel_start & -4); kernel_num < (kernel_end & -4); kernel_num += 4) + for (int kernel_num = (kernel_start & -4); kernel_num < (kernel_end & -4); kernel_num += 4) { float *cur_col, *cur_kernel, *cur_output; - float* cur_biases = biases ? ( float* )(biases + kernel_num) : NULL; + float* cur_biases = biases ? (float*)(biases + kernel_num) : NULL; - cur_kernel = ( float* )(kernel + kernel_num * kernel_size); - cur_output = ( float* )(output + kernel_num * output_xy); + cur_kernel = (float*)(kernel + kernel_num * kernel_size); + cur_output = (float*)(output + kernel_num * output_xy); for (int col_line = 0; col_line < (col_end & -4); col_line += 4) { - cur_col = ( float* )(col + col_line * kernel_size); + cur_col = (float*)(col + col_line * kernel_size); #ifdef __aarch64__ sgemm_4x4_a72(cur_biases, cur_col, cur_kernel, kernel_size, cur_output + col_line, output_xy, activation, 0); #else @@ -405,7 +405,7 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in { float result[16]; int col_line = col_end & -4; - cur_col = ( float* )(col + col_line * kernel_size); + cur_col = (float*)(col + col_line * kernel_size); #ifdef __aarch64__ sgemm_4x4_a72(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); #else @@ -421,14 +421,14 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in if (kernel_end3) { int kernel_num = (kernel_end & -4); - float* cur_biases = biases ? ( float* )(biases + kernel_num) : NULL; - float* cur_kernel = ( float* )(kernel + kernel_num * kernel_size); + float* cur_biases = biases ? (float*)(biases + kernel_num) : NULL; + float* cur_kernel = (float*)(kernel + kernel_num * kernel_size); #pragma omp parallel for num_threads(num_thread) for (int col_line = 0; col_line < (col_end & -4); col_line += 4) { float result[16]; - float* cur_col = ( float* )(col + col_line * kernel_size); + float* cur_col = (float*)(col + col_line * kernel_size); #ifdef __aarch64__ sgemm_4x4_a72(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); #else @@ -443,7 +443,7 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in { float result[16]; int col_line = col_end & -4; - float* cur_col = ( float* )(col + col_line * kernel_size); + float* cur_col = (float*)(col + col_line * kernel_size); #ifdef __aarch64__ sgemm_4x4_a72(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); #else @@ -487,15 +487,15 @@ static int winograd_support(struct conv_param* param, int in_h, int in_w) */ int conv_hcl_get_shared_mem_size(struct tensor* input, struct tensor* output, struct conv_param* param) { - int in_h = input->dims[2]; - int in_w = input->dims[3]; + int in_h = input->dims[2]; + int in_w = input->dims[3]; int out_h = output->dims[2]; int out_w = output->dims[3]; int group = param->group; - int input_chan = param->input_channel / group; + int input_chan = param->input_channel / group; int kernel_size = input_chan * param->kernel_h * param->kernel_w; - int out_cstep = out_h * out_w; // channel cstep, output_h * output_w - int elem_size = input->elem_size; // uint8/int8 is 1 byte, fp32 is 4 bytes + int out_cstep = out_h * out_w; // channel cstep, output_h * output_w + int elem_size = input->elem_size; // uint8/int8 is 1 byte, fp32 is 4 bytes out_cstep = (out_cstep + 3) / 4 * 4; int mem_size = elem_size * kernel_size * out_cstep + 128; @@ -512,7 +512,7 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param) int out_chan = filter->dims[0] / group; int out_chan_align4 = (out_chan + 3) / 4 * 4; int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3]; - int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128; // caution + int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128; // caution return mem_size; } @@ -552,7 +552,7 @@ int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, s if (priv_info->winograd) { #ifdef __aarch64__ - if(in_c >= 256) + if (in_c >= 256) return wino_conv_hcl_prerun_1(input_tensor, filter_tensor, output_tensor, priv_info, param); else #endif @@ -564,7 +564,7 @@ int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, s { int mem_size = conv_hcl_get_shared_mem_size(input_tensor, output_tensor, param); void* mem = sys_malloc(mem_size); - priv_info->im2col_buffer = mem; + priv_info->im2col_buffer = mem; priv_info->im2col_buffer_size = mem_size; } @@ -573,7 +573,7 @@ int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, s { int mem_size = get_private_mem_size(filter_tensor, param); void* mem = sys_malloc(mem_size); - priv_info->interleave_buffer = mem; + priv_info->interleave_buffer = mem; priv_info->interleave_buffer_size = mem_size; } @@ -634,7 +634,7 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru if (priv_info->winograd) { #ifdef __aarch64__ - if(in_c >= 256) + if (in_c >= 256) return wino_conv_hcl_run_1(input_tensor, filter_tensor, bias_tensor, output_tensor, priv_info, param, num_thread, cpu_affinity); else #endif @@ -650,13 +650,13 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru int output_image_size = output_tensor->dims[1] * output_tensor->dims[2] * output_tensor->dims[3]; /* buffer addr */ - float* input_buf = ( float* )input_tensor->data; - float* output_buf = ( float* )output_tensor->data; + float* input_buf = (float*)input_tensor->data; + float* output_buf = (float*)output_tensor->data; float* biases_buf = NULL; if (bias_tensor != NULL) - biases_buf = ( float* )bias_tensor->data; - float* col_buf = ( float* )priv_info->im2col_buffer; - float* interleave_buf = ( float* )priv_info->interleave_buffer; + biases_buf = (float*)bias_tensor->data; + float* col_buf = (float*)priv_info->im2col_buffer; + float* interleave_buf = (float*)priv_info->interleave_buffer; /* block size split parameter */ int L2_CACHE_SIZE = ((cpu_affinity == TENGINE_CLUSTER_LITTLE) ? 512 : 1024) * 1024; @@ -666,7 +666,7 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru int sgemm_set_chan = out_c / PER_OUT_CHAN * PER_OUT_CHAN; int sgemm_set_remain = out_c % PER_OUT_CHAN; - for (int n = 0; n < batch; n++) // batch size + for (int n = 0; n < batch; n++) // batch size { for (int g = 0; g < group; g++) { @@ -677,19 +677,19 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru /* im2col */ im2col(cur_input, col_buf, in_c, in_w, in_h, kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h, - pad_w0, pad_w1, pad_h0, pad_h1, out_w, out_h, num_thread); + pad_w0, pad_w1, pad_h0, pad_h1, out_w, out_h, num_thread); - for(int col_i = 0; col_i < out_hw; col_i += col_cnt_l2) + for (int col_i = 0; col_i < out_hw; col_i += col_cnt_l2) { int col_start = col_i; int col_end = col_i + col_cnt_l2; col_end = col_end > out_hw ? out_hw : col_end; /* gemm */ sgemm_set(col_buf, cur_kernel, cur_bias, cur_output, kernel_size, col_start, col_end, 0, sgemm_set_chan, out_hw, act_type, - num_thread, cpu_affinity); + num_thread, cpu_affinity); if (sgemm_set_remain) sgemm4x4(col_buf, cur_kernel, cur_bias, cur_output, kernel_size, col_start, col_end, sgemm_set_chan, out_c, out_hw, - act_type, num_thread, cpu_affinity); + act_type, num_thread, cpu_affinity); } } } diff --git a/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.h b/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.h index 1c489e0c6..041b4980e 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.h +++ b/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.h @@ -31,7 +31,6 @@ #include "graph/node.h" #include "graph/graph.h" - /* float32 */ int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param); @@ -52,29 +51,29 @@ int conv_hcl_set_shared_pack4_mem(struct conv_priv_info* priv_info, void* mem, i /* fp16 */ #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -int fp16_conv_hcl_prerun(struct tensor* input_tensor, - struct tensor* filter_tensor, - struct tensor* output_tensor, - struct conv_priv_info* info, - struct conv_param* param) ; +int fp16_conv_hcl_prerun(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* output_tensor, + struct conv_priv_info* info, + struct conv_param* param); int fp16_conv_hcl_postrun(struct conv_priv_info* info); -int fp16_conv_hcl_run(struct tensor* input_tensor , struct tensor* filter_tensor ,struct tensor* bias_tensor , struct tensor* output_tensor , struct conv_priv_info* conv_info ,struct conv_param* param, int num_thread, int cpu_affinity) ; +int fp16_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity); -int fp16_conv_hcl_get_shared_mem_size(struct tensor* input_tensor ,struct tensor* output_tensor , struct conv_param* param) ; +int fp16_conv_hcl_get_shared_mem_size(struct tensor* input_tensor, struct tensor* output_tensor, struct conv_param* param); #endif /* int8 */ int int8_conv_hcl_get_shared_mem_size(struct tensor* input_tensor, struct tensor* output_tensor, - struct conv_param* param); + struct conv_param* param); int int8_conv_hcl_set_shared_mem(struct conv_priv_info* priv_info, void* mem, int mem_size); int int8_conv_hcl_set_shared_pack4_mem(struct conv_priv_info* priv_info, void* mem, int mem_size); int int8_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, - struct conv_priv_info* priv_info, struct conv_param* param); + struct conv_priv_info* priv_info, struct conv_param* param); int int8_conv_hcl_postrun(struct conv_priv_info* priv_info); int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param, - int num_thread, int cpu_affinity); + struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param, + int num_thread, int cpu_affinity); #endif diff --git a/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.c b/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.c index f96b8b5d6..6c17a77ac 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.c +++ b/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.c @@ -32,12 +32,11 @@ #include #include - #ifdef __aarch64__ void i8gemm_4x16_a72_int8(int* biases, int8_t* input, int8_t* kernel, long kernel_size, int8_t* output, - int* multi, long output_xy, int* shift, int activation_min, int activation_max); + int* multi, long output_xy, int* shift, int activation_min, int activation_max); void i8gemm_4x4_a72_int8(int* biases, int8_t* input, int8_t* kernel, long kernel_size, int8_t* output, - int* multi, long output_xy, int* shift, int activation_min, int activation_max); + int* multi, long output_xy, int* shift, int activation_min, int activation_max); void im2col_int8_1x1(int8_t* input, long input_xy, int8_t* col, long col_cnt, long input_chan); void im2col_int8_3x3(int8_t* input, long input_x, long input_y, long input_chan, int8_t* col, long stride); // col_start and col_end need to be 16 aligned @@ -50,10 +49,10 @@ static void i8gemm4x16(int8_t* col, int8_t* kernel, bool bias_term, int* biases, int kernel_size_aligned2 = (kernel_size + 1) & -2; #pragma omp parallel for num_threads(num_thread) - for(int kernel_num = (kernel_start & -16); kernel_num < (kernel_end & -16); kernel_num += 16) + for (int kernel_num = (kernel_start & -16); kernel_num < (kernel_end & -16); kernel_num += 16) { int* cur_biases = NULL; - if(bias_term) + if (bias_term) { cur_biases = biases + kernel_num; } @@ -67,24 +66,24 @@ static void i8gemm4x16(int8_t* col, int8_t* kernel, bool bias_term, int* biases, int8_t* cur_kernel = kernel + kernel_num * kernel_size_aligned2; int8_t* output_result = output + kernel_num * output_xy; - for(int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) + for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) { int8_t* cur_col = col + col_line * kernel_size_aligned2; - + i8gemm_4x16_a72_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, output_result + col_line, pmulti, - output_xy, pq_shift, activation_min, activation_max); + output_xy, pq_shift, activation_min, activation_max); } - if(col_end3) + if (col_end3) { int col_line = col_end & -4; int8_t* cur_col = col + col_line * kernel_size_aligned2; i8gemm_4x16_a72_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max); - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int j = 0; j < 4; j++) + for (int j = 0; j < 4; j++) { output_line[j] = output + (kernel_num + i * 4 + j) * output_xy + col_line; } @@ -94,14 +93,14 @@ static void i8gemm4x16(int8_t* col, int8_t* kernel, bool bias_term, int* biases, *(output_line[2] + 0) = result[i * 16 + 10]; *(output_line[3] + 0) = result[i * 16 + 15]; - if((col_end3) >= 2) + if ((col_end3) >= 2) { *(output_line[0] + 1) = result[i * 16 + 4]; *(output_line[1] + 1) = result[i * 16 + 1]; *(output_line[2] + 1) = result[i * 16 + 14]; *(output_line[3] + 1) = result[i * 16 + 11]; } - if((col_end3) == 3) + if ((col_end3) == 3) { *(output_line[0] + 2) = result[i * 16 + 8]; *(output_line[1] + 2) = result[i * 16 + 13]; @@ -123,10 +122,10 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, int kernel_size_aligned2 = (kernel_size + 1) & -2; #pragma omp parallel for num_threads(num_thread) - for(int kernel_num = kernel_start & -4; kernel_num < (kernel_end & -4); kernel_num += 4) + for (int kernel_num = kernel_start & -4; kernel_num < (kernel_end & -4); kernel_num += 4) { int* cur_biases = NULL; - if(bias_term) + if (bias_term) { cur_biases = biases + kernel_num; } @@ -140,21 +139,21 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, int8_t* cur_kernel = kernel + kernel_num * kernel_size_aligned2; int8_t* output_result = output + kernel_num * output_xy; - for(int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) + for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) { int8_t* cur_col = col + col_line * kernel_size_aligned2; i8gemm_4x4_a72_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, output_result + col_line, pmulti, - output_xy, pq_shift, activation_min, activation_max); + output_xy, pq_shift, activation_min, activation_max); } - if(col_end3) + if (col_end3) { int col_line = col_end & -4; int8_t* cur_col = col + col_line * kernel_size_aligned2; i8gemm_4x4_a72_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max); - for(int j = 0; j < 4; j++) + for (int j = 0; j < 4; j++) { output_line[j] = output + (kernel_num + j) * output_xy + col_line; } @@ -164,14 +163,14 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, *(output_line[2] + 0) = result[10]; *(output_line[3] + 0) = result[15]; - if(col_end3 >= 2) + if (col_end3 >= 2) { *(output_line[0] + 1) = result[4]; *(output_line[1] + 1) = result[1]; *(output_line[2] + 1) = result[14]; *(output_line[3] + 1) = result[11]; } - if(col_end3 == 3) + if (col_end3 == 3) { *(output_line[0] + 2) = result[8]; *(output_line[1] + 2) = result[13]; @@ -180,11 +179,11 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, } } } - if(kernel_end3) + if (kernel_end3) { int kernel_num = kernel_end & -4; int* cur_biases = NULL; - if(bias_term) + if (bias_term) { cur_biases = biases + kernel_num; } @@ -196,13 +195,13 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, int* pq_shift = q_shift + kernel_num; int8_t* cur_kernel = kernel + kernel_num * kernel_size_aligned2; - for(int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) + for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) { int8_t* cur_col = col + col_line * kernel_size_aligned2; i8gemm_4x4_a72_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max); - for(int j = 0; j < 4; j++) + for (int j = 0; j < 4; j++) { output_line[j] = output + (kernel_num + j) * output_xy + col_line; } @@ -212,14 +211,14 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, *(output_line[0] + 2) = result[8]; *(output_line[0] + 3) = result[12]; - if(kernel_end3 >= 2) + if (kernel_end3 >= 2) { *(output_line[1] + 0) = result[5]; *(output_line[1] + 1) = result[1]; *(output_line[1] + 2) = result[13]; *(output_line[1] + 3) = result[9]; } - if(kernel_end3 == 3) + if (kernel_end3 == 3) { *(output_line[2] + 0) = result[10]; *(output_line[2] + 1) = result[14]; @@ -227,37 +226,37 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, *(output_line[2] + 3) = result[6]; } } - if(col_end3) + if (col_end3) { int col_line = col_end & -4; int8_t* cur_col = col + col_line * kernel_size_aligned2; i8gemm_4x4_a72_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max); - for(int j = 0; j < 4; j++) + for (int j = 0; j < 4; j++) { output_line[j] = output + (kernel_num + j) * output_xy + col_line; } *(output_line[0] + 0) = result[0]; - if(col_end3 >= 2) + if (col_end3 >= 2) *(output_line[0] + 1) = result[4]; - if(col_end3 == 3) + if (col_end3 == 3) *(output_line[0] + 2) = result[8]; - if(kernel_end3 >= 2) + if (kernel_end3 >= 2) { *(output_line[1] + 0) = result[5]; - if(col_end3 >= 2) + if (col_end3 >= 2) *(output_line[1] + 1) = result[1]; - if(col_end3 == 3) + if (col_end3 == 3) *(output_line[1] + 2) = result[13]; } - if(kernel_end3 == 3) + if (kernel_end3 == 3) { *(output_line[2] + 0) = result[10]; - if(col_end3 >= 2) + if (col_end3 >= 2) *(output_line[2] + 1) = result[14]; - if(col_end3 == 3) + if (col_end3 == 3) *(output_line[2] + 2) = result[2]; } } @@ -279,10 +278,10 @@ static void i8gemm4x8(int8_t* col, int8_t* kernel, bool bias_term, int* biases, int kernel_size_aligned2 = (kernel_size + 1) & -2; #pragma omp parallel for num_threads(num_thread) - for(int kernel_num = (kernel_start & -8); kernel_num < (kernel_end & -8); kernel_num += 8) + for (int kernel_num = (kernel_start & -8); kernel_num < (kernel_end & -8); kernel_num += 8) { int* cur_biases = NULL; - if(bias_term) + if (bias_term) { cur_biases = biases + kernel_num; } @@ -296,23 +295,23 @@ static void i8gemm4x8(int8_t* col, int8_t* kernel, bool bias_term, int* biases, int8_t* cur_kernel = kernel + kernel_num * kernel_size_aligned2; int8_t* output_result = output + kernel_num * output_xy; - for(int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) + for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) { int8_t* cur_col = col + col_line * kernel_size_aligned2; i8gemm_4x8_a17_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, output_result + col_line, pmulti, - output_xy, pq_shift, activation_min, activation_max); + output_xy, pq_shift, activation_min, activation_max); } - if(col_end3) + if (col_end3) { int col_line = col_end & -4; int8_t* cur_col = col + col_line * kernel_size_aligned2; i8gemm_4x8_a17_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max); - for(int i = 0; i < 2; i++) + for (int i = 0; i < 2; i++) { - for(int j = 0; j < 4; j++) + for (int j = 0; j < 4; j++) { output_line[j] = output + (kernel_num + i * 4 + j) * output_xy + col_line; } @@ -322,14 +321,14 @@ static void i8gemm4x8(int8_t* col, int8_t* kernel, bool bias_term, int* biases, *(output_line[2] + 0) = result[i * 16 + 10]; *(output_line[3] + 0) = result[i * 16 + 15]; - if(col_end3 >= 2) + if (col_end3 >= 2) { *(output_line[0] + 1) = result[i * 16 + 4]; *(output_line[1] + 1) = result[i * 16 + 1]; *(output_line[2] + 1) = result[i * 16 + 14]; *(output_line[3] + 1) = result[i * 16 + 11]; } - if(col_end3 == 3) + if (col_end3 == 3) { *(output_line[0] + 2) = result[i * 16 + 8]; *(output_line[1] + 2) = result[i * 16 + 13]; @@ -352,10 +351,10 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, int kernel_size_aligned2 = (kernel_size + 1) & -2; #pragma omp parallel for num_threads(num_thread) - for(int kernel_num = (kernel_start & -4); kernel_num < (kernel_end & -4); kernel_num += 4) + for (int kernel_num = (kernel_start & -4); kernel_num < (kernel_end & -4); kernel_num += 4) { int* cur_biases = NULL; - if(bias_term) + if (bias_term) { cur_biases = biases + kernel_num; } @@ -364,27 +363,27 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, int8_t* output_line[4]; int* pmulti = multi + kernel_num; - int* pq_shift = q_shift + kernel_num; + int* pq_shift = q_shift + kernel_num; int8_t* cur_kernel = kernel + kernel_num * kernel_size_aligned2; int8_t* output_result = output + kernel_num * output_xy; - for(int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) + for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) { int8_t* cur_col = col + col_line * kernel_size_aligned2; i8gemm_4x4_a17_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, output_result + col_line, pmulti, - output_xy, pq_shift, activation_min, activation_max); + output_xy, pq_shift, activation_min, activation_max); } - if(col_end3) + if (col_end3) { int col_line = col_end & -4; int8_t* cur_col = col + col_line * kernel_size_aligned2; i8gemm_4x4_a17_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max); - for(int j = 0; j < 4; j++) + for (int j = 0; j < 4; j++) { output_line[j] = output + (kernel_num + j) * output_xy + col_line; } @@ -394,14 +393,14 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, *(output_line[2] + 0) = result[10]; *(output_line[3] + 0) = result[15]; - if(col_end3 >= 2) + if (col_end3 >= 2) { *(output_line[0] + 1) = result[4]; *(output_line[1] + 1) = result[1]; *(output_line[2] + 1) = result[14]; *(output_line[3] + 1) = result[11]; } - if(col_end3 == 3) + if (col_end3 == 3) { *(output_line[0] + 2) = result[8]; *(output_line[1] + 2) = result[13]; @@ -410,11 +409,11 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, } } } - if(kernel_end3) + if (kernel_end3) { int kernel_num = kernel_end & -4; int* cur_biases = NULL; - if(bias_term) + if (bias_term) { cur_biases = biases + kernel_num; } @@ -426,13 +425,13 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, int* pq_shift = q_shift + kernel_num; int8_t* cur_kernel = kernel + kernel_num * kernel_size_aligned2; - for(int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) + for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) { int8_t* cur_col = col + col_line * kernel_size_aligned2; i8gemm_4x4_a17_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max); - for(int j = 0; j < 4; j++) + for (int j = 0; j < 4; j++) { output_line[j] = output + (kernel_num + j) * output_xy + col_line; } @@ -442,14 +441,14 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, *(output_line[0] + 2) = result[8]; *(output_line[0] + 3) = result[12]; - if(kernel_end3 >= 2) + if (kernel_end3 >= 2) { *(output_line[1] + 0) = result[5]; *(output_line[1] + 1) = result[1]; *(output_line[1] + 2) = result[13]; *(output_line[1] + 3) = result[9]; } - if(kernel_end3 == 3) + if (kernel_end3 == 3) { *(output_line[2] + 0) = result[10]; *(output_line[2] + 1) = result[14]; @@ -457,37 +456,37 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases, *(output_line[2] + 3) = result[6]; } } - if(col_end3) + if (col_end3) { int col_line = col_end & -4; int8_t* cur_col = col + col_line * kernel_size_aligned2; i8gemm_4x4_a17_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max); - for(int j = 0; j < 4; j++) + for (int j = 0; j < 4; j++) { output_line[j] = output + (kernel_num + j) * output_xy + col_line; } *(output_line[0] + 0) = result[0]; - if(col_end3 >= 2) + if (col_end3 >= 2) *(output_line[0] + 1) = result[4]; - if(col_end3 == 3) + if (col_end3 == 3) *(output_line[0] + 2) = result[8]; - if(kernel_end3 >= 2) + if (kernel_end3 >= 2) { *(output_line[1] + 0) = result[5]; - if(col_end3 >= 2) + if (col_end3 >= 2) *(output_line[1] + 1) = result[1]; - if(col_end3 == 3) + if (col_end3 == 3) *(output_line[1] + 2) = result[13]; } - if(kernel_end3 == 3) + if (kernel_end3 == 3) { *(output_line[2] + 0) = result[10]; - if(col_end3 >= 2) + if (col_end3 >= 2) *(output_line[2] + 1) = result[14]; - if(col_end3 == 3) + if (col_end3 == 3) *(output_line[2] + 2) = result[2]; } } @@ -504,7 +503,7 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param) int out_chan = filter->dims[0] / group; int out_chan_align4 = (out_chan + 3) / 4 * 4; int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3]; - int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128; // caution + int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128; // caution return mem_size; } @@ -527,18 +526,18 @@ int int8_conv_hcl_set_shared_pack4_mem(struct conv_priv_info* priv_info, void* m } int int8_conv_hcl_get_shared_mem_size(struct tensor* input, struct tensor* output, struct conv_param* param) { - int in_h = input->dims[2]; - int in_w = input->dims[3]; + int in_h = input->dims[2]; + int in_w = input->dims[3]; int out_h = output->dims[2]; int out_w = output->dims[3]; int group = param->group; - int input_chan = param->input_channel / group; + int input_chan = param->input_channel / group; int kernel_size = input_chan * param->kernel_h * param->kernel_w; - int out_cstep = out_h * out_w; // channel cstep, output_h * output_w - int elem_size = input->elem_size; // uint8/int8 is 1 byte, fp32 is 4 bytes + int out_cstep = out_h * out_w; // channel cstep, output_h * output_w + int elem_size = input->elem_size; // uint8/int8 is 1 byte, fp32 is 4 bytes out_cstep = (out_cstep + 3) / 4 * 4; - + int kernel_size_aligned2 = (kernel_size + 1) & -2; int mem_size = elem_size * kernel_size_aligned2 * out_cstep + 128; @@ -553,18 +552,18 @@ void interleave_kernel_int8(int8_t* kernel, int8_t* kernel_int8, int kernel_chan int i, j, k; // interleave 16 kernels - for(i = 0; i < (kernel_chan & -16); i += 16) + for (i = 0; i < (kernel_chan & -16); i += 16) { - for(j = 0; j < 16; j++) + for (j = 0; j < 16; j++) cur_kernel[j] = kernel + kernel_size * (i + j); - for(j = 0; j < (kernel_size & -2); j += 2) - for(k = 0; k < 16; k++) + for (j = 0; j < (kernel_size & -2); j += 2) + for (k = 0; k < 16; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1); } - if(kernel_size & 0x1) - for(k = 0; k < 16; k++) + if (kernel_size & 0x1) + for (k = 0; k < 16; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = 0; @@ -572,87 +571,87 @@ void interleave_kernel_int8(int8_t* kernel, int8_t* kernel_int8, int kernel_chan } // interleave 4 kernels - for(i = (kernel_chan & -16); i < (kernel_chan & -4); i += 4) + for (i = (kernel_chan & -16); i < (kernel_chan & -4); i += 4) { - for(j = 0; j < 4; j++) + for (j = 0; j < 4; j++) cur_kernel[j] = kernel + kernel_size * (i + j); - for(j = 0; j < (kernel_size & -2); j += 2) - for(k = 0; k < 4; k++) + for (j = 0; j < (kernel_size & -2); j += 2) + for (k = 0; k < 4; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1); } - if(kernel_size & 0x1) - for(k = 0; k < 4; k++) + if (kernel_size & 0x1) + for (k = 0; k < 4; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = 0; } } // last 4 kernels - if((kernel_chan & 0x3) != 0) + if ((kernel_chan & 0x3) != 0) { - for(j = 0; j < 3; j++) + for (j = 0; j < 3; j++) cur_kernel[j] = kernel + kernel_size * (i + j); - if((kernel_chan & 0x3) == 3) + if ((kernel_chan & 0x3) == 3) { - for(j = 0; j < (kernel_size & -2); j += 2) + for (j = 0; j < (kernel_size & -2); j += 2) { - for(k = 0; k < 3; k++) + for (k = 0; k < 3; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1); } - for(k = 0; k < 2; k++) + for (k = 0; k < 2; k++) *(cur_kernel_int8++) = 0; } - if(kernel_size & 0x1) + if (kernel_size & 0x1) { - for(k = 0; k < 3; k++) + for (k = 0; k < 3; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = 0; } - for(k = 0; k < 2; k++) + for (k = 0; k < 2; k++) *(cur_kernel_int8++) = 0; } } - else if((kernel_chan & 0x3) == 2) + else if ((kernel_chan & 0x3) == 2) { - for(j = 0; j < (kernel_size & -2); j += 2) + for (j = 0; j < (kernel_size & -2); j += 2) { - for(k = 0; k < 2; k++) + for (k = 0; k < 2; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1); } - for(k = 0; k < 4; k++) + for (k = 0; k < 4; k++) *(cur_kernel_int8++) = 0; } - if(kernel_size & 0x1) + if (kernel_size & 0x1) { - for(k = 0; k < 2; k++) + for (k = 0; k < 2; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = 0; } - for(k = 0; k < 4; k++) + for (k = 0; k < 4; k++) *(cur_kernel_int8++) = 0; } } - else if((kernel_chan & 0x3) == 1) + else if ((kernel_chan & 0x3) == 1) { - for(j = 0; j < (kernel_size & -2); j += 2) + for (j = 0; j < (kernel_size & -2); j += 2) { *(cur_kernel_int8++) = *(cur_kernel[0] + j); *(cur_kernel_int8++) = *(cur_kernel[0] + j + 1); - for(k = 0; k < 6; k++) + for (k = 0; k < 6; k++) *(cur_kernel_int8++) = 0; } - if(kernel_size & 0x1) + if (kernel_size & 0x1) { *(cur_kernel_int8++) = *(cur_kernel[0] + j); - for(k = 0; k < 7; k++) + for (k = 0; k < 7; k++) *(cur_kernel_int8++) = 0; } } @@ -665,18 +664,18 @@ void interleave_kernel_int8(int8_t* kernel, int8_t* kernel_int8, int kernel_chan int kernel_size1 = kernel_size & 0x1; // interleave 8 kernels - for(i = 0; i < (kernel_chan & -8); i += 8) + for (i = 0; i < (kernel_chan & -8); i += 8) { - for(j = 0; j < 8; j++) + for (j = 0; j < 8; j++) cur_kernel[j] = kernel + kernel_size * (i + j); - for(j = 0; j < (kernel_size & -2); j += 2) - for(k = 0; k < 8; k++) + for (j = 0; j < (kernel_size & -2); j += 2) + for (k = 0; k < 8; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1); } - if(kernel_size1) - for(k = 0; k < 8; k++) + if (kernel_size1) + for (k = 0; k < 8; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = 0; @@ -684,87 +683,87 @@ void interleave_kernel_int8(int8_t* kernel, int8_t* kernel_int8, int kernel_chan } // interleave 4 kernels - for(; i < (kernel_chan & -4); i += 4) + for (; i < (kernel_chan & -4); i += 4) { - for(j = 0; j < 4; j++) + for (j = 0; j < 4; j++) cur_kernel[j] = kernel + kernel_size * (i + j); - for(j = 0; j < (kernel_size & -2); j += 2) - for(k = 0; k < 4; k++) + for (j = 0; j < (kernel_size & -2); j += 2) + for (k = 0; k < 4; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1); } - if(kernel_size1) - for(k = 0; k < 4; k++) + if (kernel_size1) + for (k = 0; k < 4; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = 0; } } // last 4 kernels - if(kernel_chan3) + if (kernel_chan3) { - for(j = 0; j < 3; j++) + for (j = 0; j < 3; j++) cur_kernel[j] = kernel + kernel_size * (i + j); - if((kernel_chan3) == 3) + if ((kernel_chan3) == 3) { - for(j = 0; j < (kernel_size & -2); j += 2) + for (j = 0; j < (kernel_size & -2); j += 2) { - for(k = 0; k < 3; k++) + for (k = 0; k < 3; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1); } - for(k = 0; k < 2; k++) + for (k = 0; k < 2; k++) *(cur_kernel_int8++) = 0; } - if(kernel_size1) + if (kernel_size1) { - for(k = 0; k < 3; k++) + for (k = 0; k < 3; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = 0; } - for(k = 0; k < 2; k++) + for (k = 0; k < 2; k++) *(cur_kernel_int8++) = 0; } } - else if((kernel_chan3) == 2) + else if ((kernel_chan3) == 2) { - for(j = 0; j < (kernel_size & -2); j += 2) + for (j = 0; j < (kernel_size & -2); j += 2) { - for(k = 0; k < 2; k++) + for (k = 0; k < 2; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1); } - for(k = 0; k < 4; k++) + for (k = 0; k < 4; k++) *(cur_kernel_int8++) = 0; } - if(kernel_size1) + if (kernel_size1) { - for(k = 0; k < 2; k++) + for (k = 0; k < 2; k++) { *(cur_kernel_int8++) = *(cur_kernel[k] + j); *(cur_kernel_int8++) = 0; } - for(k = 0; k < 4; k++) + for (k = 0; k < 4; k++) *(cur_kernel_int8++) = 0; } } else - { // kernel_chan & 0x3 == 1 - for(j = 0; j < (kernel_size & -2); j += 2) + { // kernel_chan & 0x3 == 1 + for (j = 0; j < (kernel_size & -2); j += 2) { *(cur_kernel_int8++) = *(cur_kernel[0] + j); *(cur_kernel_int8++) = *(cur_kernel[0] + j + 1); - for(k = 0; k < 6; k++) + for (k = 0; k < 6; k++) *(cur_kernel_int8++) = 0; } - if(kernel_size1) + if (kernel_size1) { *(cur_kernel_int8++) = *(cur_kernel[0] + j); - for(k = 0; k < 7; k++) + for (k = 0; k < 7; k++) *(cur_kernel_int8++) = 0; } } @@ -776,9 +775,9 @@ void interleave_kernel_int8(int8_t* kernel, int8_t* kernel_int8, int kernel_chan /* kernel interleave */ static void interleave_int8(struct tensor* filter, struct conv_priv_info* priv_info, struct conv_param* param) { - int group = param->group; + int group = param->group; int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3]; - int out_chan = filter->dims[0] / group; + int out_chan = filter->dims[0] / group; int out_chan_align4 = (out_chan + 3) / 4 * 4; int kernel_size_algin = kernel_size * out_chan_align4; @@ -788,15 +787,14 @@ static void interleave_int8(struct tensor* filter, struct conv_priv_info* priv_i int8_t* interleave_buf = priv_info->interleave_buffer; for (int g = 0; g < group; g++) { - int8_t* cur_kernel = kernel + g * kernel_size_group; + int8_t* cur_kernel = kernel + g * kernel_size_group; int8_t* cur_interleave = interleave_buf + g * kernel_size_algin; interleave_kernel_int8(cur_kernel, cur_interleave, out_chan, kernel_size); } } - static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, int input_y, int kernel_x, int kernel_y, int stride_x, int stride_y, int dilation_x, - int dilation_y, int pad_x0, int pad_x1, int pad_y0, int pad_y1, int output_x, int output_y, int num_thread) + int dilation_y, int pad_x0, int pad_x1, int pad_y0, int pad_y1, int output_x, int output_y, int num_thread) { int col_start = 0; int col_end = output_x * output_y; @@ -813,21 +811,21 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in #ifdef __aarch64__ // is 1x1 - if(is_1x1) + if (is_1x1) { int8_t* cur_col = col + col_start * kernel_size_aligned2; int col_cnt = (col_end & -4) - (col_start & -4); - im2col_int8_1x1(( int8_t* )im + col_start, input_xy, cur_col, col_cnt, kernel_size); + im2col_int8_1x1((int8_t*)im + col_start, input_xy, cur_col, col_cnt, kernel_size); cur_col += col_cnt * kernel_size_aligned2; int col_i = col_end & -4; // final 4 input - if(col_end3) + if (col_end3) { - for(int kch = 0; kch < (kernel_size & -2); kch += 2) + for (int kch = 0; kch < (kernel_size & -2); kch += 2) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - if((col_i + i) < col_end) + if ((col_i + i) < col_end) { *cur_col++ = *(im + input_xy * (kch + 0) + col_i + i); *cur_col++ = *(im + input_xy * (kch + 1) + col_i + i); @@ -840,11 +838,11 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } int kch = kernel_size & -2; - if(kernel_size1) + if (kernel_size1) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - if((col_i + i) < col_end) + if ((col_i + i) < col_end) { *cur_col++ = *(im + input_xy * (kch + 0) + col_i + i); *cur_col++ = 0; @@ -859,10 +857,10 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } // 3x3 non dilation - else if(is_3x3) + else if (is_3x3) { #pragma omp parallel for num_threads(num_thread) - for(int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + for (int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) { int imx[4] = {0}; int imy[4] = {0}; @@ -872,17 +870,16 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in int imy_start[4] = {0}; int8_t* cur_col = col + col_i * kernel_size_aligned2; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { cnt_y[i] = (col_i + i) / output_x; cnt_x[i] = col_i + i - cnt_y[i] * output_x; imx_start[i] = cnt_x[i] * stride_x - pad_x0; imy_start[i] = cnt_y[i] * stride_y - pad_y0; } - if((cnt_y[0] == cnt_y[3]) && - (is_pad0 || (cnt_y[0] > 0 && cnt_x[0] > 0 && cnt_y[0] < (output_y - 1) && cnt_x[3] < (output_x - 1)))) + if ((cnt_y[0] == cnt_y[3]) && (is_pad0 || (cnt_y[0] > 0 && cnt_x[0] > 0 && cnt_y[0] < (output_y - 1) && cnt_x[3] < (output_x - 1)))) { - int8_t* input_start = ( int8_t* )(im + imy_start[0] * input_x + imx_start[0]); + int8_t* input_start = (int8_t*)(im + imy_start[0] * input_x + imx_start[0]); im2col_int8_3x3(input_start, input_x, input_y, input_chan, cur_col, stride_x); cur_col += 4 * kernel_size_aligned2; } @@ -891,32 +888,32 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in bool odd_line = false; int kchp = 0; int kyp = 0; - for(int kch = 0; kch < input_chan; kch++) + for (int kch = 0; kch < input_chan; kch++) { - for(int ky = 0; ky < 3; ky++) + for (int ky = 0; ky < 3; ky++) { - if(odd_line) + if (odd_line) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp; imx[i] = imx_start[i] + 2; - if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; imy[i] = imy_start[i] + ky; - if(imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) + if (imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]); else *cur_col++ = 0; } - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + 1 + k; - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -927,14 +924,14 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in // even line 2n else { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) imy[i] = imy_start[i] + ky; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + k; - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -946,13 +943,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } } - if(kernel_size1) + if (kernel_size1) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp; imx[i] = imx_start[i] + 2; - if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -962,7 +959,7 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } int col_i = col_end & -4; - if(col_end3) + if (col_end3) { int imx[4] = {0}; int imy[4] = {0}; @@ -971,7 +968,7 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in int imx_start[4] = {0}; int imy_start[4] = {0}; int8_t* cur_col = col + col_i * kernel_size_aligned2; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { cnt_y[i] = (col_i + i) / output_x; cnt_x[i] = col_i + i - cnt_y[i] * output_x; @@ -981,33 +978,33 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in bool odd_line = false; int kchp = 0; int kyp = 0; - for(int kch = 0; kch < input_chan; kch++) + for (int kch = 0; kch < input_chan; kch++) { - for(int ky = 0; ky < 3; ky++) + for (int ky = 0; ky < 3; ky++) { // odd line 1 + 2n - if(odd_line) + if (odd_line) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp; imx[i] = imx_start[i] + 2; - if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; imy[i] = imy_start[i] + ky; - if((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]); else *cur_col++ = 0; } - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + (1 + k); - if((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1018,14 +1015,14 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in // even line 2n + 1 else { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) imy[i] = imy_start[i] + ky; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + k; - if(i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1037,13 +1034,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } } - if(kernel_size1) + if (kernel_size1) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp; imx[i] = imx_start[i] + 2; - if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1053,43 +1050,43 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } // general case for kernel size <=3 - else if((kernel_x) < 4 && (kernel_y < 4)) + else if ((kernel_x) < 4 && (kernel_y < 4)) { int kch[2], kx[2], ky[2], imx[4][2], imy[4][2]; int8_t* cur_col = col + col_start * kernel_size_aligned2; - for(int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + for (int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) { int cnt_x[4] = {0}; int cnt_y[4] = {0}; int imx_start[4] = {0}; int imy_start[4] = {0}; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { cnt_y[i] = (col_i + i) / output_x; cnt_x[i] = col_i + i - cnt_y[i] * output_x; imx_start[i] = cnt_x[i] * stride_x - pad_x0; imy_start[i] = cnt_y[i] * stride_y - pad_y0; } - for(int col_j = 0; col_j < (kernel_size & -2); col_j += 2) + for (int col_j = 0; col_j < (kernel_size & -2); col_j += 2) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { kch[k] = (col_j + k) / kernel_xy; ky[k] = (col_j + k - kch[k] * kernel_xy) / kernel_x; kx[k] = (col_j + k - kch[k] * kernel_xy) - ky[k] * kernel_x; ky[k] = ky[k] * dilation_y; kx[k] = kx[k] * dilation_x; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imx[i][k] = imx_start[i] + kx[k]; imy[i][k] = imy_start[i] + ky[k]; } } - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { - if(imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && imy[i][k] < input_y) + if (imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && imy[i][k] < input_y) *cur_col++ = *(im + input_xy * kch[k] + input_x * imy[i][k] + imx[i][k]); else *cur_col++ = 0; @@ -1097,18 +1094,18 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } int col_j = kernel_size & -2; - if(kernel_size1) + if (kernel_size1) { kch[0] = col_j / kernel_xy; ky[0] = (col_j - kch[0] * kernel_xy) / kernel_x; kx[0] = col_j - kch[0] * kernel_xy - ky[0] * kernel_x; ky[0] = ky[0] * dilation_y; kx[0] = kx[0] * dilation_x; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imx[i][0] = imx_start[i] + kx[0]; imy[i][0] = imy_start[i] + ky[0]; - if(imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y) + if (imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y) *cur_col++ = *(im + input_xy * kch[0] + input_x * imy[i][0] + imx[i][0]); else *cur_col++ = 0; @@ -1118,40 +1115,39 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } int col_i = col_end & -4; // final 4 input - if(col_end3) + if (col_end3) { int cnt_x[4] = {0}; int cnt_y[4] = {0}; int imx_start[4] = {0}; int imy_start[4] = {0}; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { cnt_y[i] = (col_i + i) / output_x; cnt_x[i] = col_i + i - cnt_y[i] * output_x; imx_start[i] = cnt_x[i] * stride_x - pad_x0; imy_start[i] = cnt_y[i] * stride_y - pad_y0; } - for(int col_j = 0; col_j < (kernel_size & -2); col_j += 2) + for (int col_j = 0; col_j < (kernel_size & -2); col_j += 2) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { kch[k] = (col_j + k) / kernel_xy; ky[k] = (col_j + k - kch[k] * kernel_xy) / kernel_x; kx[k] = (col_j + k - kch[k] * kernel_xy) - ky[k] * kernel_x; ky[k] = ky[k] * dilation_y; kx[k] = kx[k] * dilation_x; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imx[i][k] = imx_start[i] + kx[k]; imy[i][k] = imy_start[i] + ky[k]; } } - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { - if((col_i + i) < col_end && imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && - imy[i][k] < input_y) + if ((col_i + i) < col_end && imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && imy[i][k] < input_y) *cur_col++ = *(im + input_xy * kch[k] + input_x * imy[i][k] + imx[i][k]); else *cur_col++ = 0; @@ -1159,18 +1155,18 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } int col_j = kernel_size & -2; - if(kernel_size1) + if (kernel_size1) { kch[0] = col_j / kernel_xy; ky[0] = (col_j - kch[0] * kernel_xy) / kernel_x; kx[0] = col_j - kch[0] * kernel_xy - ky[0] * kernel_x; ky[0] = ky[0] * dilation_y; kx[0] = kx[0] * dilation_x; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imx[i][0] = imx_start[i] + kx[0]; imy[i][0] = imy_start[i] + ky[0]; - if((col_i + i) < col_end && imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y) + if ((col_i + i) < col_end && imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y) *cur_col++ = *(im + input_xy * kch[0] + input_x * imy[i][0] + imx[i][0]); else *cur_col++ = 0; @@ -1185,13 +1181,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in int kch, kx, ky, kchp, kyp, imx[4], imy[4] = {0}; int kernel_x1 = kernel_x & 0x1; int8_t* cur_col = col + col_start * kernel_size_aligned2; - for(int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + for (int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) { int cnt_x[4] = {0}; int cnt_y[4] = {0}; int imx_start[4] = {0}; int imy_start[4] = {0}; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { cnt_y[i] = (col_i + i) / output_x; cnt_x[i] = col_i + i - cnt_y[i] * output_x; @@ -1201,35 +1197,35 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in bool odd_line = false; kchp = 0; kyp = 0; - for(int kch = 0; kch < input_chan; kch++) + for (int kch = 0; kch < input_chan; kch++) { - for(ky = 0; ky < kernel_y; ky++) + for (ky = 0; ky < kernel_y; ky++) { // odd line 2 + 2n - if(odd_line) + if (odd_line) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp * dilation_y; imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x; - if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; imy[i] = imy_start[i] + ky * dilation_y; - if(imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) + if (imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]); else *cur_col++ = 0; } - for(kx = 1; kx < kernel_x; kx += 2) + for (kx = 1; kx < kernel_x; kx += 2) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + (kx + k) * dilation_x; - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1241,16 +1237,16 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in // even line 2n else { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) imy[i] = imy_start[i] + ky * dilation_y; - for(kx = 0; kx < (kernel_x - 1); kx += 2) + for (kx = 0; kx < (kernel_x - 1); kx += 2) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + (kx + k) * dilation_x; - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1263,13 +1259,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } } - if(kernel_size1) + if (kernel_size1) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp * dilation_y; imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x; - if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1279,13 +1275,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } int col_i = col_end & -4; // final 4 input - if(col_end3) + if (col_end3) { int cnt_x[4] = {0}; int cnt_y[4] = {0}; int imx_start[4] = {0}; int imy_start[4] = {0}; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { cnt_y[i] = (col_i + i) / output_x; cnt_x[i] = col_i + i - cnt_y[i] * output_x; @@ -1295,36 +1291,35 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in bool odd_line = false; kchp = 0; kyp = 0; - for(int kch = 0; kch < input_chan; kch++) + for (int kch = 0; kch < input_chan; kch++) { - for(ky = 0; ky < kernel_y; ky++) + for (ky = 0; ky < kernel_y; ky++) { // odd line 1 + 2n - if(odd_line) + if (odd_line) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp * dilation_y; imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x; - if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; imy[i] = imy_start[i] + ky * dilation_y; - if((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]); else *cur_col++ = 0; } - for(kx = 1; kx < kernel_x; kx += 2) + for (kx = 1; kx < kernel_x; kx += 2) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + (kx + k) * dilation_x; - if((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && - imy[i] < input_y) + if ((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1336,17 +1331,16 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in // even line 2n + 1 else { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) imy[i] = imy_start[i] + ky * dilation_y; - for(kx = 0; kx < (kernel_x - 1); kx += 2) + for (kx = 0; kx < (kernel_x - 1); kx += 2) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + (kx + k) * dilation_x; - if(i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && - imy[i] < input_y) + if (i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1359,13 +1353,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } } - if(kernel_size1) + if (kernel_size1) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp * dilation_y; imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x; - if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1375,12 +1369,12 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } #else - if(is_3x3) + if (is_3x3) { int stride_x2 = stride_x * 2; int stride_x3 = stride_x * 3; -// #pragma omp parallel for num_threads(num_thread) - for(int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + // #pragma omp parallel for num_threads(num_thread) + for (int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) { int imx[4] = {0}; int imy[4] = {0}; @@ -1389,23 +1383,22 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in int imx_start[4] = {0}; int imy_start[4] = {0}; int8_t* cur_col = col + col_i * kernel_size_aligned2; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { cnt_y[i] = (col_i + i) / output_x; cnt_x[i] = col_i + i - cnt_y[i] * output_x; imx_start[i] = cnt_x[i] * stride_x - pad_x0; imy_start[i] = cnt_y[i] * stride_y - pad_y0; } - if((cnt_y[0] == cnt_y[3]) && - (is_pad0 || (cnt_y[0] > 0 && cnt_x[0] > 0 && cnt_y[0] < (output_y - 1) && cnt_x[3] < (output_x - 1)))) + if ((cnt_y[0] == cnt_y[3]) && (is_pad0 || (cnt_y[0] > 0 && cnt_x[0] > 0 && cnt_y[0] < (output_y - 1) && cnt_x[3] < (output_x - 1)))) { - int8_t* l00 = ( int8_t* )(im + imy_start[0] * input_x + imx_start[0]); + int8_t* l00 = (int8_t*)(im + imy_start[0] * input_x + imx_start[0]); int8_t* l01 = l00 + input_x; int8_t* l02 = l00 + input_x * 2; int8_t* l10 = l00 + input_xy; int8_t* l11 = l10 + input_x; int8_t* l12 = l10 + input_x * 2; - for(int kch = 0; kch < (input_chan & -2); kch += 2) + for (int kch = 0; kch < (input_chan & -2); kch += 2) { cur_col[0] = l00[0]; cur_col[1] = l00[1]; @@ -1487,7 +1480,7 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in l11 += input_xy * 2; l12 += input_xy * 2; } - if(input_chan & 0x1) + if (input_chan & 0x1) { cur_col[0] = l00[0]; cur_col[1] = l00[1]; @@ -1536,32 +1529,32 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in bool odd_line = false; int kchp = 0; int kyp = 0; - for(int kch = 0; kch < input_chan; kch++) + for (int kch = 0; kch < input_chan; kch++) { - for(int ky = 0; ky < 3; ky++) + for (int ky = 0; ky < 3; ky++) { - if(odd_line) + if (odd_line) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp; imx[i] = imx_start[i] + 2; - if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; imy[i] = imy_start[i] + ky; - if(imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) + if (imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]); else *cur_col++ = 0; } - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + 1 + k; - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1572,14 +1565,14 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in // even line 2n else { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) imy[i] = imy_start[i] + ky; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + k; - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1591,13 +1584,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } } - if(kernel_size1) + if (kernel_size1) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp; imx[i] = imx_start[i] + 2; - if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1607,7 +1600,7 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } int col_i = col_end & -4; - if(col_end3) + if (col_end3) { int imx[4] = {0}; int imy[4] = {0}; @@ -1616,7 +1609,7 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in int imx_start[4] = {0}; int imy_start[4] = {0}; int8_t* cur_col = col + col_i * kernel_size_aligned2; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { cnt_y[i] = (col_i + i) / output_x; cnt_x[i] = col_i + i - cnt_y[i] * output_x; @@ -1626,33 +1619,33 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in bool odd_line = false; int kchp = 0; int kyp = 0; - for(int kch = 0; kch < input_chan; kch++) + for (int kch = 0; kch < input_chan; kch++) { - for(int ky = 0; ky < 3; ky++) + for (int ky = 0; ky < 3; ky++) { // odd line 1 + 2n - if(odd_line) + if (odd_line) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp; imx[i] = imx_start[i] + 2; - if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; imy[i] = imy_start[i] + ky; - if((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]); else *cur_col++ = 0; } - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + (1 + k); - if((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1663,14 +1656,14 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in // even line 2n + 1 else { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) imy[i] = imy_start[i] + ky; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + k; - if(i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1682,13 +1675,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } } - if(kernel_size1) + if (kernel_size1) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp; imx[i] = imx_start[i] + 2; - if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1698,43 +1691,43 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } // general case for kernel size <=3 - else if((kernel_x) < 4 && (kernel_y < 4)) + else if ((kernel_x) < 4 && (kernel_y < 4)) { int kch[2], kx[2], ky[2], imx[4][2], imy[4][2]; - for(int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + for (int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) { int cnt_x[4] = {0}; int cnt_y[4] = {0}; int imx_start[4] = {0}; int imy_start[4] = {0}; int8_t* cur_col = col + col_i * kernel_size_aligned2; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { cnt_y[i] = (col_i + i) / output_x; cnt_x[i] = col_i + i - cnt_y[i] * output_x; imx_start[i] = cnt_x[i] * stride_x - pad_x0; imy_start[i] = cnt_y[i] * stride_y - pad_y0; } - for(int col_j = 0; col_j < (kernel_size & -2); col_j += 2) + for (int col_j = 0; col_j < (kernel_size & -2); col_j += 2) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { kch[k] = (col_j + k) / kernel_xy; ky[k] = (col_j + k - kch[k] * kernel_xy) / kernel_x; kx[k] = (col_j + k - kch[k] * kernel_xy) - ky[k] * kernel_x; ky[k] = ky[k] * dilation_y; kx[k] = kx[k] * dilation_x; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imx[i][k] = imx_start[i] + kx[k]; imy[i][k] = imy_start[i] + ky[k]; } } - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { - if(imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && imy[i][k] < input_y) + if (imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && imy[i][k] < input_y) *cur_col++ = *(im + input_xy * kch[k] + input_x * imy[i][k] + imx[i][k]); else *cur_col++ = 0; @@ -1742,18 +1735,18 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } int col_j = kernel_size & -2; - if(kernel_size1) + if (kernel_size1) { kch[0] = col_j / kernel_xy; ky[0] = (col_j - kch[0] * kernel_xy) / kernel_x; kx[0] = col_j - kch[0] * kernel_xy - ky[0] * kernel_x; ky[0] = ky[0] * dilation_y; kx[0] = kx[0] * dilation_x; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imx[i][0] = imx_start[i] + kx[0]; imy[i][0] = imy_start[i] + ky[0]; - if(imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y) + if (imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y) *cur_col++ = *(im + input_xy * kch[0] + input_x * imy[i][0] + imx[i][0]); else *cur_col++ = 0; @@ -1763,41 +1756,40 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } int col_i = col_end & -4; // final 4 input - if(col_end3) + if (col_end3) { int cnt_x[4] = {0}; int cnt_y[4] = {0}; int imx_start[4] = {0}; int imy_start[4] = {0}; int8_t* cur_col = col + col_i * kernel_size_aligned2; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { cnt_y[i] = (col_i + i) / output_x; cnt_x[i] = col_i + i - cnt_y[i] * output_x; imx_start[i] = cnt_x[i] * stride_x - pad_x0; imy_start[i] = cnt_y[i] * stride_y - pad_y0; } - for(int col_j = 0; col_j < (kernel_size & -2); col_j += 2) + for (int col_j = 0; col_j < (kernel_size & -2); col_j += 2) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { kch[k] = (col_j + k) / kernel_xy; ky[k] = (col_j + k - kch[k] * kernel_xy) / kernel_x; kx[k] = (col_j + k - kch[k] * kernel_xy) - ky[k] * kernel_x; ky[k] = ky[k] * dilation_y; kx[k] = kx[k] * dilation_x; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imx[i][k] = imx_start[i] + kx[k]; imy[i][k] = imy_start[i] + ky[k]; } } - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { - if((col_i + i) < col_end && imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && - imy[i][k] < input_y) + if ((col_i + i) < col_end && imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && imy[i][k] < input_y) *cur_col++ = *(im + input_xy * kch[k] + input_x * imy[i][k] + imx[i][k]); else *cur_col++ = 0; @@ -1805,19 +1797,18 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } int col_j = kernel_size & -2; - if(kernel_size1) + if (kernel_size1) { kch[0] = col_j / kernel_xy; ky[0] = (col_j - kch[0] * kernel_xy) / kernel_x; kx[0] = col_j - kch[0] * kernel_xy - ky[0] * kernel_x; ky[0] = ky[0] * dilation_y; kx[0] = kx[0] * dilation_x; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imx[i][0] = imx_start[i] + kx[0]; imy[i][0] = imy_start[i] + ky[0]; - if((col_i + i) < col_end && imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && - imy[i][0] < input_y) + if ((col_i + i) < col_end && imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y) *cur_col++ = *(im + input_xy * kch[0] + input_x * imy[i][0] + imx[i][0]); else *cur_col++ = 0; @@ -1832,13 +1823,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in int kch, kx, ky, kchp, kyp, imx[4], imy[4]; int kernel_x1 = kernel_x & 0x1; int8_t* cur_col = col + col_start * kernel_size_aligned2; - for(int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + for (int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) { int cnt_x[4] = {0}; int cnt_y[4] = {0}; int imx_start[4] = {0}; int imy_start[4] = {0}; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { cnt_y[i] = (col_i + i) / output_x; cnt_x[i] = col_i + i - cnt_y[i] * output_x; @@ -1848,35 +1839,35 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in bool odd_line = false; kchp = 0; kyp = 0; - for(int kch = 0; kch < input_chan; kch++) + for (int kch = 0; kch < input_chan; kch++) { - for(int ky = 0; ky < kernel_y; ky++) + for (int ky = 0; ky < kernel_y; ky++) { // odd line 2 + 2n - if(odd_line) + if (odd_line) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp * dilation_y; imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x; - if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; imy[i] = imy_start[i] + ky * dilation_y; - if(imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) + if (imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]); else *cur_col++ = 0; } - for(int kx = 1; kx < kernel_x; kx += 2) + for (int kx = 1; kx < kernel_x; kx += 2) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + (kx + k) * dilation_x; - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1888,16 +1879,16 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in // even line 2n else { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) imy[i] = imy_start[i] + ky * dilation_y; - for(int kx = 0; kx < (kernel_x - 1); kx += 2) + for (int kx = 0; kx < (kernel_x - 1); kx += 2) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + (kx + k) * dilation_x; - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1910,13 +1901,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } } - if(kernel_size1) + if (kernel_size1) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp * dilation_y; imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x; - if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1926,13 +1917,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } int col_i = col_end & -4; // final 4 input - if(col_end3) + if (col_end3) { int cnt_x[4] = {0}; int cnt_y[4] = {0}; int imx_start[4] = {0}; int imy_start[4] = {0}; - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { cnt_y[i] = (col_i + i) / output_x; cnt_x[i] = col_i + i - cnt_y[i] * output_x; @@ -1942,36 +1933,35 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in bool odd_line = false; kchp = 0; kyp = 0; - for(int kch = 0; kch < input_chan; kch++) + for (int kch = 0; kch < input_chan; kch++) { - for(int ky = 0; ky < kernel_y; ky++) + for (int ky = 0; ky < kernel_y; ky++) { // odd line 1 + 2n - if(odd_line) + if (odd_line) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp * dilation_y; imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x; - if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; imy[i] = imy_start[i] + ky * dilation_y; - if((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]); else *cur_col++ = 0; } - for(int kx = 1; kx < kernel_x; kx += 2) + for (int kx = 1; kx < kernel_x; kx += 2) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + (kx + k) * dilation_x; - if((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && - imy[i] < input_y) + if ((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -1983,19 +1973,18 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in // even line 2n + 1 else { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + ky * dilation_y; } - for(int kx = 0; kx < (kernel_x - 1); kx += 2) + for (int kx = 0; kx < (kernel_x - 1); kx += 2) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - for(int k = 0; k < 2; k++) + for (int k = 0; k < 2; k++) { imx[i] = imx_start[i] + (kx + k) * dilation_x; - if(i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && - imy[i] < input_y) + if (i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -2008,13 +1997,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in } } } - if(kernel_size1) + if (kernel_size1) { - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { imy[i] = imy_start[i] + kyp * dilation_y; imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x; - if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]); else *cur_col++ = 0; @@ -2027,9 +2016,8 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in return; } - int int8_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, - struct conv_priv_info* priv_info, struct conv_param* param) + struct conv_priv_info* priv_info, struct conv_param* param) { int in_c = input_tensor->dims[1]; int in_h = input_tensor->dims[2]; @@ -2043,7 +2031,7 @@ int int8_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens { int mem_size = int8_conv_hcl_get_shared_mem_size(input_tensor, output_tensor, param); void* mem = sys_malloc(mem_size); - priv_info->im2col_buffer = mem; + priv_info->im2col_buffer = mem; priv_info->im2col_buffer_size = mem_size; } /* alloc mem of kernel interleave */ @@ -2051,7 +2039,7 @@ int int8_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens { int mem_size = get_private_mem_size(filter_tensor, param); void* mem = sys_malloc(mem_size); - priv_info->interleave_buffer = mem; + priv_info->interleave_buffer = mem; priv_info->interleave_buffer_size = mem_size; } /* kernel interleave */ @@ -2067,19 +2055,19 @@ int int8_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens priv_info->activation_min = -127; priv_info->activation_max = 127; /* set activation */ - if(param->activation >= 0) + if (param->activation >= 0) { priv_info->activation_min = 0; - if(param->activation == 1) + if (param->activation == 1) priv_info->activation_max = round(1.0 / output_scale); - if(param->activation == 6) + if (param->activation == 6) priv_info->activation_max = round(6.0 / output_scale); - if(priv_info->activation_max > 127) + if (priv_info->activation_max > 127) priv_info->activation_max = 127; } - for(int i=0; igroup; @@ -2165,8 +2153,8 @@ int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, int activation_max = priv_info->activation_max; /* buffer addr */ - int8_t* input_buf = ( int8_t* )input_tensor->data; - int8_t* output_buf = ( int8_t* )output_tensor->data; + int8_t* input_buf = (int8_t*)input_tensor->data; + int8_t* output_buf = (int8_t*)output_tensor->data; int32_t* biases_buf = NULL; bool have_biases = false; if (bias_tensor != NULL) @@ -2175,11 +2163,11 @@ int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, have_biases = true; } - int8_t* col_buf = ( int8_t* )priv_info->im2col_buffer; - int8_t* interleave_buf = ( int8_t* )priv_info->interleave_buffer; + int8_t* col_buf = (int8_t*)priv_info->im2col_buffer; + int8_t* interleave_buf = (int8_t*)priv_info->interleave_buffer; /* block size split parameter */ - int L2_CACHE_SIZE = (cpu_affinity == TENGINE_CLUSTER_LITTLE)? 512 * 1024 : 1024 * 1024; + int L2_CACHE_SIZE = (cpu_affinity == TENGINE_CLUSTER_LITTLE) ? 512 * 1024 : 1024 * 1024; int kernel_size_l1 = kernel_size; #ifdef __aarch64__ int col_cnt_l2 = L2_CACHE_SIZE * 3 / kernel_size_l1 / 4; @@ -2188,7 +2176,7 @@ int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, #endif col_cnt_l2 = col_cnt_l2 > 4 ? (col_cnt_l2 & -4) : 4; - for (int n = 0; n < batch; n++) // batch size + for (int n = 0; n < batch; n++) // batch size { int8_t* input = input_buf + n * input_size * group; int8_t* output = output_buf + n * output_size * group; @@ -2197,7 +2185,7 @@ int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, int8_t* cur_input = input + g * input_size; im2col_int8(cur_input, col_buf, in_c, in_w, in_h, kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h, - pad_w0, pad_w1, pad_h0, pad_h1, out_w, out_h, num_thread); + pad_w0, pad_w1, pad_h0, pad_h1, out_w, out_h, num_thread); int kernel_size_aligned2 = (kernel_size + 1) & -2; int output_chan_aligned4 = (out_c + 3) & -4; @@ -2209,25 +2197,25 @@ int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, int* q_shift_g = priv_info->q_shift + g * out_c; // for input block of L2 cache size - for(int col_i = 0; col_i < out_hw; col_i += col_cnt_l2) + for (int col_i = 0; col_i < out_hw; col_i += col_cnt_l2) { int col_start = col_i; int col_end = col_i + col_cnt_l2; col_end = col_end > out_hw ? out_hw : col_end; #ifdef __aarch64__ i8gemm4x16(col_buf, kernel_g, have_biases, bias_g, output_g, multi_g, kernel_size, out_hw, - col_start, col_end, 0, out_c & -16, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity); - if(out_c & 0xf) + col_start, col_end, 0, out_c & -16, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity); + if (out_c & 0xf) i8gemm4x4(col_buf, kernel_g, have_biases, bias_g, output_g, multi_g, kernel_size, out_hw, - col_start, col_end, out_c & -16, out_c, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity); + col_start, col_end, out_c & -16, out_c, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity); #else i8gemm4x8(col_buf, kernel_g, have_biases, bias_g, output_g, multi_g, kernel_size, out_hw, - col_start, col_end, 0, out_c & -8, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity); - if(out_c & 0x7) + col_start, col_end, 0, out_c & -8, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity); + if (out_c & 0x7) i8gemm4x4(col_buf, kernel_g, have_biases, bias_g, output_g, multi_g, kernel_size, out_hw, - col_start, col_end, out_c & -8, out_c, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity); + col_start, col_end, out_c & -8, out_c, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity); #endif - } // col_cont + } // col_cont } } return 0; diff --git a/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.h b/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.h index cb19229be..f9603a273 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.h +++ b/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.h @@ -30,14 +30,13 @@ #include "graph/node.h" #include "graph/graph.h" - int int8_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, - struct conv_priv_info* priv_info, struct conv_param* param); + struct conv_priv_info* priv_info, struct conv_param* param); int int8_conv_hcl_postrun(struct conv_priv_info* priv_info); int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param, - int num_thread, int cpu_affinity); + struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param, + int num_thread, int cpu_affinity); #endif diff --git a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.c b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.c index 6c714c0aa..4d26ead44 100644 --- a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.c +++ b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.c @@ -38,16 +38,15 @@ #include - -#define TILE 4 +#define TILE 4 #define BLOCK_HW_UNIT 4 -#define ELEM_SIZE ((TILE + 2) * (TILE + 2)) +#define ELEM_SIZE ((TILE + 2) * (TILE + 2)) #define WINO_MAX(a, b) ((a) > (b) ? (a) : (b)) #define WINO_MIN(a, b) ((a) < (b) ? (a) : (b)) -#define PER_OUT_CHAN 16 -#define KER_COUT_UNIT 16 +#define PER_OUT_CHAN 16 +#define KER_COUT_UNIT 16 #define KER_COUT_UNIT4 4 void tran_inp_4(float*, float*, float*, int, int, int); void wino_sgemm_4x16_A72(float* output, const float* input, const float* kernel, long cin, short stride_save); @@ -56,13 +55,17 @@ void wino_sgemm_1x16(float* output, const float* input, const float* kernel, lon void wino_sgemm_1x4(float* output, const float* input, const float* kernel, long cin); void tran_out_4(float*, float*, int, float*, float*, int); -#define INTERLEAVE_KERNEL_UNIT(cout_idx_p,cout_unit,cin,ker_src,ker_dst,ELEM_SIZE,i,j,s){ \ - for(i = 0; i < cin; i++){ \ - for(j = 0; j < cout_unit; j++){ \ - *ker_dst = ker_src[((cout_idx_p + j) * cin + i) * ELEM_SIZE + s]; \ - ker_dst++; \ - } \ - }} +#define INTERLEAVE_KERNEL_UNIT(cout_idx_p, cout_unit, cin, ker_src, ker_dst, ELEM_SIZE, i, j, s) \ + { \ + for (i = 0; i < cin; i++) \ + { \ + for (j = 0; j < cout_unit; j++) \ + { \ + *ker_dst = ker_src[((cout_idx_p + j) * cin + i) * ELEM_SIZE + s]; \ + ker_dst++; \ + } \ + } \ + } static inline void trans_kernel_f43(float* ker, float* trans_ker) { @@ -83,10 +86,10 @@ static inline void trans_kernel_f43(float* ker, float* trans_ker) */ float tmp[18] = {0}; - float neg_r0_add_r2_x_1_6[6]; // (r0+r2)*1./6 - float r0_1_4_add_r2_x_1_6[6]; // (r0*1/4 + r2)*1./6 - float r1_1_6[6]; // r1*1/6 - float r1_1_12[6]; // r1*1/12 + float neg_r0_add_r2_x_1_6[6]; // (r0+r2)*1./6 + float r0_1_4_add_r2_x_1_6[6]; // (r0*1/4 + r2)*1./6 + float r1_1_6[6]; // r1*1/6 + float r1_1_12[6]; // r1*1/12 float s_1_6 = 1. / 6.f; for (int j = 0; j < 3; j++) { @@ -132,14 +135,14 @@ static inline void transform_kernel_f43_tile(struct tensor* filter, float* trans { int outc = filter->dims[0]; int inc = filter->dims[1]; - float* kernel = ( float* )filter->data; + float* kernel = (float*)filter->data; float* ker_ptr = trans_ker; for (int i = 0; i < outc; i++) { for (int j = 0; j < inc; j++) { - trans_kernel_f43(( float* )(kernel + 9 * (j + i * inc)), ker_ptr); + trans_kernel_f43((float*)(kernel + 9 * (j + i * inc)), ker_ptr); ker_ptr += ELEM_SIZE; } } @@ -149,22 +152,25 @@ static inline void transform_kernel_f43_tile(struct tensor* filter, float* trans // ker1 [ELEM_SIZE][cout//KER_COUT_UNIT][cin][KER_COUT_UNIT] static inline void interleave_kernel_1(float* ker0, float* ker1, int cout, int cin) { - int i,j; + int i, j; float* ker1_ptr = ker1; - for(int s = 0; s < ELEM_SIZE; s++) + for (int s = 0; s < ELEM_SIZE; s++) { int p; //cout 16 - for(p = 0; p < (cout& -KER_COUT_UNIT); p+=KER_COUT_UNIT){ - INTERLEAVE_KERNEL_UNIT(p,KER_COUT_UNIT,cin,ker0,ker1_ptr,ELEM_SIZE,i,j,s); + for (p = 0; p < (cout & -KER_COUT_UNIT); p += KER_COUT_UNIT) + { + INTERLEAVE_KERNEL_UNIT(p, KER_COUT_UNIT, cin, ker0, ker1_ptr, ELEM_SIZE, i, j, s); } //cout 4 - for(p = (cout & -KER_COUT_UNIT); p < (cout & -KER_COUT_UNIT4); p += KER_COUT_UNIT4){ - INTERLEAVE_KERNEL_UNIT(p,KER_COUT_UNIT4,cin,ker0,ker1_ptr,ELEM_SIZE,i,j,s); + for (p = (cout & -KER_COUT_UNIT); p < (cout & -KER_COUT_UNIT4); p += KER_COUT_UNIT4) + { + INTERLEAVE_KERNEL_UNIT(p, KER_COUT_UNIT4, cin, ker0, ker1_ptr, ELEM_SIZE, i, j, s); } // cout 1 - for(p=(cout & -KER_COUT_UNIT4); p < cout; p ++){ - INTERLEAVE_KERNEL_UNIT(p,1,cin,ker0,ker1_ptr,ELEM_SIZE,i,j,s); + for (p = (cout & -KER_COUT_UNIT4); p < cout; p++) + { + INTERLEAVE_KERNEL_UNIT(p, 1, cin, ker0, ker1_ptr, ELEM_SIZE, i, j, s); } } } @@ -175,7 +181,7 @@ static inline void pad_input1(const float* input, float* inp_padded, int inc, in int padded_hw = padded_h * padded_w; float* pad_ptr; - float* inp_ptr = ( float* )input; + float* inp_ptr = (float*)input; int resi_h = padded_h - pad0 - inh; int resi_w = padded_w - pad1 - inw; for (int c = 0; c < inc; c++) @@ -204,7 +210,7 @@ static inline void pad_input1(const float* input, float* inp_padded, int inc, in static inline void trans_inp_1tile(float* input, float* inp_ptr, int ih, int jw, int c, int in_hw, int inw) { - float* inp = ( float* )input + c * in_hw + ih * 4 * inw + jw * 4; + float* inp = (float*)input + c * in_hw + ih * 4 * inw + jw * 4; float* inp0 = inp; float* inp1 = inp0 + inw; float* inp2 = inp1 + inw; @@ -346,19 +352,19 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si float32x4_t line0_4 = vld1q_f32(mid + 16); float32x4_t line0_5 = vld1q_f32(mid + 20); - float32x4_t line1_0 = vsubq_f32(r0, r0_); // mid[(6 + i) * 4 + k] [1][0] - float32x4_t line1_1 = vsubq_f32(r1, r1_); // mid[(6 + i) * 4 + k] [1][1] - float32x4_t line1_2 = vsubq_f32(r2, r2_); // mid[(6 + i) * 4 + k] [1][2] - float32x4_t line1_3 = vsubq_f32(r3, r3_); // mid[(6 + i) * 4 + k] [1][3] - float32x4_t line1_4 = vsubq_f32(r4, r4_); // mid[(6 + i) * 4 + k] [1][4] - float32x4_t line1_5 = vsubq_f32(r5, r5_); // mid[(6 + i) * 4 + k] [1][5] + float32x4_t line1_0 = vsubq_f32(r0, r0_); // mid[(6 + i) * 4 + k] [1][0] + float32x4_t line1_1 = vsubq_f32(r1, r1_); // mid[(6 + i) * 4 + k] [1][1] + float32x4_t line1_2 = vsubq_f32(r2, r2_); // mid[(6 + i) * 4 + k] [1][2] + float32x4_t line1_3 = vsubq_f32(r3, r3_); // mid[(6 + i) * 4 + k] [1][3] + float32x4_t line1_4 = vsubq_f32(r4, r4_); // mid[(6 + i) * 4 + k] [1][4] + float32x4_t line1_5 = vsubq_f32(r5, r5_); // mid[(6 + i) * 4 + k] [1][5] - float32x4_t line2_0 = vaddq_f32(r0, r0_); // mid[(12 + i) * 4 + k] [2][0] - float32x4_t line2_1 = vaddq_f32(r1, r1_); // mid[(12 + i) * 4 + k] [2][1] - float32x4_t line2_2 = vaddq_f32(r2, r2_); // mid[(12 + i) * 4 + k] [2][2] - float32x4_t line2_3 = vaddq_f32(r3, r3_); // mid[(12 + i) * 4 + k] [2][3] - float32x4_t line2_4 = vaddq_f32(r4, r4_); // mid[(12 + i) * 4 + k] [2][4] - float32x4_t line2_5 = vaddq_f32(r5, r5_); // mid[(12 + i) * 4 + k] [2][5] + float32x4_t line2_0 = vaddq_f32(r0, r0_); // mid[(12 + i) * 4 + k] [2][0] + float32x4_t line2_1 = vaddq_f32(r1, r1_); // mid[(12 + i) * 4 + k] [2][1] + float32x4_t line2_2 = vaddq_f32(r2, r2_); // mid[(12 + i) * 4 + k] [2][2] + float32x4_t line2_3 = vaddq_f32(r3, r3_); // mid[(12 + i) * 4 + k] [2][3] + float32x4_t line2_4 = vaddq_f32(r4, r4_); // mid[(12 + i) * 4 + k] [2][4] + float32x4_t line2_5 = vaddq_f32(r5, r5_); // mid[(12 + i) * 4 + k] [2][5] r0 = vld1q_f32(r4_minus_r2); r1 = vld1q_f32(r4_minus_r2 + 4); @@ -381,19 +387,19 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si float32x4_t line5_4 = vld1q_f32(mid + 136); float32x4_t line5_5 = vld1q_f32(mid + 140); - float32x4_t line3_0 = vsubq_f32(r0, r0_); // mid[(18 + i) * 4 + k] [3][0] - float32x4_t line3_1 = vsubq_f32(r1, r1_); // mid[(18 + i) * 4 + k] [3][1] - float32x4_t line3_2 = vsubq_f32(r2, r2_); // mid[(18 + i) * 4 + k] [3][2] - float32x4_t line3_3 = vsubq_f32(r3, r3_); // mid[(18 + i) * 4 + k] [3][3] - float32x4_t line3_4 = vsubq_f32(r4, r4_); // mid[(18 + i) * 4 + k] [3][4] - float32x4_t line3_5 = vsubq_f32(r5, r5_); // mid[(18 + i) * 4 + k] [3][5] + float32x4_t line3_0 = vsubq_f32(r0, r0_); // mid[(18 + i) * 4 + k] [3][0] + float32x4_t line3_1 = vsubq_f32(r1, r1_); // mid[(18 + i) * 4 + k] [3][1] + float32x4_t line3_2 = vsubq_f32(r2, r2_); // mid[(18 + i) * 4 + k] [3][2] + float32x4_t line3_3 = vsubq_f32(r3, r3_); // mid[(18 + i) * 4 + k] [3][3] + float32x4_t line3_4 = vsubq_f32(r4, r4_); // mid[(18 + i) * 4 + k] [3][4] + float32x4_t line3_5 = vsubq_f32(r5, r5_); // mid[(18 + i) * 4 + k] [3][5] - float32x4_t line4_0 = vaddq_f32(r0, r0_); // mid[(24 + i) * 4 + k] [4][0] - float32x4_t line4_1 = vaddq_f32(r1, r1_); // mid[(24 + i) * 4 + k] [4][1] - float32x4_t line4_2 = vaddq_f32(r2, r2_); // mid[(24 + i) * 4 + k] [4][2] - float32x4_t line4_3 = vaddq_f32(r3, r3_); // mid[(24 + i) * 4 + k] [4][3] - float32x4_t line4_4 = vaddq_f32(r4, r4_); // mid[(24 + i) * 4 + k] [4][4] - float32x4_t line4_5 = vaddq_f32(r5, r5_); // mid[(24 + i) * 4 + k] [4][5] + float32x4_t line4_0 = vaddq_f32(r0, r0_); // mid[(24 + i) * 4 + k] [4][0] + float32x4_t line4_1 = vaddq_f32(r1, r1_); // mid[(24 + i) * 4 + k] [4][1] + float32x4_t line4_2 = vaddq_f32(r2, r2_); // mid[(24 + i) * 4 + k] [4][2] + float32x4_t line4_3 = vaddq_f32(r3, r3_); // mid[(24 + i) * 4 + k] [4][3] + float32x4_t line4_4 = vaddq_f32(r4, r4_); // mid[(24 + i) * 4 + k] [4][4] + float32x4_t line4_5 = vaddq_f32(r5, r5_); // mid[(24 + i) * 4 + k] [4][5] // r4_minus_r2[i * 4 + k] i=0 = mid[0][4] r0 = vsubq_f32(line0_4, line0_2); @@ -418,30 +424,30 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si r4_ = vmulq_f32(r4_, const2); r5_ = vmulq_f32(r5_, const2); - vst1q_f32(inp_ptr + s_size * 3, vsubq_f32(r0, r0_)); // inp_ptr[ s_size * (3 + i * 6)] - vst1q_f32(inp_ptr + s_size * 9, vsubq_f32(r1, r1_)); // inp_ptr[ s_size * (3 + i * 6)] - vst1q_f32(inp_ptr + s_size * 15, vsubq_f32(r2, r2_)); // inp_ptr[ s_size * (3 + i * 6)] - vst1q_f32(inp_ptr + s_size * 21, vsubq_f32(r3, r3_)); // inp_ptr[ s_size * (3 + i * 6)] - vst1q_f32(inp_ptr + s_size * 27, vsubq_f32(r4, r4_)); // inp_ptr[ s_size * (3 + i * 6)] - vst1q_f32(inp_ptr + s_size * 33, vsubq_f32(r5, r5_)); // inp_ptr[ s_size * (3 + i * 6)] + vst1q_f32(inp_ptr + s_size * 3, vsubq_f32(r0, r0_)); // inp_ptr[ s_size * (3 + i * 6)] + vst1q_f32(inp_ptr + s_size * 9, vsubq_f32(r1, r1_)); // inp_ptr[ s_size * (3 + i * 6)] + vst1q_f32(inp_ptr + s_size * 15, vsubq_f32(r2, r2_)); // inp_ptr[ s_size * (3 + i * 6)] + vst1q_f32(inp_ptr + s_size * 21, vsubq_f32(r3, r3_)); // inp_ptr[ s_size * (3 + i * 6)] + vst1q_f32(inp_ptr + s_size * 27, vsubq_f32(r4, r4_)); // inp_ptr[ s_size * (3 + i * 6)] + vst1q_f32(inp_ptr + s_size * 33, vsubq_f32(r5, r5_)); // inp_ptr[ s_size * (3 + i * 6)] - vst1q_f32(inp_ptr + s_size * 4, vaddq_f32(r0, r0_)); // inp_ptr[ s_size * (4 + i * 6)] - vst1q_f32(inp_ptr + s_size * 10, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (4 + i * 6)] - vst1q_f32(inp_ptr + s_size * 16, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (4 + i * 6)] - vst1q_f32(inp_ptr + s_size * 22, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (4 + i * 6)] - vst1q_f32(inp_ptr + s_size * 28, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (4 + i * 6)] - vst1q_f32(inp_ptr + s_size * 34, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (4 + i * 6)] + vst1q_f32(inp_ptr + s_size * 4, vaddq_f32(r0, r0_)); // inp_ptr[ s_size * (4 + i * 6)] + vst1q_f32(inp_ptr + s_size * 10, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (4 + i * 6)] + vst1q_f32(inp_ptr + s_size * 16, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (4 + i * 6)] + vst1q_f32(inp_ptr + s_size * 22, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (4 + i * 6)] + vst1q_f32(inp_ptr + s_size * 28, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (4 + i * 6)] + vst1q_f32(inp_ptr + s_size * 34, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (4 + i * 6)] float32x4_t const4 = vdupq_n_f32(4.f); float32x4_t const5 = vdupq_n_f32(-5.f); - r0_ = vmulq_f32(line0_1, const4); // line 1*4 ======== + r0_ = vmulq_f32(line0_1, const4); // line 1*4 ======== r1_ = vmulq_f32(line1_1, const4); r2_ = vmulq_f32(line2_1, const4); r3_ = vmulq_f32(line3_1, const4); r4_ = vmulq_f32(line4_1, const4); r5_ = vmulq_f32(line5_1, const4); - float32x4_t rr0_ = vsubq_f32(r0_, line0_3); // line1*4-line3 + float32x4_t rr0_ = vsubq_f32(r0_, line0_3); // line1*4-line3 float32x4_t rr1_ = vsubq_f32(r1_, line1_3); float32x4_t rr2_ = vsubq_f32(r2_, line2_3); float32x4_t rr3_ = vsubq_f32(r3_, line3_3); @@ -455,28 +461,28 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si r4 = vmulq_f32(line4_2, const4); r5 = vmulq_f32(line5_2, const4); - r0 = vsubq_f32(line0_4, r0); // line4 -4*line2 + r0 = vsubq_f32(line0_4, r0); // line4 -4*line2 r1 = vsubq_f32(line1_4, r1); r2 = vsubq_f32(line2_4, r2); r3 = vsubq_f32(line3_4, r3); r4 = vsubq_f32(line4_4, r4); r5 = vsubq_f32(line5_4, r5); - vst1q_f32(inp_ptr + s_size * 1, vsubq_f32(r0, rr0_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 7, vsubq_f32(r1, rr1_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 13, vsubq_f32(r2, rr2_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 19, vsubq_f32(r3, rr3_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 25, vsubq_f32(r4, rr4_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 31, vsubq_f32(r5, rr5_)); // inp_ptr[ s_size * (1 + i * 6)] - - vst1q_f32(inp_ptr + s_size * 2, vaddq_f32(r0, rr0_)); // inp_ptr[ s_size * (2 + i * 6)] - vst1q_f32(inp_ptr + s_size * 8, vaddq_f32(r1, rr1_)); // inp_ptr[ s_size * (2 + i * 6)] - vst1q_f32(inp_ptr + s_size * 14, vaddq_f32(r2, rr2_)); // inp_ptr[ s_size * (2 + i * 6)] - vst1q_f32(inp_ptr + s_size * 20, vaddq_f32(r3, rr3_)); // inp_ptr[ s_size * (2 + i * 6)] - vst1q_f32(inp_ptr + s_size * 26, vaddq_f32(r4, rr4_)); // inp_ptr[ s_size * (2 + i * 6)] - vst1q_f32(inp_ptr + s_size * 32, vaddq_f32(r5, rr5_)); // inp_ptr[ s_size * (2 + i * 6)] - - r0_ = vaddq_f32(line0_5, r0_); // 5 + 1*4 + vst1q_f32(inp_ptr + s_size * 1, vsubq_f32(r0, rr0_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 7, vsubq_f32(r1, rr1_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 13, vsubq_f32(r2, rr2_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 19, vsubq_f32(r3, rr3_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 25, vsubq_f32(r4, rr4_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 31, vsubq_f32(r5, rr5_)); // inp_ptr[ s_size * (1 + i * 6)] + + vst1q_f32(inp_ptr + s_size * 2, vaddq_f32(r0, rr0_)); // inp_ptr[ s_size * (2 + i * 6)] + vst1q_f32(inp_ptr + s_size * 8, vaddq_f32(r1, rr1_)); // inp_ptr[ s_size * (2 + i * 6)] + vst1q_f32(inp_ptr + s_size * 14, vaddq_f32(r2, rr2_)); // inp_ptr[ s_size * (2 + i * 6)] + vst1q_f32(inp_ptr + s_size * 20, vaddq_f32(r3, rr3_)); // inp_ptr[ s_size * (2 + i * 6)] + vst1q_f32(inp_ptr + s_size * 26, vaddq_f32(r4, rr4_)); // inp_ptr[ s_size * (2 + i * 6)] + vst1q_f32(inp_ptr + s_size * 32, vaddq_f32(r5, rr5_)); // inp_ptr[ s_size * (2 + i * 6)] + + r0_ = vaddq_f32(line0_5, r0_); // 5 + 1*4 r1_ = vaddq_f32(line1_5, r1_); r2_ = vaddq_f32(line2_5, r2_); r3_ = vaddq_f32(line3_5, r3_); @@ -489,12 +495,12 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si r3 = vmulq_f32(line3_3, const5); r4 = vmulq_f32(line4_3, const5); r5 = vmulq_f32(line5_3, const5); - vst1q_f32(inp_ptr + s_size * 5, vaddq_f32(r0, r0_)); // inp_ptr[ s_size * (5 + i * 6)] - vst1q_f32(inp_ptr + s_size * 11, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (5 + i * 6)] - vst1q_f32(inp_ptr + s_size * 17, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (5 + i * 6)] - vst1q_f32(inp_ptr + s_size * 23, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (5 + i * 6)] - vst1q_f32(inp_ptr + s_size * 29, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (5 + i * 6)] - vst1q_f32(inp_ptr + s_size * 35, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (5 + i * 6)] + vst1q_f32(inp_ptr + s_size * 5, vaddq_f32(r0, r0_)); // inp_ptr[ s_size * (5 + i * 6)] + vst1q_f32(inp_ptr + s_size * 11, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (5 + i * 6)] + vst1q_f32(inp_ptr + s_size * 17, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (5 + i * 6)] + vst1q_f32(inp_ptr + s_size * 23, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (5 + i * 6)] + vst1q_f32(inp_ptr + s_size * 29, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (5 + i * 6)] + vst1q_f32(inp_ptr + s_size * 35, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (5 + i * 6)] r0 = vmulq_f32(line0_0, const4); r1 = vmulq_f32(line1_0, const4); @@ -517,12 +523,12 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si r4 = vaddq_f32(r4, line4_4); r5 = vaddq_f32(r5, line5_4); - vst1q_f32(inp_ptr + s_size * 0, vaddq_f32(r0, r0_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 6, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 12, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 18, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 24, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 30, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 0, vaddq_f32(r0, r0_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 6, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 12, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 18, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 24, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 30, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (1 + i * 6)] // for(int i = 0; i < 6; i++) // { @@ -552,10 +558,9 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si // } } - // trans_input [block_hw/4][ELEM_SIZE][inc][4] static inline void tran_input_4block(const float* input, float* trans_inp, int inc, int block_h, - int block_w, int inh, int inw) + int block_w, int inh, int inw) { int in_hw = inh * inw; int block_hw = block_h * block_w; @@ -577,7 +582,7 @@ static inline void tran_input_4block(const float* input, float* trans_inp, int i if (idxh[0] == idxh[3]) { - float* temp_inp_ptr = ( float* )(input + idxh[0] * 4 * inw + idxw[0] * 4); + float* temp_inp_ptr = (float*)(input + idxh[0] * 4 * inw + idxw[0] * 4); for (int c = 0; c < inc; c++) { float ker00[4] = {1, 2, 4, 5}; @@ -592,13 +597,13 @@ static inline void tran_input_4block(const float* input, float* trans_inp, int i for (int c = 0; c < inc; c++) { - trans_inp_1tile(( float* )input, buffer, idxh[0], idxw[0], c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, idxh[0], idxw[0], c, in_hw, inw); buffer += ELEM_SIZE; - trans_inp_1tile(( float* )input, buffer, idxh[1], idxw[1], c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, idxh[1], idxw[1], c, in_hw, inw); buffer += ELEM_SIZE; - trans_inp_1tile(( float* )input, buffer, idxh[2], idxw[2], c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, idxh[2], idxw[2], c, in_hw, inw); buffer += ELEM_SIZE; - trans_inp_1tile(( float* )input, buffer, idxh[3], idxw[3], c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, idxh[3], idxw[3], c, in_hw, inw); buffer += ELEM_SIZE; } // interleave @@ -621,7 +626,7 @@ static inline void tran_input_4block(const float* input, float* trans_inp, int i // tran_inp [block_hw/4][36][inc][4] -> [36][block_hw/4][inc][4] static inline void tran_input_4block_1(const float* input, float* trans_inp, int inc, int block_h, int block_w, int inh, - int inw,int num_thread) + int inw, int num_thread) { int in_hw = inh * inw; int block_hw = block_h * block_w; @@ -631,8 +636,8 @@ static inline void tran_input_4block_1(const float* input, float* trans_inp, int int s_size = block_hw * inc * sizeof(float); -#pragma omp parallel for num_threads(num_thread) shared(block_hw,nn_block,in_hw) private(idxh,idxw) - for(int ib = 0; ib < nn_block; ib++) +#pragma omp parallel for num_threads(num_thread) shared(block_hw, nn_block, in_hw) private(idxh, idxw) + for (int ib = 0; ib < nn_block; ib++) { int off_set0 = ib * BLOCK_HW_UNIT * inc; @@ -645,10 +650,10 @@ static inline void tran_input_4block_1(const float* input, float* trans_inp, int idxw[2] = (ib * 4 + 2) % block_w; idxw[3] = (ib * 4 + 3) % block_w; - if(idxh[0] == idxh[3]) + if (idxh[0] == idxh[3]) { - float* temp_inp_ptr = ( float* )(input + idxh[0] * 4 * inw + idxw[0] * 4); - for(int c = 0; c < inc; c++) + float* temp_inp_ptr = (float*)(input + idxh[0] * 4 * inw + idxw[0] * 4); + for (int c = 0; c < inc; c++) { float ker00[4] = {1, 2, 4, 5}; tran_inp_4(temp_inp_ptr, trans_inp + c * 4 + off_set0, ker00, inw, s_size, in_hw); @@ -660,24 +665,24 @@ static inline void tran_input_4block_1(const float* input, float* trans_inp, int float buffer0[inc * ELEM_SIZE * BLOCK_HW_UNIT]; float* buffer = buffer0; - for(int c = 0; c < inc; c++) + for (int c = 0; c < inc; c++) { - trans_inp_1tile(( float* )input, buffer, idxh[0], idxw[0], c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, idxh[0], idxw[0], c, in_hw, inw); buffer += ELEM_SIZE; - trans_inp_1tile(( float* )input, buffer, idxh[1], idxw[1], c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, idxh[1], idxw[1], c, in_hw, inw); buffer += ELEM_SIZE; - trans_inp_1tile(( float* )input, buffer, idxh[2], idxw[2], c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, idxh[2], idxw[2], c, in_hw, inw); buffer += ELEM_SIZE; - trans_inp_1tile(( float* )input, buffer, idxh[3], idxw[3], c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, idxh[3], idxw[3], c, in_hw, inw); buffer += ELEM_SIZE; } // interleave - for(int s = 0; s < ELEM_SIZE; s++) + for (int s = 0; s < ELEM_SIZE; s++) { float* tmp_inp = trans_inp + s * block_hw * inc + off_set0; - for(int i = 0; i < inc; i++) + for (int i = 0; i < inc; i++) { - for(int j = 0; j < BLOCK_HW_UNIT; j++) + for (int j = 0; j < BLOCK_HW_UNIT; j++) { *tmp_inp = buffer0[i * ELEM_SIZE * BLOCK_HW_UNIT + j * ELEM_SIZE + s]; tmp_inp++; @@ -701,7 +706,7 @@ static inline void tran_input_resi_block(const float* input, float* trans_inp, i { int ih = ib / block_w; int jw = ib % block_w; - trans_inp_1tile(( float* )input, buffer, ih, jw, c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, ih, jw, c, in_hw, inw); buffer += ELEM_SIZE; } // interleave @@ -717,29 +722,28 @@ static inline void tran_input_resi_block(const float* input, float* trans_inp, i } } - // tran_inp [block_resi][36][inc] -> [36][block_resi][inc] static inline void tran_input_resi_block_1(const float* input, float* trans_inp, int inc, int nn_block, int resi_block, int block_hw, int block_w, int in_hw, int inw) { - for(int ib = resi_block; ib < block_hw; ib++) + for (int ib = resi_block; ib < block_hw; ib++) { int off_set0 = ib * inc; float buffer0[ELEM_SIZE * inc]; float* buffer = buffer0; - for(int c = 0; c < inc; c++) + for (int c = 0; c < inc; c++) { int ih = ib / block_w; int jw = ib % block_w; - trans_inp_1tile(( float* )input, buffer, ih, jw, c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, ih, jw, c, in_hw, inw); buffer += ELEM_SIZE; } // interleave - for(int s = 0; s < ELEM_SIZE; s++) + for (int s = 0; s < ELEM_SIZE; s++) { float* tmp_inp = trans_inp + s * block_hw * inc + off_set0; - for(int i = 0; i < inc; i++) + for (int i = 0; i < inc; i++) { *tmp_inp = buffer0[i * ELEM_SIZE + s]; tmp_inp++; @@ -749,7 +753,6 @@ static inline void tran_input_resi_block_1(const float* input, float* trans_inp, } } - static inline float do_activation(float value, int activation) { if (activation >= 0) @@ -961,8 +964,7 @@ static inline void transform_output_f43_1tile(const float* buffer_ptr, float* ou float* out_ptr = out + cout_idx * out_hw; int i_h = idx_blockhw / block_w; int j_w = idx_blockhw % block_w; - if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || - (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) + if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) { trans_output_f43(buffer_ptr, out_ptr + (i_h * TILE * outw + j_w * TILE), outw, bias_ptr, activation); } @@ -1016,17 +1018,16 @@ static inline void transform_output_f43_4tile(float* buffer_ptr, float* out, int float* out_ptr = out + cout_idx * out_hw; if (bias) { - bias_ptr = ( float* )bias + cout_idx; + bias_ptr = (float*)bias + cout_idx; } for (int ii = 0; ii < 4; ii++) { int i_h = idx_h[ii]; int j_w = idx_w[ii]; - if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || - (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) + if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) { trans_output_f43(buffer_ptr, out_ptr + (i_h * TILE * outw + j_w * TILE), outw, bias_ptr, activation); - } // direct use_out_ptr + } // direct use_out_ptr else { int ret_h = TILE - resi_h; @@ -1046,7 +1047,7 @@ static inline void transform_output_f43_4tile(float* buffer_ptr, float* out, int out_pointer[hh * outw + ww] = do_activation(tmp_buffer[hh * 4 + ww], activation); } } - } // end else, tmp_buff + } // end else, tmp_buff buffer_ptr += ELEM_SIZE; } } @@ -1055,17 +1056,17 @@ static inline void transform_output_f43_4tile(float* buffer_ptr, float* out, int // trans_input [block_hw/4][ELEM_SIZE][inc][4] // kernel [out_c/PER_OUT_CHAN][ELEM_SIZE][in_c][PER_OUT_CHAN] static void wino_sgemm_4x16_1(const float* ker, const float* inp, float* output, int cin, int cout_end, - int block_h, int block_w, int out_c, int num_thread, int s, int cpu_affinity) + int block_h, int block_w, int out_c, int num_thread, int s, int cpu_affinity) { int block_hw = block_h * block_w; - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int p = 0; p < (cout_end & -PER_OUT_CHAN); p += PER_OUT_CHAN) { - float * out_ptr = output + p * ELEM_SIZE * block_hw; - float * out_ptr1 ; + float* out_ptr = output + p * ELEM_SIZE * block_hw; + float* out_ptr1; int i; - + for (i = 0; i < (block_hw & -4); i += 4) { out_ptr1 = out_ptr + i * ELEM_SIZE * KER_COUT_UNIT; @@ -1073,11 +1074,11 @@ static void wino_sgemm_4x16_1(const float* ker, const float* inp, float* output, int offset = s * block_hw * cin + i * cin; int offset_ker = s * cin * out_c + p * cin; -//#ifdef __aarch64__ + //#ifdef __aarch64__ wino_sgemm_4x16_A72(out_ptr1 + s * BLOCK_HW_UNIT, inp + offset, ker + offset_ker, cin, 1); } - - for(; i < block_hw ;i++) + + for (; i < block_hw; i++) { out_ptr1 = out_ptr + i * ELEM_SIZE * KER_COUT_UNIT; @@ -1090,7 +1091,7 @@ static void wino_sgemm_4x16_1(const float* ker, const float* inp, float* output, } void wino_sgemm_4x4_1(const float* ker, const float* inp, float* output, int cin, int cout_start, - int cout_end, int block_h, int block_w, int out_c, int activation, int s, int num_thread, int cpu_affinity) + int cout_end, int block_h, int block_w, int out_c, int activation, int s, int num_thread, int cpu_affinity) { int block_start = 0; int block_hw = block_h * block_w; @@ -1102,15 +1103,15 @@ void wino_sgemm_4x4_1(const float* ker, const float* inp, float* output, int cin float* out_ptr = output + p * ELEM_SIZE * block_hw; int i = 0; - for(i = (block_start & -4); i < (block_end & -4); i += 4) + for (i = (block_start & -4); i < (block_end & -4); i += 4) { float* out_ptr1 = out_ptr + i * ELEM_SIZE * KER_COUT_UNIT4; int offset = s * block_hw * cin + i * cin; int offset_ker = s * cin * out_c + p * cin; -//#ifdef __aarch64__ + //#ifdef __aarch64__ wino_sgemm_4x4_A72(out_ptr1 + s * BLOCK_HW_UNIT, inp + offset, ker + offset_ker, cin, 1); } - for(; i < block_end; i++) + for (; i < block_end; i++) { float* out_ptr1 = out_ptr + i * ELEM_SIZE * KER_COUT_UNIT4; @@ -1128,14 +1129,14 @@ void wino_sgemm_4x4_1(const float* ker, const float* inp, float* output, int cin for (i = (block_start & -4); i < (block_end & -4); i += 4) { float* out_ptr1 = out_ptr + i * ELEM_SIZE + s * BLOCK_HW_UNIT; - float* inp_ = (float*)(inp + s * block_hw * cin + i*cin); + float* inp_ = (float*)(inp + s * block_hw * cin + i * cin); float sum0 = 0; float sum1 = 0; float sum2 = 0; float sum3 = 0; for (int k = 0; k < cin; k++) { - sum0 += inp_[k * 4 ] * ker_[k]; + sum0 += inp_[k * 4] * ker_[k]; sum1 += inp_[k * 4 + 1] * ker_[k]; sum2 += inp_[k * 4 + 2] * ker_[k]; sum3 += inp_[k * 4 + 3] * ker_[k]; @@ -1145,12 +1146,13 @@ void wino_sgemm_4x4_1(const float* ker, const float* inp, float* output, int cin out_ptr1[2] = sum2; out_ptr1[3] = sum3; } - for(; i < block_end; i++) - { + for (; i < block_end; i++) + { float* out_ptr1 = out_ptr + i * ELEM_SIZE + s; - float* inp_ = (float*)(inp + s * block_hw * cin + i*cin); + float* inp_ = (float*)(inp + s * block_hw * cin + i * cin); float sum0 = 0; - for(int k = 0; k < cin; k++){ + for (int k = 0; k < cin; k++) + { sum0 += inp_[k] * ker_[k]; } out_ptr1[0] = sum0; @@ -1163,13 +1165,14 @@ static inline void trans_output_p(float* trans_out_ptr, float* output, float* bias, int bias_term, int block_h, int block_w, int block_hw, int out_hw, int out_w, int resi_h, int resi_w, - int activation,int p,int KER_COUT_UNIT_) + int activation, int p, int KER_COUT_UNIT_) { int flag_outw = 1; - if(out_w < 16) + if (out_w < 16) flag_outw = 0; int i; - for(i=0; i< (block_hw & -BLOCK_HW_UNIT); i+=BLOCK_HW_UNIT){ + for (i = 0; i < (block_hw & -BLOCK_HW_UNIT); i += BLOCK_HW_UNIT) + { float* buffer_ptr = trans_out_ptr + i * KER_COUT_UNIT_ * ELEM_SIZE; int idx_h[4]; int idx_w[4]; @@ -1183,59 +1186,73 @@ static inline void trans_output_p(float* trans_out_ptr, idx_w[2] = (i + 2) % block_w; idx_w[3] = (i + 3) % block_w; int wino_out_4_tiles = 0; - if(flag_outw){ - if((idx_h[0] == idx_h[3]) && (idx_h[0] < (block_h - 1)) && (idx_w[3] < (block_w - 1))){ + if (flag_outw) + { + if ((idx_h[0] == idx_h[3]) && (idx_h[0] < (block_h - 1)) && (idx_w[3] < (block_w - 1))) + { wino_out_4_tiles = 1; } } - if(wino_out_4_tiles == 1){ + if (wino_out_4_tiles == 1) + { float* bias_ptr = NULL; - for(int pss = 0; pss < KER_COUT_UNIT_; pss++){ + for (int pss = 0; pss < KER_COUT_UNIT_; pss++) + { int cout_idx = p + pss; float* out_ptr = output + cout_idx * out_hw + idx_h[0] * TILE * out_w + idx_w[0] * TILE; - if(bias_term){ - bias_ptr = ( float* )(bias + cout_idx); + if (bias_term) + { + bias_ptr = (float*)(bias + cout_idx); } float ker00[4] = {2, 4, 8, 0}; tran_out_4(buffer_ptr + pss * ELEM_SIZE * BLOCK_HW_UNIT, out_ptr, out_w * sizeof(float), ker00, bias_ptr, activation); } } - else{ + else + { float tmp_buffer[TILE * TILE]; const float* bias_ptr = NULL; - for(int pss = 0; pss < KER_COUT_UNIT_; pss++){ + for (int pss = 0; pss < KER_COUT_UNIT_; pss++) + { int cout_idx = p + pss; float* out_ptr = output + cout_idx * out_hw; - if(bias_term){ + if (bias_term) + { bias_ptr = bias + cout_idx; } float buffer[BLOCK_HW_UNIT * ELEM_SIZE]; float* buffer_ptr0 = buffer; float* mid_ptr = buffer_ptr + pss * BLOCK_HW_UNIT * ELEM_SIZE; - for(int t = 0; t < BLOCK_HW_UNIT; t++){ - for(int ss = 0; ss < ELEM_SIZE; ss++){ + for (int t = 0; t < BLOCK_HW_UNIT; t++) + { + for (int ss = 0; ss < ELEM_SIZE; ss++) + { *buffer_ptr0 = mid_ptr[ss * BLOCK_HW_UNIT + t]; buffer_ptr0++; } } - for(int ii = 0; ii < BLOCK_HW_UNIT; ii++){ + for (int ii = 0; ii < BLOCK_HW_UNIT; ii++) + { int i_h = idx_h[ii]; int j_w = idx_w[ii]; - if((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || - (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))){ + if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) + { trans_output_f43(buffer + ii * ELEM_SIZE, out_ptr + (i_h * TILE * out_w + j_w * TILE), - out_w, ( const float* )bias_ptr, activation); + out_w, (const float*)bias_ptr, activation); } - else{ + else + { int ret_h = TILE - resi_h; - if(i_h < block_h - 1) ret_h = TILE; + if (i_h < block_h - 1) ret_h = TILE; int ret_w = TILE - resi_w; - if(j_w < block_w - 1) ret_w = TILE; - trans_output_f43_ordinary(buffer + ii * ELEM_SIZE, tmp_buffer, ( const float* )bias_ptr); + if (j_w < block_w - 1) ret_w = TILE; + trans_output_f43_ordinary(buffer + ii * ELEM_SIZE, tmp_buffer, (const float*)bias_ptr); float* out_pointer = out_ptr + (i_h * TILE * out_w + j_w * TILE); - for(int hh = 0; hh < ret_h; hh++){ - for(int ww = 0; ww < ret_w; ww++){ + for (int hh = 0; hh < ret_h; hh++) + { + for (int ww = 0; ww < ret_w; ww++) + { out_pointer[hh * out_w + ww] = do_activation(tmp_buffer[hh * 4 + ww], activation); } } @@ -1244,12 +1261,15 @@ static inline void trans_output_p(float* trans_out_ptr, } } } - for(; i < block_hw; i++){ + for (; i < block_hw; i++) + { float* buffer_ptr = trans_out_ptr + i * KER_COUT_UNIT_ * ELEM_SIZE; float resi_buffer[KER_COUT_UNIT_ * ELEM_SIZE]; float* buffer0 = resi_buffer; - for(int pp = 0; pp < KER_COUT_UNIT_; pp++){ - for(int ss = 0; ss < ELEM_SIZE; ss++){ + for (int pp = 0; pp < KER_COUT_UNIT_; pp++) + { + for (int ss = 0; ss < ELEM_SIZE; ss++) + { *buffer0 = buffer_ptr[ss * KER_COUT_UNIT_ + pp]; buffer0++; } @@ -1259,17 +1279,17 @@ static inline void trans_output_p(float* trans_out_ptr, } } - // transform output static inline void trans_output_1(float* trans_out, float* output, float* bias, int bias_term, int block_h, int block_w, - int cout_start, int cout_end, int out_hw, int out_w, int resi_h, int resi_w, - int activation,int num_thread) + int cout_start, int cout_end, int out_hw, int out_w, int resi_h, int resi_w, + int activation, int num_thread) { int block_hw = block_h * block_w; int p; //cout 16 #pragma omp parallel for num_threads(num_thread) shared(block_hw) - for(p = cout_start; p < (cout_end& -KER_COUT_UNIT); p+=KER_COUT_UNIT){ + for (p = cout_start; p < (cout_end & -KER_COUT_UNIT); p += KER_COUT_UNIT) + { trans_output_p(trans_out + p * block_hw * ELEM_SIZE, output, bias, bias_term, block_h, block_w, block_hw, @@ -1278,7 +1298,8 @@ static inline void trans_output_1(float* trans_out, float* output, float* bias, } //cout 4 #pragma omp parallel for num_threads(num_thread) shared(block_hw) - for(p = (cout_end & -KER_COUT_UNIT); p < (cout_end & -KER_COUT_UNIT4); p += KER_COUT_UNIT4){ + for (p = (cout_end & -KER_COUT_UNIT); p < (cout_end & -KER_COUT_UNIT4); p += KER_COUT_UNIT4) + { trans_output_p(trans_out + p * block_hw * ELEM_SIZE, output, bias, bias_term, block_h, block_w, block_hw, @@ -1287,7 +1308,8 @@ static inline void trans_output_1(float* trans_out, float* output, float* bias, } // cout 1 #pragma omp parallel for num_threads(num_thread) shared(block_hw) - for(p=(cout_end & -KER_COUT_UNIT4); p < cout_end; p ++){ + for (p = (cout_end & -KER_COUT_UNIT4); p < cout_end; p++) + { trans_output_p(trans_out + p * block_hw * ELEM_SIZE, output, bias, bias_term, block_h, block_w, block_hw, @@ -1301,17 +1323,17 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param) int output_c = filter->dims[0]; int input_c = filter->dims[1]; int trans_ker_size = output_c * input_c * ELEM_SIZE * sizeof(float); - return trans_ker_size + 128; // caution + return trans_ker_size + 128; // caution } int wino_conv_hcl_prerun_1(struct tensor* input_tensor, struct tensor* filter_tensor, - struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param) + struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param) { // fTLOG_ERR(stderr,"run into wino_1 prerun.\n"); int output_c = filter_tensor->dims[0]; int input_c = filter_tensor->dims[1]; int mem_size = get_private_mem_size(filter_tensor, param); - float* trans_mem = ( float* )sys_malloc(mem_size); + float* trans_mem = (float*)sys_malloc(mem_size); if (!priv_info->external_interleave_mem) { @@ -1321,7 +1343,7 @@ int wino_conv_hcl_prerun_1(struct tensor* input_tensor, struct tensor* filter_te } transform_kernel_f43_tile(filter_tensor, trans_mem); - interleave_kernel_1(trans_mem, ( float* )priv_info->interleave_buffer, output_c, input_c); + interleave_kernel_1(trans_mem, (float*)priv_info->interleave_buffer, output_c, input_c); sys_free(trans_mem); @@ -1329,8 +1351,8 @@ int wino_conv_hcl_prerun_1(struct tensor* input_tensor, struct tensor* filter_te } int wino_conv_hcl_run_1(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param, - int num_thread, int cpu_affinity) + struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param, + int num_thread, int cpu_affinity) { int kernel_h = param->kernel_h; int kernel_w = param->kernel_w; @@ -1368,19 +1390,19 @@ int wino_conv_hcl_run_1(struct tensor* input_tensor, struct tensor* filter_tenso int padded_in_hw = padded_in_h * padded_in_w; /* buffer addr */ - float* input_buf = ( float* )input_tensor->data; - float* output_buf = ( float* )output_tensor->data; + float* input_buf = (float*)input_tensor->data; + float* output_buf = (float*)output_tensor->data; float* biases_buf = NULL; int bias_term = 0; if (bias_tensor != NULL) { - biases_buf = ( float* )bias_tensor->data; + biases_buf = (float*)bias_tensor->data; bias_term = 1; } - float* col_buf = ( float* )priv_info->im2col_buffer; - float* interleave_buf = ( float* )priv_info->interleave_buffer; + float* col_buf = (float*)priv_info->im2col_buffer; + float* interleave_buf = (float*)priv_info->interleave_buffer; int inp_padded_size = sizeof(float) * (in_c * padded_in_hw + 2); @@ -1393,9 +1415,9 @@ int wino_conv_hcl_run_1(struct tensor* input_tensor, struct tensor* filter_tenso for (int n = 0; n < batch; n++) { - float* input_padded = ( float* )sys_malloc(inp_padded_size); - float* trans_inp = ( float* )sys_malloc(sizeof(float) * ELEM_SIZE * in_c * block_hw + 128); - float* trans_out = ( float* )sys_malloc(sizeof(float) * ELEM_SIZE * out_c * block_hw); + float* input_padded = (float*)sys_malloc(inp_padded_size); + float* trans_inp = (float*)sys_malloc(sizeof(float) * ELEM_SIZE * in_c * block_hw + 128); + float* trans_out = (float*)sys_malloc(sizeof(float) * ELEM_SIZE * out_c * block_hw); float* input = input_buf + n * input_size; float* output = output_buf + n * output_size; @@ -1409,24 +1431,24 @@ int wino_conv_hcl_run_1(struct tensor* input_tensor, struct tensor* filter_tenso if (resi_block != block_hw) { tran_input_resi_block_1(input_padded, trans_inp, in_c, nn_block, resi_block, block_hw, block_w, - padded_in_hw, padded_in_w); + padded_in_hw, padded_in_w); } sys_free(input_padded); /* gemm */ - for(int s = 0; s < ELEM_SIZE; s++) + for (int s = 0; s < ELEM_SIZE; s++) { wino_sgemm_4x16_1(interleave_buf, trans_inp, trans_out, in_c, nn_out_c, block_h, block_w, - out_c, num_thread, s, cpu_affinity); + out_c, num_thread, s, cpu_affinity); if (nn_out_c != out_c) { - wino_sgemm_4x4_1(interleave_buf, trans_inp, trans_out, in_c, nn_out_c, - out_c, block_h, block_w, out_c, act_type, s ,num_thread, cpu_affinity); + wino_sgemm_4x4_1(interleave_buf, trans_inp, trans_out, in_c, nn_out_c, + out_c, block_h, block_w, out_c, act_type, s, num_thread, cpu_affinity); } } sys_free(trans_inp); trans_output_1(trans_out, output, biases_buf, bias_term, block_h, block_w, 0, out_c, out_hw, out_w, resi_h, resi_w, - act_type,num_thread); + act_type, num_thread); sys_free(trans_out); } diff --git a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.h b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.h index 53a45a9ec..b4b3298d0 100644 --- a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.h +++ b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.h @@ -30,13 +30,11 @@ #include "graph/node.h" #include "graph/graph.h" - int wino_conv_hcl_prerun_1(struct tensor* input_tensor, struct tensor* filter_tensor, - struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param) - ; + struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param); int wino_conv_hcl_run_1(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, - int num_thread, int affinity); + struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, + int num_thread, int affinity); #endif diff --git a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.c b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.c index c6a3b1525..50c2025dd 100644 --- a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.c +++ b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.c @@ -34,8 +34,7 @@ #include - -#define TILE 4 +#define TILE 4 #define ELEM_SIZE ((TILE + 2) * (TILE + 2)) #define WINO_MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -93,10 +92,10 @@ static inline void trans_kernel_f43(float* ker, float* trans_ker) */ float tmp[18] = {0}; - float neg_r0_add_r2_x_1_6[6]; // (r0+r2)*1./6 - float r0_1_4_add_r2_x_1_6[6]; // (r0*1/4 + r2)*1./6 - float r1_1_6[6]; // r1*1/6 - float r1_1_12[6]; // r1*1/12 + float neg_r0_add_r2_x_1_6[6]; // (r0+r2)*1./6 + float r0_1_4_add_r2_x_1_6[6]; // (r0*1/4 + r2)*1./6 + float r1_1_6[6]; // r1*1/6 + float r1_1_12[6]; // r1*1/12 float s_1_6 = 1. / 6.f; for (int j = 0; j < 3; j++) { @@ -142,14 +141,14 @@ static inline void transform_kernel_f43_tile(struct tensor* filter, float* trans { int outc = filter->dims[0]; int inc = filter->dims[1]; - float* kernel = ( float* )filter->data; + float* kernel = (float*)filter->data; float* ker_ptr = trans_ker; for (int i = 0; i < outc; i++) { for (int j = 0; j < inc; j++) { - trans_kernel_f43(( float* )(kernel + 9 * (j + i * inc)), ker_ptr); + trans_kernel_f43((float*)(kernel + 9 * (j + i * inc)), ker_ptr); ker_ptr += ELEM_SIZE; } } @@ -212,7 +211,7 @@ static inline void pad_input1(const float* input, float* inp_padded, int inc, in int padded_hw = padded_h * padded_w; float* pad_ptr; - float* inp_ptr = ( float* )input; + float* inp_ptr = (float*)input; int resi_h = padded_h - pad0 - inh; int resi_w = padded_w - pad1 - inw; for (int c = 0; c < inc; c++) @@ -241,7 +240,7 @@ static inline void pad_input1(const float* input, float* inp_padded, int inc, in static inline void trans_inp_1tile(float* input, float* inp_ptr, int ih, int jw, int c, int in_hw, int inw) { - float* inp = ( float* )input + c * in_hw + ih * 4 * inw + jw * 4; + float* inp = (float*)input + c * in_hw + ih * 4 * inw + jw * 4; float* inp0 = inp; float* inp1 = inp0 + inw; float* inp2 = inp1 + inw; @@ -383,19 +382,19 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si float32x4_t line0_4 = vld1q_f32(mid + 16); float32x4_t line0_5 = vld1q_f32(mid + 20); - float32x4_t line1_0 = vsubq_f32(r0, r0_); // mid[(6 + i) * 4 + k] [1][0] - float32x4_t line1_1 = vsubq_f32(r1, r1_); // mid[(6 + i) * 4 + k] [1][1] - float32x4_t line1_2 = vsubq_f32(r2, r2_); // mid[(6 + i) * 4 + k] [1][2] - float32x4_t line1_3 = vsubq_f32(r3, r3_); // mid[(6 + i) * 4 + k] [1][3] - float32x4_t line1_4 = vsubq_f32(r4, r4_); // mid[(6 + i) * 4 + k] [1][4] - float32x4_t line1_5 = vsubq_f32(r5, r5_); // mid[(6 + i) * 4 + k] [1][5] + float32x4_t line1_0 = vsubq_f32(r0, r0_); // mid[(6 + i) * 4 + k] [1][0] + float32x4_t line1_1 = vsubq_f32(r1, r1_); // mid[(6 + i) * 4 + k] [1][1] + float32x4_t line1_2 = vsubq_f32(r2, r2_); // mid[(6 + i) * 4 + k] [1][2] + float32x4_t line1_3 = vsubq_f32(r3, r3_); // mid[(6 + i) * 4 + k] [1][3] + float32x4_t line1_4 = vsubq_f32(r4, r4_); // mid[(6 + i) * 4 + k] [1][4] + float32x4_t line1_5 = vsubq_f32(r5, r5_); // mid[(6 + i) * 4 + k] [1][5] - float32x4_t line2_0 = vaddq_f32(r0, r0_); // mid[(12 + i) * 4 + k] [2][0] - float32x4_t line2_1 = vaddq_f32(r1, r1_); // mid[(12 + i) * 4 + k] [2][1] - float32x4_t line2_2 = vaddq_f32(r2, r2_); // mid[(12 + i) * 4 + k] [2][2] - float32x4_t line2_3 = vaddq_f32(r3, r3_); // mid[(12 + i) * 4 + k] [2][3] - float32x4_t line2_4 = vaddq_f32(r4, r4_); // mid[(12 + i) * 4 + k] [2][4] - float32x4_t line2_5 = vaddq_f32(r5, r5_); // mid[(12 + i) * 4 + k] [2][5] + float32x4_t line2_0 = vaddq_f32(r0, r0_); // mid[(12 + i) * 4 + k] [2][0] + float32x4_t line2_1 = vaddq_f32(r1, r1_); // mid[(12 + i) * 4 + k] [2][1] + float32x4_t line2_2 = vaddq_f32(r2, r2_); // mid[(12 + i) * 4 + k] [2][2] + float32x4_t line2_3 = vaddq_f32(r3, r3_); // mid[(12 + i) * 4 + k] [2][3] + float32x4_t line2_4 = vaddq_f32(r4, r4_); // mid[(12 + i) * 4 + k] [2][4] + float32x4_t line2_5 = vaddq_f32(r5, r5_); // mid[(12 + i) * 4 + k] [2][5] r0 = vld1q_f32(r4_minus_r2); r1 = vld1q_f32(r4_minus_r2 + 4); @@ -418,19 +417,19 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si float32x4_t line5_4 = vld1q_f32(mid + 136); float32x4_t line5_5 = vld1q_f32(mid + 140); - float32x4_t line3_0 = vsubq_f32(r0, r0_); // mid[(18 + i) * 4 + k] [3][0] - float32x4_t line3_1 = vsubq_f32(r1, r1_); // mid[(18 + i) * 4 + k] [3][1] - float32x4_t line3_2 = vsubq_f32(r2, r2_); // mid[(18 + i) * 4 + k] [3][2] - float32x4_t line3_3 = vsubq_f32(r3, r3_); // mid[(18 + i) * 4 + k] [3][3] - float32x4_t line3_4 = vsubq_f32(r4, r4_); // mid[(18 + i) * 4 + k] [3][4] - float32x4_t line3_5 = vsubq_f32(r5, r5_); // mid[(18 + i) * 4 + k] [3][5] + float32x4_t line3_0 = vsubq_f32(r0, r0_); // mid[(18 + i) * 4 + k] [3][0] + float32x4_t line3_1 = vsubq_f32(r1, r1_); // mid[(18 + i) * 4 + k] [3][1] + float32x4_t line3_2 = vsubq_f32(r2, r2_); // mid[(18 + i) * 4 + k] [3][2] + float32x4_t line3_3 = vsubq_f32(r3, r3_); // mid[(18 + i) * 4 + k] [3][3] + float32x4_t line3_4 = vsubq_f32(r4, r4_); // mid[(18 + i) * 4 + k] [3][4] + float32x4_t line3_5 = vsubq_f32(r5, r5_); // mid[(18 + i) * 4 + k] [3][5] - float32x4_t line4_0 = vaddq_f32(r0, r0_); // mid[(24 + i) * 4 + k] [4][0] - float32x4_t line4_1 = vaddq_f32(r1, r1_); // mid[(24 + i) * 4 + k] [4][1] - float32x4_t line4_2 = vaddq_f32(r2, r2_); // mid[(24 + i) * 4 + k] [4][2] - float32x4_t line4_3 = vaddq_f32(r3, r3_); // mid[(24 + i) * 4 + k] [4][3] - float32x4_t line4_4 = vaddq_f32(r4, r4_); // mid[(24 + i) * 4 + k] [4][4] - float32x4_t line4_5 = vaddq_f32(r5, r5_); // mid[(24 + i) * 4 + k] [4][5] + float32x4_t line4_0 = vaddq_f32(r0, r0_); // mid[(24 + i) * 4 + k] [4][0] + float32x4_t line4_1 = vaddq_f32(r1, r1_); // mid[(24 + i) * 4 + k] [4][1] + float32x4_t line4_2 = vaddq_f32(r2, r2_); // mid[(24 + i) * 4 + k] [4][2] + float32x4_t line4_3 = vaddq_f32(r3, r3_); // mid[(24 + i) * 4 + k] [4][3] + float32x4_t line4_4 = vaddq_f32(r4, r4_); // mid[(24 + i) * 4 + k] [4][4] + float32x4_t line4_5 = vaddq_f32(r5, r5_); // mid[(24 + i) * 4 + k] [4][5] // r4_minus_r2[i * 4 + k] i=0 = mid[0][4] r0 = vsubq_f32(line0_4, line0_2); @@ -455,30 +454,30 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si r4_ = vmulq_f32(r4_, const2); r5_ = vmulq_f32(r5_, const2); - vst1q_f32(inp_ptr + s_size * 3, vsubq_f32(r0, r0_)); // inp_ptr[ s_size * (3 + i * 6)] - vst1q_f32(inp_ptr + s_size * 9, vsubq_f32(r1, r1_)); // inp_ptr[ s_size * (3 + i * 6)] - vst1q_f32(inp_ptr + s_size * 15, vsubq_f32(r2, r2_)); // inp_ptr[ s_size * (3 + i * 6)] - vst1q_f32(inp_ptr + s_size * 21, vsubq_f32(r3, r3_)); // inp_ptr[ s_size * (3 + i * 6)] - vst1q_f32(inp_ptr + s_size * 27, vsubq_f32(r4, r4_)); // inp_ptr[ s_size * (3 + i * 6)] - vst1q_f32(inp_ptr + s_size * 33, vsubq_f32(r5, r5_)); // inp_ptr[ s_size * (3 + i * 6)] + vst1q_f32(inp_ptr + s_size * 3, vsubq_f32(r0, r0_)); // inp_ptr[ s_size * (3 + i * 6)] + vst1q_f32(inp_ptr + s_size * 9, vsubq_f32(r1, r1_)); // inp_ptr[ s_size * (3 + i * 6)] + vst1q_f32(inp_ptr + s_size * 15, vsubq_f32(r2, r2_)); // inp_ptr[ s_size * (3 + i * 6)] + vst1q_f32(inp_ptr + s_size * 21, vsubq_f32(r3, r3_)); // inp_ptr[ s_size * (3 + i * 6)] + vst1q_f32(inp_ptr + s_size * 27, vsubq_f32(r4, r4_)); // inp_ptr[ s_size * (3 + i * 6)] + vst1q_f32(inp_ptr + s_size * 33, vsubq_f32(r5, r5_)); // inp_ptr[ s_size * (3 + i * 6)] - vst1q_f32(inp_ptr + s_size * 4, vaddq_f32(r0, r0_)); // inp_ptr[ s_size * (4 + i * 6)] - vst1q_f32(inp_ptr + s_size * 10, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (4 + i * 6)] - vst1q_f32(inp_ptr + s_size * 16, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (4 + i * 6)] - vst1q_f32(inp_ptr + s_size * 22, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (4 + i * 6)] - vst1q_f32(inp_ptr + s_size * 28, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (4 + i * 6)] - vst1q_f32(inp_ptr + s_size * 34, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (4 + i * 6)] + vst1q_f32(inp_ptr + s_size * 4, vaddq_f32(r0, r0_)); // inp_ptr[ s_size * (4 + i * 6)] + vst1q_f32(inp_ptr + s_size * 10, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (4 + i * 6)] + vst1q_f32(inp_ptr + s_size * 16, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (4 + i * 6)] + vst1q_f32(inp_ptr + s_size * 22, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (4 + i * 6)] + vst1q_f32(inp_ptr + s_size * 28, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (4 + i * 6)] + vst1q_f32(inp_ptr + s_size * 34, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (4 + i * 6)] float32x4_t const4 = vdupq_n_f32(4.f); float32x4_t const5 = vdupq_n_f32(-5.f); - r0_ = vmulq_f32(line0_1, const4); // line 1*4 ======== + r0_ = vmulq_f32(line0_1, const4); // line 1*4 ======== r1_ = vmulq_f32(line1_1, const4); r2_ = vmulq_f32(line2_1, const4); r3_ = vmulq_f32(line3_1, const4); r4_ = vmulq_f32(line4_1, const4); r5_ = vmulq_f32(line5_1, const4); - float32x4_t rr0_ = vsubq_f32(r0_, line0_3); // line1*4-line3 + float32x4_t rr0_ = vsubq_f32(r0_, line0_3); // line1*4-line3 float32x4_t rr1_ = vsubq_f32(r1_, line1_3); float32x4_t rr2_ = vsubq_f32(r2_, line2_3); float32x4_t rr3_ = vsubq_f32(r3_, line3_3); @@ -492,28 +491,28 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si r4 = vmulq_f32(line4_2, const4); r5 = vmulq_f32(line5_2, const4); - r0 = vsubq_f32(line0_4, r0); // line4 -4*line2 + r0 = vsubq_f32(line0_4, r0); // line4 -4*line2 r1 = vsubq_f32(line1_4, r1); r2 = vsubq_f32(line2_4, r2); r3 = vsubq_f32(line3_4, r3); r4 = vsubq_f32(line4_4, r4); r5 = vsubq_f32(line5_4, r5); - vst1q_f32(inp_ptr + s_size * 1, vsubq_f32(r0, rr0_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 7, vsubq_f32(r1, rr1_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 13, vsubq_f32(r2, rr2_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 19, vsubq_f32(r3, rr3_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 25, vsubq_f32(r4, rr4_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 31, vsubq_f32(r5, rr5_)); // inp_ptr[ s_size * (1 + i * 6)] - - vst1q_f32(inp_ptr + s_size * 2, vaddq_f32(r0, rr0_)); // inp_ptr[ s_size * (2 + i * 6)] - vst1q_f32(inp_ptr + s_size * 8, vaddq_f32(r1, rr1_)); // inp_ptr[ s_size * (2 + i * 6)] - vst1q_f32(inp_ptr + s_size * 14, vaddq_f32(r2, rr2_)); // inp_ptr[ s_size * (2 + i * 6)] - vst1q_f32(inp_ptr + s_size * 20, vaddq_f32(r3, rr3_)); // inp_ptr[ s_size * (2 + i * 6)] - vst1q_f32(inp_ptr + s_size * 26, vaddq_f32(r4, rr4_)); // inp_ptr[ s_size * (2 + i * 6)] - vst1q_f32(inp_ptr + s_size * 32, vaddq_f32(r5, rr5_)); // inp_ptr[ s_size * (2 + i * 6)] - - r0_ = vaddq_f32(line0_5, r0_); // 5 + 1*4 + vst1q_f32(inp_ptr + s_size * 1, vsubq_f32(r0, rr0_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 7, vsubq_f32(r1, rr1_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 13, vsubq_f32(r2, rr2_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 19, vsubq_f32(r3, rr3_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 25, vsubq_f32(r4, rr4_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 31, vsubq_f32(r5, rr5_)); // inp_ptr[ s_size * (1 + i * 6)] + + vst1q_f32(inp_ptr + s_size * 2, vaddq_f32(r0, rr0_)); // inp_ptr[ s_size * (2 + i * 6)] + vst1q_f32(inp_ptr + s_size * 8, vaddq_f32(r1, rr1_)); // inp_ptr[ s_size * (2 + i * 6)] + vst1q_f32(inp_ptr + s_size * 14, vaddq_f32(r2, rr2_)); // inp_ptr[ s_size * (2 + i * 6)] + vst1q_f32(inp_ptr + s_size * 20, vaddq_f32(r3, rr3_)); // inp_ptr[ s_size * (2 + i * 6)] + vst1q_f32(inp_ptr + s_size * 26, vaddq_f32(r4, rr4_)); // inp_ptr[ s_size * (2 + i * 6)] + vst1q_f32(inp_ptr + s_size * 32, vaddq_f32(r5, rr5_)); // inp_ptr[ s_size * (2 + i * 6)] + + r0_ = vaddq_f32(line0_5, r0_); // 5 + 1*4 r1_ = vaddq_f32(line1_5, r1_); r2_ = vaddq_f32(line2_5, r2_); r3_ = vaddq_f32(line3_5, r3_); @@ -526,12 +525,12 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si r3 = vmulq_f32(line3_3, const5); r4 = vmulq_f32(line4_3, const5); r5 = vmulq_f32(line5_3, const5); - vst1q_f32(inp_ptr + s_size * 5, vaddq_f32(r0, r0_)); // inp_ptr[ s_size * (5 + i * 6)] - vst1q_f32(inp_ptr + s_size * 11, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (5 + i * 6)] - vst1q_f32(inp_ptr + s_size * 17, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (5 + i * 6)] - vst1q_f32(inp_ptr + s_size * 23, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (5 + i * 6)] - vst1q_f32(inp_ptr + s_size * 29, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (5 + i * 6)] - vst1q_f32(inp_ptr + s_size * 35, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (5 + i * 6)] + vst1q_f32(inp_ptr + s_size * 5, vaddq_f32(r0, r0_)); // inp_ptr[ s_size * (5 + i * 6)] + vst1q_f32(inp_ptr + s_size * 11, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (5 + i * 6)] + vst1q_f32(inp_ptr + s_size * 17, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (5 + i * 6)] + vst1q_f32(inp_ptr + s_size * 23, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (5 + i * 6)] + vst1q_f32(inp_ptr + s_size * 29, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (5 + i * 6)] + vst1q_f32(inp_ptr + s_size * 35, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (5 + i * 6)] r0 = vmulq_f32(line0_0, const4); r1 = vmulq_f32(line1_0, const4); @@ -554,12 +553,12 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si r4 = vaddq_f32(r4, line4_4); r5 = vaddq_f32(r5, line5_4); - vst1q_f32(inp_ptr + s_size * 0, vaddq_f32(r0, r0_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 6, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 12, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 18, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 24, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (1 + i * 6)] - vst1q_f32(inp_ptr + s_size * 30, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 0, vaddq_f32(r0, r0_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 6, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 12, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 18, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 24, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (1 + i * 6)] + vst1q_f32(inp_ptr + s_size * 30, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (1 + i * 6)] // for(int i = 0; i < 6; i++) // { @@ -599,7 +598,7 @@ static inline void tran_input_4block(const float* input, float* trans_inp, int i int idxh[4]; int idxw[4]; -#pragma omp parallel for num_threads(num_thread) shared(block_hw,nn_block,in_hw) private(idxh,idxw) +#pragma omp parallel for num_threads(num_thread) shared(block_hw, nn_block, in_hw) private(idxh, idxw) for (int ib = 0; ib < nn_block; ib++) { float* inp_ptr_4tile = trans_inp + ib * 4 * ELEM_SIZE * inc; @@ -614,7 +613,7 @@ static inline void tran_input_4block(const float* input, float* trans_inp, int i if (idxh[0] == idxh[3]) { - float* temp_inp_ptr = ( float* )(input + idxh[0] * 4 * inw + idxw[0] * 4); + float* temp_inp_ptr = (float*)(input + idxh[0] * 4 * inw + idxw[0] * 4); for (int c = 0; c < inc; c++) { #ifdef __aarch64__ @@ -634,13 +633,13 @@ static inline void tran_input_4block(const float* input, float* trans_inp, int i for (int c = 0; c < inc; c++) { - trans_inp_1tile(( float* )input, buffer, idxh[0], idxw[0], c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, idxh[0], idxw[0], c, in_hw, inw); buffer += ELEM_SIZE; - trans_inp_1tile(( float* )input, buffer, idxh[1], idxw[1], c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, idxh[1], idxw[1], c, in_hw, inw); buffer += ELEM_SIZE; - trans_inp_1tile(( float* )input, buffer, idxh[2], idxw[2], c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, idxh[2], idxw[2], c, in_hw, inw); buffer += ELEM_SIZE; - trans_inp_1tile(( float* )input, buffer, idxh[3], idxw[3], c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, idxh[3], idxw[3], c, in_hw, inw); buffer += ELEM_SIZE; } // interleave @@ -673,7 +672,7 @@ static inline void tran_input_resi_block(const float* input, float* trans_inp, i { int ih = ib / block_w; int jw = ib % block_w; - trans_inp_1tile(( float* )input, buffer, ih, jw, c, in_hw, inw); + trans_inp_1tile((float*)input, buffer, ih, jw, c, in_hw, inw); buffer += ELEM_SIZE; } // interleave @@ -900,8 +899,7 @@ static inline void transform_output_f43_1tile(const float* buffer_ptr, float* ou float* out_ptr = out + cout_idx * out_hw; int i_h = idx_blockhw / block_w; int j_w = idx_blockhw % block_w; - if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || - (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) + if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) { trans_output_f43(buffer_ptr, out_ptr + (i_h * TILE * outw + j_w * TILE), outw, bias_ptr, activation); } @@ -955,17 +953,16 @@ static inline void transform_output_f43_4tile(float* buffer_ptr, float* out, int float* out_ptr = out + cout_idx * out_hw; if (bias) { - bias_ptr = ( float* )bias + cout_idx; + bias_ptr = (float*)bias + cout_idx; } for (int ii = 0; ii < 4; ii++) { int i_h = idx_h[ii]; int j_w = idx_w[ii]; - if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || - (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) + if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) { trans_output_f43(buffer_ptr, out_ptr + (i_h * TILE * outw + j_w * TILE), outw, bias_ptr, activation); - } // direct use_out_ptr + } // direct use_out_ptr else { int ret_h = TILE - resi_h; @@ -985,7 +982,7 @@ static inline void transform_output_f43_4tile(float* buffer_ptr, float* out, int out_pointer[hh * outw + ww] = do_activation(tmp_buffer[hh * 4 + ww], activation); } } - } // end else, tmp_buff + } // end else, tmp_buff buffer_ptr += ELEM_SIZE; } } @@ -1048,7 +1045,7 @@ static void wino_sgemm_set(const float* ker, const float* inp, float* output, co float* out_ptr = output + cout_idx * out_hw + idx_h[0] * TILE * out_w + idx_w[0] * TILE; if (bias) { - bias_ptr = ( float* )(bias + cout_idx); + bias_ptr = (float*)(bias + cout_idx); } float ker00[4] = {2, 4, 8, 0}; @@ -1086,11 +1083,10 @@ static void wino_sgemm_set(const float* ker, const float* inp, float* output, co { int i_h = idx_h[ii]; int j_w = idx_w[ii]; - if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || - (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) + if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) { trans_output_f43(buffer + ii * ELEM_SIZE + pss * 36 * 4, out_ptr + (i_h * TILE * out_w + j_w * TILE), out_w, (const float*)bias_ptr, activation); - } // direct use_out_ptr + } // direct use_out_ptr else { int ret_h = TILE - resi_h; @@ -1109,7 +1105,7 @@ static void wino_sgemm_set(const float* ker, const float* inp, float* output, co out_pointer[hh * out_w + ww] = do_activation(tmp_buffer[hh * 4 + ww], activation); } } - } // end else, tmp_buff + } // end else, tmp_buff } } } @@ -1163,7 +1159,7 @@ static void wino_sgemm_set(const float* ker, const float* inp, float* output, co } } // end interleave - transform_output_f43_1tile(( const float* )buffer, output, p, i, block_h, block_w, out_hw, out_w, resi_h, + transform_output_f43_1tile((const float*)buffer, output, p, i, block_h, block_w, out_hw, out_w, resi_h, resi_w, PER_OUT_CHAN, bias, activation); // end transform } @@ -1177,7 +1173,7 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo int flag_outw = 1; if (out_w < 16) flag_outw = 0; - + #pragma omp parallel for num_threads(num_thread) for (int p = (cout_start & -4); p < (cout_end & -4); p += 4) { @@ -1226,7 +1222,7 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo float* out_ptr = output + cout_idx * out_hw + idx_h[0] * TILE * out_w + idx_w[0] * TILE; if (bias) { - bias_ptr = ( float* )(bias + cout_idx); + bias_ptr = (float*)(bias + cout_idx); } float ker00[4] = {2, 4, 8, 0}; @@ -1268,13 +1264,12 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo { int i_h = idx_h[ii]; int j_w = idx_w[ii]; - if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || - (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) + if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) { trans_output_f43(buffer + ii * ELEM_SIZE + pss * 36 * 4, out_ptr + (i_h * TILE * out_w + j_w * TILE), out_w, - ( const float* )bias_ptr, activation); - } // direct use_out_ptr + (const float*)bias_ptr, activation); + } // direct use_out_ptr else { int ret_h = TILE - resi_h; @@ -1285,18 +1280,17 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo ret_w = TILE; // tmp_buffer trans_output_f43_ordinary(buffer + ii * ELEM_SIZE + pss * 36 * 4, tmp_buffer, - ( const float* )bias_ptr); + (const float*)bias_ptr); float* out_pointer = out_ptr + (i_h * TILE * out_w + j_w * TILE); for (int hh = 0; hh < ret_h; hh++) { for (int ww = 0; ww < ret_w; ww++) { - out_pointer[hh * out_w + ww] = - do_activation(tmp_buffer[hh * 4 + ww], activation); + out_pointer[hh * out_w + ww] = do_activation(tmp_buffer[hh * 4 + ww], activation); } } - } // end else, tmp_buff + } // end else, tmp_buff } } } @@ -1353,7 +1347,7 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo } } // end interleave - transform_output_f43_1tile(( const float* )buffer, output, p, i, block_h, block_w, out_hw, out_w, resi_h, + transform_output_f43_1tile((const float*)buffer, output, p, i, block_h, block_w, out_hw, out_w, resi_h, resi_w, 4, bias, activation); // end transform } @@ -1384,8 +1378,8 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo // gemm+interleave buffer[4][36] for (int s = 0; s < ELEM_SIZE; s++) { - float* inp_ = ( float* )(inp_ptr + s * 4 * cin); - float* ker_ = ( float* )(ker_ptr + s * cin); + float* inp_ = (float*)(inp_ptr + s * 4 * cin); + float* ker_ = (float*)(ker_ptr + s * cin); float sum0 = 0; float sum1 = 0; @@ -1415,12 +1409,11 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo { int i_h = idx_h[ii]; int j_w = idx_w[ii]; - if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || - (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) + if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))) { trans_output_f43(buffer + ii * ELEM_SIZE, out_ptr + (i_h * TILE * out_w + j_w * TILE), out_w, - ( const float* )bias_ptr, activation); - } // direct use_out_ptr + (const float*)bias_ptr, activation); + } // direct use_out_ptr else { int ret_h = TILE - resi_h; @@ -1430,7 +1423,7 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo if (j_w < block_w - 1) ret_w = TILE; // tmp_buffer - trans_output_f43_ordinary(buffer + ii * ELEM_SIZE, tmp_buffer, ( const float* )bias_ptr); + trans_output_f43_ordinary(buffer + ii * ELEM_SIZE, tmp_buffer, (const float*)bias_ptr); float* out_pointer = out_ptr + (i_h * TILE * out_w + j_w * TILE); for (int hh = 0; hh < ret_h; hh++) { @@ -1439,8 +1432,8 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo out_pointer[hh * out_w + ww] = do_activation(tmp_buffer[hh * 4 + ww], activation); } } - } // end else, tmp_buff - } // end transform + } // end else, tmp_buff + } // end transform } for (; i < block_hw; i++) @@ -1450,8 +1443,8 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo float buffer[ELEM_SIZE]; for (int s = 0; s < ELEM_SIZE; s++) { - float* inp_ = ( float* )(inp_ptr + s * cin); - float* ker_ = ( float* )(ker_ptr + s * cin); + float* inp_ = (float*)(inp_ptr + s * cin); + float* ker_ = (float*)(ker_ptr + s * cin); float sum = 0; for (int k = 0; k < cin; k++) @@ -1461,7 +1454,7 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo buffer[s] = sum; } // end interleave - transform_output_f43_1tile(( const float* )buffer, output, p, i, block_h, block_w, out_hw, out_w, resi_h, + transform_output_f43_1tile((const float*)buffer, output, p, i, block_h, block_w, out_hw, out_w, resi_h, resi_w, 1, bias, activation); // end transform } @@ -1473,7 +1466,7 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param) int output_c = filter->dims[0]; int input_c = filter->dims[1]; int trans_ker_size = output_c * input_c * ELEM_SIZE * sizeof(float); - return trans_ker_size + 128; // caution + return trans_ker_size + 128; // caution } int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, @@ -1482,7 +1475,7 @@ int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens int output_c = filter_tensor->dims[0]; int input_c = filter_tensor->dims[1]; int mem_size = get_private_mem_size(filter_tensor, param); - float* trans_mem = ( float* )sys_malloc(mem_size); + float* trans_mem = (float*)sys_malloc(mem_size); if (!priv_info->external_interleave_mem) { @@ -1492,7 +1485,7 @@ int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens } transform_kernel_f43_tile(filter_tensor, trans_mem); - interleave_kernel(trans_mem, ( float* )priv_info->interleave_buffer, output_c, input_c); + interleave_kernel(trans_mem, (float*)priv_info->interleave_buffer, output_c, input_c); sys_free(trans_mem); @@ -1548,16 +1541,16 @@ int wino_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, int padded_in_hw = padded_in_h * padded_in_w; /* buffer addr */ - float* input_buf = ( float* )input_tensor->data; - float* output_buf = ( float* )output_tensor->data; + float* input_buf = (float*)input_tensor->data; + float* output_buf = (float*)output_tensor->data; float* biases_buf = NULL; if (bias_tensor != NULL) - biases_buf = ( float* )bias_tensor->data; - float* col_buf = ( float* )priv_info->im2col_buffer; - float* interleave_buf = ( float* )priv_info->interleave_buffer; + biases_buf = (float*)bias_tensor->data; + float* col_buf = (float*)priv_info->im2col_buffer; + float* interleave_buf = (float*)priv_info->interleave_buffer; - float* input_padd_buf = ( float* )sys_malloc(sizeof(float) * padded_in_hw * in_c + 128); - float* trans_input_buf = ( float* )sys_malloc(sizeof(float) * block_hw * in_c * ELEM_SIZE + 128); + float* input_padd_buf = (float*)sys_malloc(sizeof(float) * padded_in_hw * in_c + 128); + float* trans_input_buf = (float*)sys_malloc(sizeof(float) * block_hw * in_c * ELEM_SIZE + 128); int nn_out_c = out_c / PER_OUT_CHAN * PER_OUT_CHAN; diff --git a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.h b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.h index 5f6685528..cac8b75b1 100644 --- a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.h +++ b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.h @@ -30,10 +30,8 @@ #include "graph/node.h" #include "graph/graph.h" - int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, - struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param) - ; + struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param); int wino_conv_hcl_postrun(struct conv_priv_info* info); diff --git a/source/device/cpu/op/conv/cortex-m/conv_cmsis.c b/source/device/cpu/op/conv/cortex-m/conv_cmsis.c index ecbe6c4ca..f9057f0b6 100644 --- a/source/device/cpu/op/conv/cortex-m/conv_cmsis.c +++ b/source/device/cpu/op/conv/cortex-m/conv_cmsis.c @@ -35,7 +35,6 @@ #include "arm_math.h" - struct cmsis_param { uint16_t bias_shift; @@ -78,7 +77,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str int scale = ir_tensor->scale; out_shift = cal_shift(scale); - struct cmsis_param* param = ( struct cmsis_param* )sys_malloc(sizeof(struct cmsis_param)); + struct cmsis_param* param = (struct cmsis_param*)sys_malloc(sizeof(struct cmsis_param)); param->bias_shift = bias_shift; param->out_shift = out_shift; @@ -86,9 +85,8 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str exec_node->ops_priv = param; /*2*ch_im_in*dim_kernel*dim_kernel */ - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - exec_node->shared_mem_size = - sizeof(q15_t) * 2 * conv_param->input_channel * conv_param->kernel_h * conv_param->kernel_w; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + exec_node->shared_mem_size = sizeof(q15_t) * 2 * conv_param->input_channel * conv_param->kernel_h * conv_param->kernel_w; return 0; } @@ -103,18 +101,18 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex { struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; - struct cmsis_param* cmsis_param = ( struct cmsis_param* )exec_node->ops_priv; + struct cmsis_param* cmsis_param = (struct cmsis_param*)exec_node->ops_priv; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; int ret = arm_convolve_HWC_q7_nonsquare( - input_tensor->data, input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[3], weight_tensor->data, - weight_tensor->dims[3], conv_param->kernel_w, conv_param->kernel_h, conv_param->pad_w0, conv_param->pad_h0, - conv_param->stride_w, conv_param->stride_h, bias_tensor->data, cmsis_param->bias_shift, cmsis_param->out_shift, - output_tensor->data, output_tensor->dims[2], output_tensor->dims[1], exec_graph->shared_mem, NULL); + input_tensor->data, input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[3], weight_tensor->data, + weight_tensor->dims[3], conv_param->kernel_w, conv_param->kernel_h, conv_param->pad_w0, conv_param->pad_h0, + conv_param->stride_w, conv_param->stride_h, bias_tensor->data, cmsis_param->bias_shift, cmsis_param->out_shift, + output_tensor->data, output_tensor->dims[2], output_tensor->dims[1], exec_graph->shared_mem, NULL); if (ret != ARM_MATH_SUCCESS) { @@ -137,12 +135,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc } static struct node_ops cmsis_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score}; int register_conv_cmsis_op() { diff --git a/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c b/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c index 68de53bb4..095dc59f8 100644 --- a/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c +++ b/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c @@ -35,7 +35,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -54,8 +53,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; if (conv_dw_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, cpu_affinity) < 0) { @@ -79,7 +78,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) { - struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem; + struct conv_param* param = (struct conv_param*)exec_node->op.param_mem; struct node* ir_node = exec_node; struct graph* ir_graph = ir_node->graph; @@ -108,8 +107,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc if (input_tensor->data_type != TENGINE_DT_FP32) return 0; - if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && - ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) + if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) return OPS_SCORE_BEST; else return 0; diff --git a/source/device/cpu/op/conv/mips/conv_dw_kernel_mips.c b/source/device/cpu/op/conv/mips/conv_dw_kernel_mips.c index 9e14cb452..7eafe03af 100644 --- a/source/device/cpu/op/conv/mips/conv_dw_kernel_mips.c +++ b/source/device/cpu/op/conv/mips/conv_dw_kernel_mips.c @@ -28,19 +28,18 @@ #include #include - #define max(a, b) ((a) > (b) ? (a) : (b)) #define min(a, b) ((a) < (b) ? (a) : (b)) void relu(float* data, int size, int activation) { - for(int i = 0; i < size; i++) + for (int i = 0; i < size; i++) { - data[i] = max(data[i], ( float )0); + data[i] = max(data[i], (float)0); - if(activation > 0) + if (activation > 0) { - data[i] = min(data[i], ( float )activation); + data[i] = min(data[i], (float)activation); } } } @@ -58,32 +57,32 @@ void convdw3x3s1(float* output, float* input, float* _kernel, float* _bias, int const int group = channel; const float* kernel = _kernel; - #pragma omp parallel for num_threads(num_thread) - for (int g=0; g0; remain--) + for (; remain > 0; remain--) { float sum = bias0; sum += r0[0] * k0[0]; @@ -131,7 +130,7 @@ void convdw3x3s1(float* output, float* input, float* _kernel, float* _bias, int { int remain = outw; - for (; remain>0; remain--) + for (; remain > 0; remain--) { float sum = bias0; sum += r0[0] * k0[0]; @@ -171,22 +170,22 @@ void convdw3x3s2(float* output, float* input, float* _kernel, float* _bias, int const int group = channel; - const int tailstep = w - 2*outw + w; + const int tailstep = w - 2 * outw + w; const float* kernel = _kernel; - #pragma omp parallel for num_threads(num_thread) - for (int g=0; g0; remain--) + for (; remain > 0; remain--) { float sum = bias0; sum += r0[0] * k0[0]; @@ -221,7 +220,7 @@ void convdw3x3s2(float* output, float* input, float* _kernel, float* _bias, int r1 += tailstep; r2 += tailstep; } - } + } } void pad(float* input, float* output, int in_h, int in_w, int out_h, int out_w, int top, int left, float v) @@ -282,9 +281,9 @@ void pad(float* input, float* output, int in_h, int in_w, int out_h, int out_w, int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity) { - float* input = ( float* )input_tensor->data; - float* output = ( float* )output_tensor->data; - float* kernel = ( float* )weight_tensor->data; + float* input = (float*)input_tensor->data; + float* output = (float*)output_tensor->data; + float* kernel = (float*)weight_tensor->data; float* biases = NULL; if (bias_tensor) biases = (float*)bias_tensor->data; @@ -298,7 +297,7 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struc int outc = output_tensor->dims[1]; int outh = output_tensor->dims[2]; int outw = output_tensor->dims[3]; - int out_hw = outh * outw; + int out_hw = outh * outw; int out_chw = out_hw * outc; int ksize_h = param->kernel_h; @@ -323,16 +322,16 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struc else { input_tmp = (float*)malloc(inh_tmp * inw_tmp * group * sizeof(float)); - for (int g=0; gir_node; @@ -46,8 +45,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* get cpu affinity */ conv_priv_info->cpu_type = exec_graph->cpu_affinity; @@ -67,7 +66,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct if (conv_hcl_set_shared_pack4_mem && exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size) { if (conv_hcl_set_shared_pack4_mem(conv_priv_info, exec_graph->shared_pack4_mem, - exec_graph->shared_pack4_mem_size) < 0) + exec_graph->shared_pack4_mem_size) + < 0) { TLOG_ERR("hcl conv: set shared pack4 memory failed\n"); // set_tengine_errno(EFAULT); @@ -119,14 +119,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (ir_node->input_num > 2) bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* fp32 run */ if (exec_graph->mode == TENGINE_MODE_FP32) { if (conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, - cpu_affinity) < 0) + cpu_affinity) + < 0) { TLOG_ERR("hcl conv run failed\n"); // set_tengine_errno(EFAULT); @@ -149,7 +150,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* fp32 postrun */ if (exec_graph->mode == TENGINE_MODE_FP32) @@ -182,10 +183,10 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; /* init the private info data of convolution op */ - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )sys_malloc(sizeof(struct conv_priv_info)); + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)sys_malloc(sizeof(struct conv_priv_info)); if (conv_priv_info == NULL) { // set_tengine_errno(ENOMEM); @@ -211,7 +212,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; sys_free(conv_priv_info); exec_node->ops_priv = NULL; @@ -224,7 +225,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc struct graph* ir_graph = ir_node->graph; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem; + struct conv_param* param = (struct conv_param*)exec_node->op.param_mem; int group = param->group; int kernel_h = param->kernel_h; int kernel_w = param->kernel_w; @@ -246,8 +247,7 @@ static struct node_ops hcl_node_ops = {.prerun = prerun, .postrun = postrun, .init_node = init_node, .release_node = release_node, - .score = score -}; + .score = score}; int register_conv_hcl_mips_op() { diff --git a/source/device/cpu/op/conv/mips/conv_kernel_mips.c b/source/device/cpu/op/conv/mips/conv_kernel_mips.c index 9bd9d4b69..b66994ec3 100644 --- a/source/device/cpu/op/conv/mips/conv_kernel_mips.c +++ b/source/device/cpu/op/conv/mips/conv_kernel_mips.c @@ -41,7 +41,6 @@ #include #include - #if __mips_msa #include #endif @@ -50,7 +49,7 @@ static int get_private_mem_size(struct tensor* filter) { - return filter->elem_num * filter->elem_size; // caution + return filter->elem_num * filter->elem_size; // caution } static void interleave(struct tensor* filter, struct conv_priv_info* priv_info) @@ -126,8 +125,8 @@ void input_pack4(int K, int N, float* pB, float* pB_t, int num_thread) int nn_size = N >> 2; int remian_size_start = nn_size << 2; - // [ch00, ch10, ch20, ch30, ch01, ch11, ch21, ch31, ch02, ch12, ch22, ch32, ch03, ch13, ch23, ch33 ....] - #pragma omp parallel for num_threads(num_thread) +// [ch00, ch10, ch20, ch30, ch01, ch11, ch21, ch31, ch02, ch12, ch22, ch32, ch03, ch13, ch23, ch33 ....] +#pragma omp parallel for num_threads(num_thread) for (int ii = 0; ii < nn_size; ii++) { int i = ii * 4; @@ -143,14 +142,14 @@ void input_pack4(int K, int N, float* pB, float* pB_t, int num_thread) tmp[1] = img[1]; tmp[2] = img[2]; tmp[3] = img[3]; -#endif // __mips_msa +#endif // __mips_msa tmp += 4; img += N; } } - // [ch00, ch01, ch02, ch03 ....] - #pragma omp parallel for num_threads(num_thread) +// [ch00, ch01, ch02, ch03 ....] +#pragma omp parallel for num_threads(num_thread) for (int i = remian_size_start; i < N; i++) { const float* img = pB + i; @@ -175,13 +174,13 @@ static void sgemm(int M, int N, int K, float* pA_t, float* pB_t, float* pC, int nn_outch = M >> 2; remain_outch_start = nn_outch << 2; - // output ch0 - ch3 - #pragma omp parallel for num_threads(num_thread) - for (int pp=0; ppdims[3]; int out_image_size = output->dims[1] * output->dims[2] * output->dims[3]; - float* interleave_fp32 = ( float* )priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size; + float* interleave_fp32 = (float*)priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size; float* im2col_pack4_fp32 = priv_info->im2col_buffer_pack4; - float* output_fp32 = ( float* )output->data + n * out_image_size + outchan_g * group * out_h * out_w; + float* output_fp32 = (float*)output->data + n * out_image_size + outchan_g * group * out_h * out_w; float* bias_fp32 = NULL; if (bias) - bias_fp32 = ( float* )bias->data + outchan_g * group; + bias_fp32 = (float*)bias->data + outchan_g * group; float* filter_sgemm = interleave_fp32; float* input_sgemm_pack4 = im2col_pack4_fp32; @@ -525,8 +524,7 @@ static int winograd_support(struct conv_param* param, int in_h, int in_w) if (in_h <= 10 && in_w <= 10) return 0; - if (group != 1 || kernel_h != 3 || kernel_w != 3 || stride_h != 1 || stride_w != 1 || dilation_h != 1 || - dilation_w != 1 || input_chan < 16 || output_chan < 16) + if (group != 1 || kernel_h != 3 || kernel_w != 3 || stride_h != 1 || stride_w != 1 || dilation_h != 1 || dilation_w != 1 || input_chan < 16 || output_chan < 16) return 0; return 1; @@ -560,8 +558,8 @@ int conv_hcl_get_interleave_pack4_size(int M, int K, struct tensor* filter) void conv_hcl_interleave_pack4(int M, int K, struct conv_priv_info* priv_info) { - float* pA = ( float* )priv_info->interleave_buffer; - float* pA_t = ( float* )priv_info->interleave_buffer_pack4; + float* pA = (float*)priv_info->interleave_buffer; + float* pA_t = (float*)priv_info->interleave_buffer_pack4; int nn_outch = M >> 2; int remain_outch_start = nn_outch << 2; @@ -674,8 +672,7 @@ int conv_hcl_postrun(struct conv_priv_info* priv_info) return wino_conv_hcl_postrun(priv_info); } - if (priv_info->external_interleave_pack4_mem && !priv_info->external_interleave_mem && - priv_info->interleave_buffer != NULL) + if (priv_info->external_interleave_pack4_mem && !priv_info->external_interleave_mem && priv_info->interleave_buffer != NULL) { sys_free(priv_info->interleave_buffer_pack4); priv_info->interleave_buffer_pack4 = NULL; @@ -713,7 +710,7 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru cpu_affinity); } - for (int i = 0; i < input_tensor->dims[0]; i++) // batch size + for (int i = 0; i < input_tensor->dims[0]; i++) // batch size { for (int j = 0; j < group; j++) { diff --git a/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.c b/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.c index c3d8ff789..19ede63f5 100644 --- a/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.c +++ b/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.c @@ -27,7 +27,7 @@ #include "wino_conv_kernel_mips.h" -#define TILE 4 +#define TILE 4 #define ELEM_SIZE ((TILE + 2) * (TILE + 2)) #define WINO_MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -37,11 +37,11 @@ static void relu(float* data, int size, int activation) { for (int i = 0; i < size; i++) { - data[i] = WINO_MAX(data[i], ( float )0); + data[i] = WINO_MAX(data[i], (float)0); if (activation > 0) { - data[i] = WINO_MIN(data[i], ( float )activation); + data[i] = WINO_MIN(data[i], (float)activation); } } } @@ -50,7 +50,7 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param) int output_c = filter->dims[0]; int input_c = filter->dims[1]; int trans_ker_size = output_c * input_c * ELEM_SIZE * sizeof(float); - return trans_ker_size + 128; // caution + return trans_ker_size + 128; // caution } static void pad_0_align_2D(float* dst, float* src, int m, int n, int m_align, int n_align, int pad_h, int pad_w) @@ -132,7 +132,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel int w_tm = outw_align / 4 * 6; int h_tm = outh_align / 4 * 6; - int nColBlocks = h_tm / 6; // may be the block num in Feathercnn + int nColBlocks = h_tm / 6; // may be the block num in Feathercnn int nRowBlocks = w_tm / 6; const int tiles = nColBlocks * nRowBlocks; @@ -164,7 +164,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel // 4 = 2 * r01 - r02 - 2 * r03 + r04 // 5 = 4 * r01 - 5 * r03 + r05 - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int q = 0; q < inch; q++) { const float* img = bottom_blob_bordered + q * w * h; @@ -322,7 +322,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel int w_tm = outw_align / 4 * 6; int h_tm = outh_align / 4 * 6; - int nColBlocks = h_tm / 6; // may be the block num in Feathercnn + int nColBlocks = h_tm / 6; // may be the block num in Feathercnn int nRowBlocks = w_tm / 6; const int tiles = nColBlocks * nRowBlocks; @@ -330,7 +330,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel top_blob_tm = dot_block; - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int r = 0; r < 9; r++) { int nn_outch = 0; @@ -533,7 +533,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel output6_tm[n] = sum6[n]; output7_tm[n] = sum7[n]; } -#endif // __mips_msa +#endif // __mips_msa output0_tm += 36; output1_tm += 36; output2_tm += 36; @@ -617,7 +617,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel output2_tm[n] = sum2[n]; output3_tm[n] = sum3[n]; } -#endif // __mips_msa +#endif // __mips_msa output0_tm += 36; output1_tm += 36; output2_tm += 36; @@ -658,7 +658,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel { for (int n = 0; n < 4; n++) { - sum0[n] += ( int )r0[n] * kptr[n]; + sum0[n] += (int)r0[n] * kptr[n]; } kptr += 4; r0 += 4; @@ -668,7 +668,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel { output0_tm[n] = sum0[n]; } -#endif // __mips_msa +#endif // __mips_msa output0_tm += 36; } } @@ -703,12 +703,12 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel int w_tm = outw_align / 4 * 6; int h_tm = outh_align / 4 * 6; - int nColBlocks = h_tm / 6; // may be the block num in Feathercnn + int nColBlocks = h_tm / 6; // may be the block num in Feathercnn int nRowBlocks = w_tm / 6; const int tiles = nColBlocks * nRowBlocks; - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int p = 0; p < outch; p++) { float* out_tile = top_blob_tm + 36 * tiles * p; @@ -816,14 +816,13 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kernel_wino, int inch, int outch) { - float* kernel_tm = ( float* )sys_malloc(6 * 6 * inch * outch * sizeof(float)); + float* kernel_tm = (float*)sys_malloc(6 * 6 * inch * outch * sizeof(float)); // G const float ktm[6][3] = { - {1.0f / 4, 0.0f, 0.0f}, {-1.0f / 6, -1.0f / 6, -1.0f / 6}, {-1.0f / 6, 1.0f / 6, -1.0f / 6}, - {1.0f / 24, 1.0f / 12, 1.0f / 6}, {1.0f / 24, -1.0f / 12, 1.0f / 6}, {0.0f, 0.0f, 1.0f}}; + {1.0f / 4, 0.0f, 0.0f}, {-1.0f / 6, -1.0f / 6, -1.0f / 6}, {-1.0f / 6, 1.0f / 6, -1.0f / 6}, {1.0f / 24, 1.0f / 12, 1.0f / 6}, {1.0f / 24, -1.0f / 12, 1.0f / 6}, {0.0f, 0.0f, 1.0f}}; - #pragma omp parallel for +#pragma omp parallel for for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) @@ -864,14 +863,14 @@ void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kerne int p = 0; for (; p + 7 < outch; p += 8) { - const float* kernel0 = ( const float* )kernel_tm + p * inch * 36; - const float* kernel1 = ( const float* )kernel_tm + (p + 1) * inch * 36; - const float* kernel2 = ( const float* )kernel_tm + (p + 2) * inch * 36; - const float* kernel3 = ( const float* )kernel_tm + (p + 3) * inch * 36; - const float* kernel4 = ( const float* )kernel_tm + (p + 4) * inch * 36; - const float* kernel5 = ( const float* )kernel_tm + (p + 5) * inch * 36; - const float* kernel6 = ( const float* )kernel_tm + (p + 6) * inch * 36; - const float* kernel7 = ( const float* )kernel_tm + (p + 7) * inch * 36; + const float* kernel0 = (const float*)kernel_tm + p * inch * 36; + const float* kernel1 = (const float*)kernel_tm + (p + 1) * inch * 36; + const float* kernel2 = (const float*)kernel_tm + (p + 2) * inch * 36; + const float* kernel3 = (const float*)kernel_tm + (p + 3) * inch * 36; + const float* kernel4 = (const float*)kernel_tm + (p + 4) * inch * 36; + const float* kernel5 = (const float*)kernel_tm + (p + 5) * inch * 36; + const float* kernel6 = (const float*)kernel_tm + (p + 6) * inch * 36; + const float* kernel7 = (const float*)kernel_tm + (p + 7) * inch * 36; float* ktmp = kernel_tm_test + p / 8 * inch * 32; @@ -931,10 +930,10 @@ void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kerne for (; p + 3 < outch; p += 4) { - const float* kernel0 = ( const float* )kernel_tm + p * inch * 36; - const float* kernel1 = ( const float* )kernel_tm + (p + 1) * inch * 36; - const float* kernel2 = ( const float* )kernel_tm + (p + 2) * inch * 36; - const float* kernel3 = ( const float* )kernel_tm + (p + 3) * inch * 36; + const float* kernel0 = (const float*)kernel_tm + p * inch * 36; + const float* kernel1 = (const float*)kernel_tm + (p + 1) * inch * 36; + const float* kernel2 = (const float*)kernel_tm + (p + 2) * inch * 36; + const float* kernel3 = (const float*)kernel_tm + (p + 3) * inch * 36; float* ktmp = kernel_tm_test + (p / 8 + (p % 8) / 4) * inch * 32; for (int q = 0; q < inch; q++) @@ -969,7 +968,7 @@ void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kerne for (; p < outch; p++) { - const float* kernel0 = ( const float* )kernel_tm + p * inch * 36; + const float* kernel0 = (const float*)kernel_tm + p * inch * 36; float* ktmp = kernel_tm_test + (p / 8 + (p % 8) / 4 + p % 4) * inch * 32; for (int q = 0; q < inch; q++) @@ -1003,7 +1002,7 @@ int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens int pad_h = param->pad_h0; int pad_w = param->pad_w0; - float* kernel = ( float* )filter_tensor->data; + float* kernel = (float*)filter_tensor->data; if (!priv_info->external_interleave_mem) { @@ -1023,17 +1022,17 @@ int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens int outw = block_w * TILE; int outh = block_h * TILE; - priv_info->input_pad = ( float* )sys_malloc(batch * input_c * pad_inhw * sizeof(float)); + priv_info->input_pad = (float*)sys_malloc(batch * input_c * pad_inhw * sizeof(float)); memset(priv_info->input_pad, 0, batch * input_c * pad_inhw * sizeof(float)); - priv_info->dot_block = ( float* )sys_malloc(ELEM_SIZE * block * output_c * sizeof(float)); - priv_info->transform_input = ( float* )sys_malloc(ELEM_SIZE * block * input_c * sizeof(float)); + priv_info->dot_block = (float*)sys_malloc(ELEM_SIZE * block * output_c * sizeof(float)); + priv_info->transform_input = (float*)sys_malloc(ELEM_SIZE * block * input_c * sizeof(float)); priv_info->output_bordered = NULL; if (outw != output_w || outh != output_h) { - priv_info->output_bordered = ( float* )sys_malloc(outw * outh * output_c * sizeof(float)); + priv_info->output_bordered = (float*)sys_malloc(outw * outh * output_c * sizeof(float)); } - conv3x3s1_winograd43_transform_kernel_sse(kernel, ( float* )priv_info->interleave_buffer, input_c, output_c); + conv3x3s1_winograd43_transform_kernel_sse(kernel, (float*)priv_info->interleave_buffer, input_c, output_c); return 0; } @@ -1111,11 +1110,11 @@ int wino_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, int padded_in_hw = padded_in_h * padded_in_w; /* buffer addr */ - float* input = ( float* )input_tensor->data; - float* output = ( float* )output_tensor->data; + float* input = (float*)input_tensor->data; + float* output = (float*)output_tensor->data; float* biases = NULL; if (bias_tensor != NULL) - biases = ( float* )bias_tensor->data; + biases = (float*)bias_tensor->data; pad_0_align_3D(priv_info->input_pad, input, in_h, in_w, padded_in_h, padded_in_w, in_c, pad_h0, pad_w0); diff --git a/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.h b/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.h index 4a2610126..aee0540a3 100644 --- a/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.h +++ b/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.h @@ -30,14 +30,12 @@ #include "graph/node.h" #include "graph/graph.h" - #if __mips_msa #include #endif int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, - struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param) - ; + struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param); int wino_conv_hcl_postrun(struct conv_priv_info* info); diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c index 7eab21fd0..338827acd 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c @@ -39,7 +39,6 @@ #include #include - static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -54,16 +53,16 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (ir_node->input_num > 2) bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; int ret = -1; if (exec_graph->mode == TENGINE_MODE_FP32) ret = conv_dw_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, cpu_affinity); else { - TLOG_ERR("hcl conv run failed\n"); - return -1; + TLOG_ERR("hcl conv run failed\n"); + return -1; } return ret; @@ -81,7 +80,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) { - struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem; + struct conv_param* param = (struct conv_param*)exec_node->op.param_mem; struct node* ir_node = exec_node; struct graph* ir_graph = ir_node->graph; @@ -113,12 +112,10 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc if (kernel_h != kernel_w || input_tensor->dims[0] > 1) return 0; - if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && - ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) + if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) + return OPS_SCORE_BEST; + else if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 5 && kernel_w == 5 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) return OPS_SCORE_BEST; - else if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 5 && kernel_w == 5 && - ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) - return OPS_SCORE_BEST; else return 0; } diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.c index 18e6ef238..a7b45fbc0 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.c +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.c @@ -54,7 +54,6 @@ #include #include - #define max(a, b) ((a) > (b) ? (a) : (b)) #define min(a, b) ((a) < (b) ? (a) : (b)) @@ -62,11 +61,11 @@ static void relu(float* data, int size, int activation) { for (int i = 0; i < size; i++) { - data[i] = max(data[i], ( float )0); + data[i] = max(data[i], (float)0); if (activation > 0) { - data[i] = min(data[i], ( float )activation); + data[i] = min(data[i], (float)activation); } } } @@ -319,7 +318,7 @@ static void convdw5x5s1(float* output, float* input, float* _kernel, float* _bia int c_step_out = outw * outh; const int group = channel; - const float* kernel = _kernel; + const float* kernel = _kernel; #pragma omp parallel for num_threads(num_thread) for (int g = 0; g < group; g++) @@ -597,12 +596,12 @@ static void convdw5x5s2(float* output, float* input, float* _kernel, float* _bia int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity) { - float* input = ( float* )input_tensor->data; - float* output = ( float* )output_tensor->data; - float* kernel = ( float* )weight_tensor->data; + float* input = (float*)input_tensor->data; + float* output = (float*)output_tensor->data; + float* kernel = (float*)weight_tensor->data; float* biases = NULL; if (bias_tensor) - biases = ( float* )bias_tensor->data; + biases = (float*)bias_tensor->data; int batch_number = input_tensor->dims[0]; int inc = input_tensor->dims[1]; @@ -637,8 +636,8 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struc input_tmp = input; else { - input_tmp = ( float* )sys_malloc(inh_tmp * inw_tmp * group * sizeof(float)); -#pragma omp parallel for num_threads(num_thread) + input_tmp = (float*)sys_malloc(inh_tmp * inw_tmp * group * sizeof(float)); +#pragma omp parallel for num_threads(num_thread) for (int g = 0; g < group; g++) { float* pad_in = input + g * inh * inw; @@ -650,13 +649,13 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struc /* process */ for (int i = 0; i < batch_number; i++) { - if (ksize_h ==3 && stride_h == 1) + if (ksize_h == 3 && stride_h == 1) convdw3x3s1(output, input_tmp, kernel, biases, group, inh_tmp, inw_tmp, outh, outw, num_thread); - else if (ksize_h ==3 && stride_h == 2) + else if (ksize_h == 3 && stride_h == 2) convdw3x3s2(output, input_tmp, kernel, biases, group, inh_tmp, inw_tmp, outh, outw, num_thread); - else if (ksize_h ==5 && stride_h == 1) + else if (ksize_h == 5 && stride_h == 1) convdw5x5s1(output, input_tmp, kernel, biases, group, inh_tmp, inw_tmp, outh, outw, num_thread); - else if (ksize_h ==5 && stride_h == 2) + else if (ksize_h == 5 && stride_h == 2) convdw5x5s2(output, input_tmp, kernel, biases, group, inh_tmp, inw_tmp, outh, outw, num_thread); else TLOG_ERR("convdw %d x %d, s %d not support.\n", ksize_h, ksize_w, stride_h); diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.h b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.h index a08006b87..0a6276579 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.h +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.h @@ -31,7 +31,6 @@ #include "graph/node.h" #include "graph/graph.h" - int conv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity); diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c index 8cd3bfcf4..ac7333ff0 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c @@ -39,7 +39,6 @@ #include "string.h" - static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -48,8 +47,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* get cpu affinity */ conv_priv_info->cpu_type = exec_graph->cpu_affinity; @@ -68,7 +67,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct if (conv_hcl_set_shared_pack4_mem && exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size) { if (conv_hcl_set_shared_pack4_mem(conv_priv_info, exec_graph->shared_pack4_mem, - exec_graph->shared_pack4_mem_size) < 0) + exec_graph->shared_pack4_mem_size) + < 0) { TLOG_ERR("hcl conv: set shared pack4 memory failed\n"); return -1; @@ -117,14 +117,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (ir_node->input_num > 2) bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* fp32 run */ if (exec_graph->mode == TENGINE_MODE_FP32) { if (conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, - cpu_affinity) < 0) + cpu_affinity) + < 0) { TLOG_ERR("hcl conv run failed\n"); return -1; @@ -146,7 +147,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* fp32 postrun */ if (exec_graph->mode == TENGINE_MODE_FP32) @@ -178,10 +179,10 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; /* init the private info data of convolution op */ - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )sys_malloc(sizeof(struct conv_priv_info)); + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)sys_malloc(sizeof(struct conv_priv_info)); if (conv_priv_info == NULL) { return -1; @@ -206,7 +207,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; sys_free(conv_priv_info); exec_node->ops_priv = NULL; @@ -219,7 +220,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc struct graph* ir_graph = ir_node->graph; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem; + struct conv_param* param = (struct conv_param*)exec_node->op.param_mem; int group = param->group; int kernel_h = param->kernel_h; int kernel_w = param->kernel_w; @@ -235,16 +236,14 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_PREFER; } - static struct node_ops hcl_node_ops = { - .prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score -}; + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score}; int register_conv_hcl_rv64_op() { @@ -256,4 +255,3 @@ int unregister_conv_hcl_rv64_op() unregister_builtin_node_ops(OP_CONV, &hcl_node_ops); return 0; } - diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c index 3666bab6e..999a49d4e 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c @@ -32,9 +32,9 @@ #define PER_OUT_CHAN 16 void sgemm_4x16_rv64(float* biases, float* input, float* kernel, long kernel_size, float* output, long output_xy, - int activation, int layout); + int activation, int layout); void sgemm_4x4_rv64(float* biases, float* input, float* kernel, long kernel_size, float* output, long output_xy, - int activation, int layout); + int activation, int layout); void im2col_fp32_1x1(float* input, int input_xy, float* col, int col_cnt, int input_chan); void im2col_fp32_3x3(float* input, int w, int h, int channel, float* cur_col, int stride); @@ -103,9 +103,9 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern /* kernel interleave */ static void interleave(struct tensor* filter, struct conv_priv_info* priv_info, struct conv_param* param) { - int group = param->group; + int group = param->group; int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3]; - int out_chan = filter->dims[0] / group; + int out_chan = filter->dims[0] / group; int out_chan_align4 = (out_chan + 3) / 4 * 4; int kernel_size_algin = kernel_size * out_chan_align4; @@ -115,7 +115,7 @@ static void interleave(struct tensor* filter, struct conv_priv_info* priv_info, float* interleave_buf = priv_info->interleave_buffer; for (int g = 0; g < group; g++) { - float* cur_kernel = kernel + g * kernel_size_group; + float* cur_kernel = kernel + g * kernel_size_group; float* cur_interleave = interleave_buf + g * kernel_size_algin; interleave_kernel(cur_kernel, cur_interleave, out_chan, kernel_size); } @@ -130,14 +130,13 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k int in_xy = in_w * in_h; int out_xy = out_w * out_h; int col_end3 = out_xy & 3; - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int col_i = 0; col_i < out_xy - 3; col_i += 4) { float* cur_col = col + col_i * kernel_size; float* cur_input = input + col_i; im2col_fp32_1x1(cur_input, in_xy, cur_col, 4, in_c); - } int col_i = out_xy & -4; float* cur_col; @@ -164,7 +163,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k int out_xy = out_w * out_h; int col_end3 = out_xy & 3; int is_pad0 = (pad_w0 == 0) && (pad_h0 == 0) && (pad_w1 == 0) && (pad_h1 == 0); - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int col_i = 0; col_i < (out_xy & -4); col_i += 4) { float* cur_col = col + col_i * kernel_size; @@ -176,7 +175,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k { float* l0 = input + (imy0 * s_h - pad_h0) * in_w + (imx0 * s_w - pad_w0); { - im2col_fp32_3x3(l0, in_w, in_h, in_c, cur_col, s_w); // add im2col 3x3 + im2col_fp32_3x3(l0, in_w, in_h, in_c, cur_col, s_w); // add im2col 3x3 cur_col += 4 * kernel_size; } } @@ -239,7 +238,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k else { int out_xy = out_w * out_h; - #pragma omp parallel for num_threads(num_thread) +#pragma omp parallel for num_threads(num_thread) for (int col_i = 0; col_i < out_xy - 3; col_i += 4) { int kernel_size = k_w * k_h * in_c; @@ -314,20 +313,20 @@ static void sgemm_set(float* col, float* kernel, float* biases, float* output, i { int p = pp * PER_OUT_CHAN; - float* biasptr = biases ? ( float* )(biases + p) : NULL; - float* kernel_tmp = ( float* )(kernel + p * kernel_size); - float* output_tmp = ( float* )(output + p * output_xy); + float* biasptr = biases ? (float*)(biases + p) : NULL; + float* kernel_tmp = (float*)(kernel + p * kernel_size); + float* output_tmp = (float*)(output + p * output_xy); int col_line = 0; for (col_line = 0; col_line + 3 < output_xy; col_line += 4) { - float* col_tmp = ( float* )(col + col_line * kernel_size); - sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0); // FIXME: replace with sgemm_4x16_rv64 + float* col_tmp = (float*)(col + col_line * kernel_size); + sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0); // FIXME: replace with sgemm_4x16_rv64 } { float result[64]; - float* col_tmp = ( float* )(col + col_line * kernel_size); - sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, result, 4, activation, 0); // FIXME: replace with sgemm_4x16_rv64 + float* col_tmp = (float*)(col + col_line * kernel_size); + sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, result, 4, activation, 0); // FIXME: replace with sgemm_4x16_rv64 for (int i = 0; i < 16; i++) { for (int j = 0; j < (col_end3); j++) @@ -343,14 +342,14 @@ static void sgemm_set(float* col, float* kernel, float* biases, float* output, i { int p = pp * PER_OUT_CHAN; - float* biasptr = biases ? ( float* )(biases + p) : NULL; - float* kernel_tmp = ( float* )(kernel + p * kernel_size); - float* output_tmp = ( float* )(output + p * output_xy); + float* biasptr = biases ? (float*)(biases + p) : NULL; + float* kernel_tmp = (float*)(kernel + p * kernel_size); + float* output_tmp = (float*)(output + p * output_xy); for (int col_line = 0; col_line + 3 < output_xy; col_line += 4) { - float* col_tmp = ( float* )(col + col_line * kernel_size); - sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0); // FIXME: replace with sgemm_4x16_rv64 + float* col_tmp = (float*)(col + col_line * kernel_size); + sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0); // FIXME: replace with sgemm_4x16_rv64 } } } @@ -364,23 +363,23 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in int kernel_end3 = ch_end & 0x3; #pragma omp parallel for num_threads(num_thread) private(result) - for (int kernel_num = ch_start; kernel_num < ((ch_end & -4)-3); kernel_num += 4) + for (int kernel_num = ch_start; kernel_num < ((ch_end & -4) - 3); kernel_num += 4) { float* cur_biases = NULL; float *cur_col, *cur_kernel, *cur_output; int col_line; if (biases) - cur_biases = ( float* )(biases + kernel_num); - cur_kernel = ( float* )(kernel + kernel_num * kernel_size); - cur_output = ( float* )(output + kernel_num * output_xy); + cur_biases = (float*)(biases + kernel_num); + cur_kernel = (float*)(kernel + kernel_num * kernel_size); + cur_output = (float*)(output + kernel_num * output_xy); for (col_line = 0; col_line < (output_xy & -4); col_line += 4) { - cur_col = ( float* )(col + col_line * kernel_size); + cur_col = (float*)(col + col_line * kernel_size); sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, cur_output + col_line, output_xy, activation, 0); } if (col_end3) { - cur_col = ( float* )(col + col_line * kernel_size); + cur_col = (float*)(col + col_line * kernel_size); sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); for (int i = 0; i < 4; i++) { @@ -394,13 +393,13 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in int kernel_num = (ch_end & -4); float* cur_biases = NULL; if (biases) - cur_biases = ( float* )(biases + kernel_num); - float* cur_kernel = ( float* )(kernel + kernel_num * kernel_size); - #pragma omp parallel for num_threads(num_thread) private(result) + cur_biases = (float*)(biases + kernel_num); + float* cur_kernel = (float*)(kernel + kernel_num * kernel_size); +#pragma omp parallel for num_threads(num_thread) private(result) for (int col_line = 0; col_line < (output_xy & -4); col_line += 4) { - float* cur_col = ( float* )(col + col_line * kernel_size); - sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); + float* cur_col = (float*)(col + col_line * kernel_size); + sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); for (int i = 0; i < kernel_end3; i++) for (int j = 0; j < 4; j++) *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; @@ -408,8 +407,8 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in int col_line = output_xy & -4; if (col_end3) { - float* cur_col = ( float* )(col + col_line * kernel_size); - sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); + float* cur_col = (float*)(col + col_line * kernel_size); + sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); for (int i = 0; i < (kernel_end3); i++) { for (int j = 0; j < (col_end3); j++) @@ -448,15 +447,15 @@ static int winograd_support(struct conv_param* param, int in_h, int in_w) */ int conv_hcl_get_shared_mem_size_rv64(struct tensor* input, struct tensor* output, struct conv_param* param) { - int in_h = input->dims[2]; - int in_w = input->dims[3]; + int in_h = input->dims[2]; + int in_w = input->dims[3]; int out_h = output->dims[2]; int out_w = output->dims[3]; int group = param->group; - int input_chan = param->input_channel / group; + int input_chan = param->input_channel / group; int kernel_size = input_chan * param->kernel_h * param->kernel_w; - int out_cstep = out_h * out_w; // channel cstep, output_h * output_w - int elem_size = input->elem_size; // uint8/int8 is 1 byte, fp32 is 4 bytes + int out_cstep = out_h * out_w; // channel cstep, output_h * output_w + int elem_size = input->elem_size; // uint8/int8 is 1 byte, fp32 is 4 bytes out_cstep = (out_cstep + 3) / 4 * 4; int mem_size = elem_size * kernel_size * out_cstep + 128; @@ -473,7 +472,7 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param) int out_chan = filter->dims[0] / group; int out_chan_align4 = (out_chan + 3) / 4 * 4; int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3]; - int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128; // caution + int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128; // caution return mem_size; } @@ -523,7 +522,7 @@ int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, s { int mem_size = conv_hcl_get_shared_mem_size_rv64(input_tensor, output_tensor, param); void* mem = sys_malloc(mem_size); - priv_info->im2col_buffer = mem; + priv_info->im2col_buffer = mem; priv_info->im2col_buffer_size = mem_size; } @@ -532,7 +531,7 @@ int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, s { int mem_size = get_private_mem_size(filter_tensor, param); void* mem = sys_malloc(mem_size); - priv_info->interleave_buffer = mem; + priv_info->interleave_buffer = mem; priv_info->interleave_buffer_size = mem_size; } @@ -607,18 +606,18 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru int output_image_size = output_tensor->dims[1] * output_tensor->dims[2] * output_tensor->dims[3]; /* buffer addr */ - float* input_buf = ( float* )input_tensor->data; - float* output_buf = ( float* )output_tensor->data; + float* input_buf = (float*)input_tensor->data; + float* output_buf = (float*)output_tensor->data; float* biases_buf = NULL; if (bias_tensor != NULL) - biases_buf = ( float* )bias_tensor->data; - float* col_buf = ( float* )priv_info->im2col_buffer; - float* interleave_buf = ( float* )priv_info->interleave_buffer; + biases_buf = (float*)bias_tensor->data; + float* col_buf = (float*)priv_info->im2col_buffer; + float* interleave_buf = (float*)priv_info->interleave_buffer; int sgemm_set_chan = out_c / PER_OUT_CHAN * PER_OUT_CHAN; int sgemm_set_remain = out_c % PER_OUT_CHAN; - for (int n = 0; n < batch; n++) // batch size + for (int n = 0; n < batch; n++) // batch size { for (int g = 0; g < group; g++) { diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h index 9a49bffa1..f2f9051a6 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h @@ -49,7 +49,7 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru int num_thread, int cpu_affinity) __attribute__((weak)); int conv_hcl_get_shared_mem_size_rv64(struct tensor* input_tensor, struct tensor* output_tensor, - struct conv_param* param); + struct conv_param* param); int conv_hcl_get_shared_pack4_mem_size(struct tensor* input_tensor, struct tensor* output_tensor, struct conv_param* param) __attribute__((weak)); diff --git a/source/device/cpu/op/conv/x86/conv_direct_hcl_int8_x86.c b/source/device/cpu/op/conv/x86/conv_direct_hcl_int8_x86.c index 7948e34e8..43647e551 100644 --- a/source/device/cpu/op/conv/x86/conv_direct_hcl_int8_x86.c +++ b/source/device/cpu/op/conv/x86/conv_direct_hcl_int8_x86.c @@ -37,7 +37,6 @@ #include #include - static void pad_int8(int8_t* input, int8_t* output, int in_h, int in_w, int out_h, int out_w, int top, int left, int8_t v) { int8_t* ptr = input; @@ -94,7 +93,7 @@ static void pad_int8(int8_t* input, int8_t* output, int in_h, int in_w, int out_ } static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_param* param, int num_thread) + struct tensor* output_tensor, struct conv_param* param, int num_thread) { int inch = input_tensor->dims[1]; int inh = input_tensor->dims[2]; @@ -115,9 +114,9 @@ static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight float* output_fp32 = (float*)sys_malloc(out_size * sizeof(float)); int8_t* output_int8 = (int8_t*)output_tensor->data; - int8_t* input_int8 = (int8_t*)input_tensor->data; + int8_t* input_int8 = (int8_t*)input_tensor->data; int32_t* bias_int32 = NULL; - if(bias_tensor) + if (bias_tensor) bias_int32 = (int32_t*)bias_tensor->data; /* get scale value of quantizaiton */ @@ -135,8 +134,8 @@ static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight input_tmp = input_int8; else { - input_tmp = ( int8_t* )sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t)); -#pragma omp parallel for num_threads(num_thread) + input_tmp = (int8_t*)sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t)); +#pragma omp parallel for num_threads(num_thread) for (int g = 0; g < inch; g++) { int8_t* pad_in = input_int8 + g * inh * inw; @@ -149,7 +148,7 @@ static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight for (int p = 0; p < outch; p++) { int32_t* out0 = output_int32 + p * out_hw; - int8_t* kernel0 = (int8_t* )kernel + p * inch * 9; + int8_t* kernel0 = (int8_t*)kernel + p * inch * 9; for (int q = 0; q < inch; q++) { @@ -169,15 +168,15 @@ static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight { int sum0 = 0; - sum0 += ( int )r0[0] * kernel0[0]; - sum0 += ( int )r0[1] * kernel0[1]; - sum0 += ( int )r0[2] * kernel0[2]; - sum0 += ( int )r1[0] * kernel0[3]; - sum0 += ( int )r1[1] * kernel0[4]; - sum0 += ( int )r1[2] * kernel0[5]; - sum0 += ( int )r2[0] * kernel0[6]; - sum0 += ( int )r2[1] * kernel0[7]; - sum0 += ( int )r2[2] * kernel0[8]; + sum0 += (int)r0[0] * kernel0[0]; + sum0 += (int)r0[1] * kernel0[1]; + sum0 += (int)r0[2] * kernel0[2]; + sum0 += (int)r1[0] * kernel0[3]; + sum0 += (int)r1[1] * kernel0[4]; + sum0 += (int)r1[2] * kernel0[5]; + sum0 += (int)r2[0] * kernel0[6]; + sum0 += (int)r2[1] * kernel0[7]; + sum0 += (int)r2[2] * kernel0[8]; *outptr0 += sum0; @@ -204,9 +203,9 @@ static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight { int output_off = i * (outh * outw) + j; if (bias_tensor) - output_fp32[output_off] = (float )(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i]; + output_fp32[output_off] = (float)(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i]; else - output_fp32[output_off] = (float )output_int32[output_off] * input_scale * kernel_scales[i]; + output_fp32[output_off] = (float)output_int32[output_off] * input_scale * kernel_scales[i]; } } @@ -252,7 +251,7 @@ static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight { int output_off = i * (outh * outw) + j; - int32_t data_i32 = ( int32_t )(round(output_fp32[output_off] / output_scale)); + int32_t data_i32 = (int32_t)(round(output_fp32[output_off] / output_scale)); if (data_i32 > 127) data_i32 = 127; else if (data_i32 < -127) @@ -292,9 +291,9 @@ static int conv3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight float* output_fp32 = (float*)sys_malloc(out_size * sizeof(float)); int8_t* output_int8 = (int8_t*)output_tensor->data; - int8_t* input_int8 = (int8_t*)input_tensor->data; + int8_t* input_int8 = (int8_t*)input_tensor->data; int32_t* bias_int32 = NULL; - if(bias_tensor) + if (bias_tensor) bias_int32 = (int32_t*)bias_tensor->data; /* get scale value of quantizaiton */ @@ -312,8 +311,8 @@ static int conv3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight input_tmp = input_int8; else { - input_tmp = ( int8_t* )sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t)); -#pragma omp parallel for num_threads(num_thread) + input_tmp = (int8_t*)sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t)); +#pragma omp parallel for num_threads(num_thread) for (int g = 0; g < inch; g++) { int8_t* pad_in = input_int8 + g * inh * inw; @@ -328,7 +327,7 @@ static int conv3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight for (int p = 0; p < outch; p++) { int32_t* out0 = output_int32 + p * out_hw; - int8_t* kernel0 = (int8_t* )kernel + p * inch * 9; + int8_t* kernel0 = (int8_t*)kernel + p * inch * 9; for (int q = 0; q < inch; q++) { @@ -348,15 +347,15 @@ static int conv3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight { int sum0 = 0; - sum0 += ( int )r0[0] * kernel0[0]; - sum0 += ( int )r0[1] * kernel0[1]; - sum0 += ( int )r0[2] * kernel0[2]; - sum0 += ( int )r1[0] * kernel0[3]; - sum0 += ( int )r1[1] * kernel0[4]; - sum0 += ( int )r1[2] * kernel0[5]; - sum0 += ( int )r2[0] * kernel0[6]; - sum0 += ( int )r2[1] * kernel0[7]; - sum0 += ( int )r2[2] * kernel0[8]; + sum0 += (int)r0[0] * kernel0[0]; + sum0 += (int)r0[1] * kernel0[1]; + sum0 += (int)r0[2] * kernel0[2]; + sum0 += (int)r1[0] * kernel0[3]; + sum0 += (int)r1[1] * kernel0[4]; + sum0 += (int)r1[2] * kernel0[5]; + sum0 += (int)r2[0] * kernel0[6]; + sum0 += (int)r2[1] * kernel0[7]; + sum0 += (int)r2[2] * kernel0[8]; *outptr0 += sum0; @@ -383,9 +382,9 @@ static int conv3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight { int output_off = i * (outh * outw) + j; if (bias_tensor) - output_fp32[output_off] = (float )(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i]; + output_fp32[output_off] = (float)(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i]; else - output_fp32[output_off] = (float )output_int32[output_off] * input_scale * kernel_scales[i]; + output_fp32[output_off] = (float)output_int32[output_off] * input_scale * kernel_scales[i]; } } @@ -431,7 +430,7 @@ static int conv3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight { int output_off = i * (outh * outw) + j; - int32_t data_i32 = ( int32_t )(round(output_fp32[output_off] / output_scale)); + int32_t data_i32 = (int32_t)(round(output_fp32[output_off] / output_scale)); if (data_i32 > 127) data_i32 = 127; else if (data_i32 < -127) @@ -466,19 +465,19 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; int ret = -1; - switch(conv_param->stride_h) + switch (conv_param->stride_h) { - case 1: - ret = conv3x3s1_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_param, num_thread); - break; - case 2: - ret = conv3x3s2_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_param, num_thread); - break; - default: - TLOG_ERR("Direct Convolution Int8 not support the stride %d\n", conv_param->stride_h); + case 1: + ret = conv3x3s1_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_param, num_thread); + break; + case 2: + ret = conv3x3s2_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_param, num_thread); + break; + default: + TLOG_ERR("Direct Convolution Int8 not support the stride %d\n", conv_param->stride_h); } return ret; @@ -496,7 +495,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) { - struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem; + struct conv_param* param = (struct conv_param*)exec_node->op.param_mem; struct node* ir_node = exec_node; struct graph* ir_graph = ir_node->graph; @@ -520,8 +519,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc if (input_tensor->data_type != TENGINE_DT_INT8) return 0; - if (group == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && - ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) + if (group == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) return OPS_SCORE_BEST * 2; else return 0; diff --git a/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c b/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c index 6dfdb8fd1..b94bcb363 100644 --- a/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c +++ b/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c @@ -39,7 +39,6 @@ #include #include - static void pad_int8(int8_t* input, int8_t* output, int in_h, int in_w, int out_h, int out_w, int top, int left, int8_t v) { int8_t* ptr = input; @@ -96,7 +95,7 @@ static void pad_int8(int8_t* input, int8_t* output, int in_h, int in_w, int out_ } static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_param* param, int num_thread) + struct tensor* output_tensor, struct conv_param* param, int num_thread) { int inch = input_tensor->dims[1]; int inh = input_tensor->dims[2]; @@ -117,9 +116,9 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig float* output_fp32 = (float*)sys_malloc(out_size * sizeof(float)); int8_t* output_int8 = (int8_t*)output_tensor->data; - int8_t* input_int8 = (int8_t*)input_tensor->data; + int8_t* input_int8 = (int8_t*)input_tensor->data; int32_t* bias_int32 = NULL; - if(bias_tensor) + if (bias_tensor) bias_int32 = (int32_t*)bias_tensor->data; /* get scale value of quantizaiton */ @@ -137,7 +136,7 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig input_tmp = input_int8; else { - input_tmp = ( int8_t* )sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t)); + input_tmp = (int8_t*)sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t)); #pragma omp parallel for num_threads(num_thread) for (int g = 0; g < inch; g++) { @@ -151,7 +150,7 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig for (int p = 0; p < outch; p++) { int32_t* out0 = output_int32 + p * out_hw; - int8_t* kernel0 = (int8_t* )kernel + p * 9; + int8_t* kernel0 = (int8_t*)kernel + p * 9; int* outptr0 = out0; @@ -169,15 +168,15 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig { int sum0 = 0; - sum0 += ( int )r0[0] * kernel0[0]; - sum0 += ( int )r0[1] * kernel0[1]; - sum0 += ( int )r0[2] * kernel0[2]; - sum0 += ( int )r1[0] * kernel0[3]; - sum0 += ( int )r1[1] * kernel0[4]; - sum0 += ( int )r1[2] * kernel0[5]; - sum0 += ( int )r2[0] * kernel0[6]; - sum0 += ( int )r2[1] * kernel0[7]; - sum0 += ( int )r2[2] * kernel0[8]; + sum0 += (int)r0[0] * kernel0[0]; + sum0 += (int)r0[1] * kernel0[1]; + sum0 += (int)r0[2] * kernel0[2]; + sum0 += (int)r1[0] * kernel0[3]; + sum0 += (int)r1[1] * kernel0[4]; + sum0 += (int)r1[2] * kernel0[5]; + sum0 += (int)r2[0] * kernel0[6]; + sum0 += (int)r2[1] * kernel0[7]; + sum0 += (int)r2[2] * kernel0[8]; *outptr0 += sum0; @@ -203,9 +202,9 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig { int output_off = i * (outh * outw) + j; if (bias_tensor) - output_fp32[output_off] = (float )(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i]; + output_fp32[output_off] = (float)(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i]; else - output_fp32[output_off] = (float )output_int32[output_off] * input_scale * kernel_scales[i]; + output_fp32[output_off] = (float)output_int32[output_off] * input_scale * kernel_scales[i]; } } @@ -251,7 +250,7 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig { int output_off = i * (outh * outw) + j; - int32_t data_i32 = ( int32_t )(round(output_fp32[output_off] / output_scale)); + int32_t data_i32 = (int32_t)(round(output_fp32[output_off] / output_scale)); if (data_i32 > 127) data_i32 = 127; else if (data_i32 < -127) @@ -269,9 +268,8 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig return 0; } - static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_param* param, int num_thread) + struct tensor* output_tensor, struct conv_param* param, int num_thread) { int inch = input_tensor->dims[1]; int inh = input_tensor->dims[2]; @@ -292,9 +290,9 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig float* output_fp32 = (float*)sys_malloc(out_size * sizeof(float)); int8_t* output_int8 = (int8_t*)output_tensor->data; - int8_t* input_int8 = (int8_t*)input_tensor->data; + int8_t* input_int8 = (int8_t*)input_tensor->data; int32_t* bias_int32 = NULL; - if(bias_tensor) + if (bias_tensor) bias_int32 = (int32_t*)bias_tensor->data; /* get scale value of quantizaiton */ @@ -312,8 +310,8 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig input_tmp = input_int8; else { - input_tmp = ( int8_t* )sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t)); -#pragma omp parallel for num_threads(num_thread) + input_tmp = (int8_t*)sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t)); +#pragma omp parallel for num_threads(num_thread) for (int g = 0; g < inch; g++) { int8_t* pad_in = input_int8 + g * inh * inw; @@ -328,7 +326,7 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig for (int p = 0; p < outch; p++) { int32_t* out0 = output_int32 + p * out_hw; - int8_t* kernel0 = (int8_t* )kernel + p * 9; + int8_t* kernel0 = (int8_t*)kernel + p * 9; int* outptr0 = out0; @@ -346,15 +344,15 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig { int sum0 = 0; - sum0 += ( int )r0[0] * kernel0[0]; - sum0 += ( int )r0[1] * kernel0[1]; - sum0 += ( int )r0[2] * kernel0[2]; - sum0 += ( int )r1[0] * kernel0[3]; - sum0 += ( int )r1[1] * kernel0[4]; - sum0 += ( int )r1[2] * kernel0[5]; - sum0 += ( int )r2[0] * kernel0[6]; - sum0 += ( int )r2[1] * kernel0[7]; - sum0 += ( int )r2[2] * kernel0[8]; + sum0 += (int)r0[0] * kernel0[0]; + sum0 += (int)r0[1] * kernel0[1]; + sum0 += (int)r0[2] * kernel0[2]; + sum0 += (int)r1[0] * kernel0[3]; + sum0 += (int)r1[1] * kernel0[4]; + sum0 += (int)r1[2] * kernel0[5]; + sum0 += (int)r2[0] * kernel0[6]; + sum0 += (int)r2[1] * kernel0[7]; + sum0 += (int)r2[2] * kernel0[8]; *outptr0 += sum0; @@ -380,9 +378,9 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig { int output_off = i * (outh * outw) + j; if (bias_tensor) - output_fp32[output_off] = (float )(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i]; + output_fp32[output_off] = (float)(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i]; else - output_fp32[output_off] = (float )output_int32[output_off] * input_scale * kernel_scales[i]; + output_fp32[output_off] = (float)output_int32[output_off] * input_scale * kernel_scales[i]; } } @@ -428,7 +426,7 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig { int output_off = i * (outh * outw) + j; - int32_t data_i32 = ( int32_t )(round(output_fp32[output_off] / output_scale)); + int32_t data_i32 = (int32_t)(round(output_fp32[output_off] / output_scale)); if (data_i32 > 127) data_i32 = 127; else if (data_i32 < -127) @@ -447,19 +445,19 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig } static int conv_dw_run_int8(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_param* param, int num_thread) + struct tensor* output_tensor, struct conv_param* param, int num_thread) { int ret = -1; - switch(param->stride_h) + switch (param->stride_h) { - case 1: - ret = convdw3x3s1_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, param, num_thread); - break; - case 2: - ret = convdw3x3s2_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, param, num_thread); - break; - default: - TLOG_ERR("Direct Convolution Int8 not support the stride %d\n", param->stride_h); + case 1: + ret = convdw3x3s1_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, param, num_thread); + break; + case 2: + ret = convdw3x3s2_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, param, num_thread); + break; + default: + TLOG_ERR("Direct Convolution Int8 not support the stride %d\n", param->stride_h); } return ret; @@ -480,8 +478,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (ir_node->input_num > 2) bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; int ret = -1; if (exec_graph->mode == TENGINE_MODE_FP32) @@ -490,8 +488,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex ret = conv_dw_run_int8(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_param, num_thread); else { - TLOG_ERR("hcl conv run failed\n"); - return -1; + TLOG_ERR("hcl conv run failed\n"); + return -1; } return ret; @@ -509,7 +507,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) { - struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem; + struct conv_param* param = (struct conv_param*)exec_node->op.param_mem; struct node* ir_node = exec_node; struct graph* ir_graph = ir_node->graph; @@ -538,8 +536,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc if (kernel_h != kernel_w || input_tensor->dims[0] > 1) return 0; - if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && - ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) + if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) return OPS_SCORE_BEST; else return 0; diff --git a/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.c b/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.c index 8da88c902..45ec0536c 100644 --- a/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.c +++ b/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.c @@ -46,7 +46,6 @@ #include #endif - #define max(a, b) ((a) > (b) ? (a) : (b)) #define min(a, b) ((a) < (b) ? (a) : (b)) @@ -54,11 +53,11 @@ static void relu(float* data, int size, int activation) { for (int i = 0; i < size; i++) { - data[i] = max(data[i], ( float )0); + data[i] = max(data[i], (float)0); if (activation > 0) { - data[i] = min(data[i], ( float )activation); + data[i] = min(data[i], (float)activation); } } } @@ -127,9 +126,9 @@ static void convdw3x3s1(float* output, float* img_data, float* kernel_data, floa int channel_count = inc >> 3; int channel_remain = inc - (channel_count << 3); // generate the image tmp - float* img_tmp = ( float* )sys_malloc(8 * (unsigned long)inwh * (channel_count + 1) * sizeof(float)); - float* kernel_tmp = ( float* )sys_malloc(8 * 9 * (channel_count + 1) * sizeof(float)); - float* bias_tmp = ( float* )sys_malloc(8 * (channel_count + 1) * sizeof(float)); + float* img_tmp = (float*)sys_malloc(8 * (unsigned long)inwh * (channel_count + 1) * sizeof(float)); + float* kernel_tmp = (float*)sys_malloc(8 * 9 * (channel_count + 1) * sizeof(float)); + float* bias_tmp = (float*)sys_malloc(8 * (channel_count + 1) * sizeof(float)); { for (int i = 0; i < channel_count; i++) { @@ -334,7 +333,7 @@ static void convdw3x3s1(float* output, float* img_data, float* kernel_data, floa } } - float* output_tmp = ( float* )sys_malloc((unsigned long)outwh * (channel_count + 1) * 8 * sizeof(float)); + float* output_tmp = (float*)sys_malloc((unsigned long)outwh * (channel_count + 1) * 8 * sizeof(float)); for (int c = 0; c < channel_count + 1; c++) { float* ktmp = kernel_tmp + c * 8 * 9; @@ -783,9 +782,9 @@ static void convdw3x3s2(float* output, float* img_data, float* kernel_data, floa int channel_count = inc >> 3; int channel_remain = inc - (channel_count << 3); // generate the image tmp - float* img_tmp = ( float* )sys_malloc(8 * (unsigned long)inwh * (channel_count + 1) * sizeof(float)); - float* kernel_tmp = ( float* )sys_malloc(8 * 9 * (channel_count + 1) * sizeof(float)); - float* bias_tmp = ( float* )sys_malloc(8 * (channel_count + 1) * sizeof(float)); + float* img_tmp = (float*)sys_malloc(8 * (unsigned long)inwh * (channel_count + 1) * sizeof(float)); + float* kernel_tmp = (float*)sys_malloc(8 * 9 * (channel_count + 1) * sizeof(float)); + float* bias_tmp = (float*)sys_malloc(8 * (channel_count + 1) * sizeof(float)); { for (int i = 0; i < channel_count; i++) { @@ -993,7 +992,7 @@ static void convdw3x3s2(float* output, float* img_data, float* kernel_data, floa } } - float* output_tmp = ( float* )sys_malloc((unsigned long)outwh * (channel_count + 1) * 8 * sizeof(float)); + float* output_tmp = (float*)sys_malloc((unsigned long)outwh * (channel_count + 1) * 8 * sizeof(float)); for (int c = 0; c < channel_count + 1; c++) { float* ktmp = kernel_tmp + c * 8 * 9; @@ -1310,9 +1309,9 @@ static void convdw3x3s1(float* output, float* img_data, float* kernel_data, floa int channel_remain = inc - (channel_count << 2); // generate the image tmp - float* img_tmp = ( float* )sys_malloc(4 * inwh * (channel_count + 1) * sizeof(float)); - float* kernel_tmp = ( float* )sys_malloc(4 * 9 * (channel_count + 1) * sizeof(float)); - float* bias_tmp = ( float* )sys_malloc(4 * (channel_count + 1) * sizeof(float)); + float* img_tmp = (float*)sys_malloc(4 * inwh * (channel_count + 1) * sizeof(float)); + float* kernel_tmp = (float*)sys_malloc(4 * 9 * (channel_count + 1) * sizeof(float)); + float* bias_tmp = (float*)sys_malloc(4 * (channel_count + 1) * sizeof(float)); { for (int i = 0; i < channel_count; i++) { @@ -1416,7 +1415,7 @@ static void convdw3x3s1(float* output, float* img_data, float* kernel_data, floa } } } - float* output_tmp = ( float* )sys_malloc(outwh * 4 * (channel_count + 1) * sizeof(float)); + float* output_tmp = (float*)sys_malloc(outwh * 4 * (channel_count + 1) * sizeof(float)); for (int c = 0; c < channel_count + 1; c++) { @@ -1951,9 +1950,9 @@ static void convdw3x3s2(float* output, float* img_data, float* kernel_data, floa int channel_count = inc >> 2; int channel_remain = inc - (channel_count << 2); // generate the image tmp - float* img_tmp = ( float* )sys_malloc(4 * inwh * (channel_count + 1) * sizeof(float)); - float* kernel_tmp = ( float* )sys_malloc(4 * 9 * (channel_count + 1) * sizeof(float)); - float* bias_tmp = ( float* )sys_malloc(4 * (channel_count + 1) * sizeof(float)); + float* img_tmp = (float*)sys_malloc(4 * inwh * (channel_count + 1) * sizeof(float)); + float* kernel_tmp = (float*)sys_malloc(4 * 9 * (channel_count + 1) * sizeof(float)); + float* bias_tmp = (float*)sys_malloc(4 * (channel_count + 1) * sizeof(float)); { for (int i = 0; i < channel_count; i++) { @@ -2057,7 +2056,7 @@ static void convdw3x3s2(float* output, float* img_data, float* kernel_data, floa } } } - float* output_tmp = ( float* )sys_malloc(outwh * 4 * (channel_count + 1) * sizeof(float)); + float* output_tmp = (float*)sys_malloc(outwh * 4 * (channel_count + 1) * sizeof(float)); for (int c = 0; c < channel_count + 1; c++) { float* ktmp = kernel_tmp + c * 4 * 9; @@ -2525,12 +2524,12 @@ static void convdw3x3s2(float* output, float* input, float* _kernel, float* _bia int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity) { - float* input = ( float* )input_tensor->data; - float* output = ( float* )output_tensor->data; - float* kernel = ( float* )weight_tensor->data; + float* input = (float*)input_tensor->data; + float* output = (float*)output_tensor->data; + float* kernel = (float*)weight_tensor->data; float* biases = NULL; if (bias_tensor) - biases = ( float* )bias_tensor->data; + biases = (float*)bias_tensor->data; int batch_number = input_tensor->dims[0]; int inc = input_tensor->dims[1]; @@ -2565,8 +2564,8 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struc input_tmp = input; else { - input_tmp = ( float* )sys_malloc((size_t)inh_tmp * inw_tmp * group * sizeof(float)); -#pragma omp parallel for num_threads(num_thread) + input_tmp = (float*)sys_malloc((size_t)inh_tmp * inw_tmp * group * sizeof(float)); +#pragma omp parallel for num_threads(num_thread) for (int g = 0; g < group; g++) { float* pad_in = input + g * inh * inw; diff --git a/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.h b/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.h index 665f832f9..85b6ad3ea 100644 --- a/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.h +++ b/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.h @@ -31,7 +31,6 @@ #include "graph/node.h" #include "graph/graph.h" - int conv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity); diff --git a/source/device/cpu/op/conv/x86/conv_hcl_x86.c b/source/device/cpu/op/conv/x86/conv_hcl_x86.c index 7215a1bd7..b1a3cf689 100644 --- a/source/device/cpu/op/conv/x86/conv_hcl_x86.c +++ b/source/device/cpu/op/conv/x86/conv_hcl_x86.c @@ -38,7 +38,6 @@ #include - static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -47,8 +46,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* get cpu affinity */ conv_priv_info->cpu_type = exec_graph->cpu_affinity; @@ -67,7 +66,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct if (conv_hcl_set_shared_pack4_mem && exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size) { if (conv_hcl_set_shared_pack4_mem(conv_priv_info, exec_graph->shared_pack4_mem, - exec_graph->shared_pack4_mem_size) < 0) + exec_graph->shared_pack4_mem_size) + < 0) { TLOG_ERR("hcl conv: set shared pack4 memory failed\n"); @@ -119,14 +119,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (ir_node->input_num > 2) bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* fp32 run */ if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8 || exec_graph->mode == TENGINE_MODE_INT8) { if (conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, - cpu_affinity) < 0) + cpu_affinity) + < 0) { TLOG_ERR("hcl conv run failed\n"); return -1; @@ -150,7 +151,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; /* dynamic get the shape of output tensor */ int n = input_tensor->dims[0]; @@ -212,10 +213,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc } else { - out_h = - (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) / - conv_param->stride_h + - 1; + out_h = (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) / conv_param->stride_h + 1; } if (conv_param->pad_w0 < 0) @@ -238,10 +236,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc } else { - out_w = - (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) / - conv_param->stride_w + - 1; + out_w = (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) / conv_param->stride_w + 1; } int dims[4]; @@ -254,7 +249,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc dims[2] = out_h; dims[3] = out_w; - for (int i=0; i<4; i++) + for (int i = 0; i < 4; i++) { if (dims[i] == 0) dims[i] = 1; @@ -271,7 +266,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc dims[2] = out_w; dims[3] = out_c; - for (int i=0; i<4; i++) + for (int i = 0; i < 4; i++) { if (dims[i] == 0) dims[i] = 1; @@ -286,10 +281,10 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; /* fp32 postrun */ - if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8 || exec_graph->mode == TENGINE_MODE_INT8 ) + if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8 || exec_graph->mode == TENGINE_MODE_INT8) { if (conv_hcl_postrun(conv_priv_info) < 0) { @@ -318,10 +313,10 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; /* init the private info data of convolution op */ - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )sys_malloc(sizeof(struct conv_priv_info)); + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)sys_malloc(sizeof(struct conv_priv_info)); if (conv_priv_info == NULL) { return -1; @@ -346,7 +341,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; + struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; sys_free(conv_priv_info); exec_node->ops_priv = NULL; @@ -359,7 +354,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc struct graph* ir_graph = ir_node->graph; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem; + struct conv_param* param = (struct conv_param*)exec_node->op.param_mem; int group = param->group; int kernel_h = param->kernel_h; int kernel_w = param->kernel_w; @@ -381,8 +376,7 @@ static struct node_ops hcl_node_ops = {.prerun = prerun, .postrun = postrun, .init_node = init_node, .release_node = release_node, - .score = score -}; + .score = score}; int register_conv_hcl_x86_op() { diff --git a/source/device/cpu/op/conv/x86/conv_kernel_x86.c b/source/device/cpu/op/conv/x86/conv_kernel_x86.c index 763fad86f..ee90eee1c 100644 --- a/source/device/cpu/op/conv/x86/conv_kernel_x86.c +++ b/source/device/cpu/op/conv/x86/conv_kernel_x86.c @@ -51,13 +51,12 @@ #define max(a, b) ((a) > (b) ? (a) : (b)) #define min(a, b) ((a) < (b) ? (a) : (b)) - static int get_private_mem_size(struct tensor* filter) { - if (filter->data_type == TENGINE_DT_UINT8) // simulator uint8 inference with fp32 + if (filter->data_type == TENGINE_DT_UINT8) // simulator uint8 inference with fp32 return filter->elem_num * filter->elem_size * 4; else - return filter->elem_num * filter->elem_size; // caution + return filter->elem_num * filter->elem_size; // caution } static void interleave(struct tensor* filter, struct conv_priv_info* priv_info) @@ -69,7 +68,7 @@ static void interleave(struct tensor* filter, struct conv_priv_info* priv_info) static void interleave_uint8(struct tensor* filter, struct conv_priv_info* priv_info) { /* dequant uint8 weight to fp32 for simulator */ - float* weight_fp32 = (float* )priv_info->interleave_buffer; + float* weight_fp32 = (float*)priv_info->interleave_buffer; uint8_t* weight_uint8 = (uint8_t*)filter->data; float scale = filter->scale; int zero_point = filter->zero_point; @@ -81,7 +80,7 @@ static void interleave_uint8(struct tensor* filter, struct conv_priv_info* priv_ } void im2col_fp32(float* data_img, float* data_col, int inh, int inw, int inc, int outh, int outw, int ksize_h, - int ksize_w, int sh, int sw, int ph, int pw, int dh, int dw) + int ksize_w, int sh, int sw, int ph, int pw, int dh, int dw) { const int channels_col = ksize_h * ksize_w * inc; @@ -163,7 +162,7 @@ void im2col_uint8(uint8_t* data_img, float* data_col, struct tensor* input_tenso if (im_row >= 0 && im_row < inh) { - uint8_t * in = data_img + inw * (im_row + inh * c_) + im_col + (w_low - 1) * sw; + uint8_t* in = data_img + inw * (im_row + inh * c_) + im_col + (w_low - 1) * sw; memset(out, 0, w_low * sizeof(float)); out += w_low; @@ -218,12 +217,12 @@ void im2col_int8(int8_t* data_img, int8_t* data_col, struct tensor* input_tensor for (int h = 0; h < outh; ++h) { const int im_row = kh * dh + h * sh - ph; - int8_t * out = data_col + (c * outh + h) * outw; - const int8_t * end = out + w_high; + int8_t* out = data_col + (c * outh + h) * outw; + const int8_t* end = out + w_high; if (im_row >= 0 && im_row < inh) { - int8_t * in = data_img + inw * (im_row + inh * c_) + im_col + (w_low - 1) * sw; + int8_t* in = data_img + inw * (im_row + inh * c_) + im_col + (w_low - 1) * sw; memset(out, 0, w_low * sizeof(int8_t)); out += w_low; while (out < end) @@ -249,8 +248,8 @@ static void im2col_ir(struct tensor* input, struct tensor* output, struct conv_p int image_size = input->dims[1] * input->dims[2] * input->dims[3]; int group_size = input_chan * input->dims[2] * input->dims[3]; - void* input_base = (void*)((uint8_t*)input->data + (n * image_size + group * group_size) * input->elem_size); - void* im2col_buf = (void*)priv_info->im2col_buffer; + void* input_base = (void*)((uint8_t*)input->data + (n * image_size + group * group_size) * input->elem_size); + void* im2col_buf = (void*)priv_info->im2col_buffer; if (input->data_type == TENGINE_DT_FP32) { @@ -297,7 +296,7 @@ void input_pack4_fp32(int K, int N, float* pB, float* pB_t, int num_thread) tmp[5] = img[5]; tmp[6] = img[6]; tmp[7] = img[7]; -#endif // __SSE__ +#endif // __SSE__ tmp += 8; img += N; } @@ -333,7 +332,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i { int i = pp * 8; - float* output0 = pC + ( i )*N; + float* output0 = pC + (i)*N; float* output1 = pC + (i + 1) * N; float* output2 = pC + (i + 2) * N; float* output3 = pC + (i + 3) * N; @@ -369,18 +368,18 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i __m256 _vb1 = _mm256_loadu_ps(vb + 8); __m256 _vb2 = _mm256_loadu_ps(vb + 16); __m256 _vb3 = _mm256_loadu_ps(vb + 24); - _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00 - _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1); // sum1 = (a00-a07) * k10 - _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2); // sum2 = (a00-a07) * k20 - _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3); // sum3 = (a00-a07) * k30 + _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00 + _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1); // sum1 = (a00-a07) * k10 + _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2); // sum2 = (a00-a07) * k20 + _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3); // sum3 = (a00-a07) * k30 _va0 = _mm256_broadcast_ss(va + 4); _va1 = _mm256_broadcast_ss(va + 5); _va2 = _mm256_broadcast_ss(va + 6); _va3 = _mm256_broadcast_ss(va + 7); - _sum4 = _mm256_fmadd_ps(_vb0, _va0, _sum4); // sum4 = (a00-a07) * k40 - _sum5 = _mm256_fmadd_ps(_vb0, _va1, _sum5); // sum5 = (a00-a07) * k50 - _sum6 = _mm256_fmadd_ps(_vb0, _va2, _sum6); // sum6 = (a00-a07) * k60 - _sum7 = _mm256_fmadd_ps(_vb0, _va3, _sum7); // sum7 = (a00-a07) * k70 + _sum4 = _mm256_fmadd_ps(_vb0, _va0, _sum4); // sum4 = (a00-a07) * k40 + _sum5 = _mm256_fmadd_ps(_vb0, _va1, _sum5); // sum5 = (a00-a07) * k50 + _sum6 = _mm256_fmadd_ps(_vb0, _va2, _sum6); // sum6 = (a00-a07) * k60 + _sum7 = _mm256_fmadd_ps(_vb0, _va3, _sum7); // sum7 = (a00-a07) * k70 va += 8; @@ -389,18 +388,18 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i _va1 = _mm256_broadcast_ss(va + 1); _va2 = _mm256_broadcast_ss(va + 2); _va3 = _mm256_broadcast_ss(va + 3); - _sum0 = _mm256_fmadd_ps(_vb1, _va0, _sum0); // sum0 += (a10-a17) * k01 - _sum1 = _mm256_fmadd_ps(_vb1, _va1, _sum1); // sum1 += (a10-a17) * k11 - _sum2 = _mm256_fmadd_ps(_vb1, _va2, _sum2); // sum2 += (a10-a17) * k21 - _sum3 = _mm256_fmadd_ps(_vb1, _va3, _sum3); // sum3 += (a10-a17) * k31 + _sum0 = _mm256_fmadd_ps(_vb1, _va0, _sum0); // sum0 += (a10-a17) * k01 + _sum1 = _mm256_fmadd_ps(_vb1, _va1, _sum1); // sum1 += (a10-a17) * k11 + _sum2 = _mm256_fmadd_ps(_vb1, _va2, _sum2); // sum2 += (a10-a17) * k21 + _sum3 = _mm256_fmadd_ps(_vb1, _va3, _sum3); // sum3 += (a10-a17) * k31 _va0 = _mm256_broadcast_ss(va + 4); _va1 = _mm256_broadcast_ss(va + 5); _va2 = _mm256_broadcast_ss(va + 6); _va3 = _mm256_broadcast_ss(va + 7); - _sum4 = _mm256_fmadd_ps(_vb1, _va0, _sum4); // sum4 += (a10-a17) * k41 - _sum5 = _mm256_fmadd_ps(_vb1, _va1, _sum5); // sum5 += (a10-a17) * k51 - _sum6 = _mm256_fmadd_ps(_vb1, _va2, _sum6); // sum6 += (a10-a17) * k61 - _sum7 = _mm256_fmadd_ps(_vb1, _va3, _sum7); // sum7 += (a10-a17) * k71 + _sum4 = _mm256_fmadd_ps(_vb1, _va0, _sum4); // sum4 += (a10-a17) * k41 + _sum5 = _mm256_fmadd_ps(_vb1, _va1, _sum5); // sum5 += (a10-a17) * k51 + _sum6 = _mm256_fmadd_ps(_vb1, _va2, _sum6); // sum6 += (a10-a17) * k61 + _sum7 = _mm256_fmadd_ps(_vb1, _va3, _sum7); // sum7 += (a10-a17) * k71 va += 8; @@ -409,18 +408,18 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i _va1 = _mm256_broadcast_ss(va + 1); _va2 = _mm256_broadcast_ss(va + 2); _va3 = _mm256_broadcast_ss(va + 3); - _sum0 = _mm256_fmadd_ps(_vb2, _va0, _sum0); // sum0 += (a20-a27) * k02 - _sum1 = _mm256_fmadd_ps(_vb2, _va1, _sum1); // sum1 += (a20-a27) * k12 - _sum2 = _mm256_fmadd_ps(_vb2, _va2, _sum2); // sum2 += (a20-a27) * k22 - _sum3 = _mm256_fmadd_ps(_vb2, _va3, _sum3); // sum3 += (a20-a27) * k32 + _sum0 = _mm256_fmadd_ps(_vb2, _va0, _sum0); // sum0 += (a20-a27) * k02 + _sum1 = _mm256_fmadd_ps(_vb2, _va1, _sum1); // sum1 += (a20-a27) * k12 + _sum2 = _mm256_fmadd_ps(_vb2, _va2, _sum2); // sum2 += (a20-a27) * k22 + _sum3 = _mm256_fmadd_ps(_vb2, _va3, _sum3); // sum3 += (a20-a27) * k32 _va0 = _mm256_broadcast_ss(va + 4); _va1 = _mm256_broadcast_ss(va + 5); _va2 = _mm256_broadcast_ss(va + 6); _va3 = _mm256_broadcast_ss(va + 7); - _sum4 = _mm256_fmadd_ps(_vb2, _va0, _sum4); // sum4 += (a20-a27) * k42 - _sum5 = _mm256_fmadd_ps(_vb2, _va1, _sum5); // sum5 += (a20-a27) * k52 - _sum6 = _mm256_fmadd_ps(_vb2, _va2, _sum6); // sum6 += (a20-a27) * k62 - _sum7 = _mm256_fmadd_ps(_vb2, _va3, _sum7); // sum7 += (a20-a27) * k72 + _sum4 = _mm256_fmadd_ps(_vb2, _va0, _sum4); // sum4 += (a20-a27) * k42 + _sum5 = _mm256_fmadd_ps(_vb2, _va1, _sum5); // sum5 += (a20-a27) * k52 + _sum6 = _mm256_fmadd_ps(_vb2, _va2, _sum6); // sum6 += (a20-a27) * k62 + _sum7 = _mm256_fmadd_ps(_vb2, _va3, _sum7); // sum7 += (a20-a27) * k72 va += 8; @@ -429,18 +428,18 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i _va1 = _mm256_broadcast_ss(va + 1); _va2 = _mm256_broadcast_ss(va + 2); _va3 = _mm256_broadcast_ss(va + 3); - _sum0 = _mm256_fmadd_ps(_vb3, _va0, _sum0); // sum0 += (a30-a37) * k03 - _sum1 = _mm256_fmadd_ps(_vb3, _va1, _sum1); // sum1 += (a30-a37) * k13 - _sum2 = _mm256_fmadd_ps(_vb3, _va2, _sum2); // sum2 += (a30-a37) * k23 - _sum3 = _mm256_fmadd_ps(_vb3, _va3, _sum3); // sum3 += (a30-a37) * k33 + _sum0 = _mm256_fmadd_ps(_vb3, _va0, _sum0); // sum0 += (a30-a37) * k03 + _sum1 = _mm256_fmadd_ps(_vb3, _va1, _sum1); // sum1 += (a30-a37) * k13 + _sum2 = _mm256_fmadd_ps(_vb3, _va2, _sum2); // sum2 += (a30-a37) * k23 + _sum3 = _mm256_fmadd_ps(_vb3, _va3, _sum3); // sum3 += (a30-a37) * k33 _va0 = _mm256_broadcast_ss(va + 4); _va1 = _mm256_broadcast_ss(va + 5); _va2 = _mm256_broadcast_ss(va + 6); _va3 = _mm256_broadcast_ss(va + 7); - _sum4 = _mm256_fmadd_ps(_vb3, _va0, _sum4); // sum4 += (a30-a37) * k43 - _sum5 = _mm256_fmadd_ps(_vb3, _va1, _sum5); // sum5 += (a30-a37) * k53 - _sum6 = _mm256_fmadd_ps(_vb3, _va2, _sum6); // sum6 += (a30-a37) * k63 - _sum7 = _mm256_fmadd_ps(_vb3, _va3, _sum7); // sum7 += (a30-a37) * k73 + _sum4 = _mm256_fmadd_ps(_vb3, _va0, _sum4); // sum4 += (a30-a37) * k43 + _sum5 = _mm256_fmadd_ps(_vb3, _va1, _sum5); // sum5 += (a30-a37) * k53 + _sum6 = _mm256_fmadd_ps(_vb3, _va2, _sum6); // sum6 += (a30-a37) * k63 + _sum7 = _mm256_fmadd_ps(_vb3, _va3, _sum7); // sum7 += (a30-a37) * k73 va += 8; vb += 32; @@ -458,14 +457,14 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i __m256 _va6 = _mm256_broadcast_ss(va + 6); __m256 _va7 = _mm256_broadcast_ss(va + 7); __m256 _vb0 = _mm256_loadu_ps(vb); - _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00 - _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1); // sum1 = (a00-a07) * k10 - _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2); // sum2 = (a00-a07) * k20 - _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3); // sum3 = (a00-a07) * k30 - _sum4 = _mm256_fmadd_ps(_vb0, _va4, _sum4); // sum4 = (a00-a07) * k40 - _sum5 = _mm256_fmadd_ps(_vb0, _va5, _sum5); // sum5 = (a00-a07) * k50 - _sum6 = _mm256_fmadd_ps(_vb0, _va6, _sum6); // sum6 = (a00-a07) * k60 - _sum7 = _mm256_fmadd_ps(_vb0, _va7, _sum7); // sum7 = (a00-a07) * k70 + _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00 + _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1); // sum1 = (a00-a07) * k10 + _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2); // sum2 = (a00-a07) * k20 + _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3); // sum3 = (a00-a07) * k30 + _sum4 = _mm256_fmadd_ps(_vb0, _va4, _sum4); // sum4 = (a00-a07) * k40 + _sum5 = _mm256_fmadd_ps(_vb0, _va5, _sum5); // sum5 = (a00-a07) * k50 + _sum6 = _mm256_fmadd_ps(_vb0, _va6, _sum6); // sum6 = (a00-a07) * k60 + _sum7 = _mm256_fmadd_ps(_vb0, _va7, _sum7); // sum7 = (a00-a07) * k70 va += 8; vb += 8; @@ -518,7 +517,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i output6[n] = sum6[n]; output7[n] = sum7[n]; } -#endif // __AVX__ +#endif // __AVX__ output0 += 8; output1 += 8; output2 += 8; @@ -553,10 +552,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i __m256 _va2 = _mm256_loadu_ps(va + 16); __m256 _va3 = _mm256_loadu_ps(va + 24); - _sum0 = _mm256_fmadd_ps(_va0, _vb0, _sum0); // sum0 += (k00-k70) * a00 - _sum1 = _mm256_fmadd_ps(_va1, _vb1, _sum1); // sum1 += (k01-k71) * a10 - _sum2 = _mm256_fmadd_ps(_va2, _vb2, _sum2); // sum2 += (k02-k72) * a20 - _sum3 = _mm256_fmadd_ps(_va3, _vb3, _sum3); // sum3 += (k03-k73) * a30 + _sum0 = _mm256_fmadd_ps(_va0, _vb0, _sum0); // sum0 += (k00-k70) * a00 + _sum1 = _mm256_fmadd_ps(_va1, _vb1, _sum1); // sum1 += (k01-k71) * a10 + _sum2 = _mm256_fmadd_ps(_va2, _vb2, _sum2); // sum2 += (k02-k72) * a20 + _sum3 = _mm256_fmadd_ps(_va3, _vb3, _sum3); // sum3 += (k03-k73) * a30 va += 32; vb += 4; @@ -572,7 +571,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i __m256 _vb0 = _mm256_broadcast_ss(vb); __m256 _va = _mm256_loadu_ps(va); - _sum0_7 = _mm256_fmadd_ps(_va, _vb0, _sum0_7); // sum0 += (k00-k70) * a00 + _sum0_7 = _mm256_fmadd_ps(_va, _vb0, _sum0_7); // sum0 += (k00-k70) * a00 va += 8; vb += 1; @@ -621,7 +620,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i output5[0] = sum5; output6[0] = sum6; output7[0] = sum7; -#endif // __AVX__ +#endif // __AVX__ output0++; output1++; output2++; @@ -639,7 +638,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i { int i = remain_outch_start + pp * 4; - float* output0 = pC + ( i )*N; + float* output0 = pC + (i)*N; float* output1 = pC + (i + 1) * N; float* output2 = pC + (i + 2) * N; float* output3 = pC + (i + 3) * N; @@ -667,10 +666,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i __m256 _vb1 = _mm256_loadu_ps(vb + 8); __m256 _vb2 = _mm256_loadu_ps(vb + 16); __m256 _vb3 = _mm256_loadu_ps(vb + 24); - _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00 - _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1); // sum1 = (a00-a07) * k10 - _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2); // sum2 = (a00-a07) * k20 - _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3); // sum3 = (a00-a07) * k30 + _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00 + _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1); // sum1 = (a00-a07) * k10 + _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2); // sum2 = (a00-a07) * k20 + _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3); // sum3 = (a00-a07) * k30 va += 4; @@ -679,10 +678,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i _va1 = _mm256_broadcast_ss(va + 1); _va2 = _mm256_broadcast_ss(va + 2); _va3 = _mm256_broadcast_ss(va + 3); - _sum0 = _mm256_fmadd_ps(_vb1, _va0, _sum0); // sum0 += (a10-a17) * k01 - _sum1 = _mm256_fmadd_ps(_vb1, _va1, _sum1); // sum1 += (a10-a17) * k11 - _sum2 = _mm256_fmadd_ps(_vb1, _va2, _sum2); // sum2 += (a10-a17) * k21 - _sum3 = _mm256_fmadd_ps(_vb1, _va3, _sum3); // sum3 += (a10-a17) * k31 + _sum0 = _mm256_fmadd_ps(_vb1, _va0, _sum0); // sum0 += (a10-a17) * k01 + _sum1 = _mm256_fmadd_ps(_vb1, _va1, _sum1); // sum1 += (a10-a17) * k11 + _sum2 = _mm256_fmadd_ps(_vb1, _va2, _sum2); // sum2 += (a10-a17) * k21 + _sum3 = _mm256_fmadd_ps(_vb1, _va3, _sum3); // sum3 += (a10-a17) * k31 va += 4; @@ -691,10 +690,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i _va1 = _mm256_broadcast_ss(va + 1); _va2 = _mm256_broadcast_ss(va + 2); _va3 = _mm256_broadcast_ss(va + 3); - _sum0 = _mm256_fmadd_ps(_vb2, _va0, _sum0); // sum0 += (a20-a27) * k02 - _sum1 = _mm256_fmadd_ps(_vb2, _va1, _sum1); // sum1 += (a20-a27) * k12 - _sum2 = _mm256_fmadd_ps(_vb2, _va2, _sum2); // sum2 += (a20-a27) * k22 - _sum3 = _mm256_fmadd_ps(_vb2, _va3, _sum3); // sum3 += (a20-a27) * k32 + _sum0 = _mm256_fmadd_ps(_vb2, _va0, _sum0); // sum0 += (a20-a27) * k02 + _sum1 = _mm256_fmadd_ps(_vb2, _va1, _sum1); // sum1 += (a20-a27) * k12 + _sum2 = _mm256_fmadd_ps(_vb2, _va2, _sum2); // sum2 += (a20-a27) * k22 + _sum3 = _mm256_fmadd_ps(_vb2, _va3, _sum3); // sum3 += (a20-a27) * k32 va += 4; @@ -703,10 +702,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i _va1 = _mm256_broadcast_ss(va + 1); _va2 = _mm256_broadcast_ss(va + 2); _va3 = _mm256_broadcast_ss(va + 3); - _sum0 = _mm256_fmadd_ps(_vb3, _va0, _sum0); // sum0 += (a30-a37) * k03 - _sum1 = _mm256_fmadd_ps(_vb3, _va1, _sum1); // sum1 += (a30-a37) * k13 - _sum2 = _mm256_fmadd_ps(_vb3, _va2, _sum2); // sum2 += (a30-a37) * k23 - _sum3 = _mm256_fmadd_ps(_vb3, _va3, _sum3); // sum3 += (a30-a37) * k33 + _sum0 = _mm256_fmadd_ps(_vb3, _va0, _sum0); // sum0 += (a30-a37) * k03 + _sum1 = _mm256_fmadd_ps(_vb3, _va1, _sum1); // sum1 += (a30-a37) * k13 + _sum2 = _mm256_fmadd_ps(_vb3, _va2, _sum2); // sum2 += (a30-a37) * k23 + _sum3 = _mm256_fmadd_ps(_vb3, _va3, _sum3); // sum3 += (a30-a37) * k33 va += 4; vb += 32; @@ -720,10 +719,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i __m256 _va2 = _mm256_broadcast_ss(va + 2); __m256 _va3 = _mm256_broadcast_ss(va + 3); __m256 _vb0 = _mm256_loadu_ps(vb); - _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00 - _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1); // sum1 = (a00-a07) * k10 - _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2); // sum2 = (a00-a07) * k20 - _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3); // sum3 = (a00-a07) * k30 + _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00 + _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1); // sum1 = (a00-a07) * k10 + _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2); // sum2 = (a00-a07) * k20 + _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3); // sum3 = (a00-a07) * k30 va += 4; vb += 8; @@ -760,7 +759,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i output2[n] = sum2[n]; output3[n] = sum3[n]; } -#endif // __AVX__ +#endif // __AVX__ output0 += 8; output1 += 8; output2 += 8; @@ -790,10 +789,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i __m128 _va2 = _mm_loadu_ps(va + 8); __m128 _va3 = _mm_loadu_ps(va + 12); - _sum0 = _mm_fmadd_ps(_va0, _vb0, _sum0); // sum0 += (k00-k30) * a00 - _sum1 = _mm_fmadd_ps(_va1, _vb1, _sum1); // sum1 += (k01-k31) * a10 - _sum2 = _mm_fmadd_ps(_va2, _vb2, _sum2); // sum2 += (k02-k32) * a20 - _sum3 = _mm_fmadd_ps(_va3, _vb3, _sum3); // sum3 += (k03-k33) * a30 + _sum0 = _mm_fmadd_ps(_va0, _vb0, _sum0); // sum0 += (k00-k30) * a00 + _sum1 = _mm_fmadd_ps(_va1, _vb1, _sum1); // sum1 += (k01-k31) * a10 + _sum2 = _mm_fmadd_ps(_va2, _vb2, _sum2); // sum2 += (k02-k32) * a20 + _sum3 = _mm_fmadd_ps(_va3, _vb3, _sum3); // sum3 += (k03-k33) * a30 va += 16; vb += 4; @@ -809,7 +808,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i __m128 _vb0 = _mm_set1_ps(vb[0]); __m128 _va = _mm_loadu_ps(va); - _sum0_3 = _mm_fmadd_ps(_va, _vb0, _sum0_3); // sum0 += (k00-k30) * a00 + _sum0_3 = _mm_fmadd_ps(_va, _vb0, _sum0_3); // sum0 += (k00-k30) * a00 va += 4; vb += 1; @@ -841,7 +840,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i output1[0] = sum1; output2[0] = sum2; output3[0] = sum3; -#endif // __AVX__ +#endif // __AVX__ output0++; output1++; output2++; @@ -877,10 +876,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i __m256 _vb2 = _mm256_loadu_ps(vb + 16); __m256 _vb3 = _mm256_loadu_ps(vb + 24); - _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00 - _sum0 = _mm256_fmadd_ps(_vb1, _va1, _sum0); // sum0 += (a10-a17) * k01 - _sum0 = _mm256_fmadd_ps(_vb2, _va2, _sum0); // sum0 += (a20-a27) * k02 - _sum0 = _mm256_fmadd_ps(_vb3, _va3, _sum0); // sum0 += (a30-a37) * k03 + _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00 + _sum0 = _mm256_fmadd_ps(_vb1, _va1, _sum0); // sum0 += (a10-a17) * k01 + _sum0 = _mm256_fmadd_ps(_vb2, _va2, _sum0); // sum0 += (a20-a27) * k02 + _sum0 = _mm256_fmadd_ps(_vb3, _va3, _sum0); // sum0 += (a30-a37) * k03 va += 4; vb += 32; @@ -892,7 +891,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i __m256 _va0 = _mm256_broadcast_ss(va); __m256 _vb0 = _mm256_loadu_ps(vb); - _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00 + _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00 va += 1; vb += 8; @@ -917,7 +916,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i { output[n] = sum[n]; } -#endif // __AVX__ +#endif // __AVX__ output += 8; } @@ -946,7 +945,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i #endif #else float sum0 = 0.f; -#endif // __AVX__ +#endif // __AVX__ for (; k < K; k++) { sum0 += va[0] * vb[0]; @@ -1019,7 +1018,7 @@ static void sgemm_i8(int M, int N, int K, int8_t* pA_t, int8_t* pB_t, int32_t* p { int i = pp * 8; - int32_t* output0 = pC + ( i )*N; + int32_t* output0 = pC + (i)*N; int32_t* output1 = pC + (i + 1) * N; int32_t* output2 = pC + (i + 2) * N; int32_t* output3 = pC + (i + 3) * N; @@ -1327,7 +1326,7 @@ static void sgemm_i8(int M, int N, int K, int8_t* pA_t, int8_t* pB_t, int32_t* p { int i = remain_outch_start + pp * 4; - int32_t* output0 = pC + ( i )*N; + int32_t* output0 = pC + (i)*N; int32_t* output1 = pC + (i + 1) * N; int32_t* output2 = pC + (i + 2) * N; int32_t* output3 = pC + (i + 3) * N; @@ -1641,13 +1640,13 @@ static void sgemm_fp32(struct tensor* input, struct tensor* filter, struct tenso int out_w = output->dims[3]; int out_image_size = output->dims[1] * output->dims[2] * output->dims[3]; - float* interleave_fp32 = ( float* )priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size; + float* interleave_fp32 = (float*)priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size; float* im2col_pack4_fp32 = (float*)priv_info->im2col_buffer_pack4; - float* output_fp32 = ( float* )output->data + n * out_image_size + outchan_g * group * out_h * out_w; + float* output_fp32 = (float*)output->data + n * out_image_size + outchan_g * group * out_h * out_w; float* bias_fp32 = NULL; if (bias) - bias_fp32 = ( float* )bias->data + outchan_g * group; + bias_fp32 = (float*)bias->data + outchan_g * group; float* filter_sgemm = interleave_fp32; float* input_sgemm_pack4 = im2col_pack4_fp32; @@ -1712,15 +1711,15 @@ static void sgemm_uint8(struct tensor* input, struct tensor* filter, struct tens int out_w = output->dims[3]; int out_image_size = output->dims[1] * output->dims[2] * output->dims[3]; - float* interleave_fp32 = ( float* )priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size; + float* interleave_fp32 = (float*)priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size; float* im2col_pack4_fp32 = (float*)priv_info->im2col_buffer_pack4; - uint8_t * output_uint8 = ( uint8_t* )output->data + n * out_image_size + outchan_g * group * out_h * out_w; + uint8_t* output_uint8 = (uint8_t*)output->data + n * out_image_size + outchan_g * group * out_h * out_w; int* bias_int32 = NULL; float bias_scale = 0.f; if (bias) { - bias_int32 = ( int* )bias->data + outchan_g * group; + bias_int32 = (int*)bias->data + outchan_g * group; bias_scale = input->scale * filter->scale; } @@ -1738,7 +1737,7 @@ static void sgemm_uint8(struct tensor* input, struct tensor* filter, struct tens for (int j = 0; j < out_h * out_w; j++) { int output_off = i * (out_h * out_w) + j; - output_sgemm[output_off] += (float )bias_int32[i] * bias_scale; + output_sgemm[output_off] += (float)bias_int32[i] * bias_scale; } } } @@ -1782,7 +1781,7 @@ static void sgemm_uint8(struct tensor* input, struct tensor* filter, struct tens { int output_off = i * (out_h * out_w) + j; - int udata = ( int )(round(output_sgemm[output_off] / output->scale) + output->zero_point); + int udata = (int)(round(output_sgemm[output_off] / output->scale) + output->zero_point); if (udata > 255) udata = 255; else if (udata < 0) @@ -1795,8 +1794,8 @@ static void sgemm_uint8(struct tensor* input, struct tensor* filter, struct tens } static void sgemm_int8(struct tensor* input, struct tensor* filter, struct tensor* bias, - struct tensor* output, struct conv_priv_info* priv_info, struct conv_param* param, int n, - int group, int num_thread) + struct tensor* output, struct conv_priv_info* priv_info, struct conv_param* param, int n, + int group, int num_thread) { int kernel_size = param->kernel_h * param->kernel_w * param->input_channel / param->group; int outchan_g = param->output_channel / param->group; @@ -1805,13 +1804,13 @@ static void sgemm_int8(struct tensor* input, struct tensor* filter, struct tenso int out_w = output->dims[3]; int out_image_size = output->dims[1] * output->dims[2] * output->dims[3]; - int8_t* interleave_int8 = ( int8_t* )priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size; + int8_t* interleave_int8 = (int8_t*)priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size; int8_t* im2col_pack4_int8 = (int8_t*)priv_info->im2col_buffer_pack4; - int8_t * output_int8 = ( int8_t* )output->data + n * out_image_size + outchan_g * group * out_h * out_w; - int32_t * bias_int32 = NULL; + int8_t* output_int8 = (int8_t*)output->data + n * out_image_size + outchan_g * group * out_h * out_w; + int32_t* bias_int32 = NULL; if (bias) - bias_int32 = ( int* )bias->data + outchan_g * group; + bias_int32 = (int*)bias->data + outchan_g * group; float input_scale = input->scale; float* kernel_scales = filter->scale_list; @@ -1832,9 +1831,9 @@ static void sgemm_int8(struct tensor* input, struct tensor* filter, struct tenso { int output_off = i * (out_h * out_w) + j; if (bias) - output_sgemm_fp32[output_off] = (float )(output_sgemm_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i]; + output_sgemm_fp32[output_off] = (float)(output_sgemm_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i]; else - output_sgemm_fp32[output_off] = (float )output_sgemm_int32[output_off] * input_scale * kernel_scales[i]; + output_sgemm_fp32[output_off] = (float)output_sgemm_int32[output_off] * input_scale * kernel_scales[i]; } } @@ -1880,7 +1879,7 @@ static void sgemm_int8(struct tensor* input, struct tensor* filter, struct tenso { int output_off = i * (out_h * out_w) + j; - int32_t data_i32 = ( int32_t )(round(output_sgemm_fp32[output_off] / output_scale)); + int32_t data_i32 = (int32_t)(round(output_sgemm_fp32[output_off] / output_scale)); if (data_i32 > 127) data_i32 = 127; else if (data_i32 < -127) @@ -1909,8 +1908,7 @@ static int winograd_support(struct conv_param* param, int in_h, int in_w) if (in_h <= 10 && in_w <= 10) return 0; - if (group != 1 || kernel_h != 3 || kernel_w != 3 || stride_h != 1 || stride_w != 1 || dilation_h != 1 || - dilation_w != 1 || input_chan < 16 || output_chan < 16 || output_chan % 16) + if (group != 1 || kernel_h != 3 || kernel_w != 3 || stride_h != 1 || stride_w != 1 || dilation_h != 1 || dilation_w != 1 || input_chan < 16 || output_chan < 16 || output_chan % 16) return 0; return 1; @@ -1958,8 +1956,8 @@ int conv_hcl_get_interleave_pack4_size(int M, int K, struct tensor* filter) void conv_hcl_interleave_pack4_fp32(int M, int K, struct conv_priv_info* priv_info) { - float* pA = ( float* )priv_info->interleave_buffer; - float* pA_t = ( float* )priv_info->interleave_buffer_pack4; + float* pA = (float*)priv_info->interleave_buffer; + float* pA_t = (float*)priv_info->interleave_buffer_pack4; int nn_outch = M >> 3; int remain_outch_start = nn_outch << 3; @@ -2048,8 +2046,8 @@ void conv_hcl_interleave_pack4_fp32(int M, int K, struct conv_priv_info* priv_in void conv_hcl_interleave_pack4_int8(int M, int K, struct conv_priv_info* priv_info) { - int8_t* pA = ( int8_t * )priv_info->interleave_buffer; - int8_t* pA_t = ( int8_t* )priv_info->interleave_buffer_pack4; + int8_t* pA = (int8_t*)priv_info->interleave_buffer; + int8_t* pA_t = (int8_t*)priv_info->interleave_buffer_pack4; int nn_outch = M >> 3; int remain_outch_start = nn_outch << 3; @@ -2217,8 +2215,7 @@ int conv_hcl_postrun(struct conv_priv_info* priv_info) return wino_conv_hcl_postrun(priv_info); } - if (priv_info->external_interleave_pack4_mem && !priv_info->external_interleave_mem && - priv_info->interleave_buffer != NULL) + if (priv_info->external_interleave_pack4_mem && !priv_info->external_interleave_mem && priv_info->interleave_buffer != NULL) { sys_free(priv_info->interleave_buffer_pack4); priv_info->interleave_buffer_pack4 = NULL; @@ -2256,7 +2253,7 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru cpu_affinity); } - for (int i = 0; i < input_tensor->dims[0]; i++) // batch size + for (int i = 0; i < input_tensor->dims[0]; i++) // batch size { for (int j = 0; j < group; j++) { diff --git a/source/device/cpu/op/conv/x86/conv_kernel_x86.h b/source/device/cpu/op/conv/x86/conv_kernel_x86.h index 03237b896..8be2524c9 100644 --- a/source/device/cpu/op/conv/x86/conv_kernel_x86.h +++ b/source/device/cpu/op/conv/x86/conv_kernel_x86.h @@ -31,7 +31,6 @@ #include "graph/node.h" #include "graph/graph.h" - /* float32 */ int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param); diff --git a/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.c b/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.c index e6355de2c..01c1169a6 100644 --- a/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.c +++ b/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.c @@ -39,7 +39,7 @@ #include #include -#define TILE 4 +#define TILE 4 #define ELEM_SIZE ((TILE + 2) * (TILE + 2)) #define WINO_MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -49,11 +49,11 @@ static void relu(float* data, int size, int activation) { for (int i = 0; i < size; i++) { - data[i] = WINO_MAX(data[i], ( float )0); + data[i] = WINO_MAX(data[i], (float)0); if (activation > 0) { - data[i] = WINO_MIN(data[i], ( float )activation); + data[i] = WINO_MIN(data[i], (float)activation); } } } @@ -62,7 +62,7 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param) int output_c = filter->dims[0]; int input_c = filter->dims[1]; int trans_ker_size = (unsigned long)output_c * input_c * ELEM_SIZE * sizeof(float); - return trans_ker_size + 128; // caution + return trans_ker_size + 128; // caution } static void pad_0_align_2D(float* dst, float* src, int m, int n, int m_align, int n_align, int pad_h, int pad_w) @@ -144,7 +144,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel int w_tm = outw_align / 4 * 6; int h_tm = outh_align / 4 * 6; - int nColBlocks = h_tm / 6; // may be the block num in Feathercnn + int nColBlocks = h_tm / 6; // may be the block num in Feathercnn int nRowBlocks = w_tm / 6; const int tiles = nColBlocks * nRowBlocks; @@ -527,7 +527,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel out_tm8[2] = d5[4]; out_tm8[3] = d5[5]; } -#endif // __AVX__ +#endif // __AVX__ r0 += 4; r1 += 4; r2 += 4; @@ -545,7 +545,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel int w_tm = outw_align / 4 * 6; int h_tm = outh_align / 4 * 6; - int nColBlocks = h_tm / 6; // may be the block num in Feathercnn + int nColBlocks = h_tm / 6; // may be the block num in Feathercnn int nRowBlocks = w_tm / 6; const int tiles = nColBlocks * nRowBlocks; @@ -815,7 +815,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel output6_tm[n] = sum6[n]; output7_tm[n] = sum7[n]; } -#endif // __AVX__ +#endif // __AVX__ output0_tm += 36; output1_tm += 36; output2_tm += 36; @@ -911,7 +911,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel output2_tm[n] = sum2[n]; output3_tm[n] = sum3[n]; } -#endif // __AVX__ +#endif // __AVX__ output0_tm += 36; output1_tm += 36; output2_tm += 36; @@ -929,8 +929,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel for (int i = 0; i < tiles; i++) { - const float* kptr = - kernel_tm_test + 4 * r * inch * outch + (p / 8 + (p % 8) / 4 + p % 4) * inch * 4; + const float* kptr = kernel_tm_test + 4 * r * inch * outch + (p / 8 + (p % 8) / 4 + p % 4) * inch * 4; const float* r0 = bottom_blob_tm + 4 * inch * (tiles * r + i); #if __AVX__ || __SSE__ #if __AVX__ @@ -970,7 +969,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel { output0_tm[n] = sum0[n]; } -#endif // __AVX__ || __SSE__ +#endif // __AVX__ || __SSE__ output0_tm += 36; } } @@ -1005,7 +1004,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel int w_tm = outw_align / 4 * 6; int h_tm = outh_align / 4 * 6; - int nColBlocks = h_tm / 6; // may be the block num in Feathercnn + int nColBlocks = h_tm / 6; // may be the block num in Feathercnn int nRowBlocks = w_tm / 6; const int tiles = nColBlocks * nRowBlocks; @@ -1118,12 +1117,11 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kernel_wino, int inch, int outch) { - float* kernel_tm = ( float* )sys_malloc((unsigned long)6 * 6 * inch * outch * sizeof(float)); + float* kernel_tm = (float*)sys_malloc((unsigned long)6 * 6 * inch * outch * sizeof(float)); // G const float ktm[6][3] = { - {1.0f / 4, 0.0f, 0.0f}, {-1.0f / 6, -1.0f / 6, -1.0f / 6}, {-1.0f / 6, 1.0f / 6, -1.0f / 6}, - {1.0f / 24, 1.0f / 12, 1.0f / 6}, {1.0f / 24, -1.0f / 12, 1.0f / 6}, {0.0f, 0.0f, 1.0f}}; + {1.0f / 4, 0.0f, 0.0f}, {-1.0f / 6, -1.0f / 6, -1.0f / 6}, {-1.0f / 6, 1.0f / 6, -1.0f / 6}, {1.0f / 24, 1.0f / 12, 1.0f / 6}, {1.0f / 24, -1.0f / 12, 1.0f / 6}, {0.0f, 0.0f, 1.0f}}; #pragma omp parallel for for (int p = 0; p < outch; p++) @@ -1166,14 +1164,14 @@ void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kerne int p = 0; for (; p + 7 < outch; p += 8) { - const float* kernel0 = ( const float* )kernel_tm + p * inch * 36; - const float* kernel1 = ( const float* )kernel_tm + (p + 1) * inch * 36; - const float* kernel2 = ( const float* )kernel_tm + (p + 2) * inch * 36; - const float* kernel3 = ( const float* )kernel_tm + (p + 3) * inch * 36; - const float* kernel4 = ( const float* )kernel_tm + (p + 4) * inch * 36; - const float* kernel5 = ( const float* )kernel_tm + (p + 5) * inch * 36; - const float* kernel6 = ( const float* )kernel_tm + (p + 6) * inch * 36; - const float* kernel7 = ( const float* )kernel_tm + (p + 7) * inch * 36; + const float* kernel0 = (const float*)kernel_tm + p * inch * 36; + const float* kernel1 = (const float*)kernel_tm + (p + 1) * inch * 36; + const float* kernel2 = (const float*)kernel_tm + (p + 2) * inch * 36; + const float* kernel3 = (const float*)kernel_tm + (p + 3) * inch * 36; + const float* kernel4 = (const float*)kernel_tm + (p + 4) * inch * 36; + const float* kernel5 = (const float*)kernel_tm + (p + 5) * inch * 36; + const float* kernel6 = (const float*)kernel_tm + (p + 6) * inch * 36; + const float* kernel7 = (const float*)kernel_tm + (p + 7) * inch * 36; float* ktmp = kernel_tm_test + p / 8 * inch * 32; @@ -1233,10 +1231,10 @@ void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kerne for (; p + 3 < outch; p += 4) { - const float* kernel0 = ( const float* )kernel_tm + p * inch * 36; - const float* kernel1 = ( const float* )kernel_tm + (p + 1) * inch * 36; - const float* kernel2 = ( const float* )kernel_tm + (p + 2) * inch * 36; - const float* kernel3 = ( const float* )kernel_tm + (p + 3) * inch * 36; + const float* kernel0 = (const float*)kernel_tm + p * inch * 36; + const float* kernel1 = (const float*)kernel_tm + (p + 1) * inch * 36; + const float* kernel2 = (const float*)kernel_tm + (p + 2) * inch * 36; + const float* kernel3 = (const float*)kernel_tm + (p + 3) * inch * 36; float* ktmp = kernel_tm_test + (p / 8 + (p % 8) / 4) * inch * 16; for (int q = 0; q < inch; q++) @@ -1271,7 +1269,7 @@ void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kerne for (; p < outch; p++) { - const float* kernel0 = ( const float* )kernel_tm + p * inch * 36; + const float* kernel0 = (const float*)kernel_tm + p * inch * 36; float* ktmp = kernel_tm_test + (p / 8 + (p % 8) / 4 + p % 4) * inch * 4; for (int q = 0; q < inch; q++) @@ -1305,7 +1303,7 @@ int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens int pad_h = param->pad_h0; int pad_w = param->pad_w0; - float* kernel = ( float* )filter_tensor->data; + float* kernel = (float*)filter_tensor->data; if (!priv_info->external_interleave_mem) { @@ -1325,17 +1323,17 @@ int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens int outw = block_w * TILE; int outh = block_h * TILE; - priv_info->input_pad = ( float* )sys_malloc((unsigned long)batch * input_c * pad_inhw * sizeof(float)); + priv_info->input_pad = (float*)sys_malloc((unsigned long)batch * input_c * pad_inhw * sizeof(float)); memset(priv_info->input_pad, 0, (unsigned long)batch * input_c * pad_inhw * sizeof(float)); - priv_info->dot_block = ( float* )sys_malloc(ELEM_SIZE * (unsigned long)block * output_c * sizeof(float)); - priv_info->transform_input = ( float* )sys_malloc(ELEM_SIZE * (unsigned long)block * input_c * sizeof(float)); + priv_info->dot_block = (float*)sys_malloc(ELEM_SIZE * (unsigned long)block * output_c * sizeof(float)); + priv_info->transform_input = (float*)sys_malloc(ELEM_SIZE * (unsigned long)block * input_c * sizeof(float)); priv_info->output_bordered = NULL; if (outw != output_w || outh != output_h) { - priv_info->output_bordered = ( float* )sys_malloc((unsigned long)outw * outh * output_c * sizeof(float)); + priv_info->output_bordered = (float*)sys_malloc((unsigned long)outw * outh * output_c * sizeof(float)); } - conv3x3s1_winograd43_transform_kernel_sse(kernel, ( float* )priv_info->interleave_buffer, input_c, output_c); + conv3x3s1_winograd43_transform_kernel_sse(kernel, (float*)priv_info->interleave_buffer, input_c, output_c); return 0; } @@ -1416,11 +1414,11 @@ int wino_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, int padded_in_hw = padded_in_h * padded_in_w; /* buffer addr */ - float* input = ( float* )input_tensor->data; - float* output = ( float* )output_tensor->data; + float* input = (float*)input_tensor->data; + float* output = (float*)output_tensor->data; float* biases = NULL; if (bias_tensor != NULL) - biases = ( float* )bias_tensor->data; + biases = (float*)bias_tensor->data; for (int i = 0; i < batch; i++) { @@ -1429,9 +1427,9 @@ int wino_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, pad_0_align_3D((float*)priv_info->input_pad + i * in_c * padded_in_h * padded_in_w, input + i * in_c * in_h * in_w, in_h, in_w, padded_in_h, padded_in_w, in_c, pad_h0, pad_w0); conv3x3s1_winograd43_sse((float*)priv_info->input_pad + i * in_c * padded_in_h * padded_in_w + g * input_size_g, - output + i * out_c * out_h * out_w, (float*)priv_info->interleave_buffer, (float*)priv_info->dot_block, - (float*)priv_info->transform_input, (float*)priv_info->output_bordered, - biases, padded_in_w, padded_in_h, in_c, out_w, out_h, out_c, num_thread); + output + i * out_c * out_h * out_w, (float*)priv_info->interleave_buffer, (float*)priv_info->dot_block, + (float*)priv_info->transform_input, (float*)priv_info->output_bordered, + biases, padded_in_w, padded_in_h, in_c, out_w, out_h, out_c, num_thread); } } if (act_type >= 0) diff --git a/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.h b/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.h index 3cae478fb..2f3201f44 100644 --- a/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.h +++ b/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.h @@ -38,7 +38,6 @@ #include #endif - int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param); diff --git a/source/device/cpu/op/crop/crop_ref.c b/source/device/cpu/op/crop/crop_ref.c index 2d89b45a6..f59650a39 100644 --- a/source/device/cpu/op/crop/crop_ref.c +++ b/source/device/cpu/op/crop/crop_ref.c @@ -36,7 +36,6 @@ #include - static int ref_crop_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct crop_param* param, int num_thread) { @@ -71,8 +70,7 @@ static int ref_crop_fp32(struct tensor* input_tensor, struct tensor* output_tens for (int w = 0; w < oDataW; w++) { int i_w = w + offsetW; - output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = - input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w]; + output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w]; } } } @@ -93,8 +91,7 @@ static int ref_crop_fp32(struct tensor* input_tensor, struct tensor* output_tens for (int w = 0; w < oDataW; w++) { int i_w = w + param->offset_w; - output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = - input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w]; + output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w]; } } } @@ -118,8 +115,7 @@ static int ref_crop_fp32(struct tensor* input_tensor, struct tensor* output_tens for (int w = 0; w < oDataW; w++) { int i_w = param->offset_w + w; - output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = - input[n * iDataC * iDataH * iDataW + i_c * iDataH * iDataW + i_h * iDataW + i_w]; + output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + i_c * iDataH * iDataW + i_h * iDataW + i_w]; } } } @@ -137,8 +133,7 @@ static int ref_crop_fp32(struct tensor* input_tensor, struct tensor* output_tens for (int w = 0; w < oDataW; w++) { int i_w = param->offset_w + w; - output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = - input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w]; + output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w]; } } } @@ -150,7 +145,7 @@ static int ref_crop_fp32(struct tensor* input_tensor, struct tensor* output_tens } static int ref_crop_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct crop_param* param, - int num_thread) + int num_thread) { uint8_t* input = (uint8_t*)input_tensor->data; uint8_t* output = (uint8_t*)output_tensor->data; @@ -183,8 +178,7 @@ static int ref_crop_uint8(struct tensor* input_tensor, struct tensor* output_ten for (int w = 0; w < oDataW; w++) { int i_w = w + offsetW; - output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = - input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w]; + output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w]; } } } @@ -205,8 +199,7 @@ static int ref_crop_uint8(struct tensor* input_tensor, struct tensor* output_ten for (int w = 0; w < oDataW; w++) { int i_w = w + param->offset_w; - output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = - input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w]; + output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w]; } } } @@ -230,8 +223,7 @@ static int ref_crop_uint8(struct tensor* input_tensor, struct tensor* output_ten for (int w = 0; w < oDataW; w++) { int i_w = param->offset_w + w; - output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = - input[n * iDataC * iDataH * iDataW + i_c * iDataH * iDataW + i_h * iDataW + i_w]; + output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + i_c * iDataH * iDataW + i_h * iDataW + i_w]; } } } @@ -249,8 +241,7 @@ static int ref_crop_uint8(struct tensor* input_tensor, struct tensor* output_ten for (int w = 0; w < oDataW; w++) { int i_w = param->offset_w + w; - output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = - input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w]; + output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w]; } } } @@ -278,11 +269,11 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct crop_param* crop_param = ( struct crop_param* )ir_node->op.param_mem; + struct crop_param* crop_param = (struct crop_param*)ir_node->op.param_mem; if (input_tensor->data_type == TENGINE_DT_FP32) ref_crop_fp32(input_tensor, output_tensor, crop_param, exec_graph->num_thread); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ref_crop_uint8(input_tensor, output_tensor, crop_param, exec_graph->num_thread); return 0; diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c b/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c index 1daba216a..360f061ea 100644 --- a/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c +++ b/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c @@ -36,7 +36,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -54,10 +53,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct deconv_param* deconv_param = ( struct deconv_param* )ir_node->op.param_mem; + struct deconv_param* deconv_param = (struct deconv_param*)ir_node->op.param_mem; - if (deconv_dw_run(input_tensor, weight_tensor, bias_tensor, output_tensor, deconv_param, num_thread, cpu_affinity) < - 0) + if (deconv_dw_run(input_tensor, weight_tensor, bias_tensor, output_tensor, deconv_param, num_thread, cpu_affinity) < 0) { TLOG_ERR("hcl conv run failed\n"); // set_tengine_errno(EFAULT); @@ -79,7 +77,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) { - struct deconv_param* param = ( struct deconv_param* )exec_node->op.param_mem; + struct deconv_param* param = (struct deconv_param*)exec_node->op.param_mem; struct node* ir_node = exec_node; struct graph* ir_graph = ir_node->graph; @@ -117,8 +115,7 @@ static struct node_ops hcl_node_ops = {.prerun = NULL, .postrun = NULL, .init_node = init_node, .release_node = release_node, - .score = score -}; + .score = score}; int register_deconv_dw_hcl_arm_op() { diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.c b/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.c index e5e50f111..18d07bc5b 100644 --- a/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.c +++ b/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.c @@ -37,7 +37,6 @@ #else #endif - inline static float do_activation(float input, int activation) { if (activation == 0) @@ -185,45 +184,45 @@ inline static void deconv_dw_genreal_3x3s2(const float* input, const float* kern float32x4_t input_4 = vld1q_f32(cur_input); // out row 0 - float32x4_t out_00 = vmulq_lane_f32(input_4, vget_low_f32(_k0), 0); // 0,2,4,6 - float32x4_t out_01 = vmulq_lane_f32(input_4, vget_low_f32(_k0), 1); // 1,3,5,7 - float32x4_t out_02 = vmulq_lane_f32(input_4, vget_high_f32(_k0), 0); // 2,4,6,8 + float32x4_t out_00 = vmulq_lane_f32(input_4, vget_low_f32(_k0), 0); // 0,2,4,6 + float32x4_t out_01 = vmulq_lane_f32(input_4, vget_low_f32(_k0), 1); // 1,3,5,7 + float32x4_t out_02 = vmulq_lane_f32(input_4, vget_high_f32(_k0), 0); // 2,4,6,8 float32x4x2_t out_0 = vld2q_f32(cur_out0); - out_0.val[0] = vaddq_f32(out_0.val[0], out_00); // 0,2,4,6 - out_0.val[1] = vaddq_f32(out_0.val[1], out_01); // 1,3,5,7 + out_0.val[0] = vaddq_f32(out_0.val[0], out_00); // 0,2,4,6 + out_0.val[1] = vaddq_f32(out_0.val[1], out_01); // 1,3,5,7 vst2q_f32(cur_out0, out_0); out_0 = vld2q_f32(cur_out0 + 2); - out_0.val[0] = vaddq_f32(out_0.val[0], out_02); // 2,4,6,8 + out_0.val[0] = vaddq_f32(out_0.val[0], out_02); // 2,4,6,8 vst2q_f32(cur_out0 + 2, out_0); // out row 1 - float32x4_t out_10 = vmulq_lane_f32(input_4, vget_low_f32(_k1), 0); // 0,2,4,6 - float32x4_t out_11 = vmulq_lane_f32(input_4, vget_low_f32(_k1), 1); // 1,3,5,7 - float32x4_t out_12 = vmulq_lane_f32(input_4, vget_high_f32(_k1), 0); // 2,4,6,8 + float32x4_t out_10 = vmulq_lane_f32(input_4, vget_low_f32(_k1), 0); // 0,2,4,6 + float32x4_t out_11 = vmulq_lane_f32(input_4, vget_low_f32(_k1), 1); // 1,3,5,7 + float32x4_t out_12 = vmulq_lane_f32(input_4, vget_high_f32(_k1), 0); // 2,4,6,8 float32x4x2_t out_1 = vld2q_f32(cur_out1); - out_1.val[0] = vaddq_f32(out_1.val[0], out_10); // 0,2,4,6 - out_1.val[1] = vaddq_f32(out_1.val[1], out_11); // 1,3,5,7 + out_1.val[0] = vaddq_f32(out_1.val[0], out_10); // 0,2,4,6 + out_1.val[1] = vaddq_f32(out_1.val[1], out_11); // 1,3,5,7 vst2q_f32(cur_out1, out_1); out_1 = vld2q_f32(cur_out1 + 2); - out_1.val[0] = vaddq_f32(out_1.val[0], out_12); // 2,4,6,8 + out_1.val[0] = vaddq_f32(out_1.val[0], out_12); // 2,4,6,8 vst2q_f32(cur_out1 + 2, out_1); // out row 2 - float32x4_t out_20 = vmulq_lane_f32(input_4, vget_low_f32(_k2), 0); // 0,2,4,6 - float32x4_t out_21 = vmulq_lane_f32(input_4, vget_low_f32(_k2), 1); // 1,3,5,7 - float32x4_t out_22 = vmulq_lane_f32(input_4, vget_high_f32(_k2), 0); // 2,4,6,8 + float32x4_t out_20 = vmulq_lane_f32(input_4, vget_low_f32(_k2), 0); // 0,2,4,6 + float32x4_t out_21 = vmulq_lane_f32(input_4, vget_low_f32(_k2), 1); // 1,3,5,7 + float32x4_t out_22 = vmulq_lane_f32(input_4, vget_high_f32(_k2), 0); // 2,4,6,8 float32x4x2_t out_2 = vld2q_f32(cur_out2); - out_2.val[0] = vaddq_f32(out_2.val[0], out_20); // 0,2,4,6 - out_2.val[1] = vaddq_f32(out_2.val[1], out_21); // 1,3,5,7 + out_2.val[0] = vaddq_f32(out_2.val[0], out_20); // 0,2,4,6 + out_2.val[1] = vaddq_f32(out_2.val[1], out_21); // 1,3,5,7 vst2q_f32(cur_out2, out_2); out_2 = vld2q_f32(cur_out2 + 2); - out_2.val[0] = vaddq_f32(out_2.val[0], out_22); // 2,4,6,8 + out_2.val[0] = vaddq_f32(out_2.val[0], out_22); // 2,4,6,8 vst2q_f32(cur_out2 + 2, out_2); cur_input += 4; @@ -472,12 +471,12 @@ int deconv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, str int output_size = out_c * out_h * out_w; int out_c_align = ((out_c + 3) & -4); /* buffer addr */ - float* input_buf = ( float* )input_tensor->data; - float* kernel_buf = ( float* )filter_tensor->data; - float* output_buf = ( float* )output_tensor->data; - float* biases_buf = ( float* )bias_tensor->data; + float* input_buf = (float*)input_tensor->data; + float* kernel_buf = (float*)filter_tensor->data; + float* output_buf = (float*)output_tensor->data; + float* biases_buf = (float*)bias_tensor->data; - for (int n = 0; n < batch; n++) // batch size + for (int n = 0; n < batch; n++) // batch size { float* cur_input = input_buf + n * input_size * group; float* cur_output = output_buf + n * output_size * group; @@ -510,7 +509,7 @@ int deconv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, str { int out_h_pad = out_h + pads[0] * 2; int out_w_pad = out_w + pads[1] * 2; - float* output_buf = ( float* )malloc(sizeof(float) * group * out_h_pad * out_w_pad + 128); + float* output_buf = (float*)malloc(sizeof(float) * group * out_h_pad * out_w_pad + 128); if (stride_h == 1 && kernel_h == 4) { diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.h b/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.h index 091f7a1d8..93576a691 100644 --- a/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.h +++ b/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.h @@ -28,13 +28,12 @@ #include "graph/tensor.h" - -int deconv_dw_run(struct tensor* input_tensor , \ - struct tensor* filter_tensor ,\ - struct tensor* bias_tensor , \ - struct tensor* output_tensor , \ - struct deconv_param* param, \ - int num_thread, \ - int cpu_affinity) ; +int deconv_dw_run(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* bias_tensor, + struct tensor* output_tensor, + struct deconv_param* param, + int num_thread, + int cpu_affinity); #endif diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c b/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c index fa5883320..a81fa1e8c 100644 --- a/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c +++ b/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c @@ -36,7 +36,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -45,13 +44,13 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* filter_tensor; struct tensor* output_tensor; - struct deconv_priv_info* deconv_priv_info = ( struct deconv_priv_info* )exec_node->ops_priv; + struct deconv_priv_info* deconv_priv_info = (struct deconv_priv_info*)exec_node->ops_priv; input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct deconv_param* deconv_param = ( struct deconv_param* )ir_node->op.param_mem; + struct deconv_param* deconv_param = (struct deconv_param*)ir_node->op.param_mem; /* prerun now */ if (deconv_hcl_prerun(input_tensor, filter_tensor, output_tensor, deconv_priv_info, deconv_param) < 0) @@ -81,11 +80,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct deconv_param* deconv_param = ( struct deconv_param* )ir_node->op.param_mem; - struct deconv_priv_info* deconv_priv_info = ( struct deconv_priv_info* )exec_node->ops_priv; + struct deconv_param* deconv_param = (struct deconv_param*)ir_node->op.param_mem; + struct deconv_priv_info* deconv_priv_info = (struct deconv_priv_info*)exec_node->ops_priv; if (deconv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, deconv_priv_info, deconv_param, - num_thread, cpu_affinity) < 0) + num_thread, cpu_affinity) + < 0) { TLOG_ERR("hcl deconv run failed\n"); // set_tengine_errno(EFAULT); @@ -102,7 +102,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct deconv_priv_info* deconv_priv_info = ( struct deconv_priv_info* )exec_node->ops_priv; + struct deconv_priv_info* deconv_priv_info = (struct deconv_priv_info*)exec_node->ops_priv; if (deconv_hcl_postrun(deconv_priv_info) < 0) { @@ -123,8 +123,8 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct deconv_param* deconv_param = ( struct deconv_param* )ir_node->op.param_mem; - struct deconv_priv_info* deconv_priv_info = ( struct deconv_priv_info* )sys_malloc(sizeof(struct deconv_priv_info)); + struct deconv_param* deconv_param = (struct deconv_param*)ir_node->op.param_mem; + struct deconv_priv_info* deconv_priv_info = (struct deconv_priv_info*)sys_malloc(sizeof(struct deconv_priv_info)); if (deconv_priv_info == NULL) { @@ -140,7 +140,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct deconv_priv_info* deconv_priv_info = ( struct deconv_priv_info* )exec_node->ops_priv; + struct deconv_priv_info* deconv_priv_info = (struct deconv_priv_info*)exec_node->ops_priv; sys_free(deconv_priv_info); exec_node->ops_priv = NULL; return 0; @@ -157,8 +157,7 @@ static struct node_ops hcl_node_ops = {.prerun = prerun, .postrun = postrun, .init_node = init_node, .release_node = release_node, - .score = score -}; + .score = score}; int register_deconv_hcl_arm_op() { diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.c b/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.c index efb532a9f..e69ae1b46 100644 --- a/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.c +++ b/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.c @@ -28,7 +28,6 @@ #include #include - #ifdef __aarch64__ #define PER_OUT_CHAN 16 void sgemm_4x16_deconv_a72(float* input, float* kernel, long kernel_size, float* output, long weight_size); @@ -57,37 +56,37 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern float* cur_kernel_interleaved = kernel_interleaved; // interleave PER_OUT_CHAN kernels - for(i = 0; i + PER_OUT_CHAN - 1 < kernel_size; i += PER_OUT_CHAN) + for (i = 0; i + PER_OUT_CHAN - 1 < kernel_size; i += PER_OUT_CHAN) { - for(j = 0; j < kernel_chan; j++) + for (j = 0; j < kernel_chan; j++) { - for(k = 0; k < PER_OUT_CHAN; k++) + for (k = 0; k < PER_OUT_CHAN; k++) *(cur_kernel_interleaved++) = kernel[j * kernel_size + i + k]; } } - for(; i < (kernel_size & -4); i += 4) + for (; i < (kernel_size & -4); i += 4) { - for(j = 0; j < kernel_chan; j++) + for (j = 0; j < kernel_chan; j++) { - for(k = 0; k < 4; k++) + for (k = 0; k < 4; k++) *(cur_kernel_interleaved++) = kernel[j * kernel_size + i + k]; } } // last 4 kernel int kernel_size3 = kernel_chan & 0x3; - if(kernel_size3) + if (kernel_size3) { - for(j = 0; j < kernel_chan; j++) + for (j = 0; j < kernel_chan; j++) { - for(k = 0; k < kernel_size3; k++) + for (k = 0; k < kernel_size3; k++) *(cur_kernel_interleaved++) = kernel[j * kernel_size + i + k]; - for(; k < 4; k++) - *(cur_kernel_interleaved++) = 0.0; + for (; k < 4; k++) + *(cur_kernel_interleaved++) = 0.0; } } } -static void interleave(struct tensor * filter, struct deconv_priv_info* priv_info, struct deconv_param* param) +static void interleave(struct tensor* filter, struct deconv_priv_info* priv_info, struct deconv_param* param) { int group = param->group; int out_chan = filter->dims[0] / group; @@ -98,7 +97,7 @@ static void interleave(struct tensor * filter, struct deconv_priv_info* priv_in float* kernel = filter->data; float* interleave_buf = priv_info->interleave_buffer; - for(int g = 0; g < group; g++) + for (int g = 0; g < group; g++) { float* cur_kernel = kernel + g * kernel_size * in_chan; float* cur_interleave = interleave_buf + g * kernel_size_algin; @@ -113,26 +112,26 @@ static void transpose_input(float* input, float* inputT, int input_w, int input_ float* cur_input = inputT; - for(i = 0; i < (input_w & -4); i += 4) - for(j = 0; j < input_h; j++) - for(k = 0; k < 4; k++) + for (i = 0; i < (input_w & -4); i += 4) + for (j = 0; j < input_h; j++) + for (k = 0; k < 4; k++) *cur_input++ = *(input + j * input_w + i + k); - if(input_w3) + if (input_w3) { - for(j = 0; j < input_h; j++) + for (j = 0; j < input_h; j++) { - for(k = 0; k < input_w3; k++) + for (k = 0; k < input_w3; k++) *cur_input++ = *(input + j * input_w + i + k); - for(; k < 4; k++) + for (; k < 4; k++) *cur_input++ = 0; } } } static void col2im(float* col, float* im, float* bias, int output_ch, int output_x, int output_y, - int kernel_x, int kernel_y, int stride_x, int stride_y, int dilation_x, int dilation_y, int pad_x, - int pad_y, int input_x, int input_y) + int kernel_x, int kernel_y, int stride_x, int stride_y, int dilation_x, int dilation_y, int pad_x, + int pad_y, int input_x, int input_y) { float* cur_col; int imx_start, imy_start, ix, iy, kch, kx, ky, imx, imy; @@ -143,49 +142,49 @@ static void col2im(float* col, float* im, float* bias, int output_ch, int output int is_4x4 = (kernel_x == 4 && kernel_y == 4 && is_nodilation); int is_8x8 = (kernel_x == 8 && kernel_y == 8 && is_nodilation); /* init bias */ - if(bias == NULL) + if (bias == NULL) { - for(int i = 0; i < (output_xy * output_ch); i++) + for (int i = 0; i < (output_xy * output_ch); i++) im[i] = 0; } else { float* cur_im = im; - for(int i = 0; i < output_ch; i++) - for(int j = 0; j < output_xy; j++) + for (int i = 0; i < output_ch; i++) + for (int j = 0; j < output_xy; j++) *cur_im++ = bias[i]; } - if(is_4x4) + if (is_4x4) { - for(iy = 0; iy < input_y; iy++) + for (iy = 0; iy < input_y; iy++) { imy_start = iy * stride_y - pad_y; - for(ix = 0; ix < input_x; ix++) + for (ix = 0; ix < input_x; ix++) { imx_start = ix * stride_x - pad_x; cur_col = col + (iy * input_x + ix) * weight_size; - if(iy != 0 && iy != (input_y - 1) && ix != 0 && ix != (input_x - 1)) + if (iy != 0 && iy != (input_y - 1) && ix != 0 && ix != (input_x - 1)) { - for(kch = 0; kch < output_ch; kch++) - for(ky = 0; ky < 4; ky++) + for (kch = 0; kch < output_ch; kch++) + for (ky = 0; ky < 4; ky++) { imy = imy_start + ky; - for(kx = 0; kx < 4; kx++) - *(im + output_xy * kch + output_x * imy + imx_start + kx) += *cur_col++ ; + for (kx = 0; kx < 4; kx++) + *(im + output_xy * kch + output_x * imy + imx_start + kx) += *cur_col++; } } else { - for(kch = 0; kch < output_ch; kch++) + for (kch = 0; kch < output_ch; kch++) { - for(ky = 0; ky < 4; ky++) + for (ky = 0; ky < 4; ky++) { imy = imy_start + ky; - for(kx = 0; kx < 4; kx++) + for (kx = 0; kx < 4; kx++) { imx = imx_start + kx; - if(imx >= 0 && imx < output_x && imy >= 0 && imy < output_y) + if (imx >= 0 && imx < output_x && imy >= 0 && imy < output_y) *(im + output_xy * kch + output_x * imy + imx) += *cur_col; cur_col++; } @@ -195,35 +194,35 @@ static void col2im(float* col, float* im, float* bias, int output_ch, int output } } } - else if(is_8x8) + else if (is_8x8) { - for(iy = 0; iy < input_y; iy++) + for (iy = 0; iy < input_y; iy++) { imy_start = iy * stride_y - pad_y; - for(ix = 0; ix < input_x; ix++) + for (ix = 0; ix < input_x; ix++) { imx_start = ix * stride_x - pad_x; cur_col = col + (iy * input_x + ix) * weight_size; - if(iy != 0 && iy != (input_y - 1) && ix != 0 && ix != (input_x - 1)) + if (iy != 0 && iy != (input_y - 1) && ix != 0 && ix != (input_x - 1)) { - for(kch = 0; kch < output_ch; kch++) - for(ky = 0; ky < 8; ky++) + for (kch = 0; kch < output_ch; kch++) + for (ky = 0; ky < 8; ky++) { imy = imy_start + ky; - for(kx = 0; kx < 8; kx++) + for (kx = 0; kx < 8; kx++) *(im + output_xy * kch + output_x * imy + imx_start + kx) += *cur_col++; } } else { - for(kch = 0; kch < output_ch; kch++) - for(ky = 0; ky < 8; ky++) + for (kch = 0; kch < output_ch; kch++) + for (ky = 0; ky < 8; ky++) { imy = imy_start + ky; - for(kx = 0; kx < 8; kx++) + for (kx = 0; kx < 8; kx++) { imx = imx_start + kx; - if(imx >= 0 && imx < output_x && imy >= 0 && imy < output_y) + if (imx >= 0 && imx < output_x && imy >= 0 && imy < output_y) *(im + output_xy * kch + output_x * imy + imx) += *cur_col; cur_col++; } @@ -235,20 +234,20 @@ static void col2im(float* col, float* im, float* bias, int output_ch, int output // general case else { - for(iy = 0; iy < input_y; iy++) + for (iy = 0; iy < input_y; iy++) { imy_start = iy * stride_y - pad_y; - for(ix = 0; ix < input_x; ix++) + for (ix = 0; ix < input_x; ix++) { imx_start = ix * stride_x - pad_x; cur_col = col + (iy * input_x + ix) * weight_size; - if(iy != 0 && iy != (input_y - 1) && ix != 0 && ix != (input_x - 1)) + if (iy != 0 && iy != (input_y - 1) && ix != 0 && ix != (input_x - 1)) { - for(kch = 0; kch < output_ch; kch++) - for(ky = 0; ky < kernel_y; ky++) + for (kch = 0; kch < output_ch; kch++) + for (ky = 0; ky < kernel_y; ky++) { imy = imy_start + ky * dilation_y; - for(kx = 0; kx < kernel_x; kx++) + for (kx = 0; kx < kernel_x; kx++) { imx = imx_start + kx * dilation_x; *(im + output_xy * kch + output_x * imy + imx) += *cur_col++; @@ -257,16 +256,16 @@ static void col2im(float* col, float* im, float* bias, int output_ch, int output } else { - for(kch = 0; kch < output_ch; kch++) + for (kch = 0; kch < output_ch; kch++) { - for(ky = 0; ky < kernel_y; ky++) + for (ky = 0; ky < kernel_y; ky++) { imy = imy_start + ky * dilation_y; - for(kx = 0; kx < kernel_x; kx++) + for (kx = 0; kx < kernel_x; kx++) { imx = imx_start + kx * dilation_x; float out = bias[kch]; - if(imx >= 0 && imx < output_x && imy >= 0 && imy < output_y) + if (imx >= 0 && imx < output_x && imy >= 0 && imy < output_y) *(im + output_xy * kch + output_x * imy + imx) += *cur_col; cur_col++; } @@ -282,23 +281,23 @@ static void sgemm_set(float* input, float* kernel, float* col, int in_ch, int in int kernel_start, int kernel_end, int num_thread, int cpu_affinity) { int nn_kernel = (kernel_end - kernel_start) / PER_OUT_CHAN; - int input_end3 = in_hw & 0x3; + int input_end3 = in_hw & 0x3; if (input_end3) { - #pragma omp parallel for num_threads(num_thread) - for (int pp=0; ppgroup; int kernel_h = param->kernel_h; int kernel_w = param->kernel_w; - int out_ch = output_tensor->dims[1]/group; - int in_ch = input_tensor->dims[1]/group; + int out_ch = output_tensor->dims[1] / group; + int in_ch = input_tensor->dims[1] / group; int in_h = input_tensor->dims[2]; int in_w = input_tensor->dims[3]; @@ -491,7 +490,6 @@ int deconv_hcl_prerun(struct tensor* input_tensor , \ int col_size = sizeof(float) * in_h * in_w * kernel_size + 128; priv_info->col_buffer = (float*)sys_malloc(col_size); priv_info->col_buffer_size = col_size; - } interleave(filter_tensor, priv_info, param); @@ -499,21 +497,21 @@ int deconv_hcl_prerun(struct tensor* input_tensor , \ return 0; } -int deconv_hcl_postrun(struct deconv_priv_info* priv_info) +int deconv_hcl_postrun(struct deconv_priv_info* priv_info) { - if(priv_info->interleave_buffer != NULL) + if (priv_info->interleave_buffer != NULL) { sys_free(priv_info->interleave_buffer); priv_info->interleave_buffer = NULL; } - if(priv_info->trans_input_buffer != NULL) + if (priv_info->trans_input_buffer != NULL) { sys_free(priv_info->trans_input_buffer); priv_info->trans_input_buffer = NULL; } - if(priv_info->col_buffer != NULL) + if (priv_info->col_buffer != NULL) { sys_free(priv_info->col_buffer); priv_info->col_buffer = NULL; @@ -522,14 +520,14 @@ int deconv_hcl_postrun(struct deconv_priv_info* priv_info) return 0; } -int deconv_hcl_run(struct tensor* input_tensor , \ - struct tensor* filter_tensor , \ - struct tensor* bias_tensor , \ - struct tensor* output_tensor , \ - struct deconv_priv_info* priv_info , \ - struct deconv_param* param, \ - int num_thread, \ - int cpu_affinity) +int deconv_hcl_run(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* bias_tensor, + struct tensor* output_tensor, + struct deconv_priv_info* priv_info, + struct deconv_param* param, + int num_thread, + int cpu_affinity) { /* param */ int group = param->group; @@ -543,7 +541,7 @@ int deconv_hcl_run(struct tensor* input_tensor , \ int in_c = input_tensor->dims[1] / group; int in_h = input_tensor->dims[2]; int in_w = input_tensor->dims[3]; - int in_hw = in_h * in_w; + int in_hw = in_h * in_w; int input_size = in_c * in_h * in_w; int out_c = output_tensor->dims[1] / group; @@ -553,7 +551,7 @@ int deconv_hcl_run(struct tensor* input_tensor , \ int output_size = out_c * out_h * out_w; int kernel_size = out_c * ksize * ksize; - int kernel_size_g = ((kernel_size + 3)&-4 ) * in_c; + int kernel_size_g = ((kernel_size + 3) & -4) * in_c; /* buffer addr */ float* input_buf = (float*)input_tensor->data; @@ -565,9 +563,9 @@ int deconv_hcl_run(struct tensor* input_tensor , \ int sgemm_set_num = kernel_size / PER_OUT_CHAN * PER_OUT_CHAN; int sgemm_set_remain = kernel_size % PER_OUT_CHAN; - for(int n = 0; n < batch; n++) // batch size + for (int n = 0; n < batch; n++) // batch size { - for(int g = 0; g < group; g++) + for (int g = 0; g < group; g++) { /* im2col */ float* cur_input = input_buf + (n * group + g) * input_size; @@ -576,15 +574,14 @@ int deconv_hcl_run(struct tensor* input_tensor , \ transpose_input(cur_input, trans_input_buf, in_hw, in_c); /* gemm */ - sgemm_set(trans_input_buf,cur_kernel, col_buf, in_c, in_hw, kernel_size, 0, sgemm_set_num, num_thread, cpu_affinity); - if(sgemm_set_remain) - sgemm4x4(trans_input_buf,cur_kernel, col_buf, in_c, in_hw, kernel_size, sgemm_set_num, kernel_size, num_thread, cpu_affinity); - float* cur_bias = biases_buf? (biases_buf + g * out_c) : NULL; - col2im(col_buf, cur_output, cur_bias, out_c, out_w, out_h, ksize, ksize, stride, - stride, dilation, dilation, pad, pad, in_w, in_h); + sgemm_set(trans_input_buf, cur_kernel, col_buf, in_c, in_hw, kernel_size, 0, sgemm_set_num, num_thread, cpu_affinity); + if (sgemm_set_remain) + sgemm4x4(trans_input_buf, cur_kernel, col_buf, in_c, in_hw, kernel_size, sgemm_set_num, kernel_size, num_thread, cpu_affinity); + float* cur_bias = biases_buf ? (biases_buf + g * out_c) : NULL; + col2im(col_buf, cur_output, cur_bias, out_c, out_w, out_h, ksize, ksize, stride, + stride, dilation, dilation, pad, pad, in_w, in_h); } } - return 0 ; - + return 0; } diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.h b/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.h index e6b04b725..591aa718b 100644 --- a/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.h +++ b/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.h @@ -28,33 +28,31 @@ #include "graph/tensor.h" - struct deconv_priv_info { - float* interleave_buffer ; + float* interleave_buffer; int interleave_buffer_size; - float* col_buffer ; + float* col_buffer; int col_buffer_size; float* trans_input_buffer; int trans_input_size; }; -int deconv_hcl_prerun(struct tensor* input_tensor , \ - struct tensor* filter_tensor , \ - struct tensor* output_tensor , \ - struct deconv_priv_info* info , \ - struct deconv_param* param) ; +int deconv_hcl_prerun(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* output_tensor, + struct deconv_priv_info* info, + struct deconv_param* param); int deconv_hcl_postrun(struct deconv_priv_info* info); -int deconv_hcl_run(struct tensor* input_tensor , \ - struct tensor* filter_tensor ,\ - struct tensor* bias_tensor , \ - struct tensor* output_tensor , \ - struct deconv_priv_info* deconv_info , \ - struct deconv_param* param, \ - int num_thread, \ - int cpu_affinity) ; - +int deconv_hcl_run(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* bias_tensor, + struct tensor* output_tensor, + struct deconv_priv_info* deconv_info, + struct deconv_param* param, + int num_thread, + int cpu_affinity); #endif diff --git a/source/device/cpu/op/deconv/deconv_ref.c b/source/device/cpu/op/deconv/deconv_ref.c index 9919d7a81..7bdfa4b76 100644 --- a/source/device/cpu/op/deconv/deconv_ref.c +++ b/source/device/cpu/op/deconv/deconv_ref.c @@ -36,21 +36,20 @@ #include - struct deconv_ref_param { - int in_shape[4]; // NCHW - int out_shape[3]; // CHW - int kernels[2]; // hw - int strides[2]; // hw - int dilations[2]; // hw + int in_shape[4]; // NCHW + int out_shape[3]; // CHW + int kernels[2]; // hw + int strides[2]; // hw + int dilations[2]; // hw int pads[2]; int batch; int group; int activation; int layout; int zero[3]; // input, kernel, output - float scale[3]; // input, kernel, output + float scale[3]; // input, kernel, output }; static inline float activation(float input, int activation) @@ -102,7 +101,7 @@ static int ref_deconv_fp32(const float* input, float* output, const float* kerne int kernel_offset = 0; int output_offset = 0; - memset(( void* )output, 0, (unsigned long)output_h * output_w * output_c * batch * group * sizeof(float)); + memset((void*)output, 0, (unsigned long)output_h * output_w * output_c * batch * group * sizeof(float)); for (n = 0; n < batch; ++n) { @@ -118,13 +117,11 @@ static int ref_deconv_fp32(const float* input, float* output, const float* kerne { if (param->layout == 0) { - input_offset = n * group * input_c * input_h * input_w + g * input_c * input_h * input_w + - kc * input_h * input_w + h * input_w + w; + input_offset = n * group * input_c * input_h * input_w + g * input_c * input_h * input_w + kc * input_h * input_w + h * input_w + w; } else { - input_offset = n * group * input_c * input_h * input_w + h * group * input_c * input_w + - w * group * input_c + g * input_c + kc; + input_offset = n * group * input_c * input_h * input_w + h * group * input_c * input_w + w * group * input_c + g * input_c + kc; } input_val = input[input_offset]; for (c = 0; c < output_c; c++) @@ -135,26 +132,18 @@ static int ref_deconv_fp32(const float* input, float* output, const float* kerne { cur_out_x = org_out_x + k_w * dilation_w; cur_out_y = org_out_y + k_h * dilation_h; - if (cur_out_x >= 0 && cur_out_x < output_w && cur_out_y >= 0 && - cur_out_y < output_h) + if (cur_out_x >= 0 && cur_out_x < output_w && cur_out_y >= 0 && cur_out_y < output_h) { if (param->layout == 0) { - kernel_offset = g * output_c * input_c * kernel_h * kernel_w + - kc * output_c * kernel_h * kernel_w + - c * kernel_h * kernel_w + k_h * kernel_w + k_w; + kernel_offset = g * output_c * input_c * kernel_h * kernel_w + kc * output_c * kernel_h * kernel_w + c * kernel_h * kernel_w + k_h * kernel_w + k_w; - output_offset = n * group * output_c * output_w * output_h + - g * output_c * output_w * output_h + - c * output_w * output_h + cur_out_y * output_w + cur_out_x; + output_offset = n * group * output_c * output_w * output_h + g * output_c * output_w * output_h + c * output_w * output_h + cur_out_y * output_w + cur_out_x; } else { - kernel_offset = g * output_c * input_c * kernel_h * kernel_w + - k_h * kernel_w * output_c + k_w * output_c + c; - output_offset = n * output_h * output_w * output_c * group + - cur_out_y * group * output_w * output_c + - cur_out_x * group * output_c + g * output_c + c; + kernel_offset = g * output_c * input_c * kernel_h * kernel_w + k_h * kernel_w * output_c + k_w * output_c + c; + output_offset = n * output_h * output_w * output_c * group + cur_out_y * group * output_w * output_c + cur_out_x * group * output_c + g * output_c + c; } weight_val = kernel[kernel_offset]; output[output_offset] += weight_val * input_val; @@ -182,14 +171,11 @@ static int ref_deconv_fp32(const float* input, float* output, const float* kerne { if (param->layout == 0) { - output_offset = n * output_c * group * output_w * output_h + - g * output_c * output_w * output_h + c * output_h * output_w + - h * output_w + w; + output_offset = n * output_c * group * output_w * output_h + g * output_c * output_w * output_h + c * output_h * output_w + h * output_w + w; } else { - output_offset = n * output_c * group * output_w * output_h + - h * output_c * group * output_w + w * output_c * group + c; + output_offset = n * output_c * group * output_w * output_h + h * output_c * group * output_w + w * output_c * group + c; } output[output_offset] += bias_val; } @@ -214,19 +200,19 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct graph* ir_graph = ir_node->graph; struct graph* graph = ir_node->graph; - struct deconv_param* param = ( struct deconv_param* )(ir_node->op.param_mem); - struct deconv_ref_param* op_param = ( struct deconv_ref_param* )exec_node->ops_priv; + struct deconv_param* param = (struct deconv_param*)(ir_node->op.param_mem); + struct deconv_ref_param* op_param = (struct deconv_ref_param*)exec_node->ops_priv; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - if (graph->graph_layout == TENGINE_LAYOUT_NCHW) // nchw + if (graph->graph_layout == TENGINE_LAYOUT_NCHW) // nchw { op_param->batch = input_tensor->dims[0]; op_param->in_shape[0] = input_tensor->dims[1]; op_param->in_shape[1] = input_tensor->dims[2]; op_param->in_shape[2] = input_tensor->dims[3]; } - else // nhwc + else // nhwc { op_param->batch = input_tensor->dims[0]; op_param->in_shape[0] = input_tensor->dims[3]; @@ -238,12 +224,12 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); - if (graph->graph_layout == TENGINE_LAYOUT_NCHW) // hw + if (graph->graph_layout == TENGINE_LAYOUT_NCHW) // hw { op_param->kernels[0] = weight_tensor->dims[2]; op_param->kernels[1] = weight_tensor->dims[3]; } - else // + else // { op_param->kernels[0] = weight_tensor->dims[1]; op_param->kernels[1] = weight_tensor->dims[2]; @@ -253,7 +239,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - if (graph->graph_layout == TENGINE_LAYOUT_NCHW) // chw + if (graph->graph_layout == TENGINE_LAYOUT_NCHW) // chw { op_param->out_shape[0] = output_tensor->dims[1]; op_param->out_shape[1] = output_tensor->dims[2]; @@ -272,8 +258,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct op_param->dilations[1] = param->dilation_h; op_param->dilations[0] = param->dilation_w; - op_param->pads[0] = param->pad_h0; // pad_h - op_param->pads[1] = param->pad_w0; // pad_w + op_param->pads[0] = param->pad_h0; // pad_h + op_param->pads[1] = param->pad_w0; // pad_w op_param->group = param->group; op_param->activation = param->activation; @@ -291,9 +277,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* bias_tensor = NULL; struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - if (ir_node->input_num > 2) + if (ir_node->input_num > 2) { - bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); + bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); } void* output_data = output_tensor->data; @@ -304,7 +290,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (bias_tensor != NULL) bias = bias_tensor->data; - struct deconv_ref_param* op_param = ( struct deconv_ref_param* )exec_node->ops_priv; + struct deconv_ref_param* op_param = (struct deconv_ref_param*)exec_node->ops_priv; /* input quant param */ int ret = ref_deconv_fp32((float*)input_data, (float*)output_data, (float*)kernel, (float*)bias, op_param); @@ -324,7 +310,7 @@ static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struc static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct deconv_ref_param* deconv_ref_param = ( struct deconv_ref_param* )sys_malloc(sizeof(struct deconv_ref_param)); + struct deconv_ref_param* deconv_ref_param = (struct deconv_ref_param*)sys_malloc(sizeof(struct deconv_ref_param)); exec_node->ops_priv = deconv_ref_param; return 0; } diff --git a/source/device/cpu/op/depthtospace/depthtospace_ref.c b/source/device/cpu/op/depthtospace/depthtospace_ref.c index cd8e0610a..940b033ce 100644 --- a/source/device/cpu/op/depthtospace/depthtospace_ref.c +++ b/source/device/cpu/op/depthtospace/depthtospace_ref.c @@ -34,7 +34,6 @@ #include - int ref_depthtospace_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { float* input_data = (float*)input_tensor->data; diff --git a/source/device/cpu/op/detection_output/detection_output_ref.c b/source/device/cpu/op/detection_output/detection_output_ref.c index fe6cbde25..ed9409118 100644 --- a/source/device/cpu/op/detection_output/detection_output_ref.c +++ b/source/device/cpu/op/detection_output/detection_output_ref.c @@ -38,7 +38,6 @@ #include #include - typedef struct { float x0; @@ -155,7 +154,7 @@ void nms_sorted_bboxes(const Box_t* bboxes, int bboxes_num, int* picked, int* pi } } - sys_free(areas); + sys_free(areas); } static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) @@ -170,29 +169,29 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct node* ir_node = exec_node->ir_node; + struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; - struct tensor* loc_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + struct tensor* loc_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* conf_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* priorbox_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - detection_output_param_t* param = ( detection_output_param_t* )(ir_node->op.param_mem); + detection_output_param_t* param = (detection_output_param_t*)(ir_node->op.param_mem); - float* location = NULL; + float* location = NULL; float* confidence = NULL; - float* priorbox = NULL; + float* priorbox = NULL; /* use original fp32 data or dequant uint8 to fp32 */ if (loc_tensor->data_type == TENGINE_DT_FP32) - location = ( float* )loc_tensor->data; + location = (float*)loc_tensor->data; else if (loc_tensor->data_type == TENGINE_DT_UINT8) { uint8_t* location_u8 = (uint8_t*)loc_tensor->data; - uint32_t elem_num = loc_tensor->elem_num; - uint32_t zero_point = loc_tensor->zero_point; + uint32_t elem_num = loc_tensor->elem_num; + uint32_t zero_point = loc_tensor->zero_point; float scale = loc_tensor->scale; location = (float*)sys_malloc(elem_num * sizeof(float)); - for (int i=0; idata_type == TENGINE_DT_INT8) { int8_t* location_i8 = (int8_t*)loc_tensor->data; - uint32_t elem_num = loc_tensor->elem_num; + uint32_t elem_num = loc_tensor->elem_num; float scale = loc_tensor->scale; location = (float*)sys_malloc(elem_num * sizeof(float)); - for (int i=0; idata_type == TENGINE_DT_FP32) - confidence = ( float* )conf_tensor->data; + confidence = (float*)conf_tensor->data; else if (conf_tensor->data_type == TENGINE_DT_UINT8) { uint8_t* confidence_u8 = (uint8_t*)conf_tensor->data; - uint32_t elem_num = conf_tensor->elem_num; - uint32_t zero_point = conf_tensor->zero_point; + uint32_t elem_num = conf_tensor->elem_num; + uint32_t zero_point = conf_tensor->zero_point; float scale = conf_tensor->scale; confidence = (float*)sys_malloc(elem_num * sizeof(float)); - for (int i=0; idata_type == TENGINE_DT_INT8) { int8_t* confidence_i8 = (int8_t*)conf_tensor->data; - uint32_t elem_num = conf_tensor->elem_num; + uint32_t elem_num = conf_tensor->elem_num; float scale = conf_tensor->scale; confidence = (float*)sys_malloc(elem_num * sizeof(float)); - for (int i=0; idata_type == TENGINE_DT_FP32) - priorbox = ( float* )priorbox_tensor->data; + priorbox = (float*)priorbox_tensor->data; else if (priorbox_tensor->data_type == TENGINE_DT_UINT8) { uint8_t* priorbox_u8 = (uint8_t*)priorbox_tensor->data; - uint32_t elem_num = priorbox_tensor->elem_num; - uint32_t zero_point = priorbox_tensor->zero_point; + uint32_t elem_num = priorbox_tensor->elem_num; + uint32_t zero_point = priorbox_tensor->zero_point; float scale = priorbox_tensor->scale; priorbox = (float*)sys_malloc(elem_num * sizeof(float)); - for (int i=0; idata_type == TENGINE_DT_INT8) { int8_t* priorbox_i8 = (int8_t*)priorbox_tensor->data; - uint32_t elem_num = priorbox_tensor->elem_num; + uint32_t elem_num = priorbox_tensor->elem_num; float scale = priorbox_tensor->scale; priorbox = (float*)sys_malloc(elem_num * sizeof(float)); - for (int i=0; idims[2]; - const int num_prior = num_priorx4 / 4; + const int num_prior = num_priorx4 / 4; const int num_classes = param->num_classes; int b = 0; - float* loc_ptr = location + b * num_priorx4; - float* conf_ptr = confidence + b * num_prior * num_classes; + float* loc_ptr = location + b * num_priorx4; + float* conf_ptr = confidence + b * num_prior * num_classes; float* prior_ptr = priorbox + b * num_priorx4 * 2; Box_t* boxes = (Box_t*)sys_malloc(sizeof(Box_t) * num_prior); @@ -294,7 +293,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (class_box_num > param->nms_top_k) class_box_num = param->nms_top_k; - int* picked = (int*)sys_malloc(sizeof(int) * class_box_num); // = NULL; + int* picked = (int*)sys_malloc(sizeof(int) * class_box_num); // = NULL; int picked_num = 0; nms_sorted_bboxes(class_box, class_box_num, picked, &picked_num, param->nms_threshold); @@ -304,14 +303,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex push_vector_data(output_bbox_v, &class_box[z]); } - sys_free(picked); - sys_free(class_box); + sys_free(picked); + sys_free(class_box); } - sys_free(boxes); + sys_free(boxes); int total_num = get_vector_num(output_bbox_v); - Box_t* bbox_rects = ( Box_t* )sys_malloc(total_num * sizeof(Box_t)); + Box_t* bbox_rects = (Box_t*)sys_malloc(total_num * sizeof(Box_t)); for (int i = 0; i < total_num; i++) memcpy(&bbox_rects[i], get_vector_data(output_bbox_v, i), sizeof(Box_t)); @@ -328,10 +327,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex // output float* output_fp32 = NULL; if (output_tensor->data_type == TENGINE_DT_FP32) - output_fp32 = ( float* )output_tensor->data; + output_fp32 = (float*)output_tensor->data; else { - output_fp32 = (float*)sys_malloc(output_tensor->elem_num * sizeof(float )); + output_fp32 = (float*)sys_malloc(output_tensor->elem_num * sizeof(float)); } for (int i = 0; i < num_detected; i++) @@ -355,7 +354,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex uint32_t elem_num = output_tensor->elem_num; float scale = output_tensor->scale; uint32_t zero_point = output_tensor->zero_point; - for(int i=0; i 255) @@ -377,7 +376,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int8_t* output_i8 = (int8_t*)output_tensor->data; int32_t elem_num = output_tensor->elem_num; float scale = output_tensor->scale; - for(int i=0; i 127) diff --git a/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c b/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c index 9d7185cff..25b14171a 100644 --- a/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c +++ b/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c @@ -39,13 +39,12 @@ #include #include - struct Dpp_Box { - float x0; // xmin - float y0; // ymin - float x1; // xmax - float y1; // ymax + float x0; // xmin + float y0; // ymin + float x1; // xmax + float y1; // ymax int box_idx; int class_idx; float score; @@ -68,7 +67,7 @@ struct dpp_param #define DPP_MAX(a, b) (a > b ? a : b) static float intersection_area(const struct Dpp_Box a, const struct Dpp_Box b) { - if(a.x0 > b.x1 || a.x1 < b.x0 || a.y0 > b.y1 || a.y1 < b.y0) + if (a.x0 > b.x1 || a.x1 < b.x0 || a.y0 > b.y1 || a.y1 < b.y0) { // no intersection return 0.f; @@ -77,15 +76,15 @@ static float intersection_area(const struct Dpp_Box a, const struct Dpp_Box b) float inter_width = DPP_MIN(a.x1, b.x1) - DPP_MAX(a.x0, b.x0); float inter_height = DPP_MIN(a.y1, b.y1) - DPP_MAX(a.y0, b.y0); - return inter_width* inter_height; + return inter_width * inter_height; } static void nms_sorted_bboxes(const struct Dpp_Box* boxes, int boxes_size, int* picked, int* picked_size, - float nms_threshold) + float nms_threshold) { float* areas = (float*)sys_malloc(sizeof(float) * boxes_size); int n_picked = 0; - for(int i = 0; i < boxes_size; i++) + for (int i = 0; i < boxes_size; i++) { float width = boxes[i].x1 - boxes[i].x0; float height = boxes[i].y1 - boxes[i].y0; @@ -93,20 +92,20 @@ static void nms_sorted_bboxes(const struct Dpp_Box* boxes, int boxes_size, int* areas[i] = width * height; } - for(int i = 0; i < boxes_size; i++) + for (int i = 0; i < boxes_size; i++) { int keep = 1; - for(int j = 0; j < n_picked; j++) + for (int j = 0; j < n_picked; j++) { // intersection over union float inter_area = intersection_area(boxes[i], boxes[picked[j]]); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area - if(inter_area / union_area > nms_threshold) + if (inter_area / union_area > nms_threshold) keep = 0; } - if(keep) + if (keep) { picked[n_picked] = i; n_picked++; @@ -120,17 +119,17 @@ static void nms_sorted_bboxes(const struct Dpp_Box* boxes, int boxes_size, int* static void sort_boxes_by_score(struct Dpp_Box* boxes, int size) { int i, j; - for(i = 0; i < size - 1; i++) + for (i = 0; i < size - 1; i++) { int max_idx = i; - for(j = i + 1; j < size; j++) + for (j = i + 1; j < size; j++) { - if(boxes[j].score < 0.6) + if (boxes[j].score < 0.6) continue; - if(boxes[max_idx].score < boxes[j].score) + if (boxes[max_idx].score < boxes[j].score) max_idx = j; } - if(i != max_idx) + if (i != max_idx) { struct Dpp_Box tmp; memcpy(&tmp, boxes + i, sizeof(struct Dpp_Box)); @@ -139,14 +138,14 @@ static void sort_boxes_by_score(struct Dpp_Box* boxes, int size) } else { - if(boxes[max_idx].score < 0.6) + if (boxes[max_idx].score < 0.6) return; } } } static int decode_single_box(struct Dpp_Box* box, const float* box_ptr, const float* anchor_ptr, - const float* scales) + const float* scales) { int i = box->box_idx; @@ -163,7 +162,7 @@ static int decode_single_box(struct Dpp_Box* box, const float* box_ptr, const fl box->x0 = xcenter - half_w; box->y1 = ycenter + half_h; box->x1 = xcenter + half_w; - if(box->y0 < 0 || box->x0 < 0) + if (box->y0 < 0 || box->x0 < 0) return -1; return 0; } @@ -172,20 +171,20 @@ void get_all_boxes_rect(struct Dpp_Box* all_class_bbox_rects, const float* box, const float* anchor, int num_boxes, int num_classes, float* scales) { struct Dpp_Box selected_box; - for(int j = 0; j < num_boxes; j++) + for (int j = 0; j < num_boxes; j++) { - for(int i = 1; i < num_classes; i++) + for (int i = 1; i < num_classes; i++) { float score = scores[j * num_classes + i]; - if(score < 0.6) + if (score < 0.6) continue; selected_box.score = score; selected_box.class_idx = i; selected_box.box_idx = j; - if(decode_single_box(&selected_box, box, anchor, scales) < 0) + if (decode_single_box(&selected_box, box, anchor, scales) < 0) continue; // struct Box* cls_vector = all_class_bbox_rects[i]; @@ -195,39 +194,39 @@ void get_all_boxes_rect(struct Dpp_Box* all_class_bbox_rects, const float* box, } int ref_dpp_fp32(const float* input_f, const float* score_f, const float* anchor_f, - float* detect_num, float* detect_class, float* detect_score, float* detect_boxes,struct dpp_param* param) + float* detect_num, float* detect_class, float* detect_score, float* detect_boxes, struct dpp_param* param) { const int num_classes = param->num_classes + 1; const int num_boxes = param->num_boxes; const int max_detections = param->max_detections; - struct Dpp_Box* all_boxes = ( struct Dpp_Box* )malloc((unsigned long)num_classes * num_boxes * sizeof(struct Dpp_Box)); + struct Dpp_Box* all_boxes = (struct Dpp_Box*)malloc((unsigned long)num_classes * num_boxes * sizeof(struct Dpp_Box)); memset(all_boxes, 0, sizeof(struct Dpp_Box) * num_classes * num_boxes); get_all_boxes_rect(all_boxes, input_f, score_f, anchor_f, num_boxes, num_classes, param->scales); int max_picked_boxes = 2 * max_detections * num_classes; - struct Dpp_Box* picked_boxes = ( struct Dpp_Box* )malloc(max_picked_boxes * sizeof(struct Dpp_Box)); + struct Dpp_Box* picked_boxes = (struct Dpp_Box*)malloc(max_picked_boxes * sizeof(struct Dpp_Box)); memset(picked_boxes, 0, sizeof(struct Dpp_Box) * max_picked_boxes); int all_picked_size = 0; - for(int i = 1; i < num_classes; i++) + for (int i = 1; i < num_classes; i++) { struct Dpp_Box* class_box = all_boxes + i * num_boxes; // sort sort_boxes_by_score(class_box, num_boxes); int box_size = 0; - for(int j = 0; j < num_boxes; j++) + for (int j = 0; j < num_boxes; j++) { - if(class_box[j].score < 0.6) + if (class_box[j].score < 0.6) break; box_size++; } - if(box_size == 0) + if (box_size == 0) continue; - if(box_size > max_detections * 2) + if (box_size > max_detections * 2) box_size = max_detections * 2; int* picked = (int*)sys_malloc(sizeof(int) * num_boxes); @@ -237,7 +236,7 @@ int ref_dpp_fp32(const float* input_f, const float* score_f, const float* anchor nms_sorted_bboxes(class_box, box_size, picked, &picked_size, param->nms_iou_threshold); // save the survivors - for(int j = 0; j < picked_size; j++) + for (int j = 0; j < picked_size; j++) { int z = picked[j]; memcpy(picked_boxes + all_picked_size, class_box + z, sizeof(struct Dpp_Box)); @@ -248,13 +247,13 @@ int ref_dpp_fp32(const float* input_f, const float* score_f, const float* anchor } sort_boxes_by_score(picked_boxes, max_picked_boxes); - if(all_picked_size > max_detections) + if (all_picked_size > max_detections) all_picked_size = max_detections; // generate output tensors detect_num[0] = all_picked_size; - for(int i = 0; i < all_picked_size; i++) + for (int i = 0; i < all_picked_size; i++) { detect_class[i] = picked_boxes[i].class_idx; detect_score[i] = picked_boxes[i].score; @@ -271,7 +270,7 @@ int ref_dpp_fp32(const float* input_f, const float* score_f, const float* anchor } int ref_dpp_uint8(const uint8_t* input, const uint8_t* score, const uint8_t* anchor, - float* detect_num, float* detect_class, float* detect_score, float* detect_boxes,struct dpp_param* param) + float* detect_num, float* detect_class, float* detect_score, float* detect_boxes, struct dpp_param* param) { const int num_classes = param->num_classes + 1; const int num_boxes = param->num_boxes; @@ -280,43 +279,43 @@ int ref_dpp_uint8(const uint8_t* input, const uint8_t* score, const uint8_t* anc /* transform uint8_t to fp32 */ int input_size = num_boxes * 4; int score_size = num_boxes * num_classes; - float* input_f = (float* )malloc(input_size * sizeof(float)); - float* score_f = (float* )malloc(score_size * sizeof(float)); - float* anchor_f = (float* )malloc(input_size * sizeof(float)); - for(int i=0; izero[0]) * param->quant_scale[0]; - for(int i=0; iquant_scale[1]; - for(int i=0; izero[2]) * param->quant_scale[2]; - struct Dpp_Box* all_boxes = (struct Dpp_Box* )malloc((unsigned long)num_classes * num_boxes * sizeof(struct Dpp_Box)); + struct Dpp_Box* all_boxes = (struct Dpp_Box*)malloc((unsigned long)num_classes * num_boxes * sizeof(struct Dpp_Box)); memset(all_boxes, 0, sizeof(struct Dpp_Box) * num_classes * num_boxes); get_all_boxes_rect(all_boxes, input_f, score_f, anchor_f, num_boxes, num_classes, param->scales); int max_picked_boxes = 2 * max_detections * num_classes; - struct Dpp_Box* picked_boxes = ( struct Dpp_Box* )malloc(max_picked_boxes * sizeof(struct Dpp_Box)); + struct Dpp_Box* picked_boxes = (struct Dpp_Box*)malloc(max_picked_boxes * sizeof(struct Dpp_Box)); memset(picked_boxes, 0, sizeof(struct Dpp_Box) * max_picked_boxes); int all_picked_size = 0; - for(int i = 1; i < num_classes; i++) + for (int i = 1; i < num_classes; i++) { struct Dpp_Box* class_box = all_boxes + i * num_boxes; // sort sort_boxes_by_score(class_box, num_boxes); int box_size = 0; - for(int j = 0; j < num_boxes; j++) + for (int j = 0; j < num_boxes; j++) { - if(class_box[j].score < 0.6) + if (class_box[j].score < 0.6) break; box_size++; } - if(box_size == 0) + if (box_size == 0) continue; - if(box_size > max_detections * 2) + if (box_size > max_detections * 2) box_size = max_detections * 2; int* picked = (int*)sys_malloc(sizeof(int) * num_boxes); @@ -326,7 +325,7 @@ int ref_dpp_uint8(const uint8_t* input, const uint8_t* score, const uint8_t* anc nms_sorted_bboxes(class_box, box_size, picked, &picked_size, param->nms_iou_threshold); // save the survivors - for(int j = 0; j < picked_size; j++) + for (int j = 0; j < picked_size; j++) { int z = picked[j]; memcpy(picked_boxes + all_picked_size, class_box + z, sizeof(struct Dpp_Box)); @@ -337,13 +336,13 @@ int ref_dpp_uint8(const uint8_t* input, const uint8_t* score, const uint8_t* anc } sort_boxes_by_score(picked_boxes, max_picked_boxes); - if(all_picked_size > max_detections) + if (all_picked_size > max_detections) all_picked_size = max_detections; // generate output tensors detect_num[0] = all_picked_size; - for(int i = 0; i < all_picked_size; i++) + for (int i = 0; i < all_picked_size; i++) { detect_class[i] = picked_boxes[i].class_idx; detect_score[i] = picked_boxes[i].score; @@ -373,7 +372,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct detection_postprocess_param* param_ = (struct detection_postprocess_param* )ir_node->op.param_mem; + struct detection_postprocess_param* param_ = (struct detection_postprocess_param*)ir_node->op.param_mem; param.max_classes_per_detection = param_->max_classes_per_detection; param.nms_iou_threshold = param_->nms_iou_threshold; @@ -386,8 +385,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct param.scales[2] = param_->scales[2]; param.scales[3] = param_->scales[3]; - if(input_tensor->data_type != TENGINE_DT_FP32 && input_tensor->data_type != TENGINE_DT_FP16 && - input_tensor->data_type != TENGINE_DT_UINT8) + if (input_tensor->data_type != TENGINE_DT_FP32 && input_tensor->data_type != TENGINE_DT_FP16 && input_tensor->data_type != TENGINE_DT_UINT8) { TLOG_ERR("Not support!"); return -1; @@ -401,7 +399,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; - struct detection_postprocess_param* detection_postprocess_param = (struct detection_postprocess_param* )ir_node->op.param_mem; + struct detection_postprocess_param* detection_postprocess_param = (struct detection_postprocess_param*)ir_node->op.param_mem; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); const void* input_data = input_tensor->data; @@ -433,11 +431,11 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (input_tensor->dim_num == 3 && input_tensor->elem_size == 1) { int in_ch = input_tensor->dims[1]; - int in_w = input_tensor->dims[2]; + int in_w = input_tensor->dims[2]; int in_size = input_tensor->elem_num; int score_ch = score->dims[1]; - int score_w = score->dims[2]; + int score_w = score->dims[2]; int score_size = score->elem_num; uint8_t* input_uint8 = (uint8_t*)input_tensor->data; @@ -449,13 +447,13 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex memcpy(score_uint8_temp, score_uint8, score_size); int index = 0; - for(int w = 0; w < in_w; w++) - for(int c = 0; c < in_ch; c++) + for (int w = 0; w < in_w; w++) + for (int c = 0; c < in_ch; c++) input_uint8[index++] = input_uint8_temp[c * in_w + w]; index = 0; - for(int w = 0; w < score_w; w++) - for(int c = 0; c < score_ch; c++) + for (int w = 0; w < score_w; w++) + for (int c = 0; c < score_ch; c++) score_uint8[index++] = score_uint8_temp[c * score_w + w]; free(input_uint8_temp); @@ -464,29 +462,29 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex else { int in_ch = input_tensor->dims[1]; - int in_w = input_tensor->dims[2]; + int in_w = input_tensor->dims[2]; int in_size = input_tensor->elem_num; int score_ch = score->dims[1]; - int score_w = score->dims[2]; + int score_w = score->dims[2]; int score_size = score->elem_num; float* input_fp32 = (float*)input_tensor->data; float* score_fp32 = (float*)score->data; - float* input_fp32_temp = (float*)malloc(in_size*sizeof(float)); - float* score_fp32_temp = (float*)malloc(score_size*sizeof(float)); + float* input_fp32_temp = (float*)malloc(in_size * sizeof(float)); + float* score_fp32_temp = (float*)malloc(score_size * sizeof(float)); memcpy(input_fp32_temp, input_fp32, in_size); memcpy(score_fp32_temp, score_fp32, score_size); int index = 0; - for(int w = 0; w < in_w; w++) - for(int c = 0; c < in_ch; c++) + for (int w = 0; w < in_w; w++) + for (int c = 0; c < in_ch; c++) input_fp32[index++] = input_fp32_temp[c * in_w + w]; index = 0; - for(int w = 0; w < score_w; w++) - for(int c = 0; c < score_ch; c++) + for (int w = 0; w < score_w; w++) + for (int c = 0; c < score_ch; c++) score_fp32[index++] = score_fp32_temp[c * score_w + w]; free(input_fp32_temp); @@ -494,11 +492,11 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex } if (input_tensor->data_type == TENGINE_DT_FP32) - ref_dpp_fp32((float*)input_data, (float*)score_data, (float*)anchor_data, detect_num_data, - detect_classes_data, detect_scores_data, detect_boxes_data, ¶m); + ref_dpp_fp32((float*)input_data, (float*)score_data, (float*)anchor_data, detect_num_data, + detect_classes_data, detect_scores_data, detect_boxes_data, ¶m); else - ref_dpp_uint8((uint8_t*)input_data, (uint8_t*)score_data, (uint8_t*)anchor_data, detect_num_data, - detect_classes_data, detect_scores_data, detect_boxes_data, ¶m); + ref_dpp_uint8((uint8_t*)input_data, (uint8_t*)score_data, (uint8_t*)anchor_data, detect_num_data, + detect_classes_data, detect_scores_data, detect_boxes_data, ¶m); return 0; } @@ -518,12 +516,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } static struct node_ops detection_postprocess_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score}; int register_detection_postprocess_ref_op() { diff --git a/source/device/cpu/op/dropout/dropout_ref.c b/source/device/cpu/op/dropout/dropout_ref.c index dd6e32075..144663971 100644 --- a/source/device/cpu/op/dropout/dropout_ref.c +++ b/source/device/cpu/op/dropout/dropout_ref.c @@ -32,7 +32,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { exec_node->inplace_map[0] = 0; diff --git a/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.c b/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.c index 99c2b3c33..01f57de9f 100644 --- a/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.c +++ b/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.c @@ -34,7 +34,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -54,7 +53,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor0 = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct eltwise_param* eltwise_param = ( struct eltwise_param* )ir_node->op.param_mem; + struct eltwise_param* eltwise_param = (struct eltwise_param*)ir_node->op.param_mem; struct tensor* input_tensor1 = NULL; if (ir_node->input_num > 1) @@ -79,7 +78,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc input_tensor_0 = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); input_tensor_1 = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); - struct eltwise_param* eltwise_param = ( struct eltwise_param* )ir_node->op.param_mem; + struct eltwise_param* eltwise_param = (struct eltwise_param*)ir_node->op.param_mem; if (input_tensor_0->data_type != TENGINE_DT_FP32 || ir_graph->graph_layout != TENGINE_LAYOUT_NCHW) return 0; diff --git a/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.h b/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.h index 95ba897ef..ac2dba2e2 100644 --- a/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.h +++ b/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.h @@ -31,7 +31,6 @@ #include - int perf_eltwise_fp32(struct tensor* output_tensor, struct tensor* input_tensor0, struct tensor* input_tensor1, struct eltwise_param* eltwise_param, int num_thread) { diff --git a/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.c b/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.c index 13620556d..8e341742f 100644 --- a/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.c +++ b/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.c @@ -29,7 +29,6 @@ #include - static int kernel_run(float* output, float* input0, float* input1, int type, int in_size0, int in_size1, int stride) { float* out_ptr = output; @@ -39,51 +38,51 @@ static int kernel_run(float* output, float* input0, float* input1, int type, int switch (type) { - case ELT_SUM: - if (in_size0 == 1) + case ELT_SUM: + if (in_size0 == 1) + { + float32x4_t data0 = vdupq_n_f32(in0[0]); + for (int i = 0; i < in_size1; i = i + 4) + { + float32x4_t data1 = vld1q_f32(in1 + i); + float32x4_t sum = vaddq_f32(data0, data1); + vst1q_f32(out_ptr + i, sum); + } + loop_time = in_size1 / 4; + for (int i = loop_time * 4; i < in_size1; i++) { - float32x4_t data0 = vdupq_n_f32(in0[0]); - for (int i = 0; i < in_size1; i = i + 4) - { - float32x4_t data1 = vld1q_f32(in1 + i); - float32x4_t sum = vaddq_f32(data0, data1); - vst1q_f32(out_ptr + i, sum); - } - loop_time = in_size1 / 4; - for (int i = loop_time * 4; i < in_size1; i++) - { - out_ptr[i] = in1[i] + in0[0]; - } + out_ptr[i] = in1[i] + in0[0]; } - else if (in_size1 == in_size0) + } + else if (in_size1 == in_size0) + { + for (int i = 0; i < in_size1; i = i + 4) + { + float32x4_t data0 = vld1q_f32(in0 + i); + float32x4_t data1 = vld1q_f32(in1 + i); + float32x4_t sum = vaddq_f32(data0, data1); + vst1q_f32(out_ptr + i, sum); + } + loop_time = in_size1 / 4; + + for (int i = loop_time * 4; i < in_size1; i++) { - for (int i = 0; i < in_size1; i = i + 4) - { - float32x4_t data0 = vld1q_f32(in0 + i); - float32x4_t data1 = vld1q_f32(in1 + i); - float32x4_t sum = vaddq_f32(data0, data1); - vst1q_f32(out_ptr + i, sum); - } - loop_time = in_size1 / 4; - - for (int i = loop_time * 4; i < in_size1; i++) - { - out_ptr[i] = in1[i] + in0[i]; - } + out_ptr[i] = in1[i] + in0[i]; } - else if (in_size0 < in_size1 && in_size0 != 1) + } + else if (in_size0 < in_size1 && in_size0 != 1) + { + for (int i = 0; i < in_size1; ++i) { - for (int i = 0; i < in_size1; ++i) - { - *out_ptr++ = in1[i] + in0[i / stride]; - } + *out_ptr++ = in1[i] + in0[i / stride]; } - else - return -1; - break; + } + else + return -1; + break; - default: - break; + default: + break; } return 0; @@ -93,7 +92,7 @@ int eltwise_run(struct tensor* output_tensor, struct tensor* input_tensor0, stru struct eltwise_param* eltwise_param, int num_thread) { // input - float* input0 = ( float* )input_tensor0->data; + float* input0 = (float*)input_tensor0->data; int in_size0 = input_tensor0->elem_num; float* input1 = NULL; @@ -101,7 +100,7 @@ int eltwise_run(struct tensor* output_tensor, struct tensor* input_tensor0, stru if (input_tensor1) { - input1 = ( float* )input_tensor1->data; + input1 = (float*)input_tensor1->data; in_size1 = input_tensor1->elem_num; } @@ -133,7 +132,7 @@ int eltwise_run(struct tensor* output_tensor, struct tensor* input_tensor0, stru // int input_number=node->GetInputNum(); // output - float* output = ( float* )output_tensor->data; + float* output = (float*)output_tensor->data; int result = 0; int stride = input_tensor_tmp->dims[2] * input_tensor_tmp->dims[3]; diff --git a/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.h b/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.h index ab78cc214..3bb1e3f05 100644 --- a/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.h +++ b/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.h @@ -28,7 +28,6 @@ #include "graph/tensor.h" - int eltwise_run(struct tensor* output_tensor, struct tensor* input_tensor0, struct tensor* input_tensor1, struct eltwise_param* eltwise_param, int num_thread); diff --git a/source/device/cpu/op/eltwise/eltwise_ref.c b/source/device/cpu/op/eltwise/eltwise_ref.c index 0b6f14bf2..58c901056 100644 --- a/source/device/cpu/op/eltwise/eltwise_ref.c +++ b/source/device/cpu/op/eltwise/eltwise_ref.c @@ -36,7 +36,6 @@ #include - struct eltwise_op_param { float scale[3]; @@ -49,214 +48,234 @@ struct eltwise_op_param static int ref_eltwise_fp32(void* output, void* input0, void* input1, int type, int input_count4, int input_chan, int input_hw, int input1_count4, int num_thread, int input_hw_1, struct eltwise_param* eltwise_param) { - float* out_ptr = ( float* )output; - float* in0 = ( float* )input0; - float* in1 = ( float* )input1; + float* out_ptr = (float*)output; + float* in0 = (float*)input0; + float* in1 = (float*)input1; switch (type) { - case ELT_SUB: - if (input1_count4 == 1) - { - for (int i = 0; i < input_count4; ++i) - { - *out_ptr++ = (*in0++) - in1[0]; - } - } - else if (input_count4 == input1_count4) + case ELT_SUB: + if (input1_count4 == 1) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - *out_ptr++ = (*in0++) - (*in1++); - } + *out_ptr++ = (*in0++) - in1[0]; } - else if (input_chan == input1_count4) + } + else if (input_count4 == input1_count4) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - *out_ptr++ = in0[i] - in1[i / input_hw]; - } + *out_ptr++ = (*in0++) - (*in1++); } - else - return -1; - break; - case ELT_SUM: - if (input1_count4 == 1) + } + else if (input_chan == input1_count4) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - *out_ptr++ = (*in0++) + in1[0]; - } + *out_ptr++ = in0[i] - in1[i / input_hw]; } - else if (input_count4 == input1_count4) + } + else + return -1; + break; + case ELT_SUM: + if (input1_count4 == 1) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - *out_ptr++ = (*in0++) + (*in1++); - } + *out_ptr++ = (*in0++) + in1[0]; } - else if (input_chan == input1_count4) + } + else if (input_count4 == input1_count4) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - *out_ptr++ = in0[i] + in1[i / input_hw]; - } + *out_ptr++ = (*in0++) + (*in1++); } - else if(input_hw == input_hw_1){ - for( int i = 0; i < input_chan; i++){ - for(int j = 0; j < input_hw; j++){ - *out_ptr++ = in0[i*input_hw + j] + in1[j]; - } - } - // TLOG_ERR("%d %d \n", input1_count4, input_chan); - } - else - return -1; - break; - case ELT_MAX: + } + else if (input_chan == input1_count4) + { for (int i = 0; i < input_count4; ++i) { - *out_ptr++ = ELT_MAX(in0[i], in1[i]); + *out_ptr++ = in0[i] + in1[i / input_hw]; } - break; - case ELT_PROD: - if (input1_count4 == 1) + } + else if (input_hw == input_hw_1) + { + for (int i = 0; i < input_chan; i++) { - for (int i = 0; i < input_count4; ++i) + for (int j = 0; j < input_hw; j++) { - *out_ptr++ = (*in0++) * in1[0]; + *out_ptr++ = in0[i * input_hw + j] + in1[j]; } } - else if (input_count4 == input1_count4) + // TLOG_ERR("%d %d \n", input1_count4, input_chan); + } + else + return -1; + break; + case ELT_MAX: + for (int i = 0; i < input_count4; ++i) + { + *out_ptr++ = ELT_MAX(in0[i], in1[i]); + } + break; + case ELT_PROD: + if (input1_count4 == 1) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - *out_ptr++ = in0[i] * in1[i]; - } + *out_ptr++ = (*in0++) * in1[0]; } - else if(input_count4 == 1) + } + else if (input_count4 == input1_count4) + { + for (int i = 0; i < input_count4; ++i) { - for(int i = 0; i < input1_count4; ++i) - { - *out_ptr++ = (in1[i]) * in0[0]; - } + *out_ptr++ = in0[i] * in1[i]; } - else if (input_chan == input1_count4) + } + else if (input_count4 == 1) + { + for (int i = 0; i < input1_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - *out_ptr++ = in0[i] * in1[i / input_hw]; - } - } - else if (input_chan == input_count4){ - for(int i = 0; i < input1_count4; i++) - { - *out_ptr++ = in0[i/input_hw] * in1[i]; - } + *out_ptr++ = (in1[i]) * in0[0]; } - else - return -1; - break; - case ELT_RSQRT: + } + else if (input_chan == input1_count4) + { for (int i = 0; i < input_count4; ++i) { - *out_ptr++ = 1 / sqrt(in0[i]); + *out_ptr++ = in0[i] * in1[i / input_hw]; } - break; - case ELT_MIN_SCALAR: - for (int i = 0; i < input_count4; ++i) + } + else if (input_chan == input_count4) + { + for (int i = 0; i < input1_count4; i++) { - *out_ptr++ = ELT_MIN((*in0++), in1[0]); + *out_ptr++ = in0[i / input_hw] * in1[i]; } - break; - case ELT_SUB_SCALAR: + } + else + return -1; + break; + case ELT_RSQRT: + for (int i = 0; i < input_count4; ++i) + { + *out_ptr++ = 1 / sqrt(in0[i]); + } + break; + case ELT_MIN_SCALAR: + for (int i = 0; i < input_count4; ++i) + { + *out_ptr++ = ELT_MIN((*in0++), in1[0]); + } + break; + case ELT_SUB_SCALAR: + for (int i = 0; i < input_count4; ++i) + { + *out_ptr++ = (*in0++) - in1[0]; + } + break; + case ELT_PROD_SCALAR: + for (int i = 0; i < input_count4; ++i) + { + *out_ptr++ = (*in0++) * in1[0]; + } + break; + case ELT_DIV: + if (input1_count4 == 1) + { for (int i = 0; i < input_count4; ++i) { - *out_ptr++ = (*in0++) - in1[0]; + *out_ptr++ = in0[i] / in1[0]; } - break; - case ELT_PROD_SCALAR: + } + else if (input_count4 == input1_count4) + { for (int i = 0; i < input_count4; ++i) { - *out_ptr++ = (*in0++) * in1[0]; + *out_ptr++ = in0[i] / in1[i]; } - break; - case ELT_DIV: - if (input1_count4 == 1) + } + else if (input_count4 == 1) + { + for (int i = 0; i < input1_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - *out_ptr++ = in0[i] / in1[0]; - } + *out_ptr++ = in0[0] / (*in1++); } - else if (input_count4 == input1_count4) + } + else + { + break; + } + break; + case ELT_POW: + if (input_count4 == 1) + { + for (int i = 0; i < input1_count4; i++) { - for (int i = 0; i < input_count4; ++i) - { - *out_ptr++ = in0[i] / in1[i]; - } + *out_ptr++ = powf(in0[0], in1[i]); } - else if (input_count4 == 1) + } + else if (input1_count4 == 1) + { + for (int i = 0; i < input1_count4; i++) { - for (int i = 0; i < input1_count4; ++i) - { - *out_ptr++ = in0[0] / (*in1++); - } + *out_ptr++ = powf(in0[0], in1[i]); } - else + } + else if (input_count4 == input1_count4) + { + for (int i = 0; i < input_count4; i++) { - break; - } - break; - case ELT_POW: - if(input_count4 == 1){ - for(int i = 0; i < input1_count4; i++){ - *out_ptr++ = powf(in0[0], in1[i]); - } - } else if (input1_count4 == 1){ - for(int i = 0; i < input1_count4; i++){ - *out_ptr++ = powf(in0[0], in1[i]); - } - } else if (input_count4 == input1_count4){ - for(int i = 0; i < input_count4; i++){ - *out_ptr++ = powf(in0[i], in1[i]); - } - } else { - TLOG_ERR("Case not support \n"); - } - break; - case ELT_POWER: - for(int i = 0; i < input_count4; i++){ - *out_ptr++ = powf((eltwise_param->shift + eltwise_param->scale * in0[i]), eltwise_param->power); - } - break; - case ELT_LOG: - for(int i = 0; i < input_count4; i++){ - *out_ptr++ = log(in0[i]); - } - break; - case ELT_EXP: - for(int i = 0; i < input_count4; i++){ - *out_ptr++ = exp(in0[i]); - } - break; - case ELT_SQRT: - for(int i = 0; i < input_count4; i++){ - *out_ptr++ = sqrt(in0[i]); - } - break; - case ELT_FLOOR: - for(int i = 0; i < input_count4; i++){ - *out_ptr++ = floor(in0[i]); - } - break; - case ELT_SQUARE: - for(int i = 0; i < input_count4; i++){ - *out_ptr++ = pow(in0[i], 2); + *out_ptr++ = powf(in0[i], in1[i]); } - break; - default: - break; + } + else + { + TLOG_ERR("Case not support \n"); + } + break; + case ELT_POWER: + for (int i = 0; i < input_count4; i++) + { + *out_ptr++ = powf((eltwise_param->shift + eltwise_param->scale * in0[i]), eltwise_param->power); + } + break; + case ELT_LOG: + for (int i = 0; i < input_count4; i++) + { + *out_ptr++ = log(in0[i]); + } + break; + case ELT_EXP: + for (int i = 0; i < input_count4; i++) + { + *out_ptr++ = exp(in0[i]); + } + break; + case ELT_SQRT: + for (int i = 0; i < input_count4; i++) + { + *out_ptr++ = sqrt(in0[i]); + } + break; + case ELT_FLOOR: + for (int i = 0; i < input_count4; i++) + { + *out_ptr++ = floor(in0[i]); + } + break; + case ELT_SQUARE: + for (int i = 0; i < input_count4; i++) + { + *out_ptr++ = pow(in0[i], 2); + } + break; + default: + break; } return 0; @@ -266,9 +285,9 @@ static int ref_eltwise_uint8(struct tensor* output_tensor, struct tensor* input_ struct tensor* input_tensor1, int type, int input_count4, int input_chan, int input_hw, int input1_count4, int num_thread, int input_hw_1, struct eltwise_param* eltwise_param) { - uint8_t* input0_uint8 = ( uint8_t* )input_tensor0->data; + uint8_t* input0_uint8 = (uint8_t*)input_tensor0->data; uint8_t* input1_uint8 = NULL; - uint8_t* output_uint8 = ( uint8_t* )output_tensor->data; + uint8_t* output_uint8 = (uint8_t*)output_tensor->data; float in_scale0 = input_tensor0->scale; float in_scale1 = 0.f; @@ -278,216 +297,234 @@ static int ref_eltwise_uint8(struct tensor* output_tensor, struct tensor* input_ int out_zero = output_tensor->zero_point; /* input dequant */ - float* in0 = ( float* )sys_malloc(input_tensor0->elem_num * sizeof(float)); + float* in0 = (float*)sys_malloc(input_tensor0->elem_num * sizeof(float)); float* in1 = NULL; - float* out_ptr = ( float* )sys_malloc(output_tensor->elem_num * sizeof(float)); + float* out_ptr = (float*)sys_malloc(output_tensor->elem_num * sizeof(float)); for (int i = 0; i < input_tensor0->elem_num; i++) in0[i] = (input0_uint8[i] - in_zero0) * in_scale0; if (input_tensor1 != NULL) { - input1_uint8 = ( uint8_t* )input_tensor1->data; + input1_uint8 = (uint8_t*)input_tensor1->data; in_scale1 = input_tensor1->scale; in_zero1 = input_tensor1->zero_point; - in1 = ( float* )sys_malloc(input_tensor1->elem_num * sizeof(float)); + in1 = (float*)sys_malloc(input_tensor1->elem_num * sizeof(float)); for (int i = 0; i < input_tensor1->elem_num; i++) in1[i] = (input1_uint8[i] - in_zero1) * in_scale1; } /* eltwise operator */ switch (type) { - case ELT_SUB: - if (input_count4 == input1_count4) - { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] - in1[i]; - } - } - else if (input_chan == input1_count4) - { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] - in1[i / input_hw]; - } - } - else if (input1_count4 == 1) + case ELT_SUB: + if (input_count4 == input1_count4) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] - in1[0]; - } + out_ptr[i] = in0[i] - in1[i]; } - else - return -1; - break; - case ELT_SUM: - if (input1_count4 == 1) + } + else if (input_chan == input1_count4) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] + in1[0]; - } + out_ptr[i] = in0[i] - in1[i / input_hw]; } - else if (input_count4 == input1_count4) + } + else if (input1_count4 == 1) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] + in1[i]; - } + out_ptr[i] = in0[i] - in1[0]; } - else if (input_chan == input1_count4) + } + else + return -1; + break; + case ELT_SUM: + if (input1_count4 == 1) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] + in1[i / input_hw]; - } + out_ptr[i] = in0[i] + in1[0]; } - else if(input_hw == input_hw_1){ - for( int i = 0; i < input_chan; i++){ - for(int j = 0; j < input_hw; j++){ - out_ptr[i] = in0[i*input_hw + j] + in1[j]; - } - } - } - else - return -1; - break; - case ELT_MAX: + } + else if (input_count4 == input1_count4) + { for (int i = 0; i < input_count4; ++i) { - out_ptr[i] = ELT_MAX(in0[i], in1[i]); + out_ptr[i] = in0[i] + in1[i]; } - break; - case ELT_PROD: - if (input1_count4 == 1) + } + else if (input_chan == input1_count4) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] * in1[0]; - } + out_ptr[i] = in0[i] + in1[i / input_hw]; } - else if (input_count4 == input1_count4) + } + else if (input_hw == input_hw_1) + { + for (int i = 0; i < input_chan; i++) { - for (int i = 0; i < input_count4; ++i) + for (int j = 0; j < input_hw; j++) { - out_ptr[i] = in0[i] * in1[i]; + out_ptr[i] = in0[i * input_hw + j] + in1[j]; } } - else if (input_chan == input1_count4) + } + else + return -1; + break; + case ELT_MAX: + for (int i = 0; i < input_count4; ++i) + { + out_ptr[i] = ELT_MAX(in0[i], in1[i]); + } + break; + case ELT_PROD: + if (input1_count4 == 1) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] * in1[i / input_hw]; - } + out_ptr[i] = in0[i] * in1[0]; } - else - return -1; - break; - case ELT_RSQRT: + } + else if (input_count4 == input1_count4) + { for (int i = 0; i < input_count4; ++i) { - out_ptr[i] = 1 / sqrt(in0[i]); + out_ptr[i] = in0[i] * in1[i]; } - break; - case ELT_MIN_SCALAR: + } + else if (input_chan == input1_count4) + { for (int i = 0; i < input_count4; ++i) { - out_ptr[i] = ELT_MIN(in0[i], in1[0]); + out_ptr[i] = in0[i] * in1[i / input_hw]; } - break; - case ELT_SUB_SCALAR: + } + else + return -1; + break; + case ELT_RSQRT: + for (int i = 0; i < input_count4; ++i) + { + out_ptr[i] = 1 / sqrt(in0[i]); + } + break; + case ELT_MIN_SCALAR: + for (int i = 0; i < input_count4; ++i) + { + out_ptr[i] = ELT_MIN(in0[i], in1[0]); + } + break; + case ELT_SUB_SCALAR: + for (int i = 0; i < input_count4; ++i) + { + out_ptr[i] = in0[i] - in1[0]; + } + break; + case ELT_PROD_SCALAR: + for (int i = 0; i < input_count4; ++i) + { + out_ptr[i] = in0[i] * in1[0]; + } + break; + case ELT_DIV: + if (input1_count4 == 1) + { for (int i = 0; i < input_count4; ++i) { - out_ptr[i] = in0[i] - in1[0]; + out_ptr[i] = in0[i] / in1[0]; } - break; - case ELT_PROD_SCALAR: + } + else if (input_count4 == input1_count4) + { for (int i = 0; i < input_count4; ++i) { - out_ptr[i] = in0[i] * in1[0]; + out_ptr[i] = in0[i] / in1[i]; } - break; - case ELT_DIV: - if (input1_count4 == 1) + } + else if (input_count4 == 1) + { + for (int i = 0; i < input1_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] / in1[0]; - } + out_ptr[i] = in0[0] / in1[i]; } - else if (input_count4 == input1_count4) + } + else + { + break; + } + break; + case ELT_POW: + if (input_count4 == 1) + { + for (int i = 0; i < input1_count4; i++) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] / in1[i]; - } + out_ptr[i] = pow(in0[0], in1[i]); } - else if (input_count4 == 1) + } + else if (input1_count4 == 1) + { + for (int i = 0; i < input1_count4; i++) { - for (int i = 0; i < input1_count4; ++i) - { - out_ptr[i] = in0[0] / in1[i]; - } + out_ptr[i] = pow(in0[0], in1[i]); } - else + } + else if (input_count4 == input1_count4) + { + for (int i = 0; i < input_count4; i++) { - break; - } - break; - case ELT_POW: - if(input_count4 == 1){ - for(int i = 0; i < input1_count4; i++){ - out_ptr[i] = pow(in0[0], in1[i]); - } - } else if (input1_count4 == 1){ - for(int i = 0; i < input1_count4; i++){ - out_ptr[i] = pow(in0[0], in1[i]); - } - } else if (input_count4 == input1_count4){ - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = pow(in0[i], in1[i]); - } - } else { - TLOG_ERR("Case not support \n"); - } - break; - case ELT_POWER: - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = pow((eltwise_param->shift + eltwise_param->scale * in0[i]), eltwise_param->power); + out_ptr[i] = pow(in0[i], in1[i]); } - break; - case ELT_LOG: - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = log(in0[i]); - } - break; - case ELT_EXP: - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = exp(in0[i]); - } - break; - case ELT_SQRT: - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = sqrt(in0[i]); - } - break; - case ELT_FLOOR: - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = floor(in0[i]); - } - break; - case ELT_SQUARE: - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = pow(in0[i], 2); - } - break; - default: - break; + } + else + { + TLOG_ERR("Case not support \n"); + } + break; + case ELT_POWER: + for (int i = 0; i < input_count4; i++) + { + out_ptr[i] = pow((eltwise_param->shift + eltwise_param->scale * in0[i]), eltwise_param->power); + } + break; + case ELT_LOG: + for (int i = 0; i < input_count4; i++) + { + out_ptr[i] = log(in0[i]); + } + break; + case ELT_EXP: + for (int i = 0; i < input_count4; i++) + { + out_ptr[i] = exp(in0[i]); + } + break; + case ELT_SQRT: + for (int i = 0; i < input_count4; i++) + { + out_ptr[i] = sqrt(in0[i]); + } + break; + case ELT_FLOOR: + for (int i = 0; i < input_count4; i++) + { + out_ptr[i] = floor(in0[i]); + } + break; + case ELT_SQUARE: + for (int i = 0; i < input_count4; i++) + { + out_ptr[i] = pow(in0[i], 2); + } + break; + default: + break; } - /* output quant */ for (int i = 0; i < output_tensor->elem_num; i++) { @@ -508,228 +545,246 @@ static int ref_eltwise_uint8(struct tensor* output_tensor, struct tensor* input_ } static int ref_eltwise_int8(struct tensor* output_tensor, struct tensor* input_tensor0, - struct tensor* input_tensor1, int type, int input_count4, int input_chan, int input_hw, - int input1_count4, int num_thread, int input_hw_1, struct eltwise_param* eltwise_param) + struct tensor* input_tensor1, int type, int input_count4, int input_chan, int input_hw, + int input1_count4, int num_thread, int input_hw_1, struct eltwise_param* eltwise_param) { - int8_t* input0_int8 = ( int8_t* )input_tensor0->data; + int8_t* input0_int8 = (int8_t*)input_tensor0->data; int8_t* input1_int8 = NULL; - int8_t* output_int8 = ( int8_t* )output_tensor->data; + int8_t* output_int8 = (int8_t*)output_tensor->data; float in_scale0 = input_tensor0->scale; float in_scale1 = 0.f; float out_scale = output_tensor->scale; /* input dequant */ - float* in0 = ( float* )sys_malloc(input_tensor0->elem_num * sizeof(float)); + float* in0 = (float*)sys_malloc(input_tensor0->elem_num * sizeof(float)); float* in1 = NULL; - float* out_ptr = ( float* )sys_malloc(output_tensor->elem_num * sizeof(float)); + float* out_ptr = (float*)sys_malloc(output_tensor->elem_num * sizeof(float)); for (int i = 0; i < input_tensor0->elem_num; i++) - in0[i] = (float )input0_int8[i] * in_scale0; + in0[i] = (float)input0_int8[i] * in_scale0; if (input_tensor1 != NULL) { - input1_int8 = ( int8_t* )input_tensor1->data; + input1_int8 = (int8_t*)input_tensor1->data; in_scale1 = input_tensor1->scale; - in1 = ( float* )sys_malloc(input_tensor1->elem_num * sizeof(float)); + in1 = (float*)sys_malloc(input_tensor1->elem_num * sizeof(float)); for (int i = 0; i < input_tensor1->elem_num; i++) - in1[i] = (float )input1_int8[i] * in_scale1; + in1[i] = (float)input1_int8[i] * in_scale1; } /* eltwise operator */ switch (type) { - case ELT_SUB: - if (input_count4 == input1_count4) - { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] - in1[i]; - } - } - else if (input_chan == input1_count4) - { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] - in1[i / input_hw]; - } - } - else if (input1_count4 == 1) + case ELT_SUB: + if (input_count4 == input1_count4) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] - in1[0]; - } + out_ptr[i] = in0[i] - in1[i]; } - else - return -1; - break; - case ELT_SUM: - if (input1_count4 == 1) + } + else if (input_chan == input1_count4) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] + in1[0]; - } + out_ptr[i] = in0[i] - in1[i / input_hw]; } - else if (input_count4 == input1_count4) + } + else if (input1_count4 == 1) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] + in1[i]; - } + out_ptr[i] = in0[i] - in1[0]; } - else if (input_chan == input1_count4) + } + else + return -1; + break; + case ELT_SUM: + if (input1_count4 == 1) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] + in1[i / input_hw]; - } - } - else if(input_hw == input_hw_1){ - for( int i = 0; i < input_chan; i++){ - for(int j = 0; j < input_hw; j++){ - out_ptr[i] = in0[i*input_hw + j] + in1[j]; - } - } + out_ptr[i] = in0[i] + in1[0]; } - else - return -1; - break; - case ELT_MAX: + } + else if (input_count4 == input1_count4) + { for (int i = 0; i < input_count4; ++i) { - out_ptr[i] = ELT_MAX(in0[i], in1[i]); + out_ptr[i] = in0[i] + in1[i]; } - break; - case ELT_PROD: - if (input1_count4 == 1) + } + else if (input_chan == input1_count4) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] * in1[0]; - } + out_ptr[i] = in0[i] + in1[i / input_hw]; } - else if (input_count4 == input1_count4) + } + else if (input_hw == input_hw_1) + { + for (int i = 0; i < input_chan; i++) { - for (int i = 0; i < input_count4; ++i) + for (int j = 0; j < input_hw; j++) { - out_ptr[i] = in0[i] * in1[i]; + out_ptr[i] = in0[i * input_hw + j] + in1[j]; } } - else if (input_chan == input1_count4) + } + else + return -1; + break; + case ELT_MAX: + for (int i = 0; i < input_count4; ++i) + { + out_ptr[i] = ELT_MAX(in0[i], in1[i]); + } + break; + case ELT_PROD: + if (input1_count4 == 1) + { + for (int i = 0; i < input_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] * in1[i / input_hw]; - } + out_ptr[i] = in0[i] * in1[0]; } - else - return -1; - break; - case ELT_RSQRT: + } + else if (input_count4 == input1_count4) + { for (int i = 0; i < input_count4; ++i) { - out_ptr[i] = 1 / sqrt(in0[i]); + out_ptr[i] = in0[i] * in1[i]; } - break; - case ELT_MIN_SCALAR: + } + else if (input_chan == input1_count4) + { for (int i = 0; i < input_count4; ++i) { - out_ptr[i] = ELT_MIN(in0[i], in1[0]); + out_ptr[i] = in0[i] * in1[i / input_hw]; } - break; - case ELT_SUB_SCALAR: + } + else + return -1; + break; + case ELT_RSQRT: + for (int i = 0; i < input_count4; ++i) + { + out_ptr[i] = 1 / sqrt(in0[i]); + } + break; + case ELT_MIN_SCALAR: + for (int i = 0; i < input_count4; ++i) + { + out_ptr[i] = ELT_MIN(in0[i], in1[0]); + } + break; + case ELT_SUB_SCALAR: + for (int i = 0; i < input_count4; ++i) + { + out_ptr[i] = in0[i] - in1[0]; + } + break; + case ELT_PROD_SCALAR: + for (int i = 0; i < input_count4; ++i) + { + out_ptr[i] = in0[i] * in1[0]; + } + break; + case ELT_DIV: + if (input1_count4 == 1) + { for (int i = 0; i < input_count4; ++i) { - out_ptr[i] = in0[i] - in1[0]; + out_ptr[i] = in0[i] / in1[0]; } - break; - case ELT_PROD_SCALAR: + } + else if (input_count4 == input1_count4) + { for (int i = 0; i < input_count4; ++i) { - out_ptr[i] = in0[i] * in1[0]; + out_ptr[i] = in0[i] / in1[i]; } - break; - case ELT_DIV: - if (input1_count4 == 1) + } + else if (input_count4 == 1) + { + for (int i = 0; i < input1_count4; ++i) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] / in1[0]; - } + out_ptr[i] = in0[0] / in1[i]; } - else if (input_count4 == input1_count4) + } + else + { + break; + } + break; + case ELT_POW: + if (input_count4 == 1) + { + for (int i = 0; i < input1_count4; i++) { - for (int i = 0; i < input_count4; ++i) - { - out_ptr[i] = in0[i] / in1[i]; - } + out_ptr[i] = pow(in0[0], in1[i]); } - else if (input_count4 == 1) + } + else if (input1_count4 == 1) + { + for (int i = 0; i < input1_count4; i++) { - for (int i = 0; i < input1_count4; ++i) - { - out_ptr[i] = in0[0] / in1[i]; - } + out_ptr[i] = pow(in0[0], in1[i]); } - else + } + else if (input_count4 == input1_count4) + { + for (int i = 0; i < input_count4; i++) { - break; - } - break; - case ELT_POW: - if(input_count4 == 1){ - for(int i = 0; i < input1_count4; i++){ - out_ptr[i] = pow(in0[0], in1[i]); - } - } else if (input1_count4 == 1){ - for(int i = 0; i < input1_count4; i++){ - out_ptr[i] = pow(in0[0], in1[i]); - } - } else if (input_count4 == input1_count4){ - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = pow(in0[i], in1[i]); - } - } else { - TLOG_ERR("Case not support \n"); + out_ptr[i] = pow(in0[i], in1[i]); } - break; - case ELT_POWER: - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = pow((eltwise_param->shift + eltwise_param->scale * in0[i]), eltwise_param->power); - } - break; - case ELT_LOG: - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = log(in0[i]); - } - break; - case ELT_EXP: - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = exp(in0[i]); - } - break; - case ELT_SQRT: - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = sqrt(in0[i]); - } - break; - case ELT_FLOOR: - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = floor(in0[i]); - } - break; - case ELT_SQUARE: - for(int i = 0; i < input_count4; i++){ - out_ptr[i] = pow(in0[i], 2); - } - break; - default: - break; + } + else + { + TLOG_ERR("Case not support \n"); + } + break; + case ELT_POWER: + for (int i = 0; i < input_count4; i++) + { + out_ptr[i] = pow((eltwise_param->shift + eltwise_param->scale * in0[i]), eltwise_param->power); + } + break; + case ELT_LOG: + for (int i = 0; i < input_count4; i++) + { + out_ptr[i] = log(in0[i]); + } + break; + case ELT_EXP: + for (int i = 0; i < input_count4; i++) + { + out_ptr[i] = exp(in0[i]); + } + break; + case ELT_SQRT: + for (int i = 0; i < input_count4; i++) + { + out_ptr[i] = sqrt(in0[i]); + } + break; + case ELT_FLOOR: + for (int i = 0; i < input_count4; i++) + { + out_ptr[i] = floor(in0[i]); + } + break; + case ELT_SQUARE: + for (int i = 0; i < input_count4; i++) + { + out_ptr[i] = pow(in0[i], 2); + } + break; + default: + break; } - /* output quant */ for (int i = 0; i < output_tensor->elem_num; i++) { @@ -774,7 +829,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor0 = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct eltwise_param* eltwise_param = ( struct eltwise_param* )ir_node->op.param_mem; + struct eltwise_param* eltwise_param = (struct eltwise_param*)ir_node->op.param_mem; int layout = ir_graph->graph_layout; void* input0 = input_tensor0->data; @@ -789,7 +844,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input1 = input_tensor1->data; input1_count4 = input_tensor1->elem_num; int dim1_size = input_tensor1->dim_num; - input_hw_1 = input_tensor1->dims[dim1_size-2]*input_tensor1->dims[dim1_size-1]; + input_hw_1 = input_tensor1->dims[dim1_size - 2] * input_tensor1->dims[dim1_size - 1]; } if (!input_tensor1 || input_tensor0->elem_num >= input_tensor1->elem_num) @@ -800,11 +855,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int dim0_size = input_tensor0->dim_num; if (layout == TENGINE_LAYOUT_NCHW) { - input_chan_0 = input_tensor0->dims[dim0_size-3]; - if(input_tensor0->dims[dim0_size-4]){ - input_chan_0 *= input_tensor0->dims[dim0_size-4]; + input_chan_0 = input_tensor0->dims[dim0_size - 3]; + if (input_tensor0->dims[dim0_size - 4]) + { + input_chan_0 *= input_tensor0->dims[dim0_size - 4]; } - input_hw_0 = input_tensor0->dims[dim0_size-2] * input_tensor0->dims[dim0_size-1]; + input_hw_0 = input_tensor0->dims[dim0_size - 2] * input_tensor0->dims[dim0_size - 1]; } else if (layout == TENGINE_LAYOUT_NHWC) { @@ -825,7 +881,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread, input_hw_1, eltwise_param); else if (input_tensor0->data_type == TENGINE_DT_INT8) ret = ref_eltwise_int8(output_tensor, input_tensor0, input_tensor1, eltwise_param->type, input0_count4, - input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread, input_hw_1, eltwise_param); + input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread, input_hw_1, eltwise_param); else { TLOG_ERR("Input data type %d not to be supported.\n", input_tensor0->data_type); @@ -866,7 +922,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread, input_hw_1, eltwise_param); else if (output_tensor->data_type == TENGINE_DT_INT8) ret = ref_eltwise_int8(output_tensor, input_tensor1, input_tensor0, eltwise_param->type, input0_count4, - input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread, input_hw_1, eltwise_param); + input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread, input_hw_1, eltwise_param); else { TLOG_ERR("Input data type %d not to be supported.\n", output_tensor->data_type); diff --git a/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c b/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c index 3aca14499..1f7a7aad5 100644 --- a/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c +++ b/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c @@ -34,7 +34,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -59,7 +58,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct elu_param* elu_param = ( struct elu_param* )ir_node->op.param_mem; + struct elu_param* elu_param = (struct elu_param*)ir_node->op.param_mem; int num_thread = exec_graph->num_thread; diff --git a/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.c b/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.c index 78da9c64c..1125dc958 100644 --- a/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.c +++ b/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.c @@ -30,10 +30,9 @@ #include - static void elu_kernel(int i, int id, void* data, const float* input, float* output, float alpha) { - int elem_num = (( int* )data)[0]; + int elem_num = ((int*)data)[0]; float32x4_t _one = vdupq_n_f32(1.f); float32x4_t _zero = vdupq_n_f32(0.f); float32x4_t _alpha = vdupq_n_f32(alpha); @@ -67,8 +66,8 @@ static void elu_kernel(int i, int id, void* data, const float* input, float* out int elu_run(struct tensor* output_tensor, struct tensor* input_tensor, struct elu_param* elu_param, int num_thread) { - float* data = ( float* )input_tensor->data; - float* out_data = ( float* )output_tensor->data; + float* data = (float*)input_tensor->data; + float* out_data = (float*)output_tensor->data; float alpha = elu_param->alpha; int chan_num = (input_tensor->dims[0]) * (input_tensor->dims[1]); diff --git a/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.h b/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.h index 172ddf329..a3d937afe 100644 --- a/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.h +++ b/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.h @@ -29,7 +29,6 @@ #include "graph/tensor.h" - int elu_run(struct tensor* output_tensor, struct tensor* input_tensor, struct elu_param* elu_param, int num_thread); diff --git a/source/device/cpu/op/elu/elu_ref.c b/source/device/cpu/op/elu/elu_ref.c index 1db10f74e..1d41d940d 100644 --- a/source/device/cpu/op/elu/elu_ref.c +++ b/source/device/cpu/op/elu/elu_ref.c @@ -37,7 +37,6 @@ #include - typedef struct __elu_param { float scale; @@ -45,7 +44,6 @@ typedef struct __elu_param float alpha; } _elu_param, *p_elu_param; - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -89,12 +87,12 @@ int ref_elu_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int int input_size = input_tensor->elem_num; int output_size = output_tensor->elem_num; - float* data = ( float* )sys_malloc(input_size * sizeof(float)); - float* out_data = ( float* )sys_malloc(output_size * sizeof(float)); + float* data = (float*)sys_malloc(input_size * sizeof(float)); + float* out_data = (float*)sys_malloc(output_size * sizeof(float)); for (int i = 0; i < input_size; i++) { - data[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale; + data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } for (int i = 0; i < size; i++) @@ -108,7 +106,7 @@ int ref_elu_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int out_data[i] = data[i]; } } - + /* quant */ for (int i = 0; i < output_size; i++) { @@ -134,7 +132,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct elu_param* param = ( struct elu_param* )node->op.param_mem; + struct elu_param* param = (struct elu_param*)node->op.param_mem; int elem_num = input_tensor->elem_num; void* in_data = input_tensor->data; @@ -150,7 +148,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (input_tensor->data_type == TENGINE_DT_FP32) ref_elu_fp32((float*)in_data, (float*)out_data, elem_num, &op_param); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ref_elu_uint8(input_tensor, output_tensor, elem_num, &op_param); return 0; diff --git a/source/device/cpu/op/embedding/embedding_ref.c b/source/device/cpu/op/embedding/embedding_ref.c index 188a5aeb6..5fe920a6a 100644 --- a/source/device/cpu/op/embedding/embedding_ref.c +++ b/source/device/cpu/op/embedding/embedding_ref.c @@ -36,7 +36,6 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -62,7 +61,7 @@ int ref_embed_fp32(float* in_data, float* out_data, float* weight_data, float* b word_index = 0; if (word_index >= input_dim) word_index = input_dim - 1; - const float* embed = ( const float* )weight_data + num_output * word_index; + const float* embed = (const float*)weight_data + num_output * word_index; for (int z = 0; z < num_output; z++) { out_data[i * num_output + z] = embed[z]; @@ -82,7 +81,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct embedding_param* param = ( struct embedding_param* )node->op.param_mem; + struct embedding_param* param = (struct embedding_param*)node->op.param_mem; struct tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); struct tensor* bias_tensor = NULL; @@ -91,9 +90,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex bias_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]); } - return ref_embed_fp32((float*)input->data, (float*)output->data, (float*)weight_tensor->data, - bias_tensor ? (float*)bias_tensor->data : NULL, param->input_dim, param->num_output, - input->elem_size, param->bias_term, 1.0f, 0.0f); + return ref_embed_fp32((float*)input->data, (float*)output->data, (float*)weight_tensor->data, + bias_tensor ? (float*)bias_tensor->data : NULL, param->input_dim, param->num_output, + input->elem_size, param->bias_term, 1.0f, 0.0f); } static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) diff --git a/source/device/cpu/op/expand/expand_ref.c b/source/device/cpu/op/expand/expand_ref.c index e92b30d99..fc0bdcfe4 100644 --- a/source/device/cpu/op/expand/expand_ref.c +++ b/source/device/cpu/op/expand/expand_ref.c @@ -36,7 +36,6 @@ #include #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -47,7 +46,8 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, return 0; } -int ref_expand_fp32(float* in1_data, float* in2_data, float* out_data, int* in1_dims, int* in2_dims){ +int ref_expand_fp32(float* in1_data, float* in2_data, float* out_data, int* in1_dims, int* in2_dims) +{ int i_n = in1_dims[0] == 0 ? 1 : in1_dims[0]; int i_c = in1_dims[1] == 0 ? 1 : in1_dims[1]; int i_h = in1_dims[2] == 0 ? 1 : in1_dims[2]; @@ -59,26 +59,26 @@ int ref_expand_fp32(float* in1_data, float* in2_data, float* out_data, int* in1_ int o_w = in2_dims[3] == 0 ? 1 : in2_dims[3]; int int_max = INT_MAX; - if(i_n > int_max / i_c || i_h > int_max /(i_n*i_c) || i_w > int_max / (i_n * i_c * i_h)) + if (i_n > int_max / i_c || i_h > int_max / (i_n * i_c) || i_w > int_max / (i_n * i_c * i_h)) { TLOG_INFO("input dims overflow!"); return -1; } - if(o_n > int_max /o_c || o_h > int_max/(o_n*o_c)||o_w > int_max/(o_n*o_c*o_h)) + if (o_n > int_max / o_c || o_h > int_max / (o_n * o_c) || o_w > int_max / (o_n * o_c * o_h)) { TLOG_INFO("output dims overflow!"); return -1; } - + int index = 0; int i_index = 0; - if( 1 == i_n && 1 == i_h && 1 == i_w && 1 == o_n && i_c == o_c) + if (1 == i_n && 1 == i_h && 1 == i_w && 1 == o_n && i_c == o_c) { - for(int n = 0; n < o_n; ++n) + for (int n = 0; n < o_n; ++n) { - for(int c = 0; c < o_c ; c++) + for (int c = 0; c < o_c; c++) { - for(int i = 0; i < o_h*o_w; i++) + for (int i = 0; i < o_h * o_w; i++) { out_data[index++] = in1_data[i_index]; } @@ -86,23 +86,23 @@ int ref_expand_fp32(float* in1_data, float* in2_data, float* out_data, int* in1_ } } } - else + else { int i_size = i_n * i_c * i_h * i_w; int refreshed = 0; - for(int n = 0; n < o_n; n++) + for (int n = 0; n < o_n; n++) { - for(int c = 0; c < o_c; c++) + for (int c = 0; c < o_c; c++) { - for(int h = 0; h < o_h; ++h) + for (int h = 0; h < o_h; ++h) { - for(int w = 0; w < o_w; ++w) + for (int w = 0; w < o_w; ++w) { refreshed = 0; if (i_index == i_size) i_index = 0; out_data[index++] = in1_data[i_index]; - if (i_w != 1) + if (i_w != 1) { i_index++; refreshed = 1; @@ -136,34 +136,33 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input1_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* input2_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct expand_param* param = ( struct expand_param* )ir_node->op.param_mem; + struct expand_param* param = (struct expand_param*)ir_node->op.param_mem; int dim1_size = input1_tensor->dim_num; int dim2_size = input2_tensor->dim_num; - - int* input1_dims = (int*)malloc(sizeof(int)*4); - int* input2_dims = (int*)malloc(sizeof(int)*4); - for(int i = 0; i < 4; i++) + int* input1_dims = (int*)malloc(sizeof(int) * 4); + int* input2_dims = (int*)malloc(sizeof(int) * 4); + for (int i = 0; i < 4; i++) { input1_dims[i] = 0; } - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { input2_dims[i] = 0; } - for(int i = 0; i < dim1_size ; i++) + for (int i = 0; i < dim1_size; i++) { input1_dims[i] = input1_tensor->dims[i]; } - for(int i = 0; i < param->dim_num; i++) + for (int i = 0; i < param->dim_num; i++) { input2_dims[i] = param->ex_shape[i]; } - ref_expand_fp32((float*)input1_tensor->data, (float*)input2_tensor->data, - (float*)output_tensor->data, input1_dims, input2_dims); + ref_expand_fp32((float*)input1_tensor->data, (float*)input2_tensor->data, + (float*)output_tensor->data, input1_dims, input2_dims); free(input1_dims); free(input2_dims); @@ -177,12 +176,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc } static struct node_ops expand_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score}; int register_expand_ref_op() { diff --git a/source/device/cpu/op/expanddims/expanddims_ref.c b/source/device/cpu/op/expanddims/expanddims_ref.c index ae2c31a93..7cd37a4dd 100644 --- a/source/device/cpu/op/expanddims/expanddims_ref.c +++ b/source/device/cpu/op/expanddims/expanddims_ref.c @@ -34,7 +34,6 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; diff --git a/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.c b/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.c index b472c6fcc..28cf324c7 100644 --- a/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.c +++ b/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.c @@ -29,7 +29,6 @@ #include #include - void hgemv_1x8_a55(__fp16* biases, __fp16* input, __fp16* kernel, long kernel_size, __fp16* output); void hgemv_1x2_a55(__fp16* biases, __fp16* input, __fp16* kernel, long kernel_size, __fp16* output); @@ -41,12 +40,12 @@ void hgemv1x8(const __fp16* input, const __fp16* output, __fp16* weight_interlea __fp16 *cur_kernel, *cur_biases, *cur_result; // #pragma omp parallel for num_threads(num_thread) - for(ch = start_channel; ch < end_channel; ch += 8) + for (ch = start_channel; ch < end_channel; ch += 8) { - cur_kernel = ( __fp16* )(weight_interleaved + kernel_size * ch); - cur_result = ( __fp16* )(output + ch); - cur_biases = biases ? ( __fp16* )(biases + ch) : NULL; - hgemv_1x8_a55(cur_biases, ( __fp16* )input, cur_kernel, kernel_size, cur_result); // todo implement with A76 + cur_kernel = (__fp16*)(weight_interleaved + kernel_size * ch); + cur_result = (__fp16*)(output + ch); + cur_biases = biases ? (__fp16*)(biases + ch) : NULL; + hgemv_1x8_a55(cur_biases, (__fp16*)input, cur_kernel, kernel_size, cur_result); // todo implement with A76 } } @@ -58,26 +57,25 @@ void hgemv1x2(const __fp16* input, const __fp16* output, __fp16* weight_interlea int ch = 0; __fp16 *cur_kernel, *cur_biases, *cur_result; - for(ch = start_channel; ch < (end_channel & -2); ch += 2) + for (ch = start_channel; ch < (end_channel & -2); ch += 2) { - cur_kernel = ( __fp16* )(weight_interleaved + kernel_size * ch); - cur_result = ( __fp16* )(output + ch); - cur_biases = biases ? ( __fp16* )(biases + ch) : NULL; - hgemv_1x2_a55(cur_biases, ( __fp16* )input, cur_kernel, kernel_size, cur_result); + cur_kernel = (__fp16*)(weight_interleaved + kernel_size * ch); + cur_result = (__fp16*)(output + ch); + cur_biases = biases ? (__fp16*)(biases + ch) : NULL; + hgemv_1x2_a55(cur_biases, (__fp16*)input, cur_kernel, kernel_size, cur_result); } - if(end_channel & 0x1) + if (end_channel & 0x1) { - cur_kernel = ( __fp16* )(weight_interleaved + kernel_size * ch); - cur_result = ( __fp16* )(output + ch); + cur_kernel = (__fp16*)(weight_interleaved + kernel_size * ch); + cur_result = (__fp16*)(output + ch); sum = biases ? *(biases + ch) : 0.f; - for(int j = 0; j < kernel_size; j++) + for (int j = 0; j < kernel_size; j++) sum = sum + input[j] * cur_kernel[j]; *cur_result = sum; } } - static void interleave_kernel(const __fp16* kernel, __fp16* kernel_interleaved, int out_chan, int kernel_size) { int i, j, k; @@ -85,46 +83,45 @@ static void interleave_kernel(const __fp16* kernel, __fp16* kernel_interleaved, __fp16* cur_kernel_interleaved; // interleave 8 kernel - for(i = 0; i < (out_chan & -8); i += 8) + for (i = 0; i < (out_chan & -8); i += 8) { - for(j = 0; j < 8; j++) - cur_kernel[j] = ( __fp16* )kernel + kernel_size * (i + j); - cur_kernel_interleaved = ( __fp16* )kernel_interleaved + kernel_size * i; - for(k = 0; k < kernel_size; k++) - for(j = 0; j < 8; j++) + for (j = 0; j < 8; j++) + cur_kernel[j] = (__fp16*)kernel + kernel_size * (i + j); + cur_kernel_interleaved = (__fp16*)kernel_interleaved + kernel_size * i; + for (k = 0; k < kernel_size; k++) + for (j = 0; j < 8; j++) cur_kernel_interleaved[8 * k + j] = *(cur_kernel[j] + k); } // interleave 2 kernel - for(; i < (out_chan & -2); i += 2) + for (; i < (out_chan & -2); i += 2) { - for(j = 0; j < 2; j++) - cur_kernel[j] = ( __fp16* )kernel + kernel_size * (i + j); - cur_kernel_interleaved = ( __fp16* )kernel_interleaved + kernel_size * i; - for(k = 0; k < kernel_size; k++) - for(j = 0; j < 2; j++) + for (j = 0; j < 2; j++) + cur_kernel[j] = (__fp16*)kernel + kernel_size * (i + j); + cur_kernel_interleaved = (__fp16*)kernel_interleaved + kernel_size * i; + for (k = 0; k < kernel_size; k++) + for (j = 0; j < 2; j++) cur_kernel_interleaved[2 * k + j] = *(cur_kernel[j] + k); } // copy last kernel - if(out_chan & 0x1) + if (out_chan & 0x1) { - cur_kernel[0] = ( __fp16* )kernel + kernel_size * i; - cur_kernel_interleaved = ( __fp16* )kernel_interleaved + kernel_size * i; - for(k = 0; k < kernel_size; k++) + cur_kernel[0] = (__fp16*)kernel + kernel_size * i; + cur_kernel_interleaved = (__fp16*)kernel_interleaved + kernel_size * i; + for (k = 0; k < kernel_size; k++) cur_kernel_interleaved[k] = *(cur_kernel[0] + k); } return; } -int fp16_fc_kernel_prerun(struct tensor* input_tensor , \ - struct tensor* filter_tensor , \ - struct tensor* output_tensor , \ - struct fc_priv_info* priv_info , \ - struct fc_param* param) +int fp16_fc_kernel_prerun(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* output_tensor, + struct fc_priv_info* priv_info, + struct fc_param* param) { - int num_output = param->num_output; int kernel_size = filter_tensor->dims[1]; int kernel_align = ((kernel_size + 1) & -2); @@ -151,14 +148,13 @@ int fp16_fc_kernel_prerun(struct tensor* input_tensor , \ return 0; } - -int fp16_fc_kernel_run(struct tensor* input_tensor , \ - struct tensor* filter_tensor , \ - struct tensor* bias_tensor , \ - struct tensor* output_tensor , \ - struct fc_priv_info* priv_info , \ - struct fc_param* param, \ - int num_thread, int cpu_affinity) +int fp16_fc_kernel_run(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* bias_tensor, + struct tensor* output_tensor, + struct fc_priv_info* priv_info, + struct fc_param* param, + int num_thread, int cpu_affinity) { int out_num = param->num_output; int kernel_size = filter_tensor->dims[1]; @@ -172,16 +168,15 @@ int fp16_fc_kernel_run(struct tensor* input_tensor , \ int out_num_8 = out_num & ~7; - for(int i = 0; i < input_tensor->dims[0]; i++) + for (int i = 0; i < input_tensor->dims[0]; i++) { __fp16* cur_input = input + i * kernel_size; __fp16* cur_output = output + i * out_num; hgemv1x8(cur_input, cur_output, weight, biases, kernel_size, 0, out_num_8, num_thread, cpu_affinity); - if(out_num & 0x7) + if (out_num & 0x7) hgemv1x2(cur_input, cur_output, weight, biases, kernel_size, out_num_8, out_num, num_thread, cpu_affinity); } - return 0 ; - + return 0; } diff --git a/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.h b/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.h index 50b84a2ae..b3dfd0c13 100644 --- a/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.h +++ b/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.h @@ -27,20 +27,18 @@ #include "../fc_kernel_arm.h" - -int fp16_fc_kernel_prerun(struct tensor* input_tensor , \ - struct tensor* filter_tensor , \ - struct tensor* output_tensor , \ - struct fc_priv_info* priv_info , \ - struct fc_param* param) ; - -int fp16_fc_kernel_run(struct tensor* input_tensor , \ - struct tensor* filter_tensor ,\ - struct tensor* bias_tensor , \ - struct tensor* output_tensor , \ - struct fc_priv_info* priv_info , \ - struct fc_param* param, \ - int num_thread, int cpu_affinity) ; - +int fp16_fc_kernel_prerun(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* output_tensor, + struct fc_priv_info* priv_info, + struct fc_param* param); + +int fp16_fc_kernel_run(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* bias_tensor, + struct tensor* output_tensor, + struct fc_priv_info* priv_info, + struct fc_param* param, + int num_thread, int cpu_affinity); #endif diff --git a/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c b/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c index 2a7364106..d9322b864 100644 --- a/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c +++ b/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c @@ -41,12 +41,10 @@ #include - #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include "cortex_a/fc_kernel_fp16_arm82.h" #endif - static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -55,8 +53,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct fc_priv_info* priv_info = ( struct fc_priv_info* )exec_node->ops_priv; - struct fc_param* fc_param = ( struct fc_param* )ir_node->op.param_mem; + struct fc_priv_info* priv_info = (struct fc_priv_info*)exec_node->ops_priv; + struct fc_param* fc_param = (struct fc_param*)ir_node->op.param_mem; /* fp32 prerun */ if (exec_graph->mode == TENGINE_MODE_FP32) @@ -71,7 +69,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC else if (exec_graph->mode == TENGINE_MODE_FP16) { - if(fp16_fc_kernel_prerun(input_tensor, filter_tensor, output_tensor, priv_info, fc_param) < 0) + if (fp16_fc_kernel_prerun(input_tensor, filter_tensor, output_tensor, priv_info, fc_param) < 0) { TLOG_ERR("hcl fp16 fc prerun failed\n"); // set_tengine_errno(EFAULT); @@ -79,14 +77,14 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct } } #endif - else if(exec_graph->mode == TENGINE_MODE_INT8) - { + else if (exec_graph->mode == TENGINE_MODE_INT8) + { if (int8_fc_kernel_prerun(input_tensor, filter_tensor, output_tensor, priv_info, fc_param) < 0) { TLOG_ERR("hcl fc prerun failed\n"); return -1; } - } + } else { TLOG_ERR("Tengine work node not support %d\n", exec_graph->mode); @@ -114,14 +112,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct fc_param* fc_param = ( struct fc_param* )ir_node->op.param_mem; - struct fc_priv_info* priv_info = ( struct fc_priv_info* )exec_node->ops_priv; + struct fc_param* fc_param = (struct fc_param*)ir_node->op.param_mem; + struct fc_priv_info* priv_info = (struct fc_priv_info*)exec_node->ops_priv; /* fp32 run */ if (exec_graph->mode == TENGINE_MODE_FP32) { if (fc_kernel_run(input_tensor, weight_tensor, bias_tensor, output_tensor, priv_info, fc_param, num_thread, - cpu_affinity) < 0) + cpu_affinity) + < 0) { TLOG_ERR("hcl fc run failed\n"); return -1; @@ -139,14 +138,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex } } #endif - else if (exec_graph->mode == TENGINE_MODE_INT8) - { - if (int8_fc_kernel_run(input_tensor, weight_tensor, bias_tensor, output_tensor, priv_info,fc_param,num_thread,cpu_affinity) < 0) + else if (exec_graph->mode == TENGINE_MODE_INT8) + { + if (int8_fc_kernel_run(input_tensor, weight_tensor, bias_tensor, output_tensor, priv_info, fc_param, num_thread, cpu_affinity) < 0) { TLOG_ERR("hcl fc run failed\n"); return -1; } - } + } else { TLOG_ERR("Tengine work node not support %d\n", exec_graph->mode); @@ -229,7 +228,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct fc_priv_info* priv_info = ( struct fc_priv_info* )exec_node->ops_priv; + struct fc_priv_info* priv_info = (struct fc_priv_info*)exec_node->ops_priv; /* fp32 postrun */ if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8) @@ -251,7 +250,7 @@ static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struc static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { /* init the private info data of convolution op */ - struct fc_priv_info* priv_info = ( struct fc_priv_info* )sys_malloc(sizeof(struct fc_priv_info)); + struct fc_priv_info* priv_info = (struct fc_priv_info*)sys_malloc(sizeof(struct fc_priv_info)); if (priv_info == NULL) { return -1; @@ -264,7 +263,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct fc_priv_info* priv_info = ( struct fc_priv_info* )exec_node->ops_priv; + struct fc_priv_info* priv_info = (struct fc_priv_info*)exec_node->ops_priv; sys_free(priv_info); exec_node->ops_priv = NULL; @@ -283,7 +282,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; #else if (input_tensor->data_type != TENGINE_DT_FP32 - // && input_tensor->data_type != TENGINE_DT_INT8 // 从tengine移植的 fc int8 arm 与 fc int8 ref 相比相差较大,暂且关闭 + // && input_tensor->data_type != TENGINE_DT_INT8 // 从tengine移植的 fc int8 arm 与 fc int8 ref 相比相差较大,暂且关闭 ) return 0; #endif @@ -297,8 +296,7 @@ static struct node_ops hcl_node_ops = {.prerun = prerun, .postrun = postrun, .init_node = init_node, .release_node = release_node, - .score = score -}; + .score = score}; int register_fc_hcl_arm_op() { diff --git a/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.c b/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.c index 15c46faee..42ea8ca63 100644 --- a/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.c +++ b/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.c @@ -32,7 +32,6 @@ #include #include - #ifdef __aarch64__ void sgemv_1x8_a72(float* biases, float* input, float* kernel, long kernel_size, float* output); void sgemv_1x2_a72(float* biases, float* input, float* kernel, long kernel_size, float* output); @@ -96,8 +95,8 @@ static void interleave_kernel(const float* kernel, float* kernel_interleaved, in for (i = 0; i < (out_chan & -8); i += 8) { for (j = 0; j < 8; j++) - cur_kernel[j] = ( float* )kernel + kernel_size * (i + j); - cur_kernel_interleaved = ( float* )kernel_interleaved + kernel_size * i; + cur_kernel[j] = (float*)kernel + kernel_size * (i + j); + cur_kernel_interleaved = (float*)kernel_interleaved + kernel_size * i; for (k = 0; k < kernel_size; k++) for (j = 0; j < 8; j++) cur_kernel_interleaved[8 * k + j] = *(cur_kernel[j] + k); @@ -107,8 +106,8 @@ static void interleave_kernel(const float* kernel, float* kernel_interleaved, in for (; i < (out_chan & -2); i += 2) { for (j = 0; j < 2; j++) - cur_kernel[j] = ( float* )kernel + kernel_size * (i + j); - cur_kernel_interleaved = ( float* )kernel_interleaved + kernel_size * i; + cur_kernel[j] = (float*)kernel + kernel_size * (i + j); + cur_kernel_interleaved = (float*)kernel_interleaved + kernel_size * i; for (k = 0; k < kernel_size; k++) for (j = 0; j < 2; j++) cur_kernel_interleaved[2 * k + j] = *(cur_kernel[j] + k); @@ -117,8 +116,8 @@ static void interleave_kernel(const float* kernel, float* kernel_interleaved, in // copy last kernel if (out_chan & 0x1) { - cur_kernel[0] = ( float* )kernel + kernel_size * i; - cur_kernel_interleaved = ( float* )kernel_interleaved + kernel_size * i; + cur_kernel[0] = (float*)kernel + kernel_size * i; + cur_kernel_interleaved = (float*)kernel_interleaved + kernel_size * i; for (k = 0; k < kernel_size; k++) cur_kernel_interleaved[k] = *(cur_kernel[0] + k); } @@ -127,7 +126,7 @@ static void interleave_kernel(const float* kernel, float* kernel_interleaved, in int fc_kernel_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, struct fc_priv_info* priv_info, struct fc_param* param) { - int num_output = param->num_output; + int num_output = param->num_output; int kernel_size = filter_tensor->dims[1]; if (!priv_info->interleave_buffer) @@ -139,8 +138,8 @@ int fc_kernel_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, priv_info->interleave_buffer_size = mem_size; } - float* filter_data = ( float* )filter_tensor->data; - interleave_kernel(filter_data, ( float* )priv_info->interleave_buffer, num_output, kernel_size); + float* filter_data = (float*)filter_tensor->data; + interleave_kernel(filter_data, (float*)priv_info->interleave_buffer, num_output, kernel_size); return 0; } @@ -170,7 +169,7 @@ int fc_kernel_run(struct tensor* input_tensor, struct tensor* filter_tensor, str int out_num = param->num_output; int kernel_size = filter_tensor->dims[1]; - float* input = input_tensor->data; + float* input = input_tensor->data; float* output = output_tensor->data; float* biases = NULL; if (bias_tensor) diff --git a/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.h b/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.h index 5b6e7e835..2ff456372 100644 --- a/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.h +++ b/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.h @@ -31,7 +31,6 @@ #include "graph/node.h" #include "graph/graph.h" - struct fc_priv_info { void* interleave_buffer; diff --git a/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.c b/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.c index af6fcb7e9..25143bfe7 100644 --- a/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.c +++ b/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.c @@ -31,19 +31,19 @@ #include - -void gemv_1x8_int8(int32_t *biases, const float *scales, int8_t *inp, int8_t *kernel, long kernel_size, - int8_t *output) { +void gemv_1x8_int8(int32_t* biases, const float* scales, int8_t* inp, int8_t* kernel, long kernel_size, + int8_t* output) +{ int8x8_t input; int8x16_t weight_0_1, weight_2_3, weight_4_5, weight_6_7; int16x8_t weight0_16, weight1_16, weight2_16, weight3_16; int16x8_t weight4_16, weight5_16, weight6_16, weight7_16; int32x4_t res = {0, 0, 0, 0}; int32x4_t res1 = {0, 0, 0, 0}; - int8_t *input_ptr = inp; - int8_t *weight_ptr = kernel; + int8_t* input_ptr = inp; + int8_t* weight_ptr = kernel; int remainw = (kernel_size >> 3) << 3; - for (int i = 0; i < remainw; i = i + 8) + for (int i = 0; i < remainw; i = i + 8) { input = vld1_s8(input_ptr); weight_0_1 = vld1q_s8(weight_ptr); @@ -74,7 +74,7 @@ void gemv_1x8_int8(int32_t *biases, const float *scales, int8_t *inp, int8_t *ke weight_ptr += 64; } - for (int i = remainw; i < kernel_size; ++i) + for (int i = remainw; i < kernel_size; ++i) { weight0_16 = vmull_s8(vdup_n_s8(input_ptr[0]), vld1_s8(weight_ptr)); res = vaddq_s32(vmovl_s16(vget_low_s16(weight0_16)), res); @@ -83,12 +83,12 @@ void gemv_1x8_int8(int32_t *biases, const float *scales, int8_t *inp, int8_t *ke weight_ptr += 8; } - if (biases) + if (biases) { int32x4_t bias = vld1q_s32(biases); int32x4_t bias1 = vld1q_s32(biases + 4); - res = vaddq_s32(res,bias); - res1 = vaddq_s32(res1,bias1); + res = vaddq_s32(res, bias); + res1 = vaddq_s32(res1, bias1); } float32x4_t res_f = vcvtq_f32_s32(res); @@ -99,8 +99,8 @@ void gemv_1x8_int8(int32_t *biases, const float *scales, int8_t *inp, int8_t *ke res_f = vmulq_f32(res_f, scale); res1_f = vmulq_f32(res1_f, scale_1); - res_f = vaddq_f32(res_f,vdupq_n_f32(0.5f)); - res1_f = vaddq_f32(res1_f,vdupq_n_f32(0.5f)); + res_f = vaddq_f32(res_f, vdupq_n_f32(0.5f)); + res1_f = vaddq_f32(res1_f, vdupq_n_f32(0.5f)); res = vcvtq_s32_f32(res_f); res1 = vcvtq_s32_f32(res1_f); @@ -114,18 +114,18 @@ void gemv_1x8_int8(int32_t *biases, const float *scales, int8_t *inp, int8_t *ke vst1_s8(output, result); } -void gemv_1x2_int8(const int32_t *biases, const float *scales, int8_t *inp, int8_t *kernel, long kernel_size, - int8_t *output) +void gemv_1x2_int8(const int32_t* biases, const float* scales, int8_t* inp, int8_t* kernel, long kernel_size, + int8_t* output) { - int8_t *input_ptr = inp; - int8_t *weight_ptr = kernel; + int8_t* input_ptr = inp; + int8_t* weight_ptr = kernel; int remainw = (kernel_size << 3) >> 3; int8x8x2_t weight; int8x8_t input; int16x8_t out_16_0, out_16_1; int32x4_t out_32_0, out_32_1; int32_t sum0 = 0, sum1 = 0; - for (int i = 0; i < remainw; i = i + 8) + for (int i = 0; i < remainw; i = i + 8) { weight = vld2_s8(weight_ptr); input = vld1_s8(input_ptr); @@ -133,15 +133,13 @@ void gemv_1x2_int8(const int32_t *biases, const float *scales, int8_t *inp, int8 out_16_1 = vmull_s8(weight.val[1], input); out_32_0 = vpaddlq_s16(out_16_0); out_32_1 = vpaddlq_s16(out_16_1); - sum0 += vgetq_lane_s32(out_32_0, 0) + vgetq_lane_s32(out_32_0, 1) + - vgetq_lane_s32(out_32_0, 2) + vgetq_lane_s32(out_32_0, 3); - sum1 += vgetq_lane_s32(out_32_1, 0) + vgetq_lane_s32(out_32_1, 1) + - vgetq_lane_s32(out_32_1, 2) + vgetq_lane_s32(out_32_1, 3); + sum0 += vgetq_lane_s32(out_32_0, 0) + vgetq_lane_s32(out_32_0, 1) + vgetq_lane_s32(out_32_0, 2) + vgetq_lane_s32(out_32_0, 3); + sum1 += vgetq_lane_s32(out_32_1, 0) + vgetq_lane_s32(out_32_1, 1) + vgetq_lane_s32(out_32_1, 2) + vgetq_lane_s32(out_32_1, 3); weight_ptr += 16; input_ptr += 8; } - for (int i = remainw; i < kernel_size; ++i) + for (int i = remainw; i < kernel_size; ++i) { sum0 += weight_ptr[0] * input_ptr[0]; sum1 += weight_ptr[1] * input_ptr[0]; @@ -149,7 +147,7 @@ void gemv_1x2_int8(const int32_t *biases, const float *scales, int8_t *inp, int8 weight_ptr += 2; } - if (biases) + if (biases) { sum0 += biases[0]; sum1 += biases[1]; @@ -172,51 +170,53 @@ void gemv_1x2_int8(const int32_t *biases, const float *scales, int8_t *inp, int8 } // start and end channel must be 8 aligned -void gemv1x8(const int8_t *input, const int8_t *output, int8_t *weight_interleaved, - const int32_t *biases, const float *scales, +void gemv1x8(const int8_t* input, const int8_t* output, int8_t* weight_interleaved, + const int32_t* biases, const float* scales, int kernel_size, int start_channel, int end_channel, int num_thread, - int cpu_affinity) + int cpu_affinity) { int ch = 0; int8_t *cur_kernel, *cur_result; - int32_t *cur_biases; - const float *cur_scales; + int32_t* cur_biases; + const float* cur_scales; // #pragma omp parallel for num_threads(num_thread) - for (ch = start_channel; ch < end_channel; ch += 8) + for (ch = start_channel; ch < end_channel; ch += 8) { - cur_kernel = (int8_t *) (weight_interleaved + kernel_size * ch); - cur_result = (int8_t *) (output + ch); - cur_biases = biases ? (int32_t *) (biases + ch) : NULL; + cur_kernel = (int8_t*)(weight_interleaved + kernel_size * ch); + cur_result = (int8_t*)(output + ch); + cur_biases = biases ? (int32_t*)(biases + ch) : NULL; cur_scales = scales + ch; - gemv_1x8_int8(cur_biases, cur_scales, (int8_t *) input, cur_kernel, kernel_size, + gemv_1x8_int8(cur_biases, cur_scales, (int8_t*)input, cur_kernel, kernel_size, cur_result); } } // start channel must be 2 aligned -void gemv1x2(const int8_t *input, int8_t *output, int8_t *weight_interleaved, - const int32_t *biases, const float *scales, - int kernel_size,int start_channel,int end_channel,int num_thread,int cpu_affinity) +void gemv1x2(const int8_t* input, int8_t* output, int8_t* weight_interleaved, + const int32_t* biases, const float* scales, + int kernel_size, int start_channel, int end_channel, int num_thread, int cpu_affinity) { int32_t sum; int ch = 0; - int8_t *cur_kernel; - int32_t *cur_biases; - int8_t *cur_result; + int8_t* cur_kernel; + int32_t* cur_biases; + int8_t* cur_result; const float* cur_scales; - for (ch = start_channel; ch < (end_channel & -2); ch += 2) { - cur_kernel = (int8_t *) (weight_interleaved + kernel_size * ch); - cur_result = (int8_t *) (output + ch); - cur_biases = biases ? (int32_t *) (biases + ch) : NULL; + for (ch = start_channel; ch < (end_channel & -2); ch += 2) + { + cur_kernel = (int8_t*)(weight_interleaved + kernel_size * ch); + cur_result = (int8_t*)(output + ch); + cur_biases = biases ? (int32_t*)(biases + ch) : NULL; cur_scales = scales + ch; - gemv_1x2_int8(cur_biases, cur_scales, (int8_t*) input, cur_kernel, kernel_size, cur_result); + gemv_1x2_int8(cur_biases, cur_scales, (int8_t*)input, cur_kernel, kernel_size, cur_result); } - if (end_channel & 0x1) { - cur_kernel = (int8_t *) (weight_interleaved + kernel_size * ch); - cur_result = (int8_t *) (output + ch); + if (end_channel & 0x1) + { + cur_kernel = (int8_t*)(weight_interleaved + kernel_size * ch); + cur_result = (int8_t*)(output + ch); sum = biases ? *(biases + ch) : 0; for (int j = 0; j < kernel_size; j++) sum = sum + input[j] * cur_kernel[j]; @@ -229,40 +229,39 @@ void gemv1x2(const int8_t *input, int8_t *output, int8_t *weight_interleaved, } } - -static void interleave_kernel(const int8_t *kernel, int8_t *kernel_interleaved, int out_chan, int kernel_size) +static void interleave_kernel(const int8_t* kernel, int8_t* kernel_interleaved, int out_chan, int kernel_size) { int i, j, k; - int8_t *cur_kernel[8]; - int8_t *cur_kernel_interleaved; + int8_t* cur_kernel[8]; + int8_t* cur_kernel_interleaved; // interleave 8 kernel - for (i = 0; i < (out_chan & -8); i += 8) + for (i = 0; i < (out_chan & -8); i += 8) { for (j = 0; j < 8; j++) - cur_kernel[j] = (int8_t *) kernel + kernel_size * (i + j); - cur_kernel_interleaved = (int8_t *) kernel_interleaved + kernel_size * i; + cur_kernel[j] = (int8_t*)kernel + kernel_size * (i + j); + cur_kernel_interleaved = (int8_t*)kernel_interleaved + kernel_size * i; for (k = 0; k < kernel_size; k++) for (j = 0; j < 8; j++) cur_kernel_interleaved[8 * k + j] = *(cur_kernel[j] + k); } // interleave 2 kernel - for (; i < (out_chan & -2); i += 2) + for (; i < (out_chan & -2); i += 2) { for (j = 0; j < 2; j++) - cur_kernel[j] = (int8_t *) kernel + kernel_size * (i + j); - cur_kernel_interleaved = (int8_t *) kernel_interleaved + kernel_size * i; + cur_kernel[j] = (int8_t*)kernel + kernel_size * (i + j); + cur_kernel_interleaved = (int8_t*)kernel_interleaved + kernel_size * i; for (k = 0; k < kernel_size; k++) for (j = 0; j < 2; j++) cur_kernel_interleaved[2 * k + j] = *(cur_kernel[j] + k); } // copy last kernel - if (out_chan & 0x1) + if (out_chan & 0x1) { - cur_kernel[0] = (int8_t *) kernel + kernel_size * i; - cur_kernel_interleaved = (int8_t *) kernel_interleaved + kernel_size * i; + cur_kernel[0] = (int8_t*)kernel + kernel_size * i; + cur_kernel_interleaved = (int8_t*)kernel_interleaved + kernel_size * i; for (k = 0; k < kernel_size; k++) cur_kernel_interleaved[k] = *(cur_kernel[0] + k); } @@ -270,75 +269,75 @@ static void interleave_kernel(const int8_t *kernel, int8_t *kernel_interleaved, return; } -int int8_fc_kernel_prerun(struct tensor *input_tensor, \ - struct tensor *filter_tensor, \ - struct tensor *output_tensor, \ - struct fc_priv_info *priv_info, \ - struct fc_param *param) +int int8_fc_kernel_prerun(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* output_tensor, + struct fc_priv_info* priv_info, + struct fc_param* param) { - int num_output = param->num_output; int kernel_size = filter_tensor->dims[1]; int kernel_align = ((kernel_size + 1) & -2); - if (!priv_info->interleave_buffer) + if (!priv_info->interleave_buffer) { int mem_size = num_output * kernel_align; - void *mem = sys_malloc(mem_size); + void* mem = sys_malloc(mem_size); priv_info->interleave_buffer = mem; priv_info->interleave_buffer_size = mem_size; } - if (!priv_info->input_buffer) + if (!priv_info->input_buffer) { int mem_size = kernel_align; - void *mem = sys_malloc(mem_size); + void* mem = sys_malloc(mem_size); priv_info->input_buffer = mem; priv_info->input_buffer_size = mem_size; } - int8_t *filter_data = (int8_t *) filter_tensor->data; + int8_t* filter_data = (int8_t*)filter_tensor->data; - interleave_kernel(filter_data, (int8_t *) priv_info->interleave_buffer, num_output, + interleave_kernel(filter_data, (int8_t*)priv_info->interleave_buffer, num_output, kernel_size); return 0; } -int int8_fc_kernel_run(struct tensor *input_tensor, \ - struct tensor *filter_tensor, \ - struct tensor *bias_tensor, \ - struct tensor *output_tensor, \ - struct fc_priv_info *priv_info, \ - struct fc_param *param, \ - int num_thread, int cpu_affinity) { +int int8_fc_kernel_run(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* bias_tensor, + struct tensor* output_tensor, + struct fc_priv_info* priv_info, + struct fc_param* param, + int num_thread, int cpu_affinity) +{ int out_num = param->num_output; int kernel_size = filter_tensor->dims[1]; - int8_t *input = (int8_t *) input_tensor->data; - int8_t *output = (int8_t *) output_tensor->data; - int8_t *weight = (int8_t *) priv_info->interleave_buffer; - int32_t *biases = NULL; + int8_t* input = (int8_t*)input_tensor->data; + int8_t* output = (int8_t*)output_tensor->data; + int8_t* weight = (int8_t*)priv_info->interleave_buffer; + int32_t* biases = NULL; if (bias_tensor) - biases = (int32_t *) bias_tensor->data; + biases = (int32_t*)bias_tensor->data; float input_scale = input_tensor->scale; float output_scale = output_tensor->scale; - float *weight_scales = filter_tensor->scale_list; - float *requant_scales = (float *) malloc(out_num * sizeof(float)); + float* weight_scales = filter_tensor->scale_list; + float* requant_scales = (float*)malloc(out_num * sizeof(float)); for (int i = 0; i < out_num; i++) requant_scales[i] = (input_scale * weight_scales[i]) / output_scale; int out_num_8 = out_num & ~7; - for (int i = 0; i < input_tensor->dims[0]; i++) + for (int i = 0; i < input_tensor->dims[0]; i++) { - int8_t *cur_input = input + i * kernel_size; - int8_t *cur_output = output + i * out_num; + int8_t* cur_input = input + i * kernel_size; + int8_t* cur_output = output + i * out_num; gemv1x8(cur_input, cur_output, weight, biases, requant_scales, kernel_size, 0, out_num_8, num_thread, cpu_affinity); if (out_num & 0x7) - gemv1x2(cur_input, cur_output, weight, biases, requant_scales, kernel_size, out_num_8,out_num,num_thread, cpu_affinity); + gemv1x2(cur_input, cur_output, weight, biases, requant_scales, kernel_size, out_num_8, out_num, num_thread, cpu_affinity); } return 0; diff --git a/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.h b/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.h index 89011b983..f5d601cff 100644 --- a/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.h +++ b/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.h @@ -22,20 +22,18 @@ #include "fc_kernel_arm.h" +int int8_fc_kernel_prerun(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* output_tensor, + struct fc_priv_info* priv_info, + struct fc_param* param); -int int8_fc_kernel_prerun(struct tensor* input_tensor , \ - struct tensor* filter_tensor , \ - struct tensor* output_tensor , \ - struct fc_priv_info* priv_info , \ - struct fc_param* param) ; - -int int8_fc_kernel_run(struct tensor* input_tensor , \ - struct tensor* filter_tensor ,\ - struct tensor* bias_tensor , \ - struct tensor* output_tensor , \ - struct fc_priv_info* priv_info , \ - struct fc_param* param, \ - int num_thread, int cpu_affinity) ; - +int int8_fc_kernel_run(struct tensor* input_tensor, + struct tensor* filter_tensor, + struct tensor* bias_tensor, + struct tensor* output_tensor, + struct fc_priv_info* priv_info, + struct fc_param* param, + int num_thread, int cpu_affinity); #endif diff --git a/source/device/cpu/op/fc/cortex-m/fc_cmsis.c b/source/device/cpu/op/fc/cortex-m/fc_cmsis.c index beeff6f29..e53be5c71 100644 --- a/source/device/cpu/op/fc/cortex-m/fc_cmsis.c +++ b/source/device/cpu/op/fc/cortex-m/fc_cmsis.c @@ -38,7 +38,6 @@ #include "arm_math.h" #include "arm_nnfunctions.h" - struct cmsis_param { uint16_t bias_shift; @@ -78,7 +77,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str int scale = ir_tensor->scale; out_shift = cal_shift(scale); - struct cmsis_param* param = ( struct cmsis_param* )sys_malloc(sizeof(struct cmsis_param)); + struct cmsis_param* param = (struct cmsis_param*)sys_malloc(sizeof(struct cmsis_param)); param->bias_shift = bias_shift; param->out_shift = out_shift; @@ -105,7 +104,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* weight_tensor; struct tensor* bias_tensor = NULL; struct tensor* output_tensor; - struct cmsis_param* cmsis_param = ( struct cmsis_param* )exec_node->ops_priv; + struct cmsis_param* cmsis_param = (struct cmsis_param*)exec_node->ops_priv; input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); @@ -114,10 +113,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (ir_node->input_num > 2) bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); - int ret = - arm_fully_connected_q7(input_tensor->data, weight_tensor->data, weight_tensor->dims[1], weight_tensor->dims[0], - cmsis_param->bias_shift, cmsis_param->out_shift, bias_tensor ? bias_tensor->data : NULL, - output_tensor->data, exec_graph->shared_mem); + int ret = arm_fully_connected_q7(input_tensor->data, weight_tensor->data, weight_tensor->dims[1], weight_tensor->dims[0], + cmsis_param->bias_shift, cmsis_param->out_shift, bias_tensor ? bias_tensor->data : NULL, + output_tensor->data, exec_graph->shared_mem); if (ret != ARM_MATH_SUCCESS) return -1; diff --git a/source/device/cpu/op/fc/fc_ref.c b/source/device/cpu/op/fc/fc_ref.c index ee48612c5..b0da933ea 100644 --- a/source/device/cpu/op/fc/fc_ref.c +++ b/source/device/cpu/op/fc/fc_ref.c @@ -39,15 +39,14 @@ #include #include - struct fc_data { int need_trans; - int batch; // N - int out_number; // OUT - int hidden; // hidden + int batch; // N + int out_number; // OUT + int hidden; // hidden int zero[3]; // input, kernel, output - float scale[3]; // input, kernel, output + float scale[3]; // input, kernel, output }; static int ref_fc_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, struct fc_data* param) @@ -83,12 +82,11 @@ static int ref_fc_fp32(struct tensor* input_tensor, struct tensor* output_tensor return 0; } - static int ref_fc_fp16(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, struct fc_data* param) { - #if MACOS +#if MACOS - #else +#else int batch = param->batch; int hidden = param->hidden; int out_number = param->out_number; @@ -116,7 +114,7 @@ static int ref_fc_fp16(struct tensor* input_tensor, struct tensor* output_tensor output[n * out_number + i] = fp32_to_fp16(tmp); } } - #endif +#endif return 0; } @@ -126,7 +124,7 @@ static int ref_fc_uint8(struct tensor* input_tensor, struct tensor* output_tenso int hidden = param->hidden; int out_number = param->out_number; - uint8_t* input = (uint8_t*)input_tensor->data; + uint8_t* input = (uint8_t*)input_tensor->data; uint8_t* output = (uint8_t*)output_tensor->data; uint8_t* weight = (uint8_t*)weight_tensor->data; @@ -141,7 +139,7 @@ static int ref_fc_uint8(struct tensor* input_tensor, struct tensor* output_tenso { int32_t* bias = (int32_t*)bias_tensor->data; float bias_scale = bias_tensor->scale; - + int n, i, j; for (n = 0; n < batch; n++) { @@ -152,14 +150,14 @@ static int ref_fc_uint8(struct tensor* input_tensor, struct tensor* output_tenso { if (param->need_trans == 0) { - float input_fp32 = ((float)input[n * hidden + j] - (float)input_zero) * input_scale; + float input_fp32 = ((float)input[n * hidden + j] - (float)input_zero) * input_scale; float weight_fp32 = ((float)weight[i * hidden + j] - (float)weight_zero) * weight_scale; data += input_fp32 * weight_fp32; } else { - float input_fp32 = ((float)input[n * hidden + j] - (float)input_zero) * input_scale; - float weight_fp32 = ((float)weight[i + j * out_number] - (float)weight_zero) * weight_scale; + float input_fp32 = ((float)input[n * hidden + j] - (float)input_zero) * input_scale; + float weight_fp32 = ((float)weight[i + j * out_number] - (float)weight_zero) * weight_scale; data += input_fp32 * weight_fp32; } } @@ -173,7 +171,7 @@ static int ref_fc_uint8(struct tensor* input_tensor, struct tensor* output_tenso } } else - { + { int n, i, j; for (n = 0; n < batch; n++) { @@ -184,14 +182,14 @@ static int ref_fc_uint8(struct tensor* input_tensor, struct tensor* output_tenso { if (param->need_trans == 0) { - float input_fp32 = ((float)input[n * hidden + j] - (float)input_zero) * input_scale; + float input_fp32 = ((float)input[n * hidden + j] - (float)input_zero) * input_scale; float weight_fp32 = ((float)weight[i * hidden + j] - (float)weight_zero) * weight_scale; data += input_fp32 * weight_fp32; } else { - float input_fp32 = ((float)input[n * hidden + j] - (float)input_zero) * input_scale; - float weight_fp32 = ((float)weight[i + j * out_number] - (float)weight_zero) * weight_scale; + float input_fp32 = ((float)input[n * hidden + j] - (float)input_zero) * input_scale; + float weight_fp32 = ((float)weight[i + j * out_number] - (float)weight_zero) * weight_scale; data += input_fp32 * weight_fp32; } } @@ -208,14 +206,13 @@ static int ref_fc_uint8(struct tensor* input_tensor, struct tensor* output_tenso return 0; } - static int ref_fc_int8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, struct fc_data* param) { int batch = param->batch; int hidden = param->hidden; int out_number = param->out_number; - int8_t* input = (int8_t*)input_tensor->data; + int8_t* input = (int8_t*)input_tensor->data; int8_t* output = (int8_t*)output_tensor->data; int8_t* weight = (int8_t*)weight_tensor->data; @@ -224,7 +221,7 @@ static int ref_fc_int8(struct tensor* input_tensor, struct tensor* output_tensor float* weight_scales = weight_tensor->scale_list; float* requant_scales = (float*)malloc(out_number * sizeof(float)); - for (int i=0; ineed_trans == 0) { - int8_t input_i8 = input[n * hidden + j]; + int8_t input_i8 = input[n * hidden + j]; int8_t weight_i8 = weight[i * hidden + j]; output_i32 += (int32_t)input_i8 * (int32_t)weight_i8; } else { - int8_t input_i8 = input[n * hidden + j]; + int8_t input_i8 = input[n * hidden + j]; int8_t weight_i8 = weight[i + j * out_number]; output_i32 += (int32_t)input_i8 * (int32_t)weight_i8; } @@ -273,13 +270,13 @@ static int ref_fc_int8(struct tensor* input_tensor, struct tensor* output_tensor { if (param->need_trans == 0) { - int8_t input_i8 = input[n * hidden + j]; + int8_t input_i8 = input[n * hidden + j]; int8_t weight_i8 = weight[i * hidden + j]; output_i32 += (int32_t)input_i8 * (int32_t)weight_i8; } else { - int8_t input_i8 = input[n * hidden + j]; + int8_t input_i8 = input[n * hidden + j]; int8_t weight_i8 = weight[i + j * out_number]; output_i32 += (int32_t)input_i8 * (int32_t)weight_i8; } @@ -301,7 +298,7 @@ static int ref_fc_int8(struct tensor* input_tensor, struct tensor* output_tensor static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct fc_data* op_param = ( struct fc_data* )sys_malloc(sizeof(struct fc_data)); + struct fc_data* op_param = (struct fc_data*)sys_malloc(sizeof(struct fc_data)); memset(op_param, 0, sizeof(struct fc_data)); exec_node->ops_priv = op_param; return 0; @@ -325,8 +322,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct fc_param* param = ( struct fc_param* )ir_node->op.param_mem; - struct fc_data* op_param = ( struct fc_data* )exec_node->ops_priv; + struct fc_param* param = (struct fc_param*)ir_node->op.param_mem; + struct fc_data* op_param = (struct fc_data*)exec_node->ops_priv; if (ir_graph->graph_layout == TENGINE_LAYOUT_NCHW) { @@ -374,8 +371,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct fc_param* param = ( struct fc_param* )ir_node->op.param_mem; - struct fc_data* op_param = ( struct fc_data* )exec_node->ops_priv; + struct fc_param* param = (struct fc_param*)ir_node->op.param_mem; + struct fc_data* op_param = (struct fc_data*)exec_node->ops_priv; if (ir_node->input_num > 2) bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); @@ -384,11 +381,11 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_fc_fp32(input_tensor, output_tensor, weight_tensor, bias_tensor, op_param); else if (input_tensor->data_type == TENGINE_DT_FP16) - #if MACOS +#if MACOS TLOG_ERR("FP16 not support for mac os"); - #else +#else ret = ref_fc_fp16(input_tensor, output_tensor, weight_tensor, bias_tensor, op_param); - #endif +#endif else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_fc_uint8(input_tensor, output_tensor, weight_tensor, bias_tensor, op_param); else if (input_tensor->data_type == TENGINE_DT_INT8) diff --git a/source/device/cpu/op/fc/x86/fc_hcl_x86.c b/source/device/cpu/op/fc/x86/fc_hcl_x86.c index 29c155676..86acbb992 100644 --- a/source/device/cpu/op/fc/x86/fc_hcl_x86.c +++ b/source/device/cpu/op/fc/x86/fc_hcl_x86.c @@ -38,7 +38,6 @@ #include #include - #if __SSE2__ #include #endif @@ -49,11 +48,11 @@ struct fc_data { int need_trans; - int batch; // N - int out_number; // OUT - int hidden; // hidden + int batch; // N + int out_number; // OUT + int hidden; // hidden int zero[3]; // input, kernel, output - float scale[3]; // input, kernel, output + float scale[3]; // input, kernel, output }; static int innerproduct(int inn, int inc, int inh, int inw, int outc, const float* weight, const float* input, float* output, @@ -86,8 +85,8 @@ static int innerproduct(int inn, int inc, int inh, int inw, int outc, const floa _mm_storeu_ps(_sum, _sum0); tmp = _sum[0] + _sum[1] + _sum[2] + _sum[3]; sum = sum + tmp; -#else //__AVX__ - // TODO +#else //__AVX__ \ + // TODO #endif #endif for (; q < inc * size; q++) @@ -105,7 +104,7 @@ static int innerproduct(int inn, int inc, int inh, int inw, int outc, const floa static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct fc_data* op_param = ( struct fc_data* )sys_malloc(sizeof(struct fc_data)); + struct fc_data* op_param = (struct fc_data*)sys_malloc(sizeof(struct fc_data)); memset(op_param, 0, sizeof(struct fc_data)); exec_node->ops_priv = op_param; return 0; @@ -129,8 +128,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct fc_param* param = ( struct fc_param* )ir_node->op.param_mem; - struct fc_data* op_param = ( struct fc_data* )exec_node->ops_priv; + struct fc_param* param = (struct fc_param*)ir_node->op.param_mem; + struct fc_data* op_param = (struct fc_data*)exec_node->ops_priv; if (ir_graph->graph_layout == TENGINE_LAYOUT_NCHW) { @@ -174,14 +173,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* bias_tensor; struct tensor* output_tensor; int num_thread = exec_graph->num_thread; - int cpu_affinity = exec_graph->cpu_affinity; + int cpu_affinity = exec_graph->cpu_affinity; input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct fc_param* param = ( struct fc_param* )ir_node->op.param_mem; - struct fc_data* op_param = ( struct fc_data* )exec_node->ops_priv; + struct fc_param* param = (struct fc_param*)ir_node->op.param_mem; + struct fc_data* op_param = (struct fc_data*)exec_node->ops_priv; const void* input_data = input_tensor->data; void* weight_data = weight_tensor->data; @@ -199,8 +198,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); bias_data = bias_tensor->data; } - if (innerproduct(batch_number, inc, inh, inw, outc, (float*)weight_data, (float*)input_data, - (float*)output_data, (float*)bias_data, num_thread, cpu_affinity) < 0) + if (innerproduct(batch_number, inc, inh, inw, outc, (float*)weight_data, (float*)input_data, + (float*)output_data, (float*)bias_data, num_thread, cpu_affinity) + < 0) return -1; return 0; diff --git a/source/device/cpu/op/flatten/flatten_ref.c b/source/device/cpu/op/flatten/flatten_ref.c index be26c3354..9b4476d28 100644 --- a/source/device/cpu/op/flatten/flatten_ref.c +++ b/source/device/cpu/op/flatten/flatten_ref.c @@ -36,7 +36,6 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; diff --git a/source/device/cpu/op/gather/gather_ref.c b/source/device/cpu/op/gather/gather_ref.c index 77d29612a..5b5c9ce9e 100644 --- a/source/device/cpu/op/gather/gather_ref.c +++ b/source/device/cpu/op/gather/gather_ref.c @@ -38,10 +38,9 @@ #include #include - typedef struct { - int* in_shape; // the dim of the input + int* in_shape; // the dim of the input int axis; int indices_num; int dim_size; @@ -68,22 +67,23 @@ static int ref_gather_fp32(float* input, int* input_indices, float* output, gath // TLOG_ERR("inner_size size: %d %d \n", inner_size, param->in_shape[i]); } - // #pragma omp parallel for num_threads(num_thread) - if(param->is_onnx){ + // #pragma omp parallel for num_threads(num_thread) + if (param->is_onnx) + { for (int outer = 0; outer < outer_size; ++outer) { - memcpy(out_ptr + (outer * param->indices_num ) * inner_size, - in_ptr + (outer* axis_size + param->indices_num) * inner_size, inner_size* sizeof(float)); + memcpy(out_ptr + (outer * param->indices_num) * inner_size, + in_ptr + (outer * axis_size + param->indices_num) * inner_size, inner_size * sizeof(float)); } - } else { + } + else + { for (int outer = 0; outer < outer_size; ++outer) { for (int i = 0; i < param->indices_num; i++) { - memcpy(out_ptr + (outer * param->indices_num + i) * inner_size, - in_ptr + (outer * axis_size + ( int )input_indices[i]) * inner_size, inner_size * sizeof(float)); - + in_ptr + (outer * axis_size + (int)input_indices[i]) * inner_size, inner_size * sizeof(float)); } } } @@ -109,13 +109,13 @@ static int ref_gather_uint8(uint8_t* input, int* input_indices, uint8_t* output, inner_size *= param->in_shape[i]; } - // #pragma omp parallel for num_threads(num_thread) + // #pragma omp parallel for num_threads(num_thread) for (int outer = 0; outer < outer_size; ++outer) { for (int i = 0; i < param->indices_num; i++) { memcpy(out_ptr + (outer * param->indices_num + i) * inner_size, - in_ptr + (outer * axis_size + ( int )input_indices[i]) * inner_size, inner_size); + in_ptr + (outer * axis_size + (int)input_indices[i]) * inner_size, inner_size); } } @@ -126,14 +126,14 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct { struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; - struct gather_param* gather_param = ( struct gather_param* )ir_node->op.param_mem; - gather_param_t* op_priv_info = ( gather_param_t* )exec_node->ops_priv; + struct gather_param* gather_param = (struct gather_param*)ir_node->op.param_mem; + gather_param_t* op_priv_info = (gather_param_t*)exec_node->ops_priv; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); op_priv_info->axis = gather_param->axis; op_priv_info->indices_num = gather_param->indices_num; op_priv_info->is_onnx = gather_param->is_onnx; - op_priv_info->in_shape = (int*)sys_malloc(input_tensor->dim_num*sizeof(int)); + op_priv_info->in_shape = (int*)sys_malloc(input_tensor->dim_num * sizeof(int)); /* prerun now */ return 0; } @@ -146,7 +146,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); struct tensor* indices_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); - gather_param_t* op_priv_info = ( gather_param_t* )exec_node->ops_priv; + gather_param_t* op_priv_info = (gather_param_t*)exec_node->ops_priv; int out_size = input_tensor->elem_num; @@ -168,7 +168,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_gather_fp32((float*)input, (int*)indices_data, (float*)output, op_priv_info, exec_graph->num_thread); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_gather_uint8((uint8_t*)input, (int*)indices_data, (uint8_t*)output, op_priv_info, exec_graph->num_thread); return ret; @@ -179,7 +179,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; - gather_param_t* op_priv_info = ( gather_param_t* )sys_malloc(sizeof(gather_param_t)); + gather_param_t* op_priv_info = (gather_param_t*)sys_malloc(sizeof(gather_param_t)); if (op_priv_info == NULL) { @@ -203,7 +203,7 @@ static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struc } static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - gather_param_t* op_priv_info = ( gather_param_t* )exec_node->ops_priv; + gather_param_t* op_priv_info = (gather_param_t*)exec_node->ops_priv; sys_free(op_priv_info); diff --git a/source/device/cpu/op/gru/gru_ref.c b/source/device/cpu/op/gru/gru_ref.c index 47bbb624b..056882f3c 100644 --- a/source/device/cpu/op/gru/gru_ref.c +++ b/source/device/cpu/op/gru/gru_ref.c @@ -66,21 +66,21 @@ int ref_gru_default_fp32(struct tensor* input_tensor, struct tensor* w, struct t /* initial_h_data buffers */ float* initial_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - float* h_0 = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - memset(initial_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); - memset(output_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); - memset(h_0, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); + float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); + float* h_0 = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); + memset(initial_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + memset(output_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + memset(h_0, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); - float* Z_data = ( float* )malloc(hidden_size * sizeof(float)); - float* R_data = ( float* )malloc(hidden_size * sizeof(float)); - float* H_data = ( float* )malloc(hidden_size * sizeof(float)); + float* Z_data = (float*)malloc(hidden_size * sizeof(float)); + float* R_data = (float*)malloc(hidden_size * sizeof(float)); + float* H_data = (float*)malloc(hidden_size * sizeof(float)); int T = input_tensor->dims[1]; - - for(int seq = 0; seq < input_tensor->dims[0]; seq++) + + for (int seq = 0; seq < input_tensor->dims[0]; seq++) { - for(int t = 0; t < T; t++) + for (int t = 0; t < T; t++) { for (int q = 0; q < hidden_size; q++) { @@ -98,7 +98,7 @@ int ref_gru_default_fp32(struct tensor* input_tensor, struct tensor* w, struct t for (int h = 0; h < hidden_size; h++) { - if(seq == 0) + if (seq == 0) { float h_i = initial_h_data[t * hidden_size + h]; Z += h_i * r_data[(hidden_size * 0 + q) * hidden_size + h]; @@ -115,7 +115,7 @@ int ref_gru_default_fp32(struct tensor* input_tensor, struct tensor* w, struct t float r_tmp = 1.f / (1.f + exp(-R)); for (int k = 0; k < hidden_size; k++) { - if(seq == 0) + if (seq == 0) { r_H += r_tmp * initial_h_data[t * hidden_size + k] * r_data[(hidden_size * 2 + q) * hidden_size + k]; } @@ -132,7 +132,7 @@ int ref_gru_default_fp32(struct tensor* input_tensor, struct tensor* w, struct t for (int h = 0; h < hidden_size; h++) { - if(seq == 0) + if (seq == 0) { float Z = 1.f / (1.f + exp(-Z_data[h])); float H = tanh(H_data[h]); @@ -167,7 +167,7 @@ int ref_gru_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struct int batch_size = input_tensor->dims[1]; int size = input_tensor->dims[2]; int hidden_size = param->hidden_size; - + float* x_data = (float*)input_tensor->data; float* w_data = (float*)w->data; float* r_data = (float*)r->data; @@ -176,21 +176,21 @@ int ref_gru_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struct /* initial_h_data buffers */ float* initial_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - float* h_0 = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - memset(initial_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); - memset(output_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); - memset(h_0, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); + float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); + float* h_0 = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); + memset(initial_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + memset(output_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + memset(h_0, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); - float* Z_data = ( float* )malloc(hidden_size * sizeof(float)); - float* R_data = ( float* )malloc(hidden_size * sizeof(float)); - float* H_data = ( float* )malloc(hidden_size * sizeof(float)); + float* Z_data = (float*)malloc(hidden_size * sizeof(float)); + float* R_data = (float*)malloc(hidden_size * sizeof(float)); + float* H_data = (float*)malloc(hidden_size * sizeof(float)); int T = input_tensor->dims[1]; - - for(int seq = 0; seq < input_tensor->dims[0]; seq++) + + for (int seq = 0; seq < input_tensor->dims[0]; seq++) { - for(int t = 0; t < T; t++) + for (int t = 0; t < T; t++) { for (int q = 0; q < hidden_size; q++) { @@ -213,7 +213,7 @@ int ref_gru_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struct for (int h = 0; h < hidden_size; h++) { - if(seq == 0) + if (seq == 0) { float h_i = initial_h_data[t * hidden_size + h]; Z += h_i * r_data[(hidden_size * 0 + q) * hidden_size + h]; @@ -233,7 +233,7 @@ int ref_gru_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struct float r_tmp = 1.f / (1.f + exp(-R)); for (int k = 0; k < hidden_size; k++) { - if(seq == 0) + if (seq == 0) { r_H += r_tmp * initial_h_data[t * hidden_size + k] * r_data[(hidden_size * 2 + q) * hidden_size + k]; } @@ -250,7 +250,7 @@ int ref_gru_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struct for (int h = 0; h < hidden_size; h++) { - if(seq == 0) + if (seq == 0) { float Z = 1.f / (1.f + exp(-Z_data[h])); float H = tanh(H_data[h]); @@ -291,23 +291,23 @@ int ref_gru_case1_fp32(struct tensor* input_tensor, struct tensor* w, struct ten /* initial_h_data buffers */ float* initial_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - float* h_0 = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); + float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); + float* h_0 = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); memset(initial_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); - memset(output_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); - memset(h_0, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + memset(output_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + memset(h_0, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); - float* Z_data = ( float* )malloc(hidden_size * sizeof(float)); - float* R_data = ( float* )malloc(hidden_size * sizeof(float)); - float* H_data = ( float* )malloc(hidden_size * sizeof(float)); + float* Z_data = (float*)malloc(hidden_size * sizeof(float)); + float* R_data = (float*)malloc(hidden_size * sizeof(float)); + float* H_data = (float*)malloc(hidden_size * sizeof(float)); float* output_data = (float*)output_tensor->data; int T = input_tensor->dims[1]; int size = input_tensor->dims[2]; - for(int seq = 0; seq < input_tensor->dims[0]; seq++) + for (int seq = 0; seq < input_tensor->dims[0]; seq++) { - for(int t = 0; t < T; t++) + for (int t = 0; t < T; t++) { for (int q = 0; q < hidden_size; q++) { @@ -329,7 +329,7 @@ int ref_gru_case1_fp32(struct tensor* input_tensor, struct tensor* w, struct ten for (int h = 0; h < hidden_size; h++) { - if(seq == 0) + if (seq == 0) { float h_i = initial_h_data[t * hidden_size + h]; Z += h_i * r_data[(hidden_size * 0 + q) * hidden_size + h]; @@ -357,7 +357,7 @@ int ref_gru_case1_fp32(struct tensor* input_tensor, struct tensor* w, struct ten for (int h = 0; h < hidden_size; h++) { - if(seq == 0) + if (seq == 0) { float Z = 1.f / (1.f + exp(-Z_data[h])); float H = tanh(H_data[h]); @@ -402,7 +402,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (ir_node->input_num > 3) b = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[3]); - struct gru_param* param = ( struct gru_param* )(ir_node->op.param_mem); + struct gru_param* param = (struct gru_param*)(ir_node->op.param_mem); /* only support one way */ if (w->dim_num == 4 && w->dims[0] == 2) diff --git a/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c b/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c index 0d5c260d9..a0b3849e8 100644 --- a/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c +++ b/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c @@ -36,7 +36,6 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -75,7 +74,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct hard_sigmoid_param* param = ( struct hard_sigmoid_param* )ir_node->op.param_mem; + struct hard_sigmoid_param* param = (struct hard_sigmoid_param*)ir_node->op.param_mem; int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) @@ -86,7 +85,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex return -1; } - return ret; + return ret; } static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) diff --git a/source/device/cpu/op/hardswish/hardswish_kernel_ref.h b/source/device/cpu/op/hardswish/hardswish_kernel_ref.h index 6d6924206..7fe84eeb6 100644 --- a/source/device/cpu/op/hardswish/hardswish_kernel_ref.h +++ b/source/device/cpu/op/hardswish/hardswish_kernel_ref.h @@ -25,12 +25,10 @@ #ifndef __HARDSWISH_KERNEL_REF_H__ #define __HARDSWISH_KERNEL_REF_H__ - #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" - int ref_hardswish_fp32(struct tensor* input_tensor, struct tensor* output_tensor); int ref_hardswish_uint8(struct tensor* input_tensor, struct tensor* output_tensor); diff --git a/source/device/cpu/op/hardswish/hardswish_kernel_ref_fp32.c b/source/device/cpu/op/hardswish/hardswish_kernel_ref_fp32.c index d5139fe2f..e2c103897 100644 --- a/source/device/cpu/op/hardswish/hardswish_kernel_ref_fp32.c +++ b/source/device/cpu/op/hardswish/hardswish_kernel_ref_fp32.c @@ -38,7 +38,6 @@ #include - int ref_hardswish_fp32(struct tensor* input_tensor, struct tensor* output_tensor) { float* input_data = (float*)input_tensor->data; @@ -53,7 +52,7 @@ int ref_hardswish_fp32(struct tensor* input_tensor, struct tensor* output_tensor tmp = 0.f; if (tmp > 6.f) tmp = 6.f; - + output_data[i] = input_data[i] * (tmp / 6.f); } diff --git a/source/device/cpu/op/hardswish/hardswish_kernel_ref_uint8.c b/source/device/cpu/op/hardswish/hardswish_kernel_ref_uint8.c index 777304fd6..7252b433b 100644 --- a/source/device/cpu/op/hardswish/hardswish_kernel_ref_uint8.c +++ b/source/device/cpu/op/hardswish/hardswish_kernel_ref_uint8.c @@ -38,7 +38,6 @@ #include - int ref_hardswish_uint8(struct tensor* input_tensor, struct tensor* output_tensor) { int size = input_tensor->elem_num; @@ -53,8 +52,8 @@ int ref_hardswish_uint8(struct tensor* input_tensor, struct tensor* output_tenso float* data_fp32 = (float*)sys_malloc(size * sizeof(float)); - for(int i = 0; i < size; i++) - data_fp32[i] = ((float) input_uint8[i] - (float)input_zero) * input_scale; + for (int i = 0; i < size; i++) + data_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; for (int i = 0; i < size; i++) { @@ -69,7 +68,7 @@ int ref_hardswish_uint8(struct tensor* input_tensor, struct tensor* output_tenso } // quant - for(int i=0; i 255) diff --git a/source/device/cpu/op/hardswish/hardswish_ref.c b/source/device/cpu/op/hardswish/hardswish_ref.c index c836bcad6..3a1910c39 100644 --- a/source/device/cpu/op/hardswish/hardswish_ref.c +++ b/source/device/cpu/op/hardswish/hardswish_ref.c @@ -22,7 +22,6 @@ * Author: qtang@openailab.com */ - #include "hardswish_kernel_ref.h" #include "graph/tensor.h" @@ -35,7 +34,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -61,7 +59,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_hardswish_fp32(input_tensor, output_tensor); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_hardswish_uint8(input_tensor, output_tensor); else TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type); diff --git a/source/device/cpu/op/input/input_ref.c b/source/device/cpu/op/input/input_ref.c index 4754fff7c..4118be0da 100644 --- a/source/device/cpu/op/input/input_ref.c +++ b/source/device/cpu/op/input/input_ref.c @@ -32,7 +32,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { exec_node->inplace_map[0] = 0; diff --git a/source/device/cpu/op/instancenorm/instancenorm_ref.c b/source/device/cpu/op/instancenorm/instancenorm_ref.c index 32f4b4c4d..16fbd563f 100644 --- a/source/device/cpu/op/instancenorm/instancenorm_ref.c +++ b/source/device/cpu/op/instancenorm/instancenorm_ref.c @@ -36,7 +36,6 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -100,7 +99,7 @@ int ref_instancenorm_fp32(float* input_data, float* output_data, float* gamma_da } int ref_instancenorm_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* gamma_tensor, struct tensor* beta_tensor, - float eps, float scale, float zero_point, int layout) + float eps, float scale, float zero_point, int layout) { int n = input_tensor->dims[0]; int channels = input_tensor->dims[1]; @@ -121,10 +120,10 @@ int ref_instancenorm_uint8(struct tensor* input_tensor, struct tensor* output_te int32_t input_zero = input_tensor->zero_point; int32_t output_zero = output_tensor->zero_point; - float* input_data = (float*) sys_malloc(total_size * sizeof(float)); - float* output_data = (float*) sys_malloc(total_size * sizeof(float)); - for(int i = 0; i < total_size; i++) - input_data[i] = ((float) input_uint8[i] - (float)input_zero) * input_scale; + float* input_data = (float*)sys_malloc(total_size * sizeof(float)); + float* output_data = (float*)sys_malloc(total_size * sizeof(float)); + for (int i = 0; i < total_size; i++) + input_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; for (int s = 0; s < n; s++) { @@ -168,7 +167,7 @@ int ref_instancenorm_uint8(struct tensor* input_tensor, struct tensor* output_te } // quant - for(int i=0; i 255) @@ -206,15 +205,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex void* beta_data = beta_tensor->data; void* gamma_data = gamma_tensor->data; - struct instancenorm_Param* param = ( struct instancenorm_Param* )node->op.param_mem; + struct instancenorm_Param* param = (struct instancenorm_Param*)node->op.param_mem; float eps = param->eps; float scale = 1.f; int zero_point = 0; - + int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_instancenorm_fp32(in_data, out_data, gamma_data, beta_data, size, c, n, eps, scale, zero_point, 0); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_instancenorm_uint8(input_tensor, output_tensor, gamma_tensor, beta_tensor, eps, scale, zero_point, 0); return ret; diff --git a/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c b/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c index bf6bb1cfe..c7fc11e26 100644 --- a/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c +++ b/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c @@ -34,7 +34,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -59,7 +58,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct interp_param* interp_param = ( struct interp_param* )ir_node->op.param_mem; + struct interp_param* interp_param = (struct interp_param*)ir_node->op.param_mem; int num_thread = exec_graph->num_thread; diff --git a/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.c b/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.c index c3f6647b2..508567ac1 100644 --- a/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.c +++ b/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.c @@ -30,16 +30,15 @@ #include - #define MIN(a, b) ((a) < (b) ? (a) : (b)) static void linear_coeffs(int w, int outw, int* xofs, float* alpha) { - double scale = ( double )w / outw; + double scale = (double)w / outw; for (int dx = 0; dx < outw; dx++) { - float fx = ( float )((dx) * scale); + float fx = (float)((dx)*scale); int sx = floor(fx); fx -= sx; @@ -64,12 +63,12 @@ static void linear_coeffs(int w, int outw, int* xofs, float* alpha) static void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, float* beta, int* yofs, int out_h, int out_w, int in_h, int in_w) { - int w = out_w; // dst.w; - int h = out_h; // dst.h; + int w = out_w; // dst.w; + int h = out_h; // dst.h; // loop body - float* rowsbuf0 = ( float* )sys_malloc(w * sizeof(float)); - float* rowsbuf1 = ( float* )sys_malloc(w * sizeof(float)); + float* rowsbuf0 = (float*)sys_malloc(w * sizeof(float)); + float* rowsbuf1 = (float*)sys_malloc(w * sizeof(float)); float* rows0 = rowsbuf0; float* rows1 = rowsbuf1; @@ -89,7 +88,7 @@ static void resize_bilinear_image(float* src, float* dst, float* alpha, int* xof float* rows0_old = rows0; rows0 = rows1; rows1 = rows0_old; - const float* S1 = src + (sy + 1) * in_w; // src.row(sy+1); + const float* S1 = src + (sy + 1) * in_w; // src.row(sy+1); const float* alphap = alpha; float* rows1p = rows1; @@ -117,8 +116,8 @@ static void resize_bilinear_image(float* src, float* dst, float* alpha, int* xof else { // hresize two rows - const float* S0 = src + sy * in_w; // src.row(sy); - const float* S1 = src + (sy + 1) * in_w; // src.row(sy+1); + const float* S0 = src + sy * in_w; // src.row(sy); + const float* S1 = src + (sy + 1) * in_w; // src.row(sy+1); const float* alphap = alpha; float* rows0p = rows0; @@ -160,7 +159,7 @@ static void resize_bilinear_image(float* src, float* dst, float* alpha, int* xof float* rows0p = rows0; float* rows1p = rows1; - float* Dp = dst + dy * out_w; // dst.row(dy); + float* Dp = dst + dy * out_w; // dst.row(dy); int nn = w >> 3; int remain = w - (nn << 3); @@ -215,11 +214,11 @@ static inline void interpolate_cubic(float fx, float* coeffs) } static void cubic_coeffs(int w, int outw, int* xofs, float* alpha) { - double scale = ( double )w / outw; + double scale = (double)w / outw; for (int dx = 0; dx < outw; dx++) { - float fx = ( float )((dx + 0.5) * scale - 0.5); + float fx = (float)((dx + 0.5) * scale - 0.5); int sx = floor(fx); fx -= sx; @@ -265,14 +264,14 @@ static void cubic_coeffs(int w, int outw, int* xofs, float* alpha) static void resize_bicubic_image(float* src, float* dst, float* alpha, int* xofs, float* beta, int* yofs, int out_h, int out_w, int in_h, int in_w) { - int w = out_w; // dst.w; - int h = out_h; // dst.h; + int w = out_w; // dst.w; + int h = out_h; // dst.h; // loop body - float* rowsbuf0 = ( float* )sys_malloc(w * sizeof(float)); - float* rowsbuf1 = ( float* )sys_malloc(w * sizeof(float)); - float* rowsbuf2 = ( float* )sys_malloc(w * sizeof(float)); - float* rowsbuf3 = ( float* )sys_malloc(w * sizeof(float)); + float* rowsbuf0 = (float*)sys_malloc(w * sizeof(float)); + float* rowsbuf1 = (float*)sys_malloc(w * sizeof(float)); + float* rowsbuf2 = (float*)sys_malloc(w * sizeof(float)); + float* rowsbuf3 = (float*)sys_malloc(w * sizeof(float)); float* rows0 = rowsbuf0; float* rows1 = rowsbuf1; float* rows2 = rowsbuf2; @@ -296,7 +295,7 @@ static void resize_bicubic_image(float* src, float* dst, float* alpha, int* xofs rows1 = rows2; rows2 = rows3; rows3 = rows0_old; - const float* S3 = src + (sy + 2) * in_w; // src.row(sy+2); + const float* S3 = src + (sy + 2) * in_w; // src.row(sy+2); const float* alphap = alpha; float* rows3p = rows3; @@ -323,8 +322,8 @@ static void resize_bicubic_image(float* src, float* dst, float* alpha, int* xofs rows1 = rows3; rows2 = rows0_old; rows3 = rows1_old; - const float* S2 = src + (sy + 1) * in_w; // src.row(sy+1); - const float* S3 = src + (sy + 2) * in_w; // src.row(sy+2); + const float* S2 = src + (sy + 1) * in_w; // src.row(sy+1); + const float* S3 = src + (sy + 2) * in_w; // src.row(sy+2); const float* alphap = alpha; float* rows2p = rows2; @@ -355,9 +354,9 @@ static void resize_bicubic_image(float* src, float* dst, float* alpha, int* xofs rows1 = rows0_old; rows2 = rows1_old; rows3 = rows2_old; - const float* S1 = src + sy * in_w; // src.row(sy); - const float* S2 = src + (sy + 1) * in_w; // src.row(sy+1); - const float* S3 = src + (sy + 2) * in_w; // src.row(sy+2); + const float* S1 = src + sy * in_w; // src.row(sy); + const float* S2 = src + (sy + 1) * in_w; // src.row(sy+1); + const float* S3 = src + (sy + 2) * in_w; // src.row(sy+2); const float* alphap = alpha; float* rows1p = rows1; @@ -384,10 +383,10 @@ static void resize_bicubic_image(float* src, float* dst, float* alpha, int* xofs else { // hresize four rows - const float* S0 = src + (sy - 1) * in_w; // src.row(sy-1); - const float* S1 = src + sy * in_w; // src.row(sy); - const float* S2 = src + (sy + 1) * in_w; // src.row(sy+1); - const float* S3 = src + (sy + 2) * in_w; // src.row(sy+2); + const float* S0 = src + (sy - 1) * in_w; // src.row(sy-1); + const float* S1 = src + sy * in_w; // src.row(sy); + const float* S2 = src + (sy + 1) * in_w; // src.row(sy+1); + const float* S3 = src + (sy + 2) * in_w; // src.row(sy+2); const float* alphap = alpha; float* rows0p = rows0; @@ -427,7 +426,7 @@ static void resize_bicubic_image(float* src, float* dst, float* alpha, int* xofs float* rows1p = rows1; float* rows2p = rows2; float* rows3p = rows3; - float* Dp = dst + dy * out_w; // dst.row(dy); + float* Dp = dst + dy * out_w; // dst.row(dy); for (int dx = 0; dx < w; dx++) { *Dp++ = *rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3; @@ -455,8 +454,8 @@ int interp_run(struct tensor* output_tensor, struct tensor* input_tensor, struct int in_h = input_tensor->dims[2]; int in_w = input_tensor->dims[3]; - float* data = ( float* )input_tensor->data; - float* out_data = ( float* )output_tensor->data; + float* data = (float*)input_tensor->data; + float* out_data = (float*)output_tensor->data; if (out_h == 0 || out_w == 0) { @@ -483,31 +482,31 @@ int interp_run(struct tensor* output_tensor, struct tensor* input_tensor, struct return 0; } - if (resize_type == 1) // nearest + if (resize_type == 1) // nearest { #pragma omp parallel for num_threads(num_thread) for (int q = 0; q < in_c; q++) { for (int y = 0; y < out_h; ++y) { - const int in_y = MIN(( int )(y / height_scale), (in_h - 1)); + const int in_y = MIN((int)(y / height_scale), (in_h - 1)); for (int x = 0; x < out_w; ++x) { - const int in_x = MIN(( int )(x / width_scale), (in_w - 1)); + const int in_x = MIN((int)(x / width_scale), (in_w - 1)); out_data[out_w * y + x + out_w * out_h * q] = data[in_y * in_w + in_x + q * in_w * in_h]; } } } } - else if (resize_type == 2) // bilinear + else if (resize_type == 2) // bilinear { - int* buf = ( int* )sys_malloc((out_w + out_h + out_w * 2 + out_h * 2) * sizeof(int)); + int* buf = (int*)sys_malloc((out_w + out_h + out_w * 2 + out_h * 2) * sizeof(int)); - int* xofs = buf; // new int[ow]; - int* yofs = buf + out_w; // new int[oh]; + int* xofs = buf; // new int[ow]; + int* yofs = buf + out_w; // new int[oh]; - float* alpha = ( float* )(buf + out_w + out_h); // new float[ow * 2]; - float* beta = ( float* )(buf + out_w + out_h + out_w * 2); // new float[oh * 2]; + float* alpha = (float*)(buf + out_w + out_h); // new float[ow * 2]; + float* beta = (float*)(buf + out_w + out_h + out_w * 2); // new float[oh * 2]; linear_coeffs(in_w, out_w, xofs, alpha); linear_coeffs(in_h, out_h, yofs, beta); @@ -521,15 +520,15 @@ int interp_run(struct tensor* output_tensor, struct tensor* input_tensor, struct sys_free(buf); } - else if (resize_type == 3) // bicubic + else if (resize_type == 3) // bicubic { - int* buf = ( int* )sys_malloc((out_w + out_h + out_w * 4 + out_h * 4) * sizeof(int)); + int* buf = (int*)sys_malloc((out_w + out_h + out_w * 4 + out_h * 4) * sizeof(int)); - int* xofs = buf; // new int[ow]; - int* yofs = buf + out_w; // new int[oh]; + int* xofs = buf; // new int[ow]; + int* yofs = buf + out_w; // new int[oh]; - float* alpha = ( float* )(buf + out_w + out_h); // new float[ow * 4]; - float* beta = ( float* )(buf + out_w + out_h + out_w * 4); // new float[oh * 4]; + float* alpha = (float*)(buf + out_w + out_h); // new float[ow * 4]; + float* beta = (float*)(buf + out_w + out_h + out_w * 4); // new float[oh * 4]; cubic_coeffs(in_w, out_w, xofs, alpha); cubic_coeffs(in_h, out_h, yofs, beta); diff --git a/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.h b/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.h index afcf57ede..66ec13fa7 100644 --- a/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.h +++ b/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.h @@ -29,7 +29,6 @@ #include "graph/tensor.h" - int interp_run(struct tensor* output_tensor, struct tensor* input_tensor, struct interp_param* interp_param, int num_thread); diff --git a/source/device/cpu/op/interp/interp_ref.c b/source/device/cpu/op/interp/interp_ref.c index d77c31c5e..791ae6df8 100644 --- a/source/device/cpu/op/interp/interp_ref.c +++ b/source/device/cpu/op/interp/interp_ref.c @@ -37,25 +37,24 @@ #include #include - #define INTERP_MIN(a, b) ((a) < (b) ? (a) : (b)) void linear_coeffs(int w, int outw, int* xofs, float* alpha) { - double scale = ( double )w / outw; + double scale = (double)w / outw; - for(int dx = 0; dx < outw; dx++) + for (int dx = 0; dx < outw; dx++) { - float fx = ( float )((dx) * scale); + float fx = (float)((dx)*scale); int sx = floor(fx); fx -= sx; - if(sx < 0) + if (sx < 0) { sx = 0; fx = 0.f; } - if(sx >= w - 1) + if (sx >= w - 1) { sx = w - 2; fx = 1.f; @@ -70,12 +69,12 @@ void linear_coeffs(int w, int outw, int* xofs, float* alpha) void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, float* beta, int* yofs, int out_h, int out_w, int in_h, int in_w) { - int w = out_w; //dst.w; - int h = out_h; //dst.h; + int w = out_w; //dst.w; + int h = out_h; //dst.h; // loop body - float* rowsbuf0 = ( float* )sys_malloc(w * sizeof(float)); - float* rowsbuf1 = ( float* )sys_malloc(w * sizeof(float)); + float* rowsbuf0 = (float*)sys_malloc(w * sizeof(float)); + float* rowsbuf1 = (float*)sys_malloc(w * sizeof(float)); float* rows0 = rowsbuf0; float* rows1 = rowsbuf1; @@ -83,7 +82,7 @@ void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, floa memset(rowsbuf1, 0, w * sizeof(float)); int prev_sy1 = -2; - for (int dy = 0; dy < h; dy++ ) + for (int dy = 0; dy < h; dy++) { int sy = yofs[dy]; @@ -97,7 +96,7 @@ void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, floa float* rows0_old = rows0; rows0 = rows1; rows1 = rows0_old; - const float* S1 = src + (sy+1)*in_w; //src.row(sy+1); + const float* S1 = src + (sy + 1) * in_w; //src.row(sy+1); const float* alphap = alpha; float* rows1p = rows1; @@ -109,7 +108,7 @@ void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, floa float a0 = alphap[0]; float a1 = alphap[1]; - rows1p[dx] = S1p[0]*a0 + S1p[1]*a1; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; alphap += 2; } @@ -117,8 +116,8 @@ void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, floa else { // hresize two rows - const float* S0 = src + sy*in_w; //src.row(sy); - const float* S1 = src + (sy+1)*in_w; //src.row(sy+1); + const float* S0 = src + sy * in_w; //src.row(sy); + const float* S1 = src + (sy + 1) * in_w; //src.row(sy+1); const float* alphap = alpha; float* rows0p = rows0; @@ -132,12 +131,11 @@ void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, floa float a0 = alphap[0]; float a1 = alphap[1]; - rows0p[dx] = S0p[0]*a0 + S0p[1]*a1; - rows1p[dx] = S1p[0]*a0 + S1p[1]*a1; + rows0p[dx] = S0p[0] * a0 + S0p[1] * a1; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; alphap += 2; } - } prev_sy1 = sy; @@ -210,7 +208,7 @@ int ref_interp_fp32(struct tensor* input_tensor, struct tensor* output_tensor, s int in_channel_size = in_h * in_w; int out_channel_size = out_h * out_w; - int* buf = (int*)sys_malloc((param->output_width + param->output_height + param->output_width*2 + param->output_height*2)*sizeof(float)); + int* buf = (int*)sys_malloc((param->output_width + param->output_height + param->output_width * 2 + param->output_height * 2) * sizeof(float)); if (buf == NULL) { @@ -218,18 +216,18 @@ int ref_interp_fp32(struct tensor* input_tensor, struct tensor* output_tensor, s return -1; } - int* xofs = buf;//new int[ow]; - int* yofs = buf + param->output_width ;//new int[oh]; + int* xofs = buf; //new int[ow]; + int* yofs = buf + param->output_width; //new int[oh]; - float* alpha = (float*)(buf + param->output_width + param->output_height);//new float[ow * 2]; - float* beta = (float*)(buf + param->output_width + param->output_height + param->output_width*2);//new float[oh * 2]; + float* alpha = (float*)(buf + param->output_width + param->output_height); //new float[ow * 2]; + float* beta = (float*)(buf + param->output_width + param->output_height + param->output_width * 2); //new float[oh * 2]; linear_coeffs(in_w, out_w, xofs, alpha); linear_coeffs(in_h, out_h, yofs, beta); for (int q = 0; q < channel; ++q) { - resize_bilinear_image(input+in_channel_size*q, output+out_channel_size*q, alpha, xofs, beta, yofs, out_h, out_w, in_h, in_w); + resize_bilinear_image(input + in_channel_size * q, output + out_channel_size * q, alpha, xofs, beta, yofs, out_h, out_w, in_h, in_w); } sys_free(buf); @@ -259,9 +257,9 @@ int ref_interp_uint8(struct tensor* input_tensor, struct tensor* output_tensor, float* input_fp32 = (float*)sys_malloc(input_total_size * sizeof(float)); float* output_fp32 = (float*)sys_malloc(output_total_size * sizeof(float)); - for(int i=0; ioutput_width + param->output_height + param->output_width*2 + param->output_height*2)*sizeof(float)); + int* buf = (int*)sys_malloc((param->output_width + param->output_height + param->output_width * 2 + param->output_height * 2) * sizeof(float)); if (buf == NULL) { @@ -312,18 +310,18 @@ int ref_interp_uint8(struct tensor* input_tensor, struct tensor* output_tensor, return -1; } - int* xofs = buf;//new int[ow]; - int* yofs = buf + param->output_width ;//new int[oh]; + int* xofs = buf; //new int[ow]; + int* yofs = buf + param->output_width; //new int[oh]; - float* alpha = (float*)(buf + param->output_width + param->output_height);//new float[ow * 2]; - float* beta = (float*)(buf + param->output_width + param->output_height + param->output_width*2);//new float[oh * 2]; + float* alpha = (float*)(buf + param->output_width + param->output_height); //new float[ow * 2]; + float* beta = (float*)(buf + param->output_width + param->output_height + param->output_width * 2); //new float[oh * 2]; linear_coeffs(in_w, out_w, xofs, alpha); linear_coeffs(in_h, out_h, yofs, beta); for (int q = 0; q < channel; ++q) { - resize_bilinear_image(input_fp32+in_channel_size*q, output_fp32+out_channel_size*q, alpha, xofs, beta, yofs, out_h, out_w, in_h, in_w); + resize_bilinear_image(input_fp32 + in_channel_size * q, output_fp32 + out_channel_size * q, alpha, xofs, beta, yofs, out_h, out_w, in_h, in_w); } sys_free(buf); @@ -335,7 +333,7 @@ int ref_interp_uint8(struct tensor* input_tensor, struct tensor* output_tensor, } /* quant */ - for(int i=0; i 255) @@ -372,7 +370,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct graph* graph = node->graph; struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct interp_param* param = ( struct interp_param* )node->op.param_mem; + struct interp_param* param = (struct interp_param*)node->op.param_mem; int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) @@ -381,7 +379,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex ret = ref_interp_uint8(input_tensor, output_tensor, param); else TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type); - + return ret; } diff --git a/source/device/cpu/op/l2normalization/l2normalization_ref.c b/source/device/cpu/op/l2normalization/l2normalization_ref.c index c10793290..ac368086a 100644 --- a/source/device/cpu/op/l2normalization/l2normalization_ref.c +++ b/source/device/cpu/op/l2normalization/l2normalization_ref.c @@ -34,17 +34,16 @@ #include - int ref_l2normalization_fp32(float* input_data, float* output_data, int size, int channel_size) { float sq_l2_norm = 0; - for(int j = 0; j < channel_size; j++) + for (int j = 0; j < channel_size; j++) { const float val = input_data[j]; sq_l2_norm += val * val; } const float l2_norm = sqrt(sq_l2_norm); - for(int j = 0; j < channel_size; j++) + for (int j = 0; j < channel_size; j++) { output_data[j] = input_data[j] / l2_norm; } @@ -74,7 +73,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int input_size = 1; int channel_size = input_tensor->dims[1]; - for(int i = 0; i < input_tensor->dim_num; i++){ + for (int i = 0; i < input_tensor->dim_num; i++) + { input_size *= input_tensor->dims[i]; } diff --git a/source/device/cpu/op/l2pool/l2pool_ref.c b/source/device/cpu/op/l2pool/l2pool_ref.c index 6cc5e96c3..5cf027d70 100644 --- a/source/device/cpu/op/l2pool/l2pool_ref.c +++ b/source/device/cpu/op/l2pool/l2pool_ref.c @@ -37,7 +37,6 @@ #include #include - struct ref_l2pool_param { int inc; @@ -53,7 +52,7 @@ struct ref_l2pool_param int pad_h; int pad_w; int inn; - float scale[2]; // scale[0]: input scale, scale[1]: output scale + float scale[2]; // scale[0]: input scale, scale[1]: output scale int zero_point[2]; // zero_point[0]: input zero_point, zero_point[1]: output zero_point }; #define L2POOL_MAX(a, b) ((a) < (b) ? (b) : (a)) @@ -61,8 +60,7 @@ struct ref_l2pool_param static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct ref_l2pool_param* l2pool_op_param = - (struct ref_l2pool_param*)sys_malloc(sizeof(struct ref_l2pool_param)); + struct ref_l2pool_param* l2pool_op_param = (struct ref_l2pool_param*)sys_malloc(sizeof(struct ref_l2pool_param)); memset(l2pool_op_param, 0, sizeof(struct ref_l2pool_param)); exec_node->ops_priv = l2pool_op_param; return 0; @@ -75,11 +73,11 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, } void run_l2pool(float* data, float* out_data, struct ref_l2pool_param* param) { - for(int c = 0; c < param->inc; c++) + for (int c = 0; c < param->inc; c++) { - for(int ph = 0; ph < param->outh; ph++) + for (int ph = 0; ph < param->outh; ph++) { - for(int pw = 0; pw < param->outw; pw++) + for (int pw = 0; pw < param->outw; pw++) { // int index = inc * (ph * outw + pw) + c; int index = param->inc * (ph * param->outw + pw) + c; @@ -95,9 +93,9 @@ void run_l2pool(float* data, float* out_data, struct ref_l2pool_param* param) float tmp = 0.0f; float val = 0.0f; - for(int h = h_start; h < h_end; h++) + for (int h = h_start; h < h_end; h++) { - for(int w = w_start; w < w_end; w++) + for (int w = w_start; w < w_end; w++) { // val = data[i*param->inh*param->inc * param->inw +h * param->inc * param->inw + w * param->inc // +c]; @@ -106,7 +104,7 @@ void run_l2pool(float* data, float* out_data, struct ref_l2pool_param* param) pool_size++; } } - if(tmp == 0) + if (tmp == 0) { out_data[index] = 0; } @@ -123,41 +121,37 @@ int ref_l2pool_fp32(float* data, float* out_data, struct ref_l2pool_param* param { int input_size = param->inc * param->inh * param->inw; int output_size = param->outh * param->outw * param->outc; - for(int i = 0; i < param->inn; i++) + for (int i = 0; i < param->inn; i++) { - run_l2pool(data + i * input_size, out_data + i * output_size,param); + run_l2pool(data + i * input_size, out_data + i * output_size, param); } return 0; } - -void ConvertPaddingStyleToParameters(int stride_h, int stride_w, - int in_height, int in_width, int filter_height, int filter_width, int paddingtype, - int out_height, int out_width, - int* padding_width, int* padding_height) +void ConvertPaddingStyleToParameters(int stride_h, int stride_w, + int in_height, int in_width, int filter_height, int filter_width, int paddingtype, + int out_height, int out_width, + int* padding_width, int* padding_height) { - if(paddingtype == 0 || paddingtype == 2) + if (paddingtype == 0 || paddingtype == 2) { *padding_width = 0; *padding_height = 0; } - else if(paddingtype == 1) + else if (paddingtype == 1) { *padding_width = (int)(((out_width - 1) * stride_w + filter_width - in_width) / 2); - *padding_height = (int)(((out_height - 1) * stride_h + filter_height - in_height)/2); + *padding_height = (int)(((out_height - 1) * stride_h + filter_height - in_height) / 2); } return; } - - static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; } - static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -180,10 +174,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int output_c = output_tensor->dims[1]; int padding_w = 0; int padding_h = 0; - + ConvertPaddingStyleToParameters(l2pool_param_op->stride_h, l2pool_param_op->stride_w, input_h, input_w, - l2pool_param_op->kernel_h, l2pool_param_op->kernel_w, l2pool_param_op->paddingType, - output_h, output_w, &padding_w, &padding_h); + l2pool_param_op->kernel_h, l2pool_param_op->kernel_w, l2pool_param_op->paddingType, + output_h, output_w, &padding_w, &padding_h); op_param->inc = input_c; op_param->inh = input_h; diff --git a/source/device/cpu/op/logical/logical_ref.c b/source/device/cpu/op/logical/logical_ref.c index 94cdb8bdd..aef2ad3f7 100644 --- a/source/device/cpu/op/logical/logical_ref.c +++ b/source/device/cpu/op/logical/logical_ref.c @@ -36,7 +36,6 @@ #include - struct logical_param_ref { int type; @@ -118,19 +117,19 @@ static int ref_logical_fp32(float* input0, float* input1, float* output, struct switch (param->type) { - case 0: // LogicalAnd - { - logical_and(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, output); - break; - } - case 1: // LogicalOr - { - logical_or(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, output); - break; - } - default: - return -1; - ; + case 0: // LogicalAnd + { + logical_and(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, output); + break; + } + case 1: // LogicalOr + { + logical_or(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, output); + break; + } + default: + return -1; + ; } return 0; } @@ -169,7 +168,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor1 = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct logical_param* logical_param = ( struct logical_param* )ir_node->op.param_mem; + struct logical_param* logical_param = (struct logical_param*)ir_node->op.param_mem; struct logical_param_ref logical_param_ref; logical_param_ref.shape0[0] = 1; @@ -182,28 +181,28 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex logical_param_ref.shape1[2] = 1; logical_param_ref.shape1[3] = 1; - if (input_tensor0->dims[0] !=0) + if (input_tensor0->dims[0] != 0) logical_param_ref.shape0[0] = input_tensor0->dims[0]; - if (input_tensor0->dims[1] !=0) + if (input_tensor0->dims[1] != 0) logical_param_ref.shape0[1] = input_tensor0->dims[1]; - if (input_tensor0->dims[2] !=0) + if (input_tensor0->dims[2] != 0) logical_param_ref.shape0[2] = input_tensor0->dims[2]; - if (input_tensor0->dims[3] !=0) + if (input_tensor0->dims[3] != 0) logical_param_ref.shape0[3] = input_tensor0->dims[3]; - if (input_tensor1->dims[0] !=0) + if (input_tensor1->dims[0] != 0) logical_param_ref.shape1[0] = input_tensor1->dims[0]; - if (input_tensor1->dims[1] !=0) + if (input_tensor1->dims[1] != 0) logical_param_ref.shape1[1] = input_tensor1->dims[1]; - if (input_tensor1->dims[2] !=0) + if (input_tensor1->dims[2] != 0) logical_param_ref.shape1[2] = input_tensor1->dims[2]; - if (input_tensor1->dims[3] !=0) + if (input_tensor1->dims[3] != 0) logical_param_ref.shape1[3] = input_tensor1->dims[3]; logical_param_ref.type = logical_param->type; - int ret = ref_logical_fp32((float*)input_tensor0->data, (float*)input_tensor1->data, - (float*)output_tensor->data, &logical_param_ref, exec_graph->num_thread); + int ret = ref_logical_fp32((float*)input_tensor0->data, (float*)input_tensor1->data, + (float*)output_tensor->data, &logical_param_ref, exec_graph->num_thread); if (ret != 0) return -1; diff --git a/source/device/cpu/op/logistic/logistic_ref.c b/source/device/cpu/op/logistic/logistic_ref.c index 4d363ed1a..807ff90d9 100644 --- a/source/device/cpu/op/logistic/logistic_ref.c +++ b/source/device/cpu/op/logistic/logistic_ref.c @@ -34,12 +34,11 @@ #include - struct logical_param { int out_size; float scale[2]; // scale[0]: input scale, scale[1]: output scale - int zero_point[2]; // zero_point[0]: input zero_point, zero_point[1]: output zero_point + int zero_point[2]; // zero_point[0]: input zero_point, zero_point[1]: output zero_point }; static int ref_logistic_fp32(float* input_data, float* output_data, struct logical_param* op_param) @@ -58,9 +57,7 @@ static int ref_logistic_uint8(uint8_t* input, uint8_t* output, struct logical_pa for (int i = 0; i < op_param->out_size; i++) { /* get max */ - output[i] = - (1.f / (1.f + exp(-(input[i] - (double )op_param->zero_point[0]) * op_param->scale[0]))) / op_param->scale[1] + - op_param->zero_point[1]; + output[i] = (1.f / (1.f + exp(-(input[i] - (double)op_param->zero_point[0]) * op_param->scale[0]))) / op_param->scale[1] + op_param->zero_point[1]; } return 0; diff --git a/source/device/cpu/op/logsoftmax/logsoftmax_ref.c b/source/device/cpu/op/logsoftmax/logsoftmax_ref.c index 3558efe6b..2af74c63d 100644 --- a/source/device/cpu/op/logsoftmax/logsoftmax_ref.c +++ b/source/device/cpu/op/logsoftmax/logsoftmax_ref.c @@ -37,35 +37,33 @@ #include #include - struct ref_logsoftmax_param { int axis; int in_size; int on_size; int out_size; - float scale[2]; // scale[0]: input scale, scale[1]: output scale + float scale[2]; // scale[0]: input scale, scale[1]: output scale int zero_point[2]; // zero_point[0]: input zero_point, zero_point[1]: output zero_point }; static void GetMaxArray(float* input, float* array, int in_size, int on_size) { - float* input_ptr = ( float* )input; - float* array_ptr = ( float* )array; + float* input_ptr = (float*)input; + float* array_ptr = (float*)array; memset(array, 0, in_size * sizeof(float)); - for(int j = 0; j < on_size; j++) - for(int l = 0; l < in_size; l++) + for (int j = 0; j < on_size; j++) + for (int l = 0; l < in_size; l++) { - if(array_ptr[l] < input_ptr[j * in_size + l]) + if (array_ptr[l] < input_ptr[j * in_size + l]) array_ptr[l] = input_ptr[j * in_size + l]; } } static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct ref_logsoftmax_param* logsoftmax_op_param = - (struct ref_logsoftmax_param*)sys_malloc(sizeof(struct ref_logsoftmax_param)); + struct ref_logsoftmax_param* logsoftmax_op_param = (struct ref_logsoftmax_param*)sys_malloc(sizeof(struct ref_logsoftmax_param)); memset(logsoftmax_op_param, 0, sizeof(struct ref_logsoftmax_param)); exec_node->ops_priv = logsoftmax_op_param; return 0; @@ -79,17 +77,17 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, static void GetOutResult(float* input, float* output, float* array, float* sum_array, int in_size, int on_size) { - float* input_ptr = ( float* )input; - float* output_ptr = ( float* )output; - float* array_ptr = ( float* )array; - float* sum_array_ptr = ( float* )sum_array; + float* input_ptr = (float*)input; + float* output_ptr = (float*)output; + float* array_ptr = (float*)array; + float* sum_array_ptr = (float*)sum_array; memset(sum_array, 0x0, in_size * sizeof(float)); /* get the exp and the summary */ - for(int j = 0; j < on_size; j++) - for(int l = 0; l < in_size; l++) + for (int j = 0; j < on_size; j++) + for (int l = 0; l < in_size; l++) { int index = j * in_size + l; output_ptr[index] = exp(input_ptr[index] - array_ptr[l]); @@ -97,18 +95,18 @@ static void GetOutResult(float* input, float* output, float* array, float* sum_a } /* the final result */ - for(int j = 0; j < on_size; j++) - for(int l = 0; l < in_size; l++) + for (int j = 0; j < on_size; j++) + for (int l = 0; l < in_size; l++) { int index = j * in_size + l; output_ptr[index] /= sum_array_ptr[l]; - output_ptr[index]=log(output_ptr[index]); + output_ptr[index] = log(output_ptr[index]); } } static int ref_logsoftmax_fp32(float* input_data, float* output_data, float* max_array, float* sum_array, struct ref_logsoftmax_param* op_param) { - for(int i = 0; i < op_param->out_size; i++) + for (int i = 0; i < op_param->out_size; i++) { int img_base = i * op_param->in_size * op_param->on_size; GetMaxArray(input_data + img_base, max_array, op_param->in_size, op_param->on_size); @@ -149,27 +147,27 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex // int axis = param_->axis; int out_size = 1; - for(int i = 0; i < axis; i++) + for (int i = 0; i < axis; i++) { out_size *= dims[i]; } int in_size = 1; - for(size_t i = axis + 1; i < input_tensor->dim_num; i++) + for (size_t i = axis + 1; i < input_tensor->dim_num; i++) { in_size *= dims[i]; } int on_size = dims[axis]; - max_array = ( float* )sys_malloc(in_size * sizeof(float)); - sum_array = ( float* )sys_malloc(in_size * sizeof(float)); + max_array = (float*)sys_malloc(in_size * sizeof(float)); + sum_array = (float*)sys_malloc(in_size * sizeof(float)); ref_logsoftmax_param.in_size = in_size; ref_logsoftmax_param.on_size = on_size; if (input_tensor->data_type == TENGINE_DT_FP32) - ref_logsoftmax_fp32((float*)input_tensor->data, (float*)output_tensor->data,max_array,sum_array, &ref_logsoftmax_param); + ref_logsoftmax_fp32((float*)input_tensor->data, (float*)output_tensor->data, max_array, sum_array, &ref_logsoftmax_param); // else - // ref_logistic_uint8(input_tensor->data, output_tensor->data, &logical_param); + // ref_logistic_uint8(input_tensor->data, output_tensor->data, &logical_param); return 0; } diff --git a/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c b/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c index f28faca6e..fc883f9f2 100644 --- a/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c +++ b/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c @@ -34,7 +34,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -59,7 +58,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct lrn_param* lrn_param = ( struct lrn_param* )ir_node->op.param_mem; + struct lrn_param* lrn_param = (struct lrn_param*)ir_node->op.param_mem; int ret = lrn_run(output_tensor, input_tensor, lrn_param, exec_graph->num_thread); if (ret != 0) diff --git a/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.c b/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.c index e2606992d..a86e6571a 100644 --- a/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.c +++ b/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.c @@ -38,7 +38,6 @@ #include - #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -126,8 +125,8 @@ static inline float32x4_t vtaylor_polyq_f32(float32x4_t x, struct tab* coeffs) static inline float32x4_t vexpq_f32(float32x4_t x) { - const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2) - const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2) + const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2) + const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2) const float32x4_t CONST_0 = vdupq_n_f32(0.f); const int32x4_t CONST_NEGATIVE_126 = vdupq_n_s32(-126); @@ -147,8 +146,8 @@ static inline float32x4_t vexpq_f32(float32x4_t x) static inline float32x4_t vlogq_f32(float32x4_t x) { - const int32x4_t CONST_127 = vdupq_n_s32(127); // 127 - const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2) + const int32x4_t CONST_127 = vdupq_n_s32(127); // 127 + const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2) // Extract exponent int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127); @@ -186,7 +185,7 @@ static inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n) static void lrn_kernel(int i, int id, void* data, const float* input, float* output, float* square, float alpha, float beta, float bias, int local_size, int channel_size, int channel_num, int num_thread) { - int step = (( int* )data)[0]; + int step = ((int*)data)[0]; const float32x4_t alpha_vec = vdupq_n_f32(alpha / local_size); const float32x4_t beta_vec = vdupq_n_f32(beta); const float32x4_t bias_vec = vdupq_n_f32(bias); @@ -238,9 +237,9 @@ int lrn_run(struct tensor* output_tensor, struct tensor* input_tensor, struct lr int num_thread) { init_tab(); - const float* input = ( float* )input_tensor->data; - float* output = ( float* )output_tensor->data; - float* square = ( float* )(malloc(input_tensor->elem_num * sizeof(float))); + const float* input = (float*)input_tensor->data; + float* output = (float*)output_tensor->data; + float* square = (float*)(malloc(input_tensor->elem_num * sizeof(float))); int n = input_tensor->dims[0]; int c = input_tensor->dims[1]; diff --git a/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.h b/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.h index 45330b725..f4c1e20ae 100644 --- a/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.h +++ b/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.h @@ -31,7 +31,6 @@ #include - struct tab { float32x4_t a0; diff --git a/source/device/cpu/op/lrn/lrn_ref.c b/source/device/cpu/op/lrn/lrn_ref.c index d7026e19d..ff71d6903 100644 --- a/source/device/cpu/op/lrn/lrn_ref.c +++ b/source/device/cpu/op/lrn/lrn_ref.c @@ -37,7 +37,6 @@ #include #include - static int ref_lrn_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct lrn_param* param, int num_thread) { @@ -57,8 +56,8 @@ static int ref_lrn_fp32(struct tensor* input_tensor, struct tensor* output_tenso float* in_data = (float*)input_tensor->data; float* out_data = (float*)output_tensor->data; - float* square = ( float* )(malloc(img_size * sizeof(float))); - float* accum_square = ( float* )(malloc(channel_size * sizeof(float))); + float* square = (float*)(malloc(img_size * sizeof(float))); + float* accum_square = (float*)(malloc(channel_size * sizeof(float))); for (int i = 0; i < n; i++) { @@ -130,7 +129,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct lrn_param* lrn_param = ( struct lrn_param* )ir_node->op.param_mem; + struct lrn_param* lrn_param = (struct lrn_param*)ir_node->op.param_mem; ref_lrn_fp32(input_tensor, output_tensor, lrn_param, exec_graph->num_thread); diff --git a/source/device/cpu/op/lstm/lstm_ref.c b/source/device/cpu/op/lstm/lstm_ref.c index 73849b33c..0367e9f56 100644 --- a/source/device/cpu/op/lstm/lstm_ref.c +++ b/source/device/cpu/op/lstm/lstm_ref.c @@ -56,23 +56,23 @@ int ref_lstm_default_fp32(struct tensor* input_tensor, struct tensor* w, struct float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); float* output_c_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - memset(init_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); - memset(init_c_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); - memset(output_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); - memset(output_c_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); - + memset(init_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + memset(init_c_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + memset(output_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + memset(output_c_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + float* output_data = (float*)output_tensor->data; int T = input_tensor->dims[1]; int size = input_tensor->dims[2]; - float* i_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* f_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* o_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* g_flag = ( float* )malloc(hidden_size * sizeof(float)); + float* i_flag = (float*)malloc(hidden_size * sizeof(float)); + float* f_flag = (float*)malloc(hidden_size * sizeof(float)); + float* o_flag = (float*)malloc(hidden_size * sizeof(float)); + float* g_flag = (float*)malloc(hidden_size * sizeof(float)); - for(int seq = 0; seq < input_tensor->dims[0]; seq++) + for (int seq = 0; seq < input_tensor->dims[0]; seq++) { - for(int i = 0; i < T; i++) + for (int i = 0; i < T; i++) { for (int q = 0; q < hidden_size; q++) { @@ -92,7 +92,7 @@ int ref_lstm_default_fp32(struct tensor* input_tensor, struct tensor* w, struct for (int h = 0; h < hidden_size; h++) { - if(seq == 0) + if (seq == 0) { float h_i = init_h_data[h + i * hidden_size]; I += h_i * (r_data[(hidden_size * 0 + q) * hidden_size + h]); @@ -118,14 +118,14 @@ int ref_lstm_default_fp32(struct tensor* input_tensor, struct tensor* w, struct for (int c = 0; c < hidden_size; c++) { - if( seq == 0) + if (seq == 0) { float I = 1.f / (1.f + exp(-i_flag[c])); float F = 1.f / (1.f + exp(-f_flag[c])); float G = tanh(g_flag[c]); float c_i = init_c_data[c + i * hidden_size]; float cell2 = F * c_i + I * G; - float O = 1.f/(1.f + exp(-o_flag[c])); + float O = 1.f / (1.f + exp(-o_flag[c])); float tmp = tanh(cell2); float H = O * tmp; output_c_data[i * hidden_size + c] = cell2; @@ -139,7 +139,7 @@ int ref_lstm_default_fp32(struct tensor* input_tensor, struct tensor* w, struct float G = tanh(g_flag[c]); float c_i = output_c_data[c + i * hidden_size]; float cell2 = F * c_i + I * G; - float O = 1.f/(1.f + exp(-o_flag[c])); + float O = 1.f / (1.f + exp(-o_flag[c])); float H = O * tanh(cell2); output_c_data[i * hidden_size + c] = cell2; output_h_data[i * hidden_size + c] = H; @@ -177,24 +177,24 @@ int ref_lstm_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struc float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); float* output_c_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - memset(init_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); - memset(init_c_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); - memset(output_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); - memset(output_c_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); + memset(init_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + memset(init_c_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + memset(output_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + memset(output_c_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); float* output_data = (float*)output_tensor->data; int T = input_tensor->dims[1]; int size = input_tensor->dims[2]; - float* i_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* f_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* o_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* g_flag = ( float* )malloc(hidden_size * sizeof(float)); + float* i_flag = (float*)malloc(hidden_size * sizeof(float)); + float* f_flag = (float*)malloc(hidden_size * sizeof(float)); + float* o_flag = (float*)malloc(hidden_size * sizeof(float)); + float* g_flag = (float*)malloc(hidden_size * sizeof(float)); - for(int seq = 0; seq < input_tensor->dims[0]; seq++) + for (int seq = 0; seq < input_tensor->dims[0]; seq++) { - for(int i = 0; i < T; i++) + for (int i = 0; i < T; i++) { for (int q = 0; q < hidden_size; q++) { @@ -217,7 +217,7 @@ int ref_lstm_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struc G += b_data[hidden_size * 3 + q]; for (int h = 0; h < hidden_size; h++) { - if(seq == 0) + if (seq == 0) { float h_i = init_h_data[h + i * hidden_size]; I += h_i * (r_data[(hidden_size * 0 + q) * hidden_size + h]); @@ -247,14 +247,14 @@ int ref_lstm_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struc for (int c = 0; c < hidden_size; c++) { - if( seq == 0) + if (seq == 0) { float I = 1.f / (1.f + exp(-i_flag[c])); float F = 1.f / (1.f + exp(-f_flag[c])); float G = tanh(g_flag[c]); float c_i = init_c_data[c + i * hidden_size]; float cell2 = F * c_i + I * G; - float O = 1.f/(1.f + exp(-o_flag[c])); + float O = 1.f / (1.f + exp(-o_flag[c])); float tmp = tanh(cell2); float H = O * tmp; output_c_data[i * hidden_size + c] = cell2; @@ -268,7 +268,7 @@ int ref_lstm_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struc float G = tanh(g_flag[c]); float c_i = output_c_data[c + i * hidden_size]; float cell2 = F * c_i + I * G; - float O = 1.f/(1.f + exp(-o_flag[c])); + float O = 1.f / (1.f + exp(-o_flag[c])); float H = O * tanh(cell2); output_c_data[i * hidden_size + c] = cell2; output_h_data[i * hidden_size + c] = H; @@ -301,10 +301,10 @@ int ref_lstm_with_bias_case1_fp32(struct tensor* input_tensor, struct tensor* w, float* b_data = (float*)b->data; /* initial h, initial c buffers */ - float* init_h_data = ( float* )malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - float* init_c_data = ( float* )malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - float* output_h_data = ( float* )malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - float* output_c_data = ( float* )malloc((unsigned long)hidden_size * batch_size * sizeof(float)); + float* init_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); + float* init_c_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); + float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); + float* output_c_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); memset(init_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); memset(init_c_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); @@ -313,16 +313,16 @@ int ref_lstm_with_bias_case1_fp32(struct tensor* input_tensor, struct tensor* w, float* output_data = (float*)output_tensor->data; - float* i_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* f_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* o_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* g_flag = ( float* )malloc(hidden_size * sizeof(float)); + float* i_flag = (float*)malloc(hidden_size * sizeof(float)); + float* f_flag = (float*)malloc(hidden_size * sizeof(float)); + float* o_flag = (float*)malloc(hidden_size * sizeof(float)); + float* g_flag = (float*)malloc(hidden_size * sizeof(float)); - for (int seq = 0; seq < sequence_size; seq++) // sequence + for (int seq = 0; seq < sequence_size; seq++) // sequence { - for (int i = 0; i < batch_size; i++) // batch + for (int i = 0; i < batch_size; i++) // batch { - for (int q = 0; q < hidden_size; q++) // hidden + for (int q = 0; q < hidden_size; q++) // hidden { float I = 0; float F = 0; @@ -330,7 +330,7 @@ int ref_lstm_with_bias_case1_fp32(struct tensor* input_tensor, struct tensor* w, float G = 0; /* input fc */ - for (int m = 0; m < size; m++) // internal size, the same as four fc implement + for (int m = 0; m < size; m++) // internal size, the same as four fc implement { int index = seq * (batch_size * size) + i * size + m; float i_data = x_data[index]; @@ -420,9 +420,9 @@ int ref_lstm_with_bias_case1_fp32(struct tensor* input_tensor, struct tensor* w, return 0; } -int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w, struct tensor* r, - struct tensor* b, struct tensor* sequence_lens, struct tensor* init_h, struct tensor* init_c, struct tensor* p, - struct tensor* output_tensor, struct lstm_param* param) +int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w, struct tensor* r, + struct tensor* b, struct tensor* sequence_lens, struct tensor* init_h, struct tensor* init_c, struct tensor* p, + struct tensor* output_tensor, struct lstm_param* param) { int batch_size = input_tensor->dims[1]; int hidden_size = param->hidden_size; @@ -434,25 +434,25 @@ int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w, float* init_h_data = (float*)init_h->data; float* init_c_data = (float*)init_c->data; float* p_data = (float*)p->data; - + float* output_data = (float*)output_tensor->data; float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); float* output_c_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float)); - memset(output_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); - memset(output_c_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float)); + memset(output_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); + memset(output_c_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float)); int T = input_tensor->dims[1]; int size = input_tensor->dims[2]; - float* i_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* f_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* o_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* g_flag = ( float* )malloc(hidden_size * sizeof(float)); + float* i_flag = (float*)malloc(hidden_size * sizeof(float)); + float* f_flag = (float*)malloc(hidden_size * sizeof(float)); + float* o_flag = (float*)malloc(hidden_size * sizeof(float)); + float* g_flag = (float*)malloc(hidden_size * sizeof(float)); - for(int seq = 0; seq < input_tensor->dims[0]; seq++) + for (int seq = 0; seq < input_tensor->dims[0]; seq++) { - for(int i = 0; i < T; i++) + for (int i = 0; i < T; i++) { for (int q = 0; q < hidden_size; q++) { @@ -475,7 +475,7 @@ int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w, G += b_data[hidden_size * 3 + q]; for (int h = 0; h < hidden_size; h++) { - if(seq == 0) + if (seq == 0) { float h_i = init_h_data[h + i * hidden_size]; I += h_i * (r_data[(hidden_size * 0 + q) * hidden_size + h]); @@ -505,14 +505,14 @@ int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w, for (int c = 0; c < hidden_size; c++) { - if( seq == 0) + if (seq == 0) { float I = 1.f / (1.f + exp(-i_flag[c])); float F = 1.f / (1.f + exp(-f_flag[c])); float G = tanh(g_flag[c]); float c_i = init_c_data[c + i * hidden_size]; float cell2 = F * c_i + I * G; - float O = 1.f/(1.f + exp(-(o_flag[c] + p_data[0 * hidden_size + c] * cell2))); + float O = 1.f / (1.f + exp(-(o_flag[c] + p_data[0 * hidden_size + c] * cell2))); float tmp = tanh(cell2); float H = O * tmp; output_c_data[i * hidden_size + c] = cell2; @@ -526,7 +526,7 @@ int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w, float G = tanh(g_flag[c]); float c_i = output_c_data[c + i * hidden_size]; float cell2 = F * c_i + I * G; - float O = 1.f/(1.f + exp(-(o_flag[c] + p_data[2 * hidden_size + c] * cell2))); + float O = 1.f / (1.f + exp(-(o_flag[c] + p_data[2 * hidden_size + c] * cell2))); float H = O * tanh(cell2); output_c_data[i * hidden_size + c] = cell2; output_h_data[i * hidden_size + c] = H; @@ -542,8 +542,8 @@ int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w, free(f_flag); free(o_flag); free(g_flag); - - return 0; + + return 0; } int ref_lstm_with_bias_bidirection_fp32(struct tensor* input_tensor, struct tensor* w, struct tensor* r, struct tensor* b, struct tensor* output_tensor, struct lstm_param* param) @@ -570,16 +570,16 @@ int ref_lstm_with_bias_bidirection_fp32(struct tensor* input_tensor, struct tens int size = input_tensor->dims[2]; int direct_num = input_tensor->dims[0]; - float* i_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* f_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* o_flag = ( float* )malloc(hidden_size * sizeof(float)); - float* g_flag = ( float* )malloc(hidden_size * sizeof(float)); + float* i_flag = (float*)malloc(hidden_size * sizeof(float)); + float* f_flag = (float*)malloc(hidden_size * sizeof(float)); + float* o_flag = (float*)malloc(hidden_size * sizeof(float)); + float* g_flag = (float*)malloc(hidden_size * sizeof(float)); - for(int seq = 0; seq < input_tensor->dims[0]; seq++) + for (int seq = 0; seq < input_tensor->dims[0]; seq++) { - for(int i = 0; i < T; i++) + for (int i = 0; i < T; i++) { - for(int d = 0; d < direct_num; d++) + for (int d = 0; d < direct_num; d++) { for (int q = 0; q < hidden_size; q++) { @@ -602,7 +602,7 @@ int ref_lstm_with_bias_bidirection_fp32(struct tensor* input_tensor, struct tens G += b_data[d * hidden_size * 4 * 2 + hidden_size * 3 + q]; for (int h = 0; h < hidden_size; h++) { - if(seq == 0) + if (seq == 0) { float h_i = init_h_data[d * input_tensor->dims[1] * hidden_size + h + i * hidden_size]; I += h_i * (r_data[d * hidden_size * hidden_size * 4 + (hidden_size * 0 + q) * hidden_size + h]); @@ -631,14 +631,14 @@ int ref_lstm_with_bias_bidirection_fp32(struct tensor* input_tensor, struct tens } for (int c = 0; c < hidden_size; c++) { - if( seq == 0) + if (seq == 0) { float I = 1.f / (1.f + exp(-i_flag[c])); float F = 1.f / (1.f + exp(-f_flag[c])); float G = tanh(g_flag[c]); float c_i = init_c_data[d * hidden_size * input_tensor->dims[2] + c + i * hidden_size]; float cell2 = F * c_i + I * G; - float O = 1.f/(1.f + exp(-o_flag[c])); + float O = 1.f / (1.f + exp(-o_flag[c])); float tmp = tanh(cell2); float H = O * tmp; output_c_data[d * hidden_size * input_tensor->dims[2] + i * hidden_size + c] = cell2; @@ -652,7 +652,7 @@ int ref_lstm_with_bias_bidirection_fp32(struct tensor* input_tensor, struct tens float G = tanh(g_flag[c]); float c_i = output_c_data[d * hidden_size * input_tensor->dims[2] + c + i * hidden_size]; float cell2 = F * c_i + I * G; - float O = 1.f/(1.f + exp(-o_flag[c])); + float O = 1.f / (1.f + exp(-o_flag[c])); float H = O * tanh(cell2); output_c_data[d * hidden_size * input_tensor->dims[2] + i * hidden_size + c] = cell2; output_h_data[d * hidden_size * input_tensor->dims[2] + i * hidden_size + c] = H; @@ -692,14 +692,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* w = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* r = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - + struct tensor* b = NULL; struct tensor* sequence_lens = NULL; struct tensor* init_h = NULL; struct tensor* init_c = NULL; struct tensor* p = NULL; - lstm_param_t* param = ( struct lstm_param* )(ir_node->op.param_mem); + lstm_param_t* param = (struct lstm_param*)(ir_node->op.param_mem); /* only support one way */ if (w->dim_num == 4 && w->dims[0] == 2) @@ -745,7 +745,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc struct graph* ir_graph = node->graph; struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); - struct lstm_param* lstm_param = ( struct lstm_param* )(node->op.param_mem); + struct lstm_param* lstm_param = (struct lstm_param*)(node->op.param_mem); int batch_size = input->dims[1]; if (lstm_param->mxnet_flag == 0) diff --git a/source/device/cpu/op/matmul/matmul_ref.c b/source/device/cpu/op/matmul/matmul_ref.c index e52961f3f..4f0000547 100644 --- a/source/device/cpu/op/matmul/matmul_ref.c +++ b/source/device/cpu/op/matmul/matmul_ref.c @@ -34,7 +34,6 @@ #include - struct ref_matmul_data { int batch; @@ -117,7 +116,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex else if (dim_size == 2) { param.batch = 1; - param.c = 1; // input0->Getse().Shape(0); + param.c = 1; // input0->Getse().Shape(0); param.m = input_tensor->dims[0]; param.n = input_tensor1->dims[1]; param.k = input_tensor->dims[1]; diff --git a/source/device/cpu/op/maximum/maximum_ref.c b/source/device/cpu/op/maximum/maximum_ref.c index 23c02aca5..ecb34f774 100644 --- a/source/device/cpu/op/maximum/maximum_ref.c +++ b/source/device/cpu/op/maximum/maximum_ref.c @@ -34,7 +34,6 @@ #include - struct maximum_op_param { int in_num; @@ -60,7 +59,7 @@ static int ref_maximum_fp32(const float** in_data, float* out_data, int size, co static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct maximum_op_param* maximum_op_param = ( struct maximum_op_param* )sys_malloc(sizeof(struct maximum_op_param)); + struct maximum_op_param* maximum_op_param = (struct maximum_op_param*)sys_malloc(sizeof(struct maximum_op_param)); exec_node->ops_priv = maximum_op_param; return 0; @@ -76,12 +75,12 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct { struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; - struct maximum_op_param* maximum_op_param = ( struct maximum_op_param* )exec_node->ops_priv; + struct maximum_op_param* maximum_op_param = (struct maximum_op_param*)exec_node->ops_priv; int in_num = ir_node->input_num; maximum_op_param->in_num = in_num; - maximum_op_param->input_data = ( void** )sys_malloc(sizeof(void*) * in_num); + maximum_op_param->input_data = (void**)sys_malloc(sizeof(void*) * in_num); return 0; } @@ -94,7 +93,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); uint32_t elem_num = input_tensor_a->elem_num; - struct maximum_op_param* maximum_op_param = ( struct maximum_op_param* )exec_node->ops_priv; + struct maximum_op_param* maximum_op_param = (struct maximum_op_param*)exec_node->ops_priv; for (int i = 0; i < maximum_op_param->in_num; i++) { struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[i]); @@ -102,17 +101,17 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex maximum_op_param->input_data[i] = data; } - const void** input = ( const void** )maximum_op_param->input_data; + const void** input = (const void**)maximum_op_param->input_data; float* output = (float*)output_tensor->data; - ref_maximum_fp32(( const float** )input, output, elem_num, maximum_op_param); + ref_maximum_fp32((const float**)input, output, elem_num, maximum_op_param); return 0; } static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct maximum_op_param* maximum_op_param = ( struct maximum_op_param* )exec_node->ops_priv; + struct maximum_op_param* maximum_op_param = (struct maximum_op_param*)exec_node->ops_priv; sys_free(maximum_op_param->input_data); diff --git a/source/device/cpu/op/mean/mean_ref.c b/source/device/cpu/op/mean/mean_ref.c index ed7bfc346..1ccd4697b 100644 --- a/source/device/cpu/op/mean/mean_ref.c +++ b/source/device/cpu/op/mean/mean_ref.c @@ -34,7 +34,6 @@ #include - struct mean_op_param { int in_num; @@ -52,14 +51,14 @@ static int ref_mean_fp32(const float** in_data, float* out_data, int size, const const float* data = in_data[n]; sum += data[i]; } - out_data[i] = sum / ( float )in_num; + out_data[i] = sum / (float)in_num; } return 0; } static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct mean_op_param* mean_op_param = ( struct mean_op_param* )sys_malloc(sizeof(struct mean_op_param)); + struct mean_op_param* mean_op_param = (struct mean_op_param*)sys_malloc(sizeof(struct mean_op_param)); exec_node->ops_priv = mean_op_param; return 0; } @@ -74,12 +73,12 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct { struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; - struct mean_op_param* mean_op_param = ( struct mean_op_param* )exec_node->ops_priv; + struct mean_op_param* mean_op_param = (struct mean_op_param*)exec_node->ops_priv; int in_num = ir_node->input_num; mean_op_param->in_num = in_num; - mean_op_param->input_data = ( void** )sys_malloc(sizeof(void*) * in_num); + mean_op_param->input_data = (void**)sys_malloc(sizeof(void*) * in_num); return 0; } @@ -92,7 +91,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); uint32_t elem_num = input_tensor_a->elem_num; - struct mean_op_param* mean_op_param = ( struct mean_op_param* )exec_node->ops_priv; + struct mean_op_param* mean_op_param = (struct mean_op_param*)exec_node->ops_priv; for (int i = 0; i < mean_op_param->in_num; i++) { struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[i]); @@ -100,17 +99,17 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex mean_op_param->input_data[i] = data; } - const void** input = ( const void** )mean_op_param->input_data; + const void** input = (const void**)mean_op_param->input_data; float* output = (float*)output_tensor->data; - ref_mean_fp32(( const float** )input, output, elem_num, mean_op_param); + ref_mean_fp32((const float**)input, output, elem_num, mean_op_param); return 0; } static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct mean_op_param* mean_op_param = ( struct mean_op_param* )exec_node->ops_priv; + struct mean_op_param* mean_op_param = (struct mean_op_param*)exec_node->ops_priv; sys_free(mean_op_param->input_data); diff --git a/source/device/cpu/op/minimum/minimum_ref.c b/source/device/cpu/op/minimum/minimum_ref.c index 076cf851e..19319eb2f 100644 --- a/source/device/cpu/op/minimum/minimum_ref.c +++ b/source/device/cpu/op/minimum/minimum_ref.c @@ -34,7 +34,6 @@ #include - struct minimum_op_param { int in_num; @@ -60,7 +59,7 @@ static int ref_minimum_fp32(const float** in_data, float* out_data, int size, co static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct minimum_op_param* minimum_op_param = ( struct minimum_op_param* )sys_malloc(sizeof(struct minimum_op_param)); + struct minimum_op_param* minimum_op_param = (struct minimum_op_param*)sys_malloc(sizeof(struct minimum_op_param)); exec_node->ops_priv = minimum_op_param; return 0; } @@ -75,12 +74,12 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct { struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; - struct minimum_op_param* minimum_op_param = ( struct minimum_op_param* )exec_node->ops_priv; + struct minimum_op_param* minimum_op_param = (struct minimum_op_param*)exec_node->ops_priv; int in_num = ir_node->input_num; minimum_op_param->in_num = in_num; - minimum_op_param->input_data = ( void** )sys_malloc(sizeof(void*) * in_num); + minimum_op_param->input_data = (void**)sys_malloc(sizeof(void*) * in_num); return 0; } @@ -93,7 +92,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); uint32_t elem_num = input_tensor_a->elem_num; - struct minimum_op_param* minimum_op_param = ( struct minimum_op_param* )exec_node->ops_priv; + struct minimum_op_param* minimum_op_param = (struct minimum_op_param*)exec_node->ops_priv; for (int i = 0; i < minimum_op_param->in_num; i++) { struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[i]); @@ -101,17 +100,17 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex minimum_op_param->input_data[i] = data; } - const void** input = ( const void** )minimum_op_param->input_data; + const void** input = (const void**)minimum_op_param->input_data; float* output = (float*)output_tensor->data; - ref_minimum_fp32(( const float** )input, output, elem_num, minimum_op_param); + ref_minimum_fp32((const float**)input, output, elem_num, minimum_op_param); return 0; } static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct minimum_op_param* minimum_op_param = ( struct minimum_op_param* )exec_node->ops_priv; + struct minimum_op_param* minimum_op_param = (struct minimum_op_param*)exec_node->ops_priv; sys_free(minimum_op_param->input_data); diff --git a/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c b/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c index 338c4e2ce..8e3581c24 100644 --- a/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c +++ b/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c @@ -34,7 +34,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { exec_node->inplace_map[0] = 0; @@ -64,8 +63,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - float* idata = ( float* )input_tensor->data; - float* odata = ( float* )output_tensor->data; + float* idata = (float*)input_tensor->data; + float* odata = (float*)output_tensor->data; if (idata != odata) { TLOG_ERR("input and output are not the same mem\n"); diff --git a/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.c b/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.c index b50761810..f52317060 100644 --- a/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.c +++ b/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.c @@ -30,10 +30,9 @@ #include - static void mish_kernel(int i, int id, void* data, const float* input, float* output) { - int step = (( int* )data)[0]; + int step = ((int*)data)[0]; const float* cur_input = input + id * step; float* cur_output = output + id * step; for (int i = 0; i < (step & -4); i += 4) @@ -53,8 +52,8 @@ static void mish_kernel(int i, int id, void* data, const float* input, float* ou int mish_run(struct tensor* output_tensor, struct tensor* input_tensor, int num_thread) { - float* data = ( float* )input_tensor->data; - float* out_data = ( float* )output_tensor->data; + float* data = (float*)input_tensor->data; + float* out_data = (float*)output_tensor->data; int chan_num = (input_tensor->dims[0]) * (input_tensor->dims[1]); int chan_size = (input_tensor->dims[2]) * (input_tensor->dims[3]); diff --git a/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.h b/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.h index 137457a4b..b65a25a1a 100644 --- a/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.h +++ b/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.h @@ -27,7 +27,6 @@ #include "graph/tensor.h" - int mish_run(struct tensor* output_tensor, struct tensor* input_tensor, int num_thread); #endif diff --git a/source/device/cpu/op/mish/cortex-a/mish_math_func.h b/source/device/cpu/op/mish/cortex-a/mish_math_func.h index 38b80187b..cd21c52c0 100644 --- a/source/device/cpu/op/mish/cortex-a/mish_math_func.h +++ b/source/device/cpu/op/mish/cortex-a/mish_math_func.h @@ -30,7 +30,6 @@ refer to ncnn #include - static inline float32x4_t div_ps(float32x4_t a, float32x4_t b) { #if __aarch64__ diff --git a/source/device/cpu/op/mish/mish_kernel_ref.h b/source/device/cpu/op/mish/mish_kernel_ref.h index ea10ff7d3..33ae84056 100644 --- a/source/device/cpu/op/mish/mish_kernel_ref.h +++ b/source/device/cpu/op/mish/mish_kernel_ref.h @@ -25,14 +25,12 @@ #ifndef __MISH_KERNEL_REF_H__ #define __MISH_KERNEL_REF_H__ - #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" +int ref_mish_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread); -int ref_mish_fp32(struct tensor *input_tensor, struct tensor *output_tensor, int num_thread); - -int ref_mish_uint8(struct tensor *input_tensor, struct tensor *output_tensor, int num_thread); +int ref_mish_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread); #endif diff --git a/source/device/cpu/op/mish/mish_kernel_ref_fp32.c b/source/device/cpu/op/mish/mish_kernel_ref_fp32.c index c5431f2ee..2c0d7d9e0 100644 --- a/source/device/cpu/op/mish/mish_kernel_ref_fp32.c +++ b/source/device/cpu/op/mish/mish_kernel_ref_fp32.c @@ -38,7 +38,6 @@ #include - int ref_mish_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { int w = input_tensor->dims[3]; diff --git a/source/device/cpu/op/mish/mish_kernel_ref_uint8.c b/source/device/cpu/op/mish/mish_kernel_ref_uint8.c index 8f0a2b5e0..2f22f9f27 100644 --- a/source/device/cpu/op/mish/mish_kernel_ref_uint8.c +++ b/source/device/cpu/op/mish/mish_kernel_ref_uint8.c @@ -38,8 +38,7 @@ #include - -int ref_mish_uint8(struct tensor *input_tensor, struct tensor *output_tensor, int num_thread) +int ref_mish_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { int w = input_tensor->dims[3]; int h = output_tensor->dims[2]; @@ -61,9 +60,8 @@ int ref_mish_uint8(struct tensor *input_tensor, struct tensor *output_tensor, in float* data_fp32 = (float*)sys_malloc(total_size * sizeof(float)); - for(int i = 0; i < total_size; i++) - data_fp32[i] = ((float) input_uint8[i] - (float)input_zero) * input_scale; - + for (int i = 0; i < total_size; i++) + data_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; for (int n = 0; n < batch; n++) { @@ -81,7 +79,7 @@ int ref_mish_uint8(struct tensor *input_tensor, struct tensor *output_tensor, in } // quant - for(int i=0; i 255) diff --git a/source/device/cpu/op/mish/mish_ref.c b/source/device/cpu/op/mish/mish_ref.c index 7c7f2addd..91af5a417 100644 --- a/source/device/cpu/op/mish/mish_ref.c +++ b/source/device/cpu/op/mish/mish_ref.c @@ -38,7 +38,6 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -57,12 +56,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); int ret = -1; - if(input_tensor->data_type == TENGINE_DT_FP32) + if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_mish_fp32(input_tensor, output_tensor, exec_graph->num_thread); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_mish_uint8(input_tensor, output_tensor, exec_graph->num_thread); else - TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type); + TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type); return ret; } diff --git a/source/device/cpu/op/mvn/mvn_ref.c b/source/device/cpu/op/mvn/mvn_ref.c index 4274dc490..306082d61 100644 --- a/source/device/cpu/op/mvn/mvn_ref.c +++ b/source/device/cpu/op/mvn/mvn_ref.c @@ -36,7 +36,6 @@ #include - typedef struct _ref_mvn_param { int input_n; @@ -69,7 +68,7 @@ int ref_mvn_fp32(float* in_data, float* out_data, p_ref_mvn_param param) int normalize_variance = param->normalize_variance; float eps = param->eps; - float* sum = ( float* )malloc(in_c * sizeof(float)); + float* sum = (float*)malloc(in_c * sizeof(float)); if (NULL == sum) return -100; @@ -130,7 +129,7 @@ int ref_mvn_fp32(float* in_data, float* out_data, p_ref_mvn_param param) if (normalize_variance) { - float* sqsum = ( float* )malloc(in_c * sizeof(float)); + float* sqsum = (float*)malloc(in_c * sizeof(float)); if (NULL == sqsum) return -100; @@ -227,7 +226,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex op_param.input_h = input_tensor->dims[2]; op_param.input_w = input_tensor->dims[3]; - struct mvn_param* param = ( struct mvn_param* )node->op.param_mem; + struct mvn_param* param = (struct mvn_param*)node->op.param_mem; op_param.normalize_variance = param->normalize_variance; op_param.across_channels = param->across_channels; op_param.eps = param->eps; diff --git a/source/device/cpu/op/noop/noop_ref.c b/source/device/cpu/op/noop/noop_ref.c index 62385d18e..67722f5bb 100644 --- a/source/device/cpu/op/noop/noop_ref.c +++ b/source/device/cpu/op/noop/noop_ref.c @@ -35,7 +35,6 @@ #include #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { exec_node->inplace_map[0] = 0; @@ -71,23 +70,26 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex switch (input_tensor->data_type) { - case TENGINE_DT_FP32: - case TENGINE_DT_INT32: { - size *= 4; - break; - } - case TENGINE_DT_FP16: - case TENGINE_DT_INT16: { - size *= 2; - break; - } - case TENGINE_DT_UINT8: - case TENGINE_DT_INT8: { - size *= 1; - break; - } - default: - return -1; + case TENGINE_DT_FP32: + case TENGINE_DT_INT32: + { + size *= 4; + break; + } + case TENGINE_DT_FP16: + case TENGINE_DT_INT16: + { + size *= 2; + break; + } + case TENGINE_DT_UINT8: + case TENGINE_DT_INT8: + { + size *= 1; + break; + } + default: + return -1; } if (size <= 0) diff --git a/source/device/cpu/op/normalize/normalize_ref.c b/source/device/cpu/op/normalize/normalize_ref.c index 52887a0b0..92990f780 100644 --- a/source/device/cpu/op/normalize/normalize_ref.c +++ b/source/device/cpu/op/normalize/normalize_ref.c @@ -37,12 +37,11 @@ #include #include - static void norm_channel(float* input, float* output, float* buffer, float* scale, int hw, int channel, int num_thread) { memset(buffer, 0, hw * sizeof(float)); -//#pragma omp parallel for num_threads(num_thread) + //#pragma omp parallel for num_threads(num_thread) for (int i = 0; i < channel; i++) { for (int j = 0; j < hw; j++) @@ -52,13 +51,13 @@ static void norm_channel(float* input, float* output, float* buffer, float* scal } } -//#pragma omp parallel for num_threads(num_thread) + //#pragma omp parallel for num_threads(num_thread) for (int j = 0; j < hw; j++) { buffer[j] = 1.f / sqrt(buffer[j]); } -//#pragma omp parallel for num_threads(num_thread) + //#pragma omp parallel for num_threads(num_thread) for (int i = 0; i < channel; i++) { for (int j = 0; j < hw; j++) @@ -86,17 +85,17 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); struct tensor* scale_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); - normalize_param_t* param = ( normalize_param_t* )(ir_node->op.param_mem); - float* input_org = ( float* )input_tensor->data; - float* output_org = ( float* )output_tensor->data; - float* sclae_org = ( float* )scale_tensor->data; + normalize_param_t* param = (normalize_param_t*)(ir_node->op.param_mem); + float* input_org = (float*)input_tensor->data; + float* output_org = (float*)output_tensor->data; + float* sclae_org = (float*)scale_tensor->data; int batch_number = input_tensor->dims[0]; int channel_num = input_tensor->dims[1]; int channel_size = (input_tensor->dims[2]) * (input_tensor->dims[3]); int img_size = channel_num * channel_size; - float* buffer = ( float* )sys_malloc(channel_size * sizeof(float)); + float* buffer = (float*)sys_malloc(channel_size * sizeof(float)); if (param->channel_shared == 0 && param->across_spatial == 0) { for (int i = 0; i < batch_number; i++) diff --git a/source/device/cpu/op/pad/pad_ref.c b/source/device/cpu/op/pad/pad_ref.c index ba1f48f7e..d17024b3a 100644 --- a/source/device/cpu/op/pad/pad_ref.c +++ b/source/device/cpu/op/pad/pad_ref.c @@ -37,7 +37,6 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -168,7 +167,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct pad_param* param = ( struct pad_param* )ir_node->op.param_mem; + struct pad_param* param = (struct pad_param*)ir_node->op.param_mem; int batch = input_tensor->dims[0]; int channel = input_tensor->dims[1]; @@ -195,14 +194,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex { if (input_tensor->data_type == TENGINE_DT_FP32) { - float* input_data = ( float* )input_tensor->data + n * in_size + c * in_cstep; - float* output_data = ( float* )output_tensor->data + n * out_size + c * out_cstep; + float* input_data = (float*)input_tensor->data + n * in_size + c * in_cstep; + float* output_data = (float*)output_tensor->data + n * out_size + c * out_cstep; ref_pad_fp32(input_data, output_data, in_h, in_w, out_h, out_w, pad_top, pad_left, param->value); } - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) { - uint8_t* input_data = ( uint8_t* )input_tensor->data + n * in_size + c * in_cstep; - uint8_t* output_data = ( uint8_t* )output_tensor->data + n * out_size + c * out_cstep; + uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_size + c * in_cstep; + uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_size + c * out_cstep; ref_pad_uint8(input_data, output_data, in_h, in_w, out_h, out_w, pad_top, pad_left, param->value); } } @@ -223,13 +222,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc } static struct node_ops pad_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score -}; + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score}; int register_pad_ref_op() { diff --git a/source/device/cpu/op/permute/permute_ref.c b/source/device/cpu/op/permute/permute_ref.c index ce8869641..6e705ab31 100644 --- a/source/device/cpu/op/permute/permute_ref.c +++ b/source/device/cpu/op/permute/permute_ref.c @@ -36,7 +36,6 @@ #include - static void __hwc(const float* input, float* output, int hh, int ww, int cc, int wc, int hw) { for (int h = 0; h < hh; ++h) @@ -58,12 +57,12 @@ static void __chw(const float* input, float* output, int hh, int ww, int cc, int { for (int c = 0; c < cc; ++c) { - float* output_ptr = output + c * hw; // chw + float* output_ptr = output + c * hw; // chw for (int h = 0; h < hh; ++h) { for (int w = 0; w < ww; ++w) { - const float* input_ptr = input + h * wc + w * cc; // input hwc + wc + const float* input_ptr = input + h * wc + w * cc; // input hwc + wc // hw + w = input_ptr[c] output_ptr[h * ww + w] = input_ptr[c]; } @@ -92,12 +91,12 @@ static void __chw_u8(const uint8_t* input, uint8_t* output, int hh, int ww, int { for (int c = 0; c < cc; ++c) { - uint8_t* output_ptr = output + c * hw; // chw + uint8_t* output_ptr = output + c * hw; // chw for (int h = 0; h < hh; ++h) { for (int w = 0; w < ww; ++w) { - const uint8_t* input_ptr = input + h * wc + w * cc; // input hwc + wc + const uint8_t* input_ptr = input + h * wc + w * cc; // input hwc + wc // hw + w = input_ptr[c] output_ptr[h * ww + w] = input_ptr[c]; } @@ -126,12 +125,12 @@ static void __chw_i8(const int8_t* input, int8_t* output, int hh, int ww, int cc { for (int c = 0; c < cc; ++c) { - int8_t* output_ptr = output + c * hw; // chw + int8_t* output_ptr = output + c * hw; // chw for (int h = 0; h < hh; ++h) { for (int w = 0; w < ww; ++w) { - const int8_t* input_ptr = input + h * wc + w * cc; // input hwc + wc + const int8_t* input_ptr = input + h * wc + w * cc; // input hwc + wc // hw + w = input_ptr[c] output_ptr[h * ww + w] = input_ptr[c]; } @@ -401,7 +400,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct graph* ir_graph = ir_node->graph; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - permute_param_t* param = ( struct permute_param* )(ir_node->op.param_mem); + permute_param_t* param = (struct permute_param*)(ir_node->op.param_mem); int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) diff --git a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c index 6cb4e3781..4b6d3fe7a 100644 --- a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c +++ b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c @@ -48,7 +48,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* input_tensor; struct tensor* output_tensor; - struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem; + struct pool_param* pool_param = (struct pool_param*)ir_node->op.param_mem; input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); @@ -65,7 +65,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor; struct tensor* output_tensor; - struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem; + struct pool_param* pool_param = (struct pool_param*)ir_node->op.param_mem; input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); @@ -92,7 +92,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) { - struct pool_param* pool_param = ( struct pool_param* )exec_node->op.param_mem; + struct pool_param* pool_param = (struct pool_param*)exec_node->op.param_mem; int global = pool_param->global; int type = pool_param->pool_method; @@ -104,7 +104,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc int pad_h1 = pool_param->pad_h1; int pad_w0 = pool_param->pad_w0; int pad_w1 = pool_param->pad_w1; - int pad_tf = pool_param->pad_h0_org; // maybe there is a bug. + int pad_tf = pool_param->pad_h0_org; // maybe there is a bug. int pool_size = 0; @@ -136,8 +136,8 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc /* general max pooling, k2s2, k2k2p1, k3s1p1, k3s2, k3s2p1 */ if (type == POOL_MAX && (pad_h0 == pad_w0) && (pad_h1 == pad_w1) && pad_tf != -1) { - if (pad_h0 == 0 && (pool_size == POOL_K2S2)) - return 0; + if (pad_h0 == 0 && (pool_size == POOL_K2S2)) + return 0; if (pad_h0 == 0 && (pool_size == POOL_K3S2)) return OPS_SCORE_BEST; if (pad_h0 == 1 && (pool_size == POOL_K2S2 || pool_size == POOL_K3S2 || pool_size == POOL_K3S1)) @@ -151,7 +151,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; if (pad_h0 == 1 && pad_h1 == 1 && (pool_size == POOL_K2S2 || pool_size == POOL_K3S2 || pool_size == POOL_K3S1)) return OPS_SCORE_BEST; - else if(pad_h0 == 0 && pad_h1 == 1 && (pool_size == POOL_K3S2)) + else if (pad_h0 == 0 && pad_h1 == 1 && (pool_size == POOL_K3S2)) return OPS_SCORE_BEST; } } @@ -159,7 +159,6 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } - static struct node_ops hcl_node_ops = {.prerun = prerun, .run = run, .reshape = NULL, @@ -168,13 +167,11 @@ static struct node_ops hcl_node_ops = {.prerun = prerun, .release_node = release_node, .score = score}; - int register_pooling_hcl_arm_op() { return register_builtin_node_ops(OP_POOL, &hcl_node_ops); } - int unregister_pooling_hcl_arm_op() { return unregister_builtin_node_ops(OP_POOL, &hcl_node_ops); diff --git a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.h b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.h index ddc2bbedc..062e66015 100644 --- a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.h +++ b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.h @@ -43,10 +43,9 @@ #include #define POOL_GENERIC 0 -#define POOL_K2S2 1 -#define POOL_K3S2 2 -#define POOL_K3S1 3 - +#define POOL_K2S2 1 +#define POOL_K3S2 2 +#define POOL_K3S1 3 typedef void (*pooling_kernel_t)(const void* input, void* output, int inc, int inh, int inw, int outh, int outw, int, int, int, int, int, int, int pad_h1, int pad_w1, int); @@ -318,9 +317,7 @@ static void avg_3x3s2(const float* input, float* output, int inc, int inh, int i } for (int j = block_w * 4; j < outw; j++) { - *out_ptr = - (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]) * - 0.11111111f; + *out_ptr = (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]) * 0.11111111f; out_ptr++; line0 += 2; line1 += 2; @@ -1242,9 +1239,7 @@ static void avg_3x3s2_p1(const float* input, float* output, int inc, int inh, in } for (int j = block_w * 4 + 1; j < outw; j++) { - *out_ptr = - (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]) * - 0.11111111f; + *out_ptr = (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]) * 0.11111111f; out_ptr++; line0 += 2; line1 += 2; @@ -1516,9 +1511,7 @@ static void avg_3x3s1_p1(const float* input, float* output, int inc, int inh, in // mid for (int j = 0; j < mid_w; j++) { - *out_ptr = - (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]) * - 0.11111111f; + *out_ptr = (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]) * 0.11111111f; out_ptr++; line0 += 1; line1 += 1; @@ -1629,9 +1622,9 @@ int pooling_kernel_perf_prerun(struct tensor* input, struct tensor* out, struct if (param->global) { if (param->pool_method == POOL_AVG) - param->funct = ( pooling_kernel_t )avg_global; + param->funct = (pooling_kernel_t)avg_global; else if (param->pool_method == POOL_MAX) - param->funct = ( pooling_kernel_t )max_global; + param->funct = (pooling_kernel_t)max_global; assert(param->funct != NULL); return 0; @@ -1659,18 +1652,18 @@ int pooling_kernel_perf_prerun(struct tensor* input, struct tensor* out, struct if (param->pad_h0 == 0) { if (pool_size == POOL_K2S2) - param->funct = ( pooling_kernel_t )max_2x2s2; + param->funct = (pooling_kernel_t)max_2x2s2; else if (pool_size == POOL_K3S2) - param->funct = ( pooling_kernel_t )max_3x3s2; + param->funct = (pooling_kernel_t)max_3x3s2; } else if (param->pad_h0 == 1) { if (pool_size == POOL_K2S2) - param->funct = ( pooling_kernel_t )max_2x2s2_p1; + param->funct = (pooling_kernel_t)max_2x2s2_p1; else if (pool_size == POOL_K3S2) - param->funct = ( pooling_kernel_t )max_3x3s2_p1; + param->funct = (pooling_kernel_t)max_3x3s2_p1; else if (pool_size == POOL_K3S1) - param->funct = ( pooling_kernel_t )max_3x3s1_p1; + param->funct = (pooling_kernel_t)max_3x3s1_p1; } } @@ -1691,23 +1684,23 @@ int pooling_kernel_perf_prerun(struct tensor* input, struct tensor* out, struct if (param->pad_h0 == 0 && param->pad_h1 == 0) { if (pool_size == POOL_K2S2) - param->funct = ( pooling_kernel_t )avg_2x2s2; + param->funct = (pooling_kernel_t)avg_2x2s2; else if (pool_size == POOL_K3S2) - param->funct = ( pooling_kernel_t )avg_3x3s2; + param->funct = (pooling_kernel_t)avg_3x3s2; } else if (param->pad_h0 == 1 && param->pad_h1 == 1) { if (pool_size == POOL_K2S2) - param->funct = ( pooling_kernel_t )avg_2x2s2_p1; + param->funct = (pooling_kernel_t)avg_2x2s2_p1; else if (pool_size == POOL_K3S2) - param->funct = ( pooling_kernel_t )avg_3x3s2_p1; + param->funct = (pooling_kernel_t)avg_3x3s2_p1; else if (pool_size == POOL_K3S1) - param->funct = ( pooling_kernel_t )avg_3x3s1_p1; + param->funct = (pooling_kernel_t)avg_3x3s1_p1; } else if (param->pad_h0 == 0 && param->pad_h1 == 1) { if (pool_size == POOL_K3S2) - param->funct = ( pooling_kernel_t ) avg_3x3s2; + param->funct = (pooling_kernel_t)avg_3x3s2; } } diff --git a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm_int8.h b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm_int8.h index bd5789084..e87487781 100644 --- a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm_int8.h +++ b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm_int8.h @@ -43,9 +43,9 @@ #include #define POOL_GENERIC 0 -#define POOL_K2S2 1 -#define POOL_K3S2 2 -#define POOL_K3S1 3 +#define POOL_K2S2 1 +#define POOL_K3S2 2 +#define POOL_K3S1 3 static inline int8_t arm_max_int8(int8_t a, int8_t b) { @@ -64,7 +64,7 @@ static inline int8_t arm_min_int8(int8_t a, int8_t b) } typedef void (*pooling_kernel_int8_t)(const void* input, void* output, int inc, int inh, int inw, int outh, int outw, int k_h, - int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale); + int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale); static void pad_0_align_2D_int8(int8_t* dst, int8_t* src, int m, int n, int m_align, int n_align, int pad_h, int pad_w) { @@ -125,16 +125,16 @@ static void delete_0_3D_int8(int8_t* dst, int8_t* src, int m_align, int n_align, } static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh, int inw, int outh, int outw, int k_h, - int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale) + int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale) { int in_hw = inw * inh; int out_hw = outh * outw; - if(pad_w1 > 0) + if (pad_w1 > 0) { outw--; } - if(pad_h1 > 0) + if (pad_h1 > 0) { outh--; } @@ -142,15 +142,15 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh int remain_w = inw - outw * 2; int index = 0; - for(int c = 0; c < inc; c++) + for (int c = 0; c < inc; c++) { index = 0; const int8_t* line0 = input + c * in_hw; const int8_t* line1 = line0 + inw; int8_t* out_ptr = output + c * out_hw; - for(int i = 0; i < outh; i++) + for (int i = 0; i < outh; i++) { - for(int j = 0; j < block_w; j++) + for (int j = 0; j < block_w; j++) { int8x8_t p00 = vld1_s8(line0); int8x8_t p10 = vld1_s8(line1); @@ -162,18 +162,18 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh #ifdef __aarch64__ /* pairwaise max */ sum0 = vpaddq_s16(sum0, sum1); - for(int n = 0; n < 8; n++) + for (int n = 0; n < 8; n++) { - out_ptr[n] = ( int8_t )round(sum0[n] / 4); + out_ptr[n] = (int8_t)round(sum0[n] / 4); } #else /* pairwaise max */ int32x4_t suml0 = vpaddlq_s16(sum0); int32x4_t suml1 = vpaddlq_s16(sum1); - for(int n = 0; n < 4; n++) + for (int n = 0; n < 4; n++) { - out_ptr[n] = ( int8_t )round(suml0[n] / 4); - out_ptr[n + 1] = ( int8_t )round(suml1[n] / 4); + out_ptr[n] = (int8_t)round(suml0[n] / 4); + out_ptr[n + 1] = (int8_t)round(suml1[n] / 4); } #endif line0 += 16; @@ -181,7 +181,7 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh index = index + 8; } index = block_w * 8; - if(outw - index >= 4) + if (outw - index >= 4) { int8x8_t p00 = vld1_s8(line0); int8x8_t p10 = vld1_s8(line1); @@ -190,42 +190,42 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh /* pairwaise max */ int16x8_t sum1 = {0}; sum0 = vpaddq_s16(sum0, sum1); - for(int n = 0; n < 4; n++) + for (int n = 0; n < 4; n++) { - out_ptr[n] = ( int8_t )round(sum0[n] / 4); + out_ptr[n] = (int8_t)round(sum0[n] / 4); } #else /* pairwaise max */ int32x4_t suml0 = vpaddlq_s16(sum0); - for(int n = 0; n < 4; n++) + for (int n = 0; n < 4; n++) { - out_ptr[n] = ( int8_t )round(suml0[n] / 4); + out_ptr[n] = (int8_t)round(suml0[n] / 4); } #endif line0 += 8; out_ptr = out_ptr + 4; index = index + 4; } - for(; index < outw; index++) + for (; index < outw; index++) { - *out_ptr = ( int8_t )round((line0[0] + line0[1] + line1[0] + line1[1]) / 4); + *out_ptr = (int8_t)round((line0[0] + line0[1] + line1[0] + line1[1]) / 4); out_ptr++; line0 += 2; line1 += 2; } - if(pad_w1 > 0) + if (pad_w1 > 0) { - *out_ptr = ( int8_t )round((line0[0] + line1[0]) / 2); + *out_ptr = (int8_t)round((line0[0] + line1[0]) / 2); out_ptr++; } line0 += remain_w + inw; line1 += remain_w + inw; } - if(pad_h1) + if (pad_h1) { index = 0; - for(int j = 0; j < block_w; j++) + for (int j = 0; j < block_w; j++) { int8x8_t p00 = vld1_s8(line0); int8x8_t p01 = vld1_s8(line0 + 8); @@ -237,17 +237,17 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh int16x8_t sum1 = vaddl_s8(p01, p02); #ifdef __aarch64__ sum0 = vpaddq_s16(sum0, sum1); - for(int n = 0; n < 8; n++) + for (int n = 0; n < 8; n++) { - out_ptr[n] = ( int8_t )round(sum0[n] / 4); + out_ptr[n] = (int8_t)round(sum0[n] / 4); } #else int32x4_t suml0 = vpaddlq_s16(sum0); int32x4_t suml1 = vpaddlq_s16(sum1); - for(int n = 0; n < 4; n++) + for (int n = 0; n < 4; n++) { - out_ptr[n] = ( int8_t )round(suml0[n] / 4); - out_ptr[n + 1] = ( int8_t )round(suml1[n] / 4); + out_ptr[n] = (int8_t)round(suml0[n] / 4); + out_ptr[n + 1] = (int8_t)round(suml1[n] / 4); } #endif line0 += 16; @@ -255,7 +255,7 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh index = index + 8; } index = block_w * 8; - if(outw - index >= 4) + if (outw - index >= 4) { int8x8_t p00 = vld1_s8(line0); int8x8_t p01 = {0}; @@ -264,31 +264,31 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh /* pairwaise max */ int16x8_t sum1 = {0}; sum0 = vpaddq_s16(sum0, sum1); - for(int n = 0; n < 4; n++) + for (int n = 0; n < 4; n++) { - out_ptr[n] = ( int8_t )round(sum0[n] / 4); + out_ptr[n] = (int8_t)round(sum0[n] / 4); } #else /* pairwaise max */ int32x4_t suml0 = vpaddlq_s16(sum0); - for(int n = 0; n < 4; n++) + for (int n = 0; n < 4; n++) { - out_ptr[n] = ( int8_t )round(suml0[n] / 4); + out_ptr[n] = (int8_t)round(suml0[n] / 4); } #endif line0 += 8; out_ptr = out_ptr + 4; index = index + 4; } - for(; index < outw; index++) + for (; index < outw; index++) { int sum0 = line0[0] + line0[1]; - *out_ptr = ( int8_t )round((sum0) / 2); + *out_ptr = (int8_t)round((sum0) / 2); out_ptr++; line0 += 2; line1 += 2; } - if(pad_w1 > 0) + if (pad_w1 > 0) { *out_ptr = line0[0]; out_ptr++; @@ -298,16 +298,16 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh } static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh, int inw, int outh, int outw, int k_h, - int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale) + int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale) { int in_hw = inw * inh; int out_hw = outh * outw; - if(pad_w1 > 0) + if (pad_w1 > 0) { outw--; } - if(pad_h1 > 0) + if (pad_h1 > 0) { outh--; } @@ -318,14 +318,14 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh #endif int remain_w = inw - outw * 2; int index = 0; - for(int c = 0; c < inc; c++) + for (int c = 0; c < inc; c++) { const int8_t* line0 = input + c * in_hw; const int8_t* line1 = line0 + inw; int8_t* out_ptr = output + c * out_hw; - for(int i = 0; i < outh; i++) + for (int i = 0; i < outh; i++) { - for(int j = 0; j < block_w; j++) + for (int j = 0; j < block_w; j++) { #ifdef __aarch64__ int8x16_t p00 = vld1q_s8(line0); @@ -362,7 +362,7 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh } index = block_w * 8; #endif - if(outw - index >= 8) + if (outw - index >= 8) { int8x8_t p00 = vld1_s8(line0); int8x8_t p10 = vld1_s8(line1); @@ -380,7 +380,7 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh out_ptr = out_ptr + 8; index = index + 8; } - if(outw - index >= 4) + if (outw - index >= 4) { int8x8_t p00 = vld1_s8(line0); int8x8_t p10 = vld1_s8(line1); @@ -399,7 +399,7 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh out_ptr = out_ptr + 4; index = index + 4; } - for(; index < outw; index++) + for (; index < outw; index++) { int8_t max0 = arm_max_int8(line0[0], line0[1]); int8_t max1 = arm_max_int8(line1[0], line1[1]); @@ -409,7 +409,7 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh line0 += 2; line1 += 2; } - if(pad_w1 > 0) + if (pad_w1 > 0) { *out_ptr = arm_max_int8(line0[0], line1[0]); out_ptr++; @@ -417,9 +417,9 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh line0 += remain_w + inw; line1 += remain_w + inw; } - if(pad_h1 > 0) + if (pad_h1 > 0) { - for(int j = 0; j < block_w; j++) + for (int j = 0; j < block_w; j++) { #ifdef __aarch64__ int8x16_t p00 = vld1q_s8(line0); @@ -444,7 +444,7 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh } index = block_w * 8; #endif - if(outw - index >= 8) + if (outw - index >= 8) { int8x8_t p00 = vld1_s8(line0); int8x8_t p01 = vld1_s8(line0 + 8); @@ -456,7 +456,7 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh out_ptr = out_ptr + 8; index = index + 8; } - if(outw - index >= 4) + if (outw - index >= 4) { int8x8_t p00 = vld1_s8(line0); /* pairwaise max */ @@ -472,13 +472,13 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh out_ptr = out_ptr + 4; index = index + 4; } - for(; index < outw; index++) + for (; index < outw; index++) { *out_ptr = arm_max_int8(line0[0], line0[1]); out_ptr++; line0 += 2; } - if(pad_w1 > 0) + if (pad_w1 > 0) { *out_ptr = arm_max_int8(line0[0], line1[0]); out_ptr++; @@ -488,32 +488,32 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh } static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh, int inw, int outh, int outw, int k_h, - int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale) + int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale) { int in_hw = inw * inh; int out_hw = outh * outw; - if(pad_w1 > 0) + if (pad_w1 > 0) { outw--; } - if(pad_h1 > 0) + if (pad_h1 > 0) { outh--; } int block_w = outw >> 3; int remain_w = inw - outw * 2; int index = 0; - for(int c = 0; c < inc; c++) + for (int c = 0; c < inc; c++) { const int8_t* line0 = input + c * in_hw; const int8_t* line1 = line0 + inw; const int8_t* line2 = line1 + inw; int8_t* out_ptr = output + c * out_hw; - for(int i = 0; i < outh; i++) + for (int i = 0; i < outh; i++) { index = 0; - for(int j = 0; j < block_w; j++) + for (int j = 0; j < block_w; j++) { int8x8x2_t p00 = vld2_s8(line0); int8x8x2_t p10 = vld2_s8(line1); @@ -538,9 +538,9 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh // sum0 = vadd_s8(vadd_s8(sum0, sum1), sum2); - for(int n = 0; n < 8; n++) + for (int n = 0; n < 8; n++) { - out_ptr[n] = ( int8_t )round(sum0[n] / 9); + out_ptr[n] = (int8_t)round(sum0[n] / 9); } p00 = p00_new; @@ -555,36 +555,35 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh index = index + 8; } - for(; index < outw; index++) + for (; index < outw; index++) { - int sum = - (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]); - *out_ptr = ( int8_t )round(sum / 9); + int sum = (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]); + *out_ptr = (int8_t)round(sum / 9); out_ptr++; line0 += 2; line1 += 2; line2 += 2; } - if(pad_w1 == 1) + if (pad_w1 == 1) { int sum = (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2]); - *out_ptr = ( int8_t )round(sum / 6); + *out_ptr = (int8_t)round(sum / 6); out_ptr++; } - else if(pad_w1 == 2) + else if (pad_w1 == 2) { int sum = (line0[0] + line1[0] + line2[0]); - *out_ptr = ( int8_t )round(sum / 6); + *out_ptr = (int8_t)round(sum / 6); out_ptr++; } line0 += remain_w + inw; line1 += remain_w + inw; line2 += remain_w + inw; } - if(pad_h1 == 1) + if (pad_h1 == 1) { index = 0; - for(int j = 0; j < block_w; j++) + for (int j = 0; j < block_w; j++) { int8x8x2_t p00 = vld2_s8(line0); int8x8x2_t p10 = vld2_s8(line1); @@ -600,9 +599,9 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh int8x8_t p11 = vext_s8(p10.val[0], p10_new.val[0], 1); sum0 = vaddw_s8(sum0, p11); - for(int n = 0; n < 8; n++) + for (int n = 0; n < 8; n++) { - out_ptr[n] = ( int8_t )round(sum0[n] / 6); + out_ptr[n] = (int8_t)round(sum0[n] / 6); } p00 = p00_new; @@ -612,31 +611,31 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh out_ptr += 8; index = index + 8; } - for(; index < outw; index++) + for (; index < outw; index++) { int sum = (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2]); - *out_ptr = ( int8_t )round(sum / 6); + *out_ptr = (int8_t)round(sum / 6); out_ptr++; line0 += 2; line1 += 2; } - if(pad_w1 == 1) + if (pad_w1 == 1) { int sum = (line0[0] + line0[1] + line1[0] + line1[1]); - *out_ptr = ( int8_t )round(sum / 4); + *out_ptr = (int8_t)round(sum / 4); out_ptr++; } - else if(pad_w1 == 2) + else if (pad_w1 == 2) { int sum = (line0[0] + line1[0]); - *out_ptr = ( int8_t )round(sum / 2); + *out_ptr = (int8_t)round(sum / 2); out_ptr++; } } - else if(pad_h1 == 2) + else if (pad_h1 == 2) { index = 0; - for(int j = 0; j < block_w; j++) + for (int j = 0; j < block_w; j++) { int8x8x2_t p00 = vld2_s8(line0); int8x8x2_t p00_new = vld2_s8(line0 + 16); @@ -644,9 +643,9 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh int8x8_t p01 = vext_s8(p00.val[0], p00_new.val[0], 1); sum0 = vaddw_s8(sum0, p01); - for(int n = 0; n < 8; n++) + for (int n = 0; n < 8; n++) { - out_ptr[n] = ( int8_t )round(sum0[n] / 3); + out_ptr[n] = (int8_t)round(sum0[n] / 3); } p00 = p00_new; @@ -654,18 +653,18 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh out_ptr += 8; index = index + 8; } - for(; index < outw; index++) + for (; index < outw; index++) { - *out_ptr = ( int8_t )round((line0[0] + line0[1] + line0[2]) / 3); + *out_ptr = (int8_t)round((line0[0] + line0[1] + line0[2]) / 3); out_ptr++; line0 += 2; } - if(pad_w1 == 1) + if (pad_w1 == 1) { - *out_ptr = ( int8_t )round((line0[0] + line0[1]) / 2); + *out_ptr = (int8_t)round((line0[0] + line0[1]) / 2); out_ptr++; } - else if(pad_w1 == 2) + else if (pad_w1 == 2) { *out_ptr = line0[0]; out_ptr++; @@ -675,16 +674,16 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh } static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh, int inw, int outh, int outw, int k_h, - int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale) + int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale) { int in_hw = inw * inh; int out_hw = outh * outw; - if(pad_w1 > 0) + if (pad_w1 > 0) { outw--; } - if(pad_h1 > 0) + if (pad_h1 > 0) { outh--; } @@ -693,18 +692,18 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh int index = 0; - for(int c = 0; c < inc; c++) + for (int c = 0; c < inc; c++) { const int8_t* line0 = input + c * in_hw; const int8_t* line1 = line0 + inw; const int8_t* line2 = line1 + inw; int8_t* out_ptr = output + c * out_hw; - for(int i = 0; i < outh; i++) + for (int i = 0; i < outh; i++) { int8x16x2_t p00 = vld2q_s8(line0); int8x16x2_t p10 = vld2q_s8(line1); int8x16x2_t p20 = vld2q_s8(line2); - for(int j = 0; j < block_w; j++) + for (int j = 0; j < block_w; j++) { /* p00 = [1,2,3,4,5,6,7,8...] @@ -745,7 +744,7 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh index = block_w * 16; - if(outw - index > 8) + if (outw - index > 8) { int8x8x2_t p00 = vld2_s8(line0); int8x8x2_t p10 = vld2_s8(line1); @@ -779,7 +778,7 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh out_ptr += 8; index = index + 8; } - for(; index < outw; index++) + for (; index < outw; index++) { int8_t max0 = arm_max_int8(arm_max_int8(line0[0], line0[1]), line0[2]); int8_t max1 = arm_max_int8(arm_max_int8(line1[0], line1[1]), line1[2]); @@ -791,7 +790,7 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh line1 += 2; line2 += 2; } - if(pad_w1 == 1) + if (pad_w1 == 1) { int8_t max0 = arm_max_int8(arm_max_int8(line0[0], line0[1]), arm_max_int8(line1[0], line1[1])); *out_ptr = arm_max_int8(arm_max_int8(line2[0], line2[1]), max0); @@ -801,11 +800,11 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh line1 += remain_w + inw; line2 += remain_w + inw; } - if(pad_h1 == 1) + if (pad_h1 == 1) { int8x16x2_t p00 = vld2q_s8(line0); int8x16x2_t p10 = vld2q_s8(line1); - for(int j = 0; j < block_w; j++) + for (int j = 0; j < block_w; j++) { int8x16x2_t p00_new = vld2q_s8(line0 + 32); int8x16_t max0 = vmaxq_s8(p00.val[0], p00.val[1]); @@ -830,7 +829,7 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh index = block_w * 16; - if(outw - index > 8) + if (outw - index > 8) { int8x8x2_t p00 = vld2_s8(line0); int8x8x2_t p10 = vld2_s8(line1); @@ -856,7 +855,7 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh out_ptr += 8; index = index + 8; } - for(; index < outw; index++) + for (; index < outw; index++) { int8_t max0 = arm_max_int8(arm_max_int8(line0[0], line0[1]), line0[2]); int8_t max1 = arm_max_int8(arm_max_int8(line1[0], line1[1]), line1[2]); @@ -865,7 +864,7 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh line0 += 2; line1 += 2; } - if(pad_w1 == 1) + if (pad_w1 == 1) { *out_ptr = arm_max_int8(arm_max_int8(line0[0], line0[1]), arm_max_int8(line1[0], line1[1])); out_ptr++; @@ -875,18 +874,18 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh } static void avg_global_int8(const int8_t* input, int8_t* output, int inc, int inh, int inw, int outh, int outw, int k_h, - int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale) + int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale) { int in_hw = inw * inh; int block = in_hw >> 4; - for(int c = 0; c < inc; c++) + for (int c = 0; c < inc; c++) { int index = 0; const int8_t* line0 = input + c * in_hw; int8_t* out_ptr = output + c; int sum = 0; - for(int j = 0; j < block; j++) + for (int j = 0; j < block; j++) { int8x8_t p00 = vld1_s8(line0); int8x8_t p01 = vld1_s8(line0 + 8); @@ -897,30 +896,30 @@ static void avg_global_int8(const int8_t* input, int8_t* output, int inc, int in } index = block * 16; - for(int j = index; j < in_hw; j++) + for (int j = index; j < in_hw; j++) { sum += line0[0]; line0++; } float sum_fp32 = sum * in_scale; - sum_fp32 = sum_fp32/in_hw; - int tmp = (int)round(sum_fp32/out_scale); - if(tmp > 127) + sum_fp32 = sum_fp32 / in_hw; + int tmp = (int)round(sum_fp32 / out_scale); + if (tmp > 127) tmp = 127; - else if(tmp < -127) + else if (tmp < -127) tmp = -127; - *out_ptr = ( int8_t )tmp;//round(sum / in_hw); + *out_ptr = (int8_t)tmp; //round(sum / in_hw); } } static void max_global_int8(const int8_t* input, int8_t* output, int inc, int inh, int inw, int outh, int outw, int k_h, - int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale) + int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale) { int in_hw = inw * inh; int block = in_hw >> 5; - for(int c = 0; c < inc; c++) + for (int c = 0; c < inc; c++) { int index = 0; const int8_t* line0 = input + c * in_hw; @@ -928,7 +927,7 @@ static void max_global_int8(const int8_t* input, int8_t* output, int inc, int in int8x16_t p00 = vld1q_s8(line0); int8x16_t res = p00; - for(int j = 0; j < block; j++) + for (int j = 0; j < block; j++) { int8x16_t p00 = vld1q_s8(line0); int8x16_t p01 = vld1q_s8(line0 + 16); @@ -937,11 +936,11 @@ static void max_global_int8(const int8_t* input, int8_t* output, int inc, int in line0 += 32; } int8_t max_ = 0; - if(block > 0) + if (block > 0) { max_ = res[0]; #ifdef __aarch64__ - for(int n = 1; n < 16; n++) + for (int n = 1; n < 16; n++) { max_ = arm_max_int8(max_, res[n]); } @@ -969,7 +968,7 @@ static void max_global_int8(const int8_t* input, int8_t* output, int inc, int in max_ = line0[0]; } index = block * 32; - for(int j = index; j < in_hw; j++) + for (int j = index; j < in_hw; j++) { max_ = arm_max_int8(max_, line0[0]); line0++; @@ -986,9 +985,9 @@ int pooling_kernel_int8_perf_prerun(struct tensor* input, struct tensor* out, st if (param->global) { if (param->pool_method == POOL_AVG) - param->funct = ( pooling_kernel_int8_t )avg_global_int8; + param->funct = (pooling_kernel_int8_t)avg_global_int8; else if (param->pool_method == POOL_MAX) - param->funct = ( pooling_kernel_int8_t )max_global_int8; + param->funct = (pooling_kernel_int8_t)max_global_int8; assert(param->funct != NULL); return 0; @@ -1009,9 +1008,9 @@ int pooling_kernel_int8_perf_prerun(struct tensor* input, struct tensor* out, st if ((param->pad_h0 == param->pad_w0) && (param->pad_h1 == param->pad_w1)) { if (pool_size == POOL_K2S2) - param->funct = ( pooling_kernel_int8_t )max_2x2s2_int8; + param->funct = (pooling_kernel_int8_t)max_2x2s2_int8; else if (pool_size == POOL_K3S2) - param->funct = ( pooling_kernel_int8_t )max_3x3s2_int8; + param->funct = (pooling_kernel_int8_t)max_3x3s2_int8; } } /* general avg pooling, k2s2, k2s2p1, k3s2, k3s2p1 */ @@ -1020,9 +1019,9 @@ int pooling_kernel_int8_perf_prerun(struct tensor* input, struct tensor* out, st if ((param->pad_h0 == param->pad_w0) && (param->pad_h1 == param->pad_w1)) { if (pool_size == POOL_K2S2) - param->funct = ( pooling_kernel_int8_t )avg_2x2s2_int8; + param->funct = (pooling_kernel_int8_t)avg_2x2s2_int8; else if (pool_size == POOL_K3S2) - param->funct = ( pooling_kernel_int8_t )avg_3x3s2_int8; + param->funct = (pooling_kernel_int8_t)avg_3x3s2_int8; } } @@ -1079,7 +1078,7 @@ int pooling_kernel_int8_perf_run(struct tensor* input, struct tensor* output, st if (param->input_pad != NULL) { pad_0_align_3D_int8((int8_t*)param->input_pad + n * c * in_h_pad * in_w_pad, (int8_t*)input_frame, - in_h_origin, in_w_origin, in_h_pad, in_w_pad, c, pad_h0, pad_w0); + in_h_origin, in_w_origin, in_h_pad, in_w_pad, c, pad_h0, pad_w0); } #pragma omp parallel for num_threads(num_thread) diff --git a/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c b/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c index fa1540cb9..e30c84c7e 100644 --- a/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c +++ b/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c @@ -44,7 +44,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor; struct tensor* output_tensor; - struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem; + struct pool_param* pool_param = (struct pool_param*)ir_node->op.param_mem; input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); diff --git a/source/device/cpu/op/pooling/pooling_kernel_ref.h b/source/device/cpu/op/pooling/pooling_kernel_ref.h index f835433e1..43c471415 100644 --- a/source/device/cpu/op/pooling/pooling_kernel_ref.h +++ b/source/device/cpu/op/pooling/pooling_kernel_ref.h @@ -31,17 +31,16 @@ #include "graph/node.h" #include "graph/graph.h" - int ref_pooling_fp32(struct tensor* input_tensor, struct tensor* output_tensor, - struct pool_param* pool_param, int num_thread); + struct pool_param* pool_param, int num_thread); int ref_pooling_fp16(struct tensor* input_tensor, struct tensor* output_tensor, - struct pool_param* pool_param, int num_thread); + struct pool_param* pool_param, int num_thread); int ref_pooling_uint8(struct tensor* input_tensor, struct tensor* output_tensor, - struct pool_param* pool_param, int num_thread); + struct pool_param* pool_param, int num_thread); int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor, - struct pool_param* pool_param, int num_thread); + struct pool_param* pool_param, int num_thread); #endif diff --git a/source/device/cpu/op/pooling/pooling_kernel_ref_fp16.c b/source/device/cpu/op/pooling/pooling_kernel_ref_fp16.c index 141bea3ce..31a694e8e 100644 --- a/source/device/cpu/op/pooling/pooling_kernel_ref_fp16.c +++ b/source/device/cpu/op/pooling/pooling_kernel_ref_fp16.c @@ -41,7 +41,6 @@ #define HCL_POOL_MAX 0 /* Max pooling */ #define HCL_POOL_AVG 1 /* Average pooling */ - #if MACOS #else @@ -49,11 +48,11 @@ static inline void calc_sum_fp16(const fp16_t* input, fp16_t* sum, int layout, i int start_h, int start_w, int end_h, int end_w) { float sum_f = 0.0f; - for(int i = start_h; i < end_h; i++) + for (int i = start_h; i < end_h; i++) { - for(int j = start_w; j < end_w; j++) + for (int j = start_w; j < end_w; j++) { - if(layout == 0) + if (layout == 0) sum_f += fp16_to_fp32(input[cur_ch * h * w + i * w + j]); else sum_f += fp16_to_fp32(input[i * w * c + j * c + cur_ch]); @@ -67,15 +66,15 @@ static inline void calc_max_fp16(const fp16_t* input, fp16_t* max, int layout, i { float max_f = 0.0f; float tmp = 0.0f; - if(layout == 0) + if (layout == 0) max_f = fp16_to_fp32(input[cur_ch * h * w + start_h * w + start_w]); else max_f = fp16_to_fp32(input[start_h * w * c + start_w * c + cur_ch]); - for(int i = start_h; i < end_h; i++) + for (int i = start_h; i < end_h; i++) { - for(int j = start_w; j < end_w; j++) + for (int j = start_w; j < end_w; j++) { - if(layout == 0) + if (layout == 0) tmp = fp16_to_fp32(input[cur_ch * h * w + i * w + j]); else tmp = fp16_to_fp32(input[i * w * c + j * c + cur_ch]); @@ -89,7 +88,7 @@ static inline void calc_max_fp16(const fp16_t* input, fp16_t* max, int layout, i #endif int ref_pooling_fp16(struct tensor* input_tensor, struct tensor* output_tensor, - struct pool_param* pool_param, int num_thread) + struct pool_param* pool_param, int num_thread) { int layout = input_tensor->layout; int type = input_tensor->data_type; @@ -122,27 +121,27 @@ int ref_pooling_fp16(struct tensor* input_tensor, struct tensor* output_tensor, fp16_t* input = (fp16_t*)input_tensor->data; fp16_t* output = (fp16_t*)output_tensor->data; - for(int n = 0; n < batch; n++) + for (int n = 0; n < batch; n++) { const fp16_t* input_cur = input + n * input_chw; - for(int c = 0; c < channel; c++) + for (int c = 0; c < channel; c++) { - for(int ph = 0; ph < out_h; ph++) + for (int ph = 0; ph < out_h; ph++) { - for(int pw = 0; pw < out_w; pw++) + for (int pw = 0; pw < out_w; pw++) { int pool_size = 1; int offset = 0; int h_start = ph * stride_h - pad_h; int h_end = h_start + kernel_h; - if(h_end > in_h + pad_h) + if (h_end > in_h + pad_h) h_end = in_h + pad_h; int w_start = pw * stride_w - pad_w; int w_end = w_start + kernel_w; - if(w_end > in_w + pad_w) + if (w_end > in_w + pad_w) w_end = in_w + pad_w; - if(caffe_flavor) + if (caffe_flavor) pool_size = (h_end - h_start) * (w_end - w_start); h_start = h_start > 0 ? h_start : 0; @@ -150,25 +149,25 @@ int ref_pooling_fp16(struct tensor* input_tensor, struct tensor* output_tensor, h_end = h_end < in_h ? h_end : in_h; w_end = w_end < in_w ? w_end : in_w; - if(!caffe_flavor) + if (!caffe_flavor) pool_size = (h_end - h_start) * (w_end - w_start); - if(layout == 0) // nchw + if (layout == 0) // nchw offset = n * output_chw + c * out_h * out_w + ph * out_w + pw; else offset = n * output_chw + ph * out_w * channel + pw * channel + c; - if(method == 0) + if (method == 0) { fp16_t max; calc_max_fp16(input_cur, &max, layout, channel, in_h, in_w, - c, h_start, w_start, h_end, w_end); + c, h_start, w_start, h_end, w_end); output[offset] = max; } - else if(method == 1) + else if (method == 1) { fp16_t sum; calc_sum_fp16(input_cur, &sum, layout, channel, in_h, in_w, - c, h_start, w_start, h_end, w_end); + c, h_start, w_start, h_end, w_end); output[offset] = fp32_to_fp16(fp16_to_fp32(sum) / pool_size); } else @@ -176,7 +175,6 @@ int ref_pooling_fp16(struct tensor* input_tensor, struct tensor* output_tensor, } } } - } #endif diff --git a/source/device/cpu/op/pooling/pooling_kernel_ref_fp32.c b/source/device/cpu/op/pooling/pooling_kernel_ref_fp32.c index c78e40bc3..05f499ddf 100644 --- a/source/device/cpu/op/pooling/pooling_kernel_ref_fp32.c +++ b/source/device/cpu/op/pooling/pooling_kernel_ref_fp32.c @@ -41,7 +41,6 @@ #define HCL_POOL_MAX 0 /* Max pooling */ #define HCL_POOL_AVG 1 /* Average pooling */ - static inline float calc_sum_fp32(const float* input, int layout, int c, int h, int w, int cur_ch, int start_h, int start_w, int end_h, int end_w) { @@ -78,7 +77,7 @@ static inline float calc_max_fp32(const float* input, int layout, int c, int h, } int ref_pooling_fp32(struct tensor* input_tensor, struct tensor* output_tensor, - struct pool_param* pool_param, int num_thread) + struct pool_param* pool_param, int num_thread) { int layout = input_tensor->layout; int type = input_tensor->data_type; @@ -105,7 +104,6 @@ int ref_pooling_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int caffe_flavor = pool_param->caffe_flavor; int method = pool_param->pool_method; - float* input = (float*)input_tensor->data; float* output = (float*)output_tensor->data; @@ -141,19 +139,19 @@ int ref_pooling_fp32(struct tensor* input_tensor, struct tensor* output_tensor, if (!caffe_flavor) pool_size = (h_end - h_start) * (w_end - w_start); - + offset = n * output_chw + c * out_h * out_w + ph * out_w + pw; if (method == HCL_POOL_MAX) { float max = calc_max_fp32(input_cur, layout, channel, in_h, in_w, c, h_start, w_start, - h_end, w_end); + h_end, w_end); output[offset] = max; } else if (method == HCL_POOL_AVG) { float sum = calc_sum_fp32(input_cur, layout, channel, in_h, in_w, c, h_start, w_start, - h_end, w_end); + h_end, w_end); output[offset] = sum / pool_size; } else diff --git a/source/device/cpu/op/pooling/pooling_kernel_ref_int8.c b/source/device/cpu/op/pooling/pooling_kernel_ref_int8.c index f33a590d5..1ab3a14cd 100644 --- a/source/device/cpu/op/pooling/pooling_kernel_ref_int8.c +++ b/source/device/cpu/op/pooling/pooling_kernel_ref_int8.c @@ -41,15 +41,14 @@ #define HCL_POOL_MAX 0 /* Max pooling */ #define HCL_POOL_AVG 1 /* Average pooling */ - static inline int calc_sum_int8(const int8_t* input, int layout, int c, int h, int w, int cur_ch, int start_h, int start_w, int end_h, int end_w) { int sum = 0; - for(int i = start_h; i < end_h; i++) - for(int j = start_w; j < end_w; j++) + for (int i = start_h; i < end_h; i++) + for (int j = start_w; j < end_w; j++) { - if(layout == 0) + if (layout == 0) sum += input[cur_ch * h * w + i * w + j]; else sum += input[i * w * c + j * c + cur_ch]; @@ -62,16 +61,16 @@ static inline int8_t calc_max_int8(const int8_t* input, int layout, int c, int h int start_w, int end_h, int end_w) { int8_t max = 0; - if(layout == 0) + if (layout == 0) max = input[cur_ch * h * w + start_h * w + start_w]; else max = input[start_h * w * c + start_w * c + cur_ch]; int8_t tmp = 0; - for(int i = start_h; i < end_h; i++) - for(int j = start_w; j < end_w; j++) + for (int i = start_h; i < end_h; i++) + for (int j = start_w; j < end_w; j++) { - if(layout == 0) + if (layout == 0) tmp = input[cur_ch * h * w + i * w + j]; else tmp = input[i * w * c + j * c + cur_ch]; @@ -83,7 +82,7 @@ static inline int8_t calc_max_int8(const int8_t* input, int layout, int c, int h } int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor, - struct pool_param* pool_param, int num_thread) + struct pool_param* pool_param, int num_thread) { int layout = input_tensor->layout; int type = input_tensor->data_type; @@ -110,8 +109,8 @@ int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor, int caffe_flavor = pool_param->caffe_flavor; int method = pool_param->pool_method; - int8_t* input_int8 = ( int8_t* )input_tensor->data; - int8_t* output_int8 = ( int8_t* )output_tensor->data; + int8_t* input_int8 = (int8_t*)input_tensor->data; + int8_t* output_int8 = (int8_t*)output_tensor->data; float input_scale = input_tensor->scale; float output_scale = output_tensor->scale; @@ -119,7 +118,7 @@ int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor, for (int n = 0; n < batch; n++) { - const int8_t * input_cur = input_int8 + n * input_chw; + const int8_t* input_cur = input_int8 + n * input_chw; for (int c = 0; c < channel; c++) { for (int ph = 0; ph < out_h; ph++) @@ -149,7 +148,7 @@ int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor, if (!caffe_flavor) pool_size = (h_end - h_start) * (w_end - w_start); - if (layout == TENGINE_LAYOUT_NCHW) // nchw + if (layout == TENGINE_LAYOUT_NCHW) // nchw offset = n * output_chw + c * out_h * out_w + ph * out_w + pw; else offset = n * output_chw + ph * out_w * channel + pw * channel + c; @@ -157,9 +156,9 @@ int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor, if (method == HCL_POOL_MAX) { int8_t max = calc_max_int8(input_cur, layout, channel, in_h, in_w, c, h_start, w_start, - h_end, w_end); + h_end, w_end); - int32_t data_i32 = round((float )max * requant_scale); + int32_t data_i32 = round((float)max * requant_scale); if (data_i32 > 127) data_i32 = 127; else if (data_i32 < -127) @@ -172,7 +171,7 @@ int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor, h_end, w_end); float sum_fp32 = sum_i32 * input_scale; sum_fp32 = sum_fp32 / (float)pool_size; - int32_t data_i32 = round((float )sum_fp32 / output_scale); + int32_t data_i32 = round((float)sum_fp32 / output_scale); if (data_i32 > 127) data_i32 = 127; else if (data_i32 < -127) diff --git a/source/device/cpu/op/pooling/pooling_kernel_ref_uint8.c b/source/device/cpu/op/pooling/pooling_kernel_ref_uint8.c index effd0c6ad..54e6b8c68 100644 --- a/source/device/cpu/op/pooling/pooling_kernel_ref_uint8.c +++ b/source/device/cpu/op/pooling/pooling_kernel_ref_uint8.c @@ -41,7 +41,6 @@ #define HCL_POOL_MAX 0 /* Max pooling */ #define HCL_POOL_AVG 1 /* Average pooling */ - static inline float calc_sum_fp32(const float* input, int layout, int c, int h, int w, int cur_ch, int start_h, int start_w, int end_h, int end_w) { @@ -90,7 +89,7 @@ static inline float calc_max_fp32(const float* input, int layout, int c, int h, } int ref_pooling_uint8(struct tensor* input_tensor, struct tensor* output_tensor, - struct pool_param* pool_param, int num_thread) + struct pool_param* pool_param, int num_thread) { int layout = input_tensor->layout; int type = input_tensor->data_type; @@ -117,8 +116,8 @@ int ref_pooling_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int caffe_flavor = pool_param->caffe_flavor; int method = pool_param->pool_method; - uint8_t* input_uint8 = ( uint8_t* )input_tensor->data; - uint8_t* output_uint8 = ( uint8_t* )output_tensor->data; + uint8_t* input_uint8 = (uint8_t*)input_tensor->data; + uint8_t* output_uint8 = (uint8_t*)output_tensor->data; float input_scale = input_tensor->scale; float output_scale = output_tensor->scale; @@ -126,8 +125,8 @@ int ref_pooling_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int output_zero = output_tensor->zero_point; /* input dequant */ - float* input_fp32 = ( float* )sys_malloc(input_tensor->elem_num * sizeof(float)); - float* output_fp32 = ( float* )sys_malloc(output_tensor->elem_num * sizeof(float)); + float* input_fp32 = (float*)sys_malloc(input_tensor->elem_num * sizeof(float)); + float* output_fp32 = (float*)sys_malloc(output_tensor->elem_num * sizeof(float)); for (int i = 0; i < input_tensor->elem_num; i++) input_fp32[i] = (input_uint8[i] - input_zero) * input_scale; @@ -167,7 +166,7 @@ int ref_pooling_uint8(struct tensor* input_tensor, struct tensor* output_tensor, if (!caffe_flavor) pool_size = (h_end - h_start) * (w_end - w_start); - if (layout == TENGINE_LAYOUT_NCHW) // nchw + if (layout == TENGINE_LAYOUT_NCHW) // nchw offset = n * output_chw + c * out_h * out_w + ph * out_w + pw; else offset = n * output_chw + ph * out_w * channel + pw * channel + c; @@ -175,13 +174,13 @@ int ref_pooling_uint8(struct tensor* input_tensor, struct tensor* output_tensor, if (method == HCL_POOL_MAX) { float max = calc_max_fp32(input_cur, layout, channel, in_h, in_w, c, h_start, w_start, - h_end, w_end); + h_end, w_end); output[offset] = max; } else if (method == HCL_POOL_AVG) { float sum = calc_sum_fp32(input_cur, layout, channel, in_h, in_w, c, h_start, w_start, - h_end, w_end); + h_end, w_end); output[offset] = sum / pool_size; } else diff --git a/source/device/cpu/op/pooling/pooling_ref.c b/source/device/cpu/op/pooling/pooling_ref.c index 9261a6eec..df8ecb6a2 100644 --- a/source/device/cpu/op/pooling/pooling_ref.c +++ b/source/device/cpu/op/pooling/pooling_ref.c @@ -38,7 +38,6 @@ #include "pooling_kernel_ref.h" - static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -46,35 +45,34 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem; + struct pool_param* pool_param = (struct pool_param*)ir_node->op.param_mem; int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_pooling_fp32(input_tensor, output_tensor, pool_param, exec_graph->num_thread); else if (input_tensor->data_type == TENGINE_DT_FP16) - #if MACOS +#if MACOS TLOG_ERR("FP16 not support mac os"); - #else +#else ret = ref_pooling_fp16(input_tensor, output_tensor, pool_param, exec_graph->num_thread); - #endif +#endif else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_pooling_uint8(input_tensor, output_tensor, pool_param, exec_graph->num_thread); else if (input_tensor->data_type == TENGINE_DT_INT8) ret = ref_pooling_int8(input_tensor, output_tensor, pool_param, exec_graph->num_thread); else - TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type); + TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type); return 0; } - static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem; + struct pool_param* pool_param = (struct pool_param*)ir_node->op.param_mem; int ret = 0; @@ -84,9 +82,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc int input_w = input_tensor->dims[3]; int output_h, output_w; - if (pool_param->kernel_h == input_h && pool_param->kernel_w == input_w && - pool_param->pad_w0 == 0 && pool_param->pad_w1 == 0 && - pool_param->pad_h0 == 0 && pool_param->pad_h1 == 0) + if (pool_param->kernel_h == input_h && pool_param->kernel_w == input_w && pool_param->pad_w0 == 0 && pool_param->pad_w1 == 0 && pool_param->pad_h0 == 0 && pool_param->pad_h1 == 0) { pool_param->global = 1; } @@ -131,8 +127,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc int dims[4]; - if (output_tensor->dims[1] != channel || output_tensor->dims[2] != output_h || - output_tensor->dims[3] != output_w) + if (output_tensor->dims[1] != channel || output_tensor->dims[2] != output_h || output_tensor->dims[3] != output_w) { dims[0] = batch; dims[1] = channel; @@ -144,31 +139,26 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc return ret; } - static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; } - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* dev) { return 0; } - static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* dev) { return 0; } - static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) { return OPS_SCORE_CANDO; } - static struct node_ops hcl_node_ops = {.prerun = NULL, .run = run, .reshape = reshape, @@ -177,13 +167,11 @@ static struct node_ops hcl_node_ops = {.prerun = NULL, .release_node = release_node, .score = score}; - int register_pooling_ref_op() { return register_builtin_node_ops(OP_POOL, &hcl_node_ops); } - int unregister_pooling_ref_op() { unregister_builtin_node_ops(OP_POOL, &hcl_node_ops); diff --git a/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c b/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c index 2c96992da..9012a5686 100644 --- a/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c +++ b/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c @@ -36,7 +36,6 @@ #include - static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -48,8 +47,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - if (input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || - input_tensor->dims[3] != output_tensor->dims[3]) + if (input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || input_tensor->dims[3] != output_tensor->dims[3]) ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num); return ret; diff --git a/source/device/cpu/op/prelu/cortex_a/prelu_kernel_arm.c b/source/device/cpu/op/prelu/cortex_a/prelu_kernel_arm.c index feb139cfc..33a6c6817 100644 --- a/source/device/cpu/op/prelu/cortex_a/prelu_kernel_arm.c +++ b/source/device/cpu/op/prelu/cortex_a/prelu_kernel_arm.c @@ -26,7 +26,6 @@ #include - #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b)) diff --git a/source/device/cpu/op/prelu/prelu_ref.c b/source/device/cpu/op/prelu/prelu_ref.c index d338122be..6dd8e4151 100644 --- a/source/device/cpu/op/prelu/prelu_ref.c +++ b/source/device/cpu/op/prelu/prelu_ref.c @@ -34,7 +34,6 @@ #include - #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -155,7 +154,7 @@ static int ref_prelu_fp32(struct tensor* input_tensor, struct tensor* output_ten } } - return 0; + return 0; } static int ref_prelu_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* slope_tensor) @@ -176,13 +175,13 @@ static int ref_prelu_uint8(struct tensor* input_tensor, struct tensor* output_te int output_size = output_tensor->elem_num; int slope_size = slope_tensor->elem_num; - float* input_fp32 = ( float* )sys_malloc(input_size * sizeof(float)); - float* output_fp32 = ( float* )sys_malloc(output_size * sizeof(float)); - float* slope_fp32 = ( float* )sys_malloc(slope_size * sizeof(float)); + float* input_fp32 = (float*)sys_malloc(input_size * sizeof(float)); + float* output_fp32 = (float*)sys_malloc(output_size * sizeof(float)); + float* slope_fp32 = (float*)sys_malloc(slope_size * sizeof(float)); for (int i = 0; i < input_size; i++) { - input_fp32[i] = (( float )input_data[i] - ( float )input_zero) * input_scale; + input_fp32[i] = ((float)input_data[i] - (float)input_zero) * input_scale; } for (int i = 0; i < slope_size; i++) { @@ -333,17 +332,17 @@ static int ref_prelu_int8(struct tensor* input_tensor, struct tensor* output_ten int output_size = output_tensor->elem_num; int slope_size = slope_tensor->elem_num; - float* input_fp32 = ( float* )sys_malloc(input_size * sizeof(float)); - float* output_fp32 = ( float* )sys_malloc(output_size * sizeof(float)); - float* slope_fp32 = ( float* )sys_malloc(slope_size * sizeof(float)); + float* input_fp32 = (float*)sys_malloc(input_size * sizeof(float)); + float* output_fp32 = (float*)sys_malloc(output_size * sizeof(float)); + float* slope_fp32 = (float*)sys_malloc(slope_size * sizeof(float)); for (int i = 0; i < input_size; i++) { - input_fp32[i] = ( float )data[i] * input_scale; + input_fp32[i] = (float)data[i] * input_scale; } for (int i = 0; i < slope_size; i++) { - slope_fp32[i] = ( float )slope[i] * slope_scale; + slope_fp32[i] = (float)slope[i] * slope_scale; } int offset = 0; @@ -408,8 +407,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - if (input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || - input_tensor->dims[3] != output_tensor->dims[3]) + if (input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || input_tensor->dims[3] != output_tensor->dims[3]) ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num); return ret; @@ -430,9 +428,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_prelu_fp32(input_tensor, output_tensor, slope_tensor); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_prelu_uint8(input_tensor, output_tensor, slope_tensor); - else if(input_tensor->data_type == TENGINE_DT_INT8) + else if (input_tensor->data_type == TENGINE_DT_INT8) ret = ref_prelu_int8(input_tensor, output_tensor, slope_tensor); else TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type); diff --git a/source/device/cpu/op/priorbox/priorbox_ref.c b/source/device/cpu/op/priorbox/priorbox_ref.c index 99f3ccc05..39df5ec09 100644 --- a/source/device/cpu/op/priorbox/priorbox_ref.c +++ b/source/device/cpu/op/priorbox/priorbox_ref.c @@ -36,7 +36,6 @@ #include - #define T_MAX(a, b) ((a) > (b) ? (a) : (b)) #define T_MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -55,15 +54,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; struct tensor* featmap_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - struct tensor* data_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); - struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - priorbox_param_t* param = ( priorbox_param_t* )(ir_node->op.param_mem); + struct tensor* data_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); + struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + priorbox_param_t* param = (priorbox_param_t*)(ir_node->op.param_mem); float* output_fp32 = NULL; if (output_tensor->data_type == TENGINE_DT_FP32) - output_fp32 = ( float* )output_tensor->data; + output_fp32 = (float*)output_tensor->data; else if (output_tensor->data_type == TENGINE_DT_UINT8 || output_tensor->data_type == TENGINE_DT_INT8) - output_fp32 = ( float* )sys_malloc(output_tensor->elem_num * sizeof(float )); + output_fp32 = (float*)sys_malloc(output_tensor->elem_num * sizeof(float)); const int data_height = data_tensor->dims[2]; const int data_width = data_tensor->dims[3]; @@ -83,8 +82,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex float step_w, step_h; if (param->step_h == 0 || param->step_w == 0) { - step_w = ( float )(image_w) / feat_width; - step_h = ( float )(image_h) / feat_height; + step_w = (float)(image_w) / feat_width; + step_h = (float)(image_h) / feat_height; } else { @@ -105,7 +104,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex float center_x = (w + offset_) * step_w; float center_y = (h + offset_) * step_h; float box_width, box_height; - for (int s = 0; s < ( int )param->min_size_num; ++s) + for (int s = 0; s < (int)param->min_size_num; ++s) { int min_size_ = param->min_size[s]; // first prior: aspect_ratio = 1, size = min_size @@ -130,7 +129,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex } // rest of priors - for (int r = 0; r < ( int )param->aspect_ratio_size; ++r) + for (int r = 0; r < (int)param->aspect_ratio_size; ++r) { float ar = param->aspect_ratio[r]; @@ -179,7 +178,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex { uint8_t* output_org = (uint8_t*)output_tensor->data; - for (int i=0; ielem_num; i++) + for (int i = 0; i < output_tensor->elem_num; i++) { int udata = (int)(output_fp32[i] / output_tensor->scale + output_tensor->zero_point); if (udata > 255) @@ -197,7 +196,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex { int8_t* output_org = (int8_t*)output_tensor->data; - for (int i=0; ielem_num; i++) + for (int i = 0; i < output_tensor->elem_num; i++) { int data_i32 = round(output_fp32[i] / output_tensor->scale); if (data_i32 > 127) diff --git a/source/device/cpu/op/psroipooling/psroipooling_ref.c b/source/device/cpu/op/psroipooling/psroipooling_ref.c index e7ff48fb9..9039a3f8d 100644 --- a/source/device/cpu/op/psroipooling/psroipooling_ref.c +++ b/source/device/cpu/op/psroipooling/psroipooling_ref.c @@ -36,7 +36,6 @@ #include - #define T_MAX(a, b) ((a) > (b) ? (a) : (b)) #define T_MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -68,8 +67,8 @@ static int ref_psroipooling_fp32(struct tensor* featmap_tensor, struct tensor* r int roi_w = T_MAX(roi_x1 - roi_x0, 0); int roi_h = T_MAX(roi_y1 - roi_y0, 0); - float bin_w = ( float )roi_w / ( float )out_w; - float bin_h = ( float )roi_h / ( float )out_h; + float bin_w = (float)roi_w / (float)out_w; + float bin_h = (float)roi_h / (float)out_h; for (int c = 0; c < output_dim; c++) { @@ -80,10 +79,10 @@ static int ref_psroipooling_fp32(struct tensor* featmap_tensor, struct tensor* r { float* inptr = featmap + (c * out_h + h) * out_w + w; - int hstart = floor(roi_y0 + ( float )( h )*bin_h); - int wstart = floor(roi_x0 + ( float )( w )*bin_w); - int hend = ceil(roi_y0 + ( float )(h + 1) * bin_h); - int wend = ceil(roi_x0 + ( float )(w + 1) * bin_w); + int hstart = floor(roi_y0 + (float)(h)*bin_h); + int wstart = floor(roi_x0 + (float)(w)*bin_w); + int hend = ceil(roi_y0 + (float)(h + 1) * bin_h); + int wend = ceil(roi_x0 + (float)(w + 1) * bin_w); hstart = T_MIN(T_MAX(hstart, 0), in_h); wstart = T_MIN(T_MAX(wstart, 0), in_w); @@ -102,7 +101,7 @@ static int ref_psroipooling_fp32(struct tensor* featmap_tensor, struct tensor* r sum += inptr[index]; } } - outptr[w] = is_empty ? 0.f : (sum / ( float )area); + outptr[w] = is_empty ? 0.f : (sum / (float)area); } outptr += out_w; } @@ -133,7 +132,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex featmap_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); roi_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct psroipooling_param* psroipooling_param = ( struct psroipooling_param* )ir_node->op.param_mem; + struct psroipooling_param* psroipooling_param = (struct psroipooling_param*)ir_node->op.param_mem; ref_psroipooling_fp32(featmap_tensor, roi_tensor, output_tensor, psroipooling_param, exec_graph->num_thread); diff --git a/source/device/cpu/op/reciprocal/reciprocal_ref.c b/source/device/cpu/op/reciprocal/reciprocal_ref.c index 95903a2f4..c770bb657 100644 --- a/source/device/cpu/op/reciprocal/reciprocal_ref.c +++ b/source/device/cpu/op/reciprocal/reciprocal_ref.c @@ -72,8 +72,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - int ret = -1; - if(input_tensor->data_type == TENGINE_DT_FP32) + int ret = -1; + if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_reciprocal_fp32(input_tensor, output_tensor, exec_graph->num_thread); else printf("Input data type %d not to be supported.\n", input_tensor->data_type); @@ -98,13 +98,13 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc } static struct node_ops hcl_node_ops = { - .prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score}; int register_reciprocal_ref_op() { diff --git a/source/device/cpu/op/reducel2/reducel2_ref.c b/source/device/cpu/op/reducel2/reducel2_ref.c index 3108faf76..e92f98caf 100644 --- a/source/device/cpu/op/reducel2/reducel2_ref.c +++ b/source/device/cpu/op/reducel2/reducel2_ref.c @@ -36,7 +36,6 @@ #include - struct ref_reducel2_param { int axis; @@ -89,10 +88,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct graph* ir_graph = ir_node->graph; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct reducel2_param* op_param = ( struct reducel2_param* )ir_node->op.param_mem; + struct reducel2_param* op_param = (struct reducel2_param*)ir_node->op.param_mem; - void* in_data = ( void* )input_tensor->data; - void* out_data = ( void* )output_tensor->data; + void* in_data = (void*)input_tensor->data; + void* out_data = (void*)output_tensor->data; struct ref_reducel2_param param; diff --git a/source/device/cpu/op/reduction/reduction_kernel_ref.h b/source/device/cpu/op/reduction/reduction_kernel_ref.h index 4aa4f1ec1..c3459b2da 100644 --- a/source/device/cpu/op/reduction/reduction_kernel_ref.h +++ b/source/device/cpu/op/reduction/reduction_kernel_ref.h @@ -30,7 +30,6 @@ #include #include - #define FLOAT_MAX 3.4028235E38 #define FLOAT_MIN -3.4028235E38 @@ -148,7 +147,7 @@ struct reduce_param_ref }; static int ref_reduce_uint8(uint8_t* data, uint8_t* out_data, int dim0, int dim1, int dim2, int dim3, int out_size, - struct reduce_param_ref* param, int dim_num, int* dims) + struct reduce_param_ref* param, int dim_num, int* dims) { int offset = 0; int param_dim0 = param->param_dim[0]; @@ -164,7 +163,7 @@ static int ref_reduce_uint8(uint8_t* data, uint8_t* out_data, int dim0, int dim1 { if (param_dim0 == 1 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2 && (dim_num > 4)) { - if(dim_num == 5) + if (dim_num == 5) { sum_5d_ax1_uint8(dims, dim_num, data, out_data, in_scale, in_zp, out_scale, out_zp); } @@ -177,7 +176,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int struct reduce_param_ref* param, int dim_num, int* dims) { int offset = 0; - float* tmp = ( float* )sys_malloc(sizeof(float) * out_size); + float* tmp = (float*)sys_malloc(sizeof(float) * out_size); memset(tmp, 0, sizeof(float) * out_size); int param_dim0 = param->param_dim[0]; int param_dim1 = param->param_dim[1]; @@ -186,8 +185,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int if (param->type == 0) { - if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || - (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) + if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) { for (int n = 0; n < dim0; n++) { @@ -205,9 +203,10 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int } } } - else if(param_dim0 == 1 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2 && (dim_num > 4)) + else if (param_dim0 == 1 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2 && (dim_num > 4)) { - if(dim_num == 5){ + if (dim_num == 5) + { sum_5d_ax1(dims, dim_num, data, tmp); } } @@ -215,7 +214,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int { sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp); } - else if (param_dim0 == 1 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2 && (dim_num <= 4) ) + else if (param_dim0 == 1 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2 && (dim_num <= 4)) { fprintf(stderr, "wrond dim_num %d \n", dim_num); sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp); @@ -228,83 +227,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int { sum_4d_ax3(dim0, dim1, dim2, dim3, data, tmp); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); sum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01); free(tmp_01); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); sum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02); free(tmp_02); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3); sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03); sum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03); free(tmp_03); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); sum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12); free(tmp_12); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3); sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13); sum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13); free(tmp_13); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) { // reduce on axis2 - float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3); + float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3); memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3); sum_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23); sum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23); free(tmp_23); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_0, 0, sizeof(float) * dim2 * dim3); sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -314,18 +302,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_0); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_1, 0, sizeof(float) * dim2 * dim3); sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -335,18 +318,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3); memset(tmp_1, 0, sizeof(float) * dim1 * dim3); sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); @@ -356,18 +334,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_02); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) + else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) { // reduce on axis0 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3); memset(tmp_1, 0, sizeof(float) * dim0 * dim3); sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); @@ -381,8 +354,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int // reduce mean else if (param->type == 1) { - if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || - (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) + if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) { float s_tmp = 0.f; for (int n = 0; n < dim0; n++) @@ -418,83 +390,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int { mean_4d_ax3(dim0, dim1, dim2, dim3, data, tmp); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); mean_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); mean_3d_ax0(dim1, dim2, dim3, tmp, tmp_01); free(tmp_01); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); mean_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); mean_3d_ax1(dim1, dim2, dim3, tmp, tmp_02); free(tmp_02); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3); mean_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03); mean_3d_ax2(dim1, dim2, dim3, tmp, tmp_03); free(tmp_03); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); mean_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); mean_3d_ax1(dim0, dim2, dim3, tmp, tmp_12); free(tmp_12); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3); mean_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13); mean_3d_ax2(dim0, dim2, dim3, tmp, tmp_13); free(tmp_13); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) { // reduce on axis2 - float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3); + float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3); memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3); mean_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23); mean_3d_ax2(dim0, dim1, dim3, tmp, tmp_23); free(tmp_23); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_0, 0, sizeof(float) * dim2 * dim3); mean_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -504,18 +465,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_0); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_1, 0, sizeof(float) * dim2 * dim3); mean_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -525,18 +481,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3); memset(tmp_1, 0, sizeof(float) * dim1 * dim3); mean_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); @@ -546,18 +497,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_02); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) + else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) { // reduce on axis0 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3); memset(tmp_1, 0, sizeof(float) * dim0 * dim3); mean_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); @@ -571,8 +517,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int // reduce asum else if (param->type == 2) { - if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || - (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) + if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) { float s_tmp = 0.f; for (int n = 0; n < dim0; n++) @@ -608,83 +553,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int { asum_4d_ax3(dim0, dim1, dim2, dim3, data, tmp); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); sum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01); free(tmp_01); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); sum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02); free(tmp_02); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3); asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03); sum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03); free(tmp_03); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); asum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); sum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12); free(tmp_12); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3); asum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13); sum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13); free(tmp_13); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) { // reduce on axis2 - float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3); + float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3); memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3); asum_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23); sum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23); free(tmp_23); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_0, 0, sizeof(float) * dim2 * dim3); asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -694,18 +628,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_0); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_1, 0, sizeof(float) * dim2 * dim3); asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -715,18 +644,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3); memset(tmp_1, 0, sizeof(float) * dim1 * dim3); asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); @@ -736,18 +660,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_02); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) + else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) { // reduce on axis0 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3); memset(tmp_1, 0, sizeof(float) * dim0 * dim3); asum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); @@ -761,8 +680,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int // reduce sqsum else if (param->type == 3) { - if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || - (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) + if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) { float s_tmp = 0.f; for (int n = 0; n < dim0; n++) @@ -798,83 +716,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int { sqsum_4d_ax3(dim0, dim1, dim2, dim3, data, tmp); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); sqsum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); sum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01); free(tmp_01); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); sqsum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); sum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02); free(tmp_02); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3); sqsum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03); sum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03); free(tmp_03); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); sqsum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); sum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12); free(tmp_12); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3); sqsum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13); sum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13); free(tmp_13); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) { // reduce on axis2 - float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3); + float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3); memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3); sqsum_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23); sum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23); free(tmp_23); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_0, 0, sizeof(float) * dim2 * dim3); sqsum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -884,18 +791,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_0); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_1, 0, sizeof(float) * dim2 * dim3); sqsum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -905,18 +807,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3); memset(tmp_1, 0, sizeof(float) * dim1 * dim3); sqsum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); @@ -926,18 +823,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_02); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) + else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) { // reduce on axis0 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3); memset(tmp_1, 0, sizeof(float) * dim0 * dim3); sqsum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); @@ -951,8 +843,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int // reduce max else if (param->type == 4) { - if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || - (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) + if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) { float s_tmp = FLOAT_MIN; for (int n = 0; n < dim0; n++) @@ -989,83 +880,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int { max_4d_ax3(dim0, dim1, dim2, dim3, data, tmp); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); max_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); max_3d_ax0(dim1, dim2, dim3, tmp, tmp_01); free(tmp_01); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); max_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); max_3d_ax1(dim1, dim2, dim3, tmp, tmp_02); free(tmp_02); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3); max_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03); max_3d_ax2(dim1, dim2, dim3, tmp, tmp_03); free(tmp_03); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); max_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); max_3d_ax1(dim0, dim2, dim3, tmp, tmp_12); free(tmp_12); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3); max_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13); max_3d_ax2(dim0, dim2, dim3, tmp, tmp_13); free(tmp_13); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) { // reduce on axis2 - float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3); + float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3); memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3); max_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23); max_3d_ax2(dim0, dim1, dim3, tmp, tmp_23); free(tmp_23); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_0, 0, sizeof(float) * dim2 * dim3); max_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -1075,18 +955,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_0); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_1, 0, sizeof(float) * dim2 * dim3); max_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -1096,18 +971,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3); memset(tmp_1, 0, sizeof(float) * dim1 * dim3); max_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); @@ -1117,18 +987,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_02); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) + else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) { // reduce on axis0 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3); memset(tmp_1, 0, sizeof(float) * dim0 * dim3); max_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); @@ -1142,8 +1007,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int // reduce min else if (param->type == 5) { - if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || - (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) + if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) { float s_tmp = FLOAT_MAX; for (int n = 0; n < dim0; n++) @@ -1180,83 +1044,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int { min_4d_ax3(dim0, dim1, dim2, dim3, data, tmp); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); min_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); min_3d_ax0(dim1, dim2, dim3, tmp, tmp_01); free(tmp_01); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); min_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); min_3d_ax1(dim1, dim2, dim3, tmp, tmp_02); free(tmp_02); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3); min_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03); min_3d_ax2(dim1, dim2, dim3, tmp, tmp_03); free(tmp_03); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); min_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); min_3d_ax1(dim0, dim2, dim3, tmp, tmp_12); free(tmp_12); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3); min_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13); min_3d_ax2(dim0, dim2, dim3, tmp, tmp_13); free(tmp_13); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) { // reduce on axis2 - float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3); + float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3); memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3); min_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23); min_3d_ax2(dim0, dim1, dim3, tmp, tmp_23); free(tmp_23); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_0, 0, sizeof(float) * dim2 * dim3); min_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -1266,18 +1119,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_0); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_1, 0, sizeof(float) * dim2 * dim3); min_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -1287,18 +1135,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3); memset(tmp_1, 0, sizeof(float) * dim1 * dim3); min_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); @@ -1308,18 +1151,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_02); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) + else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) { // reduce on axis0 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3); memset(tmp_1, 0, sizeof(float) * dim0 * dim3); min_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); @@ -1333,8 +1171,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int // reduce prod else if (param->type == 6) { - if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || - (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) + if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) { float s_tmp = 1.f; for (int n = 0; n < dim0; n++) @@ -1370,83 +1207,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int { prod_4d_ax3(dim0, dim1, dim2, dim3, data, tmp); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); prod_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); prod_3d_ax0(dim1, dim2, dim3, tmp, tmp_01); free(tmp_01); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); prod_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); prod_3d_ax1(dim1, dim2, dim3, tmp, tmp_02); free(tmp_02); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3); prod_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03); prod_3d_ax2(dim1, dim2, dim3, tmp, tmp_03); free(tmp_03); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); prod_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); prod_3d_ax1(dim0, dim2, dim3, tmp, tmp_12); free(tmp_12); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3); prod_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13); prod_3d_ax2(dim0, dim2, dim3, tmp, tmp_13); free(tmp_13); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) { // reduce on axis2 - float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3); + float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3); memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3); prod_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23); prod_3d_ax2(dim0, dim1, dim3, tmp, tmp_23); free(tmp_23); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_0, 0, sizeof(float) * dim2 * dim3); prod_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -1456,18 +1282,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_0); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_1, 0, sizeof(float) * dim2 * dim3); prod_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -1477,18 +1298,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3); memset(tmp_1, 0, sizeof(float) * dim1 * dim3); prod_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); @@ -1498,18 +1314,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_02); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) + else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) { // reduce on axis0 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3); memset(tmp_1, 0, sizeof(float) * dim0 * dim3); prod_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); @@ -1523,8 +1334,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int // reduce l1 else if (param->type == 7) { - if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || - (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) + if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) { float s_tmp = 0.f; for (int n = 0; n < dim0; n++) @@ -1560,83 +1370,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int { asum_4d_ax3(dim0, dim1, dim2, dim3, data, tmp); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); sum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01); free(tmp_01); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); sum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02); free(tmp_02); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3); asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03); sum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03); free(tmp_03); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); asum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); sum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12); free(tmp_12); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3); asum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13); sum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13); free(tmp_13); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) { // reduce on axis2 - float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3); + float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3); memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3); asum_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23); sum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23); free(tmp_23); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_0, 0, sizeof(float) * dim2 * dim3); asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -1646,18 +1445,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_0); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_1, 0, sizeof(float) * dim2 * dim3); asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -1667,18 +1461,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3); memset(tmp_1, 0, sizeof(float) * dim1 * dim3); asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); @@ -1688,18 +1477,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_02); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) + else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) { // reduce on axis0 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3); memset(tmp_1, 0, sizeof(float) * dim0 * dim3); asum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); @@ -1713,8 +1497,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int // reduce l2 else if (param->type == 8) { - if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || - (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) + if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) { float s_tmp = 0.f; for (int n = 0; n < dim0; n++) @@ -1750,83 +1533,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int { l2_4d_ax3(dim0, dim1, dim2, dim3, data, tmp); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); l2_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); sum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01); free(tmp_01); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); l2_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); sum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02); free(tmp_02); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3); l2_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03); sum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03); free(tmp_03); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); l2_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); sum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12); free(tmp_12); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3); l2_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13); sum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13); free(tmp_13); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) { // reduce on axis2 - float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3); + float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3); memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3); l2_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23); sum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23); free(tmp_23); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_0, 0, sizeof(float) * dim2 * dim3); l2_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -1836,18 +1608,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_0); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_1, 0, sizeof(float) * dim2 * dim3); l2_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -1857,18 +1624,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3); memset(tmp_1, 0, sizeof(float) * dim1 * dim3); l2_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); @@ -1878,18 +1640,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_02); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) + else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) { // reduce on axis0 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3); memset(tmp_1, 0, sizeof(float) * dim0 * dim3); l2_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); @@ -1903,8 +1660,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int // reduce log sum else if (param->type == 9) { - if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || - (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) + if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) { float s_tmp = 0.f; for (int n = 0; n < dim0; n++) @@ -1940,83 +1696,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int { logsum_4d_ax3(dim0, dim1, dim2, dim3, data, tmp); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); logsum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01); free(tmp_01); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); logsum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02); free(tmp_02); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3); sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03); logsum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03); free(tmp_03); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); logsum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12); free(tmp_12); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3); sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13); logsum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13); free(tmp_13); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) { // reduce on axis2 - float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3); + float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3); memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3); sum_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23); logsum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23); free(tmp_23); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_0, 0, sizeof(float) * dim2 * dim3); sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -2026,18 +1771,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_0); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_1, 0, sizeof(float) * dim2 * dim3); sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -2047,18 +1787,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3); memset(tmp_1, 0, sizeof(float) * dim1 * dim3); sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); @@ -2068,18 +1803,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_02); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) + else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) { // reduce on axis0 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3); memset(tmp_1, 0, sizeof(float) * dim0 * dim3); sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); @@ -2092,8 +1822,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int } else if (param->type == 10) { - if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || - (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) + if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3)) { float s_tmp = 0.f; for (int n = 0; n < dim0; n++) @@ -2129,83 +1858,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int { logsumexp_4d_ax3(dim0, dim1, dim2, dim3, data, tmp); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); sumexp_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); logsum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01); free(tmp_01); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); sumexp_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); logsum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02); free(tmp_02); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0))) { // reduce on axis0 - float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3); sumexp_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03); logsum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03); free(tmp_03); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); sumexp_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); logsum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12); free(tmp_12); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1))) { // reduce on axis1 - float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3); sumexp_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13); logsum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13); free(tmp_13); } - else if (param_dim2 == -2 && param_dim3 == -2 && - ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) + else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2))) { // reduce on axis2 - float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3); + float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3); memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3); sumexp_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23); logsum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23); free(tmp_23); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_0, 0, sizeof(float) * dim2 * dim3); sumexp_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -2215,18 +1933,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_0); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3); memset(tmp_1, 0, sizeof(float) * dim2 * dim3); sumexp_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01); @@ -2236,18 +1949,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_01); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || - (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) + else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0))) { // reduce on axis0 - float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3); + float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3); memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3); memset(tmp_1, 0, sizeof(float) * dim1 * dim3); sumexp_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02); @@ -2257,18 +1965,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int free(tmp_02); free(tmp_1); } - else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || - (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || - (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || - (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || - (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || - (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) + else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1))) { // reduce on axis0 - float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3); + float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3); memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3); - float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3); + float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3); memset(tmp_1, 0, sizeof(float) * dim0 * dim3); sumexp_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12); @@ -2436,11 +2139,14 @@ void sum_5d_ax1(int* dims, int dim_num, float* data, float* tmp) int dim2 = dims[2]; int dim3 = dims[3]; int dim4 = dims[4]; - int chw = dim2*dim3*dim4; - for(int j = 0; j < dim0; j++){ - for(int n = 0; n < dim1; n++){ - for(int size = 0; size < chw; size++){ - tmp[size] += data[n*chw + size]; + int chw = dim2 * dim3 * dim4; + for (int j = 0; j < dim0; j++) + { + for (int n = 0; n < dim1; n++) + { + for (int size = 0; size < chw; size++) + { + tmp[size] += data[n * chw + size]; } } } @@ -2453,20 +2159,24 @@ void sum_5d_ax1_uint8(int* dims, int dim_num, uint8_t* data, uint8_t* out_data, int dim2 = dims[2]; int dim3 = dims[3]; int dim4 = dims[4]; - int chw = dim2*dim3*dim4; + int chw = dim2 * dim3 * dim4; - float* tmp = ( float* )malloc(sizeof(float) * chw); + float* tmp = (float*)malloc(sizeof(float) * chw); memset(tmp, 0, sizeof(float) * chw); - for(int j = 0; j < dim0; j++){ - for(int n = 0; n < dim1; n++){ - for(int size = 0; size < chw; size++){ - float tmp_in_data = in_scale * (data[n*chw + size] - in_zp); + for (int j = 0; j < dim0; j++) + { + for (int n = 0; n < dim1; n++) + { + for (int size = 0; size < chw; size++) + { + float tmp_in_data = in_scale * (data[n * chw + size] - in_zp); tmp[size] += tmp_in_data; } } } - for(int size = 0; size < chw; size++){ + for (int size = 0; size < chw; size++) + { int32_t data_i32 = round(tmp[size] / out_scale + out_zp); if (data_i32 > 255) data_i32 = 255; @@ -3288,7 +2998,7 @@ void l2_4d_ax0(int dim0, int dim1, int dim2, int dim3, float* data, float* tmp) for (int n = 0; n < dim0; n++) { int offset = n * dim1 * dim2 * dim3 + j; - tmp[j] += sqrt((double )data[offset] * data[offset]); + tmp[j] += sqrt((double)data[offset] * data[offset]); } } } @@ -3301,7 +3011,7 @@ void l2_4d_ax1(int dim0, int dim1, int dim2, int dim3, float* data, float* tmp) for (int h = 0; h < dim1; h++) { int offset = n * dim1 * dim2 * dim3 + h * dim2 * dim3 + cw; - tmp[n * dim2 * dim3 + cw] += sqrt((double )data[offset] * data[offset]); + tmp[n * dim2 * dim3 + cw] += sqrt((double)data[offset] * data[offset]); } } } @@ -3317,7 +3027,7 @@ void l2_4d_ax2(int dim0, int dim1, int dim2, int dim3, float* data, float* tmp) for (int w = 0; w < dim2; w++) { int offset = n * dim1 * dim2 * dim3 + h * dim2 * dim3 + w * dim3 + c; - tmp[n * dim1 * dim3 + h * dim3 + c] += sqrt((double )data[offset] * data[offset]); + tmp[n * dim1 * dim3 + h * dim3 + c] += sqrt((double)data[offset] * data[offset]); } } } @@ -3334,7 +3044,7 @@ void l2_4d_ax3(int dim0, int dim1, int dim2, int dim3, float* data, float* tmp) for (int c = 0; c < dim3; c++) { int offset = n * dim1 * dim2 * dim3 + h * dim2 * dim3 + w * dim3 + c; - tmp[n * dim1 * dim2 + h * dim2 + w] += sqrt((double )data[offset] * data[offset]); + tmp[n * dim1 * dim2 + h * dim2 + w] += sqrt((double)data[offset] * data[offset]); } } } @@ -3347,7 +3057,7 @@ void l2_3d_ax0(int dim1, int dim2, int dim3, float* tmp, float* tmp_01) for (int h = 0; h < dim1; h++) { int index = h * dim2 * dim3 + wc; - tmp[wc] += sqrt((double )tmp_01[index] * tmp_01[index]); + tmp[wc] += sqrt((double)tmp_01[index] * tmp_01[index]); } } } @@ -3360,7 +3070,7 @@ void l2_3d_ax1(int dim1, int dim2, int dim3, float* tmp, float* tmp_02) for (int w = 0; w < dim2; w++) { int index = h * dim2 * dim3 + w * dim3 + c; - tmp[h * dim3 + c] += sqrt((double )tmp_02[index] * tmp_02[index]); + tmp[h * dim3 + c] += sqrt((double)tmp_02[index] * tmp_02[index]); } } } @@ -3374,7 +3084,7 @@ void l2_3d_ax2(int dim1, int dim2, int dim3, float* tmp, float* tmp_03) for (int c = 0; c < dim3; c++) { int index = h * dim2 * dim3 + w * dim3 + c; - tmp[h * dim2 + w] += sqrt((double )tmp_03[index] * tmp_03[index]); + tmp[h * dim2 + w] += sqrt((double)tmp_03[index] * tmp_03[index]); } } } @@ -3386,7 +3096,7 @@ void l2_2d_ax0(int dim1, int dim2, float* tmp, float* tmp_0) for (int h = 0; h < dim1; h++) { int index = h * dim2 + w; - tmp[w] += sqrt((double )tmp_0[index] * tmp_0[index]); + tmp[w] += sqrt((double)tmp_0[index] * tmp_0[index]); } } } @@ -3397,7 +3107,7 @@ void l2_2d_ax1(int dim1, int dim2, float* tmp, float* tmp_1) for (int w = 0; w < dim2; w++) { int index = h * dim2 + w; - tmp[h] += sqrt((double )tmp_1[index] * tmp_1[index]); + tmp[h] += sqrt((double)tmp_1[index] * tmp_1[index]); } } } diff --git a/source/device/cpu/op/reduction/reduction_ref.c b/source/device/cpu/op/reduction/reduction_ref.c index d4641652a..f3353f326 100644 --- a/source/device/cpu/op/reduction/reduction_ref.c +++ b/source/device/cpu/op/reduction/reduction_ref.c @@ -38,7 +38,6 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -61,7 +60,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct reduction_param* reduction_param = ( struct reduction_param* )ir_node->op.param_mem; + struct reduction_param* reduction_param = (struct reduction_param*)ir_node->op.param_mem; struct reduce_param_ref param; int out_tensor_size = 1; for (int i = 0; i < output_tensor->dim_num; i++) @@ -71,8 +70,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int element_size = output_tensor->elem_size; // int dims[4] = {1, 1, 1, 1}; - int* dims = (int*)malloc(input_tensor->dim_num*sizeof(int)); - memset(dims, 0, input_tensor->dim_num*sizeof(int)); + int* dims = (int*)malloc(input_tensor->dim_num * sizeof(int)); + memset(dims, 0, input_tensor->dim_num * sizeof(int)); for (int i = 0; i < input_tensor->dim_num; i++) { dims[i] = input_tensor->dims[i]; @@ -81,7 +80,6 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int dim1 = dims[1]; int dim2 = dims[2]; int dim3 = dims[3]; - param.param_dim[0] = reduction_param->dim_0; param.param_dim[1] = reduction_param->dim_1; @@ -93,8 +91,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int ret = 0; if (input_tensor->data_type == TENGINE_DT_FP32) { - ret = ref_reduce_fp32(( float* )input_tensor->data, ( float* )output_tensor->data, dim0, dim1, dim2, dim3, - out_tensor_size, ¶m, in_dim_num, dims); + ret = ref_reduce_fp32((float*)input_tensor->data, (float*)output_tensor->data, dim0, dim1, dim2, dim3, + out_tensor_size, ¶m, in_dim_num, dims); } else if (input_tensor->data_type == TENGINE_DT_UINT8) { @@ -102,8 +100,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex param.output_scale = output_tensor->scale; param.input_zp = input_tensor->zero_point; param.output_zp = output_tensor->zero_point; - ret = ref_reduce_uint8(( uint8_t* )input_tensor->data, ( uint8_t* )output_tensor->data, dim0, dim1, dim2, dim3, - out_tensor_size, ¶m, in_dim_num, dims); + ret = ref_reduce_uint8((uint8_t*)input_tensor->data, (uint8_t*)output_tensor->data, dim0, dim1, dim2, dim3, + out_tensor_size, ¶m, in_dim_num, dims); } free(dims); diff --git a/source/device/cpu/op/region/region_ref.c b/source/device/cpu/op/region/region_ref.c index 1b02e8178..3bb0b37a1 100644 --- a/source/device/cpu/op/region/region_ref.c +++ b/source/device/cpu/op/region/region_ref.c @@ -37,7 +37,6 @@ #include #include - static int entry_index(int batch, int location, int entry, int hw, int chw, int classes) { int coords = 4; @@ -157,7 +156,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct region_param* region_param = ( struct region_param* )ir_node->op.param_mem; + struct region_param* region_param = (struct region_param*)ir_node->op.param_mem; ref_region_fp32(input_tensor, output_tensor, region_param, exec_graph->num_thread); diff --git a/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c b/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c index 2a49d6fbf..0f885ba8b 100644 --- a/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c +++ b/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c @@ -30,7 +30,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -51,7 +50,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct relu_param* relu_param = ( struct relu_param* )ir_node->op.param_mem; + struct relu_param* relu_param = (struct relu_param*)ir_node->op.param_mem; perf_relu_fp32(input_tensor, output_tensor, relu_param->negative_slope, exec_graph->num_thread); diff --git a/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.h b/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.h index 35296a1d0..f1ff0f56a 100644 --- a/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.h +++ b/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.h @@ -33,7 +33,6 @@ #include - static int perf_relu_fp32(struct tensor* input_tensor, struct tensor* output_tensor, float negative_slope, int num_thread) { @@ -66,7 +65,7 @@ static int perf_relu_fp32(struct tensor* input_tensor, struct tensor* output_ten int remain = size - (nn << 2); #else int remain = size; -#endif // __ARM_NEON +#endif // __ARM_NEON #if __ARM_NEON float32x4_t _zero = vdupq_n_f32(0.f); @@ -110,7 +109,7 @@ static int perf_relu_fp32(struct tensor* input_tensor, struct tensor* output_ten int remain = size - (nn << 2); #else int remain = size; -#endif // __ARM_NEON +#endif // __ARM_NEON #if __ARM_NEON float32x4_t _zero = vdupq_n_f32(0.f); diff --git a/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.c b/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.c index f499251ca..026206ef6 100644 --- a/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.c +++ b/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.c @@ -30,12 +30,11 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) - static inline int relu_kernel(const int i, const int id, const void* data, const float* input, float* output, const float slope) { float32x4_t _zero = vdupq_n_f32(0.f); - int step = (( int* )data)[0]; + int step = ((int*)data)[0]; const float* cur_input = input + id * step; float* cur_output = output + id * step; if (slope == 0) @@ -80,8 +79,8 @@ static inline int relu_kernel(const int i, const int id, const void* data, const int relu_arm_run(struct tensor* output_tensor, struct tensor* input_tensor, struct relu_param* relu_param, int num_thread) { - float* data = ( float* )input_tensor->data; - float* out_data = ( float* )output_tensor->data; + float* data = (float*)input_tensor->data; + float* out_data = (float*)output_tensor->data; float negativeslope = relu_param->negative_slope; int chan_num = input_tensor->dims[0] * input_tensor->dims[1]; diff --git a/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.h b/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.h index b4dd59ca8..25439d5d9 100644 --- a/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.h +++ b/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.h @@ -31,7 +31,6 @@ #include "graph/node.h" #include "graph/graph.h" - int relu_arm_run(struct tensor* output_tensor, struct tensor* input_tensor, struct relu_param* relu_param, int num_thread); diff --git a/source/device/cpu/op/relu/cortex-m/relu_cmsis.c b/source/device/cpu/op/relu/cortex-m/relu_cmsis.c index 2e19f4260..72d506512 100644 --- a/source/device/cpu/op/relu/cortex-m/relu_cmsis.c +++ b/source/device/cpu/op/relu/cortex-m/relu_cmsis.c @@ -37,7 +37,6 @@ #include "arm_math.h" - /** * @brief Q7 RELU function * @param[in,out] data pointer to input diff --git a/source/device/cpu/op/relu/relu_kernel_ref.h b/source/device/cpu/op/relu/relu_kernel_ref.h index e4927d200..981b0cb53 100644 --- a/source/device/cpu/op/relu/relu_kernel_ref.h +++ b/source/device/cpu/op/relu/relu_kernel_ref.h @@ -25,12 +25,10 @@ #ifndef __RELU_KERNEL_REF_H__ #define __RELU_KERNEL_REF_H__ - #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" - int ref_relu_fp32(struct tensor* input_tensor, struct tensor* output_tensor, float negative_slope); int ref_relu_fp16(struct tensor* input_tensor, struct tensor* output_tensor, float negative_slope); diff --git a/source/device/cpu/op/relu/relu_kernel_ref_fp16.c b/source/device/cpu/op/relu/relu_kernel_ref_fp16.c index a47f3de15..d5fab13f7 100644 --- a/source/device/cpu/op/relu/relu_kernel_ref_fp16.c +++ b/source/device/cpu/op/relu/relu_kernel_ref_fp16.c @@ -36,7 +36,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - #if MACOS #else int ref_relu_fp16(struct tensor* input_tensor, struct tensor* output_tensor, float negative_slope) @@ -50,7 +49,7 @@ int ref_relu_fp16(struct tensor* input_tensor, struct tensor* output_tensor, flo fp16_t* output_fp16 = (fp16_t*)output_tensor->data; float* input_fp32 = (float*)sys_malloc(total_size * sizeof(float)); - for(int i=0; i< total_size; i++) + for (int i = 0; i < total_size; i++) { input_fp32[i] = fp16_to_fp32(input_fp16[i]); } @@ -78,7 +77,7 @@ int ref_relu_fp16(struct tensor* input_tensor, struct tensor* output_tensor, flo } /* cost fp32 to fp16 */ - for(int i=0; ielem_num; diff --git a/source/device/cpu/op/relu/relu_kernel_ref_int8.c b/source/device/cpu/op/relu/relu_kernel_ref_int8.c index a43f844f8..885444b29 100644 --- a/source/device/cpu/op/relu/relu_kernel_ref_int8.c +++ b/source/device/cpu/op/relu/relu_kernel_ref_int8.c @@ -38,7 +38,6 @@ #include - int ref_relu_int8(struct tensor* input_tensor, struct tensor* output_tensor, float negative_slope) { int total_size = input_tensor->elem_num; @@ -51,9 +50,9 @@ int ref_relu_int8(struct tensor* input_tensor, struct tensor* output_tensor, flo float* data_fp32 = (float*)sys_malloc(total_size * sizeof(float)); - for(int i=0; i 127) diff --git a/source/device/cpu/op/relu/relu_kernel_ref_uint8.c b/source/device/cpu/op/relu/relu_kernel_ref_uint8.c index f687332ff..1b64308cd 100644 --- a/source/device/cpu/op/relu/relu_kernel_ref_uint8.c +++ b/source/device/cpu/op/relu/relu_kernel_ref_uint8.c @@ -38,7 +38,6 @@ #include - int ref_relu_uint8(struct tensor* input_tensor, struct tensor* output_tensor, float negative_slope) { int total_size = input_tensor->elem_num; @@ -53,9 +52,9 @@ int ref_relu_uint8(struct tensor* input_tensor, struct tensor* output_tensor, fl float* data_fp32 = (float*)sys_malloc(total_size * sizeof(float)); - for(int i=0; i 255) diff --git a/source/device/cpu/op/relu/relu_ref.c b/source/device/cpu/op/relu/relu_ref.c index b4a1e66f3..2b0372686 100644 --- a/source/device/cpu/op/relu/relu_ref.c +++ b/source/device/cpu/op/relu/relu_ref.c @@ -38,7 +38,6 @@ #include "relu_kernel_ref.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -56,17 +55,17 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct relu_param* relu_param = ( struct relu_param* )ir_node->op.param_mem; + struct relu_param* relu_param = (struct relu_param*)ir_node->op.param_mem; int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_relu_fp32(input_tensor, output_tensor, relu_param->negative_slope); else if (input_tensor->data_type == TENGINE_DT_FP16) - #if MACOS +#if MACOS TLOG_ERR("FP16 not support mac os"); - #else +#else ret = ref_relu_fp16(input_tensor, output_tensor, relu_param->negative_slope); - #endif +#endif else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_relu_uint8(input_tensor, output_tensor, relu_param->negative_slope); else if (input_tensor->data_type == TENGINE_DT_INT8) diff --git a/source/device/cpu/op/relu1/relu1_ref.c b/source/device/cpu/op/relu1/relu1_ref.c index bee4fd347..337bc5812 100644 --- a/source/device/cpu/op/relu1/relu1_ref.c +++ b/source/device/cpu/op/relu1/relu1_ref.c @@ -32,7 +32,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - int ref_relu1_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { int w = input_tensor->dims[3]; diff --git a/source/device/cpu/op/relu6/relu6_ref.c b/source/device/cpu/op/relu6/relu6_ref.c index 834565e2a..98bfa2006 100644 --- a/source/device/cpu/op/relu6/relu6_ref.c +++ b/source/device/cpu/op/relu6/relu6_ref.c @@ -34,7 +34,6 @@ #include - int ref_relu6_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { int w = input_tensor->dims[3]; @@ -57,12 +56,12 @@ int ref_relu6_uint8(struct tensor* input_tensor, struct tensor* output_tensor, i float* data_fp32 = (float*)sys_malloc(total_size * sizeof(float)); - for(int i = 0; i < total_size; i++) - data_fp32[i] = ((float) input_uint8[i] - (float)input_zero) * input_scale; + for (int i = 0; i < total_size; i++) + data_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; for (int n = 0; n < batch; n++) { -//#pragma omp parallel for num_threads(num_thread) + //#pragma omp parallel for num_threads(num_thread) for (int q = 0; q < channels; q++) { float* src = data_fp32 + batch_step * n + c_step * q; @@ -73,14 +72,14 @@ int ref_relu6_uint8(struct tensor* input_tensor, struct tensor* output_tensor, i dst[i] = src[i]; if (src[i] > 6) dst[i] = 6; - else if(src[i] < 0) + else if (src[i] < 0) dst[i] = 0; } } } // quant - for(int i=0; i 255) @@ -93,7 +92,6 @@ int ref_relu6_uint8(struct tensor* input_tensor, struct tensor* output_tensor, i return 0; } - int ref_relu6_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { int w = input_tensor->dims[3]; @@ -144,10 +142,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - int ret = -1; - if(input_tensor->data_type == TENGINE_DT_FP32) + int ret = -1; + if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_relu6_fp32(input_tensor, output_tensor, exec_graph->num_thread); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_relu6_uint8(input_tensor, output_tensor, exec_graph->num_thread); return ret; diff --git a/source/device/cpu/op/reorg/reorg_ref.c b/source/device/cpu/op/reorg/reorg_ref.c index 84a976afd..3cff628a0 100644 --- a/source/device/cpu/op/reorg/reorg_ref.c +++ b/source/device/cpu/op/reorg/reorg_ref.c @@ -36,7 +36,6 @@ #include - static int ref_reorg_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct reorg_param* param, int num_thread) { @@ -98,7 +97,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct reorg_param* reorg_param = ( struct reorg_param* )ir_node->op.param_mem; + struct reorg_param* reorg_param = (struct reorg_param*)ir_node->op.param_mem; int ret = ref_reorg_fp32(input_tensor, output_tensor, reorg_param, exec_graph->num_thread); if (ret != 0) diff --git a/source/device/cpu/op/reshape/reshape_ref.c b/source/device/cpu/op/reshape/reshape_ref.c index a0e59b6fb..09ddd5f5b 100644 --- a/source/device/cpu/op/reshape/reshape_ref.c +++ b/source/device/cpu/op/reshape/reshape_ref.c @@ -35,7 +35,6 @@ #include #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -62,23 +61,26 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex switch (input_tensor->data_type) { - case TENGINE_DT_FP32: - case TENGINE_DT_INT32: { - size *= 4; - break; - } - case TENGINE_DT_FP16: - case TENGINE_DT_INT16: { - size *= 2; - break; - } - case TENGINE_DT_UINT8: - case TENGINE_DT_INT8: { - size *= 1; - break; - } - default: - return -1; + case TENGINE_DT_FP32: + case TENGINE_DT_INT32: + { + size *= 4; + break; + } + case TENGINE_DT_FP16: + case TENGINE_DT_INT16: + { + size *= 2; + break; + } + case TENGINE_DT_UINT8: + case TENGINE_DT_INT8: + { + size *= 1; + break; + } + default: + return -1; } if (size <= 0) @@ -88,7 +90,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex /* transpose nchw to nhwc */ //check dim size first??? - if(input_tensor->dim_num == 4 && (output_tensor->dim_num == 2||output_tensor->dim_num == 3||output_tensor->dim_num == 4)) + if (input_tensor->dim_num == 4 && (output_tensor->dim_num == 2 || output_tensor->dim_num == 3 || output_tensor->dim_num == 4)) { if (ir_graph->model_layout == TENGINE_LAYOUT_NHWC) { @@ -106,7 +108,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex float* input_fp32 = (float*)input_tensor->data; float* output_fp32 = (float*)output_tensor->data; - float* data_fp32_temp = ( float* )malloc(size); + float* data_fp32_temp = (float*)malloc(size); int index = 0; for (int h = 0; h < in_h; h++) @@ -138,7 +140,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex float* input_fp32 = (float*)input_tensor->data; float* output_fp32 = (float*)output_tensor->data; - float* data_fp32_temp = ( float* )malloc(size); + float* data_fp32_temp = (float*)malloc(size); int index = 0; for (int h = 0; h < in_h; h++) @@ -196,7 +198,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex uint8_t* input_uint8 = (uint8_t*)input_tensor->data; uint8_t* output_uint8 = (uint8_t*)output_tensor->data; - uint8_t* data_uint8_temp = ( uint8_t* )malloc(size); + uint8_t* data_uint8_temp = (uint8_t*)malloc(size); int index = 0; for (int h = 0; h < in_h; h++) @@ -228,7 +230,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex uint8_t* input_uint8 = (uint8_t*)input_tensor->data; uint8_t* output_uint8 = (uint8_t*)output_tensor->data; - uint8_t* data_uint8_temp = ( uint8_t* )malloc(size); + uint8_t* data_uint8_temp = (uint8_t*)malloc(size); int index = 0; for (int h = 0; h < in_h; h++) @@ -263,7 +265,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int8_t* input_int8 = (int8_t*)input_tensor->data; int8_t* output_int8 = (int8_t*)output_tensor->data; - int8_t* data_int8_temp = ( int8_t* )malloc(size); + int8_t* data_int8_temp = (int8_t*)malloc(size); int index = 0; for (int h = 0; h < in_h; h++) @@ -295,7 +297,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int8_t* input_int8 = (int8_t*)input_tensor->data; int8_t* output_int8 = (int8_t*)output_tensor->data; - int8_t* data_int8_temp = ( int8_t* )malloc(size); + int8_t* data_int8_temp = (int8_t*)malloc(size); int index = 0; for (int h = 0; h < in_h; h++) diff --git a/source/device/cpu/op/resize/resize_ref.c b/source/device/cpu/op/resize/resize_ref.c index 6b2a38a5d..c787f3ec6 100644 --- a/source/device/cpu/op/resize/resize_ref.c +++ b/source/device/cpu/op/resize/resize_ref.c @@ -36,7 +36,6 @@ #include - #define T_MAX(a, b) ((a) > (b) ? (a) : (b)) #define T_MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -75,8 +74,7 @@ static void bilinear_resize(float* inp, float* output, int h, int w, int c, floa for (int k = 0; k < c; k++) { int in_index = in_idx + k * in_hw; - output[k * out_hw + out_idx] = inp[in_index] * fx_0 * fy_0 + inp[in_index + w] * fx_0 * fy + - inp[in_index + 1] * fx * fy_0 + inp[in_index + w + 1] * fx * fy; + output[k * out_hw + out_idx] = inp[in_index] * fx_0 * fy_0 + inp[in_index + w] * fx_0 * fy + inp[in_index + 1] * fx * fy_0 + inp[in_index + w + 1] * fx * fy; } } } @@ -94,10 +92,10 @@ static void nearest_neighbor_resize(float* inp, float* out, int h, int w, int c_ output = out + k * oh * ow; for (int i = 0; i < oh; i++) { - si = T_MIN(( int )(i * scale_y), h - 1); + si = T_MIN((int)(i * scale_y), h - 1); for (int j = 0; j < ow; j++) { - sj = T_MIN(( int )(j * scale_x), w - 1); + sj = T_MIN((int)(j * scale_x), w - 1); output[i * ow + j] = input[si * w + sj]; } } @@ -128,14 +126,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct resize_param* resize_param = ( struct resize_param* )ir_node->op.param_mem; + struct resize_param* resize_param = (struct resize_param*)ir_node->op.param_mem; float scale_x = 1.f / resize_param->scale_w; float scale_y = 1.f / resize_param->scale_h; int in_chw = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; int out_chw = output_tensor->dims[1] * output_tensor->dims[2] * output_tensor->dims[3]; - float* input = ( float* )input_tensor->data; - float* output = ( float* )output_tensor->data; + float* input = (float*)input_tensor->data; + float* output = (float*)output_tensor->data; if (resize_param->type == 0) { diff --git a/source/device/cpu/op/reverse/reverse_ref.c b/source/device/cpu/op/reverse/reverse_ref.c index 66b692614..07d4a6a9c 100644 --- a/source/device/cpu/op/reverse/reverse_ref.c +++ b/source/device/cpu/op/reverse/reverse_ref.c @@ -34,10 +34,9 @@ #include - struct reverse_param { - int in_shape[4]; // the dim of the input + int in_shape[4]; // the dim of the input int dim_size; }; @@ -64,8 +63,7 @@ int ref_reverse_fp32(void* input, void* input_axis, void* output, const struct r { for (int x = 0; x < param->in_shape[3]; x++) { - out_ptr[i * in_chw + j * in_hw + y * in_w + x] = - in_ptr[(param->in_shape[0] - 1 - i) * in_chw + j * in_hw + y * in_w + x]; + out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[(param->in_shape[0] - 1 - i) * in_chw + j * in_hw + y * in_w + x]; } } } @@ -82,8 +80,7 @@ int ref_reverse_fp32(void* input, void* input_axis, void* output, const struct r { for (int x = 0; x < param->in_shape[3]; x++) { - out_ptr[i * in_chw + j * in_hw + y * in_w + x] = - in_ptr[i * in_chw + (param->in_shape[1] - 1 - j) * in_hw + y * in_w + x]; + out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[i * in_chw + (param->in_shape[1] - 1 - j) * in_hw + y * in_w + x]; } } } @@ -100,8 +97,7 @@ int ref_reverse_fp32(void* input, void* input_axis, void* output, const struct r { for (int x = 0; x < param->in_shape[3]; x++) { - out_ptr[i * in_chw + j * in_hw + y * in_w + x] = - in_ptr[i * in_chw + j * in_hw + (param->in_shape[2] - 1 - y) * in_w + x]; + out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[i * in_chw + j * in_hw + (param->in_shape[2] - 1 - y) * in_w + x]; } } } @@ -118,8 +114,7 @@ int ref_reverse_fp32(void* input, void* input_axis, void* output, const struct r { for (int x = 0; x < param->in_shape[3]; x++) { - out_ptr[i * in_chw + j * in_hw + y * in_w + x] = - in_ptr[i * in_chw + j * in_hw + y * in_w + (param->in_shape[3] - 1 - x)]; + out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[i * in_chw + j * in_hw + y * in_w + (param->in_shape[3] - 1 - x)]; } } } @@ -157,8 +152,7 @@ int ref_reverse_uint8(void* input, void* input_axis, void* output, const struct { for (int x = 0; x < param->in_shape[3]; x++) { - out_ptr[i * in_chw + j * in_hw + y * in_w + x] = - in_ptr[(param->in_shape[0] - 1 - i) * in_chw + j * in_hw + y * in_w + x]; + out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[(param->in_shape[0] - 1 - i) * in_chw + j * in_hw + y * in_w + x]; } } } @@ -175,8 +169,7 @@ int ref_reverse_uint8(void* input, void* input_axis, void* output, const struct { for (int x = 0; x < param->in_shape[3]; x++) { - out_ptr[i * in_chw + j * in_hw + y * in_w + x] = - in_ptr[i * in_chw + (param->in_shape[1] - 1 - j) * in_hw + y * in_w + x]; + out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[i * in_chw + (param->in_shape[1] - 1 - j) * in_hw + y * in_w + x]; } } } @@ -193,8 +186,7 @@ int ref_reverse_uint8(void* input, void* input_axis, void* output, const struct { for (int x = 0; x < param->in_shape[3]; x++) { - out_ptr[i * in_chw + j * in_hw + y * in_w + x] = - in_ptr[i * in_chw + j * in_hw + (param->in_shape[2] - 1 - y) * in_w + x]; + out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[i * in_chw + j * in_hw + (param->in_shape[2] - 1 - y) * in_w + x]; } } } @@ -211,8 +203,7 @@ int ref_reverse_uint8(void* input, void* input_axis, void* output, const struct { for (int x = 0; x < param->in_shape[3]; x++) { - out_ptr[i * in_chw + j * in_hw + y * in_w + x] = - in_ptr[i * in_chw + j * in_hw + y * in_w + (param->in_shape[3] - 1 - x)]; + out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[i * in_chw + j * in_hw + y * in_w + (param->in_shape[3] - 1 - x)]; } } } @@ -267,9 +258,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_reverse_fp32(input_tensor->data, axis_tensor->data, output_tensor->data, &reverse_param, exec_graph->num_thread); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_reverse_uint8(input_tensor->data, axis_tensor->data, output_tensor->data, &reverse_param, - exec_graph->num_thread); + exec_graph->num_thread); return ret; } diff --git a/source/device/cpu/op/rnn/rnn_ref.c b/source/device/cpu/op/rnn/rnn_ref.c index 37b258689..ee60e4247 100644 --- a/source/device/cpu/op/rnn/rnn_ref.c +++ b/source/device/cpu/op/rnn/rnn_ref.c @@ -37,7 +37,6 @@ #include #include - struct rnn_ref_param { float* init_h_data; @@ -87,10 +86,10 @@ static int do_RNN_step(const float* input, float* init_h, const float* kernel, c int input_total_size = input_size + hidden_size; int batch_cell_size = hidden_size * batch_size; - float* ig = ( float* )malloc(batch_cell_size * sizeof(float)); + float* ig = (float*)malloc(batch_cell_size * sizeof(float)); - float* merged_input = ( float* )malloc(sizeof(float) * batch_size * (input_total_size)); - float* matmul_result = ( float* )malloc(sizeof(float) * batch_size * hidden_size); + float* merged_input = (float*)malloc(sizeof(float) * batch_size * (input_total_size)); + float* matmul_result = (float*)malloc(sizeof(float) * batch_size * hidden_size); // merge input concat_axis_1_rnn(input, init_h, merged_input, batch_size, input_size, hidden_size); @@ -123,7 +122,7 @@ static int do_RNN_step(const float* input, float* init_h, const float* kernel, c static int ref_rnn_fp32(float* input, float* output, struct rnn_ref_param* param) { - float* init_h = ( float* )malloc((unsigned long )param->batch_size * param->hidden_size * sizeof(float)); + float* init_h = (float*)malloc((unsigned long)param->batch_size * param->hidden_size * sizeof(float)); if (param->init_h_data) { for (int i = 0; i < param->batch_size; i++) @@ -133,7 +132,7 @@ static int ref_rnn_fp32(float* input, float* output, struct rnn_ref_param* param } else { - memset(init_h, 0x0, sizeof((unsigned long )param->batch_size * param->hidden_size * sizeof(float))); + memset(init_h, 0x0, sizeof((unsigned long)param->batch_size * param->hidden_size * sizeof(float))); } int ret = 0; @@ -151,7 +150,7 @@ static int ref_rnn_fp32(float* input, float* output, struct rnn_ref_param* param // final_state [batch_size,hidden_size] if (i + param->output_len >= param->seq_lens) { - memcpy(output, init_h, (unsigned long )param->batch_size * param->hidden_size * sizeof(float)); + memcpy(output, init_h, (unsigned long)param->batch_size * param->hidden_size * sizeof(float)); output += param->batch_size * param->hidden_size; } } @@ -180,7 +179,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct tensor* output_tensor; output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); int in_num = ir_node->input_num; - struct rnn_param* rnn_param = ( struct rnn_param* )ir_node->op.param_mem; + struct rnn_param* rnn_param = (struct rnn_param*)ir_node->op.param_mem; struct tensor* init_h_tensor; for (int count = 0; count < in_num; count++) @@ -215,7 +214,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); kernel_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct rnn_param* rnn_param = ( struct rnn_param* )ir_node->op.param_mem; + struct rnn_param* rnn_param = (struct rnn_param*)ir_node->op.param_mem; int input_size = rnn_param->input_size; int hidden_size = rnn_param->hidden_size; @@ -227,7 +226,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int batch_size = input_tensor->dims[1]; int output_len = rnn_param->output_len; - float* init_h = ( float* )malloc((size_t)batch_size * hidden_size * sizeof(float)); + float* init_h = (float*)malloc((size_t)batch_size * hidden_size * sizeof(float)); if (init_h == NULL) { return -1; diff --git a/source/device/cpu/op/roialign/roialign_ref.c b/source/device/cpu/op/roialign/roialign_ref.c index 599998f0a..61de55300 100644 --- a/source/device/cpu/op/roialign/roialign_ref.c +++ b/source/device/cpu/op/roialign/roialign_ref.c @@ -36,7 +36,6 @@ #include - #define T_MAX(a, b) ((a) > (b) ? (a) : (b)) #define T_MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -95,8 +94,8 @@ static int ref_roialign_fp32(struct tensor* input_tensor, struct tensor* roi_ten float roi_w = T_MAX(roi_x2 - roi_x1, 1); float roi_h = T_MAX(roi_y2 - roi_y1, 1); - float bin_size_w = roi_w / ( float )w; - float bin_size_h = roi_h / ( float )h; + float bin_size_w = roi_w / (float)w; + float bin_size_h = roi_h / (float)h; int channel = input_tensor->dims[1]; int in_height = input_tensor->dims[2]; @@ -123,10 +122,10 @@ static int ref_roialign_fp32(struct tensor* input_tensor, struct tensor* roi_ten float hend = roi_y1 + (ph + 1) * bin_size_h; float wend = roi_x1 + (pw + 1) * bin_size_w; - hstart = T_MIN(T_MAX(hstart, 0.f), ( float )in_height); - wstart = T_MIN(T_MAX(wstart, 0.f), ( float )in_width); - hend = T_MIN(T_MAX(hend, 0.f), ( float )in_height); - wend = T_MIN(T_MAX(wend, 0.f), ( float )in_width); + hstart = T_MIN(T_MAX(hstart, 0.f), (float)in_height); + wstart = T_MIN(T_MAX(wstart, 0.f), (float)in_width); + hend = T_MIN(T_MAX(hend, 0.f), (float)in_height); + wend = T_MIN(T_MAX(wend, 0.f), (float)in_width); int bin_grid_h = ceil(hend - hstart); int bin_grid_w = ceil(wend - wstart); @@ -137,18 +136,18 @@ static int ref_roialign_fp32(struct tensor* input_tensor, struct tensor* roi_ten float sum = 0.f; for (int by = 0; by < bin_grid_h; by++) { - float y = hstart + (by + 0.5f) * bin_size_h / ( float )bin_grid_h; + float y = hstart + (by + 0.5f) * bin_size_h / (float)bin_grid_h; for (int bx = 0; bx < bin_grid_w; bx++) { - float x = wstart + (bx + 0.5f) * bin_size_w / ( float )bin_grid_w; + float x = wstart + (bx + 0.5f) * bin_size_w / (float)bin_grid_w; // bilinear interpolate at (x,y) float v = bilinear_interpolate(ptr, in_width, in_height, x, y); sum += v; } } - outptr[pw] = is_empty ? 0.f : (sum / ( float )area); + outptr[pw] = is_empty ? 0.f : (sum / (float)area); } outptr += w; } @@ -178,7 +177,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); roi_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct roialign_param* roialign_param = ( struct roialign_param* )ir_node->op.param_mem; + struct roialign_param* roialign_param = (struct roialign_param*)ir_node->op.param_mem; ref_roialign_fp32(input_tensor, roi_tensor, output_tensor, roialign_param, exec_graph->num_thread); diff --git a/source/device/cpu/op/roipooling/roipooling_ref.c b/source/device/cpu/op/roipooling/roipooling_ref.c index 3a59ca997..cf554bbec 100644 --- a/source/device/cpu/op/roipooling/roipooling_ref.c +++ b/source/device/cpu/op/roipooling/roipooling_ref.c @@ -36,7 +36,6 @@ #include - #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -72,8 +71,8 @@ static int ref_roipooling_fp32(struct tensor* input_tensor, struct tensor* roi_t int roi_w = MAX(roi_x1 - roi_x0 + 1, 1); int roi_h = MAX(roi_y1 - roi_y0 + 1, 1); - float bin_w = ( float )roi_w / ( float )out_w; - float bin_h = ( float )roi_h / ( float )out_h; + float bin_w = (float)roi_w / (float)out_w; + float bin_h = (float)roi_h / (float)out_h; for (int c = 0; c < channel; ++c) { @@ -83,10 +82,10 @@ static int ref_roipooling_fp32(struct tensor* input_tensor, struct tensor* roi_t { for (int w = 0; w < out_w; ++w) { - int h0 = roi_y0 + ( int )floor((double)( h )*bin_h); - int h1 = roi_y0 + ( int )ceil((double)(h + 1) * bin_h); - int w0 = roi_x0 + ( int )floor((double)( w )*bin_w); - int w1 = roi_x0 + ( int )ceil((double)(w + 1) * bin_w); + int h0 = roi_y0 + (int)floor((double)(h)*bin_h); + int h1 = roi_y0 + (int)ceil((double)(h + 1) * bin_h); + int w0 = roi_x0 + (int)floor((double)(w)*bin_w); + int w1 = roi_x0 + (int)ceil((double)(w + 1) * bin_w); h0 = MIN(MAX(h0, 0), in_h); h1 = MIN(MAX(h1, 0), in_h); @@ -134,7 +133,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); roi_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct roipooling_param* roipooling_param = ( struct roipooling_param* )ir_node->op.param_mem; + struct roipooling_param* roipooling_param = (struct roipooling_param*)ir_node->op.param_mem; // set output dims int dims[4]; @@ -156,7 +155,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc struct graph* ir_graph = node->graph; struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); - struct roipooling_param* roipooling_param = ( struct roipooling_param* )node->op.param_mem; + struct roipooling_param* roipooling_param = (struct roipooling_param*)node->op.param_mem; int dims[4]; diff --git a/source/device/cpu/op/round/round_ref.c b/source/device/cpu/op/round/round_ref.c index 1524fa1a0..ca76ee7d6 100644 --- a/source/device/cpu/op/round/round_ref.c +++ b/source/device/cpu/op/round/round_ref.c @@ -34,7 +34,6 @@ #include - int ref_round_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { // dims size = 2 or 3 diff --git a/source/device/cpu/op/rpn/rpn_ref.c b/source/device/cpu/op/rpn/rpn_ref.c index a9d20813d..6112e332b 100644 --- a/source/device/cpu/op/rpn/rpn_ref.c +++ b/source/device/cpu/op/rpn/rpn_ref.c @@ -38,20 +38,19 @@ #include #include - struct anchor_box { - float x0; // xmin - float y0; // ymin - float x1; // xmax - float y1; // ymax + float x0; // xmin + float y0; // ymin + float x1; // xmax + float y1; // ymax }; struct RPN_Box { - float x0; // xmin - float y0; // ymin - float x1; // xmax - float y1; // ymax + float x0; // xmin + float y0; // ymin + float x1; // xmax + float y1; // ymax float score; }; @@ -174,9 +173,9 @@ void nms_rpn_boxes(struct RPN_Box* input_boxes, int* size, float nms_thresh) int input_size = *size; int output_size = 0; - struct RPN_Box* output_boxes = ( struct RPN_Box* )sys_malloc(sizeof(struct RPN_Box) * input_size); - float* areas = ( float* )sys_malloc(sizeof(float) * input_size); - int* picked = ( int* )sys_malloc(sizeof(int) * input_size); + struct RPN_Box* output_boxes = (struct RPN_Box*)sys_malloc(sizeof(struct RPN_Box) * input_size); + float* areas = (float*)sys_malloc(sizeof(float) * input_size); + int* picked = (int*)sys_malloc(sizeof(int) * input_size); for (int i = 0; i < input_size; ++i) { @@ -220,13 +219,13 @@ void ref_proposal_local_anchor(int feat_height, int feat_width, int feat_stride, float* local_anchors) { int feat_size = feat_height * feat_width; - int num_anchors = ( int )anchors->elem_num; + int num_anchors = (int)anchors->elem_num; for (int i = 0; i < num_anchors; ++i) { for (int j = 0; j < feat_height; j++) for (int k = 0; k < feat_width; k++) { - Anchor_t anchor_val = *( Anchor_t* )(get_vector_data(anchors, i)); + Anchor_t anchor_val = *(Anchor_t*)(get_vector_data(anchors, i)); local_anchors[(i * 4 + 0) * feat_size + j * feat_width + k] = anchor_val.x0 + k * feat_stride; local_anchors[(i * 4 + 1) * feat_size + j * feat_width + k] = anchor_val.y0 + j * feat_stride; local_anchors[(i * 4 + 2) * feat_size + j * feat_width + k] = anchor_val.x1 + k * feat_stride; @@ -242,7 +241,7 @@ int ref_rpn_fp32(const float* score, float* featmap, float* anchors, float* outp int featmap_size = param->feat_height * param->feat_width * param->feat_chan; int max_num_boxes = featmap_size / 4; - struct RPN_Box* boxes = ( struct RPN_Box* )sys_malloc(max_num_boxes * sizeof(struct RPN_Box)); + struct RPN_Box* boxes = (struct RPN_Box*)sys_malloc(max_num_boxes * sizeof(struct RPN_Box)); bbox_tranform_inv(featmap, anchors, param); @@ -301,7 +300,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; - rpn_param_t* _param = ( struct rpn_param* )(ir_node->op.param_mem); + rpn_param_t* _param = (struct rpn_param*)(ir_node->op.param_mem); struct graph* ir_graph = ir_node->graph; struct tensor* score_tensor; struct tensor* featmap_tensor; @@ -315,11 +314,11 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex const void* score_org = score_tensor->data; void* featmap_org = featmap_tensor->data; - const float* info_org = ( float* )info_tensor->data; + const float* info_org = (float*)info_tensor->data; void* output_org = output_tensor->data; struct rpn_param_ref param; - param.num_anchors = ( int )_param->anchors_->elem_num; + param.num_anchors = (int)_param->anchors_->elem_num; param.feat_chan = featmap_tensor->dims[1]; param.feat_height = featmap_tensor->dims[2]; param.feat_width = featmap_tensor->dims[3]; @@ -334,7 +333,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex param.min_size = _param->min_size; param.feat_stride = _param->feat_stride; int size = param.num_anchors * 4 * feat_size; - float* local_anchors = ( float* )sys_malloc(size * sizeof(float)); + float* local_anchors = (float*)sys_malloc(size * sizeof(float)); ref_proposal_local_anchor(featmap_tensor->dims[2], featmap_tensor->dims[3], _param->feat_stride, _param->anchors_, local_anchors); diff --git a/source/device/cpu/op/scale/scale_ref.c b/source/device/cpu/op/scale/scale_ref.c index 7a1c30d51..426fcd2c8 100644 --- a/source/device/cpu/op/scale/scale_ref.c +++ b/source/device/cpu/op/scale/scale_ref.c @@ -34,7 +34,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - int ref_scale_fp32(struct tensor* input_tensor, struct tensor* gamma_tensor, struct tensor* beta_tensor, struct tensor* output_tensor, struct scale_param* param, int num_thread) { @@ -110,7 +109,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex beta_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct scale_param* scale_param = ( struct scale_param* )ir_node->op.param_mem; + struct scale_param* scale_param = (struct scale_param*)ir_node->op.param_mem; ref_scale_fp32(input_tensor, gamma_tensor, beta_tensor, output_tensor, scale_param, exec_graph->num_thread); diff --git a/source/device/cpu/op/scatter/scatter_ref.c b/source/device/cpu/op/scatter/scatter_ref.c index ca5b8b598..cb0e2ed69 100644 --- a/source/device/cpu/op/scatter/scatter_ref.c +++ b/source/device/cpu/op/scatter/scatter_ref.c @@ -38,7 +38,6 @@ #include #include - struct ref_scatter_param { int axis; @@ -51,11 +50,9 @@ struct ref_scatter_param int indiceSize; }; - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct ref_scatter_param* scatter_op_param = - (struct ref_scatter_param*)sys_malloc(sizeof(struct ref_scatter_param)); + struct ref_scatter_param* scatter_op_param = (struct ref_scatter_param*)sys_malloc(sizeof(struct ref_scatter_param)); memset(scatter_op_param, 0, sizeof(struct ref_scatter_param)); exec_node->ops_priv = scatter_op_param; return 0; @@ -72,47 +69,52 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - struct ref_scatter_param* scatter_op_param = ( struct ref_scatter_param* )exec_node->ops_priv; + struct ref_scatter_param* scatter_op_param = (struct ref_scatter_param*)exec_node->ops_priv; struct scatter_param* param = (struct scatter_param*)(ir_node->op.param_mem); scatter_op_param->dim_size = input_tensor->dim_num; scatter_op_param->is_onnx = param->is_onnx; - for(int i = 0; i < 4; i++){ + for (int i = 0; i < 4; i++) + { scatter_op_param->dims[i] = 1; } - - if(scatter_op_param->is_onnx){ + + if (scatter_op_param->is_onnx) + { struct tensor* indices_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); int indicesDimsSize = indices_tensor->dim_num; - scatter_op_param->indice_dim = (int*)malloc(sizeof(int)*indicesDimsSize); + scatter_op_param->indice_dim = (int*)malloc(sizeof(int) * indicesDimsSize); scatter_op_param->indiceSize = indicesDimsSize; - + struct tensor* updates_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); int updatesDimsSize = updates_tensor->dim_num; - scatter_op_param->update_dim = (int*)malloc(sizeof(int)*updatesDimsSize); + scatter_op_param->update_dim = (int*)malloc(sizeof(int) * updatesDimsSize); scatter_op_param->updateSize = updatesDimsSize; } return 0; } -static int ref_scatter_fp32(float* input, float* output, int* indices, float* updates, struct ref_scatter_param* op_param){ +static int ref_scatter_fp32(float* input, float* output, int* indices, float* updates, struct ref_scatter_param* op_param) +{ int axis = op_param->axis; bool is_onnx = op_param->is_onnx; TLOG_ERR("indices %f %f \n", updates[0], updates[1]); TLOG_ERR("indices %d %d \n", indices[0], indices[1]); int outSize = 1; - for(int i = 0; i < op_param->dim_size; i++){ - outSize *= op_param->dims[4-op_param->dim_size+i]; + for (int i = 0; i < op_param->dim_size; i++) + { + outSize *= op_param->dims[4 - op_param->dim_size + i]; } - memcpy(output, input, sizeof(float)*outSize); + memcpy(output, input, sizeof(float) * outSize); int calIndexDims[4]; int realIndexDims[4]; - int outCalAxis[4] ; + int outCalAxis[4]; int outRealAxis[4]; int updateDims[4]; - for(int i = 0; i< 4; i++){ + for (int i = 0; i < 4; i++) + { calIndexDims[i] = 0; realIndexDims[i] = 1; outCalAxis[i] = 0; @@ -122,16 +124,18 @@ static int ref_scatter_fp32(float* input, float* output, int* indices, float* up int diff = 4 - op_param->updateSize; //TLOG_ERR("update size: %d \n", op_param->updateSize); - for(int i=0; i < op_param->updateSize; i++){ + for (int i = 0; i < op_param->updateSize; i++) + { calIndexDims[diff + i] = op_param->update_dim[i]; realIndexDims[diff + i] = op_param->update_dim[i]; - TLOG_ERR("%d %d \n",calIndexDims[diff + i], realIndexDims[diff + i]); + TLOG_ERR("%d %d \n", calIndexDims[diff + i], realIndexDims[diff + i]); } diff = 4 - op_param->dim_size; - for(int i = 0; i < op_param->dim_size; i++){ + for (int i = 0; i < op_param->dim_size; i++) + { outCalAxis[diff + i] = 1; - outRealAxis[diff+i] = op_param->dims[diff+i]; + outRealAxis[diff + i] = op_param->dims[diff + i]; } outCalAxis[diff + op_param->axis] = 2; @@ -142,123 +146,151 @@ static int ref_scatter_fp32(float* input, float* output, int* indices, float* up TLOG_ERR("Ready for test\n"); // TLOG_ERR("reaslIndexDims: %d %d %d %d \n", realIndexDims[0] ,realIndexDims[1], realIndexDims[2],realIndexDims[3]); // op_param->axis = -1; - if(is_onnx){ - if(op_param->axis != -1){ - if(op_param->dim_size == 1){ + if (is_onnx) + { + if (op_param->axis != -1) + { + if (op_param->dim_size == 1) + { TLOG_ERR("dims 1\n"); - for(int n = 0; n < realIndexDims[0]; n++){ - for(int c = 0; c < realIndexDims[1]; c++){ - for(int h = 0; h < realIndexDims[2]; h++){ - for(int w = 0; w < realIndexDims[3]; w++){ - - int ii = n*calIndexDims[1]*calIndexDims[2]*calIndexDims[3]+c*calIndexDims[2]*calIndexDims[3]+h*calIndexDims[3]+w; + for (int n = 0; n < realIndexDims[0]; n++) + { + for (int c = 0; c < realIndexDims[1]; c++) + { + for (int h = 0; h < realIndexDims[2]; h++) + { + for (int w = 0; w < realIndexDims[3]; w++) + { + int ii = n * calIndexDims[1] * calIndexDims[2] * calIndexDims[3] + c * calIndexDims[2] * calIndexDims[3] + h * calIndexDims[3] + w; int index = indices[ii]; - if(index < 0){ + if (index < 0) + { index = inW + index + 1; } float value = updates[ii]; int outIndex = index; output[outIndex] = value; - - } } } } - } else if(op_param->dim_size == 2){ + } + else if (op_param->dim_size == 2) + { TLOG_ERR("dims 2 in \n"); - for(int n = 0; n < realIndexDims[0]; n++){ - for(int c = 0; c < realIndexDims[1]; c++){ - for(int h = 0; h < realIndexDims[2]; h++){ - for(int w = 0; w < realIndexDims[3]; w++){ + for (int n = 0; n < realIndexDims[0]; n++) + { + for (int c = 0; c < realIndexDims[1]; c++) + { + for (int h = 0; h < realIndexDims[2]; h++) + { + for (int w = 0; w < realIndexDims[3]; w++) + { TLOG_ERR("cadsfasd \n"); - int ii = n*calIndexDims[1]*calIndexDims[2]*calIndexDims[3]+c*calIndexDims[2]*calIndexDims[3]+h*calIndexDims[3]+w; + int ii = n * calIndexDims[1] * calIndexDims[2] * calIndexDims[3] + c * calIndexDims[2] * calIndexDims[3] + h * calIndexDims[3] + w; TLOG_ERR("cadsfasd 2 %d \n", ii); int index = indices[ii]; TLOG_ERR("cadsfasd 3\n"); float value = updates[ii]; TLOG_ERR("dims 2ddd\n"); - if(op_param->axis == 1){ + if (op_param->axis == 1) + { index = index < 0 ? inW + index + 1 : index; - - int outIndex = h*realIndexDims[3] + index; + + int outIndex = h * realIndexDims[3] + index; TLOG_ERR("%d %d \n", index, outIndex); output[outIndex] = value; } - if(op_param->axis == 0){ - index = index < 0 ? inH + index + 1: index; - - int outIndex = index*realIndexDims[3] + w; + if (op_param->axis == 0) + { + index = index < 0 ? inH + index + 1 : index; + + int outIndex = index * realIndexDims[3] + w; TLOG_ERR("%d %d \n", index, outIndex); output[outIndex] = value; } - } } } } - } else if(op_param->dim_size == 3) { + } + else if (op_param->dim_size == 3) + { TLOG_ERR("dims 3\n"); - for(int n = 0; n < realIndexDims[0]; n++){ - for(int c = 0; c < realIndexDims[1]; c++){ - for(int h = 0; h < realIndexDims[2]; h++){ - for(int w = 0; w < realIndexDims[3]; w++){ - - int ii = n*calIndexDims[1]*calIndexDims[2]*calIndexDims[3]+c*calIndexDims[2]*calIndexDims[3]+h*calIndexDims[3]+w; + for (int n = 0; n < realIndexDims[0]; n++) + { + for (int c = 0; c < realIndexDims[1]; c++) + { + for (int h = 0; h < realIndexDims[2]; h++) + { + for (int w = 0; w < realIndexDims[3]; w++) + { + int ii = n * calIndexDims[1] * calIndexDims[2] * calIndexDims[3] + c * calIndexDims[2] * calIndexDims[3] + h * calIndexDims[3] + w; int index = indices[ii]; float value = updates[ii]; - if(op_param->axis == 1){ - index = index < 0 ? inH + index + 1: index; - int outIndex = c*inH*inW + index*realIndexDims[3] + w; + if (op_param->axis == 1) + { + index = index < 0 ? inH + index + 1 : index; + int outIndex = c * inH * inW + index * realIndexDims[3] + w; output[outIndex] = value; } - if(op_param->axis == 0){ - index = index < 0 ? inC + index + 1: index; + if (op_param->axis == 0) + { + index = index < 0 ? inC + index + 1 : index; // TLOG_ERR("%d \n", index); - int outIndex = index*inH*inW + h*realIndexDims[3] + w; + int outIndex = index * inH * inW + h * realIndexDims[3] + w; output[outIndex] = value; } - if(op_param->axis == 2){ - index = index < 0 ? inW + index + 1: index; - int outIndex = c*inH*inW + h*realIndexDims[3] + index; + if (op_param->axis == 2) + { + index = index < 0 ? inW + index + 1 : index; + int outIndex = c * inH * inW + h * realIndexDims[3] + index; output[outIndex] = value; } - } } } } - } else if(op_param->dim_size == 4){ + } + else if (op_param->dim_size == 4) + { TLOG_ERR("dims 4\n"); - for(int n = 0; n < realIndexDims[0]; n++){ - for(int c = 0; c < realIndexDims[1]; c++){ - for(int h = 0; h < realIndexDims[2]; h++){ - for(int w = 0; w < realIndexDims[3]; w++){ - - int ii = n*calIndexDims[1]*calIndexDims[2]*calIndexDims[3]+c*calIndexDims[2]*calIndexDims[3]+h*calIndexDims[3]+w; + for (int n = 0; n < realIndexDims[0]; n++) + { + for (int c = 0; c < realIndexDims[1]; c++) + { + for (int h = 0; h < realIndexDims[2]; h++) + { + for (int w = 0; w < realIndexDims[3]; w++) + { + int ii = n * calIndexDims[1] * calIndexDims[2] * calIndexDims[3] + c * calIndexDims[2] * calIndexDims[3] + h * calIndexDims[3] + w; int index = indices[ii]; float value = updates[ii]; - if(op_param->axis == 1){ - index = index < 0 ? inC + index + 1: index; - int outIndex = n*inC*inH*inW + index*inH*inW + h*realIndexDims[3] + w; + if (op_param->axis == 1) + { + index = index < 0 ? inC + index + 1 : index; + int outIndex = n * inC * inH * inW + index * inH * inW + h * realIndexDims[3] + w; output[outIndex] = value; } - if(op_param->axis == 0){ - index = index < 0 ? inN + index + 1: index; - int outIndex = index*inC*inH*inW + c*inH*inW + h*realIndexDims[3] + w; + if (op_param->axis == 0) + { + index = index < 0 ? inN + index + 1 : index; + int outIndex = index * inC * inH * inW + c * inH * inW + h * realIndexDims[3] + w; output[outIndex] = value; } - if(op_param->axis == 2){ - index = index < 0 ? inH + index + 1: index; - int outIndex = n*inC*inH*inW + c*inH*inW + index*realIndexDims[3] + w; + if (op_param->axis == 2) + { + index = index < 0 ? inH + index + 1 : index; + int outIndex = n * inC * inH * inW + c * inH * inW + index * realIndexDims[3] + w; output[outIndex] = value; } - if(op_param->axis == 3){ - index = index < 0 ? inW + index + 1: index; - int outIndex = n*inC*inH*inW + c*inH*inW + h*realIndexDims[3] + index; + if (op_param->axis == 3) + { + index = index < 0 ? inW + index + 1 : index; + int outIndex = n * inC * inH * inW + c * inH * inW + h * realIndexDims[3] + index; output[outIndex] = value; } } @@ -266,38 +298,46 @@ static int ref_scatter_fp32(float* input, float* output, int* indices, float* up } } } - } else { + } + else + { int data_dims[4] = {1}; - for(int i = 0; i < op_param->dim_size; i++){ + for (int i = 0; i < op_param->dim_size; i++) + { data_dims[3 - i] = op_param->dims[i]; } - int iCHW = data_dims[1]* data_dims[2]* data_dims[3]; - int iHW = data_dims[2]*data_dims[3]; - - - for(int i = 0; i < op_param->updateSize; i++){ + int iCHW = data_dims[1] * data_dims[2] * data_dims[3]; + int iHW = data_dims[2] * data_dims[3]; + + for (int i = 0; i < op_param->updateSize; i++) + { updateDims[4 - op_param->updateSize + i] = op_param->update_dim[i]; } - int uCHW = updateDims[1]*updateDims[2]*updateDims[3]; - int uHW = updateDims[2]*updateDims[3]; - for(int n = 0; n < updateDims[0]; n++){ - for(int c = 0; c < updateDims[1]; c++){ - for(int h = 0; h < updateDims[2]; h++){ - for(int w = 0; w < updateDims[3]; w++){ - int updateIndex = n*uCHW + c * uHW + h*updateDims[3] + w; + int uCHW = updateDims[1] * updateDims[2] * updateDims[3]; + int uHW = updateDims[2] * updateDims[3]; + for (int n = 0; n < updateDims[0]; n++) + { + for (int c = 0; c < updateDims[1]; c++) + { + for (int h = 0; h < updateDims[2]; h++) + { + for (int w = 0; w < updateDims[3]; w++) + { + int updateIndex = n * uCHW + c * uHW + h * updateDims[3] + w; int value = updates[updateIndex]; int index = indices[updateIndex]; - int outIndex = n*iCHW + c*iHW + w * updateDims[2] + index; + int outIndex = n * iCHW + c * iHW + w * updateDims[2] + index; output[outIndex] = value; - } - } + } } } } - } else { + } + else + { return -1; } @@ -309,45 +349,51 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - struct ref_scatter_param* scatter_op_param = ( struct ref_scatter_param* )exec_node->ops_priv; + struct ref_scatter_param* scatter_op_param = (struct ref_scatter_param*)exec_node->ops_priv; struct scatter_param* param = (struct scatter_param*)(ir_node->op.param_mem); int inputDimsSize = input_tensor->dim_num; - for(int i = 0; i < inputDimsSize; i++){ - scatter_op_param->dims[4-inputDimsSize+i] = input_tensor->dims[i]; + for (int i = 0; i < inputDimsSize; i++) + { + scatter_op_param->dims[4 - inputDimsSize + i] = input_tensor->dims[i]; } struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); scatter_op_param->axis = param->axis; scatter_op_param->is_onnx = param->is_onnx; - if(scatter_op_param->is_onnx){ + if (scatter_op_param->is_onnx) + { struct tensor* indices_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); int indicesDimsSize = indices_tensor->dim_num; - for(int i = 0; i < indicesDimsSize; i++){ + for (int i = 0; i < indicesDimsSize; i++) + { scatter_op_param->indice_dim[i] = indices_tensor->dims[i]; } struct tensor* updates_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); int updatesDimsSize = updates_tensor->dim_num; - for(int i = 0 ; i < updatesDimsSize; i++){ + for (int i = 0; i < updatesDimsSize; i++) + { scatter_op_param->update_dim[i] = updates_tensor->dims[i]; } - TLOG_ERR("Indecues %d \n",indicesDimsSize); - - int ret = ref_scatter_fp32((float*)input_tensor->data, (float*)output_tensor->data, - (int*)indices_tensor->data, (float*)updates_tensor->data, scatter_op_param); - if(ret < 0){ + TLOG_ERR("Indecues %d \n", indicesDimsSize); + + int ret = ref_scatter_fp32((float*)input_tensor->data, (float*)output_tensor->data, + (int*)indices_tensor->data, (float*)updates_tensor->data, scatter_op_param); + if (ret < 0) + { TLOG_ERR("Scatter reference error \n"); } - } else { + } + else + { return -1; } - return 0; } static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct ref_scatter_param* scatter_op_param = ( struct ref_scatter_param* )exec_node->ops_priv; + struct ref_scatter_param* scatter_op_param = (struct ref_scatter_param*)exec_node->ops_priv; sys_free(scatter_op_param->indice_dim); sys_free(scatter_op_param->update_dim); diff --git a/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c b/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c index 6fbea330e..026625d71 100644 --- a/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c +++ b/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c @@ -34,7 +34,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -59,7 +58,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct selu_param* selu_param = ( struct selu_param* )ir_node->op.param_mem; + struct selu_param* selu_param = (struct selu_param*)ir_node->op.param_mem; int num_thread = exec_graph->num_thread; diff --git a/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.c b/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.c index 28f68424c..4da7f06f8 100644 --- a/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.c +++ b/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.c @@ -30,11 +30,10 @@ #include - void selu_kernel(int i, int id, void* data, const float* input, float* output, float alpha, float lambda) { float alpha_lambda = alpha * lambda; - int step = (( int* )data)[0]; + int step = ((int*)data)[0]; float32x4_t _one = vdupq_n_f32(1.f); float32x4_t _zero = vdupq_n_f32(0.f); float32x4_t _alpha_lambda = vdupq_n_f32(alpha_lambda); @@ -71,8 +70,8 @@ void selu_kernel(int i, int id, void* data, const float* input, float* output, f int selu_run(struct tensor* output_tensor, struct tensor* input_tensor, struct selu_param* selu_param, int num_thread) { - float* data = ( float* )input_tensor->data; - float* out_data = ( float* )output_tensor->data; + float* data = (float*)input_tensor->data; + float* out_data = (float*)output_tensor->data; float alpha = selu_param->alpha; float lambda = selu_param->lambda; diff --git a/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.h b/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.h index 91220aa06..3ed2955cd 100644 --- a/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.h +++ b/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.h @@ -29,7 +29,6 @@ #include "graph/tensor.h" - int selu_run(struct tensor* output_tensor, struct tensor* input_tensor, struct selu_param* selu_param, int num_thread); #endif diff --git a/source/device/cpu/op/selu/selu_ref.c b/source/device/cpu/op/selu/selu_ref.c index 789e1df1b..557f8105d 100644 --- a/source/device/cpu/op/selu/selu_ref.c +++ b/source/device/cpu/op/selu/selu_ref.c @@ -36,12 +36,11 @@ #include - int ref_selu_fp32(struct tensor* output_tensor, struct tensor* input_tensor, struct selu_param* selu_param, int num_thread) { - float* data = ( float* )input_tensor->data; - float* out_data = ( float* )output_tensor->data; + float* data = (float*)input_tensor->data; + float* out_data = (float*)output_tensor->data; float alpha = selu_param->alpha; float lambda = selu_param->lambda; float alpha_lambda = alpha * lambda; @@ -53,8 +52,8 @@ int ref_selu_fp32(struct tensor* output_tensor, struct tensor* input_tensor, str for (int i = 0; i < chan_num; i++) { int offset = i * chan_size; - float* input_data = ( float* )input_tensor->data + i * chan_size; - float* output_data = ( float* )output_tensor->data + i * chan_size; + float* input_data = (float*)input_tensor->data + i * chan_size; + float* output_data = (float*)output_tensor->data + i * chan_size; for (int j = 0; j < chan_size; j++) { @@ -69,7 +68,7 @@ int ref_selu_fp32(struct tensor* output_tensor, struct tensor* input_tensor, str } int ref_selu_uint8(struct tensor* output_tensor, struct tensor* input_tensor, struct selu_param* selu_param, - int num_thread) + int num_thread) { /* dequant */ uint8_t* input_uint8 = (uint8_t*)input_tensor->data; @@ -81,12 +80,12 @@ int ref_selu_uint8(struct tensor* output_tensor, struct tensor* input_tensor, st int input_size = input_tensor->elem_num; int output_size = output_tensor->elem_num; - float* input_data = ( float* )sys_malloc(input_size * sizeof(float)); - float* output_data = ( float* )sys_malloc(output_size * sizeof(float)); + float* input_data = (float*)sys_malloc(input_size * sizeof(float)); + float* output_data = (float*)sys_malloc(output_size * sizeof(float)); for (int i = 0; i < input_size; i++) { - input_data[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale; + input_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } float alpha = selu_param->alpha; @@ -100,8 +99,8 @@ int ref_selu_uint8(struct tensor* output_tensor, struct tensor* input_tensor, st for (int i = 0; i < chan_num; i++) { int offset = i * chan_size; - input_data = ( float* )input_tensor->data + i * chan_size; - output_data = ( float* )output_tensor->data + i * chan_size; + input_data = (float*)input_tensor->data + i * chan_size; + output_data = (float*)output_tensor->data + i * chan_size; for (int j = 0; j < chan_size; j++) { @@ -151,14 +150,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct selu_param* selu_param = ( struct selu_param* )ir_node->op.param_mem; + struct selu_param* selu_param = (struct selu_param*)ir_node->op.param_mem; int num_thread = exec_graph->num_thread; - int ret = -1; + int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_selu_fp32(output_tensor, input_tensor, selu_param, num_thread); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_selu_uint8(output_tensor, input_tensor, selu_param, num_thread); return ret; diff --git a/source/device/cpu/op/shape/shape_ref.c b/source/device/cpu/op/shape/shape_ref.c index c515c8505..ec27a9c41 100644 --- a/source/device/cpu/op/shape/shape_ref.c +++ b/source/device/cpu/op/shape/shape_ref.c @@ -32,7 +32,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -67,7 +66,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex const int* inDims = input_tensor->dims; int inDims_size = input_tensor->dim_num; int* outData = (int*)output_tensor->data; - for(int i = 0; i < inDims_size; i++){ + for (int i = 0; i < inDims_size; i++) + { *outData = inDims[i]; outData++; } diff --git a/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c b/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c index f1838ccb7..545bf2fc0 100644 --- a/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c +++ b/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c @@ -37,7 +37,6 @@ #include #include - int ref_shuffle_channel_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct shuffle_channel_param* param) { int batch = input_tensor->dims[0]; @@ -141,7 +140,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct struct graph* ir_graph = ir_node->graph; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - if (input_tensor->dim_num !=4) + if (input_tensor->dim_num != 4) { TLOG_ERR("dims num is not 4, not support shuffle channel\n"); return -1; @@ -156,14 +155,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct graph* ir_graph = ir_node->graph; struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct shuffle_channel_param* param = ( struct shuffle_channel_param* )ir_node->op.param_mem; + struct shuffle_channel_param* param = (struct shuffle_channel_param*)ir_node->op.param_mem; - int ret = -1; + int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_shuffle_channel_fp32(input_tensor, output_tensor, param); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_shuffle_channel_uint8(input_tensor, output_tensor, param); - else if(input_tensor->data_type == TENGINE_DT_INT8) + else if (input_tensor->data_type == TENGINE_DT_INT8) ret = ref_shuffle_channel_int8(input_tensor, output_tensor, param); else TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type); diff --git a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c index 3fb563818..1b7b3fbaf 100644 --- a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c +++ b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c @@ -34,7 +34,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; diff --git a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.c b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.c index 47713d9dc..af186d50c 100644 --- a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.c +++ b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.c @@ -28,7 +28,6 @@ #include - #define SIGMOID_MAX(a, b) ((a) > (b) ? (a) : (b)) #define SIGMOID_MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -55,9 +54,9 @@ static float fast_exp1(float x) float t = x * 1.442695041f; float fi = floorf(t); float f = t - fi; - int i = ( int )fi; + int i = (int)fi; cvt.f = (0.3371894346f * f + 0.657636276f) * f + 1.00172476f; /* compute 2^f */ - cvt.i += (i << 23); /* scale by 2^i */ + cvt.i += (i << 23); /* scale by 2^i */ return cvt.f; } @@ -71,12 +70,10 @@ static float acl_exp(float x) /* exp(x) = = 2^k * exp(x-k ln2); k = round(x/ln2)*/ float t = x * 1.4426950408f; - float f = x - (( int )t) * 0.6931471805f; - int i = ( int )t; + float f = x - ((int)t) * 0.6931471805f; + int i = (int)t; /// cvt.f = (0.3371894346f * f + 0.657636276f) * f + 1.00172476f; /* compute 2^f */ - cvt.f = - 1 + f * 1.00000011921f + (0.0416598916054f + f * 0.00833693705499f) * f * f + - ((0.500000596046f + f * 0.166665703058f) + (0.0014122662833f + f * 0.000195780929062f) * f * f) * f * f * f * f; + cvt.f = 1 + f * 1.00000011921f + (0.0416598916054f + f * 0.00833693705499f) * f * f + ((0.500000596046f + f * 0.166665703058f) + (0.0014122662833f + f * 0.000195780929062f) * f * f) * f * f * f * f; cvt.i += (i << 23); /* scale by 2^i */ return cvt.f; } @@ -125,8 +122,8 @@ static inline float32x4_t vtaylor_polyq_f32(float32x4_t x, struct tab* coeffs) /* ACL exp function impelement */ static inline float32x4_t vexpq_f32(float32x4_t x) { - const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2) - const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2) + const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2) + const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2) const float32x4_t CONST_0 = vdupq_n_f32(0.f); const int32x4_t CONST_NEGATIVE_126 = vdupq_n_s32(-126); @@ -148,7 +145,7 @@ exp(x) = lim(1+x/n)^n // n=10 */ static inline float32x4_t vexpq10_f32(float32x4_t x) { - x = vmlaq_n_f32(vdupq_n_f32(1.0f), x, 0.0009765625f); // n = 10 + x = vmlaq_n_f32(vdupq_n_f32(1.0f), x, 0.0009765625f); // n = 10 x = vmulq_f32(x, x); x = vmulq_f32(x, x); x = vmulq_f32(x, x); @@ -165,8 +162,8 @@ static inline float32x4_t vexpq10_f32(float32x4_t x) int sigmoid_run(struct tensor* output_tensor, struct tensor* input_tensor, int num_thread) { init_tab(); - float* input = ( float* )input_tensor->data; - float* output = ( float* )output_tensor->data; + float* input = (float*)input_tensor->data; + float* output = (float*)output_tensor->data; float32x4_t min = vdupq_n_f32(-30.0f); float32x4_t max = vdupq_n_f32(30.0f); diff --git a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.h b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.h index c0fc8a80f..276ee54e8 100644 --- a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.h +++ b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.h @@ -29,7 +29,6 @@ #include - struct tab { float32x4_t a0; diff --git a/source/device/cpu/op/sigmoid/sigmoid_ref.c b/source/device/cpu/op/sigmoid/sigmoid_ref.c index a347c1684..fd2286f65 100644 --- a/source/device/cpu/op/sigmoid/sigmoid_ref.c +++ b/source/device/cpu/op/sigmoid/sigmoid_ref.c @@ -34,7 +34,6 @@ #include - #define SIGMOID_MAX(a, b) ((a) > (b) ? (a) : (b)) #define SIGMOID_MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -44,19 +43,19 @@ int ref_sigmoid_fp32(struct tensor* input_tensor, struct tensor* output_tensor, if (dim_num == 4) { - int batch = input_tensor->dims[0]; + int batch = input_tensor->dims[0]; int channel = input_tensor->dims[1]; - int cstep = input_tensor->dims[2] * input_tensor->dims[3]; - int bstep = channel * cstep; + int cstep = input_tensor->dims[2] * input_tensor->dims[3]; + int bstep = channel * cstep; - for (int n=0; ndata + n * bstep + c * cstep; + float* input_data = (float*)input_tensor->data + n * bstep + c * cstep; float* output_data = (float*)output_tensor->data + n * bstep + c * cstep; - for (int i=0; idata; uint8_t* output_uint8 = (uint8_t*)output_tensor->data; @@ -95,12 +93,12 @@ int ref_sigmoid_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int input_size = input_tensor->elem_num; int output_size = output_tensor->elem_num; - float* input_fp32 = ( float* )sys_malloc(input_size * sizeof(float)); - float* output_fp32 = ( float* )sys_malloc(output_size * sizeof(float)); + float* input_fp32 = (float*)sys_malloc(input_size * sizeof(float)); + float* output_fp32 = (float*)sys_malloc(output_size * sizeof(float)); for (int i = 0; i < input_size; i++) { - input_fp32[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale; + input_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } for (int i = 0; i < input_size; i++) @@ -123,7 +121,7 @@ int ref_sigmoid_uint8(struct tensor* input_tensor, struct tensor* output_tensor, } sys_free(input_fp32); - sys_free(output_fp32); + sys_free(output_fp32); return 0; } @@ -149,8 +147,7 @@ static int reshape_node(struct node_ops* node_ops, struct exec_node* exec_node, input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - if (input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || - input_tensor->dims[3] != output_tensor->dims[3]) + if (input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || input_tensor->dims[3] != output_tensor->dims[3]) ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num); return ret; @@ -168,12 +165,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - int ret = -1; + int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_sigmoid_fp32(input_tensor, output_tensor, exec_graph->num_thread); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_sigmoid_uint8(input_tensor, output_tensor, exec_graph->num_thread); - + return ret; } diff --git a/source/device/cpu/op/slice/slice_ref.c b/source/device/cpu/op/slice/slice_ref.c index 54d295c51..04825d9aa 100644 --- a/source/device/cpu/op/slice/slice_ref.c +++ b/source/device/cpu/op/slice/slice_ref.c @@ -39,25 +39,24 @@ #include #include - struct shape_dim { - int dims[4]; // for caffe - int begins[4]; // for tf - int sizes[4]; // for tf + int dims[4]; // for caffe + int begins[4]; // for tf + int sizes[4]; // for tf }; struct slice_param_ref { - int in_shape[4]; // the dim of the input + int in_shape[4]; // the dim of the input int in_shape_3[3]; int in_shape_2[2]; - struct shape_dim* output_shape; // out shape + struct shape_dim* output_shape; // out shape int out_num; int dim_num; - int axis; // for caffe - int step; // for onnx - float out_scale; // for input tensor int8 + int axis; // for caffe + int step; // for onnx + float out_scale; // for input tensor int8 bool iscaffe; bool ismxnet; bool isonnx; @@ -128,8 +127,7 @@ static int tf_run(const int8_t* in_data, int8_t** out_data, int element_size, co for (int j = start_dim_2; j < stop_dim_2; ++j) { int len = stop_dim_3 - start_dim_3; - int input_off = - n * in_dim_1 * in_dim_2 * in_dim_3 + i * in_dim_2 * in_dim_3 + j * in_dim_3 + start_dim_3; + int input_off = n * in_dim_1 * in_dim_2 * in_dim_3 + i * in_dim_2 * in_dim_3 + j * in_dim_3 + start_dim_3; memcpy(output, input + input_off * element_size, (size_t)len * element_size); output += len * element_size; } @@ -168,8 +166,7 @@ static int mxnet_run(const int8_t* in_data, int8_t** out_data, int element_size, for (int j = start_2; j < stop_2; ++j) { int len = start_3 - stop_3; - int input_off = - n * in_dim_1 * in_dim_2 * in_dim_3 + i * in_dim_2 * in_dim_3 + j * in_dim_3 + start_3; + int input_off = n * in_dim_1 * in_dim_2 * in_dim_3 + i * in_dim_2 * in_dim_3 + j * in_dim_3 + start_3; memcpy(output, input + input_off * element_size, (size_t)len * element_size); output += len * element_size; } @@ -267,8 +264,7 @@ static int onnx_run(const int8_t* in_data, int8_t** out_data, int element_size, { for (int k = start_3; k < stop_3; k = k + step_3) { - int input_index = - n * in_dim_1 * in_dim_2 * in_dim_3 + i * in_dim_2 * in_dim_3 + j * in_dim_3 + k; + int input_index = n * in_dim_1 * in_dim_2 * in_dim_3 + i * in_dim_2 * in_dim_3 + j * in_dim_3 + k; memcpy(output, input + input_index * element_size, element_size); output += element_size; } @@ -373,16 +369,16 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex { struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; - struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); struct slice_param_ref op_param; - slice_param_t* _param = ( struct slice_param* )(ir_node->op.param_mem); + slice_param_t* _param = (struct slice_param*)(ir_node->op.param_mem); int out_num = exec_node->output_num; struct shape_dim sd[MAX_SHAPE_DIM_NUM * 2]; - int8_t** out_data_ptrs = ( int8_t** )sys_malloc(out_num * sizeof(int8_t*)); - if(out_data_ptrs == NULL) + int8_t** out_data_ptrs = (int8_t**)sys_malloc(out_num * sizeof(int8_t*)); + if (out_data_ptrs == NULL) { return -1; } @@ -390,12 +386,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex op_param.axis = _param->axis; op_param.output_shape = sd; op_param.out_num = out_num; - op_param.dim_num = ( int )(input_tensor->dim_num); + op_param.dim_num = (int)(input_tensor->dim_num); op_param.iscaffe = _param->iscaffe; op_param.ismxnet = _param->ismxnet; op_param.isonnx = _param->isonnx; - int8_t* input = ( int8_t* )input_tensor->data; + int8_t* input = (int8_t*)input_tensor->data; unsigned int mem_size = input_tensor->elem_size; if (op_param.iscaffe) @@ -413,7 +409,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex { op_param.output_shape[i].dims[j] = out_tensor->dims[j]; } - out_data_ptrs[i] = ( int8_t* )out_tensor->data; + out_data_ptrs[i] = (int8_t*)out_tensor->data; } } else if (op_param.ismxnet || op_param.isonnx) @@ -439,17 +435,16 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex } } struct tensor* out_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - out_data_ptrs[0] = ( int8_t* )out_tensor->data; + out_data_ptrs[0] = (int8_t*)out_tensor->data; - if (input_tensor->dims[0] == output_tensor->dims[0] && input_tensor->dims[1] == output_tensor->dims[1] && - input_tensor->dims[2] == output_tensor->dims[2] && input_tensor->dims[3] == output_tensor->dims[3]) + if (input_tensor->dims[0] == output_tensor->dims[0] && input_tensor->dims[1] == output_tensor->dims[1] && input_tensor->dims[2] == output_tensor->dims[2] && input_tensor->dims[3] == output_tensor->dims[3]) { - memcpy(( void* )(out_data_ptrs[0]), ( void* )input, mem_size*input_tensor->elem_num); + memcpy((void*)(out_data_ptrs[0]), (void*)input, mem_size * input_tensor->elem_num); sys_free(out_data_ptrs); return true; } } - else // For tensorflow, there is only one output tensor + else // For tensorflow, there is only one output tensor { int maxdim = 4; int real_dim = op_param.dim_num; @@ -464,14 +459,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex } else { - op_param.output_shape[0].begins[idx] = *( int* )get_vector_data(_param->begin_, dim_idx); - op_param.output_shape[0].sizes[idx] = *( int* )get_vector_data(_param->size_, dim_idx); + op_param.output_shape[0].begins[idx] = *(int*)get_vector_data(_param->begin_, dim_idx); + op_param.output_shape[0].sizes[idx] = *(int*)get_vector_data(_param->size_, dim_idx); op_param.in_shape[idx] = input_tensor->dims[dim_idx]; dim_idx++; } } struct tensor* out_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - out_data_ptrs[0] = ( int8_t* )out_tensor->data; + out_data_ptrs[0] = (int8_t*)out_tensor->data; } int ret = -1; @@ -487,19 +482,19 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int32_t input_zero = input_tensor->zero_point; int32_t output_zero = output_tensor->zero_point; - float* input_fp32 = (float*)sys_malloc(input_tensor->elem_num * sizeof(float)); + float* input_fp32 = (float*)sys_malloc(input_tensor->elem_num * sizeof(float)); float* output_fp32 = (float*)sys_malloc(output_tensor->elem_num * sizeof(float)); - out_data_ptrs[0] = ( int8_t* )output_fp32; + out_data_ptrs[0] = (int8_t*)output_fp32; - for(int i=0; ielem_num; i++) + for (int i = 0; i < input_tensor->elem_num; i++) { - input_fp32[i] = ((float )input_uint8[i] - (float )input_zero) * input_scale; + input_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } - ret = ref_slice_common((int8_t *)input_fp32, out_data_ptrs, sizeof(float), &op_param); + ret = ref_slice_common((int8_t*)input_fp32, out_data_ptrs, sizeof(float), &op_param); /* quant to uint8 */ - for(int i=0; ielem_num; i++) + for (int i = 0; i < output_tensor->elem_num; i++) { int udata = round(output_fp32[i] / output_scale + output_zero); if (udata > 255) diff --git a/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c b/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c index ac41456a5..9ffe8e5c2 100644 --- a/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c +++ b/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c @@ -40,7 +40,6 @@ #include - static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; @@ -52,16 +51,15 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - if (input_tensor->dims[0] != output_tensor->dims[0] || input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || - input_tensor->dims[3] != output_tensor->dims[3]) - ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num); + if (input_tensor->dims[0] != output_tensor->dims[0] || input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || input_tensor->dims[3] != output_tensor->dims[3]) + ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num); return ret; } static inline float32x4_t vexpq10_f32(float32x4_t x) { - x = vmlaq_n_f32(vdupq_n_f32(1.0f), x, 0.0009765625f); // n = 10 + x = vmlaq_n_f32(vdupq_n_f32(1.0f), x, 0.0009765625f); // n = 10 x = vmulq_f32(x, x); x = vmulq_f32(x, x); x = vmulq_f32(x, x); @@ -77,8 +75,8 @@ static inline float32x4_t vexpq10_f32(float32x4_t x) static void GetMaxArray(float* input, float* array, int in_size, int on_size, int num_thread) { - float* input_ptr = ( float* )input; - float* array_ptr = ( float* )array; + float* input_ptr = (float*)input; + float* array_ptr = (float*)array; memset(array, 0, in_size * sizeof(float)); // #pragma omp parallel for num_threads(num_thread) @@ -115,10 +113,10 @@ static void GetMaxArray(float* input, float* array, int in_size, int on_size, in static void GetOutResult(float* input, float* output, float* maxarray, float* sum_array, int in_size, int on_size, int num_thread) { - float* input_ptr = ( float* )input; - float* output_ptr = ( float* )output; - float* maxarray_ptr = ( float* )maxarray; - float* sum_array_ptr = ( float* )sum_array; + float* input_ptr = (float*)input; + float* output_ptr = (float*)output; + float* maxarray_ptr = (float*)maxarray; + float* sum_array_ptr = (float*)sum_array; memset(sum_array, 0x0, in_size * sizeof(float)); @@ -183,7 +181,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct softmax_param* softmax_param = ( struct softmax_param* )ir_node->op.param_mem; + struct softmax_param* softmax_param = (struct softmax_param*)ir_node->op.param_mem; int element_size = input_tensor->elem_size; int dims[4]; @@ -211,8 +209,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex uint8_t* input = input_tensor->data; uint8_t* output = output_tensor->data; - float* max_array = ( float* )malloc(in_size * sizeof(float)); - float* sum_array = ( float* )malloc(in_size * sizeof(float)); + float* max_array = (float*)malloc(in_size * sizeof(float)); + float* sum_array = (float*)malloc(in_size * sizeof(float)); int on_in_size = on_size * in_size; @@ -221,8 +219,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex if (element_size == 1) { - input_f = ( float* )malloc(on_in_size * 4); - output_f = ( float* )malloc(on_in_size * 4); + input_f = (float*)malloc(on_in_size * 4); + output_f = (float*)malloc(on_in_size * 4); /* todo */ @@ -235,8 +233,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex /* get max */ int img_base = i * on_in_size * element_size; - GetMaxArray(( float* )(input + img_base), max_array, in_size, on_size, exec_graph->num_thread); - GetOutResult(( float* )(input + img_base), ( float* )(output + img_base), max_array, sum_array, in_size, + GetMaxArray((float*)(input + img_base), max_array, in_size, on_size, exec_graph->num_thread); + GetOutResult((float*)(input + img_base), (float*)(output + img_base), max_array, sum_array, in_size, on_size, exec_graph->num_thread); } diff --git a/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c b/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c index 3c63e3a3c..93678c225 100644 --- a/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c +++ b/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c @@ -34,7 +34,6 @@ #include "arm_math.h" - /** * @brief Q7 softmax function * @param[in] vec_in pointer to input vector @@ -55,9 +54,8 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - if (input_tensor->dims[0] != output_tensor->dims[0] || input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || - input_tensor->dims[3] != output_tensor->dims[3]) - ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num); + if (input_tensor->dims[0] != output_tensor->dims[0] || input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || input_tensor->dims[3] != output_tensor->dims[3]) + ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num); return ret; } diff --git a/source/device/cpu/op/softmax/softmax_kernel_ref.h b/source/device/cpu/op/softmax/softmax_kernel_ref.h index 2cf5cd4c9..5c58a44de 100644 --- a/source/device/cpu/op/softmax/softmax_kernel_ref.h +++ b/source/device/cpu/op/softmax/softmax_kernel_ref.h @@ -25,7 +25,6 @@ #ifndef __SOFTMAX_KERNEL_REF_H__ #define __SOFTMAX_KERNEL_REF_H__ - #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" @@ -33,11 +32,10 @@ #include #include - static void GetMaxArray(void* input, void* array, int in_size, int on_size) { - float* input_ptr = ( float* )input; - float* array_ptr = ( float* )array; + float* input_ptr = (float*)input; + float* array_ptr = (float*)array; memcpy(array_ptr, input_ptr, in_size * sizeof(float)); @@ -53,10 +51,10 @@ static void GetMaxArray(void* input, void* array, int in_size, int on_size) static void GetOutResult(void* input, void* output, void* array, void* sum_array, int in_size, int on_size) { - float* input_ptr = ( float* )input; - float* output_ptr = ( float* )output; - float* array_ptr = ( float* )array; - float* sum_array_ptr = ( float* )sum_array; + float* input_ptr = (float*)input; + float* output_ptr = (float*)output; + float* array_ptr = (float*)array; + float* sum_array_ptr = (float*)sum_array; memset(sum_array, 0x0, in_size * sizeof(float)); diff --git a/source/device/cpu/op/softmax/softmax_kernel_ref_fp32.c b/source/device/cpu/op/softmax/softmax_kernel_ref_fp32.c index ecf256746..351be8b21 100644 --- a/source/device/cpu/op/softmax/softmax_kernel_ref_fp32.c +++ b/source/device/cpu/op/softmax/softmax_kernel_ref_fp32.c @@ -36,13 +36,12 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - int ref_softmax_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int axis) { int element_size = input_tensor->elem_size; int type = input_tensor->data_type; - int* dims = ( int* )sys_malloc(input_tensor->dim_num * sizeof(int)); + int* dims = (int*)sys_malloc(input_tensor->dim_num * sizeof(int)); for (int i = 0; i < input_tensor->dim_num; i++) { dims[i] = input_tensor->dims[i]; @@ -63,8 +62,8 @@ int ref_softmax_fp32(struct tensor* input_tensor, struct tensor* output_tensor, } on_size = dims[axis]; - float* max_array = ( float* )sys_malloc(in_size * sizeof(float)); - float* sum_array = ( float* )sys_malloc(in_size * sizeof(float)); + float* max_array = (float*)sys_malloc(in_size * sizeof(float)); + float* sum_array = (float*)sys_malloc(in_size * sizeof(float)); int on_in_size = on_size * in_size; diff --git a/source/device/cpu/op/softmax/softmax_kernel_ref_int8.c b/source/device/cpu/op/softmax/softmax_kernel_ref_int8.c index 015f5149e..7b13afd7f 100644 --- a/source/device/cpu/op/softmax/softmax_kernel_ref_int8.c +++ b/source/device/cpu/op/softmax/softmax_kernel_ref_int8.c @@ -38,13 +38,12 @@ #include - int ref_softmax_int8(struct tensor* input_tensor, struct tensor* output_tensor, int axis) { int element_size = input_tensor->elem_size; int type = input_tensor->data_type; - int* dims = ( int* )sys_malloc(input_tensor->dim_num * sizeof(int)); + int* dims = (int*)sys_malloc(input_tensor->dim_num * sizeof(int)); for (int i = 0; i < input_tensor->dim_num; i++) { dims[i] = input_tensor->dims[i]; @@ -65,16 +64,16 @@ int ref_softmax_int8(struct tensor* input_tensor, struct tensor* output_tensor, } on_size = dims[axis]; - float* max_array = ( float* )sys_malloc(in_size * sizeof(float)); - float* sum_array = ( float* )sys_malloc(in_size * sizeof(float)); + float* max_array = (float*)sys_malloc(in_size * sizeof(float)); + float* sum_array = (float*)sys_malloc(in_size * sizeof(float)); int on_in_size = on_size * in_size; int totol_size = on_in_size * out_size; int8_t* input = (int8_t*)input_tensor->data; int8_t* output = (int8_t*)output_tensor->data; - float* input_f = ( float* )sys_malloc(totol_size * 4); - float* output_f = ( float* )sys_malloc(totol_size * 4); + float* input_f = (float*)sys_malloc(totol_size * 4); + float* output_f = (float*)sys_malloc(totol_size * 4); float input_scale = input_tensor->scale; float output_scale = output_tensor->scale; diff --git a/source/device/cpu/op/softmax/softmax_kernel_ref_uint8.c b/source/device/cpu/op/softmax/softmax_kernel_ref_uint8.c index 08a3cdb58..93565ad5c 100644 --- a/source/device/cpu/op/softmax/softmax_kernel_ref_uint8.c +++ b/source/device/cpu/op/softmax/softmax_kernel_ref_uint8.c @@ -38,13 +38,12 @@ #include - int ref_softmax_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int axis) { int element_size = input_tensor->elem_size; int type = input_tensor->data_type; - int* dims = ( int* )sys_malloc(input_tensor->dim_num * sizeof(int)); + int* dims = (int*)sys_malloc(input_tensor->dim_num * sizeof(int)); for (int i = 0; i < input_tensor->dim_num; i++) { dims[i] = input_tensor->dims[i]; @@ -65,16 +64,16 @@ int ref_softmax_uint8(struct tensor* input_tensor, struct tensor* output_tensor, } on_size = dims[axis]; - float* max_array = ( float* )sys_malloc(in_size * sizeof(float)); - float* sum_array = ( float* )sys_malloc(in_size * sizeof(float)); + float* max_array = (float*)sys_malloc(in_size * sizeof(float)); + float* sum_array = (float*)sys_malloc(in_size * sizeof(float)); int on_in_size = on_size * in_size; int totol_size = on_in_size * out_size; uint8_t* input = (uint8_t*)input_tensor->data; uint8_t* output = (uint8_t*)output_tensor->data; - float* input_f = ( float* )sys_malloc(totol_size * 4); - float* output_f = ( float* )sys_malloc(totol_size * 4); + float* input_f = (float*)sys_malloc(totol_size * 4); + float* output_f = (float*)sys_malloc(totol_size * 4); float input_scale = input_tensor->scale; float output_scale = output_tensor->scale; diff --git a/source/device/cpu/op/softmax/softmax_ref.c b/source/device/cpu/op/softmax/softmax_ref.c index 1042877b6..cb1a3b49d 100644 --- a/source/device/cpu/op/softmax/softmax_ref.c +++ b/source/device/cpu/op/softmax/softmax_ref.c @@ -38,7 +38,6 @@ #include "softmax_kernel_ref.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -56,12 +55,13 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct softmax_param* softmax_param = ( struct softmax_param* )ir_node->op.param_mem; + struct softmax_param* softmax_param = (struct softmax_param*)ir_node->op.param_mem; // Check: axis must be in the range: [-input_tensor->dim_num, input_tensor->dim_num) // Note: Here we always assume 0 <= input_tensor->dim_num int axis = softmax_param->axis; - if (axis < -input_tensor->dim_num || input_tensor->dim_num <= axis) { + if (axis < -input_tensor->dim_num || input_tensor->dim_num <= axis) + { TLOG_ERR("Input softmax axis %d not to be supported.\n", axis); return -1; } @@ -99,8 +99,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - if (input_tensor->dims[0] != output_tensor->dims[0] || input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || - input_tensor->dims[3] != output_tensor->dims[3]) + if (input_tensor->dims[0] != output_tensor->dims[0] || input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || input_tensor->dims[3] != output_tensor->dims[3]) ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num); return ret; @@ -112,12 +111,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc } static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score}; int register_softmax_ref_op() { diff --git a/source/device/cpu/op/softplus/softplus_ref.c b/source/device/cpu/op/softplus/softplus_ref.c index efb41cfc3..6931ab047 100644 --- a/source/device/cpu/op/softplus/softplus_ref.c +++ b/source/device/cpu/op/softplus/softplus_ref.c @@ -86,8 +86,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - int ret = -1; - if(input_tensor->data_type == TENGINE_DT_FP32) + int ret = -1; + if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_softplus_fp32(input_tensor, output_tensor, exec_graph->num_thread); else printf("Input data type %d not to be supported.\n", input_tensor->data_type); @@ -112,14 +112,13 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc } static struct node_ops hcl_node_ops = { - .prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score -}; + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score}; int register_softplus_ref_op() { diff --git a/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c b/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c index 0c7843831..6a0aa26a4 100644 --- a/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c +++ b/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c @@ -36,7 +36,6 @@ #include - static int ref_spacetobatchnd_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct spacetobatchnd_param* param, int num_thread) { @@ -92,22 +91,16 @@ static int ref_spacetobatchnd_fp32(struct tensor* input_tensor, struct tensor* o { for (int out_w = 0; out_w < output_width; ++out_w) { - float* out = - out_data + out_b * out_stride_batch + c * out_stride_depth + out_h * out_stride_height + out_w; + float* out = out_data + out_b * out_stride_batch + c * out_stride_depth + out_h * out_stride_height + out_w; - if (out_h * block_shape_height + shift_h < padding_top || - out_h * block_shape_height + shift_h >= padding_top + input_height || - out_w * block_shape_width + shift_w < padding_left || - out_w * block_shape_width + shift_w >= padding_left + input_width) + if (out_h * block_shape_height + shift_h < padding_top || out_h * block_shape_height + shift_h >= padding_top + input_height || out_w * block_shape_width + shift_w < padding_left || out_w * block_shape_width + shift_w >= padding_left + input_width) { // This may not execute correctly when pad_value != 0 and T != uint8. *out = 0; } else { - const float* in = in_data + input_batch * in_stride_batch + c * in_stride_depth + - ((out_h * block_shape_height + shift_h) - padding_top) * in_stride_height + - ((out_w * block_shape_width + shift_w) - padding_left); + const float* in = in_data + input_batch * in_stride_batch + c * in_stride_depth + ((out_h * block_shape_height + shift_h) - padding_top) * in_stride_height + ((out_w * block_shape_width + shift_w) - padding_left); *out = *in; } } @@ -119,7 +112,7 @@ static int ref_spacetobatchnd_fp32(struct tensor* input_tensor, struct tensor* o } static int ref_spacetobatchnd_uint8(struct tensor* input_tensor, struct tensor* output_tensor, - struct spacetobatchnd_param* param, int num_thread) + struct spacetobatchnd_param* param, int num_thread) { /* dequant */ uint8_t* input_uint8 = (uint8_t*)input_tensor->data; @@ -131,12 +124,12 @@ static int ref_spacetobatchnd_uint8(struct tensor* input_tensor, struct tensor* int input_size = input_tensor->elem_num; int output_size = output_tensor->elem_num; - float* in_data = ( float* )sys_malloc(input_size * sizeof(float)); - float* out_data = ( float* )sys_malloc(output_size * sizeof(float)); + float* in_data = (float*)sys_malloc(input_size * sizeof(float)); + float* out_data = (float*)sys_malloc(output_size * sizeof(float)); for (int i = 0; i < input_size; i++) { - in_data[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale; + in_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } int out_dims[4]; @@ -188,22 +181,16 @@ static int ref_spacetobatchnd_uint8(struct tensor* input_tensor, struct tensor* { for (int out_w = 0; out_w < output_width; ++out_w) { - float* out = - out_data + out_b * out_stride_batch + c * out_stride_depth + out_h * out_stride_height + out_w; + float* out = out_data + out_b * out_stride_batch + c * out_stride_depth + out_h * out_stride_height + out_w; - if (out_h * block_shape_height + shift_h < padding_top || - out_h * block_shape_height + shift_h >= padding_top + input_height || - out_w * block_shape_width + shift_w < padding_left || - out_w * block_shape_width + shift_w >= padding_left + input_width) + if (out_h * block_shape_height + shift_h < padding_top || out_h * block_shape_height + shift_h >= padding_top + input_height || out_w * block_shape_width + shift_w < padding_left || out_w * block_shape_width + shift_w >= padding_left + input_width) { // This may not execute correctly when pad_value != 0 and T != uint8. *out = 0; } else { - const float* in = in_data + input_batch * in_stride_batch + c * in_stride_depth + - ((out_h * block_shape_height + shift_h) - padding_top) * in_stride_height + - ((out_w * block_shape_width + shift_w) - padding_left); + const float* in = in_data + input_batch * in_stride_batch + c * in_stride_depth + ((out_h * block_shape_height + shift_h) - padding_top) * in_stride_height + ((out_w * block_shape_width + shift_w) - padding_left); *out = *in; } } @@ -247,12 +234,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct spacetobatchnd_param* spacetobatchnd_param = ( struct spacetobatchnd_param* )ir_node->op.param_mem; + struct spacetobatchnd_param* spacetobatchnd_param = (struct spacetobatchnd_param*)ir_node->op.param_mem; if (input_tensor->data_type == TENGINE_DT_FP32) ref_spacetobatchnd_fp32(input_tensor, output_tensor, spacetobatchnd_param, exec_graph->num_thread); - else if(input_tensor->data_type == TENGINE_DT_UINT8) - ref_spacetobatchnd_uint8(input_tensor, output_tensor, spacetobatchnd_param, exec_graph->num_thread); + else if (input_tensor->data_type == TENGINE_DT_UINT8) + ref_spacetobatchnd_uint8(input_tensor, output_tensor, spacetobatchnd_param, exec_graph->num_thread); return 0; } diff --git a/source/device/cpu/op/spacetodepth/spacetodepth_ref.c b/source/device/cpu/op/spacetodepth/spacetodepth_ref.c index bcec8c2d7..aa8217929 100644 --- a/source/device/cpu/op/spacetodepth/spacetodepth_ref.c +++ b/source/device/cpu/op/spacetodepth/spacetodepth_ref.c @@ -32,10 +32,8 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - #include - int ref_spacetodepth_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { float* input_data = (float*)input_tensor->data; @@ -93,7 +91,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_spacetodepth_fp32(input_tensor, output_tensor, exec_graph->num_thread); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_spacetodepth_uint8(input_tensor, output_tensor, exec_graph->num_thread); return ret; diff --git a/source/device/cpu/op/sparsetodense/sparsetodense_ref.c b/source/device/cpu/op/sparsetodense/sparsetodense_ref.c index ea05f1a48..6179ad14c 100644 --- a/source/device/cpu/op/sparsetodense/sparsetodense_ref.c +++ b/source/device/cpu/op/sparsetodense/sparsetodense_ref.c @@ -36,7 +36,6 @@ #include - int ref_sparsetodense_fp32(struct tensor* input_tensor, struct tensor* output_shape_tensor, struct tensor* sparse_values_tensor, struct tensor* output_tensor, struct sparsetodense_param* param, int num_thread) @@ -166,7 +165,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex sparse_values_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct sparsetodense_param* sparsetodense_param = ( struct sparsetodense_param* )ir_node->op.param_mem; + struct sparsetodense_param* sparsetodense_param = (struct sparsetodense_param*)ir_node->op.param_mem; int ret = ref_sparsetodense_fp32(input_tensor, output_shape_tensor, sparse_values_tensor, output_tensor, sparsetodense_param, exec_graph->num_thread); diff --git a/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c b/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c index cbe6cedf9..dfd4e730c 100644 --- a/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c +++ b/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c @@ -36,63 +36,70 @@ #include -int between(float value, float lowerBound, float upperBound){ - if(value >= lowerBound && value <= upperBound){ +int between(float value, float lowerBound, float upperBound) +{ + if (value >= lowerBound && value <= upperBound) + { return 1; - } else { + } + else + { return 0; } } -int BilinearSampling(int o_n, int o_c, int o_h, int o_w, int i_c, int i_h, int i_w, float* in_data, float* out_data, float* grid_total){ - +int BilinearSampling(int o_n, int o_c, int o_h, int o_w, int i_c, int i_h, int i_w, float* in_data, float* out_data, float* grid_total) +{ float* tmp_out = out_data; - for(int n = 0; n < o_n; n++){ - for(int c = 0; c < o_c; c++){ - for(int h = 0; h < o_h; h++){ - for(int w = 0; w < o_w; w++){ - int out_index = n*o_c*o_h*o_w + c*o_h*o_w + h*o_w + w; - int grid_index = n*o_h*o_w*2 + h*o_w + w; - float y_real = (*(grid_total + grid_index + o_h*o_w) + 1.0) * (i_h-1.0)/2.0; - float x_real = (*(grid_total + grid_index)+1.0)*(i_w - 1.0)/2.0; + for (int n = 0; n < o_n; n++) + { + for (int c = 0; c < o_c; c++) + { + for (int h = 0; h < o_h; h++) + { + for (int w = 0; w < o_w; w++) + { + int out_index = n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w; + int grid_index = n * o_h * o_w * 2 + h * o_w + w; + float y_real = (*(grid_total + grid_index + o_h * o_w) + 1.0) * (i_h - 1.0) / 2.0; + float x_real = (*(grid_total + grid_index) + 1.0) * (i_w - 1.0) / 2.0; int top_left_y = floor(y_real); int top_left_x = floor(x_real); float top_left_y_w = 1.0 - (y_real - top_left_y); float top_left_x_w = 1.0 - (x_real - top_left_x); - int data_index = n*i_c*i_h*i_w + c*i_h*i_w + top_left_y * i_w + top_left_x; + int data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w + top_left_x; float top_left_v = 0; float top_right_v = 0; float bottom_left_v = 0; float bottom_right_v = 0; int lower_bound = 0; - if (between(top_left_x, lower_bound, i_w-1) && between(top_left_y, lower_bound, i_h-1)){ - top_left_v = *(in_data + data_index); - } - if (between(top_left_x + 1, lower_bound, i_w-1) && between(top_left_y, lower_bound, i_h-1)){ - top_right_v = *(in_data + data_index + 1); - } - if (between(top_left_x, lower_bound, i_w-1) && between(top_left_y + 1, lower_bound, i_h-1)){ - bottom_left_v = *(in_data + data_index + i_w); - } - if (between(top_left_x+1, lower_bound, i_w-1) && between(top_left_y + 1, lower_bound, i_h-1)){ - bottom_right_v = *(in_data + data_index + i_w + 1); - } - *(tmp_out+out_index) = top_left_v * top_left_y_w * top_left_x_w + - top_right_v * top_left_y_w * (1.0 - top_left_x_w) + - bottom_left_v * (1.0 - top_left_y_w) * top_left_x_w + - bottom_right_v * (1.0 - top_left_y_w) * (1.0 - top_left_x_w); + if (between(top_left_x, lower_bound, i_w - 1) && between(top_left_y, lower_bound, i_h - 1)) + { + top_left_v = *(in_data + data_index); + } + if (between(top_left_x + 1, lower_bound, i_w - 1) && between(top_left_y, lower_bound, i_h - 1)) + { + top_right_v = *(in_data + data_index + 1); + } + if (between(top_left_x, lower_bound, i_w - 1) && between(top_left_y + 1, lower_bound, i_h - 1)) + { + bottom_left_v = *(in_data + data_index + i_w); } - + if (between(top_left_x + 1, lower_bound, i_w - 1) && between(top_left_y + 1, lower_bound, i_h - 1)) + { + bottom_right_v = *(in_data + data_index + i_w + 1); + } + *(tmp_out + out_index) = top_left_v * top_left_y_w * top_left_x_w + top_right_v * top_left_y_w * (1.0 - top_left_x_w) + bottom_left_v * (1.0 - top_left_y_w) * top_left_x_w + bottom_right_v * (1.0 - top_left_y_w) * (1.0 - top_left_x_w); + } } } } return 1; } -int ref_spatialtransformer_fp32(struct tensor* input_tensor,struct tensor* input_tensor1, struct tensor* output_tensor, - struct spatialtransformer_param* param, int num_thread) +int ref_spatialtransformer_fp32(struct tensor* input_tensor, struct tensor* input_tensor1, struct tensor* output_tensor, + struct spatialtransformer_param* param, int num_thread) { - int indices_dim_size = input_tensor->dim_num; float* in_data = (float*)input_tensor->data; @@ -101,40 +108,48 @@ int ref_spatialtransformer_fp32(struct tensor* input_tensor,struct tensor* input int batch = input_tensor->dims[1]; - float* workspace = (float*)malloc(sizeof(float)*3*param->target_shape[0]*param->target_shape[1]); + float* workspace = (float*)malloc(sizeof(float) * 3 * param->target_shape[0] * param->target_shape[1]); - int target_shape_hw = param->target_shape[0]*param->target_shape[1]; - for(int i = 1; i <= target_shape_hw; i++){ - workspace[0*target_shape_hw + i-1] = -1.0 + (i-1) % param->target_shape[1] * 2.0 / (param->target_shape[1] - 1); - workspace[1*target_shape_hw + i-1] = -1.0 + (i-1) / param->target_shape[1] * 2.0 / (param->target_shape[0] - 1); - workspace[2*target_shape_hw + i-1] = 1.0; + int target_shape_hw = param->target_shape[0] * param->target_shape[1]; + for (int i = 1; i <= target_shape_hw; i++) + { + workspace[0 * target_shape_hw + i - 1] = -1.0 + (i - 1) % param->target_shape[1] * 2.0 / (param->target_shape[1] - 1); + workspace[1 * target_shape_hw + i - 1] = -1.0 + (i - 1) / param->target_shape[1] * 2.0 / (param->target_shape[0] - 1); + workspace[2 * target_shape_hw + i - 1] = 1.0; } int m = 2; int p = target_shape_hw; int n = 3; - float* grid_src = (float*)malloc(sizeof(float)*2*target_shape_hw*batch); - float* grid_dst = (float*)malloc(sizeof(float)*3*target_shape_hw); - - for(int i = 0; i < 3*target_shape_hw; i++){ + float* grid_src = (float*)malloc(sizeof(float) * 2 * target_shape_hw * batch); + float* grid_dst = (float*)malloc(sizeof(float) * 3 * target_shape_hw); + + for (int i = 0; i < 3 * target_shape_hw; i++) + { grid_dst[i] = workspace[i]; } - if(param->transformer_type == 0){ // Affine - for(int b = 0; b < batch; b++){ + if (param->transformer_type == 0) + { // Affine + for (int b = 0; b < batch; b++) + { int index = b * target_shape_hw; float* grid_src_batch = grid_src + 0; - for(int i = 0; i < m; i++){ - for(int j = 0; j < target_shape_hw; j++){ - grid_src_batch[i*p + j] = 0; - for(int a = 1; a <= n; a++){ - grid_src_batch[i*p + j] += loc_data[i*n + a - 1] * grid_dst[(a-1)*p + j]; + for (int i = 0; i < m; i++) + { + for (int j = 0; j < target_shape_hw; j++) + { + grid_src_batch[i * p + j] = 0; + for (int a = 1; a <= n; a++) + { + grid_src_batch[i * p + j] += loc_data[i * n + a - 1] * grid_dst[(a - 1) * p + j]; } } } } } - - if (param->sampler_type == 1) { // Bilinear + + if (param->sampler_type == 1) + { // Bilinear int o_n = output_tensor->dims[0]; int o_c = output_tensor->dims[1]; int o_h = output_tensor->dims[2]; @@ -142,14 +157,16 @@ int ref_spatialtransformer_fp32(struct tensor* input_tensor,struct tensor* input int i_c = input_tensor->dims[1]; int i_h = input_tensor->dims[2]; int i_w = input_tensor->dims[3]; - int ret=BilinearSampling(o_n, o_c, o_h, o_w, i_c, i_h, i_w, in_data, out_data, grid_src); - } else { + int ret = BilinearSampling(o_n, o_c, o_h, o_w, i_c, i_h, i_w, in_data, out_data, grid_src); + } + else + { TLOG_ERR("Extra type not support yet\n"); } free(grid_src); free(grid_dst); - free(workspace); + free(workspace); return 0; } @@ -183,10 +200,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor1 = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); int indices_dim_size = input_tensor1->dim_num; - struct spatialtransformer_param* spatialtransformer_param = ( struct spatialtransformer_param* )ir_node->op.param_mem; + struct spatialtransformer_param* spatialtransformer_param = (struct spatialtransformer_param*)ir_node->op.param_mem; - int ret = ref_spatialtransformer_fp32(input_tensor,input_tensor1, output_tensor, - spatialtransformer_param, exec_graph->num_thread); + int ret = ref_spatialtransformer_fp32(input_tensor, input_tensor1, output_tensor, + spatialtransformer_param, exec_graph->num_thread); if (ret != 0) return -1; diff --git a/source/device/cpu/op/split/split_ref.c b/source/device/cpu/op/split/split_ref.c index a79f2613f..2a7fa2890 100644 --- a/source/device/cpu/op/split/split_ref.c +++ b/source/device/cpu/op/split/split_ref.c @@ -37,7 +37,6 @@ #include #include - int ref_split_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct split_param* split_param, int* slice_index, int num_slices, int slice_size, int in_slice, int slice_axis) { float* input_data = (float*)input_tensor->data; @@ -62,8 +61,8 @@ int ref_split_fp32(struct tensor* input_tensor, struct tensor* output_tensor, st *slice_index += out_slice; } - - return 0; + + return 0; } int ref_split_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct split_param* split_param, int* slice_index, int num_slices, int slice_size, int in_slice, int slice_axis) @@ -90,8 +89,8 @@ int ref_split_uint8(struct tensor* input_tensor, struct tensor* output_tensor, s *slice_index += out_slice; } - - return 0; + + return 0; } static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) @@ -118,7 +117,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); // output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct split_param* split_param = ( struct split_param* )ir_node->op.param_mem; + struct split_param* split_param = (struct split_param*)ir_node->op.param_mem; /* the follow codes need to be checked ! */ int slice_axis = split_param->axis; @@ -135,15 +134,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int slice_index = 0; int out_num = ir_node->output_num; - int ret = -1; + int ret = -1; for (int i = 0; i < out_num; i++) { struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[i]); - + if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_split_fp32(input_tensor, output_tensor, split_param, &slice_index, num_slices, slice_size, in_slice, slice_axis); - else if(input_tensor->data_type == TENGINE_DT_UINT8) - ret = ref_split_uint8(input_tensor, output_tensor, split_param, &slice_index, num_slices, slice_size, in_slice, slice_axis); + else if (input_tensor->data_type == TENGINE_DT_UINT8) + ret = ref_split_uint8(input_tensor, output_tensor, split_param, &slice_index, num_slices, slice_size, in_slice, slice_axis); } return ret; diff --git a/source/device/cpu/op/squareddifference/squareddifference_ref.c b/source/device/cpu/op/squareddifference/squareddifference_ref.c index 6fc416891..66a600291 100644 --- a/source/device/cpu/op/squareddifference/squareddifference_ref.c +++ b/source/device/cpu/op/squareddifference/squareddifference_ref.c @@ -34,7 +34,6 @@ #include - int ref_squareddifference_fp32(struct tensor* input_tensor_0, struct tensor* input_tensor_1, struct tensor* output_tensor, int num_thread) { @@ -86,7 +85,7 @@ int ref_squareddifference_fp32(struct tensor* input_tensor_0, struct tensor* inp } int ref_squareddifference_uint8(struct tensor* input_tensor_0, struct tensor* input_tensor_1, - struct tensor* output_tensor, int num_thread) + struct tensor* output_tensor, int num_thread) { /* dequant */ uint8_t* input0_uint8 = (uint8_t*)input_tensor_0->data; @@ -102,17 +101,17 @@ int ref_squareddifference_uint8(struct tensor* input_tensor_0, struct tensor* in int input1_size = input_tensor_1->elem_num; int output_size = output_tensor->elem_num; - float* input0 = ( float* )sys_malloc(input0_size * sizeof(float)); - float* input1 = ( float* )sys_malloc(input1_size * sizeof(float)); - float* output = ( float* )sys_malloc(output_size * sizeof(float)); + float* input0 = (float*)sys_malloc(input0_size * sizeof(float)); + float* input1 = (float*)sys_malloc(input1_size * sizeof(float)); + float* output = (float*)sys_malloc(output_size * sizeof(float)); for (int i = 0; i < input0_size; i++) { - input0[i] = (( float )input0_uint8[i] - ( float )input0_zero) * input0_scale; + input0[i] = ((float)input0_uint8[i] - (float)input0_zero) * input0_scale; } for (int i = 0; i < input1_size; i++) { - input1[i] = (( float )input1_uint8[i] - ( float )input1_zero) * input1_scale; + input1[i] = ((float)input1_uint8[i] - (float)input1_zero) * input1_scale; } // dims size = 2 or 3 @@ -201,7 +200,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int ret = -1; if (input_tensor_0->data_type == TENGINE_DT_FP32) ret = ref_squareddifference_fp32(input_tensor_0, input_tensor_1, output_tensor, exec_graph->num_thread); - else if(input_tensor_0->data_type == TENGINE_DT_UINT8) + else if (input_tensor_0->data_type == TENGINE_DT_UINT8) ret = ref_squareddifference_uint8(input_tensor_0, input_tensor_1, output_tensor, exec_graph->num_thread); return ret; diff --git a/source/device/cpu/op/squeeze/squeeze_ref.c b/source/device/cpu/op/squeeze/squeeze_ref.c index 7550bdb25..1928d299e 100644 --- a/source/device/cpu/op/squeeze/squeeze_ref.c +++ b/source/device/cpu/op/squeeze/squeeze_ref.c @@ -36,7 +36,6 @@ #include - int ref_squeeze_fp32(struct tensor* input_tensor, struct tensor* output_tensor) { float* input_data = (float*)input_tensor->data; @@ -80,10 +79,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - int ret = -1; + int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_squeeze_fp32(input_tensor, output_tensor); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_squeeze_uint8(input_tensor, output_tensor); return ret; diff --git a/source/device/cpu/op/strided_slice/strided_slice_ref.c b/source/device/cpu/op/strided_slice/strided_slice_ref.c index 1f0297187..bb3cb9111 100644 --- a/source/device/cpu/op/strided_slice/strided_slice_ref.c +++ b/source/device/cpu/op/strided_slice/strided_slice_ref.c @@ -37,7 +37,6 @@ #include - int ref_strided_slice_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct strided_slice_param* param) { int batch_num = input_tensor->dims[0]; @@ -65,10 +64,7 @@ int ref_strided_slice_fp32(struct tensor* input_tensor, struct tensor* output_te { for (int w = 0; w < out_w; w++) { - int input_index = (param->begin[0] + n * param->stride[0]) * in_chw + - (param->begin[1] + c * param->stride[1]) * in_hw + - (param->begin[2] + h * param->stride[2]) * in_w + - (param->begin[3] + w * param->stride[3]); + int input_index = (param->begin[0] + n * param->stride[0]) * in_chw + (param->begin[1] + c * param->stride[1]) * in_hw + (param->begin[2] + h * param->stride[2]) * in_w + (param->begin[3] + w * param->stride[3]); int output_index = n * out_chw + c * out_hw + h * out_w + w; output_data[output_index] = input_data[input_index]; @@ -107,10 +103,7 @@ int ref_strided_slice_uint8(struct tensor* input_tensor, struct tensor* output_t { for (int w = 0; w < out_w; w++) { - int input_index = (param->begin[0] + n * param->stride[0]) * in_chw + - (param->begin[1] + c * param->stride[1]) * in_hw + - (param->begin[2] + h * param->stride[2]) * in_w + - (param->begin[3] + w * param->stride[3]); + int input_index = (param->begin[0] + n * param->stride[0]) * in_chw + (param->begin[1] + c * param->stride[1]) * in_hw + (param->begin[2] + h * param->stride[2]) * in_w + (param->begin[3] + w * param->stride[3]); int output_index = n * out_chw + c * out_hw + h * out_w + w; output_data[output_index] = input_data[input_index]; @@ -144,12 +137,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct strided_slice_param* param = ( struct strided_slice_param* )ir_node->op.param_mem; + struct strided_slice_param* param = (struct strided_slice_param*)ir_node->op.param_mem; - int ret = -1; + int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_strided_slice_fp32(input_tensor, output_tensor, param); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_strided_slice_uint8(input_tensor, output_tensor, param); return ret; diff --git a/source/device/cpu/op/swap_axis/swap_axis_ref.c b/source/device/cpu/op/swap_axis/swap_axis_ref.c index 81b21e328..6aeef17bb 100644 --- a/source/device/cpu/op/swap_axis/swap_axis_ref.c +++ b/source/device/cpu/op/swap_axis/swap_axis_ref.c @@ -37,21 +37,18 @@ #include #include - static int ref_swap_axis_common(struct tensor* input_tensor, struct tensor* output_tensor, const int* dims, int element_size) { - const float* in_data = ( float* )input_tensor->data; - float* out_data = ( float* )output_tensor->data; + const float* in_data = (float*)input_tensor->data; + float* out_data = (float*)output_tensor->data; for (int i = 0; i < dims[0]; i++) for (int j = 0; j < dims[3]; j++) for (int p = 0; p < dims[2]; p++) for (int q = 0; q < dims[1]; q++) { - int out_index = i * dims[1] * dims[2] * dims[3] * dims[4] + j * dims[2] * dims[1] * dims[4] + - p * dims[1] * dims[4] + q * dims[4]; - int in_index = i * dims[1] * dims[2] * dims[3] * dims[4] + q * dims[2] * dims[3] * dims[4] + - p * dims[3] * dims[4] + j * dims[4]; + int out_index = i * dims[1] * dims[2] * dims[3] * dims[4] + j * dims[2] * dims[1] * dims[4] + p * dims[1] * dims[4] + q * dims[4]; + int in_index = i * dims[1] * dims[2] * dims[3] * dims[4] + q * dims[2] * dims[3] * dims[4] + p * dims[3] * dims[4] + j * dims[4]; memcpy(out_data + out_index * element_size, in_data + in_index * element_size, (size_t)dims[4] * element_size); } @@ -60,18 +57,16 @@ static int ref_swap_axis_common(struct tensor* input_tensor, struct tensor* outp static int ref_swap_axis_uint8(struct tensor* input_tensor, struct tensor* output_tensor, const int* dims, int element_size) { - const uint8_t* in_data = ( uint8_t* )input_tensor->data; - uint8_t* out_data = ( uint8_t* )output_tensor->data; + const uint8_t* in_data = (uint8_t*)input_tensor->data; + uint8_t* out_data = (uint8_t*)output_tensor->data; for (int i = 0; i < dims[0]; i++) for (int j = 0; j < dims[3]; j++) for (int p = 0; p < dims[2]; p++) for (int q = 0; q < dims[1]; q++) { - int out_index = i * dims[1] * dims[2] * dims[3] * dims[4] + j * dims[2] * dims[1] * dims[4] + - p * dims[1] * dims[4] + q * dims[4]; - int in_index = i * dims[1] * dims[2] * dims[3] * dims[4] + q * dims[2] * dims[3] * dims[4] + - p * dims[3] * dims[4] + j * dims[4]; + int out_index = i * dims[1] * dims[2] * dims[3] * dims[4] + j * dims[2] * dims[1] * dims[4] + p * dims[1] * dims[4] + q * dims[4]; + int in_index = i * dims[1] * dims[2] * dims[3] * dims[4] + q * dims[2] * dims[3] * dims[4] + p * dims[3] * dims[4] + j * dims[4]; memcpy(out_data + out_index * element_size, in_data + in_index * element_size, (size_t)dims[4] * element_size); } @@ -95,7 +90,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct swap_axis_param* _param = ( struct swap_axis_param* )(ir_node->op.param_mem); + struct swap_axis_param* _param = (struct swap_axis_param*)(ir_node->op.param_mem); int in_size = 1; for (int i = 0; i < input_tensor->dim_num; i++) { @@ -127,10 +122,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex for (int i = dim1 + 1; i < in_size; i++) dims[4] *= input_tensor->dims[i]; - int ret = -1; + int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_swap_axis_common(input_tensor, output_tensor, dims, sizeof(float)); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_swap_axis_uint8(input_tensor, output_tensor, dims, sizeof(uint8_t)); return ret; diff --git a/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c b/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c index ac9bf9b41..de5975df5 100644 --- a/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c +++ b/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c @@ -34,7 +34,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { exec_node->inplace_map[0] = 0; @@ -64,8 +63,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - float* idata = ( float* )input_tensor->data; - float* odata = ( float* )output_tensor->data; + float* idata = (float*)input_tensor->data; + float* odata = (float*)output_tensor->data; if (idata != odata) { TLOG_ERR("input and output are not the same mem\n"); diff --git a/source/device/cpu/op/tanh/cortex-a/tanh_kernel_arm.c b/source/device/cpu/op/tanh/cortex-a/tanh_kernel_arm.c index 10de24f67..813075fc3 100644 --- a/source/device/cpu/op/tanh/cortex-a/tanh_kernel_arm.c +++ b/source/device/cpu/op/tanh/cortex-a/tanh_kernel_arm.c @@ -28,7 +28,6 @@ #include - #define T_MAX(a, b) ((a) > (b) ? (a) : (b)) #define T_MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -63,7 +62,7 @@ exp(x) = lim(1+x/n)^n // n=10 */ static inline float32x4_t vexpq10_f32(float32x4_t x) { - x = vmlaq_n_f32(vdupq_n_f32(1.0f), x, 0.0009765625f); // n = 10 + x = vmlaq_n_f32(vdupq_n_f32(1.0f), x, 0.0009765625f); // n = 10 x = vmulq_f32(x, x); x = vmulq_f32(x, x); x = vmulq_f32(x, x); @@ -79,7 +78,7 @@ static inline float32x4_t vexpq10_f32(float32x4_t x) static void tanh_kernel(int i, int id, void* data, const float* input, float* output) { - int step = (( int* )data)[0]; + int step = ((int*)data)[0]; float32x4_t min = vdupq_n_f32(-30.0f); float32x4_t max = vdupq_n_f32(30.0f); const float* cur_input = input + id * step; @@ -113,8 +112,8 @@ static void tanh_kernel(int i, int id, void* data, const float* input, float* ou int tanh_run(struct tensor* output_tensor, struct tensor* input_tensor, int num_thread) { - float* data = ( float* )input_tensor->data; - float* out_data = ( float* )output_tensor->data; + float* data = (float*)input_tensor->data; + float* out_data = (float*)output_tensor->data; int chan_num = (input_tensor->dims[0]) * (input_tensor->dims[1]); int chan_size = (input_tensor->dims[2]) * (input_tensor->dims[3]); diff --git a/source/device/cpu/op/tanh/tanh_ref.c b/source/device/cpu/op/tanh/tanh_ref.c index a9236fb66..390f64332 100644 --- a/source/device/cpu/op/tanh/tanh_ref.c +++ b/source/device/cpu/op/tanh/tanh_ref.c @@ -35,7 +35,6 @@ #include - int ref_tanh_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { float* input_data = (float*)input_tensor->data; @@ -61,12 +60,12 @@ int ref_tanh_uint8(struct tensor* input_tensor, struct tensor* output_tensor, in int input_size = input_tensor->elem_num; int output_size = output_tensor->elem_num; - float* input_fp32 = ( float* )sys_malloc(input_size * sizeof(float)); - float* output_fp32 = ( float* )sys_malloc(output_size * sizeof(float)); + float* input_fp32 = (float*)sys_malloc(input_size * sizeof(float)); + float* output_fp32 = (float*)sys_malloc(output_size * sizeof(float)); for (int i = 0; i < input_size; i++) { - input_fp32[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale; + input_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } for (int i = 0; i < input_size; i++) @@ -108,10 +107,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - int ret = -1; + int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_tanh_fp32(input_tensor, output_tensor, exec_graph->num_thread); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_tanh_uint8(input_tensor, output_tensor, exec_graph->num_thread); return ret; diff --git a/source/device/cpu/op/threshold/threshold_ref.c b/source/device/cpu/op/threshold/threshold_ref.c index 60013623e..4672086a5 100644 --- a/source/device/cpu/op/threshold/threshold_ref.c +++ b/source/device/cpu/op/threshold/threshold_ref.c @@ -36,7 +36,6 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -64,12 +63,12 @@ int ref_threshold_uint8(struct tensor* input_tensor, struct tensor* output_tenso int input_size = input_tensor->elem_num; int output_size = output_tensor->elem_num; - float* input_fp32 = ( float* )sys_malloc(input_size * sizeof(float)); - float* output_fp32 = ( float* )sys_malloc(output_size * sizeof(float)); + float* input_fp32 = (float*)sys_malloc(input_size * sizeof(float)); + float* output_fp32 = (float*)sys_malloc(output_size * sizeof(float)); for (int i = 0; i < input_size; i++) { - input_fp32[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale; + input_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } for (int i = 0; i < size; i++) @@ -98,7 +97,7 @@ int ref_threshold_fp32(struct tensor* input_tensor, struct tensor* output_tensor { float* input_data = (float*)input_tensor->data; float* out_data = (float*)output_tensor->data; - + for (int i = 0; i < size; i++) { out_data[i] = input_data[i] > threshold ? 1.f : 0.f; @@ -115,12 +114,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct threshold_param* param = ( struct threshold_param* )node->op.param_mem; + struct threshold_param* param = (struct threshold_param*)node->op.param_mem; - int ret = -1; + int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_threshold_fp32(input_tensor, output_tensor, param->threshold, output_tensor->elem_num, 1.0f, 0.0f); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_threshold_uint8(input_tensor, output_tensor, param->threshold, output_tensor->elem_num, 1.0f, 0.0f); return ret; diff --git a/source/device/cpu/op/tile/tile_ref.c b/source/device/cpu/op/tile/tile_ref.c index 0397e8772..0f51a5310 100644 --- a/source/device/cpu/op/tile/tile_ref.c +++ b/source/device/cpu/op/tile/tile_ref.c @@ -37,7 +37,6 @@ #include - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -52,26 +51,25 @@ static int ref_tile_fp32(float* data, float* output, int* repeat, int* inDim, in { int index = 0; - if(flag == 0) // caffe + if (flag == 0) // caffe { - for(int in = 0; in < inDim[0]; in++) + for (int in = 0; in < inDim[0]; in++) { - for(int rn = 0; rn < repeat[3]; rn++) + for (int rn = 0; rn < repeat[3]; rn++) { - for(int ic = 0; ic < inDim[1]; ic++) + for (int ic = 0; ic < inDim[1]; ic++) { - for(int rc = 0; rc < repeat[2]; rc++) + for (int rc = 0; rc < repeat[2]; rc++) { - for(int ih = 0; ih < inDim[2]; ih++) + for (int ih = 0; ih < inDim[2]; ih++) { - for(int rh = 0; rh < repeat[1]; rh++) + for (int rh = 0; rh < repeat[1]; rh++) { - for(int iw = 0; iw < inDim[3]; iw++) + for (int iw = 0; iw < inDim[3]; iw++) { - for(int rw = 0; rw < repeat[0]; rw++) + for (int rw = 0; rw < repeat[0]; rw++) { - int inDataSize = in * inDim[1] * inDim[2] * inDim[3] + ic * inDim[2] * inDim[3] + - ih * inDim[3] + iw; + int inDataSize = in * inDim[1] * inDim[2] * inDim[3] + ic * inDim[2] * inDim[3] + ih * inDim[3] + iw; output[index] = data[inDataSize]; index++; } @@ -83,7 +81,7 @@ static int ref_tile_fp32(float* data, float* output, int* repeat, int* inDim, in } } } - else if(flag == 1) // onnx + else if (flag == 1) // onnx { int n = inDim[0]; int c = inDim[1]; @@ -94,15 +92,15 @@ static int ref_tile_fp32(float* data, float* output, int* repeat, int* inDim, in int rh = repeat[1]; int rw = repeat[0]; - int n1 = n*rn; - int c1 = c*rc; - int h1 = h*rh; - int w1 = w*rw; + int n1 = n * rn; + int c1 = c * rc; + int h1 = h * rh; + int w1 = w * rw; - int size = outDim[0]*outDim[1]*outDim[2]*outDim[3]; + int size = outDim[0] * outDim[1] * outDim[2] * outDim[3]; for (int i = 0; i < size; ++i) { - index = i / (c1*h1*w1) % n * (c*h*w) + i % (c1*h1*w1) / (h1*w1) % c * (h*w) + i % (h1*w1) / w1 % h * w + i % w1 % w; + index = i / (c1 * h1 * w1) % n * (c * h * w) + i % (c1 * h1 * w1) / (h1 * w1) % c * (h * w) + i % (h1 * w1) / w1 % h * w + i % w1 % w; output[i] = data[index]; } } @@ -135,29 +133,29 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int size = 0; int default_value = 1; - if(frame_flag == 0) + if (frame_flag == 0) { size = param->reps_size; - for(int i = 0; i < 4 - size; i++) + for (int i = 0; i < 4 - size; i++) { push_vector_data(repeat, (void*)&default_value); } } - else if ( frame_flag == 1) + else if (frame_flag == 1) { - size = input_reps_shape[0]*input_reps_shape[1]*input_reps_shape[2]*input_reps_shape[3]; - for(int i = 0; i < size; i++) + size = input_reps_shape[0] * input_reps_shape[1] * input_reps_shape[2] * input_reps_shape[3]; + for (int i = 0; i < size; i++) { push_vector_data(repeat, (void*)&input_reps[i]); } - for(int i = 0; i < 4 - size; i++) + for (int i = 0; i < 4 - size; i++) { push_vector_data(repeat, (void*)&default_value); } } - int* repeat_data = (int*)sys_malloc(get_vector_num(repeat)*sizeof(int)); - for(int i = 0; i < get_vector_num(repeat); i++) + int* repeat_data = (int*)sys_malloc(get_vector_num(repeat) * sizeof(int)); + for (int i = 0; i < get_vector_num(repeat); i++) { int* a = (int*)get_vector_data(repeat, i); repeat_data[i] = *a; @@ -176,15 +174,13 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc } static struct node_ops hcl_node_ops = { - .prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score -}; - + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score}; int register_tile_ref_op() { diff --git a/source/device/cpu/op/topkv2/topkv2_ref.c b/source/device/cpu/op/topkv2/topkv2_ref.c index 73054192e..b84cc2433 100644 --- a/source/device/cpu/op/topkv2/topkv2_ref.c +++ b/source/device/cpu/op/topkv2/topkv2_ref.c @@ -37,7 +37,6 @@ #include #include - struct topkv2_param_ref { int k; @@ -108,7 +107,7 @@ static int ref_topkv2_fp32(float* in_data, float* out_data, int* out_index, stru int row_size = param->row_size; int num_rows = param->num_rows; - int* index = ( int* )sys_malloc(row_size * sizeof(int)); + int* index = (int*)sys_malloc(row_size * sizeof(int)); for (int i = 0; i < num_rows; ++i) { @@ -137,18 +136,18 @@ static int ref_topkv2_uint8(struct tensor* input_tensor, struct tensor* output_t int input_size = input_tensor->elem_num; int output_size = output_tensor->elem_num; - float* in_data = ( float* )sys_malloc(input_size * sizeof(float)); - float* out_data = ( float* )sys_malloc(output_size * sizeof(float)); + float* in_data = (float*)sys_malloc(input_size * sizeof(float)); + float* out_data = (float*)sys_malloc(output_size * sizeof(float)); for (int i = 0; i < input_size; i++) { - in_data[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale; + in_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } int k = param->k; int row_size = param->row_size; int num_rows = param->num_rows; - int* index = ( int* )sys_malloc(row_size * sizeof(int)); + int* index = (int*)sys_malloc(row_size * sizeof(int)); for (int i = 0; i < num_rows; ++i) { @@ -162,7 +161,7 @@ static int ref_topkv2_uint8(struct tensor* input_tensor, struct tensor* output_t memcpy(&out_index[i * k], index, k * sizeof(float)); sys_free(index); } - + /* quant */ for (int i = 0; i < output_size; i++) { @@ -199,10 +198,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex { struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; - struct topkv2_param* _param = ( struct topkv2_param* )(ir_node->op.param_mem); + struct topkv2_param* _param = (struct topkv2_param*)(ir_node->op.param_mem); struct tensor* input_tensor; int out_nums = ir_node->output_num; - struct topkv2_priv_info* topkv2_priv_info = ( struct topkv2_priv_info* )exec_node->ops_priv; + struct topkv2_priv_info* topkv2_priv_info = (struct topkv2_priv_info*)exec_node->ops_priv; input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); struct tensor* output_tensor_1 = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[1]); @@ -216,13 +215,13 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex op_param.k = _param->k; op_param.row_size = input_tensor->dims[dims_len - 1]; op_param.num_rows = num_rows; - float* input = ( float* )input_tensor->data; - + float* input = (float*)input_tensor->data; + int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) - ret = ref_topkv2_fp32(input, ( float* )output_tensor->data, ( int* )output_tensor_1->data, &op_param); - else if(input_tensor->data_type == TENGINE_DT_UINT8) - ret = ref_topkv2_uint8(input_tensor, output_tensor, ( int* )output_tensor_1->data, &op_param); + ret = ref_topkv2_fp32(input, (float*)output_tensor->data, (int*)output_tensor_1->data, &op_param); + else if (input_tensor->data_type == TENGINE_DT_UINT8) + ret = ref_topkv2_uint8(input_tensor, output_tensor, (int*)output_tensor_1->data, &op_param); return ret; } diff --git a/source/device/cpu/op/transpose/transpose_ref.c b/source/device/cpu/op/transpose/transpose_ref.c index 2b030e3ab..ec14fd38f 100644 --- a/source/device/cpu/op/transpose/transpose_ref.c +++ b/source/device/cpu/op/transpose/transpose_ref.c @@ -37,7 +37,6 @@ #include #include - struct ref_transpose_param { int* in_dims; @@ -58,9 +57,9 @@ void transpose2d(float* input, float* output, const struct ref_transpose_param* int stride1 = inStride[param->permute[1]]; for (int n = 0; n < out_dim0; n++) - { // 1 + { // 1 for (int h = 0; h < out_dim1; h++) - { // 1 + { // 1 output[n * out_dim1 + h] = input[n * stride0 + h * stride1]; } } @@ -88,11 +87,11 @@ void transpose3d(float* input, float* output, const struct ref_transpose_param* int stride2 = inStride[param->permute[2]]; for (int n = 0; n < out_dim0; n++) - { // 1 + { // 1 for (int h = 0; h < out_dim1; h++) - { // 1 + { // 1 for (int w = 0; w < out_dim2; w++) - { // 2 + { // 2 output[n * outStride0 + h * outStride1 + w] = input[n * stride0 + h * stride1 + w * stride2]; } } @@ -127,15 +126,14 @@ void transpose4d(float* input, float* output, const struct ref_transpose_param* int stride3 = inStride[param->permute[3]]; for (int n = 0; n < out_dim0; n++) - { // 1 + { // 1 for (int h = 0; h < out_dim1; h++) - { // 1 + { // 1 for (int w = 0; w < out_dim2; w++) - { // 2 + { // 2 for (int c = 0; c < out_dim3; c++) - { // 2 - output[n * outStride0 + h * outStride1 + w * outStride2 + c] = - input[n * stride0 + h * stride1 + w * stride2 + c * stride3]; + { // 2 + output[n * outStride0 + h * outStride1 + w * outStride2 + c] = input[n * stride0 + h * stride1 + w * stride2 + c * stride3]; } } } @@ -173,17 +171,16 @@ void transpose5d(float* input, float* output, const struct ref_transpose_param* int stride4 = inStride[param->permute[4]]; for (int n = 0; n < out_dim0; n++) - { // 1 + { // 1 for (int h = 0; h < out_dim1; h++) - { // 1 + { // 1 for (int w = 0; w < out_dim2; w++) - { // 2 + { // 2 for (int c = 0; c < out_dim3; c++) - { // 2 + { // 2 for (int x = 0; x < out_dim4; x++) { - output[n * outStride0 + h * outStride1 + w * outStride2 + c * outStride3 + x] = - input[n * stride0 + h * stride1 + w * stride2 + c * stride3 + x * stride4]; + output[n * outStride0 + h * outStride1 + w * outStride2 + c * outStride3 + x] = input[n * stride0 + h * stride1 + w * stride2 + c * stride3 + x * stride4]; } } } @@ -228,20 +225,18 @@ void transpose6d(float* input, float* output, const struct ref_transpose_param* int stride5 = inStride[param->permute[5]]; for (int n = 0; n < out_dim0; n++) - { // 1 + { // 1 for (int h = 0; h < out_dim1; h++) - { // 1 + { // 1 for (int w = 0; w < out_dim2; w++) - { // 2 + { // 2 for (int c = 0; c < out_dim3; c++) - { // 2 + { // 2 for (int x = 0; x < out_dim4; x++) { for (int y = 0; y < out_dim5; y++) { - output[n * outStride0 + h * outStride1 + w * outStride2 + c * outStride3 + x * outStride4 + - y] = input[n * stride0 + h * stride1 + w * stride2 + c * stride3 + x * stride4 + - y * stride5]; + output[n * outStride0 + h * outStride1 + w * outStride2 + c * outStride3 + x * outStride4 + y] = input[n * stride0 + h * stride1 + w * stride2 + c * stride3 + x * stride4 + y * stride5]; } } } @@ -254,23 +249,23 @@ static int ref_transpose_fp32(float* input, float* output, const struct ref_tran { switch (param->dims) { - case 2: - transpose2d(input, output, param); - break; - case 3: - transpose3d(input, output, param); - break; - case 4: - transpose4d(input, output, param); - break; - case 5: - transpose5d(input, output, param); - break; - case 6: - transpose6d(input, output, param); - break; - default: - break; + case 2: + transpose2d(input, output, param); + break; + case 3: + transpose3d(input, output, param); + break; + case 4: + transpose4d(input, output, param); + break; + case 5: + transpose5d(input, output, param); + break; + case 6: + transpose6d(input, output, param); + break; + default: + break; } return 0; } @@ -287,33 +282,33 @@ static int ref_transpose_uint8(struct tensor* input_tensor, struct tensor* outpu int input_size = input_tensor->elem_num; int output_size = output_tensor->elem_num; - float* input = ( float* )sys_malloc(input_size * sizeof(float)); - float* output = ( float* )sys_malloc(output_size * sizeof(float)); + float* input = (float*)sys_malloc(input_size * sizeof(float)); + float* output = (float*)sys_malloc(output_size * sizeof(float)); for (int i = 0; i < input_size; i++) { - input[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale; + input[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } switch (param->dims) { - case 2: - transpose2d(input, output, param); - break; - case 3: - transpose3d(input, output, param); - break; - case 4: - transpose4d(input, output, param); - break; - case 5: - transpose5d(input, output, param); - break; - case 6: - transpose6d(input, output, param); - break; - default: - break; + case 2: + transpose2d(input, output, param); + break; + case 3: + transpose3d(input, output, param); + break; + case 4: + transpose4d(input, output, param); + break; + case 5: + transpose5d(input, output, param); + break; + case 6: + transpose6d(input, output, param); + break; + default: + break; } /* quant */ @@ -328,15 +323,14 @@ static int ref_transpose_uint8(struct tensor* input_tensor, struct tensor* outpu } sys_free(input); - sys_free(output); + sys_free(output); return 0; } static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct ref_transpose_param* op_param = - ( struct ref_transpose_param* )sys_malloc(sizeof(struct ref_transpose_param)); + struct ref_transpose_param* op_param = (struct ref_transpose_param*)sys_malloc(sizeof(struct ref_transpose_param)); memset(op_param, 0, sizeof(struct ref_transpose_param)); exec_node->ops_priv = op_param; @@ -359,13 +353,13 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct ref_transpose_param* op_param = ( struct ref_transpose_param* )exec_node->ops_priv; - struct transpose_param* transpose_param = ( struct transpose_param* )ir_node->op.param_mem; + struct ref_transpose_param* op_param = (struct ref_transpose_param*)exec_node->ops_priv; + struct transpose_param* transpose_param = (struct transpose_param*)ir_node->op.param_mem; int tr_size = transpose_param->tr_shape_size; // int tr_size = 2 ; - op_param->permute = ( int* )sys_malloc(tr_size * sizeof(int)); + op_param->permute = (int*)sys_malloc(tr_size * sizeof(int)); op_param->dims = input_tensor->dim_num; - op_param->in_dims = ( int* )sys_malloc(op_param->dims * sizeof(int)); + op_param->in_dims = (int*)sys_malloc(op_param->dims * sizeof(int)); return 0; } @@ -373,7 +367,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; - struct ref_transpose_param* op_param = ( struct ref_transpose_param* )exec_node->ops_priv; + struct ref_transpose_param* op_param = (struct ref_transpose_param*)exec_node->ops_priv; sys_free(op_param->permute); sys_free(op_param->in_dims); @@ -390,12 +384,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct transpose_param* transpose_param = ( struct transpose_param* )ir_node->op.param_mem; + struct transpose_param* transpose_param = (struct transpose_param*)ir_node->op.param_mem; - void* out_data = ( void* )output_tensor->data; - void* in_data = ( void* )input_tensor->data; + void* out_data = (void*)output_tensor->data; + void* in_data = (void*)input_tensor->data; - struct ref_transpose_param* op_param = ( struct ref_transpose_param* )exec_node->ops_priv; + struct ref_transpose_param* op_param = (struct ref_transpose_param*)exec_node->ops_priv; int tr_size = transpose_param->tr_shape_size; @@ -404,7 +398,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex op_param->permute[i] = transpose_param->tr_shape[i]; } - for (int i = 0; i < ( int )op_param->dims; i++) + for (int i = 0; i < (int)op_param->dims; i++) { op_param->in_dims[i] = input_tensor->dims[i]; } @@ -412,8 +406,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_transpose_fp32((float*)in_data, (float*)out_data, op_param); - else if(input_tensor->data_type == TENGINE_DT_UINT8) - ret = ref_transpose_uint8(input_tensor, output_tensor, op_param); + else if (input_tensor->data_type == TENGINE_DT_UINT8) + ret = ref_transpose_uint8(input_tensor, output_tensor, op_param); return ret; } diff --git a/source/device/cpu/op/unary/unary_kernel_ref.h b/source/device/cpu/op/unary/unary_kernel_ref.h index 9b44a3cee..7520ef0a0 100644 --- a/source/device/cpu/op/unary/unary_kernel_ref.h +++ b/source/device/cpu/op/unary/unary_kernel_ref.h @@ -25,7 +25,6 @@ #ifndef __UNARY_KERNEL_REF_H__ #define __UNARY_KERNEL_REF_H__ - #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" diff --git a/source/device/cpu/op/unary/unary_kernel_ref_fp32.c b/source/device/cpu/op/unary/unary_kernel_ref_fp32.c index 06b129e19..58e0a4c2f 100644 --- a/source/device/cpu/op/unary/unary_kernel_ref_fp32.c +++ b/source/device/cpu/op/unary/unary_kernel_ref_fp32.c @@ -38,7 +38,6 @@ #include - int ref_unary_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct unary_param* param) { float* in_data = (float*)input_tensor->data; @@ -49,110 +48,110 @@ int ref_unary_fp32(struct tensor* input_tensor, struct tensor* output_tensor, st switch (type) { - case 0: - for (int i = 0; i < size; i++) - { - out_data[i] = fabs(in_data[i]); - } - break; - case 1: - for (int i = 0; i < size; i++) - { - out_data[i] = -(in_data[i]); - } - break; - case 2: - for (int i = 0; i < size; i++) - { - out_data[i] = floor(in_data[i]); - } - break; - case 3: - for (int i = 0; i < size; i++) - { - out_data[i] = ceil(in_data[i]); - } - break; - case 4: - for (int i = 0; i < size; i++) - { - out_data[i] = in_data[i] * in_data[i]; - } - break; - case 5: - for (int i = 0; i < size; i++) - { - out_data[i] = sqrt(in_data[i]); - } - break; - case 6: - for (int i = 0; i < size; i++) - { - out_data[i] = 1.f / sqrt(in_data[i]); - } - break; - case 7: - for (int i = 0; i < size; i++) - { - out_data[i] = exp(in_data[i]); - } - break; - case 8: - for (int i = 0; i < size; i++) - { - out_data[i] = log(in_data[i]); - } - break; - case 9: - for (int i = 0; i < size; i++) - { - out_data[i] = sin(in_data[i]); - } - break; - case 10: - for (int i = 0; i < size; i++) - { - out_data[i] = cos(in_data[i]); - } - break; - case 11: - for (int i = 0; i < size; i++) - { - out_data[i] = tan(in_data[i]); - } - break; - case 12: - for (int i = 0; i < size; i++) - { - out_data[i] = asin(in_data[i]); - } - break; - case 13: - for (int i = 0; i < size; i++) - { - out_data[i] = acos(in_data[i]); - } - break; - case 14: - for (int i = 0; i < size; i++) - { - out_data[i] = atan(in_data[i]); - } - break; - case 15: - for (int i = 0; i < size; i++) - { - out_data[i] = 1.f / (in_data[i]); - } - break; - case 16: - for (int i = 0; i < size; i++) - { - out_data[i] = tanh(in_data[i]); - } - break; - default: - break; + case 0: + for (int i = 0; i < size; i++) + { + out_data[i] = fabs(in_data[i]); + } + break; + case 1: + for (int i = 0; i < size; i++) + { + out_data[i] = -(in_data[i]); + } + break; + case 2: + for (int i = 0; i < size; i++) + { + out_data[i] = floor(in_data[i]); + } + break; + case 3: + for (int i = 0; i < size; i++) + { + out_data[i] = ceil(in_data[i]); + } + break; + case 4: + for (int i = 0; i < size; i++) + { + out_data[i] = in_data[i] * in_data[i]; + } + break; + case 5: + for (int i = 0; i < size; i++) + { + out_data[i] = sqrt(in_data[i]); + } + break; + case 6: + for (int i = 0; i < size; i++) + { + out_data[i] = 1.f / sqrt(in_data[i]); + } + break; + case 7: + for (int i = 0; i < size; i++) + { + out_data[i] = exp(in_data[i]); + } + break; + case 8: + for (int i = 0; i < size; i++) + { + out_data[i] = log(in_data[i]); + } + break; + case 9: + for (int i = 0; i < size; i++) + { + out_data[i] = sin(in_data[i]); + } + break; + case 10: + for (int i = 0; i < size; i++) + { + out_data[i] = cos(in_data[i]); + } + break; + case 11: + for (int i = 0; i < size; i++) + { + out_data[i] = tan(in_data[i]); + } + break; + case 12: + for (int i = 0; i < size; i++) + { + out_data[i] = asin(in_data[i]); + } + break; + case 13: + for (int i = 0; i < size; i++) + { + out_data[i] = acos(in_data[i]); + } + break; + case 14: + for (int i = 0; i < size; i++) + { + out_data[i] = atan(in_data[i]); + } + break; + case 15: + for (int i = 0; i < size; i++) + { + out_data[i] = 1.f / (in_data[i]); + } + break; + case 16: + for (int i = 0; i < size; i++) + { + out_data[i] = tanh(in_data[i]); + } + break; + default: + break; } return 0; diff --git a/source/device/cpu/op/unary/unary_kernel_ref_uint8.c b/source/device/cpu/op/unary/unary_kernel_ref_uint8.c index 98d04b637..cb2b0957c 100644 --- a/source/device/cpu/op/unary/unary_kernel_ref_uint8.c +++ b/source/device/cpu/op/unary/unary_kernel_ref_uint8.c @@ -38,7 +38,6 @@ #include - int ref_unary_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct unary_param* param) { /* dequant */ @@ -51,12 +50,12 @@ int ref_unary_uint8(struct tensor* input_tensor, struct tensor* output_tensor, s int input_size = input_tensor->elem_num; int output_size = output_tensor->elem_num; - float* in_data = ( float* )sys_malloc(input_size * sizeof(float)); - float* out_data = ( float* )sys_malloc(output_size * sizeof(float)); + float* in_data = (float*)sys_malloc(input_size * sizeof(float)); + float* out_data = (float*)sys_malloc(output_size * sizeof(float)); for (int i = 0; i < input_size; i++) { - in_data[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale; + in_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } int size = input_tensor->elem_num; @@ -65,110 +64,110 @@ int ref_unary_uint8(struct tensor* input_tensor, struct tensor* output_tensor, s switch (type) { - case 0: - for (int i = 0; i < size; i++) - { - out_data[i] = fabs(in_data[i]); - } - break; - case 1: - for (int i = 0; i < size; i++) - { - out_data[i] = -(in_data[i]); - } - break; - case 2: - for (int i = 0; i < size; i++) - { - out_data[i] = floor(in_data[i]); - } - break; - case 3: - for (int i = 0; i < size; i++) - { - out_data[i] = ceil(in_data[i]); - } - break; - case 4: - for (int i = 0; i < size; i++) - { - out_data[i] = in_data[i] * in_data[i]; - } - break; - case 5: - for (int i = 0; i < size; i++) - { - out_data[i] = sqrt(in_data[i]); - } - break; - case 6: - for (int i = 0; i < size; i++) - { - out_data[i] = 1.f / sqrt(in_data[i]); - } - break; - case 7: - for (int i = 0; i < size; i++) - { - out_data[i] = exp(in_data[i]); - } - break; - case 8: - for (int i = 0; i < size; i++) - { - out_data[i] = log(in_data[i]); - } - break; - case 9: - for (int i = 0; i < size; i++) - { - out_data[i] = sin(in_data[i]); - } - break; - case 10: - for (int i = 0; i < size; i++) - { - out_data[i] = cos(in_data[i]); - } - break; - case 11: - for (int i = 0; i < size; i++) - { - out_data[i] = tan(in_data[i]); - } - break; - case 12: - for (int i = 0; i < size; i++) - { - out_data[i] = asin(in_data[i]); - } - break; - case 13: - for (int i = 0; i < size; i++) - { - out_data[i] = acos(in_data[i]); - } - break; - case 14: - for (int i = 0; i < size; i++) - { - out_data[i] = atan(in_data[i]); - } - break; - case 15: - for (int i = 0; i < size; i++) - { - out_data[i] = 1.f / (in_data[i]); - } - break; - case 16: - for (int i = 0; i < size; i++) - { - out_data[i] = tanh(in_data[i]); - } - break; - default: - break; + case 0: + for (int i = 0; i < size; i++) + { + out_data[i] = fabs(in_data[i]); + } + break; + case 1: + for (int i = 0; i < size; i++) + { + out_data[i] = -(in_data[i]); + } + break; + case 2: + for (int i = 0; i < size; i++) + { + out_data[i] = floor(in_data[i]); + } + break; + case 3: + for (int i = 0; i < size; i++) + { + out_data[i] = ceil(in_data[i]); + } + break; + case 4: + for (int i = 0; i < size; i++) + { + out_data[i] = in_data[i] * in_data[i]; + } + break; + case 5: + for (int i = 0; i < size; i++) + { + out_data[i] = sqrt(in_data[i]); + } + break; + case 6: + for (int i = 0; i < size; i++) + { + out_data[i] = 1.f / sqrt(in_data[i]); + } + break; + case 7: + for (int i = 0; i < size; i++) + { + out_data[i] = exp(in_data[i]); + } + break; + case 8: + for (int i = 0; i < size; i++) + { + out_data[i] = log(in_data[i]); + } + break; + case 9: + for (int i = 0; i < size; i++) + { + out_data[i] = sin(in_data[i]); + } + break; + case 10: + for (int i = 0; i < size; i++) + { + out_data[i] = cos(in_data[i]); + } + break; + case 11: + for (int i = 0; i < size; i++) + { + out_data[i] = tan(in_data[i]); + } + break; + case 12: + for (int i = 0; i < size; i++) + { + out_data[i] = asin(in_data[i]); + } + break; + case 13: + for (int i = 0; i < size; i++) + { + out_data[i] = acos(in_data[i]); + } + break; + case 14: + for (int i = 0; i < size; i++) + { + out_data[i] = atan(in_data[i]); + } + break; + case 15: + for (int i = 0; i < size; i++) + { + out_data[i] = 1.f / (in_data[i]); + } + break; + case 16: + for (int i = 0; i < size; i++) + { + out_data[i] = tanh(in_data[i]); + } + break; + default: + break; } /* quant */ diff --git a/source/device/cpu/op/unary/unary_ref.c b/source/device/cpu/op/unary/unary_ref.c index 915c69bbf..0f9610a2e 100644 --- a/source/device/cpu/op/unary/unary_ref.c +++ b/source/device/cpu/op/unary/unary_ref.c @@ -36,7 +36,6 @@ #include "unary_kernel_ref.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -54,15 +53,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct unary_param* unary_param = ( struct unary_param* )ir_node->op.param_mem; + struct unary_param* unary_param = (struct unary_param*)ir_node->op.param_mem; - int ret = -1; + int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_unary_fp32(input_tensor, output_tensor, unary_param); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_unary_uint8(input_tensor, output_tensor, unary_param); else - TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type); + TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type); return ret; } diff --git a/source/device/cpu/op/unsqueeze/unsqueeze_ref.c b/source/device/cpu/op/unsqueeze/unsqueeze_ref.c index 2fcb30b0d..70847a7d9 100644 --- a/source/device/cpu/op/unsqueeze/unsqueeze_ref.c +++ b/source/device/cpu/op/unsqueeze/unsqueeze_ref.c @@ -36,7 +36,6 @@ #include - int ref_unsqueeze_fp32(struct tensor* input_tensor, struct tensor* output_tensor) { float* input_data = (float*)input_tensor->data; @@ -80,10 +79,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - int ret = -1; + int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_unsqueeze_fp32(input_tensor, output_tensor); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_unsqueeze_uint8(input_tensor, output_tensor); return ret; diff --git a/source/device/cpu/op/upsample/upsample_ref.c b/source/device/cpu/op/upsample/upsample_ref.c index d6aa8d7e8..23ea6ff99 100644 --- a/source/device/cpu/op/upsample/upsample_ref.c +++ b/source/device/cpu/op/upsample/upsample_ref.c @@ -36,7 +36,6 @@ #include - static int ref_upsample_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct upsample_param* param, int num_thread) { @@ -96,12 +95,12 @@ static int ref_upsample_uint8(struct tensor* input_tensor, struct tensor* output int input_size = input_tensor->elem_num; int output_size = output_tensor->elem_num; - float* input_fp32 = ( float* )sys_malloc(input_size * sizeof(float)); - float* output_fp32 = ( float* )sys_malloc(output_size * sizeof(float)); + float* input_fp32 = (float*)sys_malloc(input_size * sizeof(float)); + float* output_fp32 = (float*)sys_malloc(output_size * sizeof(float)); for (int i = 0; i < input_size; i++) { - input_fp32[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale; + input_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } /* fp32 inference */ @@ -160,7 +159,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct upsample_param* upsample_param = ( struct upsample_param* )ir_node->op.param_mem; + struct upsample_param* upsample_param = (struct upsample_param*)ir_node->op.param_mem; int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) diff --git a/source/device/cpu/op/where/where_ref.c b/source/device/cpu/op/where/where_ref.c index 9a6fd7fe8..52a2fd778 100644 --- a/source/device/cpu/op/where/where_ref.c +++ b/source/device/cpu/op/where/where_ref.c @@ -32,7 +32,6 @@ #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" - static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { return 0; @@ -44,7 +43,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, } static int ref_where_fp32(float* condition, float* data_a, float* data_b, float* output, int size) { - for(int i = 0; i < size; i++) + for (int i = 0; i < size; i++) { output[i] = condition[i] ? data_a[i] : data_b[i]; } @@ -63,19 +62,21 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct tensor* input_tensor_a = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* input_tensor_b = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); - + int elem_num_condition = input_tensor->elem_num; int elem_num_a = input_tensor_a->elem_num; int elem_num_b = input_tensor_b->elem_num; - if(elem_num_condition != elem_num_a || elem_num_condition != elem_num_b){ + if (elem_num_condition != elem_num_a || elem_num_condition != elem_num_b) + { TLOG_ERR("Tensor size is not equal\n"); return -1; } - int ret = ref_where_fp32((float*)input_tensor->data, (float*)input_tensor_a->data, - (float*)input_tensor_b->data, (float*)output_tensor->data, elem_num_a); - if(ret < -1){ + int ret = ref_where_fp32((float*)input_tensor->data, (float*)input_tensor_a->data, + (float*)input_tensor_b->data, (float*)output_tensor->data, elem_num_a); + if (ret < -1) + { TLOG_ERR("where operator execution error\n"); return -1; } diff --git a/source/device/cpu/op/zeroslike/zeroslike_ref.c b/source/device/cpu/op/zeroslike/zeroslike_ref.c index fd8ebf2f9..47b83d417 100644 --- a/source/device/cpu/op/zeroslike/zeroslike_ref.c +++ b/source/device/cpu/op/zeroslike/zeroslike_ref.c @@ -34,7 +34,6 @@ #include - int ref_zeroslike_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { // dims size = 2 or 3 @@ -157,7 +156,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex int ret = -1; if (input_tensor->data_type == TENGINE_DT_FP32) ret = ref_zeroslike_fp32(input_tensor, output_tensor, exec_graph->num_thread); - else if(input_tensor->data_type == TENGINE_DT_UINT8) + else if (input_tensor->data_type == TENGINE_DT_UINT8) ret = ref_zeroslike_uint8(input_tensor, output_tensor, exec_graph->num_thread); return ret; diff --git a/source/device/cuda/cuda_device.hpp b/source/device/cuda/cuda_device.hpp index af3359888..bbadedb17 100644 --- a/source/device/cuda/cuda_device.hpp +++ b/source/device/cuda/cuda_device.hpp @@ -28,8 +28,7 @@ #define CUDA_DEV_NAME "CUDA" -extern "C" -{ +extern "C" { struct cuda_device { struct device base; diff --git a/source/device/cuda/cuda_executor.hpp b/source/device/cuda/cuda_executor.hpp index 410b223e8..15b6afd2b 100644 --- a/source/device/cuda/cuda_executor.hpp +++ b/source/device/cuda/cuda_executor.hpp @@ -28,8 +28,7 @@ #include #include -extern "C" -{ +extern "C" { #include "graph/node.h" #include "graph/graph.h" #include "graph/subgraph.h" @@ -42,7 +41,7 @@ extern "C" typedef std::map dict_uint2uint; typedef std::map dict_uint2voidx; -typedef std::function< void() > GPU_kernel; +typedef std::function GPU_kernel; class CUDAEngine { @@ -84,5 +83,5 @@ class CUDAEngine cudnnConvolutionFwdAlgo_t algo1; public: - dict_uint2voidx gpu_addr_map; + dict_uint2voidx gpu_addr_map; }; diff --git a/source/device/cuda/cuda_graph.hpp b/source/device/cuda/cuda_graph.hpp index 1eaa3a230..72764181e 100644 --- a/source/device/cuda/cuda_graph.hpp +++ b/source/device/cuda/cuda_graph.hpp @@ -24,12 +24,10 @@ #pragma once -extern "C" -{ +extern "C" { #include "device/device.h" #include "graph/subgraph.h" - int cuda_dev_init(struct device* dev); int cuda_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options); int cuda_dev_run(struct device* dev, struct subgraph* subgraph); diff --git a/source/device/cuda/cuda_limit.hpp b/source/device/cuda/cuda_limit.hpp index d9c34fc5d..02d89a264 100644 --- a/source/device/cuda/cuda_limit.hpp +++ b/source/device/cuda/cuda_limit.hpp @@ -34,26 +34,23 @@ #pragma once -extern "C" -{ +extern "C" { #include "operator/op.h" } - const int cuda_supported_ops[] = { - OP_CLIP, - OP_CONCAT, - OP_CONST, - OP_CONV, - OP_DROPOUT, - OP_ELTWISE, - OP_FC, - OP_FLATTEN, - OP_INPUT, - OP_PERMUTE, - OP_POOL, - OP_RELU, - OP_RESHAPE, - OP_SLICE, - OP_SOFTMAX -}; + OP_CLIP, + OP_CONCAT, + OP_CONST, + OP_CONV, + OP_DROPOUT, + OP_ELTWISE, + OP_FC, + OP_FLATTEN, + OP_INPUT, + OP_PERMUTE, + OP_POOL, + OP_RELU, + OP_RESHAPE, + OP_SLICE, + OP_SOFTMAX}; diff --git a/source/device/device.c b/source/device/device.c index 0ae392e49..c43bf4534 100644 --- a/source/device/device.c +++ b/source/device/device.c @@ -28,7 +28,6 @@ #include - void init_ir_device(ir_device_t* device, const char* name) { if (NULL != name) @@ -45,10 +44,9 @@ void init_ir_device(ir_device_t* device, const char* name) device->allocator = NULL; device->optimizer = NULL; device->scheduler = NULL; - device->privacy = NULL; + device->privacy = NULL; } - int get_device_option_size(ir_device_t* device) { // TODO: need an impl diff --git a/source/device/device.h b/source/device/device.h index 0c82cca12..f67f0d8ab 100644 --- a/source/device/device.h +++ b/source/device/device.h @@ -33,7 +33,6 @@ struct vector; #include - /*! * @struct ir_interface_t * @brief Abstract neural network runnable device interface struct @@ -65,7 +64,6 @@ typedef struct interface int (*release_device)(struct device* device); } ir_interface_t; - /*! * @struct ir_allocator_t * @brief Abstract neural network runnable device allocator struct @@ -85,18 +83,16 @@ typedef struct allocator int (*release)(struct device*, struct subgraph*); } ir_allocator_t; - /*! * @struct ir_optimizer_t * @brief Abstract neural network runnable device expend optimizer */ typedef struct optimizer { - int (*split_graph)(struct graph* ir_graph); //!< interface of split graph delegation - int (*optimize_graph)(struct graph* ir_graph, int precision); //!< interface of optimizing graph delegation + int (*split_graph)(struct graph* ir_graph); //!< interface of split graph delegation + int (*optimize_graph)(struct graph* ir_graph, int precision); //!< interface of optimizing graph delegation } ir_optimizer_t; - /*! * @struct nn_device_t * @brief Abstract neural network runnable device description struct @@ -104,14 +100,13 @@ typedef struct optimizer typedef struct device { const char* name; - struct interface* interface; //!< device scheduler operation interface - struct allocator* allocator; //!< device allocation operation interface - struct optimizer* optimizer; //!< device optimizer operation interface - struct scheduler* scheduler; //!< device scheduler - void* privacy; //!< device privacy data + struct interface* interface; //!< device scheduler operation interface + struct allocator* allocator; //!< device allocation operation interface + struct optimizer* optimizer; //!< device optimizer operation interface + struct scheduler* scheduler; //!< device scheduler + void* privacy; //!< device privacy data } ir_device_t; - /*! * @brief Initialize a device. * @@ -122,7 +117,6 @@ typedef struct device */ void init_ir_device(ir_device_t* device, const char* name); - /*! * @brief Size of a device option struct. * diff --git a/source/device/opencl/ocl_define.h b/source/device/opencl/ocl_define.h index 48a28a63f..010fc2651 100644 --- a/source/device/opencl/ocl_define.h +++ b/source/device/opencl/ocl_define.h @@ -26,9 +26,8 @@ #define OCL_DEV_NAME "OCL" - typedef struct ocl_option { char* dev_name; - int precision; //!< precision of calculation + int precision; //!< precision of calculation } ocl_opt_t; diff --git a/source/device/opencl/ocl_device.hpp b/source/device/opencl/ocl_device.hpp index b76608f04..3ced1cb81 100644 --- a/source/device/opencl/ocl_device.hpp +++ b/source/device/opencl/ocl_device.hpp @@ -26,8 +26,7 @@ #include "ocl_define.h" -extern "C" -{ +extern "C" { #include "api/c_api.h" #include "device/device.h" diff --git a/source/device/opencl/ocl_executor.hpp b/source/device/opencl/ocl_executor.hpp index 5b5434b2c..9649a1d99 100644 --- a/source/device/opencl/ocl_executor.hpp +++ b/source/device/opencl/ocl_executor.hpp @@ -22,9 +22,7 @@ * Author: lswang@openailab.com */ - -extern "C" -{ +extern "C" { #include "api/c_api.h" #include "device/device.h" #include "graph/tensor.h" @@ -57,15 +55,15 @@ struct OCLqueue int dims; cl_kernel queue_kernel; cl_event enentPoint; - size_t *queue_global_work_size; - size_t *queue_local_work_size; + size_t* queue_global_work_size; + size_t* queue_local_work_size; }; class OCLEngine { public: -// OCLEngine(); -// ~OCLEngine() = default; + // OCLEngine(); + // ~OCLEngine() = default; int OCLEnginePreRun(struct subgraph* subgraph); int OCLEngineRun(struct subgraph* subgraph); @@ -73,12 +71,11 @@ class OCLEngine private: bool init(); - bool build_kernel(const char *filename, const char *kernel_name); + bool build_kernel(const char* filename, const char* kernel_name); bool OCLTensorMap(struct graph* ir_graph, int ir_tensor_idx, cl_mem_flags flag); int BuildTensor(struct subgraph* subgraph); int BuildKernel(struct subgraph* subgraph); - bool AddClipNode(struct node* ir_node); bool AddConcatNode(struct node* ir_node); bool AddConvolutionNode(struct node* ir_node); @@ -91,11 +88,10 @@ class OCLEngine bool AddReshapeNode(struct node* ir_node); bool AddSliceNode(struct node* ir_node); - private: - cl_int status; + cl_int status; cl_platform_id platform; - cl_device_id *devices; + cl_device_id* devices; cl_context context; cl_command_queue commandQueue; @@ -103,13 +99,9 @@ class OCLEngine cl_kernel kernel; public: - dict_uint2clmem ocl_tensor_map; - std::vector queue_list; + dict_uint2clmem ocl_tensor_map; + std::vector queue_list; public: int bin_num; - }; - - - diff --git a/source/device/opencl/ocl_graph.hpp b/source/device/opencl/ocl_graph.hpp index c5531f08c..6eed3d0de 100644 --- a/source/device/opencl/ocl_graph.hpp +++ b/source/device/opencl/ocl_graph.hpp @@ -24,12 +24,10 @@ #pragma once -extern "C" -{ +extern "C" { #include "device/device.h" #include "graph/subgraph.h" - int ocl_dev_init(struct device* dev); int ocl_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options); int ocl_dev_run(struct device* dev, struct subgraph* subgraph); diff --git a/source/device/opencl/ocl_helper.hpp b/source/device/opencl/ocl_helper.hpp index 5fabe377d..e6f556b52 100644 --- a/source/device/opencl/ocl_helper.hpp +++ b/source/device/opencl/ocl_helper.hpp @@ -32,8 +32,7 @@ #include #include -extern "C" -{ +extern "C" { #include "api/c_api.h" #include "graph/tensor.h" #include "graph/node.h" @@ -49,15 +48,14 @@ bool CHECK_ENQUEUE_KERNEL_STATUS(cl_int status); bool CHECK_ENQUEUE_BUFFER_STATUS(cl_int status); /** convert the kernel file into a string */ -int convertToString(const char *filename, std::string& s); +int convertToString(const char* filename, std::string& s); /**Getting platforms and choose an available one.*/ -int getPlatform(cl_platform_id &platform); +int getPlatform(cl_platform_id& platform); /**Step 2:Query the platform and choose the first GPU device if has one.*/ -cl_device_id *getCl_device_id(cl_platform_id &platform); +cl_device_id* getCl_device_id(cl_platform_id& platform); void get_device_message(); void dump_sub_graph(struct subgraph* sub_graph); - diff --git a/source/device/opencl/ocl_limit.hpp b/source/device/opencl/ocl_limit.hpp index f319c1dea..da6c45a7e 100644 --- a/source/device/opencl/ocl_limit.hpp +++ b/source/device/opencl/ocl_limit.hpp @@ -22,139 +22,134 @@ * Author: hhchen@openailab.com */ - #pragma once -extern "C" -{ +extern "C" { #include "operator/op.h" } - const int ocl_supported_ops[] = { - OP_CLIP, - OP_CONCAT, - OP_CONST, - OP_CONV, - OP_DROPOUT, - OP_ELTWISE, - OP_FC, - OP_FLATTEN, - OP_INPUT, -//// OP_PERMUTE, - OP_POOL, - OP_RELU, - OP_RESHAPE, - OP_SLICE, -//// OP_SOFTMAX - - -// OP_BIAS, + OP_CLIP, + OP_CONCAT, + OP_CONST, + OP_CONV, + OP_DROPOUT, + OP_ELTWISE, + OP_FC, + OP_FLATTEN, + OP_INPUT, + //// OP_PERMUTE, + OP_POOL, + OP_RELU, + OP_RESHAPE, + OP_SLICE, + //// OP_SOFTMAX -//// OP_ABSVAL, -//// OP_ADD_N, -//// OP_ARGMAX, -//// OP_ARGMIN, -//// OP_BATCHNORM, -//// OP_BATCHTOSPACEND, -//// OP_BIAS, -//// OP_BROADMUL, -// -//// OP_CAST, -//// OP_CEIL, -//// OP_CLIP, -//// OP_COMPARISON, -//// OP_CONCAT, -// OP_CONST, -// OP_CONV, -//// OP_CROP, -//// OP_DECONV, -//// OP_DEPTHTOSPACE, -//// OP_DETECTION_OUTPUT, -//// OP_DETECTION_POSTPROCESS, -// -//// OP_DROPOUT, -//// OP_ELTWISE, -//// OP_ELU, -//// OP_EMBEDDING, -//// OP_EXPANDDIMS, -//// OP_FC, -//// OP_FLATTEN, -//// OP_GATHER, -//// OP_GEMM, -//// OP_GRU, -//// OP_HARDSIGMOID, -//// OP_HARDSWISH, -// OP_INPUT, -//// OP_INSTANCENORM, -//// OP_INTERP, -//// OP_LOGICAL, -//// OP_LOGISTIC, -//// OP_LRN, -//// OP_LSTM, -//// OP_MATMUL, -//// OP_MAXIMUM, -//// OP_MEAN, -//// OP_MINIMUM, -//// OP_MVN, -//// OP_NOOP, -//// OP_NORMALIZE, -// -//// OP_PAD, -//// OP_PERMUTE, -// OP_POOL, -//// OP_PRELU, -//// OP_PRIORBOX, -//// OP_PSROIPOOLING, -//// OP_REDUCEL2, -//// OP_REDUCTION, -//// OP_REGION, -// OP_RELU, -// -//// OP_RELU6, -//// OP_REORG, -//// OP_RESHAPE, -//// OP_RESIZE, -//// OP_REVERSE, -//// OP_RNN, -//// OP_ROIALIGN, -//// OP_ROIPOOLING, -//// OP_ROUND, -//// OP_RPN, -//// OP_SCALE, -//// OP_SELU, -//// OP_SHUFFLECHANNEL, -//// OP_SIGMOID, -// -//// OP_SLICE, -//// OP_SOFTMAX, -//// OP_SPACETOBATCHND, -//// OP_SPACETODEPTH, -//// OP_SPARSETODENSE, -//// OP_SPLIT, -//// OP_SQUAREDDIFFERENCE, -//// OP_SQUEEZE, -//// OP_STRIDED_SLICE, -//// OP_SWAP_AXIS, -//// OP_TANH, -//// OP_THRESHOLD, -//// OP_TOPKV2, -//// OP_TRANSPOSE, -//// OP_UNARY, -//// OP_UNSQUEEZE, -//// OP_UPSAMPLE, -//// OP_ZEROSLIKE, -//// OP_MISH, -//// OP_LOGSOFTMAX, -//// OP_RELU1, -//// OP_L2NORMALIZATION, -//// OP_L2POOL, -//// OP_TILE, -//// OP_SHAPE, -//// OP_SCATTER, -//// OP_WHERE, -//// OP_BUILTIN_LAST + // OP_BIAS, + //// OP_ABSVAL, + //// OP_ADD_N, + //// OP_ARGMAX, + //// OP_ARGMIN, + //// OP_BATCHNORM, + //// OP_BATCHTOSPACEND, + //// OP_BIAS, + //// OP_BROADMUL, + // + //// OP_CAST, + //// OP_CEIL, + //// OP_CLIP, + //// OP_COMPARISON, + //// OP_CONCAT, + // OP_CONST, + // OP_CONV, + //// OP_CROP, + //// OP_DECONV, + //// OP_DEPTHTOSPACE, + //// OP_DETECTION_OUTPUT, + //// OP_DETECTION_POSTPROCESS, + // + //// OP_DROPOUT, + //// OP_ELTWISE, + //// OP_ELU, + //// OP_EMBEDDING, + //// OP_EXPANDDIMS, + //// OP_FC, + //// OP_FLATTEN, + //// OP_GATHER, + //// OP_GEMM, + //// OP_GRU, + //// OP_HARDSIGMOID, + //// OP_HARDSWISH, + // OP_INPUT, + //// OP_INSTANCENORM, + //// OP_INTERP, + //// OP_LOGICAL, + //// OP_LOGISTIC, + //// OP_LRN, + //// OP_LSTM, + //// OP_MATMUL, + //// OP_MAXIMUM, + //// OP_MEAN, + //// OP_MINIMUM, + //// OP_MVN, + //// OP_NOOP, + //// OP_NORMALIZE, + // + //// OP_PAD, + //// OP_PERMUTE, + // OP_POOL, + //// OP_PRELU, + //// OP_PRIORBOX, + //// OP_PSROIPOOLING, + //// OP_REDUCEL2, + //// OP_REDUCTION, + //// OP_REGION, + // OP_RELU, + // + //// OP_RELU6, + //// OP_REORG, + //// OP_RESHAPE, + //// OP_RESIZE, + //// OP_REVERSE, + //// OP_RNN, + //// OP_ROIALIGN, + //// OP_ROIPOOLING, + //// OP_ROUND, + //// OP_RPN, + //// OP_SCALE, + //// OP_SELU, + //// OP_SHUFFLECHANNEL, + //// OP_SIGMOID, + // + //// OP_SLICE, + //// OP_SOFTMAX, + //// OP_SPACETOBATCHND, + //// OP_SPACETODEPTH, + //// OP_SPARSETODENSE, + //// OP_SPLIT, + //// OP_SQUAREDDIFFERENCE, + //// OP_SQUEEZE, + //// OP_STRIDED_SLICE, + //// OP_SWAP_AXIS, + //// OP_TANH, + //// OP_THRESHOLD, + //// OP_TOPKV2, + //// OP_TRANSPOSE, + //// OP_UNARY, + //// OP_UNSQUEEZE, + //// OP_UPSAMPLE, + //// OP_ZEROSLIKE, + //// OP_MISH, + //// OP_LOGSOFTMAX, + //// OP_RELU1, + //// OP_L2NORMALIZATION, + //// OP_L2POOL, + //// OP_TILE, + //// OP_SHAPE, + //// OP_SCATTER, + //// OP_WHERE, + //// OP_BUILTIN_LAST }; diff --git a/source/device/tensorrt/trt_define.h b/source/device/tensorrt/trt_define.h index 93faa31d4..88fd302f1 100644 --- a/source/device/tensorrt/trt_define.h +++ b/source/device/tensorrt/trt_define.h @@ -24,16 +24,15 @@ #pragma once -#define TRT_DEVICE_NAME "TensorRT" - -#define EXPORT_BEGIN extern "C" { -#define EXPORT_FINISH } +#define TRT_DEVICE_NAME "TensorRT" +#define EXPORT_BEGIN extern "C" { +#define EXPORT_FINISH } typedef struct trt_option { char* dev_name; - int gpu_index; //!< select which GPU to run graph - int dla_index; //!< select to use NVIDIA DLA - int precision; //!< precision of calculation + int gpu_index; //!< select which GPU to run graph + int dla_index; //!< select to use NVIDIA DLA + int precision; //!< precision of calculation } trt_opt_t; diff --git a/source/device/tensorrt/trt_device.hpp b/source/device/tensorrt/trt_device.hpp index d4cbd2873..8c2275049 100644 --- a/source/device/tensorrt/trt_device.hpp +++ b/source/device/tensorrt/trt_device.hpp @@ -30,13 +30,11 @@ EXPORT_BEGIN #include "api/c_api.h" #include "device/device.h" - struct trt_device { struct device base; }; - DLLEXPORT int register_cpu_device(void); EXPORT_FINISH diff --git a/source/device/tensorrt/trt_executor.hpp b/source/device/tensorrt/trt_executor.hpp index 0954ed436..b7523f0a2 100644 --- a/source/device/tensorrt/trt_executor.hpp +++ b/source/device/tensorrt/trt_executor.hpp @@ -42,7 +42,6 @@ EXPORT_FINISH #include #include - class TensorRTEngine { public: @@ -63,7 +62,7 @@ class TensorRTEngine int get_type(int mode, nvinfer1::DataType& type); private: - size_t card_id; + size_t card_id; uint16_t tensor_swap_count; std::map tensor_real_map; @@ -116,5 +115,5 @@ class TensorRTEngine nvinfer1::INetworkDefinition* network; nvinfer1::IBuilderConfig* config; nvinfer1::ICudaEngine* engine; - nvinfer1::IExecutionContext *context; + nvinfer1::IExecutionContext* context; }; diff --git a/source/device/tensorrt/trt_graph.hpp b/source/device/tensorrt/trt_graph.hpp index 0ceb1f88f..7050eb79b 100644 --- a/source/device/tensorrt/trt_graph.hpp +++ b/source/device/tensorrt/trt_graph.hpp @@ -34,7 +34,6 @@ EXPORT_BEGIN #include "graph/subgraph.h" #include "device/device.h" - int trt_dev_init(struct device* dev); int trt_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options); int trt_dev_run(struct device* dev, struct subgraph* subgraph); diff --git a/source/device/tensorrt/trt_helper.hpp b/source/device/tensorrt/trt_helper.hpp index 6e8640886..63fa3c35d 100644 --- a/source/device/tensorrt/trt_helper.hpp +++ b/source/device/tensorrt/trt_helper.hpp @@ -42,7 +42,6 @@ #include #include - #ifdef _MSC_VER #define FN_NAME __FUNCTION__ #else @@ -53,40 +52,53 @@ #define ENABLE_DLA_API 1 #endif -#define CHECK(status) \ - do \ - { \ - auto ret = (status); \ - if (ret != 0) \ - { \ - Log(Loglevel, "TensorRT Engine", "Cuda failure: %d", ret); \ - abort(); \ - } \ +#define CHECK(status) \ + do \ + { \ + auto ret = (status); \ + if (ret != 0) \ + { \ + Log(Loglevel, "TensorRT Engine", "Cuda failure: %d", ret); \ + abort(); \ + } \ } while (0) - constexpr long double operator"" _GiB(long double val) { return val * (1 << 30); } -constexpr long double operator"" _MiB(long double val) { return val * (1 << 20); } -constexpr long double operator"" _KiB(long double val) { return val * (1 << 10); } +constexpr long double operator"" _MiB(long double val) +{ + return val * (1 << 20); +} +constexpr long double operator"" _KiB(long double val) +{ + return val * (1 << 10); +} // These is necessary if we want to be able to write 1_GiB instead of 1.0_GiB. // Since the return type is signed, -1_GiB will work as expected. -constexpr long long int operator"" _GiB(long long unsigned int val) { return val * (1 << 30); } -constexpr long long int operator"" _MiB(long long unsigned int val) { return val * (1 << 20); } -constexpr long long int operator"" _KiB(long long unsigned int val) { return val * (1 << 10); } - - +constexpr long long int operator"" _GiB(long long unsigned int val) +{ + return val * (1 << 30); +} +constexpr long long int operator"" _MiB(long long unsigned int val) +{ + return val * (1 << 20); +} +constexpr long long int operator"" _KiB(long long unsigned int val) +{ + return val * (1 << 10); +} -class Logger :public nvinfer1::ILogger +class Logger : public nvinfer1::ILogger { public: nvinfer1::ILogger::Severity severity_; public: - Logger(nvinfer1::ILogger::Severity severity = nvinfer1::ILogger::Severity::kINFO) :severity_(severity) {}; + Logger(nvinfer1::ILogger::Severity severity = nvinfer1::ILogger::Severity::kINFO) + : severity_(severity){}; void log(Severity severity, const char* msg) override { @@ -94,21 +106,21 @@ class Logger :public nvinfer1::ILogger { switch (severity) { - case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: - fprintf(stderr, "Tengine Fatal: %s\n", msg); - break; - case nvinfer1::ILogger::Severity::kERROR: - fprintf(stderr, "Tengine Error: %s\n", msg); - break; - case nvinfer1::ILogger::Severity::kWARNING: - fprintf(stderr, "Tengine Warning: %s\n", msg); - break; - case nvinfer1::ILogger::Severity::kINFO: - fprintf(stderr, "Tengine Info: %s\n", msg); - break; - default: - fprintf(stderr, "Tengine Normal: %s\n", msg); - break; + case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: + fprintf(stderr, "Tengine Fatal: %s\n", msg); + break; + case nvinfer1::ILogger::Severity::kERROR: + fprintf(stderr, "Tengine Error: %s\n", msg); + break; + case nvinfer1::ILogger::Severity::kWARNING: + fprintf(stderr, "Tengine Warning: %s\n", msg); + break; + case nvinfer1::ILogger::Severity::kINFO: + fprintf(stderr, "Tengine Info: %s\n", msg); + break; + default: + fprintf(stderr, "Tengine Normal: %s\n", msg); + break; } } else @@ -128,10 +140,9 @@ class Logger :public nvinfer1::ILogger } }; - struct InferDeleter { - template + template void operator()(T* obj) const { if (obj) @@ -141,7 +152,6 @@ struct InferDeleter } }; - inline void enableDLA(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true) { if (useDLACore >= 0) @@ -166,7 +176,6 @@ inline void enableDLA(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* con } } - // Ensures that every tensor used by a network has a scale. // // All tensors in a network must have a range specified if a calibrator is not used. @@ -187,7 +196,7 @@ void setAllTensorScales(nvinfer1::INetworkDefinition* network, float inScales = auto layer = network->getLayer(i); for (int j = 0; j < layer->getNbInputs(); j++) { - nvinfer1::ITensor* input{ layer->getInput(j) }; + nvinfer1::ITensor* input{layer->getInput(j)}; // Optional inputs are nullptr here and are from RNN layers. if (input != nullptr && !input->dynamicRangeIsSet()) { @@ -204,7 +213,7 @@ void setAllTensorScales(nvinfer1::INetworkDefinition* network, float inScales = auto layer = network->getLayer(i); for (int j = 0; j < layer->getNbOutputs(); j++) { - nvinfer1::ITensor* output{ layer->getOutput(j) }; + nvinfer1::ITensor* output{layer->getOutput(j)}; // Optional outputs are nullptr here and are from RNN layers. if (output != nullptr && !output->dynamicRangeIsSet()) { @@ -222,7 +231,6 @@ void setAllTensorScales(nvinfer1::INetworkDefinition* network, float inScales = } } - struct CaffeBufferShutter { ~CaffeBufferShutter() @@ -231,7 +239,6 @@ struct CaffeBufferShutter } }; - struct UffBufferShutter { ~UffBufferShutter() @@ -240,9 +247,7 @@ struct UffBufferShutter } }; - -template +template using TensorRTSmartPoint = std::unique_ptr; - using TensorRTShapeRange = std::array()>; diff --git a/source/device/tensorrt/trt_limit.hpp b/source/device/tensorrt/trt_limit.hpp index 9380a28b3..aa39ef1af 100644 --- a/source/device/tensorrt/trt_limit.hpp +++ b/source/device/tensorrt/trt_limit.hpp @@ -42,57 +42,56 @@ EXPORT_FINISH #include - #if NV_TENSORRT_MAJOR < 5 #error "Tengine: The minimum supported version of TensorRT is 5.\n" #endif const int trt_supported_ops[] = { - OP_ABSVAL, - OP_ADD_N, + OP_ABSVAL, + OP_ADD_N, #if NV_TENSORRT_MAJOR >= 6 // OP_ARGMAX, // OP_ARGMIN, #endif - OP_BATCHNORM, - //OP_BATCHTOSPACEND, // Not supported, last checked version 7.1.3 + OP_BATCHNORM, +//OP_BATCHTOSPACEND, // Not supported, last checked version 7.1.3 // OP_BIAS, #if NV_TENSORRT_MAJOR >= 6 -// OP_BROADMUL, -// OP_CAST, -// OP_CEIL, - OP_CLIP, + // OP_BROADMUL, + // OP_CAST, + // OP_CEIL, + OP_CLIP, #endif #if NV_TENSORRT_MAJOR >= 7 // OP_COMPARISON, #endif - OP_CONCAT, - OP_CONST, - OP_CONV, - OP_CROP, - OP_DECONV, -// OP_DEPTHTOSPACE, - //OP_DETECTION_OUTPUT, // Not supported, last checked version 7.1.3 - //OP_DETECTION_POSTPROCESS, // Not supported, last checked version 7.1.3 - OP_DROPOUT, - OP_ELTWISE, + OP_CONCAT, + OP_CONST, + OP_CONV, + OP_CROP, + OP_DECONV, + // OP_DEPTHTOSPACE, + //OP_DETECTION_OUTPUT, // Not supported, last checked version 7.1.3 + //OP_DETECTION_POSTPROCESS, // Not supported, last checked version 7.1.3 + OP_DROPOUT, + OP_ELTWISE, // OP_ELU, - //OP_EMBEDDING, // Not supported, last checked version 7.1.3 +//OP_EMBEDDING, // Not supported, last checked version 7.1.3 #if NV_TENSORRT_MAJOR >= 6 // OP_EXPANDDIMS, #endif - OP_FC, - OP_FLATTEN, -// OP_GATHER, - OP_GEMM, + OP_FC, + OP_FLATTEN, + // OP_GATHER, + OP_GEMM, #if NV_TENSORRT_MAJOR >= 7 // OP_GRU, #endif -// OP_HARDSIGMOID, -// OP_HARDSWISH, // Not supported, last checked version 7.1.3 - OP_INPUT, - OP_INSTANCENORM, - OP_INTERP, // should be as UpSample + // OP_HARDSIGMOID, + // OP_HARDSWISH, // Not supported, last checked version 7.1.3 + OP_INPUT, + OP_INSTANCENORM, + OP_INTERP, // should be as UpSample // OP_LOGICAL, #if NV_TENSORRT_MAJOR >= 7 // OP_LOGISTIC, @@ -101,73 +100,73 @@ const int trt_supported_ops[] = { #if NV_TENSORRT_MAJOR >= 7 // OP_LSTM, #endif -// OP_MATMUL, -// OP_MAXIMUM, -// OP_MEAN, -// OP_MINIMUM, - //OP_MVN, // Not supported, last checked version 7.1.3 -// OP_NOOP, - //OP_NORMALIZE, // Not supported, last checked version 7.1.3 - OP_PAD, - OP_PERMUTE, - OP_POOL, -// OP_PRELU, - //OP_PRIORBOX, // Not supported, last checked version 7.1.3 - //OP_PSROIPOOLING, // Not supported, last checked version 7.1.3 -// OP_REDUCEL2, - OP_REDUCTION, - //OP_REGION, // Not supported, last checked version 7.1.3 - OP_RELU, - OP_RELU6, - //OP_REORG, // Not supported, last checked version 7.1.3 - OP_RESHAPE, + // OP_MATMUL, + // OP_MAXIMUM, + // OP_MEAN, + // OP_MINIMUM, + //OP_MVN, // Not supported, last checked version 7.1.3 + // OP_NOOP, + //OP_NORMALIZE, // Not supported, last checked version 7.1.3 + OP_PAD, + OP_PERMUTE, + OP_POOL, + // OP_PRELU, + //OP_PRIORBOX, // Not supported, last checked version 7.1.3 + //OP_PSROIPOOLING, // Not supported, last checked version 7.1.3 + // OP_REDUCEL2, + OP_REDUCTION, + //OP_REGION, // Not supported, last checked version 7.1.3 + OP_RELU, + OP_RELU6, + //OP_REORG, // Not supported, last checked version 7.1.3 + OP_RESHAPE, #if NV_TENSORRT_MAJOR >= 6 - OP_RESIZE, + OP_RESIZE, #endif - //OP_REVERSE, // Not supported, last checked version 7.1.3 +//OP_REVERSE, // Not supported, last checked version 7.1.3 #if NV_TENSORRT_MAJOR >= 7 // OP_RNN, #endif - //OP_ROIALIGN, // Not supported, last checked version 7.1.3 - //OP_ROIPOOLING, // Not supported, last checked version 7.1.3 - //OP_ROUND, - //OP_RPN, +//OP_ROIALIGN, // Not supported, last checked version 7.1.3 +//OP_ROIPOOLING, // Not supported, last checked version 7.1.3 +//OP_ROUND, +//OP_RPN, // OP_SCALE, // OP_SELU, - //OP_SHUFFLECHANNEL, // Not supported, last checked version 7.1.3 +//OP_SHUFFLECHANNEL, // Not supported, last checked version 7.1.3 // OP_SIGMOID, #if NV_TENSORRT_MAJOR >= 6 - OP_SLICE, + OP_SLICE, #endif - OP_SOFTMAX, - //OP_SPACETOBATCHND, // Not supported, last checked version 7.1.3 -// OP_SPACETODEPTH, - //OP_SPARSETODENSE, // Not supported, last checked version 7.1.3 - OP_SPLIT, - //OP_SQUAREDDIFFERENCE, // Not supported, last checked version 7.1.3 - OP_SQUEEZE, - //OP_STRIDED_SLICE, // Not supported, last checked version 7.1.3 - //OP_SWAP_AXIS, -// OP_TANH, - //OP_THRESHOLD, // Not supported, last checked version 7.1.3 - //OP_THRESHOLD, // Not supported, last checked version 7.1.3 -// OP_TOPKV2, - OP_TRANSPOSE, -// OP_UNARY, -// OP_UNSQUEEZE, - OP_UPSAMPLE, - //OP_ZEROSLIKE, // Not supported, last checked version 7.1.3 - OP_MISH, + OP_SOFTMAX, + //OP_SPACETOBATCHND, // Not supported, last checked version 7.1.3 + // OP_SPACETODEPTH, + //OP_SPARSETODENSE, // Not supported, last checked version 7.1.3 + OP_SPLIT, + //OP_SQUAREDDIFFERENCE, // Not supported, last checked version 7.1.3 + OP_SQUEEZE, + //OP_STRIDED_SLICE, // Not supported, last checked version 7.1.3 + //OP_SWAP_AXIS, + // OP_TANH, + //OP_THRESHOLD, // Not supported, last checked version 7.1.3 + //OP_THRESHOLD, // Not supported, last checked version 7.1.3 + // OP_TOPKV2, + OP_TRANSPOSE, + // OP_UNARY, + // OP_UNSQUEEZE, + OP_UPSAMPLE, + //OP_ZEROSLIKE, // Not supported, last checked version 7.1.3 + OP_MISH, // OP_LOGSOFTMAX, #if NV_TENSORRT_MAJOR >= 6 - OP_RELU1, + OP_RELU1, #endif - //OP_L2NORMALIZATION, // Not supported, last checked version 7.1.3 - //OP_L2POOL, // Not supported, last checked version 7.1.3 +//OP_L2NORMALIZATION, // Not supported, last checked version 7.1.3 +//OP_L2POOL, // Not supported, last checked version 7.1.3 #if NV_TENSORRT_MAJOR >= 7 // OP_TILE, #endif - OP_SHAPE, + OP_SHAPE, // OP_SCATTER, #if NV_TENSORRT_MAJOR >= 7 // OP_WHERE, diff --git a/source/device/tim-vx/timvx_device.hpp b/source/device/tim-vx/timvx_device.hpp index 67b4c742b..d5aba6230 100644 --- a/source/device/tim-vx/timvx_device.hpp +++ b/source/device/tim-vx/timvx_device.hpp @@ -26,8 +26,7 @@ #include "timvx_define.h" -extern "C" -{ +extern "C" { #include "api/c_api.h" #include "device/device.h" diff --git a/source/device/tim-vx/timvx_dump.c b/source/device/tim-vx/timvx_dump.c index 7035f21d7..640e024e7 100644 --- a/source/device/tim-vx/timvx_dump.c +++ b/source/device/tim-vx/timvx_dump.c @@ -1,562 +1,559 @@ - -#include "timvx_dump.h" - -#include "device/device.h" -#include "graph/tensor.h" -#include "graph/node.h" -#include "graph/graph.h" -#include "graph/subgraph.h" -#include "operator/op.h" -#include "utility/log.h" - -#include -#include -#include - -#ifdef _MSC_VER -#include -#else -#include -#include -#endif - - -#include -#include -#include - -#ifdef _MSC_VER -#include -#else -#include -#include -#endif - - -int print_tensor_data_value_timvx(FILE* file, const struct tensor* tensor, int offset) -{ - switch (tensor->data_type) - { - case TENGINE_DT_FP32: - { - float* base_ptr = (float*)tensor->data; - float val = base_ptr[offset]; - if (val < 0) - fprintf(file, "%.4f ", val); - else - fprintf(file, " %.4f ", val); - break; - } -// case TENGINE_DT_FP16: -// { -// fp16_t* base_ptr = (fp16_t*)tensor->data; -// fp16_t val = base_ptr[offset]; -// -// float val_fp32 = fp16_to_fp32(val); -// -// if (val_fp32 < 0) -// fprintf(file, "%.4f ", val_fp32); -// else -// fprintf(file, " %.4f ", val_fp32); -// break; -// } - case TENGINE_DT_UINT8: - { - uint8_t* base_ptr = (uint8_t*)tensor->data; - uint8_t val = base_ptr[offset]; - - float scale = tensor->scale; - int32_t zero_point = tensor->zero_point; - - float val_fp32 = (float)((int)val - (int)zero_point) * scale; - if (val_fp32 < 0) - fprintf(file, "%.4f ", val_fp32); - else - fprintf(file, " %.4f ", val_fp32); - break; - } - case TENGINE_DT_INT8: - { - int8_t* base_ptr = (int8_t*)tensor->data; - int8_t val = base_ptr[offset]; - - float scale = tensor->scale; - - float val_fp32 = (float)val * scale; - if (val_fp32 < 0) - fprintf(file, "%.4f ", val_fp32); - else - fprintf(file, " %.4f ", val_fp32); - } - case TENGINE_DT_INT32: - { - int32_t* base_ptr = (int32_t*)tensor->data; - int8_t val = base_ptr[offset]; - - float scale = tensor->scale; - float val_fp32 = (float)val * scale; - - if (val_fp32 < 0) - fprintf(file, "%.6f ", val_fp32); - else - fprintf(file, " %.6f ", val_fp32); - } - } - - return 0; -} - -const char* get_tensor_data_type_string_timvx(int data_type) -{ - switch (data_type) - { - case TENGINE_DT_FP32: - return "fp32"; - case TENGINE_DT_FP16: - return "fp16"; - case TENGINE_DT_INT8: - return "int8"; - case TENGINE_DT_UINT8: - return "uint8"; - case TENGINE_DT_INT32: - return "int32"; - case TENGINE_DT_INT16: - return "int16"; - default: - return "unknown"; - } -} - -void print_tensor_data_to_file_timvx(FILE* file, const struct tensor* tensor) -{ - switch (tensor->dim_num) - { - case 5: - { - int dim5 = tensor->dims[0], batch = tensor->dims[1], channel = 0, height = 0, width = 0; - - if (TENGINE_LAYOUT_NCHW == tensor->layout) - { - channel = tensor->dims[2]; - height = tensor->dims[3]; - width = tensor->dims[4]; - } - if (TENGINE_LAYOUT_NHWC == tensor->layout) - { - height = tensor->dims[2]; - width = tensor->dims[3]; - channel = tensor->dims[4]; - } - - if (TENGINE_DT_FP32 == tensor->data_type) - { - fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp32\n", dim5, batch, channel, height, width); - } - else - { - if (TENGINE_DT_FP16 == tensor->data_type) - { - fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp16, cast to fp32\n", dim5, batch, channel, height, width); - } - else - { - const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type); - fprintf(file, "Shape is {%d %d %d %d %d}, data type is %s, inverse quantization to fp32\n", dim5, batch, channel, height, width, type_name); - } - } - - for (int d5 = 0; d5 < dim5; d5++) - { - fprintf(file, "Dim5 %d:\n", d5); - - for (int n = 0; n < batch; n++) - { - fprintf(file, "\tBatch %d:\n", n); - - for (int ch = 0; ch < channel; ch++) - { - fprintf(file, "\t\tChannel %d:\n", ch); - - for (int h = 0; h < height; h++) - { - fprintf(file, "\t\t\t"); - - for (int w = 0; w < width; w++) - { - int offset = 0; - - if (TENGINE_LAYOUT_NCHW == tensor->layout) - { - offset += d5 * batch * channel * height * width; - offset += n * channel * height * width; - offset += ch * height * width; - offset += h * width; - offset += w; - } - if (TENGINE_LAYOUT_NHWC == tensor->layout) - { - offset += d5 * batch * channel * height * width; - offset += n * channel * height * width; - offset += ch; - offset += h * width * channel; - offset += w * channel; - } - - print_tensor_data_value_timvx(file, tensor, offset); - } - fprintf(file, "\n"); - } - fprintf(file, "\n"); - } - fprintf(file, "\n"); - } - fprintf(file, "\n"); - } - - break; - } - case 4: - { - int batch = tensor->dims[0], channel = 0, height = 0, width = 0; - - if (TENGINE_LAYOUT_NCHW == tensor->layout) - { - channel = tensor->dims[1]; - height = tensor->dims[2]; - width = tensor->dims[3]; - } - if (TENGINE_LAYOUT_NHWC == tensor->layout) - { - height = tensor->dims[1]; - width = tensor->dims[2]; - channel = tensor->dims[3]; - } - - if (TENGINE_DT_FP32 == tensor->data_type) - { - fprintf(file, "Shape is {%d %d %d %d}, data type is fp32\n", batch, channel, height, width); - } - else - { - if (TENGINE_DT_FP16 == tensor->data_type) - { - fprintf(file, "Shape is {%d %d %d %d}, data type is fp16, cast to fp32\n", batch, channel, height, width); - } - else - { - const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type); - fprintf(file, "Shape is {%d %d %d %d}, data type is %s, inverse quantization to fp32\n", batch, channel, height, width, type_name); - } - } - - for (int n = 0; n < batch; n++) - { - fprintf(file, "Batch %d:\n", n); - - for (int ch = 0; ch < channel; ch++) - { - fprintf(file, "\tChannel %d:\n", ch); - - for (int h = 0; h < height; h++) - { - fprintf(file, "\t\t"); - - for (int w = 0; w < width; w++) - { - int offset = 0; - - if (TENGINE_LAYOUT_NCHW == tensor->layout) - { - offset += n * channel * height * width; - offset += ch * height * width; - offset += h * width; - offset += w; - } - if (TENGINE_LAYOUT_NHWC == tensor->layout) - { - offset += n * channel * height * width; - offset += ch; - offset += h * width * channel; - offset += w * channel; - } - - print_tensor_data_value_timvx(file, tensor, offset); - } - fprintf(file, "\n"); - } - fprintf(file, "\n"); - } - fprintf(file, "\n"); - } - - break; - } - case 3: - { - int batch = 0, height = 0, width = 0; - - if (TENGINE_LAYOUT_NCHW == tensor->layout) - { - batch = tensor->dims[0]; - height = tensor->dims[1]; - width = tensor->dims[2]; - } - if (TENGINE_LAYOUT_NHWC == tensor->layout) - { - height = tensor->dims[0]; - width = tensor->dims[1]; - batch = tensor->dims[2]; - } - - if (TENGINE_DT_FP32 == tensor->data_type) - { - fprintf(file, "Shape is {%d %d %d}, data type is fp32\n", batch, height, width); - } - else - { - if (TENGINE_DT_FP16 == tensor->data_type) - { - fprintf(file, "Shape is {%d %d %d}, data type is fp16, cast to fp32\n", batch, height, width); - } - else - { - const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type); - fprintf(file, "Shape is {%d %d %d}, data type is %s, inverse quantization to fp32\n", batch, height, width, type_name); - } - } - - for (int n = 0; n < batch; n++) - { - for (int h = 0; h < height; h++) - { - fprintf(file, "Channel %d:\n", h); - fprintf(file, "\t"); - - for (int w = 0; w < width; w++) - { - int offset = 0; - - if (TENGINE_LAYOUT_NCHW == tensor->layout) - { - offset += n * height * width; - offset += h * width; - offset += w; - } - if (TENGINE_LAYOUT_NHWC == tensor->layout) - { - offset += h; - offset += n * width * height; - offset += w * height; - } - - print_tensor_data_value_timvx(file, tensor, offset); - } - fprintf(file, "\n"); - } - fprintf(file, "\n"); - } - - break; - } - case 2: - { - int batch = 0, width = 0; - - if (TENGINE_LAYOUT_NCHW == tensor->layout) - { - batch = tensor->dims[0]; - width = tensor->dims[1]; - } - if (TENGINE_LAYOUT_NHWC == tensor->layout) - { - batch = tensor->dims[0]; - width = tensor->dims[1]; - } - - if (TENGINE_DT_FP32 == tensor->data_type) - { - fprintf(file, "Shape is {%d %d}, data type is fp32\n", batch, width); - } - else - { - if (TENGINE_DT_FP16 == tensor->data_type) - { - fprintf(file, "Shape is {%d %d}, data type is fp16, cast to fp32\n", batch, width); - } - else - { - const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type); - fprintf(file, "Shape is {%d %d}, data type is %s, inverse quantization to fp32\n", batch, width, type_name); - } - } - - for (int n = 0; n < batch; n++) - { - for (int w = 0; w < width; w++) - { - int offset = 0; - - offset += n * width; - offset += w; - - print_tensor_data_value_timvx(file, tensor, offset); - } - fprintf(file, "\n"); - } - - break; - } - case 1: - { - int width = tensor->dims[0]; - - fprintf(file, "Shape is {%d}, data type is fp32\n", width); - - - for (int w = 0; w < width; w++) - { - print_tensor_data_value_timvx(file, tensor, w); - } - - break; - } - default: - printf("Input dimension %d not to be supported.\n", tensor->dim_num); - } -} - -char* replace_string_character_timvx(char* src_str, char* dst_str, char* target_char, char* replaced_char) -{ - char* p; - char* _out = dst_str; - char* _str = src_str; - char* _src = target_char; - char* _dst = replaced_char; - size_t src_size = strlen(_src); - size_t dst_size = strlen(_dst); - size_t len = 0; - - do - { - p = strstr(_str, _src); - if (p == 0) - { - strcpy(_out, _str); - return dst_str; - } - len = p - _str; - memcpy(_out, _str, len); - memcpy(_out + len, _dst, dst_size); - _str = p + src_size; - _out = _out + len + dst_size; - } while (p); - - return dst_str; -} - -void extract_feature_from_tensor_timvx(const char* comment, const char* layer_name, const struct tensor* tensor) -{ - // 1. deal with saving path - char save_dir[256] = { '0' }; - - const char *env_path = getenv(TENGINE_DUMP_DIR); - - if (NULL != env_path && (256 - 2) > strlen(env_path)) - { - strcpy(save_dir, env_path); - - if ('/' == save_dir[strlen(env_path)] || '\\' == save_dir[strlen(env_path)]) - { -#ifdef _MSC_VER - save_dir[strlen(env_path)] = '\\'; - save_dir[strlen(env_path) + 1] = 0; -#else - save_dir[strlen(env_path)] = '/'; - save_dir[strlen(env_path) + 1] = 0; -#endif - } - } - else - { -// TLOG_WARNING("Tengine: Env var \"TENGINE_DUMP_DIR\" is too long(%d vs. 254). Using default path.\n", strlen(env_path)); - sprintf(save_dir, "./output/"); -#ifdef _MSC_VER - CreateDirectoryA(save_dir, NULL); -#else - int ret = mkdir(save_dir, S_IRWXU | S_IRGRP | S_IWGRP | S_IROTH); -// if (0 != ret) -// { -// TLOG_WARNING("Tengine: Create saving folder failed(%d), skip dump.\n", ret); -// return; -// } -#endif - } - - // 2. deal with layer name - char layer_short_name[64], layer_legal_name[64]; - - if (64 < strlen(layer_name)) - { - memcpy(layer_short_name, layer_name, 64 - 1); - layer_short_name[64 - 1] = 0; - } - else - { - strcpy(layer_short_name, layer_name); - } - - replace_string_character_timvx(layer_short_name, layer_legal_name, "/", "-"); - - // 3. join path - char output_file_path[512] = { '0' }; - - if (strlen(layer_legal_name) + strlen(save_dir) + strlen(comment) > 256 - 16) - { - TLOG_WARNING("Tengine: Name of saving file is too long(%d vs. %d), skip dump.\n", strlen(layer_legal_name) + strlen(save_dir) + strlen(comment), 256 - 16); - return; - } - - sprintf(output_file_path, "%s%s_%s_blob_data.txt", save_dir, layer_legal_name, comment); - - FILE* file = fopen(output_file_path, "w"); - if (NULL == file) - { - fprintf(stderr, "Tengine: Open file(%s) failed, skip dump\n", output_file_path); - return; - } - - print_tensor_data_to_file_timvx(file, tensor); - - // close file - fclose(file); - file = NULL; -} - -void dump_sub_graph_timvx(struct subgraph* sub_graph) -{ - TLOG_INFO("Sub graph[%d]: {%8s } has %d nodes, %d input tensors, %d output tensors.\n", sub_graph->index, sub_graph->device->name, sub_graph->node_num, sub_graph->input_num, sub_graph->output_num); - TLOG_INFO("\tSub nodes: [ "); - - for (int j = 0; j < sub_graph->node_num - 1; j++) - { - int node_id = sub_graph->node_list[j]; - TLOG_INFO("%d, ", node_id); - } - TLOG_INFO("%d ].\n", sub_graph->node_list[sub_graph->node_num - 1]); - - TLOG_INFO("\tSub input tensors: [ "); - for (int j = 0; j < sub_graph->input_num - 1; j++) - { - int tensor_id = sub_graph->input_tensor_list[j]; - TLOG_INFO("%d, ", tensor_id); - } - TLOG_INFO("%d ].\n", sub_graph->input_tensor_list[sub_graph->input_num - 1]); - - TLOG_INFO("\tSub output tensors: [ "); - for (int j = 0; j < sub_graph->output_num - 1; j++) - { - int tensor_id = sub_graph->output_tensor_list[j]; - TLOG_INFO("%d, ", tensor_id); - } - TLOG_INFO("%d ].\n", sub_graph->output_tensor_list[sub_graph->output_num - 1]); + +#include "timvx_dump.h" + +#include "device/device.h" +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "operator/op.h" +#include "utility/log.h" + +#include +#include +#include + +#ifdef _MSC_VER +#include +#else +#include +#include +#endif + +#include +#include +#include + +#ifdef _MSC_VER +#include +#else +#include +#include +#endif + +int print_tensor_data_value_timvx(FILE* file, const struct tensor* tensor, int offset) +{ + switch (tensor->data_type) + { + case TENGINE_DT_FP32: + { + float* base_ptr = (float*)tensor->data; + float val = base_ptr[offset]; + if (val < 0) + fprintf(file, "%.4f ", val); + else + fprintf(file, " %.4f ", val); + break; + } + // case TENGINE_DT_FP16: + // { + // fp16_t* base_ptr = (fp16_t*)tensor->data; + // fp16_t val = base_ptr[offset]; + // + // float val_fp32 = fp16_to_fp32(val); + // + // if (val_fp32 < 0) + // fprintf(file, "%.4f ", val_fp32); + // else + // fprintf(file, " %.4f ", val_fp32); + // break; + // } + case TENGINE_DT_UINT8: + { + uint8_t* base_ptr = (uint8_t*)tensor->data; + uint8_t val = base_ptr[offset]; + + float scale = tensor->scale; + int32_t zero_point = tensor->zero_point; + + float val_fp32 = (float)((int)val - (int)zero_point) * scale; + if (val_fp32 < 0) + fprintf(file, "%.4f ", val_fp32); + else + fprintf(file, " %.4f ", val_fp32); + break; + } + case TENGINE_DT_INT8: + { + int8_t* base_ptr = (int8_t*)tensor->data; + int8_t val = base_ptr[offset]; + + float scale = tensor->scale; + + float val_fp32 = (float)val * scale; + if (val_fp32 < 0) + fprintf(file, "%.4f ", val_fp32); + else + fprintf(file, " %.4f ", val_fp32); + } + case TENGINE_DT_INT32: + { + int32_t* base_ptr = (int32_t*)tensor->data; + int8_t val = base_ptr[offset]; + + float scale = tensor->scale; + float val_fp32 = (float)val * scale; + + if (val_fp32 < 0) + fprintf(file, "%.6f ", val_fp32); + else + fprintf(file, " %.6f ", val_fp32); + } + } + + return 0; +} + +const char* get_tensor_data_type_string_timvx(int data_type) +{ + switch (data_type) + { + case TENGINE_DT_FP32: + return "fp32"; + case TENGINE_DT_FP16: + return "fp16"; + case TENGINE_DT_INT8: + return "int8"; + case TENGINE_DT_UINT8: + return "uint8"; + case TENGINE_DT_INT32: + return "int32"; + case TENGINE_DT_INT16: + return "int16"; + default: + return "unknown"; + } +} + +void print_tensor_data_to_file_timvx(FILE* file, const struct tensor* tensor) +{ + switch (tensor->dim_num) + { + case 5: + { + int dim5 = tensor->dims[0], batch = tensor->dims[1], channel = 0, height = 0, width = 0; + + if (TENGINE_LAYOUT_NCHW == tensor->layout) + { + channel = tensor->dims[2]; + height = tensor->dims[3]; + width = tensor->dims[4]; + } + if (TENGINE_LAYOUT_NHWC == tensor->layout) + { + height = tensor->dims[2]; + width = tensor->dims[3]; + channel = tensor->dims[4]; + } + + if (TENGINE_DT_FP32 == tensor->data_type) + { + fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp32\n", dim5, batch, channel, height, width); + } + else + { + if (TENGINE_DT_FP16 == tensor->data_type) + { + fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp16, cast to fp32\n", dim5, batch, channel, height, width); + } + else + { + const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type); + fprintf(file, "Shape is {%d %d %d %d %d}, data type is %s, inverse quantization to fp32\n", dim5, batch, channel, height, width, type_name); + } + } + + for (int d5 = 0; d5 < dim5; d5++) + { + fprintf(file, "Dim5 %d:\n", d5); + + for (int n = 0; n < batch; n++) + { + fprintf(file, "\tBatch %d:\n", n); + + for (int ch = 0; ch < channel; ch++) + { + fprintf(file, "\t\tChannel %d:\n", ch); + + for (int h = 0; h < height; h++) + { + fprintf(file, "\t\t\t"); + + for (int w = 0; w < width; w++) + { + int offset = 0; + + if (TENGINE_LAYOUT_NCHW == tensor->layout) + { + offset += d5 * batch * channel * height * width; + offset += n * channel * height * width; + offset += ch * height * width; + offset += h * width; + offset += w; + } + if (TENGINE_LAYOUT_NHWC == tensor->layout) + { + offset += d5 * batch * channel * height * width; + offset += n * channel * height * width; + offset += ch; + offset += h * width * channel; + offset += w * channel; + } + + print_tensor_data_value_timvx(file, tensor, offset); + } + fprintf(file, "\n"); + } + fprintf(file, "\n"); + } + fprintf(file, "\n"); + } + fprintf(file, "\n"); + } + + break; + } + case 4: + { + int batch = tensor->dims[0], channel = 0, height = 0, width = 0; + + if (TENGINE_LAYOUT_NCHW == tensor->layout) + { + channel = tensor->dims[1]; + height = tensor->dims[2]; + width = tensor->dims[3]; + } + if (TENGINE_LAYOUT_NHWC == tensor->layout) + { + height = tensor->dims[1]; + width = tensor->dims[2]; + channel = tensor->dims[3]; + } + + if (TENGINE_DT_FP32 == tensor->data_type) + { + fprintf(file, "Shape is {%d %d %d %d}, data type is fp32\n", batch, channel, height, width); + } + else + { + if (TENGINE_DT_FP16 == tensor->data_type) + { + fprintf(file, "Shape is {%d %d %d %d}, data type is fp16, cast to fp32\n", batch, channel, height, width); + } + else + { + const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type); + fprintf(file, "Shape is {%d %d %d %d}, data type is %s, inverse quantization to fp32\n", batch, channel, height, width, type_name); + } + } + + for (int n = 0; n < batch; n++) + { + fprintf(file, "Batch %d:\n", n); + + for (int ch = 0; ch < channel; ch++) + { + fprintf(file, "\tChannel %d:\n", ch); + + for (int h = 0; h < height; h++) + { + fprintf(file, "\t\t"); + + for (int w = 0; w < width; w++) + { + int offset = 0; + + if (TENGINE_LAYOUT_NCHW == tensor->layout) + { + offset += n * channel * height * width; + offset += ch * height * width; + offset += h * width; + offset += w; + } + if (TENGINE_LAYOUT_NHWC == tensor->layout) + { + offset += n * channel * height * width; + offset += ch; + offset += h * width * channel; + offset += w * channel; + } + + print_tensor_data_value_timvx(file, tensor, offset); + } + fprintf(file, "\n"); + } + fprintf(file, "\n"); + } + fprintf(file, "\n"); + } + + break; + } + case 3: + { + int batch = 0, height = 0, width = 0; + + if (TENGINE_LAYOUT_NCHW == tensor->layout) + { + batch = tensor->dims[0]; + height = tensor->dims[1]; + width = tensor->dims[2]; + } + if (TENGINE_LAYOUT_NHWC == tensor->layout) + { + height = tensor->dims[0]; + width = tensor->dims[1]; + batch = tensor->dims[2]; + } + + if (TENGINE_DT_FP32 == tensor->data_type) + { + fprintf(file, "Shape is {%d %d %d}, data type is fp32\n", batch, height, width); + } + else + { + if (TENGINE_DT_FP16 == tensor->data_type) + { + fprintf(file, "Shape is {%d %d %d}, data type is fp16, cast to fp32\n", batch, height, width); + } + else + { + const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type); + fprintf(file, "Shape is {%d %d %d}, data type is %s, inverse quantization to fp32\n", batch, height, width, type_name); + } + } + + for (int n = 0; n < batch; n++) + { + for (int h = 0; h < height; h++) + { + fprintf(file, "Channel %d:\n", h); + fprintf(file, "\t"); + + for (int w = 0; w < width; w++) + { + int offset = 0; + + if (TENGINE_LAYOUT_NCHW == tensor->layout) + { + offset += n * height * width; + offset += h * width; + offset += w; + } + if (TENGINE_LAYOUT_NHWC == tensor->layout) + { + offset += h; + offset += n * width * height; + offset += w * height; + } + + print_tensor_data_value_timvx(file, tensor, offset); + } + fprintf(file, "\n"); + } + fprintf(file, "\n"); + } + + break; + } + case 2: + { + int batch = 0, width = 0; + + if (TENGINE_LAYOUT_NCHW == tensor->layout) + { + batch = tensor->dims[0]; + width = tensor->dims[1]; + } + if (TENGINE_LAYOUT_NHWC == tensor->layout) + { + batch = tensor->dims[0]; + width = tensor->dims[1]; + } + + if (TENGINE_DT_FP32 == tensor->data_type) + { + fprintf(file, "Shape is {%d %d}, data type is fp32\n", batch, width); + } + else + { + if (TENGINE_DT_FP16 == tensor->data_type) + { + fprintf(file, "Shape is {%d %d}, data type is fp16, cast to fp32\n", batch, width); + } + else + { + const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type); + fprintf(file, "Shape is {%d %d}, data type is %s, inverse quantization to fp32\n", batch, width, type_name); + } + } + + for (int n = 0; n < batch; n++) + { + for (int w = 0; w < width; w++) + { + int offset = 0; + + offset += n * width; + offset += w; + + print_tensor_data_value_timvx(file, tensor, offset); + } + fprintf(file, "\n"); + } + + break; + } + case 1: + { + int width = tensor->dims[0]; + + fprintf(file, "Shape is {%d}, data type is fp32\n", width); + + for (int w = 0; w < width; w++) + { + print_tensor_data_value_timvx(file, tensor, w); + } + + break; + } + default: + printf("Input dimension %d not to be supported.\n", tensor->dim_num); + } +} + +char* replace_string_character_timvx(char* src_str, char* dst_str, char* target_char, char* replaced_char) +{ + char* p; + char* _out = dst_str; + char* _str = src_str; + char* _src = target_char; + char* _dst = replaced_char; + size_t src_size = strlen(_src); + size_t dst_size = strlen(_dst); + size_t len = 0; + + do + { + p = strstr(_str, _src); + if (p == 0) + { + strcpy(_out, _str); + return dst_str; + } + len = p - _str; + memcpy(_out, _str, len); + memcpy(_out + len, _dst, dst_size); + _str = p + src_size; + _out = _out + len + dst_size; + } while (p); + + return dst_str; +} + +void extract_feature_from_tensor_timvx(const char* comment, const char* layer_name, const struct tensor* tensor) +{ + // 1. deal with saving path + char save_dir[256] = {'0'}; + + const char* env_path = getenv(TENGINE_DUMP_DIR); + + if (NULL != env_path && (256 - 2) > strlen(env_path)) + { + strcpy(save_dir, env_path); + + if ('/' == save_dir[strlen(env_path)] || '\\' == save_dir[strlen(env_path)]) + { +#ifdef _MSC_VER + save_dir[strlen(env_path)] = '\\'; + save_dir[strlen(env_path) + 1] = 0; +#else + save_dir[strlen(env_path)] = '/'; + save_dir[strlen(env_path) + 1] = 0; +#endif + } + } + else + { + // TLOG_WARNING("Tengine: Env var \"TENGINE_DUMP_DIR\" is too long(%d vs. 254). Using default path.\n", strlen(env_path)); + sprintf(save_dir, "./output/"); +#ifdef _MSC_VER + CreateDirectoryA(save_dir, NULL); +#else + int ret = mkdir(save_dir, S_IRWXU | S_IRGRP | S_IWGRP | S_IROTH); +// if (0 != ret) +// { +// TLOG_WARNING("Tengine: Create saving folder failed(%d), skip dump.\n", ret); +// return; +// } +#endif + } + + // 2. deal with layer name + char layer_short_name[64], layer_legal_name[64]; + + if (64 < strlen(layer_name)) + { + memcpy(layer_short_name, layer_name, 64 - 1); + layer_short_name[64 - 1] = 0; + } + else + { + strcpy(layer_short_name, layer_name); + } + + replace_string_character_timvx(layer_short_name, layer_legal_name, "/", "-"); + + // 3. join path + char output_file_path[512] = {'0'}; + + if (strlen(layer_legal_name) + strlen(save_dir) + strlen(comment) > 256 - 16) + { + TLOG_WARNING("Tengine: Name of saving file is too long(%d vs. %d), skip dump.\n", strlen(layer_legal_name) + strlen(save_dir) + strlen(comment), 256 - 16); + return; + } + + sprintf(output_file_path, "%s%s_%s_blob_data.txt", save_dir, layer_legal_name, comment); + + FILE* file = fopen(output_file_path, "w"); + if (NULL == file) + { + fprintf(stderr, "Tengine: Open file(%s) failed, skip dump\n", output_file_path); + return; + } + + print_tensor_data_to_file_timvx(file, tensor); + + // close file + fclose(file); + file = NULL; +} + +void dump_sub_graph_timvx(struct subgraph* sub_graph) +{ + TLOG_INFO("Sub graph[%d]: {%8s } has %d nodes, %d input tensors, %d output tensors.\n", sub_graph->index, sub_graph->device->name, sub_graph->node_num, sub_graph->input_num, sub_graph->output_num); + TLOG_INFO("\tSub nodes: [ "); + + for (int j = 0; j < sub_graph->node_num - 1; j++) + { + int node_id = sub_graph->node_list[j]; + TLOG_INFO("%d, ", node_id); + } + TLOG_INFO("%d ].\n", sub_graph->node_list[sub_graph->node_num - 1]); + + TLOG_INFO("\tSub input tensors: [ "); + for (int j = 0; j < sub_graph->input_num - 1; j++) + { + int tensor_id = sub_graph->input_tensor_list[j]; + TLOG_INFO("%d, ", tensor_id); + } + TLOG_INFO("%d ].\n", sub_graph->input_tensor_list[sub_graph->input_num - 1]); + + TLOG_INFO("\tSub output tensors: [ "); + for (int j = 0; j < sub_graph->output_num - 1; j++) + { + int tensor_id = sub_graph->output_tensor_list[j]; + TLOG_INFO("%d, ", tensor_id); + } + TLOG_INFO("%d ].\n", sub_graph->output_tensor_list[sub_graph->output_num - 1]); } \ No newline at end of file diff --git a/source/device/tim-vx/timvx_dump.h b/source/device/tim-vx/timvx_dump.h index 8c9a607d3..7a9f1778b 100644 --- a/source/device/tim-vx/timvx_dump.h +++ b/source/device/tim-vx/timvx_dump.h @@ -1,35 +1,35 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2021, OPEN AI LAB - * Author: lswang@openailab.com - */ - -#pragma once - -struct tensor; -struct subgraph; - -#define TENGINE_DUMP_DIR "TG_DEBUG_DUMP_DIR" -#define TENGINE_DUMP_LAYER "TG_DEBUG_DATA" - -void extract_feature_from_tensor_timvx(const char* comment, const char* layer_name, const struct tensor* tensor); - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: lswang@openailab.com + */ + +#pragma once + +struct tensor; +struct subgraph; + +#define TENGINE_DUMP_DIR "TG_DEBUG_DUMP_DIR" +#define TENGINE_DUMP_LAYER "TG_DEBUG_DATA" + +void extract_feature_from_tensor_timvx(const char* comment, const char* layer_name, const struct tensor* tensor); + void dump_sub_graph_timvx(struct subgraph* sub_graph); \ No newline at end of file diff --git a/source/device/tim-vx/timvx_executor.hpp b/source/device/tim-vx/timvx_executor.hpp index bdb1f8d4c..faedb1529 100644 --- a/source/device/tim-vx/timvx_executor.hpp +++ b/source/device/tim-vx/timvx_executor.hpp @@ -24,8 +24,7 @@ #pragma once -extern "C" -{ +extern "C" { #include "device/device.h" #include "graph/tensor.h" #include "graph/node.h" @@ -45,7 +44,6 @@ extern "C" #include #include - #include "convolution_param.h" #include "tim/vx/tensor.h" @@ -74,20 +72,18 @@ extern "C" #include "tim/vx/ops/split.h" #include "tim/vx/ops/transpose.h" -#define SPEC_TYPE_CONV 1 -#define SPEC_TYPE_CONV_BIAS 2 -#define SPEC_TYPE_DWCONV 3 -#define SPEC_TYPE_INTERP 4 -#define SPEC_TYPE_OUTPUT 5 -#define SPEC_TYPE_PRELU 6 -#define SPEC_TYPE_SLICE 7 -#define SPEC_TYPE_RESHAPE 8 -#define SPEC_TYPE_INPUT 9 - - -typedef std::map> dict_irt2vxt; -typedef std::map> dict_irt2vxo; +#define SPEC_TYPE_CONV 1 +#define SPEC_TYPE_CONV_BIAS 2 +#define SPEC_TYPE_DWCONV 3 +#define SPEC_TYPE_INTERP 4 +#define SPEC_TYPE_OUTPUT 5 +#define SPEC_TYPE_PRELU 6 +#define SPEC_TYPE_SLICE 7 +#define SPEC_TYPE_RESHAPE 8 +#define SPEC_TYPE_INPUT 9 +typedef std::map > dict_irt2vxt; +typedef std::map > dict_irt2vxo; class VXEngine { @@ -136,15 +132,13 @@ class VXEngine bool AddTransposeNode(struct node* ir_node); bool AddUpsampleNode(struct node* ir_node); - public: std::shared_ptr context; std::shared_ptr graph; std::shared_ptr ops; std::vector nbg_buffer; - private: - dict_irt2vxt vx_tensor_map; - dict_irt2vxo vx_node_map; + dict_irt2vxt vx_tensor_map; + dict_irt2vxo vx_node_map; }; diff --git a/source/device/tim-vx/timvx_graph.hpp b/source/device/tim-vx/timvx_graph.hpp index defaf4cd8..156fc7a1e 100644 --- a/source/device/tim-vx/timvx_graph.hpp +++ b/source/device/tim-vx/timvx_graph.hpp @@ -24,12 +24,10 @@ #pragma once -extern "C" -{ +extern "C" { #include "device/device.h" #include "graph/subgraph.h" - int timvx_dev_init(struct device* dev); int timvx_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options); int timvx_dev_run(struct device* dev, struct subgraph* subgraph); diff --git a/source/device/tim-vx/timvx_limit.hpp b/source/device/tim-vx/timvx_limit.hpp index acb2e6bb4..ba521dd12 100644 --- a/source/device/tim-vx/timvx_limit.hpp +++ b/source/device/tim-vx/timvx_limit.hpp @@ -24,113 +24,111 @@ #pragma once -extern "C" -{ +extern "C" { #include "operator/op.h" } - const int timvx_supported_ops[] = { -// OP_GENERIC, -// OP_ABSVAL, -// OP_ADD_N, -// OP_ARGMAX, -// OP_ARGMIN, + // OP_GENERIC, + // OP_ABSVAL, + // OP_ADD_N, + // OP_ARGMAX, + // OP_ARGMIN, OP_BATCHNORM, -// OP_BATCHTOSPACEND, -// OP_BIAS, -// OP_BROADMUL, -// OP_CAST, -// OP_CEIL, + // OP_BATCHTOSPACEND, + // OP_BIAS, + // OP_BROADMUL, + // OP_CAST, + // OP_CEIL, OP_CLIP, -// OP_COMPARISON, + // OP_COMPARISON, OP_CONCAT, OP_CONST, OP_CONV, -// OP_CROP, + // OP_CROP, OP_DECONV, OP_DEPTHTOSPACE, -// OP_DETECTION_OUTPUT, -// OP_DETECTION_POSTPROCESS, + // OP_DETECTION_OUTPUT, + // OP_DETECTION_POSTPROCESS, OP_DROPOUT, OP_ELTWISE, OP_ELU, -// OP_EMBEDDING, -// OP_EXPANDDIMS, + // OP_EMBEDDING, + // OP_EXPANDDIMS, OP_FC, OP_FLATTEN, OP_GATHER, -// OP_GEMM, -// OP_GRU, -// OP_HARDSIGMOID, + // OP_GEMM, + // OP_GRU, + // OP_HARDSIGMOID, OP_HARDSWISH, OP_INPUT, OP_INSTANCENORM, OP_INTERP, -// OP_LOGICAL, -// OP_LOGISTIC, -// OP_LRN, -// OP_LSTM, -// OP_MATMUL, -// OP_MAXIMUM, -// OP_MEAN, -// OP_MINIMUM, -// OP_MVN, -// OP_NOOP, -// OP_NORMALIZE, -// OP_PAD, + // OP_LOGICAL, + // OP_LOGISTIC, + // OP_LRN, + // OP_LSTM, + // OP_MATMUL, + // OP_MAXIMUM, + // OP_MEAN, + // OP_MINIMUM, + // OP_MVN, + // OP_NOOP, + // OP_NORMALIZE, + // OP_PAD, OP_PERMUTE, OP_POOL, OP_PRELU, -// OP_PRIORBOX, -// OP_PSROIPOOLING, -// OP_REDUCEL2, -// OP_REDUCTION, -// OP_REGION, + // OP_PRIORBOX, + // OP_PSROIPOOLING, + // OP_REDUCEL2, + // OP_REDUCTION, + // OP_REGION, OP_RELU, OP_RELU6, -// OP_REORG, + // OP_REORG, OP_RESHAPE, OP_RESIZE, -// OP_REVERSE, -// OP_RNN, -// OP_ROIALIGN, -// OP_ROIPOOLING, -// OP_ROUND, -// OP_RPN, + // OP_REVERSE, + // OP_RNN, + // OP_ROIALIGN, + // OP_ROIPOOLING, + // OP_ROUND, + // OP_RPN, OP_SCALE, -// OP_SELU, -// OP_SHUFFLECHANNEL, + // OP_SELU, + // OP_SHUFFLECHANNEL, OP_SIGMOID, OP_SLICE, OP_SOFTMAX, -// OP_SPACETOBATCHND, + // OP_SPACETOBATCHND, OP_SPACETODEPTH, -// OP_SPARSETODENSE, + // OP_SPARSETODENSE, OP_SPLIT, -// OP_SQUAREDDIFFERENCE, -// OP_SQUEEZE, -// OP_STRIDED_SLICE, -// OP_SWAP_AXIS, + // OP_SQUAREDDIFFERENCE, + // OP_SQUEEZE, + // OP_STRIDED_SLICE, + // OP_SWAP_AXIS, OP_TANH, -// OP_THRESHOLD, -// OP_TOPKV2, + // OP_THRESHOLD, + // OP_TOPKV2, OP_TRANSPOSE, -// OP_UNARY, -// OP_UNSQUEEZE, + // OP_UNARY, + // OP_UNSQUEEZE, OP_UPSAMPLE, -// OP_ZEROSLIKE, + // OP_ZEROSLIKE, OP_MISH, -// OP_LOGSOFTMAX, -// OP_RELU1, -// OP_L2NORMALIZATION, -// OP_L2POOL, -// OP_TILE, -// OP_SHAPE, -// OP_SCATTER, -// OP_WHERE, -// OP_SOFTPLUS, -// OP_RECIPROCAL, -// OP_BUILTIN_LAST + // OP_LOGSOFTMAX, + // OP_RELU1, + // OP_L2NORMALIZATION, + // OP_L2POOL, + // OP_TILE, + // OP_SHAPE, + // OP_SCATTER, + // OP_WHERE, + // OP_SOFTPLUS, + // OP_RECIPROCAL, + // OP_BUILTIN_LAST }; diff --git a/source/device/vulkan/layer/concat_vulkan.cpp b/source/device/vulkan/layer/concat_vulkan.cpp index 926e7b19a..99357ba52 100644 --- a/source/device/vulkan/layer/concat_vulkan.cpp +++ b/source/device/vulkan/layer/concat_vulkan.cpp @@ -82,52 +82,58 @@ Concat_vulkan::Concat_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) graph = ir_graph; node = ir_node; - for(int i = 0; i < ir_node->input_num; i++) + for (int i = 0; i < ir_node->input_num; i++) { - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[i]); std::string name = input->name; bottoms.push_back(name); } - for(int i = 0; i < ir_node->output_num; i++) + for (int i = 0; i < ir_node->output_num; i++) { - struct tensor *output = get_ir_graph_tensor(graph, node->input_tensors[i]); + struct tensor* output = get_ir_graph_tensor(graph, node->input_tensors[i]); std::string name = output->name; tops.push_back(name); } // params - struct tensor *input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); - struct tensor *output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); - input_c = input_tensor->dims[1]; // param->input_channel; + struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); + input_c = input_tensor->dims[1]; // param->input_channel; input_h = input_tensor->dims[2]; input_w = input_tensor->dims[3]; - output_c = output_tensor->dims[1]; // param->output_channel; + output_c = output_tensor->dims[1]; // param->output_channel; output_h = output_tensor->dims[2]; output_w = output_tensor->dims[3]; - struct concat_param *param = (struct concat_param *)ir_node->op.param_mem; - axis = param->axis -1; + struct concat_param* param = (struct concat_param*)ir_node->op.param_mem; + axis = param->axis - 1; } int Concat_vulkan::create_pipeline(const Option& _opt) { Option opt = _opt; - const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; int out_elempack = 1; - if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; - if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; - if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 + : 1; + if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 + : 1; + if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 + : 1; int elempack = 1; if (axis == 0) { - if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; - if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; - if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 + : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 + : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 + : 1; // TODO fix other input data shape to set elempack // for (size_t b = 1; b < bottom_shapes.size(); b++) @@ -328,7 +334,8 @@ int Concat_vulkan::record_pipeline(const std::vector& bottom_blobs, st top_w += bottom_blob.w * bottom_blob.elempack; } - int out_elempack = opt.use_shader_pack8 && top_w % 8 == 0 ? 8 : top_w % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && top_w % 8 == 0 ? 8 : top_w % 4 == 0 ? 4 + : 1; size_t out_elemsize = elemsize / elempack * out_elempack; if (opt.use_fp16_packed && !opt.use_fp16_storage) @@ -430,7 +437,8 @@ int Concat_vulkan::record_pipeline(const std::vector& bottom_blobs, st top_h += bottom_blob.h * bottom_blob.elempack; } - int out_elempack = opt.use_shader_pack8 && top_h % 8 == 0 ? 8 : top_h % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && top_h % 8 == 0 ? 8 : top_h % 4 == 0 ? 4 + : 1; size_t out_elemsize = elemsize / elempack * out_elempack; if (opt.use_fp16_packed && !opt.use_fp16_storage) @@ -557,9 +565,9 @@ int Concat_vulkan::record_pipeline(const std::vector& bottom_blobs, st constants[9].i = top_blob.cstep; constants[10].i = woffset; - const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2] + const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2] : elempack == 4 ? pipeline_concat_pack4[b % 2] - : pipeline_concat[b % 2]; + : pipeline_concat[b % 2]; cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); @@ -587,7 +595,8 @@ int Concat_vulkan::record_pipeline(const std::vector& bottom_blobs, st top_channels += bottom_blob.c * bottom_blob.elempack; } - int out_elempack = opt.use_shader_pack8 && top_channels % 8 == 0 ? 8 : top_channels % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && top_channels % 8 == 0 ? 8 : top_channels % 4 == 0 ? 4 + : 1; size_t out_elemsize = elemsize / elempack * out_elempack; if (opt.use_fp16_packed && !opt.use_fp16_storage) @@ -715,9 +724,9 @@ int Concat_vulkan::record_pipeline(const std::vector& bottom_blobs, st constants[9].i = top_blob.cstep; constants[10].i = hoffset; - const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2] + const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2] : elempack == 4 ? pipeline_concat_pack4[b % 2] - : pipeline_concat[b % 2]; + : pipeline_concat[b % 2]; cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); @@ -770,9 +779,9 @@ int Concat_vulkan::record_pipeline(const std::vector& bottom_blobs, st constants[9].i = top_blob.cstep; constants[10].i = woffset; - const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2] + const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2] : elempack == 4 ? pipeline_concat_pack4[b % 2] - : pipeline_concat[b % 2]; + : pipeline_concat[b % 2]; cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); @@ -785,4 +794,4 @@ int Concat_vulkan::record_pipeline(const std::vector& bottom_blobs, st return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/concat_vulkan.hpp b/source/device/vulkan/layer/concat_vulkan.hpp index 6476fc997..b03d8efe6 100644 --- a/source/device/vulkan/layer/concat_vulkan.hpp +++ b/source/device/vulkan/layer/concat_vulkan.hpp @@ -45,7 +45,7 @@ #include "concat_param.h" -namespace TEngine{ +namespace TEngine { class Concat_vulkan : public Layer { @@ -55,7 +55,7 @@ class Concat_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); - + virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; public: @@ -76,6 +76,6 @@ class Concat_vulkan : public Layer int axis; }; -} // namespace TEngine +} // namespace TEngine #endif \ No newline at end of file diff --git a/source/device/vulkan/layer/convolution_vulkan.cpp b/source/device/vulkan/layer/convolution_vulkan.cpp index 5f135feba..d1c7335b6 100644 --- a/source/device/vulkan/layer/convolution_vulkan.cpp +++ b/source/device/vulkan/layer/convolution_vulkan.cpp @@ -70,27 +70,27 @@ Convolution_vulkan::Convolution_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) graph = ir_graph; node = ir_node; - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); std::string name = input->name; bottoms.push_back(name); // Tensor* output_tensor = t_node->GetOutputTensor(0); - struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); name = output->name; tops.push_back(name); // Convolution* conv_op = dynamic_cast(node->GetOp()); // ConvParam* param = conv_op->GetParam(); - struct conv_param *param = (struct conv_param *)ir_node->op.param_mem; + struct conv_param* param = (struct conv_param*)ir_node->op.param_mem; group = param->group; - input_c = input->dims[1]; // param->input_channel; + input_c = input->dims[1]; // param->input_channel; input_h = input->dims[2]; input_w = input->dims[3]; - pad_w0 = param->pad_w0; // left padding columns - pad_w1 = param->pad_w1; // right padding columns - pad_h0 = param->pad_h0; // top padding rows - pad_h1 = param->pad_h1; // bottom padding rows + pad_w0 = param->pad_w0; // left padding columns + pad_w1 = param->pad_w1; // right padding columns + pad_h0 = param->pad_h0; // top padding rows + pad_h1 = param->pad_h1; // bottom padding rows stride_w = param->stride_w; stride_h = param->stride_h; dilation_w = param->dilation_w; @@ -98,10 +98,10 @@ Convolution_vulkan::Convolution_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) kernel_w = param->kernel_w; kernel_h = param->kernel_h; activation = param->activation == 0 ? 1 : -1; - output_c = output->dims[1]; // param->output_channel; + output_c = output->dims[1]; // param->output_channel; output_h = output->dims[2]; output_w = output->dims[3]; - struct tensor *weight = get_ir_graph_tensor(graph, node->input_tensors[1]); + struct tensor* weight = get_ir_graph_tensor(graph, node->input_tensors[1]); weight_data_size = weight->elem_num; } @@ -150,7 +150,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) shape_bordered = Tshape(shape.w + pad_left + pad_right, shape.h + pad_top + pad_bottom, shape.c); } else if ((pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) - || (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)) + || (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)) { const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; @@ -168,8 +168,10 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } } - int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; - int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 + : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 + : 1; size_t elemsize; size_t out_elemsize; @@ -234,28 +236,28 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) padding->create_pipeline(opt); } - + std::vector specializations(10 + 10); - specializations[0].i = kernel_w; // kernel_w; - specializations[1].i = kernel_h; // kernel_h - specializations[2].i = dilation_w; // dilation_w; - specializations[3].i = dilation_h; // dilation_h; - specializations[4].i = stride_w; // stride_w; - specializations[5].i = stride_h; // stride_h; - specializations[6].i = node->input_num>2 ? 1 : 0; // bias_term; - specializations[7].i = activation; // activation_type; - specializations[8].f = 0;//param->activation; // activation_params.w >= 1 ? activation_params[0] : 0.f; - specializations[9].f = 0;//param->activation; // activation_params.w == 2 ? activation_params[1] : 0.f; - specializations[10 + 0].i = 0;//3; // shape_bordered_packed.dims; - specializations[10 + 1].i = 0;//input_w + pad_w0 + pad_w1; // shape_bordered_packed.w; - specializations[10 + 2].i = 0;//input_h + pad_h0 + pad_h1; // shape_bordered_packed.h; - specializations[10 + 3].i = 0;//input_c; // shape_bordered_packed.c; - specializations[10 + 4].i = 0;//(input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1); // shape_bordered_packed.cstep; - specializations[10 + 5].i = 0; // out_shape_packed.dims; - specializations[10 + 6].i = 0;//output_w; // out_shape_packed.w; - specializations[10 + 7].i = 0;//output_h; // out_shape_packed.h; - specializations[10 + 8].i = 0;//output_c; // out_shape_packed.c; - specializations[10 + 9].i = 0;//output_w * output_h; // out_shape_packed.cstep; + specializations[0].i = kernel_w; // kernel_w; + specializations[1].i = kernel_h; // kernel_h + specializations[2].i = dilation_w; // dilation_w; + specializations[3].i = dilation_h; // dilation_h; + specializations[4].i = stride_w; // stride_w; + specializations[5].i = stride_h; // stride_h; + specializations[6].i = node->input_num > 2 ? 1 : 0; // bias_term; + specializations[7].i = activation; // activation_type; + specializations[8].f = 0; //param->activation; // activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[9].f = 0; //param->activation; // activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[10 + 0].i = 0; //3; // shape_bordered_packed.dims; + specializations[10 + 1].i = 0; //input_w + pad_w0 + pad_w1; // shape_bordered_packed.w; + specializations[10 + 2].i = 0; //input_h + pad_h0 + pad_h1; // shape_bordered_packed.h; + specializations[10 + 3].i = 0; //input_c; // shape_bordered_packed.c; + specializations[10 + 4].i = 0; //(input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1); // shape_bordered_packed.cstep; + specializations[10 + 5].i = 0; // out_shape_packed.dims; + specializations[10 + 6].i = 0; //output_w; // out_shape_packed.w; + specializations[10 + 7].i = 0; //output_h; // out_shape_packed.h; + specializations[10 + 8].i = 0; //output_c; // out_shape_packed.c; + specializations[10 + 9].i = 0; //output_w * output_h; // out_shape_packed.cstep; // TODO with local_size_xyz and shader_index options @@ -263,9 +265,8 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) local_size_xyz.w = std::min(8, out_shape_packed.w); local_size_xyz.h = std::min(8, out_shape_packed.h); local_size_xyz.c = std::min(4, out_shape_packed.c); - - // TLOG_INFO("create pipeline elempack out_elempack:%d %d\n", elempack, out_elempack); + // TLOG_INFO("create pipeline elempack out_elempack:%d %d\n", elempack, out_elempack); if (elempack == 1 && out_elempack == 1) { @@ -384,7 +385,7 @@ int Convolution_vulkan::destroy_pipeline(const Option& /*opt*/) } int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) -{ +{ tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); // Tensor weight_data = Tensor(weight_tensor->elem_num, 1, 1, weight_tensor->data); @@ -399,9 +400,11 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) int num_output = output_c; int num_input = input_c; //weight_data_size / maxk / num_output; - int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 + : 1; // int elempack = 1; - int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 + : 1; // TLOG_INFO("conv upload model pack:%d %d\n", elempack, out_elempack); @@ -409,25 +412,24 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) { Tensor weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); - weight_data_packed.create(maxk, num_input/elempack, num_output/out_elempack, (size_t)4*elempack*out_elempack, elempack*out_elempack); - for (int q=0; q+(out_elempack-1)input_num > 2) + if (node->input_num > 2) { tensor* bias_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]); Tensor bias_data = Tensor(bias_tensor->elem_num, bias_tensor->data); @@ -470,7 +472,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) { cmd.record_upload(bias_data_packed, bias_data_gpu, opt); } - } // if (innerproduct) @@ -492,7 +493,7 @@ int Convolution_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& t bottom_blob_dim3.w = 1; bottom_blob_dim3.cstep = 1; } - + int w = bottom_blob_dim3.w; int h = bottom_blob_dim3.h; int channels = bottom_blob_dim3.c; @@ -500,7 +501,8 @@ int Convolution_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& t int elempack = bottom_blob_dim3.elempack; // TLOG_INFO("botom shape:%d %d %d %d %d %d %d\n", bottom_blob.dims, bottom_blob.c, bottom_blob.h, bottom_blob.w, bottom_blob.elemsize, bottom_blob.elempack, bottom_blob.cstep); - int out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4 + : 1; size_t out_elemsize = elemsize / elempack * out_elempack; VkTensor bottom_blob_bordered = bottom_blob_dim3; @@ -551,7 +553,7 @@ int Convolution_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& t dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; dispatcher.h = 1; dispatcher.c = top_blob.c; - + cmd.record_pipeline(pipeline_convolution_pack4_1x1s1d1, bindings, constants, dispatcher); } else if (elempack == 8 && out_elempack == 8 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) @@ -609,7 +611,7 @@ int Convolution_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& t // TLOG_INFO("top shape:%d %d %d\n", top_blob.c, top_blob.h, top_blob.w); // cmd.record_pipeline(pipeline_convolution, bindings, constants, top_blob); - // TLOG_INFO("run record convolution\n"); + // TLOG_INFO("run record convolution\n"); return 0; } diff --git a/source/device/vulkan/layer/convolution_vulkan.hpp b/source/device/vulkan/layer/convolution_vulkan.hpp index a1e7c1ad8..c0799f877 100644 --- a/source/device/vulkan/layer/convolution_vulkan.hpp +++ b/source/device/vulkan/layer/convolution_vulkan.hpp @@ -63,16 +63,15 @@ class Convolution_vulkan : public Layer // virtual int record_pipeline(VkCompute& cmd, const Option& opt) const; virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; - public: int group; int input_c; int input_h; int input_w; - int pad_w0; // left padding columns - int pad_w1; // right padding columns - int pad_h0; // top padding rows - int pad_h1; // bottom padding rows + int pad_w0; // left padding columns + int pad_w1; // right padding columns + int pad_h0; // top padding rows + int pad_h1; // bottom padding rows int stride_h; int stride_w; int dilation_h; @@ -111,5 +110,4 @@ class Convolution_vulkan : public Layer } // namespace TEngine - #endif diff --git a/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp b/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp index bc950cf38..51f83b773 100644 --- a/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp +++ b/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp @@ -42,52 +42,52 @@ namespace TEngine { - ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan() - { - support_vulkan = true; - pipeline_convolutiondepthwise = 0; - } +ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan() +{ + support_vulkan = true; + pipeline_convolutiondepthwise = 0; +} - ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) - { - support_vulkan = true; +ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; - padding = 0; + padding = 0; - pipeline_convolutiondepthwise = 0; - pipeline_convolutiondepthwise_pack4 = 0; - pipeline_convolutiondepthwise_pack8 = 0; - graph = ir_graph; - node = ir_node; - - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); - std::string name = input->name; - bottoms.push_back(name); - - struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); - name = output->name; - tops.push_back(name); - - struct conv_param *param = (struct conv_param *)ir_node->op.param_mem; - - group = param->group; - input_c = input->dims[1]; // param->input_channel; - input_h = input->dims[2]; - input_w = input->dims[3]; - pad_w0 = param->pad_w0; // left padding columns - pad_w1 = param->pad_w1; // right padding columns - pad_h0 = param->pad_h0; // top padding rows - pad_h1 = param->pad_h1; // bottom padding rows - stride_w = param->stride_w; - stride_h = param->stride_h; - dilation_w = param->dilation_w; - dilation_h = param->dilation_h; - kernel_w = param->kernel_w; - kernel_h = param->kernel_h; - output_c = output->dims[1]; // param->output_channel; - output_h = output->dims[2]; - output_w = output->dims[3]; - } + pipeline_convolutiondepthwise = 0; + pipeline_convolutiondepthwise_pack4 = 0; + pipeline_convolutiondepthwise_pack8 = 0; + graph = ir_graph; + node = ir_node; + + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + struct conv_param* param = (struct conv_param*)ir_node->op.param_mem; + + group = param->group; + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + pad_w0 = param->pad_w0; // left padding columns + pad_w1 = param->pad_w1; // right padding columns + pad_h0 = param->pad_h0; // top padding rows + pad_h1 = param->pad_h1; // bottom padding rows + stride_w = param->stride_w; + stride_h = param->stride_h; + dilation_w = param->dilation_w; + dilation_h = param->dilation_h; + kernel_w = param->kernel_w; + kernel_h = param->kernel_h; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; +} int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) { @@ -114,13 +114,14 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) padding->create_pipeline(opt); } - // const int maxk = kernel_w * kernel_h; int channels = input_c; // (weight_data_size / group) / maxk / (num_output / group) * group; int num_output = output_c; - int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1; - int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 + : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 + : 1; size_t elemsize; size_t out_elemsize; @@ -141,27 +142,27 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) } std::vector specializations(11 + 10); - specializations[0].i = kernel_w; // kernel_w; - specializations[1].i = kernel_h; // kernel_h - specializations[2].i = dilation_w; // dilation_w; - specializations[3].i = dilation_h; // dilation_h; - specializations[4].i = stride_w; // stride_w; - specializations[5].i = stride_h; // stride_h; - specializations[6].i = node->input_num >2 ? 1 : 0; // bias_term; + specializations[0].i = kernel_w; // kernel_w; + specializations[1].i = kernel_h; // kernel_h + specializations[2].i = dilation_w; // dilation_w; + specializations[3].i = dilation_h; // dilation_h; + specializations[4].i = stride_w; // stride_w; + specializations[5].i = stride_h; // stride_h; + specializations[6].i = node->input_num > 2 ? 1 : 0; // bias_term; specializations[7].i = group; - specializations[8].i = 1;//param->activation; // activation_type; - specializations[9].f = 0;//param->activation; // activation_params.w >= 1 ? activation_params[0] : 0.f; - specializations[10].f = 0;//param->activation; // activation_params.w == 2 ? activation_params[1] : 0.f; - specializations[11 + 0].i = 0; // 3; // shape_bordered_packed.dims; - specializations[11 + 1].i = 0; // input_w + pad_w0 + pad_w1; // shape_bordered_packed.w; - specializations[11 + 2].i = 0; // input_h + pad_h0 + pad_h1; // shape_bordered_packed.h; - specializations[11 + 3].i = 0; // input_c; // shape_bordered_packed.c; - specializations[11 + 4].i = 0; // (input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1); // shape_bordered_packed.cstep; - specializations[11 + 5].i = 0; // 3; // out_shape_packed.dims; - specializations[11 + 6].i = 0; // output_w; // out_shape_packed.w; - specializations[11 + 7].i = 0; // output_h; // out_shape_packed.h; - specializations[11 + 8].i = 0; // output_c; // out_shape_packed.c; - specializations[11 + 9].i = 0; // output_w * output_h; // out_shape_packed.cstep; + specializations[8].i = 1; //param->activation; // activation_type; + specializations[9].f = 0; //param->activation; // activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[10].f = 0; //param->activation; // activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[11 + 0].i = 0; // 3; // shape_bordered_packed.dims; + specializations[11 + 1].i = 0; // input_w + pad_w0 + pad_w1; // shape_bordered_packed.w; + specializations[11 + 2].i = 0; // input_h + pad_h0 + pad_h1; // shape_bordered_packed.h; + specializations[11 + 3].i = 0; // input_c; // shape_bordered_packed.c; + specializations[11 + 4].i = 0; // (input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1); // shape_bordered_packed.cstep; + specializations[11 + 5].i = 0; // 3; // out_shape_packed.dims; + specializations[11 + 6].i = 0; // output_w; // out_shape_packed.w; + specializations[11 + 7].i = 0; // output_h; // out_shape_packed.h; + specializations[11 + 8].i = 0; // output_c; // out_shape_packed.c; + specializations[11 + 9].i = 0; // output_w * output_h; // out_shape_packed.cstep; VkTensor local_size_xyz; local_size_xyz.w = std::min(4, output_w); @@ -217,14 +218,15 @@ int ConvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt) int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt) { - // upload kernel data + // upload kernel data const int maxk = kernel_w * kernel_h; int channels = input_c; // (weight_data_size / group) / maxk / (num_output / group) * group; int num_output = output_c; - int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1; - int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; - + int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 + : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 + : 1; tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); Tensor weight_data = Tensor(weight_tensor->elem_num, weight_tensor->data); @@ -236,13 +238,13 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt cmd.record_upload(weight_data_packed, weight_data_gpu, opt); // upload bias data - if(node->input_num > 2) + if (node->input_num > 2) { tensor* bias_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]); Tensor bias_data = Tensor(bias_tensor->elem_num, bias_tensor->data); Tensor bias_data_packed; convert_packing(bias_data, bias_data_packed, out_elempack); - cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); } return 0; } @@ -255,7 +257,6 @@ int ConvolutionDepthWise_vulkan::record_pipeline(const VkTensor& bottom_blob, Vk size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; - VkTensor bottom_blob_bordered = bottom_blob; if (pad_h0 > 0 || pad_h1 > 0 || pad_w0 > 0 || pad_w1 > 0) { @@ -268,7 +269,7 @@ int ConvolutionDepthWise_vulkan::record_pipeline(const VkTensor& bottom_blob, Vk padding->record_pipeline(bottom_blob, bottom_blob_bordered, cmd, opt_pad); } - top_blob.create(output_w, output_h, output_c/elempack, elemsize, elempack, opt.blob_vkallocator); + top_blob.create(output_w, output_h, output_c / elempack, elemsize, elempack, opt.blob_vkallocator); std::vector bindings(4); bindings[0] = bottom_blob_bordered; @@ -289,13 +290,13 @@ int ConvolutionDepthWise_vulkan::record_pipeline(const VkTensor& bottom_blob, Vk constants[9].i = top_blob.cstep; // printf("top shape:%d %d %d\n", top_blob.c, top_blob.h, top_blob.w); - const Pipeline* pipeline = elempack == 8 ? pipeline_convolutiondepthwise_pack8 - : elempack == 4 ? pipeline_convolutiondepthwise_pack4 - : pipeline_convolutiondepthwise; + const Pipeline* pipeline = elempack == 8 ? pipeline_convolutiondepthwise_pack8 + : elempack == 4 ? pipeline_convolutiondepthwise_pack4 + : pipeline_convolutiondepthwise; cmd.record_pipeline(pipeline, bindings, constants, top_blob); return 0; } -} \ No newline at end of file +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp b/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp index 05f78f22c..7b867529b 100644 --- a/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp +++ b/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp @@ -65,10 +65,10 @@ class ConvolutionDepthWise_vulkan : public Layer int input_c; int input_h; int input_w; - int pad_w0; // left padding columns - int pad_w1; // right padding columns - int pad_h0; // top padding rows - int pad_h1; // bottom padding rows + int pad_w0; // left padding columns + int pad_w1; // right padding columns + int pad_h0; // top padding rows + int pad_h1; // bottom padding rows int stride_h; int stride_w; int dilation_h; @@ -92,5 +92,4 @@ class ConvolutionDepthWise_vulkan : public Layer } // namespace TEngine - #endif diff --git a/source/device/vulkan/layer/crop_vulkan.cpp b/source/device/vulkan/layer/crop_vulkan.cpp index 26f8768e8..d00325e34 100644 --- a/source/device/vulkan/layer/crop_vulkan.cpp +++ b/source/device/vulkan/layer/crop_vulkan.cpp @@ -76,36 +76,36 @@ Crop_vulkan::Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) graph = ir_graph; node = ir_node; - for(int i = 0; i < ir_node->input_num; i++) + for (int i = 0; i < ir_node->input_num; i++) { - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[i]); std::string name = input->name; bottoms.push_back(name); } - for(int i = 0; i < ir_node->output_num; i++) + for (int i = 0; i < ir_node->output_num; i++) { - struct tensor *output = get_ir_graph_tensor(graph, node->input_tensors[i]); + struct tensor* output = get_ir_graph_tensor(graph, node->input_tensors[i]); std::string name = output->name; tops.push_back(name); } // params - struct tensor *input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); - struct tensor *output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); - input_c = input_tensor->dims[1]; // param->input_channel; + struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); + input_c = input_tensor->dims[1]; // param->input_channel; input_h = input_tensor->dims[2]; input_w = input_tensor->dims[3]; - output_c = output_tensor->dims[1]; // param->output_channel; + output_c = output_tensor->dims[1]; // param->output_channel; output_h = output_tensor->dims[2]; output_w = output_tensor->dims[3]; - struct crop_param *param = (struct crop_param *)ir_node->op.param_mem; + struct crop_param* param = (struct crop_param*)ir_node->op.param_mem; int num_args = param->num_args; - int offset_c = 0; // param->offset_c; - int offset_h = 0; // param->offset_h; - int offset_w = 0; // param->offset_w; + int offset_c = 0; // param->offset_c; + int offset_h = 0; // param->offset_h; + int offset_w = 0; // param->offset_w; int crop_h = param->crop_h; int crop_w = param->crop_w; int center_crop = param->center_crop; @@ -117,27 +117,34 @@ int Crop_vulkan::create_pipeline(const Option& _opt) { Option opt = _opt; - const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; int elempack = 1; - if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; - if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; - if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 + : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 + : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 + : 1; int out_elempack = 1; - if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; - if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; - if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 + : 1; + if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 + : 1; + if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 + : 1; int offset_elempack = 1; - + { // TODO vec and image crop if (offset_c == 0) offset_elempack = elempack; else - offset_elempack = opt.use_shader_pack8 && offset_c % 8 == 0 ? 8 : offset_c % 4 == 0 ? 4 : 1; + offset_elempack = opt.use_shader_pack8 && offset_c % 8 == 0 ? 8 : offset_c % 4 == 0 ? 4 + : 1; } size_t elemsize; @@ -192,16 +199,16 @@ int Crop_vulkan::create_pipeline(const Option& _opt) std::vector specializations(1 + 10); specializations[0].i = vkdev->info.bug_implicit_fp16_arithmetic; - specializations[1 + 0].i = 0; // shape_unpacked.dims; - specializations[1 + 1].i = 0; // shape_unpacked.w; - specializations[1 + 2].i = 0; // shape_unpacked.h; - specializations[1 + 3].i = 0; // shape_unpacked.c; - specializations[1 + 4].i = 0; // shape_unpacked.cstep; - specializations[1 + 5].i = 0; // out_shape_packed.dims; - specializations[1 + 6].i = 0; // out_shape_packed.w; - specializations[1 + 7].i = 0; // out_shape_packed.h; - specializations[1 + 8].i = 0; // out_shape_packed.c; - specializations[1 + 9].i = 0; // out_shape_packed.cstep; + specializations[1 + 0].i = 0; // shape_unpacked.dims; + specializations[1 + 1].i = 0; // shape_unpacked.w; + specializations[1 + 2].i = 0; // shape_unpacked.h; + specializations[1 + 3].i = 0; // shape_unpacked.c; + specializations[1 + 4].i = 0; // shape_unpacked.cstep; + specializations[1 + 5].i = 0; // out_shape_packed.dims; + specializations[1 + 6].i = 0; // out_shape_packed.w; + specializations[1 + 7].i = 0; // out_shape_packed.h; + specializations[1 + 8].i = 0; // out_shape_packed.c; + specializations[1 + 9].i = 0; // out_shape_packed.cstep; Tensor local_size_xyz; if (out_shape_packed.dims == 1) @@ -295,7 +302,6 @@ int Crop_vulkan::create_pipeline(const Option& _opt) pipeline_crop_pack8to1->create(LayerShaderType::crop_pack8to1, opt, specializations); } - return 0; } @@ -357,9 +363,12 @@ int Crop_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob return 0; } - int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 : _coffset % 4 == 0 ? 4 : 1; + int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 + : _coffset % 4 == 0 ? 4 + : 1; - int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 + : 1; size_t out_elemsize = elemsize / elempack * out_elempack; if (opt.use_fp16_packed && !opt.use_fp16_storage) @@ -483,9 +492,9 @@ int Crop_vulkan::record_pipeline(const std::vector& bottom_blobs, std: _outw = output_w; _outh = output_h; _outc = output_c; - _woffset = 0; // offset_w; - _hoffset = 0; // offset_h; - _coffset = 0; // offset_c; + _woffset = 0; // offset_w; + _hoffset = 0; // offset_h; + _coffset = 0; // offset_c; // TODO vec and image crop @@ -497,9 +506,12 @@ int Crop_vulkan::record_pipeline(const std::vector& bottom_blobs, std: return 0; } - int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 : _coffset % 4 == 0 ? 4 : 1; + int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 + : _coffset % 4 == 0 ? 4 + : 1; - int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 + : 1; size_t out_elemsize = elemsize / elempack * out_elempack; if (opt.use_fp16_packed && !opt.use_fp16_storage) @@ -604,4 +616,4 @@ int Crop_vulkan::record_pipeline(const std::vector& bottom_blobs, std: return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/crop_vulkan.hpp b/source/device/vulkan/layer/crop_vulkan.hpp index 1a55f3ca1..2316f07c0 100644 --- a/source/device/vulkan/layer/crop_vulkan.hpp +++ b/source/device/vulkan/layer/crop_vulkan.hpp @@ -45,7 +45,7 @@ #include "crop_param.h" -namespace TEngine{ +namespace TEngine { class Crop_vulkan : public Layer { @@ -55,7 +55,7 @@ class Crop_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); - + void resolve_crop_roi(const Tensor& bottom_blob, int& _woffset, int& _hoffset, int& _coffset, int& _outw, int& _outh, int& _outc) const; virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; @@ -78,7 +78,7 @@ class Crop_vulkan : public Layer int output_c; int output_h; int output_w; - + int num_args; int offset_c; int offset_h; @@ -90,6 +90,6 @@ class Crop_vulkan : public Layer int flag; }; -} // namespace TEngine +} // namespace TEngine #endif \ No newline at end of file diff --git a/source/device/vulkan/layer/dropout_vulkan.cpp b/source/device/vulkan/layer/dropout_vulkan.cpp index a6c3e0724..bf46fa34c 100644 --- a/source/device/vulkan/layer/dropout_vulkan.cpp +++ b/source/device/vulkan/layer/dropout_vulkan.cpp @@ -64,26 +64,26 @@ Dropout_vulkan::Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) graph = ir_graph; node = ir_node; - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); std::string name = input->name; bottoms.push_back(name); - struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); name = output->name; tops.push_back(name); // params - input_c = input->dims[1]; // param->input_channel; + input_c = input->dims[1]; // param->input_channel; input_h = input->dims[2]; input_w = input->dims[3]; - output_c = output->dims[1]; // param->output_channel; + output_c = output->dims[1]; // param->output_channel; output_h = output->dims[2]; output_w = output->dims[3]; - if(input->scale != 0) + if (input->scale != 0) scale = input->scale; else - scale = 1.0f; + scale = 1.0f; } int Dropout_vulkan::create_pipeline(const Option& opt) @@ -91,9 +91,12 @@ int Dropout_vulkan::create_pipeline(const Option& opt) const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; int elempack = 1; - if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; - if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; - if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 + : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 + : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 + : 1; size_t elemsize; if (opt.use_fp16_storage) @@ -202,15 +205,13 @@ int Dropout_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c constants[3].i = bottom_top_blob.c; constants[4].i = bottom_top_blob.cstep; - const Pipeline* pipeline = elempack == 8 ? pipeline_dropout_pack8 + const Pipeline* pipeline = elempack == 8 ? pipeline_dropout_pack8 : elempack == 4 ? pipeline_dropout_pack4 - : pipeline_dropout; + : pipeline_dropout; cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); return 0; } - - -} // namespace TEngine \ No newline at end of file +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/dropout_vulkan.hpp b/source/device/vulkan/layer/dropout_vulkan.hpp index b6e943889..478345ca7 100644 --- a/source/device/vulkan/layer/dropout_vulkan.hpp +++ b/source/device/vulkan/layer/dropout_vulkan.hpp @@ -43,7 +43,7 @@ #include "../vulkan_layer.hpp" #include "../vulkan_command.hpp" -namespace TEngine{ +namespace TEngine { class Dropout_vulkan : public Layer { @@ -54,7 +54,7 @@ class Dropout_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); // virtual int upload_model(VkTransfer& cmd, const Option& opt); - + virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const; public: @@ -70,9 +70,8 @@ class Dropout_vulkan : public Layer int output_h; int output_w; float scale; - }; -} // namespace TEngine +} // namespace TEngine #endif \ No newline at end of file diff --git a/source/device/vulkan/layer/eltwise_vulkan.cpp b/source/device/vulkan/layer/eltwise_vulkan.cpp index 9fc322bc9..a8d112bf4 100644 --- a/source/device/vulkan/layer/eltwise_vulkan.cpp +++ b/source/device/vulkan/layer/eltwise_vulkan.cpp @@ -70,22 +70,22 @@ Eltwise_vulkan::Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) graph = ir_graph; node = ir_node; - for(int i = 0; i < ir_node->input_num; i++) + for (int i = 0; i < ir_node->input_num; i++) { - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[i]); std::string name = input->name; bottoms.push_back(name); } - for(int i = 0; i < ir_node->output_num; i++) + for (int i = 0; i < ir_node->output_num; i++) { - struct tensor *output = get_ir_graph_tensor(graph, node->input_tensors[i]); + struct tensor* output = get_ir_graph_tensor(graph, node->input_tensors[i]); std::string name = output->name; tops.push_back(name); } - struct eltwise_param *param = (struct eltwise_param *)ir_node->op.param_mem; - op_type = (param -> type) / 2; + struct eltwise_param* param = (struct eltwise_param*)ir_node->op.param_mem; + op_type = (param->type) / 2; } int Eltwise_vulkan::create_pipeline(const Option& opt) @@ -93,9 +93,12 @@ int Eltwise_vulkan::create_pipeline(const Option& opt) const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; int elempack = 1; - if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; - if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; - if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 + : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 + : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 + : 1; size_t elemsize; if (opt.use_fp16_storage) @@ -118,12 +121,12 @@ int Eltwise_vulkan::create_pipeline(const Option& opt) std::vector specializations(2 + 5); specializations[0].i = op_type; - specializations[1].i = 0; // coeffs.w == 0 ? 0 : 1; TODO fix coeffs value - specializations[2 + 0].i = 0; // shape_packed.dims; - specializations[2 + 1].i = 0; // shape_packed.w; - specializations[2 + 2].i = 0; // shape_packed.h; - specializations[2 + 3].i = 0; // shape_packed.c; - specializations[2 + 4].i = 0; // shape_packed.cstep; + specializations[1].i = 0; // coeffs.w == 0 ? 0 : 1; TODO fix coeffs value + specializations[2 + 0].i = 0; // shape_packed.dims; + specializations[2 + 1].i = 0; // shape_packed.w; + specializations[2 + 2].i = 0; // shape_packed.h; + specializations[2 + 3].i = 0; // shape_packed.c; + specializations[2 + 4].i = 0; // shape_packed.cstep; Tensor local_size_xyz; if (shape_packed.dims == 1) @@ -228,12 +231,12 @@ int Eltwise_vulkan::record_pipeline(const std::vector& bottom_blobs, s constants[2].i = top_blob.h; constants[3].i = top_blob.c; constants[4].i = top_blob.cstep; - constants[5].f = 1.0f; // coeffs.w == 0 ? 1.f : coeffs[0]; TODO fix coeffs value - constants[6].f = 1.0f; // coeffs.w == 0 ? 1.f : coeffs[1]; + constants[5].f = 1.0f; // coeffs.w == 0 ? 1.f : coeffs[0]; TODO fix coeffs value + constants[6].f = 1.0f; // coeffs.w == 0 ? 1.f : coeffs[1]; - const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[1] + const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[1] : elempack == 4 ? pipeline_eltwise_pack4[1] - : pipeline_eltwise[1]; + : pipeline_eltwise[1]; cmd.record_pipeline(pipeline, bindings, constants, top_blob); @@ -251,11 +254,11 @@ int Eltwise_vulkan::record_pipeline(const std::vector& bottom_blobs, s constants[3].i = top_blob.c; constants[4].i = top_blob.cstep; constants[5].f = 1.f; - constants[6].f = 1.0f; // coeffs.w == 0 ? 1 : coeffs[b]; TODO fixcoeffs value + constants[6].f = 1.0f; // coeffs.w == 0 ? 1 : coeffs[b]; TODO fixcoeffs value - const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[b % 2] + const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[b % 2] : elempack == 4 ? pipeline_eltwise_pack4[b % 2] - : pipeline_eltwise[b % 2]; + : pipeline_eltwise[b % 2]; cmd.record_pipeline(pipeline, bindings, constants, top_blob); } @@ -263,4 +266,4 @@ int Eltwise_vulkan::record_pipeline(const std::vector& bottom_blobs, s return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/eltwise_vulkan.hpp b/source/device/vulkan/layer/eltwise_vulkan.hpp index 5830b076d..5830aea6a 100644 --- a/source/device/vulkan/layer/eltwise_vulkan.hpp +++ b/source/device/vulkan/layer/eltwise_vulkan.hpp @@ -45,7 +45,7 @@ #include "eltwise_param.h" -namespace TEngine{ +namespace TEngine { class Eltwise_vulkan : public Layer { @@ -55,7 +55,7 @@ class Eltwise_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); - + virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; public: @@ -84,7 +84,7 @@ class Eltwise_vulkan : public Layer ELT_SQUARE, ELT_POW }; - int op_type; // Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 + int op_type; // Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 int input_c; int input_h; @@ -94,6 +94,6 @@ class Eltwise_vulkan : public Layer int output_w; }; -} // namespace TEngine +} // namespace TEngine #endif \ No newline at end of file diff --git a/source/device/vulkan/layer/flatten_vulkan.cpp b/source/device/vulkan/layer/flatten_vulkan.cpp index 589b7d5d4..798402f2c 100644 --- a/source/device/vulkan/layer/flatten_vulkan.cpp +++ b/source/device/vulkan/layer/flatten_vulkan.cpp @@ -70,22 +70,22 @@ Flatten_vulkan::Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) graph = ir_graph; node = ir_node; - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); std::string name = input->name; bottoms.push_back(name); - struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); name = output->name; tops.push_back(name); // params - input_c = input->dims[1]; // param->input_channel; + input_c = input->dims[1]; // param->input_channel; input_h = input->dims[2]; input_w = input->dims[3]; - output_c = output->dims[1]; // param->output_channel; + output_c = output->dims[1]; // param->output_channel; output_h = output->dims[2]; output_w = output->dims[3]; - output_size = output->dims[3]*output->dims[2]*output->dims[1]; + output_size = output->dims[3] * output->dims[2] * output->dims[1]; } int Flatten_vulkan::create_pipeline(const Option& _opt) @@ -95,14 +95,17 @@ int Flatten_vulkan::create_pipeline(const Option& _opt) // const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; const Tensor& out_shape = Tensor(output_size, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; - int elempack = 1; - if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; - if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; - if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 + : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 + : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 + : 1; int out_elempack = 1; - if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 + : 1; size_t elemsize; size_t out_elemsize; @@ -137,16 +140,16 @@ int Flatten_vulkan::create_pipeline(const Option& _opt) } std::vector specializations(0 + 10); - specializations[0 + 0].i = 0; // shape_packed.dims; - specializations[0 + 1].i = 0; // shape_packed.w; - specializations[0 + 2].i = 0; // shape_packed.h; - specializations[0 + 3].i = 0; // shape_packed.c; - specializations[0 + 4].i = 0; // shape_packed.cstep; - specializations[0 + 5].i = 0; // out_shape_packed.dims; - specializations[0 + 6].i = 0; // out_shape_packed.w; - specializations[0 + 7].i = 0; // out_shape_packed.h; - specializations[0 + 8].i = 0; // out_shape_packed.c; - specializations[0 + 9].i = 0; // out_shape_packed.cstep; + specializations[0 + 0].i = 0; // shape_packed.dims; + specializations[0 + 1].i = 0; // shape_packed.w; + specializations[0 + 2].i = 0; // shape_packed.h; + specializations[0 + 3].i = 0; // shape_packed.c; + specializations[0 + 4].i = 0; // shape_packed.cstep; + specializations[0 + 5].i = 0; // out_shape_packed.dims; + specializations[0 + 6].i = 0; // out_shape_packed.w; + specializations[0 + 7].i = 0; // out_shape_packed.h; + specializations[0 + 8].i = 0; // out_shape_packed.c; + specializations[0 + 9].i = 0; // out_shape_packed.cstep; Tensor local_size_xyz(64, 1, 1, (void*)0); if (out_shape_packed.dims != 0) @@ -207,8 +210,6 @@ int Flatten_vulkan::create_pipeline(const Option& _opt) return 0; } - - int Flatten_vulkan::destroy_pipeline(const Option& /*opt*/) { delete pipeline_flatten; @@ -250,7 +251,8 @@ int Flatten_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b int total = w * h * channels * elempack; - int out_elempack = opt.use_shader_pack8 && total % 8 == 0 ? 8 : total % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && total % 8 == 0 ? 8 : total % 4 == 0 ? 4 + : 1; size_t out_elemsize = elemsize / elempack * out_elempack; if (opt.use_fp16_packed && !opt.use_fp16_storage) @@ -323,4 +325,4 @@ int Flatten_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/flatten_vulkan.hpp b/source/device/vulkan/layer/flatten_vulkan.hpp index 91de06f9f..cd364ddf2 100644 --- a/source/device/vulkan/layer/flatten_vulkan.hpp +++ b/source/device/vulkan/layer/flatten_vulkan.hpp @@ -45,7 +45,7 @@ #include "flatten_param.h" -namespace TEngine{ +namespace TEngine { class Flatten_vulkan : public Layer { @@ -55,7 +55,7 @@ class Flatten_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); - + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; public: @@ -74,9 +74,8 @@ class Flatten_vulkan : public Layer int output_h; int output_w; int output_size; - }; -} // namespace TEngine +} // namespace TEngine #endif \ No newline at end of file diff --git a/source/device/vulkan/layer/innerproduct_vulkan.cpp b/source/device/vulkan/layer/innerproduct_vulkan.cpp index c4ba14e99..8e1d66b8a 100644 --- a/source/device/vulkan/layer/innerproduct_vulkan.cpp +++ b/source/device/vulkan/layer/innerproduct_vulkan.cpp @@ -80,35 +80,34 @@ InnerProduct_vulkan::InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_nod graph = ir_graph; node = ir_node; - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); std::string name = input->name; bottoms.push_back(name); - struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); name = output->name; tops.push_back(name); - struct fc_param *param = (struct fc_param *)ir_node->op.param_mem; + struct fc_param* param = (struct fc_param*)ir_node->op.param_mem; num_output = param->num_output; - input_c = input->dims[1]; // param->input_channel; + input_c = input->dims[1]; // param->input_channel; input_h = input->dims[2]; input_w = input->dims[3]; - output_c = output->dims[1]; // param->output_channel; + output_c = output->dims[1]; // param->output_channel; output_h = output->dims[2]; output_w = output->dims[3]; - struct tensor *weight = get_ir_graph_tensor(graph, node->input_tensors[1]); + struct tensor* weight = get_ir_graph_tensor(graph, node->input_tensors[1]); weight_data_size = weight->elem_num; activation_type = -1; - } int InnerProduct_vulkan::create_pipeline(const Option& _opt) { Option opt = _opt; - const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; Tensor shape_flatten; @@ -119,8 +118,10 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt) int num_input = weight_data_size / num_output; - int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; - int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 + : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 + : 1; size_t elemsize; size_t out_elemsize; @@ -161,27 +162,26 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt) flatten->output_w = shape_flatten.w; flatten->output_h = shape_flatten.h; flatten->output_c = shape_flatten.c; - flatten->output_size = shape_flatten.w*shape_flatten.h*shape_flatten.c; + flatten->output_size = shape_flatten.w * shape_flatten.h * shape_flatten.c; flatten->create_pipeline(opt); } - std::vector specializations(4 + 10); specializations[0].i = bias_term; specializations[1].i = activation_type; - specializations[2].f = 0.f; // activation_params.w >= 1 ? activation_params[0] : 0.f; - specializations[3].f = 0.f; // activation_params.w == 2 ? activation_params[1] : 0.f; - specializations[4 + 0].i = 0; // shape_flatten_packed.dims; - specializations[4 + 1].i = 0; // shape_flatten_packed.w; - specializations[4 + 2].i = 0; // shape_flatten_packed.h; - specializations[4 + 3].i = 0; // shape_flatten_packed.c; - specializations[4 + 4].i = 0; // shape_flatten_packed.cstep; - specializations[4 + 5].i = 0; // out_shape_packed.dims; - specializations[4 + 6].i = 0; // out_shape_packed.w; - specializations[4 + 7].i = 0; // out_shape_packed.h; - specializations[4 + 8].i = 0; // out_shape_packed.c; - specializations[4 + 9].i = 0; // out_shape_packed.cstep; + specializations[2].f = 0.f; // activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[3].f = 0.f; // activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[4 + 0].i = 0; // shape_flatten_packed.dims; + specializations[4 + 1].i = 0; // shape_flatten_packed.w; + specializations[4 + 2].i = 0; // shape_flatten_packed.h; + specializations[4 + 3].i = 0; // shape_flatten_packed.c; + specializations[4 + 4].i = 0; // shape_flatten_packed.cstep; + specializations[4 + 5].i = 0; // out_shape_packed.dims; + specializations[4 + 6].i = 0; // out_shape_packed.w; + specializations[4 + 7].i = 0; // out_shape_packed.h; + specializations[4 + 8].i = 0; // out_shape_packed.c; + specializations[4 + 9].i = 0; // out_shape_packed.cstep; Tensor local_size_xyz(std::min(64, num_output / out_elempack), 1, 1, (void*)0); if (out_shape_packed.dims != 0) @@ -309,8 +309,10 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt) { int num_input = weight_data_size / num_output; - int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; - int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 + : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 + : 1; // src = inch-outch // dst = pa-pb-inch/pa-outch/pb @@ -386,7 +388,8 @@ int InnerProduct_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& size_t elemsize = bottom_blob_flattened.elemsize; int elempack = bottom_blob_flattened.elempack; - int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 + : 1; size_t out_elemsize = elemsize / elempack * out_elempack; if (opt.use_fp16_packed && !opt.use_fp16_storage) @@ -461,4 +464,4 @@ int InnerProduct_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/innerproduct_vulkan.hpp b/source/device/vulkan/layer/innerproduct_vulkan.hpp index c682bcb46..c66c36947 100644 --- a/source/device/vulkan/layer/innerproduct_vulkan.hpp +++ b/source/device/vulkan/layer/innerproduct_vulkan.hpp @@ -58,7 +58,7 @@ class InnerProduct_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); virtual int upload_model(VkTransfer& cmd, const Option& opt); - + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; public: @@ -98,6 +98,6 @@ class InnerProduct_vulkan : public Layer int output_w; }; -} // namespace TEngine +} // namespace TEngine -#endif // LAYER_INNERPRODUCT_VULKAN_H \ No newline at end of file +#endif // LAYER_INNERPRODUCT_VULKAN_H \ No newline at end of file diff --git a/source/device/vulkan/layer/interp_vulkan.cpp b/source/device/vulkan/layer/interp_vulkan.cpp index 586846b72..81c8ae748 100644 --- a/source/device/vulkan/layer/interp_vulkan.cpp +++ b/source/device/vulkan/layer/interp_vulkan.cpp @@ -76,23 +76,23 @@ Interp_vulkan::Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) graph = ir_graph; node = ir_node; - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); std::string name = input->name; bottoms.push_back(name); - struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); name = output->name; tops.push_back(name); // params - input_c = input->dims[1]; // param->input_channel; + input_c = input->dims[1]; // param->input_channel; input_h = input->dims[2]; input_w = input->dims[3]; - output_c = output->dims[1]; // param->output_channel; + output_c = output->dims[1]; // param->output_channel; output_h = output->dims[2]; output_w = output->dims[3]; - struct interp_param *param = (struct interp_param *)ir_node->op.param_mem; + struct interp_param* param = (struct interp_param*)ir_node->op.param_mem; if (param->height_scale != 0 && param->width_scale != 0) { @@ -101,27 +101,33 @@ Interp_vulkan::Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) } else { - height_scale = (float )output->dims[2] / (float )input_h; - width_scale = (float )output->dims[2] / (float )input_w; + height_scale = (float)output->dims[2] / (float)input_h; + width_scale = (float)output->dims[2] / (float)input_w; } - resize_type = 2;//param->resize_type; + resize_type = 2; //param->resize_type; } int Interp_vulkan::create_pipeline(const Option& _opt) { Option opt = _opt; - const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Mat() : bottom_shapes[0]; const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; int elempack = 1; - if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; - if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; - if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 + : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 + : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 + : 1; int out_elempack = 1; - if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; - if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; - if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 + : 1; + if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 + : 1; + if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 + : 1; size_t elemsize; size_t out_elemsize; @@ -162,16 +168,16 @@ int Interp_vulkan::create_pipeline(const Option& _opt) { std::vector specializations(1 + 10); specializations[0].i = resize_type; - specializations[1 + 0].i = 0; // shape_packed.dims; - specializations[1 + 1].i = 0; // shape_packed.w; - specializations[1 + 2].i = 0; // shape_packed.h; - specializations[1 + 3].i = 0; // shape_packed.c; - specializations[1 + 4].i = 0; // shape_packed.cstep; - specializations[1 + 5].i = 0; // out_shape_packed.dims; - specializations[1 + 6].i = 0; // out_shape_packed.w; - specializations[1 + 7].i = 0; // out_shape_packed.h; - specializations[1 + 8].i = 0; // out_shape_packed.c; - specializations[1 + 9].i = 0; // out_shape_packed.cstep; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + specializations[1 + 5].i = 0; // out_shape_packed.dims; + specializations[1 + 6].i = 0; // out_shape_packed.w; + specializations[1 + 7].i = 0; // out_shape_packed.h; + specializations[1 + 8].i = 0; // out_shape_packed.c; + specializations[1 + 9].i = 0; // out_shape_packed.cstep; Tensor local_size_xyz; if (out_shape_packed.dims == 2) @@ -250,16 +256,16 @@ int Interp_vulkan::create_pipeline(const Option& _opt) } std::vector specializations(0 + 10); - specializations[0 + 0].i = 0; // shape_packed.dims; - specializations[0 + 1].i = 0; // shape_packed.w; - specializations[0 + 2].i = 0; // shape_packed.h; - specializations[0 + 3].i = 0; // shape_packed.c; - specializations[0 + 4].i = 0; // shape_packed.cstep; - specializations[0 + 5].i = 0; // out_shape_packed.dims; - specializations[0 + 6].i = 0; // out_shape_packed.w; - specializations[0 + 7].i = 0; // out_shape_packed.h; - specializations[0 + 8].i = 0; // out_shape_packed.c; - specializations[0 + 9].i = 0; // out_shape_packed.cstep; + specializations[0 + 0].i = 0; // shape_packed.dims; + specializations[0 + 1].i = 0; // shape_packed.w; + specializations[0 + 2].i = 0; // shape_packed.h; + specializations[0 + 3].i = 0; // shape_packed.c; + specializations[0 + 4].i = 0; // shape_packed.cstep; + specializations[0 + 5].i = 0; // out_shape_packed.dims; + specializations[0 + 6].i = 0; // out_shape_packed.w; + specializations[0 + 7].i = 0; // out_shape_packed.h; + specializations[0 + 8].i = 0; // out_shape_packed.c; + specializations[0 + 9].i = 0; // out_shape_packed.cstep; Tensor local_size_xyz; if (out_shape_packed.dims == 2) @@ -378,9 +384,9 @@ int Interp_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_bl constants[10].f = w / (float)outw; constants[11].f = h / (float)outh; - const Pipeline* pipeline = elempack == 8 ? pipeline_interp_pack8 + const Pipeline* pipeline = elempack == 8 ? pipeline_interp_pack8 : elempack == 4 ? pipeline_interp_pack4 - : pipeline_interp; + : pipeline_interp; cmd.record_pipeline(pipeline, bindings, constants, top_blob); } @@ -451,9 +457,9 @@ int Interp_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_bl constants[8].i = top_blob.c; constants[9].i = top_blob.cstep; - const Pipeline* pipeline = elempack == 8 ? pipeline_interp_bicubic_pack8 + const Pipeline* pipeline = elempack == 8 ? pipeline_interp_bicubic_pack8 : elempack == 4 ? pipeline_interp_bicubic_pack4 - : pipeline_interp_bicubic; + : pipeline_interp_bicubic; cmd.record_pipeline(pipeline, bindings, constants, top_blob); } @@ -461,4 +467,4 @@ int Interp_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_bl return 0; } -} // TEngine \ No newline at end of file +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/interp_vulkan.hpp b/source/device/vulkan/layer/interp_vulkan.hpp index ef3886f45..98574f499 100644 --- a/source/device/vulkan/layer/interp_vulkan.hpp +++ b/source/device/vulkan/layer/interp_vulkan.hpp @@ -45,7 +45,7 @@ #include "interp_param.h" -namespace TEngine{ +namespace TEngine { class Interp_vulkan : public Layer { @@ -56,7 +56,7 @@ class Interp_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); // virtual int upload_model(VkTransfer& cmd, const Option& opt); - + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; public: @@ -78,15 +78,13 @@ class Interp_vulkan : public Layer int output_h; int output_w; - int resize_type; //1=nearest 2=bilinear 3=bicubic + int resize_type; //1=nearest 2=bilinear 3=bicubic int output_height; int output_width; float height_scale; float width_scale; - - }; -} // namespace TEngine +} // namespace TEngine #endif \ No newline at end of file diff --git a/source/device/vulkan/layer/packing_vulkan.cpp b/source/device/vulkan/layer/packing_vulkan.cpp index 86a6c9538..88a6de812 100644 --- a/source/device/vulkan/layer/packing_vulkan.cpp +++ b/source/device/vulkan/layer/packing_vulkan.cpp @@ -60,8 +60,6 @@ Packing_vulkan::Packing_vulkan() int Packing_vulkan::create_pipeline(const Option& _opt) { - - Option opt = _opt; // const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; // const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; @@ -90,7 +88,6 @@ int Packing_vulkan::create_pipeline(const Option& _opt) // if (out_shape.dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); // if (out_shape.dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); // if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); - // check blob shape // if (!vkdev->shape_support_image_storage(out_shape_packed)) @@ -102,7 +99,7 @@ int Packing_vulkan::create_pipeline(const Option& _opt) std::vector specializations(2 + 10); specializations[0].i = storage_type_from; specializations[1].i = storage_type_to; - specializations[2 + 0].i = 0;// FIXME shape elempack may be dynamic + specializations[2 + 0].i = 0; // FIXME shape elempack may be dynamic specializations[2 + 1].i = 0; specializations[2 + 2].i = 0; specializations[2 + 3].i = 0; @@ -112,11 +109,10 @@ int Packing_vulkan::create_pipeline(const Option& _opt) specializations[2 + 7].i = 0; //out_shape_packed_h; specializations[2 + 8].i = 0; //out_shape_packed_c; specializations[2 + 9].i = 0; //out_shape_packed_cstep; - // printf("out shape dims:%d ---------------------------------\n", out_shape_packed_dims); - VkTensor local_size_xyz;// TODO more precise group size guessed from out_shape_packed + VkTensor local_size_xyz; // TODO more precise group size guessed from out_shape_packed if (out_shape_packed_dims == 1) { local_size_xyz.w = 64; @@ -487,7 +483,6 @@ int Packing_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b cmd.record_pipeline(pipeline_packing_pack8to1, buffer_bindings, image_bindings, constants, bottom_blob); } - // printf("run packing vulkan record pipeline\n"); return 0; } diff --git a/source/device/vulkan/layer/packing_vulkan.hpp b/source/device/vulkan/layer/packing_vulkan.hpp index 10b748020..f528edf11 100644 --- a/source/device/vulkan/layer/packing_vulkan.hpp +++ b/source/device/vulkan/layer/packing_vulkan.hpp @@ -52,7 +52,7 @@ class Packing_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); - + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; public: @@ -92,5 +92,4 @@ class Packing_vulkan : public Layer } // namespace TEngine - #endif diff --git a/source/device/vulkan/layer/padding_vulkan.cpp b/source/device/vulkan/layer/padding_vulkan.cpp index 756fb05c9..27fa57853 100644 --- a/source/device/vulkan/layer/padding_vulkan.cpp +++ b/source/device/vulkan/layer/padding_vulkan.cpp @@ -50,32 +50,31 @@ Padding_vulkan::Padding_vulkan() pipeline_padding_pack8 = 0; } - - int Padding_vulkan::create_pipeline(const Option& opt) { int elempack = 1; - elempack = opt.use_shader_pack8 && input_c % 8 == 0 ? 8 : input_c % 4 == 0 ? 4 : 1; + elempack = opt.use_shader_pack8 && input_c % 8 == 0 ? 8 : input_c % 4 == 0 ? 4 + : 1; int out_elempack; - out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4 : 1; + out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4 + : 1; // printf("create padding pipeline elempack:%d %d \n", elempack, out_elempack); - std::vector specializations(3 + 10); specializations[0].i = type; specializations[1].f = value; - specializations[2].i = 0; // per_channel_pad_data_size ? 1 : 0; - specializations[3 + 0].i = 3; // shape_packed.dims; - specializations[3 + 1].i = input_w; // shape_packed.w; - specializations[3 + 2].i = input_h; // shape_packed.h; - specializations[3 + 3].i = input_c; // shape_packed.c; + specializations[2].i = 0; // per_channel_pad_data_size ? 1 : 0; + specializations[3 + 0].i = 3; // shape_packed.dims; + specializations[3 + 1].i = input_w; // shape_packed.w; + specializations[3 + 2].i = input_h; // shape_packed.h; + specializations[3 + 3].i = input_c; // shape_packed.c; specializations[3 + 4].i = input_w * input_h; // shape_packed.cstep; - specializations[3 + 5].i = 3; // out_shape_packed.dims; - specializations[3 + 6].i = output_w; // out_shape_packed.w; - specializations[3 + 7].i = output_h; // out_shape_packed.h; - specializations[3 + 8].i = output_c; // out_shape_packed.c; - specializations[3 + 9].i = output_w * output_h; // out_shape_packed.cstep; + specializations[3 + 5].i = 3; // out_shape_packed.dims; + specializations[3 + 6].i = output_w; // out_shape_packed.w; + specializations[3 + 7].i = output_h; // out_shape_packed.h; + specializations[3 + 8].i = output_c; // out_shape_packed.c; + specializations[3 + 9].i = output_w * output_h; // out_shape_packed.cstep; VkTensor local_size_xyz; // if (out_shape_packed.dims != 0) @@ -87,7 +86,7 @@ int Padding_vulkan::create_pipeline(const Option& opt) // pack1 // if (shape.dims == 0 || elempack == 1) - if(elempack == 1) + if (elempack == 1) { pipeline_padding = new Pipeline(vkdev); pipeline_padding->set_optimal_local_size_xyz(local_size_xyz); @@ -96,7 +95,7 @@ int Padding_vulkan::create_pipeline(const Option& opt) // pack4 // if (shape.dims == 0 || elempack == 4) - if(elempack == 4) + if (elempack == 4) { pipeline_padding_pack4 = new Pipeline(vkdev); pipeline_padding_pack4->set_optimal_local_size_xyz(local_size_xyz); @@ -111,7 +110,7 @@ int Padding_vulkan::create_pipeline(const Option& opt) pipeline_padding_pack8->set_optimal_local_size_xyz(local_size_xyz); pipeline_padding_pack8->create(LayerShaderType::padding_pack8, opt, specializations); } - + return 0; } @@ -120,7 +119,6 @@ int Padding_vulkan::destroy_pipeline(const Option& /*opt*/) return 0; } - int Padding_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const { if (top == 0 && bottom == 0 && left == 0 && right == 0) @@ -160,11 +158,11 @@ int Padding_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b constants[9].i = top_blob.cstep; constants[10].i = left; constants[11].i = top; - + // printf("padding shape:%d %d %d %d %d %d %d %d %d\n", top_blob.c, top_blob.h, top_blob.w, top_blob.cstep, bottom_blob.c, bottom_blob.h, bottom_blob.w, bottom_blob.cstep, elempack); - const Pipeline* pipeline = elempack == 8 ? pipeline_padding_pack8 - : elempack == 4 ? pipeline_padding_pack4 - : pipeline_padding; + const Pipeline* pipeline = elempack == 8 ? pipeline_padding_pack8 + : elempack == 4 ? pipeline_padding_pack4 + : pipeline_padding; cmd.record_pipeline(pipeline, bindings, constants, top_blob); diff --git a/source/device/vulkan/layer/padding_vulkan.hpp b/source/device/vulkan/layer/padding_vulkan.hpp index f6aabe066..03bbce43d 100644 --- a/source/device/vulkan/layer/padding_vulkan.hpp +++ b/source/device/vulkan/layer/padding_vulkan.hpp @@ -52,7 +52,7 @@ class Padding_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); - + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; public: @@ -60,7 +60,7 @@ class Padding_vulkan : public Layer int bottom; int left; int right; - int type;// 0=CONSTANT 1=REPLICATE 2=REFLECT + int type; // 0=CONSTANT 1=REPLICATE 2=REFLECT float value; int input_w; int input_h; @@ -77,5 +77,4 @@ class Padding_vulkan : public Layer } // namespace TEngine - #endif diff --git a/source/device/vulkan/layer/permute_vulkan.cpp b/source/device/vulkan/layer/permute_vulkan.cpp index 461b3cc25..0bead6791 100644 --- a/source/device/vulkan/layer/permute_vulkan.cpp +++ b/source/device/vulkan/layer/permute_vulkan.cpp @@ -76,27 +76,27 @@ Permute_vulkan::Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) graph = ir_graph; node = ir_node; - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); std::string name = input->name; bottoms.push_back(name); - struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); name = output->name; tops.push_back(name); // params - input_c = input->dims[1]; // param->input_channel; + input_c = input->dims[1]; // param->input_channel; input_h = input->dims[2]; input_w = input->dims[3]; - output_c = output->dims[1]; // param->output_channel; + output_c = output->dims[1]; // param->output_channel; output_h = output->dims[2]; output_w = output->dims[3]; // TODO fix order_type value - struct permute_param *param = (struct permute_param *)ir_node->op.param_mem; + struct permute_param* param = (struct permute_param*)ir_node->op.param_mem; if ((param->order0 == 0) && (param->order1 == 2) && (param->order2 == 3) && (param->order3 == 1)) { - order_type = 3; + order_type = 3; } else if ((param->order0 == 1) && (param->order1 == 0) && (param->order2 == 2) && input->dim_num == 3) { @@ -106,24 +106,29 @@ Permute_vulkan::Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) { order_type = 0; } - } int Permute_vulkan::create_pipeline(const Option& _opt) { Option opt = _opt; - const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; int elempack = 1; - if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; - if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; - if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 + : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 + : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 + : 1; int out_elempack = 1; - if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; - if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; - if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 + : 1; + if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 + : 1; + if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 + : 1; size_t elemsize; size_t out_elemsize; @@ -162,16 +167,16 @@ int Permute_vulkan::create_pipeline(const Option& _opt) std::vector specializations(1 + 10); specializations[0].i = order_type; - specializations[1 + 0].i = 0; // shape_packed.dims; - specializations[1 + 1].i = 0; // shape_packed.w; - specializations[1 + 2].i = 0; // shape_packed.h; - specializations[1 + 3].i = 0; // shape_packed.c; - specializations[1 + 4].i = 0; // shape_packed.cstep; - specializations[1 + 5].i = 0; // out_shape_packed.dims; - specializations[1 + 6].i = 0; // out_shape_packed.w; - specializations[1 + 7].i = 0; // out_shape_packed.h; - specializations[1 + 8].i = 0; // out_shape_packed.c; - specializations[1 + 9].i = 0; // out_shape_packed.cstep; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + specializations[1 + 5].i = 0; // out_shape_packed.dims; + specializations[1 + 6].i = 0; // out_shape_packed.w; + specializations[1 + 7].i = 0; // out_shape_packed.h; + specializations[1 + 8].i = 0; // out_shape_packed.c; + specializations[1 + 9].i = 0; // out_shape_packed.cstep; Tensor local_size_xyz_bottom; // pack4to1 and pack8to1 if (shape_packed.dims == 2) @@ -342,7 +347,8 @@ int Permute_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b outh = w; } - out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1; + out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 + : 1; out_elemsize = elemsize / elempack * out_elempack; if (opt.use_fp16_packed && !opt.use_fp16_storage) @@ -401,7 +407,8 @@ int Permute_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b outc = w; } - out_elempack = opt.use_shader_pack8 && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1; + out_elempack = opt.use_shader_pack8 && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 + : 1; out_elemsize = elemsize / elempack * out_elempack; if (opt.use_fp16_packed && !opt.use_fp16_storage) @@ -472,4 +479,4 @@ int Permute_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/permute_vulkan.hpp b/source/device/vulkan/layer/permute_vulkan.hpp index 5ea17c635..2a6763c13 100644 --- a/source/device/vulkan/layer/permute_vulkan.hpp +++ b/source/device/vulkan/layer/permute_vulkan.hpp @@ -45,7 +45,7 @@ #include "permute_param.h" -namespace TEngine{ +namespace TEngine { class Permute_vulkan : public Layer { @@ -55,7 +55,7 @@ class Permute_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); - + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; public: @@ -79,6 +79,6 @@ class Permute_vulkan : public Layer int order_type; }; -} // namespace TEngine +} // namespace TEngine #endif \ No newline at end of file diff --git a/source/device/vulkan/layer/pooling_vulkan.cpp b/source/device/vulkan/layer/pooling_vulkan.cpp index eb50b1704..8f4234367 100644 --- a/source/device/vulkan/layer/pooling_vulkan.cpp +++ b/source/device/vulkan/layer/pooling_vulkan.cpp @@ -51,7 +51,6 @@ Pooling_vulkan::Pooling_vulkan() pipeline_pooling_global = 0; pipeline_pooling_global_pack4 = 0; pipeline_pooling_global_pack8 = 0; - } Pooling_vulkan::Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) @@ -67,28 +66,28 @@ Pooling_vulkan::Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) graph = ir_graph; node = ir_node; - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); std::string name = input->name; bottoms.push_back(name); // Tensor* output_tensor = t_node->GetOutputTensor(0); - struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); name = output->name; tops.push_back(name); - struct pool_param *param_ = (struct pool_param *)ir_node->op.param_mem; + struct pool_param* param_ = (struct pool_param*)ir_node->op.param_mem; - pooling_type = param_->pool_method; // 0:max 1:avg + pooling_type = param_->pool_method; // 0:max 1:avg kernel_h = param_->kernel_h; kernel_w = param_->kernel_w; stride_h = param_->stride_h; stride_w = param_->stride_w; global = param_->global; caffe_flavor = param_->caffe_flavor; - pad_h0 = param_->pad_h0; - pad_w0 = param_->pad_w0; - pad_h1 = param_->pad_h1; - pad_w1 = param_->pad_w1; + pad_h0 = param_->pad_h0; + pad_w0 = param_->pad_w0; + pad_h1 = param_->pad_h1; + pad_w1 = param_->pad_w1; input_c = input->dims[1]; input_h = input->dims[2]; input_w = input->dims[3]; @@ -98,11 +97,12 @@ Pooling_vulkan::Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) // printf("create pooling layer with param:%d %d %d %d %d %d %d %d %d %d\n", kernel_h, kernel_w, stride_h, stride_w, global, pad_h0, pad_h1, pad_w0, pad_w1, param_->alg); } - int Pooling_vulkan::create_pipeline(const Option& opt) { - int elempack = opt.use_shader_pack8 && input_c % 8 == 0 ? 8 : input_c % 4 == 0 ? 4 : 1; - int out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4 : 1; + int elempack = opt.use_shader_pack8 && input_c % 8 == 0 ? 8 : input_c % 4 == 0 ? 4 + : 1; + int out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4 + : 1; size_t elemsize; size_t out_elemsize; @@ -121,7 +121,7 @@ int Pooling_vulkan::create_pipeline(const Option& opt) elemsize = elempack * 4u; out_elemsize = out_elempack * 4u; } - + { padding = new Padding_vulkan(); padding->vkdev = vkdev; @@ -143,7 +143,7 @@ int Pooling_vulkan::create_pipeline(const Option& opt) padding->create_pipeline(opt); } - if(global) + if (global) { std::vector specializations(1 + 10); specializations[0].i = pooling_type; @@ -203,18 +203,18 @@ int Pooling_vulkan::create_pipeline(const Option& opt) specializations[7].i = pad_h0; specializations[8].i = pad_h1; specializations[9].i = global; - specializations[10].i = 0; // pad_mode; - specializations[11].i = 0; // avgpool_count_include_pad; - specializations[12 + 0].i = 0; // 3; // shape_bordered_packed.dims; - specializations[12 + 1].i = 0; // input_w; // shape_bordered_packed.w; - specializations[12 + 2].i = 0; // input_h; // shape_bordered_packed.h; - specializations[12 + 3].i = 0; // input_c; // shape_bordered_packed.c; - specializations[12 + 4].i = 0; // input_w * input_h; // shape_bordered_packed.cstep; - specializations[12 + 5].i = 0; // 3; // out_shape_packed.dims; - specializations[12 + 6].i = 0; // output_w; // out_shape_packed.w; - specializations[12 + 7].i = 0; // output_h; // out_shape_packed.h; - specializations[12 + 8].i = 0; // output_c; // out_shape_packed.c; - specializations[12 + 9].i = 0; // output_h * output_c; // out_shape_packed.cstep; + specializations[10].i = 0; // pad_mode; + specializations[11].i = 0; // avgpool_count_include_pad; + specializations[12 + 0].i = 0; // 3; // shape_bordered_packed.dims; + specializations[12 + 1].i = 0; // input_w; // shape_bordered_packed.w; + specializations[12 + 2].i = 0; // input_h; // shape_bordered_packed.h; + specializations[12 + 3].i = 0; // input_c; // shape_bordered_packed.c; + specializations[12 + 4].i = 0; // input_w * input_h; // shape_bordered_packed.cstep; + specializations[12 + 5].i = 0; // 3; // out_shape_packed.dims; + specializations[12 + 6].i = 0; // output_w; // out_shape_packed.w; + specializations[12 + 7].i = 0; // output_h; // out_shape_packed.h; + specializations[12 + 8].i = 0; // output_c; // out_shape_packed.c; + specializations[12 + 9].i = 0; // output_h * output_c; // out_shape_packed.cstep; VkTensor local_size_xyz; local_size_xyz.w = std::min(4, output_w); @@ -262,10 +262,10 @@ int Pooling_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; - if(global) + if (global) { // printf("input shape: %d %d %d, out shape: %d %d %d\n", input_c, input_h, input_w, output_c, output_h, output_w); - top_blob.create(output_c/elempack, elemsize, elempack, opt.blob_vkallocator); + top_blob.create(output_c / elempack, elemsize, elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; // printf("top shape:%d %d %d\n", top_blob.c, top_blob.h, top_blob.w); @@ -285,9 +285,9 @@ int Pooling_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b constants[8].i = top_blob.c; constants[9].i = top_blob.cstep; - const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_global_pack8 + const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_global_pack8 : elempack == 4 ? pipeline_pooling_global_pack4 - : pipeline_pooling_global; + : pipeline_pooling_global; cmd.record_pipeline(pipeline, bindings, constants, top_blob); @@ -306,8 +306,7 @@ int Pooling_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b padding->record_pipeline(bottom_blob, bottom_blob_bordered, cmd, opt_pad); } - top_blob.create(output_w, output_h, output_c/elempack, elemsize, elempack, opt.blob_vkallocator); - + top_blob.create(output_w, output_h, output_c / elempack, elemsize, elempack, opt.blob_vkallocator); std::vector bindings(2); bindings[0] = bottom_blob_bordered; @@ -327,9 +326,9 @@ int Pooling_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b constants[10].i = 0; constants[11].i = 0; - const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_pack8 + const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_pack8 : elempack == 4 ? pipeline_pooling_pack4 - : pipeline_pooling; + : pipeline_pooling; cmd.record_pipeline(pipeline, bindings, constants, top_blob); return 0; diff --git a/source/device/vulkan/layer/pooling_vulkan.hpp b/source/device/vulkan/layer/pooling_vulkan.hpp index e4a823e9e..33be747b2 100644 --- a/source/device/vulkan/layer/pooling_vulkan.hpp +++ b/source/device/vulkan/layer/pooling_vulkan.hpp @@ -56,21 +56,21 @@ class Pooling_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); - + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; public: - int pooling_type; // // 0:max 1:avg - int kernel_h; // = param_->kernel_h; - int kernel_w; // = param_->kernel_w; - int stride_h; // = param_->stride_h; - int stride_w; // = param_->stride_w; - int global; // = param_->global; + int pooling_type; // // 0:max 1:avg + int kernel_h; // = param_->kernel_h; + int kernel_w; // = param_->kernel_w; + int stride_h; // = param_->stride_h; + int stride_w; // = param_->stride_w; + int global; // = param_->global; int caffe_flavor; // = param_->caffe_flavor; - int pad_h0; // = param_->pad_h0; - int pad_w0; // = param_->pad_w0; - int pad_h1; // = param_->pad_h1; - int pad_w1; // = param_->pad_w1; + int pad_h0; // = param_->pad_h0; + int pad_w0; // = param_->pad_w0; + int pad_h1; // = param_->pad_h1; + int pad_w1; // = param_->pad_w1; int input_c; int input_h; int input_w; @@ -91,5 +91,4 @@ class Pooling_vulkan : public Layer } // namespace TEngine - #endif diff --git a/source/device/vulkan/layer/priorbox_vulkan.cpp b/source/device/vulkan/layer/priorbox_vulkan.cpp index de81aec7a..23198f4e8 100644 --- a/source/device/vulkan/layer/priorbox_vulkan.cpp +++ b/source/device/vulkan/layer/priorbox_vulkan.cpp @@ -60,28 +60,28 @@ PriorBox_vulkan::PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) graph = ir_graph; node = ir_node; - for(int i = 0; i < ir_node->input_num; i++) + for (int i = 0; i < ir_node->input_num; i++) { - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[i]); std::string name = input->name; bottoms.push_back(name); } - for(int i = 0; i < ir_node->output_num; i++) + for (int i = 0; i < ir_node->output_num; i++) { - struct tensor *output = get_ir_graph_tensor(graph, node->input_tensors[i]); + struct tensor* output = get_ir_graph_tensor(graph, node->input_tensors[i]); std::string name = output->name; tops.push_back(name); } // params - struct tensor *featmap_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); - struct tensor *data_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); - struct tensor *output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); - input_c = data_tensor->dims[1]; // param->input_channel; + struct tensor* featmap_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor* data_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); + struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); + input_c = data_tensor->dims[1]; // param->input_channel; input_h = data_tensor->dims[2]; input_w = data_tensor->dims[3]; - output_c = output_tensor->dims[1]; // param->output_channel; + output_c = output_tensor->dims[1]; // param->output_channel; output_h = output_tensor->dims[2]; output_w = output_tensor->dims[3]; @@ -90,8 +90,8 @@ PriorBox_vulkan::PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) const int feat_height = featmap_tensor->dims[2]; const int feat_width = featmap_tensor->dims[3]; - struct priorbox_param *param = (struct priorbox_param *)ir_node->op.param_mem; - + struct priorbox_param* param = (struct priorbox_param*)ir_node->op.param_mem; + variances[0] = (param->variance)[0]; variances[1] = (param->variance)[1]; variances[2] = (param->variance)[2]; @@ -112,8 +112,8 @@ PriorBox_vulkan::PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) if (param->step_h == 0 || param->step_w == 0) { - step_width = ( float )(image_width) / feat_width; - step_height = ( float )(image_height) / feat_height; + step_width = (float)(image_width) / feat_width; + step_height = (float)(image_height) / feat_height; } else { @@ -137,9 +137,12 @@ int PriorBox_vulkan::create_pipeline(const Option& opt) const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; int elempack = 1; - if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; - if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; - if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 + : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 + : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 + : 1; size_t elemsize; if (opt.use_fp16_storage) @@ -182,8 +185,8 @@ int PriorBox_vulkan::create_pipeline(const Option& opt) specializations[8].i = num_max_size; specializations[9].i = num_aspect_ratio; specializations[10].i = num_prior; - specializations[11 + 0].i = 0;//shape_packed.w; - specializations[11 + 1].i = 0;//shape_packed.h; + specializations[11 + 0].i = 0; //shape_packed.w; + specializations[11 + 1].i = 0; //shape_packed.h; pipeline_priorbox = new Pipeline(vkdev); pipeline_priorbox->set_optimal_local_size_xyz(); @@ -348,4 +351,4 @@ int PriorBox_vulkan::record_pipeline(const std::vector& bottom_blobs, return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/priorbox_vulkan.hpp b/source/device/vulkan/layer/priorbox_vulkan.hpp index 69b8f8bb7..3ae12f99e 100644 --- a/source/device/vulkan/layer/priorbox_vulkan.hpp +++ b/source/device/vulkan/layer/priorbox_vulkan.hpp @@ -45,7 +45,7 @@ #include "priorbox_param.h" -namespace TEngine{ +namespace TEngine { class PriorBox_vulkan : public Layer { @@ -56,7 +56,7 @@ class PriorBox_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); virtual int upload_model(VkTransfer& cmd, const Option& opt); - + virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; public: @@ -91,6 +91,6 @@ class PriorBox_vulkan : public Layer VkTensor aspect_ratios_gpu; }; -} // namespace TEngine +} // namespace TEngine #endif \ No newline at end of file diff --git a/source/device/vulkan/layer/relu_vulkan.cpp b/source/device/vulkan/layer/relu_vulkan.cpp index f541806cf..510d4245b 100644 --- a/source/device/vulkan/layer/relu_vulkan.cpp +++ b/source/device/vulkan/layer/relu_vulkan.cpp @@ -64,23 +64,23 @@ ReLU_vulkan::ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) graph = ir_graph; node = ir_node; - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); std::string name = input->name; bottoms.push_back(name); - struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); name = output->name; tops.push_back(name); // params - input_c = input->dims[1]; // param->input_channel; + input_c = input->dims[1]; // param->input_channel; input_h = input->dims[2]; input_w = input->dims[3]; - output_c = output->dims[1]; // param->output_channel; + output_c = output->dims[1]; // param->output_channel; output_h = output->dims[2]; output_w = output->dims[3]; - struct relu_param *param = (struct relu_param *)ir_node->op.param_mem; + struct relu_param* param = (struct relu_param*)ir_node->op.param_mem; negative_slope = param->negative_slope; } @@ -89,9 +89,12 @@ int ReLU_vulkan::create_pipeline(const Option& opt) const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; int elempack = 1; - if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; - if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; - if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 + : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 + : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 + : 1; size_t elemsize; if (opt.use_fp16_storage) @@ -113,12 +116,12 @@ int ReLU_vulkan::create_pipeline(const Option& opt) if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); std::vector specializations(1 + 5); - specializations[0].f = negative_slope; // slope; - specializations[1 + 0].i = 0; // shape_packed.dims; - specializations[1 + 1].i = 0; // shape_packed.w; - specializations[1 + 2].i = 0; // shape_packed.h; - specializations[1 + 3].i = 0; // shape_packed.c; - specializations[1 + 4].i = 0; // shape_packed.cstep; + specializations[0].f = negative_slope; // slope; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; Tensor local_size_xyz; if (shape_packed.dims == 1) @@ -167,7 +170,6 @@ int ReLU_vulkan::create_pipeline(const Option& opt) return 0; } - int ReLU_vulkan::destroy_pipeline(const Option& /*opt*/) { delete pipeline_relu; @@ -196,9 +198,9 @@ int ReLU_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, cons constants[3].i = bottom_top_blob.c; constants[4].i = bottom_top_blob.cstep; - const Pipeline* pipeline = elempack == 8 ? pipeline_relu_pack8 + const Pipeline* pipeline = elempack == 8 ? pipeline_relu_pack8 : elempack == 4 ? pipeline_relu_pack4 - : pipeline_relu; + : pipeline_relu; cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); @@ -211,4 +213,4 @@ int ReLU_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob return 0; } -} \ No newline at end of file +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/relu_vulkan.hpp b/source/device/vulkan/layer/relu_vulkan.hpp index c928a756f..c707481c8 100644 --- a/source/device/vulkan/layer/relu_vulkan.hpp +++ b/source/device/vulkan/layer/relu_vulkan.hpp @@ -45,7 +45,7 @@ #include "relu_param.h" -namespace TEngine{ +namespace TEngine { class ReLU_vulkan : public Layer { @@ -55,7 +55,7 @@ class ReLU_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); - + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const; @@ -74,6 +74,6 @@ class ReLU_vulkan : public Layer float negative_slope; }; -} // namespace TEngine +} // namespace TEngine #endif \ No newline at end of file diff --git a/source/device/vulkan/layer/reshape_vulkan.cpp b/source/device/vulkan/layer/reshape_vulkan.cpp index 7e36dca8f..3f12e241f 100644 --- a/source/device/vulkan/layer/reshape_vulkan.cpp +++ b/source/device/vulkan/layer/reshape_vulkan.cpp @@ -86,59 +86,56 @@ Reshape_vulkan::Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) graph = ir_graph; node = ir_node; - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); std::string name = input->name; bottoms.push_back(name); - struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); name = output->name; tops.push_back(name); // params - input_c = input->dims[1]; // param->input_channel; + input_c = input->dims[1]; // param->input_channel; input_h = input->dims[2]; input_w = input->dims[3]; - struct reshape_param *param = (struct reshape_param *)ir_node->op.param_mem; + struct reshape_param* param = (struct reshape_param*)ir_node->op.param_mem; ndim = param->dim_size; permute = param->reverse; - // TODO fix + // TODO fix // c = param->re_shape[0]; // w = param->re_shape[1]; // h = param->re_shape[2]; - if(param->dim_size == 4) + if (param->dim_size == 4) { ndim = 3; - output_c = output->dims[1]; // param->output_channel; + output_c = output->dims[1]; // param->output_channel; output_h = output->dims[2]; output_w = output->dims[3]; - c = output->dims[1]; // param->output_channel; + c = output->dims[1]; // param->output_channel; h = output->dims[2]; w = output->dims[3]; } else { ndim = param->dim_size; - - output_c = output->dims[0]; // param->output_channel; + + output_c = output->dims[0]; // param->output_channel; output_h = output->dims[1]; output_w = output->dims[2]; - c = output_c; // param->output_channel; + c = output_c; // param->output_channel; h = output_h; w = output_w; } - - - } int Reshape_vulkan::create_pipeline(const Option& _opt) { Option opt = _opt; - const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; bool need_permute = permute == 1; @@ -161,14 +158,20 @@ int Reshape_vulkan::create_pipeline(const Option& _opt) } int elempack = 1; - if (shape_permuted.dims == 1) elempack = opt.use_shader_pack8 && shape_permuted.w % 8 == 0 ? 8 : shape_permuted.w % 4 == 0 ? 4 : 1; - if (shape_permuted.dims == 2) elempack = opt.use_shader_pack8 && shape_permuted.h % 8 == 0 ? 8 : shape_permuted.h % 4 == 0 ? 4 : 1; - if (shape_permuted.dims == 3) elempack = opt.use_shader_pack8 && shape_permuted.c % 8 == 0 ? 8 : shape_permuted.c % 4 == 0 ? 4 : 1; + if (shape_permuted.dims == 1) elempack = opt.use_shader_pack8 && shape_permuted.w % 8 == 0 ? 8 : shape_permuted.w % 4 == 0 ? 4 + : 1; + if (shape_permuted.dims == 2) elempack = opt.use_shader_pack8 && shape_permuted.h % 8 == 0 ? 8 : shape_permuted.h % 4 == 0 ? 4 + : 1; + if (shape_permuted.dims == 3) elempack = opt.use_shader_pack8 && shape_permuted.c % 8 == 0 ? 8 : shape_permuted.c % 4 == 0 ? 4 + : 1; int out_elempack = 1; - if (out_shape_permuted.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape_permuted.w % 8 == 0 ? 8 : out_shape_permuted.w % 4 == 0 ? 4 : 1; - if (out_shape_permuted.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape_permuted.h % 8 == 0 ? 8 : out_shape_permuted.h % 4 == 0 ? 4 : 1; - if (out_shape_permuted.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape_permuted.c % 8 == 0 ? 8 : out_shape_permuted.c % 4 == 0 ? 4 : 1; + if (out_shape_permuted.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape_permuted.w % 8 == 0 ? 8 : out_shape_permuted.w % 4 == 0 ? 4 + : 1; + if (out_shape_permuted.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape_permuted.h % 8 == 0 ? 8 : out_shape_permuted.h % 4 == 0 ? 4 + : 1; + if (out_shape_permuted.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape_permuted.c % 8 == 0 ? 8 : out_shape_permuted.c % 4 == 0 ? 4 + : 1; size_t elemsize; size_t out_elemsize; @@ -204,19 +207,19 @@ int Reshape_vulkan::create_pipeline(const Option& _opt) support_image_storage = false; opt.use_image_storage = false; } - + std::vector specializations(1 + 10); specializations[0].i = ndim; - specializations[1 + 0].i = 0; // shape_packed.dims; - specializations[1 + 1].i = 0; // shape_packed.w; - specializations[1 + 2].i = 0; // shape_packed.h; - specializations[1 + 3].i = 0; // shape_packed.c; - specializations[1 + 4].i = 0; // shape_packed.cstep; - specializations[1 + 5].i = 0; // out_shape_packed.dims; - specializations[1 + 6].i = 0; // out_shape_packed.w; - specializations[1 + 7].i = 0; // out_shape_packed.h; - specializations[1 + 8].i = 0; // out_shape_packed.c; - specializations[1 + 9].i = 0; // out_shape_packed.cstep; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + specializations[1 + 5].i = 0; // out_shape_packed.dims; + specializations[1 + 6].i = 0; // out_shape_packed.w; + specializations[1 + 7].i = 0; // out_shape_packed.h; + specializations[1 + 8].i = 0; // out_shape_packed.c; + specializations[1 + 9].i = 0; // out_shape_packed.cstep; Tensor local_size_xyz_bottom; // pack4to1 and pack8to1 if (shape_packed.dims == 1) @@ -415,7 +418,8 @@ int Reshape_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b if (outw == -1) outw = total; - out_elempack = opt.use_shader_pack8 && outw % 8 == 0 ? 8 : outw % 4 == 0 ? 4 : 1; + out_elempack = opt.use_shader_pack8 && outw % 8 == 0 ? 8 : outw % 4 == 0 ? 4 + : 1; if (dims == 1 && bottom_blob.w == outw && elempack == out_elempack) { @@ -435,7 +439,8 @@ int Reshape_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b if (outh == -1) outh = total / outw; - out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1; + out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 + : 1; if (dims == 2 && bottom_blob.h == outh && elempack == out_elempack) { @@ -460,7 +465,8 @@ int Reshape_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b if (outc == -1) outc = total / outh / outw; - out_elempack = opt.use_shader_pack8 && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1; + out_elempack = opt.use_shader_pack8 && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 + : 1; if (dims == 3 && bottom_blob.c == outc && elempack == out_elempack) { @@ -576,5 +582,4 @@ int Reshape_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b return 0; } - -} // namespace TEngine \ No newline at end of file +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/reshape_vulkan.hpp b/source/device/vulkan/layer/reshape_vulkan.hpp index 33bc2be41..1d52e48a8 100644 --- a/source/device/vulkan/layer/reshape_vulkan.hpp +++ b/source/device/vulkan/layer/reshape_vulkan.hpp @@ -45,7 +45,7 @@ #include "reshape_param.h" -namespace TEngine{ +namespace TEngine { class Reshape_vulkan : public Layer { @@ -55,7 +55,7 @@ class Reshape_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); - + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; public: @@ -90,9 +90,8 @@ class Reshape_vulkan : public Layer int permute; int ndim; - }; -} // namespace TEngine +} // namespace TEngine #endif \ No newline at end of file diff --git a/source/device/vulkan/layer/softmax_vulkan.cpp b/source/device/vulkan/layer/softmax_vulkan.cpp index 970e03295..8ee653505 100644 --- a/source/device/vulkan/layer/softmax_vulkan.cpp +++ b/source/device/vulkan/layer/softmax_vulkan.cpp @@ -86,24 +86,24 @@ Softmax_vulkan::Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) graph = ir_graph; node = ir_node; - struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); std::string name = input->name; bottoms.push_back(name); - struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); name = output->name; tops.push_back(name); // params - input_c = input->dims[1]; // param->input_channel; + input_c = input->dims[1]; // param->input_channel; input_h = input->dims[2]; input_w = input->dims[3]; - output_c = output->dims[1]; // param->output_channel; + output_c = output->dims[1]; // param->output_channel; output_h = output->dims[2]; output_w = output->dims[3]; - - struct softmax_param *param = (struct softmax_param *)ir_node->op.param_mem; - axis = param->axis-1; + + struct softmax_param* param = (struct softmax_param*)ir_node->op.param_mem; + axis = param->axis - 1; } int Softmax_vulkan::create_pipeline(const Option& opt) @@ -111,9 +111,12 @@ int Softmax_vulkan::create_pipeline(const Option& opt) const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; int elempack = 1; - if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; - if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; - if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 + : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 + : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 + : 1; size_t elemsize; if (opt.use_fp16_storage) @@ -162,16 +165,16 @@ int Softmax_vulkan::create_pipeline(const Option& opt) std::vector specializations(1 + 10); specializations[0].i = axis; - specializations[1 + 0].i = 0; // shape_packed.dims; - specializations[1 + 1].i = 0; // shape_packed.w; - specializations[1 + 2].i = 0; // shape_packed.h; - specializations[1 + 3].i = 0; // shape_packed.c; - specializations[1 + 4].i = 0; // shape_packed.cstep; - specializations[1 + 5].i = 0; // workspace_shape_packed.dims; - specializations[1 + 6].i = 0; // workspace_shape_packed.w; - specializations[1 + 7].i = 0; // workspace_shape_packed.h; - specializations[1 + 8].i = 0; // workspace_shape_packed.c; - specializations[1 + 9].i = 0; // workspace_shape_packed.cstep; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + specializations[1 + 5].i = 0; // workspace_shape_packed.dims; + specializations[1 + 6].i = 0; // workspace_shape_packed.w; + specializations[1 + 7].i = 0; // workspace_shape_packed.h; + specializations[1 + 8].i = 0; // workspace_shape_packed.c; + specializations[1 + 9].i = 0; // workspace_shape_packed.cstep; { Tensor local_size_xyz; @@ -294,7 +297,6 @@ int Softmax_vulkan::create_pipeline(const Option& opt) return 0; } - int Softmax_vulkan::destroy_pipeline(const Option& /*opt*/) { delete pipeline_softmax_reduce_max; @@ -397,9 +399,9 @@ int Softmax_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c constants[8].i = max_workspace.c; constants[9].i = max_workspace.cstep; - const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_reduce_max_pack8 + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_reduce_max_pack8 : elempack == 4 ? pipeline_softmax_reduce_max_pack4 - : pipeline_softmax_reduce_max; + : pipeline_softmax_reduce_max; cmd.record_pipeline(pipeline, bindings, constants, max_workspace); } @@ -422,9 +424,9 @@ int Softmax_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c constants[8].i = max_workspace.c; constants[9].i = max_workspace.cstep; - const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_exp_sub_max_pack8 + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_exp_sub_max_pack8 : elempack == 4 ? pipeline_softmax_exp_sub_max_pack4 - : pipeline_softmax_exp_sub_max; + : pipeline_softmax_exp_sub_max; cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); } @@ -447,9 +449,9 @@ int Softmax_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c constants[8].i = sum_workspace.c; constants[9].i = sum_workspace.cstep; - const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_reduce_sum_pack8 + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_reduce_sum_pack8 : elempack == 4 ? pipeline_softmax_reduce_sum_pack4 - : pipeline_softmax_reduce_sum; + : pipeline_softmax_reduce_sum; cmd.record_pipeline(pipeline, bindings, constants, sum_workspace); } @@ -472,9 +474,9 @@ int Softmax_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c constants[8].i = sum_workspace.c; constants[9].i = sum_workspace.cstep; - const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_div_sum_pack8 + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_div_sum_pack8 : elempack == 4 ? pipeline_softmax_div_sum_pack4 - : pipeline_softmax_div_sum; + : pipeline_softmax_div_sum; cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); } @@ -482,5 +484,4 @@ int Softmax_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c return 0; } - -} // namespace TEngine +} // namespace TEngine diff --git a/source/device/vulkan/layer/softmax_vulkan.hpp b/source/device/vulkan/layer/softmax_vulkan.hpp index 108ea5d62..94c1be27c 100644 --- a/source/device/vulkan/layer/softmax_vulkan.hpp +++ b/source/device/vulkan/layer/softmax_vulkan.hpp @@ -45,7 +45,7 @@ #include "softmax_param.h" -namespace TEngine{ +namespace TEngine { class Softmax_vulkan : public Layer { @@ -55,7 +55,7 @@ class Softmax_vulkan : public Layer virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); - + virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const; public: @@ -82,9 +82,8 @@ class Softmax_vulkan : public Layer int output_c; int output_h; int output_w; - }; -} // namespace TEngine +} // namespace TEngine #endif \ No newline at end of file diff --git a/source/device/vulkan/layer_shader_type.h b/source/device/vulkan/layer_shader_type.h index e9c713062..2fc6d359c 100644 --- a/source/device/vulkan/layer_shader_type.h +++ b/source/device/vulkan/layer_shader_type.h @@ -47,7 +47,7 @@ enum LayerShaderType { #include "layer_shader_type_enum.h" }; -} // namespace LayerType +} // namespace LayerShaderType } // namespace TEngine diff --git a/source/device/vulkan/vulkan_allocator.cpp b/source/device/vulkan/vulkan_allocator.cpp index c5483ca4f..b901923cd 100644 --- a/source/device/vulkan/vulkan_allocator.cpp +++ b/source/device/vulkan/vulkan_allocator.cpp @@ -48,10 +48,10 @@ namespace TEngine { Allocator::~Allocator() { - } -VkAllocator::VkAllocator(const GPUDevice* _vkdev) : vkdev(_vkdev) +VkAllocator::VkAllocator(const GPUDevice* _vkdev) + : vkdev(_vkdev) { buffer_memory_type_index = (uint32_t)-1; image_memory_type_index = (uint32_t)-1; @@ -258,7 +258,8 @@ VkImageView VkAllocator::create_imageview(VkImageViewType type, VkImage image, V return imageview; } -VkBlobAllocator::VkBlobAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev) +VkBlobAllocator::VkBlobAllocator(const GPUDevice* _vkdev) + : VkAllocator(_vkdev) { buffer_offset_alignment = vkdev->info.buffer_offset_alignment; bind_memory_offset_alignment = vkdev->info.buffer_image_granularity; @@ -273,7 +274,7 @@ VkBlobAllocator::VkBlobAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev) buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.non_coherent_atom_size); } - block_size = alignSize(16 * 1024 * 1024, buffer_offset_alignment);// 16M + block_size = alignSize(16 * 1024 * 1024, buffer_offset_alignment); // 16M } VkBlobAllocator::~VkBlobAllocator() @@ -284,18 +285,18 @@ VkBlobAllocator::~VkBlobAllocator() // TODO void VkBlobAllocator::clear() { -// TLOG_INFO("VkBlobAllocator %lu", buffer_blocks.size()); + // TLOG_INFO("VkBlobAllocator %lu", buffer_blocks.size()); - for (size_t i=0; i >::iterator it = buffer_budgets[i].begin(); -// while (it != buffer_budgets[i].end()) -// { -// TLOG_INFO("VkBlobAllocator budget %p %lu %lu", ptr->buffer, it->first, it->second); -// it++; -// } + // std::list< std::pair >::iterator it = buffer_budgets[i].begin(); + // while (it != buffer_budgets[i].end()) + // { + // TLOG_INFO("VkBlobAllocator budget %p %lu %lu", ptr->buffer, it->first, it->second); + // it++; + // } if (mappable) vkUnmapMemory(vkdev->vkdevice(), ptr->memory); @@ -309,16 +310,16 @@ void VkBlobAllocator::clear() buffer_budgets.clear(); - for (size_t i=0; i >::iterator it = image_memory_budgets[i].begin(); -// while (it != image_memory_budgets[i].end()) -// { -// TLOG_INFO("VkBlobAllocator budget %p %lu %lu", memory, it->first, it->second); -// it++; -// } + // std::list< std::pair >::iterator it = image_memory_budgets[i].begin(); + // while (it != image_memory_budgets[i].end()) + // { + // TLOG_INFO("VkBlobAllocator budget %p %lu %lu", memory, it->first, it->second); + // it++; + // } vkFreeMemory(vkdev->vkdevice(), memory, 0); } @@ -334,9 +335,9 @@ VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size) const int buffer_block_count = buffer_blocks.size(); // find first spare space in buffer_blocks - for (int i=0; i >::iterator it = buffer_budgets[i].begin(); + std::list >::iterator it = buffer_budgets[i].begin(); while (it != buffer_budgets[i].end()) { size_t budget_size = it->second; @@ -430,7 +431,7 @@ VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size) ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; // adjust buffer_budgets - std::list< std::pair > budget; + std::list > budget; if (new_block_size > aligned_size) { budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size)); @@ -440,7 +441,6 @@ VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size) // TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity); return ptr; - } VkImageMemory* VkBlobAllocator::fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) @@ -536,9 +536,9 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int dims, int w, int h, int c, size_t const int image_memory_block_count = image_memory_blocks.size(); // find first spare space in image_memory_blocks - for (int i=0; i >::iterator it = image_memory_budgets[i].begin(); + std::list >::iterator it = image_memory_budgets[i].begin(); while (it != image_memory_budgets[i].end()) { // we cannot use it->first directly for base offset alignment @@ -589,7 +589,7 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int dims, int w, int h, int c, size_t it->second -= aligned_size; } -// TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity); + // TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity); return ptr; } @@ -636,27 +636,26 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int dims, int w, int h, int c, size_t // adjust image_memory_budgets image_memory_blocks.push_back(ptr->memory); - std::list< std::pair > budget; + std::list > budget; if (new_block_size > aligned_size) { budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size)); } image_memory_budgets.push_back(budget); -// TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity); + // TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity); return ptr; } - void VkBlobAllocator::fastFree(VkBufferMemory* ptr) { -// TLOG_INFO("VkBlobAllocator F %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity); + // TLOG_INFO("VkBlobAllocator F %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity); const int buffer_block_count = buffer_blocks.size(); int block_index = -1; - for (int i=0; ibuffer == ptr->buffer && buffer_blocks[i]->memory == ptr->memory) { @@ -675,10 +674,10 @@ void VkBlobAllocator::fastFree(VkBufferMemory* ptr) } // merge - std::list< std::pair >::iterator it_merge_left = buffer_budgets[block_index].end(); - std::list< std::pair >::iterator it_merge_right = buffer_budgets[block_index].end(); - std::list< std::pair >::iterator it = buffer_budgets[block_index].begin(); - for ( ; it != buffer_budgets[block_index].end(); it++) + std::list >::iterator it_merge_left = buffer_budgets[block_index].end(); + std::list >::iterator it_merge_right = buffer_budgets[block_index].end(); + std::list >::iterator it = buffer_budgets[block_index].begin(); + for (; it != buffer_budgets[block_index].end(); it++) { if (it->first + it->second == ptr->offset) { @@ -722,12 +721,12 @@ void VkBlobAllocator::fastFree(VkBufferMemory* ptr) void VkBlobAllocator::fastFree(VkImageMemory* ptr) { -// TLOG_INFO("VkBlobAllocator F %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity); + // TLOG_INFO("VkBlobAllocator F %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity); const int image_memory_block_count = image_memory_blocks.size(); int block_index = -1; - for (int i=0; imemory) { @@ -752,10 +751,10 @@ void VkBlobAllocator::fastFree(VkImageMemory* ptr) } // merge - std::list< std::pair >::iterator it_merge_left = image_memory_budgets[block_index].end(); - std::list< std::pair >::iterator it_merge_right = image_memory_budgets[block_index].end(); - std::list< std::pair >::iterator it = image_memory_budgets[block_index].begin(); - for ( ; it != image_memory_budgets[block_index].end(); it++) + std::list >::iterator it_merge_left = image_memory_budgets[block_index].end(); + std::list >::iterator it_merge_right = image_memory_budgets[block_index].end(); + std::list >::iterator it = image_memory_budgets[block_index].begin(); + for (; it != image_memory_budgets[block_index].end(); it++) { if (it->first + it->second == ptr->bind_offset) { @@ -803,7 +802,8 @@ void VkBlobAllocator::fastFree(VkImageMemory* ptr) } } -VkWeightAllocator::VkWeightAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev) +VkWeightAllocator::VkWeightAllocator(const GPUDevice* _vkdev) + : VkAllocator(_vkdev) { buffer_offset_alignment = vkdev->info.buffer_offset_alignment; bind_memory_offset_alignment = vkdev->info.buffer_image_granularity; @@ -818,7 +818,7 @@ VkWeightAllocator::VkWeightAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkd buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.non_coherent_atom_size); } - block_size = alignSize(8 * 1024 * 1024, buffer_offset_alignment);// 8M + block_size = alignSize(8 * 1024 * 1024, buffer_offset_alignment); // 8M } VkWeightAllocator::~VkWeightAllocator() @@ -827,7 +827,6 @@ VkWeightAllocator::~VkWeightAllocator() printf("run VkWeightAllocator descontruction function\n"); } - void VkWeightAllocator::clear() { printf("run VkWeightAllocator clear function\n"); @@ -842,9 +841,9 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) const int buffer_block_count = buffer_blocks.size(); // find first spare space in buffer_blocks - for (int i=0; i= aligned_size) { size_t block_offset = block_size - free_size; @@ -861,8 +860,8 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) buffer_block_free_spaces[i] -= aligned_size; - return ptr; - } + return ptr; + } } size_t new_block_size = std::max(block_size, aligned_size); @@ -874,7 +873,7 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) if (vkdev->info.support_VK_KHR_get_memory_requirements2 && vkdev->info.support_VK_KHR_dedicated_allocation) { - VkBufferMemoryRequirementsInfo2KHR bufferMemoryRequirementsInfo2; + VkBufferMemoryRequirementsInfo2KHR bufferMemoryRequirementsInfo2; bufferMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR; bufferMemoryRequirementsInfo2.pNext = 0; bufferMemoryRequirementsInfo2.buffer = block->buffer; @@ -892,42 +891,42 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation; - if (dedicatedAllocation) + if (dedicatedAllocation) { - // setup memory type and alignment - if (buffer_memory_type_index == (uint32_t)-1) + // setup memory type and alignment + if (buffer_memory_type_index == (uint32_t)-1) { - if (vkdev->info.type == 1) - { - // integrated gpu, prefer unified memory - buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); - } - else - { - // discrete gpu, device local - buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); - } - - mappable = vkdev->is_mappable(buffer_memory_type_index); + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(buffer_memory_type_index); coherent = vkdev->is_coherent(buffer_memory_type_index); - } + } - block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, buffer_memory_type_index, 0, block->buffer); - // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset - vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); + block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, buffer_memory_type_index, 0, block->buffer); + // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); - block->mapped_ptr = 0; + block->mapped_ptr = 0; if (mappable) { vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); } - dedicated_buffer_blocks.push_back(block); + dedicated_buffer_blocks.push_back(block); - // return sub buffer + // return sub buffer VkBufferMemory* ptr = new VkBufferMemory; - ptr->buffer = block->buffer; + ptr->buffer = block->buffer; ptr->offset = 0; ptr->memory = block->memory; ptr->capacity = new_block_size; @@ -936,7 +935,7 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; return ptr; - } + } } VkMemoryRequirements memoryRequirements; @@ -945,18 +944,18 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) // setup memory type and alignment if (buffer_memory_type_index == (uint32_t)-1) { - if (vkdev->info.type == 1) + if (vkdev->info.type == 1) { // integrated gpu, prefer unified memory - buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); - } - else - { - // discrete gpu, device local - buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); - } - - mappable = vkdev->is_mappable(buffer_memory_type_index); + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(buffer_memory_type_index); coherent = vkdev->is_coherent(buffer_memory_type_index); } @@ -965,7 +964,7 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) // ignore memoryRequirements.alignment as we always bind at zero offset vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); -// printf("VkWeightAllocator M %p", block->buffer); + // printf("VkWeightAllocator M %p", block->buffer); block->mapped_ptr = 0; if (mappable) { @@ -1155,7 +1154,7 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int dims, int w, int h, int c, size const int image_memory_block_count = image_memory_blocks.size(); // find first spare space in buffer_blocks - for (int i=0; ibuffer); + // TLOG_INFO("VkWeightAllocator F %p", ptr->buffer); delete ptr; } void VkWeightAllocator::fastFree(VkImageMemory* ptr) { -// TLOG_INFO("VkWeightAllocator F %p", ptr->memory); + // TLOG_INFO("VkWeightAllocator F %p", ptr->memory); if (!ptr->command_refcount) { @@ -1262,12 +1260,13 @@ void VkWeightAllocator::fastFree(VkImageMemory* ptr) } } -VkStagingAllocator::VkStagingAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev) +VkStagingAllocator::VkStagingAllocator(const GPUDevice* _vkdev) + : VkAllocator(_vkdev) { mappable = true; coherent = true; - size_compare_ratio = 192;// 0.75f * 256 + size_compare_ratio = 192; // 0.75f * 256 } VkStagingAllocator::~VkStagingAllocator() @@ -1277,13 +1276,13 @@ VkStagingAllocator::~VkStagingAllocator() void VkStagingAllocator::clear() { -// TLOG_INFO("VkStagingAllocator %lu", buffer_budgets.size()); + // TLOG_INFO("VkStagingAllocator %lu", buffer_budgets.size()); for (std::list::iterator it = buffer_budgets.begin(); it != buffer_budgets.end(); it++) { VkBufferMemory* ptr = *it; -// TLOG_INFO("VkStagingAllocator F %p", ptr->buffer); + // TLOG_INFO("VkStagingAllocator F %p", ptr->buffer); vkUnmapMemory(vkdev->vkdevice(), ptr->memory); vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); @@ -1310,7 +1309,7 @@ VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size) { buffer_budgets.erase(it); -// TLOG_INFO("VkStagingAllocator M %p %lu reused %lu", ptr->buffer, size, capacity); + // TLOG_INFO("VkStagingAllocator M %p %lu reused %lu", ptr->buffer, size, capacity); return ptr; } @@ -1342,7 +1341,7 @@ VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size) ptr->access_flags = 0; ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; -// TLOG_INFO("VkStagingAllocator M %p %lu", ptr->buffer, size); + // TLOG_INFO("VkStagingAllocator M %p %lu", ptr->buffer, size); return ptr; } @@ -1394,14 +1393,14 @@ VkImageMemory* VkStagingAllocator::fastMalloc(int dims, int w, int h, int c, siz ptr->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; ptr->command_refcount = 0; -// TLOG_INFO("VkStagingAllocator M %p %d %d %d %d %d", ptr->image, dims, width, height, depth, format); + // TLOG_INFO("VkStagingAllocator M %p %d %d %d %d %d", ptr->image, dims, width, height, depth, format); return ptr; } void VkStagingAllocator::fastFree(VkBufferMemory* ptr) { -// TLOG_INFO("VkStagingAllocator F %p", ptr->buffer); + // TLOG_INFO("VkStagingAllocator F %p", ptr->buffer); // return to buffer_budgets buffer_budgets.push_back(ptr); @@ -1409,14 +1408,15 @@ void VkStagingAllocator::fastFree(VkBufferMemory* ptr) void VkStagingAllocator::fastFree(VkImageMemory* ptr) { -// TLOG_INFO("VkStagingAllocator F %p", ptr->image); + // TLOG_INFO("VkStagingAllocator F %p", ptr->image); free(ptr->mapped_ptr); delete ptr; } -VkWeightStagingAllocator::VkWeightStagingAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev) +VkWeightStagingAllocator::VkWeightStagingAllocator(const GPUDevice* _vkdev) + : VkAllocator(_vkdev) { mappable = true; coherent = true; @@ -1455,14 +1455,14 @@ VkBufferMemory* VkWeightStagingAllocator::fastMalloc(size_t size) ptr->access_flags = 0; ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; -// printf("VkWeightStagingAllocator M %p %lu", ptr->buffer, size); + // printf("VkWeightStagingAllocator M %p %lu", ptr->buffer, size); return ptr; } void VkWeightStagingAllocator::fastFree(VkBufferMemory* ptr) { -// TLOG_INFO("VkWeightStagingAllocator F %p", ptr->buffer); + // TLOG_INFO("VkWeightStagingAllocator F %p", ptr->buffer); vkUnmapMemory(vkdev->vkdevice(), ptr->memory); vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); diff --git a/source/device/vulkan/vulkan_allocator.hpp b/source/device/vulkan/vulkan_allocator.hpp index 4a8f7e1c3..ffb0e4360 100644 --- a/source/device/vulkan/vulkan_allocator.hpp +++ b/source/device/vulkan/vulkan_allocator.hpp @@ -10,17 +10,18 @@ #include "vulkan_platform.hpp" namespace TEngine { - -#define MALLOC_ALIGN 16 -template static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp)) +#define MALLOC_ALIGN 16 + +template +static inline _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp)) { - return (_Tp*)(((size_t)ptr + n-1) & -n); + return (_Tp*)(((size_t)ptr + n - 1) & -n); } static inline size_t alignSize(size_t sz, int n) { - return (sz + n-1) & -n; + return (sz + n - 1) & -n; } static inline void* fastMalloc(size_t size) @@ -42,8 +43,12 @@ static inline void fastFree(void* ptr) } } -static inline int TENGINE_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; } - +static inline int TENGINE_XADD(int* addr, int delta) +{ + int tmp = *addr; + *addr += delta; + return tmp; +} class Allocator { @@ -158,8 +163,13 @@ class VkAllocator { public: VkAllocator(const GPUDevice* _vkdev); - virtual ~VkAllocator() { clear(); } - virtual void clear() {} + virtual ~VkAllocator() + { + clear(); + } + virtual void clear() + { + } virtual VkBufferMemory* fastMalloc(size_t size) = 0; virtual void fastFree(VkBufferMemory* ptr) = 0; @@ -198,16 +208,16 @@ class VkBlobAllocator : public VkAllocator virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); - virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);//{ return 0; } + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); //{ return 0; } virtual void fastFree(VkImageMemory* ptr); protected: size_t block_size; size_t buffer_offset_alignment; size_t bind_memory_offset_alignment; - std::vector< std::list< std::pair > > buffer_budgets; + std::vector > > buffer_budgets; std::vector buffer_blocks; - std::vector< std::list< std::pair > > image_memory_budgets; + std::vector > > image_memory_budgets; std::vector image_memory_blocks; }; @@ -224,7 +234,7 @@ class VkWeightAllocator : public VkAllocator public: virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); - virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);//{ return 0; } + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); //{ return 0; } virtual void fastFree(VkImageMemory* ptr); protected: @@ -239,7 +249,6 @@ class VkWeightAllocator : public VkAllocator std::vector dedicated_image_memory_blocks; }; - class VkStagingAllocator : public VkAllocator { public: @@ -256,15 +265,14 @@ class VkStagingAllocator : public VkAllocator virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); - virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);//{ return 0; } + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); //{ return 0; } virtual void fastFree(VkImageMemory* ptr); protected: - unsigned int size_compare_ratio;// 0~256 + unsigned int size_compare_ratio; // 0~256 std::list buffer_budgets; }; - class VkWeightStagingAllocator : public VkAllocator { public: @@ -274,11 +282,16 @@ class VkWeightStagingAllocator : public VkAllocator public: virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); - virtual VkImageMemory* fastMalloc(int /*dims*/, int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/) { return 0; } - virtual void fastFree(VkImageMemory* /*ptr*/) {} + virtual VkImageMemory* fastMalloc(int /*dims*/, int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/) + { + return 0; + } + virtual void fastFree(VkImageMemory* /*ptr*/) + { + } protected: }; -} +} // namespace TEngine #endif diff --git a/source/device/vulkan/vulkan_command.cpp b/source/device/vulkan/vulkan_command.cpp index b5545fe6b..05a7299ea 100644 --- a/source/device/vulkan/vulkan_command.cpp +++ b/source/device/vulkan/vulkan_command.cpp @@ -31,7 +31,8 @@ namespace TEngine { -VkCompute::VkCompute(const GPUDevice* _vkdev) : vkdev(_vkdev) +VkCompute::VkCompute(const GPUDevice* _vkdev) + : vkdev(_vkdev) { compute_command_pool = 0; compute_command_buffer = 0; @@ -40,10 +41,9 @@ VkCompute::VkCompute(const GPUDevice* _vkdev) : vkdev(_vkdev) init(); } - VkCompute::~VkCompute() { - for (size_t i=0; iinfo.support_VK_KHR_push_descriptor) { - for (size_t i=0; ivkdevice(), descriptor_pools[i], 1, &descriptorsets[i]); vkDestroyDescriptorPool(vkdev->vkdevice(), descriptor_pools[i], 0); @@ -82,76 +82,76 @@ void VkCompute::record_upload(tensor* src, VkTensor& dst, const Option& opt) { Tensor src_tensor = Tensor(src); record_upload(src_tensor, dst, opt); -// // const ir_tensor* src_fp16; -// // if (src.elemsize == src.elempack * 4u) -// if(src->elem_size == opt.elempack * 4u) -// { -// // cpu cast to fp16 (discrete gpu) -// if (vkdev->info.type == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && opt.elempack % 4 == 0))) -// { -// // ncnn::cast_float32_to_float16(src, src_fp16, opt); -// printf("need to add cast_float32_to_float16 here, fix me!\n"); -// } -// else -// { -// // src_fp16 = src; -// } -// } -// else -// { -// // src_fp16 = src; -// } - -// // upload -// VkTensor dst_staging; -// if (opt.blob_vkallocator->mappable) -// { -// // dst_staging.create_like(src_fp16, opt.blob_vkallocator); -// dst_staging.create_like(src, opt.blob_vkallocator); -// } -// else -// { -// // dst_staging.create_like(src_fp16, opt.staging_vkallocator); -// dst_staging.create_like(src, opt.staging_vkallocator); -// } -// if (dst_staging.empty()) -// return; - -// // stash staging -// upload_staging_buffers.push_back(dst_staging); - -// // TLOG_INFO("upload_staging_buffer %p -> %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity()); - -// // memcpy src to device -// // memcpy(dst_staging.mapped_ptr(), src_fp16->data, src_fp16->elem_size * src_fp16->elem_num); -// memcpy(dst_staging.mapped_ptr(), src->data, src->elem_size * src->elem_num); -// dst_staging.allocator->flush(dst_staging.data); - -// // mark device host-write @ null -// dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; -// dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; - -// // TODO -// // not use pack for now------------------------ -// // // resolve dst_elempack -// int dims = src->dim_num; -// int elemcount = 0; -// // src dims[0-3] n c h w -// // if (dims == 1) elemcount = opt.elempack * src_fp16.w; -// // if (dims == 2) elemcount = opt.elempack * src_fp16.h; -// // if (dims == 3) elemcount = opt.elempack * src_fp16.c; -// if(dims == 4) -// elemcount = opt.elempack * src->dims[1]; -// else -// elemcount = opt.elempack * src->dims[0]; - -// int dst_elempack = 1; -// if (opt.use_shader_pack8) -// dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1; -// else -// dst_elempack = elemcount % 4 == 0 ? 4 : 1; - -// vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt); + // // const ir_tensor* src_fp16; + // // if (src.elemsize == src.elempack * 4u) + // if(src->elem_size == opt.elempack * 4u) + // { + // // cpu cast to fp16 (discrete gpu) + // if (vkdev->info.type == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && opt.elempack % 4 == 0))) + // { + // // ncnn::cast_float32_to_float16(src, src_fp16, opt); + // printf("need to add cast_float32_to_float16 here, fix me!\n"); + // } + // else + // { + // // src_fp16 = src; + // } + // } + // else + // { + // // src_fp16 = src; + // } + + // // upload + // VkTensor dst_staging; + // if (opt.blob_vkallocator->mappable) + // { + // // dst_staging.create_like(src_fp16, opt.blob_vkallocator); + // dst_staging.create_like(src, opt.blob_vkallocator); + // } + // else + // { + // // dst_staging.create_like(src_fp16, opt.staging_vkallocator); + // dst_staging.create_like(src, opt.staging_vkallocator); + // } + // if (dst_staging.empty()) + // return; + + // // stash staging + // upload_staging_buffers.push_back(dst_staging); + + // // TLOG_INFO("upload_staging_buffer %p -> %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity()); + + // // memcpy src to device + // // memcpy(dst_staging.mapped_ptr(), src_fp16->data, src_fp16->elem_size * src_fp16->elem_num); + // memcpy(dst_staging.mapped_ptr(), src->data, src->elem_size * src->elem_num); + // dst_staging.allocator->flush(dst_staging.data); + + // // mark device host-write @ null + // dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; + // dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; + + // // TODO + // // not use pack for now------------------------ + // // // resolve dst_elempack + // int dims = src->dim_num; + // int elemcount = 0; + // // src dims[0-3] n c h w + // // if (dims == 1) elemcount = opt.elempack * src_fp16.w; + // // if (dims == 2) elemcount = opt.elempack * src_fp16.h; + // // if (dims == 3) elemcount = opt.elempack * src_fp16.c; + // if(dims == 4) + // elemcount = opt.elempack * src->dims[1]; + // else + // elemcount = opt.elempack * src->dims[0]; + + // int dst_elempack = 1; + // if (opt.use_shader_pack8) + // dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1; + // else + // dst_elempack = elemcount % 4 == 0 ? 4 : 1; + + // vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt); } void VkCompute::record_upload(const Tensor& src, VkTensor& dst, const Option& opt) @@ -193,7 +193,7 @@ void VkCompute::record_upload(const Tensor& src, VkTensor& dst, const Option& op // stash staging upload_staging_buffers.push_back(dst_staging); -// TLOG_INFO("upload_staging_buffer %p -> %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity()); + // TLOG_INFO("upload_staging_buffer %p -> %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity()); // memcpy src to device memcpy(dst_staging.mapped_ptr(), src_fp16.data, src_fp16.total() * src_fp16.elemsize); @@ -212,10 +212,11 @@ void VkCompute::record_upload(const Tensor& src, VkTensor& dst, const Option& op int dst_elempack = 1; if (opt.use_shader_pack8) - dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1; + dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 + : 1; else dst_elempack = elemcount % 4 == 0 ? 4 : 1; - + // gpu cast to fp16 on the fly (integrated gpu) vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt); } @@ -384,71 +385,71 @@ int VkCompute::submit_and_wait() // printf("delayed_records count:%d\n", record_count); // handle delayed records - for (size_t i=0; i %p", src.buffer(), src.buffer_offset(), src.buffer_capacity(), dst.data); + // TLOG_INFO("post_download %p +%d ~%d -> %p", src.buffer(), src.buffer_offset(), src.buffer_capacity(), dst.data); - src.allocator->invalidate(src.data); - // memcpy(dst.data, src.mapped_ptr(), dst.elem_size * dst.elem_num); - memcpy(dst.data, src.mapped_ptr(), dst.total() * dst.elemsize); - break; - } - case record::TYPE_post_cast_float16_to_float32: - { - // TODO - printf("submit delayed_records TYPE_post_cast_float16_to_float32, Do nothing, fix me\n"); - break; - } - default: - break; + src.allocator->invalidate(src.data); + // memcpy(dst.data, src.mapped_ptr(), dst.elem_size * dst.elem_num); + memcpy(dst.data, src.mapped_ptr(), dst.total() * dst.elemsize); + break; + } + case record::TYPE_post_cast_float16_to_float32: + { + // TODO + printf("submit delayed_records TYPE_post_cast_float16_to_float32, Do nothing, fix me\n"); + break; + } + default: + break; } } @@ -534,7 +535,6 @@ int VkCompute::submit_and_wait() return 0; } - int VkCompute::init() { // compute_command_pool @@ -664,7 +664,7 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vectorshader_info.binding_types[i]; @@ -673,7 +673,7 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vectorget_dummy_buffer() : buffer_bindings[buffer_index]; buffer_index++; -// TLOG_INFO("binding #%d buffer = %d %d %d %d @ %lu %d = %p +%ld ~%ld", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.buffer(), binding.buffer_offset(), binding.buffer_capacity()); + // TLOG_INFO("binding #%d buffer = %d %d %d %d @ %lu %d = %p +%ld ~%ld", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.buffer(), binding.buffer_offset(), binding.buffer_capacity()); if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) { @@ -719,7 +719,7 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vectorget_dummy_image() : image_bindings[image_index]; image_index++; -// TLOG_INFO("binding #%d image = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview()); + // TLOG_INFO("binding #%d image = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview()); if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_GENERAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) { @@ -775,11 +775,11 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vectorget_dummy_image() : image_bindings[image_index]; image_index++; -// TLOG_INFO("binding #%d sampler = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview()); + // TLOG_INFO("binding #%d sampler = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview()); // if the same image used for both storage image and combined image sampler // only apply image layout transition to general - for (int j=0; jshader_info.binding_types[j] == 2 && binding.data == image_bindings[j].data) { @@ -865,7 +865,7 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vectorshader_info.binding_types[i]; @@ -910,7 +910,7 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vectorshader_info.binding_types[i]; @@ -972,7 +972,7 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector writeDescriptorSets(binding_count); { const unsigned char* p_descriptorInfos = descriptorInfos.data(); - for (int i=0; ishader_info.binding_types[i]; @@ -1072,7 +1072,8 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vectorinfo.unified_compute_transfer_queue) @@ -1174,8 +1175,8 @@ int VkTransfer::init() } } - // upload_command_buffer - { + // upload_command_buffer + { VkCommandBufferAllocateInfo commandBufferAllocateInfo; commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; commandBufferAllocateInfo.pNext = 0; @@ -1189,10 +1190,10 @@ int VkTransfer::init() printf("vkAllocateCommandBuffers failed %d", ret); return -1; } - } + } - // upload_compute_semaphore - { + // upload_compute_semaphore + { VkSemaphoreCreateInfo semaphoreCreateInfo; semaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; semaphoreCreateInfo.pNext = 0; @@ -1200,15 +1201,15 @@ int VkTransfer::init() VkResult ret = vkCreateSemaphore(vkdev->vkdevice(), &semaphoreCreateInfo, 0, &upload_compute_semaphore); - if (ret != VK_SUCCESS) - { + if (ret != VK_SUCCESS) + { printf("vkCreateSemaphore failed %d", ret); - return -1; + return -1; + } } - } - // upload_command_fence - { + // upload_command_fence + { VkFenceCreateInfo fenceCreateInfo; fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; fenceCreateInfo.pNext = 0; @@ -1216,13 +1217,13 @@ int VkTransfer::init() VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &upload_command_fence); - if (ret != VK_SUCCESS) + if (ret != VK_SUCCESS) { printf("vkCreateFence failed %d", ret); return -1; + } } } - } begin_command_buffer(); @@ -1266,7 +1267,6 @@ int VkTransfer::begin_command_buffer() return 0; } - int VkTransfer::end_command_buffer() { { @@ -1362,9 +1362,9 @@ int VkTransfer::submit_and_wait() return -1; } } - + { - VkPipelineStageFlags wait_dst_stage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;// FIXME + VkPipelineStageFlags wait_dst_stage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; // FIXME VkSubmitInfo submitInfo; submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; submitInfo.pNext = 0; @@ -1386,11 +1386,11 @@ int VkTransfer::submit_and_wait() return -1; } } - + vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index, transfer_queue); } vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); - + // wait if (vkdev->info.unified_compute_transfer_queue) { @@ -1403,7 +1403,7 @@ int VkTransfer::submit_and_wait() } else { - VkFence fences[2] = { upload_command_fence, compute_command_fence }; + VkFence fences[2] = {upload_command_fence, compute_command_fence}; VkResult ret = vkWaitForFences(vkdev->vkdevice(), 2, fences, VK_TRUE, UINT64_MAX); if (ret != VK_SUCCESS) @@ -1417,7 +1417,7 @@ int VkTransfer::submit_and_wait() void VkTransfer::record_upload(const Tensor& src, VkTensor& dst, const Option& opt) { -// TLOG_INFO("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack); + // TLOG_INFO("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack); // NOTE keep the hack here ? if (src.elemsize == src.elempack * 4u) @@ -1596,7 +1596,7 @@ void VkTransfer::record_upload(const Tensor& src, VkTensor& dst, const Option& o void VkTransfer::record_upload(const tensor* src, VkTensor& dst, const Option& opt) { -// TLOG_INFO("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack); + // TLOG_INFO("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack); // NOTE keep the hack here ? // printf("elem size: %d, elempack:%d\n", src.elemsize, src.elempack); diff --git a/source/device/vulkan/vulkan_command.hpp b/source/device/vulkan/vulkan_command.hpp index 1f5e82e06..345371066 100644 --- a/source/device/vulkan/vulkan_command.hpp +++ b/source/device/vulkan/vulkan_command.hpp @@ -55,7 +55,7 @@ class VkCompute void record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, const VkTensor& dispatcher); void record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, const VkImageTensor& dispatcher); void record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, int dispatcher_w, int dispatcher_h, int dispatcher_c); - + int submit_and_wait(); int reset(); @@ -106,35 +106,110 @@ class VkCompute union { - struct { VkBuffer src; VkBuffer dst; uint32_t region_count; const VkBufferCopy* regions; } copy_buffer; - struct { VkImage src; VkImageLayout src_layout; VkImage dst; VkImageLayout dst_layout; uint32_t region_count; const VkImageCopy* regions; } copy_image; - struct { VkBuffer src; VkImage dst; VkImageLayout layout; uint32_t region_count; const VkBufferImageCopy* regions; } copy_buffer_to_image; - struct { VkImage src; VkImageLayout layout; VkBuffer dst; uint32_t region_count; const VkBufferImageCopy* regions; } copy_image_to_buffer; - - struct { VkPipelineBindPoint bind_point; VkPipeline pipeline; } bind_pipeline; - struct { VkPipelineBindPoint bind_point; VkPipelineLayout pipeline_layout; uint32_t descriptorset_count; uint32_t descriptorset_offset; } bind_descriptorsets; - struct { VkPipelineLayout pipeline_layout; VkShaderStageFlags stage_flags; uint32_t size; const void* values; } push_constants; - - struct { uint32_t group_count_x; uint32_t group_count_y; uint32_t group_count_z; } dispatch; - - struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkMemoryBarrier* barriers; } memory_barrers; - struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkBufferMemoryBarrier* barriers; } buffer_barrers; - struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkImageMemoryBarrier* barriers; } image_barrers; - - struct { uint32_t download_post_buffer_mat_offset; uint32_t download_post_mat_fp16_offset; } post_download; - struct { uint32_t download_post_mat_fp16_offset; uint32_t download_post_mat_offset; } post_cast_float16_to_float32; + struct + { + VkBuffer src; + VkBuffer dst; + uint32_t region_count; + const VkBufferCopy* regions; + } copy_buffer; + struct + { + VkImage src; + VkImageLayout src_layout; + VkImage dst; + VkImageLayout dst_layout; + uint32_t region_count; + const VkImageCopy* regions; + } copy_image; + struct + { + VkBuffer src; + VkImage dst; + VkImageLayout layout; + uint32_t region_count; + const VkBufferImageCopy* regions; + } copy_buffer_to_image; + struct + { + VkImage src; + VkImageLayout layout; + VkBuffer dst; + uint32_t region_count; + const VkBufferImageCopy* regions; + } copy_image_to_buffer; + + struct + { + VkPipelineBindPoint bind_point; + VkPipeline pipeline; + } bind_pipeline; + struct + { + VkPipelineBindPoint bind_point; + VkPipelineLayout pipeline_layout; + uint32_t descriptorset_count; + uint32_t descriptorset_offset; + } bind_descriptorsets; + struct + { + VkPipelineLayout pipeline_layout; + VkShaderStageFlags stage_flags; + uint32_t size; + const void* values; + } push_constants; + + struct + { + uint32_t group_count_x; + uint32_t group_count_y; + uint32_t group_count_z; + } dispatch; + + struct + { + VkPipelineStageFlags src_stage; + VkPipelineStageFlags dst_stage; + uint32_t barrier_count; + const VkMemoryBarrier* barriers; + } memory_barrers; + struct + { + VkPipelineStageFlags src_stage; + VkPipelineStageFlags dst_stage; + uint32_t barrier_count; + const VkBufferMemoryBarrier* barriers; + } buffer_barrers; + struct + { + VkPipelineStageFlags src_stage; + VkPipelineStageFlags dst_stage; + uint32_t barrier_count; + const VkImageMemoryBarrier* barriers; + } image_barrers; + + struct + { + uint32_t download_post_buffer_mat_offset; + uint32_t download_post_mat_fp16_offset; + } post_download; + struct + { + uint32_t download_post_mat_fp16_offset; + uint32_t download_post_mat_offset; + } post_cast_float16_to_float32; }; }; std::vector delayed_records; }; - class VkTransfer { public: VkTransfer(const GPUDevice* vkdev); ~VkTransfer(); + public: void record_upload(const tensor* src, VkTensor& dst, const Option& opt); void record_upload(const Tensor& src, VkTensor& dst, const Option& opt); diff --git a/source/device/vulkan/vulkan_define.h b/source/device/vulkan/vulkan_define.h index e0c68277a..68de6df99 100644 --- a/source/device/vulkan/vulkan_define.h +++ b/source/device/vulkan/vulkan_define.h @@ -26,9 +26,8 @@ #define VULKAN_DEV_NAME "VK" - typedef struct vulkan_option { char* dev_name; - int precision; //!< precision of calculation + int precision; //!< precision of calculation } vulkan_opt_t; diff --git a/source/device/vulkan/vulkan_device.hpp b/source/device/vulkan/vulkan_device.hpp index 9560261fe..1fee0d5e1 100644 --- a/source/device/vulkan/vulkan_device.hpp +++ b/source/device/vulkan/vulkan_device.hpp @@ -26,8 +26,7 @@ #include "vulkan_define.h" -extern "C" -{ +extern "C" { #include "api/c_api.h" #include "device/device.h" diff --git a/source/device/vulkan/vulkan_executor.hpp b/source/device/vulkan/vulkan_executor.hpp index 28ae46efb..c4cc99a6c 100644 --- a/source/device/vulkan/vulkan_executor.hpp +++ b/source/device/vulkan/vulkan_executor.hpp @@ -22,9 +22,7 @@ * Author: lswang@openailab.com */ - -extern "C" -{ +extern "C" { #include "api/c_api.h" #include "device/device.h" #include "graph/tensor.h" @@ -57,15 +55,15 @@ struct VULKANqueue int dims; // cl_kernel queue_kernel; // cl_event enentPoint; - size_t *queue_global_work_size; - size_t *queue_local_work_size; + size_t* queue_global_work_size; + size_t* queue_local_work_size; }; class VULKANEngine { public: -// VULKANEngine(); -// ~VULKANEngine() = default; + // VULKANEngine(); + // ~VULKANEngine() = default; int VULKANEnginePreRun(struct subgraph* subgraph); int VULKANEngineRun(struct subgraph* subgraph); @@ -75,15 +73,10 @@ class VULKANEngine bool init(); private: - public: // dict_uint2clmem vulkan_tensor_map; - std::vector queue_list; + std::vector queue_list; public: int bin_num; - }; - - - diff --git a/source/device/vulkan/vulkan_gpu.cpp b/source/device/vulkan/vulkan_gpu.cpp index dac4e9486..fba68aa70 100644 --- a/source/device/vulkan/vulkan_gpu.cpp +++ b/source/device/vulkan/vulkan_gpu.cpp @@ -80,8 +80,7 @@ struct layer_shader_registry_entry #include "layer_shader_spv_data.h" -static const layer_shader_registry_entry layer_shader_registry[] = -{ +static const layer_shader_registry_entry layer_shader_registry[] = { #include "layer_shader_registry.h" }; @@ -130,21 +129,23 @@ PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR = 0; // compile with old vulkan sdk #if VK_HEADER_VERSION < 80 #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000 -typedef struct VkPhysicalDevice8BitStorageFeaturesKHR { - VkStructureType sType; - void* pNext; - VkBool32 storageBuffer8BitAccess; - VkBool32 uniformAndStorageBuffer8BitAccess; - VkBool32 storagePushConstant8; +typedef struct VkPhysicalDevice8BitStorageFeaturesKHR +{ + VkStructureType sType; + void* pNext; + VkBool32 storageBuffer8BitAccess; + VkBool32 uniformAndStorageBuffer8BitAccess; + VkBool32 storagePushConstant8; } VkPhysicalDevice8BitStorageFeaturesKHR; #endif // VK_HEADER_VERSION < 80 #if VK_HEADER_VERSION < 95 #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000 -typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR { - VkStructureType sType; - void* pNext; - VkBool32 shaderFloat16; - VkBool32 shaderInt8; +typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR +{ + VkStructureType sType; + void* pNext; + VkBool32 shaderFloat16; + VkBool32 shaderInt8; } VkPhysicalDeviceFloat16Int8FeaturesKHR; #endif // VK_HEADER_VERSION < 95 @@ -157,7 +158,7 @@ static int init_instance_extension() if (support_VK_KHR_get_physical_device_properties2) { - vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures2KHR"); + vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures2KHR"); vkGetPhysicalDeviceProperties2KHR = (PFN_vkGetPhysicalDeviceProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceProperties2KHR"); vkGetPhysicalDeviceFormatProperties2KHR = (PFN_vkGetPhysicalDeviceFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFormatProperties2KHR"); vkGetPhysicalDeviceImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceImageFormatProperties2KHR"); @@ -225,19 +226,19 @@ void DestroyDebugUtilsMessengerEXT(VkInstance instance, VkDebugUtilsMessengerEXT static uint32_t find_device_compute_queue(const std::vector& queueFamilyProperties) { // first try, compute only queue - for (uint32_t i=0; i& queueFamilyProperties) { // first try, graphics only queue - for (uint32_t i=0; i& queueFamilyProperties) { // first try, transfer only queue - for (uint32_t i=0; i= 26 support_VK_KHR_android_surface = 0; #endif // __ANDROID_API__ >= 26 - for (uint32_t j=0; j= 26 gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = 0; #endif // __ANDROID_API__ >= 26 - for (uint32_t j=0; j enabledExtensions; if (info.support_VK_KHR_8bit_storage) @@ -986,9 +988,9 @@ GPUDevice::GPUDevice(int device_index) : info(g_gpu_infos[device_index]) querySamplerYcbcrConversionFeatures.pNext = enabledExtensionFeatures; enabledExtensionFeatures = &querySamplerYcbcrConversionFeatures; } - std::vector compute_queue_priorities(info.compute_queue_count, 1.f);// 0.f ~ 1.f - std::vector graphics_queue_priorities(info.graphics_queue_count, 1.f);// 0.f ~ 1.f - std::vector transfer_queue_priorities(info.transfer_queue_count, 1.f);// 0.f ~ 1.f + std::vector compute_queue_priorities(info.compute_queue_count, 1.f); // 0.f ~ 1.f + std::vector graphics_queue_priorities(info.graphics_queue_count, 1.f); // 0.f ~ 1.f + std::vector transfer_queue_priorities(info.transfer_queue_count, 1.f); // 0.f ~ 1.f VkDeviceQueueCreateInfo deviceQueueCreateInfos[3]; VkDeviceQueueCreateInfo deviceComputeQueueCreateInfo; @@ -1048,7 +1050,7 @@ GPUDevice::GPUDevice(int device_index) : info(g_gpu_infos[device_index]) deviceCreateInfo.ppEnabledLayerNames = 0; deviceCreateInfo.enabledExtensionCount = enabledExtensions.size(); deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data(); - deviceCreateInfo.pEnabledFeatures = 0;// VkPhysicalDeviceFeatures pointer + deviceCreateInfo.pEnabledFeatures = 0; // VkPhysicalDeviceFeatures pointer VkResult ret = vkCreateDevice(info.physical_device, &deviceCreateInfo, 0, &device); if (ret != VK_SUCCESS) @@ -1066,7 +1068,7 @@ GPUDevice::GPUDevice(int device_index) : info(g_gpu_infos[device_index]) for (uint32_t i = 0; i < info.compute_queue_count; i++) { vkGetDeviceQueue(device, info.compute_queue_family_index, i, &compute_queues[i]); - + blob_allocators[i] = new VkBlobAllocator(this); staging_allocators[i] = new VkStagingAllocator(this); } @@ -1265,13 +1267,10 @@ VkShaderModule GPUDevice::compile_shader_module(const uint32_t* spv_data, size_t return shader_module; } - - - uint32_t GPUDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const { // first try, find required and with preferred and without preferred_not - for (uint32_t i=0; i& queues = queue_family_index == info.compute_queue_family_index ? compute_queues - : queue_family_index == info.graphics_queue_family_index ? graphics_queues : transfer_queues; - for (int i=0; i<(int)queues.size(); i++) + std::vector& queues = queue_family_index == info.compute_queue_family_index ? compute_queues + : queue_family_index == info.graphics_queue_family_index ? graphics_queues + : transfer_queues; + for (int i = 0; i < (int)queues.size(); i++) { VkQueue queue = queues[i]; if (queue) @@ -1390,9 +1390,10 @@ void GPUDevice::reclaim_queue(uint32_t queue_family_index, VkQueue queue) const // TODO MutexLockGuard lock(queue_lock); - std::vector& queues = queue_family_index == info.compute_queue_family_index ? compute_queues - : queue_family_index == info.graphics_queue_family_index ? graphics_queues : transfer_queues; - for (int i=0; i<(int)queues.size(); i++) + std::vector& queues = queue_family_index == info.compute_queue_family_index ? compute_queues + : queue_family_index == info.graphics_queue_family_index ? graphics_queues + : transfer_queues; + for (int i = 0; i < (int)queues.size(); i++) { if (!queues[i]) { @@ -1408,7 +1409,7 @@ VkAllocator* GPUDevice::acquire_blob_allocator() const { MutexLockGuard lock(blob_allocator_lock); - for (int i=0; i<(int)blob_allocators.size(); i++) + for (int i = 0; i < (int)blob_allocators.size(); i++) { VkAllocator* allocator = blob_allocators[i]; if (allocator) @@ -1426,7 +1427,7 @@ void GPUDevice::reclaim_blob_allocator(VkAllocator* allocator) const { MutexLockGuard lock(blob_allocator_lock); - for (int i=0; i<(int)blob_allocators.size(); i++) + for (int i = 0; i < (int)blob_allocators.size(); i++) { if (!blob_allocators[i]) { @@ -1438,12 +1439,11 @@ void GPUDevice::reclaim_blob_allocator(VkAllocator* allocator) const TLOG_INFO("FATAL ERROR! reclaim_blob_allocator get wild allocator %p", allocator); } - VkAllocator* GPUDevice::acquire_staging_allocator() const { MutexLockGuard lock(staging_allocator_lock); - for (int i=0; i<(int)staging_allocators.size(); i++) + for (int i = 0; i < (int)staging_allocators.size(); i++) { VkAllocator* allocator = staging_allocators[i]; if (allocator) @@ -1457,12 +1457,11 @@ VkAllocator* GPUDevice::acquire_staging_allocator() const return 0; } - void GPUDevice::reclaim_staging_allocator(VkAllocator* allocator) const { MutexLockGuard lock(staging_allocator_lock); - for (int i=0; i<(int)staging_allocators.size(); i++) + for (int i = 0; i < (int)staging_allocators.size(); i++) { if (!staging_allocators[i]) { @@ -1483,7 +1482,7 @@ int GPUDevice::create_shader_module() } shader_modules.resize(layer_shader_registry_entry_count, VK_NULL_HANDLE); - for (int i=0; ibind_offset, image.data->bind_capacity, image.imageview()); + // TLOG_INFO("xxx barrier image %p +%d ~%d %p", image.image(), image.data->bind_offset, image.data->bind_capacity, image.imageview()); // image layout transform any @ any to shader-write @ compute VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; @@ -1874,7 +1877,6 @@ class VkDummyCompute : public VkCompute image.data->image_layout = VK_IMAGE_LAYOUT_GENERAL; image.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; } - }; int GPUDevice::create_dummy_buffer_image() @@ -1911,56 +1913,57 @@ int GPUDevice::create_utility_operator() // from buffer | image // to buffer | image - for (int i0=0; i0<2; i0++) - { - for (int i1=0; i1<2; i1++) + for (int i0 = 0; i0 < 2; i0++) { - opt.use_image_storage = (i0 == 1 || i1 == 1); -// #if __APPLE__ -// if (opt.use_image_storage) -// continue; -// #endif - - // from fp32-b/i | fp16p-b/i | fp16s-b/i - // to fp32-b/i | fp16p-b/i | fp16s-b/i - for (int j0=0; j0<3; j0++) - { - for (int j1=0; j1<3; j1++) + for (int i1 = 0; i1 < 2; i1++) { - opt.use_fp16_packed = (j0 == 1 || j1 == 1); - opt.use_fp16_storage = (j0 == 2 || j1 == 2); - - if (!info.support_fp16_packed && opt.use_fp16_packed) - continue; - - if (!info.support_fp16_storage && opt.use_fp16_storage) - continue; - - // from pack1 | pack4 | pack8 - for (int k=0; k<3; k++) + opt.use_image_storage = (i0 == 1 || i1 == 1); + // #if __APPLE__ + // if (opt.use_image_storage) + // continue; + // #endif + + // from fp32-b/i | fp16p-b/i | fp16s-b/i + // to fp32-b/i | fp16p-b/i | fp16s-b/i + for (int j0 = 0; j0 < 3; j0++) { - // enable pack8 for pack8to1/pack8to4 - opt.use_shader_pack8 = true; - - { // create packing layer - TEngine::Packing_vulkan* uop = new Packing_vulkan(); - uop->vkdev = this; - - uop->out_elempack = k == 0 ? 1 : k == 1 ? 4 : 8; - uop->cast_type_from = j0 + 1; - uop->cast_type_to = j1 + 1; - uop->storage_type_from = i0; - uop->storage_type_to = i1; - // TLOG_INFO("out_elempack:%d %d %d %d %d\n", uop->out_elempack, uop->cast_type_from, uop->cast_type_to, uop->storage_type_from, uop->storage_type_to); - - uop->create_pipeline(opt); - - uop_packing[i0][i1][j0][j1][k] = uop; + for (int j1 = 0; j1 < 3; j1++) + { + opt.use_fp16_packed = (j0 == 1 || j1 == 1); + opt.use_fp16_storage = (j0 == 2 || j1 == 2); + + if (!info.support_fp16_packed && opt.use_fp16_packed) + continue; + + if (!info.support_fp16_storage && opt.use_fp16_storage) + continue; + + // from pack1 | pack4 | pack8 + for (int k = 0; k < 3; k++) + { + // enable pack8 for pack8to1/pack8to4 + opt.use_shader_pack8 = true; + + { // create packing layer + TEngine::Packing_vulkan* uop = new Packing_vulkan(); + uop->vkdev = this; + + uop->out_elempack = k == 0 ? 1 : k == 1 ? 4 + : 8; + uop->cast_type_from = j0 + 1; + uop->cast_type_to = j1 + 1; + uop->storage_type_from = i0; + uop->storage_type_to = i1; + // TLOG_INFO("out_elempack:%d %d %d %d %d\n", uop->out_elempack, uop->cast_type_from, uop->cast_type_to, uop->storage_type_from, uop->storage_type_to); + + uop->create_pipeline(opt); + + uop_packing[i0][i1][j0][j1][k] = uop; + } + } } } } - } - } } return 0; @@ -1972,47 +1975,47 @@ void GPUDevice::destroy_utility_operator() // from buffer | image // to buffer | image - for (int i0=0; i0<2; i0++) + for (int i0 = 0; i0 < 2; i0++) { - for (int i1=0; i1<2; i1++) - { - opt.use_image_storage = (i0 == 1 || i1 == 1); + for (int i1 = 0; i1 < 2; i1++) + { + opt.use_image_storage = (i0 == 1 || i1 == 1); #if __APPLE__ - if (opt.use_image_storage) - continue; + if (opt.use_image_storage) + continue; #endif - // from fp32-b/i | fp16p-b/i | fp16s-b/i - // to fp32-b/i | fp16p-b/i | fp16s-b/i - for (int j0=0; j0<3; j0++) - { - for (int j1=0; j1<3; j1++) - { - opt.use_fp16_packed = (j0 == 1 || j1 == 1); - opt.use_fp16_storage = (j0 == 2 || j1 == 2); + // from fp32-b/i | fp16p-b/i | fp16s-b/i + // to fp32-b/i | fp16p-b/i | fp16s-b/i + for (int j0 = 0; j0 < 3; j0++) + { + for (int j1 = 0; j1 < 3; j1++) + { + opt.use_fp16_packed = (j0 == 1 || j1 == 1); + opt.use_fp16_storage = (j0 == 2 || j1 == 2); - if (!info.support_fp16_packed && opt.use_fp16_packed) - continue; + if (!info.support_fp16_packed && opt.use_fp16_packed) + continue; - if (!info.support_fp16_storage && opt.use_fp16_storage) - continue; + if (!info.support_fp16_storage && opt.use_fp16_storage) + continue; - // from pack1 | pack4 | pack8 - for (int k=0; k<3; k++) - { - opt.use_shader_pack8 = (k == 2 || k == 2); + // from pack1 | pack4 | pack8 + for (int k = 0; k < 3; k++) + { + opt.use_shader_pack8 = (k == 2 || k == 2); - TEngine::Layer* uop = uop_packing[i0][i1][j0][j1][k]; + TEngine::Layer* uop = uop_packing[i0][i1][j0][j1][k]; - uop->destroy_pipeline(opt); + uop->destroy_pipeline(opt); - delete uop; + delete uop; - uop_packing[i0][i1][j0][j1][k] = 0; + uop_packing[i0][i1][j0][j1][k] = 0; + } + } } } - } - } } } @@ -2022,9 +2025,12 @@ void GPUDevice::convert_packing(const VkTensor& src, VkTensor& dst, int dst_elem Option opt = _opt; opt.use_image_storage = false; - int cast_type_from_index = src.elemsize == src.elempack * 4u ? 0 : opt.use_fp16_storage ? 2 : 1; - int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed && dst_elempack % 4 == 0 ? 1 : 0; - int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2; + int cast_type_from_index = src.elemsize == src.elempack * 4u ? 0 : opt.use_fp16_storage ? 2 + : 1; + int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed && dst_elempack % 4 == 0 ? 1 + : 0; + int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 + : 2; // TLOG_INFO("convert_packing b2b %d %d %d\n", cast_type_from_index, cast_type_to_index, packing_type_to_index); diff --git a/source/device/vulkan/vulkan_gpu.hpp b/source/device/vulkan/vulkan_gpu.hpp index b0a6466a1..b5cce6eac 100644 --- a/source/device/vulkan/vulkan_gpu.hpp +++ b/source/device/vulkan/vulkan_gpu.hpp @@ -196,7 +196,10 @@ class GPUDevice const GpuInfo& info; - VkDevice vkdevice() const { return device; } + VkDevice vkdevice() const + { + return device; + } VkShaderModule get_shader_module(int shader_type_index) const; @@ -294,17 +297,17 @@ class GPUDevice mutable std::vector compute_queues; mutable std::vector graphics_queues; mutable std::vector transfer_queues; - + mutable Mutex queue_lock; // default blob allocator for each queue mutable std::vector blob_allocators; - + mutable Mutex blob_allocator_lock; // default staging allocator for each queue mutable std::vector staging_allocators; - + mutable Mutex staging_allocator_lock; // dummy buffer and image @@ -335,15 +338,24 @@ class ShaderInfo // 1 = storage buffer // 2 = storage image // 3 = combined image sampler - int binding_types[16];// 16 is large enough(maybe) + int binding_types[16]; // 16 is large enough(maybe) }; const ShaderInfo& get_shader_info(int shader_type_index); int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info); -union vk_specialization_type { int i; float f; uint32_t u32; }; -union vk_constant_type { int i; float f; }; +union vk_specialization_type +{ + int i; + float f; + uint32_t u32; +}; +union vk_constant_type +{ + int i; + float f; +}; -} +} // namespace TEngine #endif // VULKAN_GPU_HPP diff --git a/source/device/vulkan/vulkan_graph.hpp b/source/device/vulkan/vulkan_graph.hpp index 8218f271c..700f95103 100644 --- a/source/device/vulkan/vulkan_graph.hpp +++ b/source/device/vulkan/vulkan_graph.hpp @@ -38,8 +38,7 @@ #include "vulkan_option.hpp" #include "vulkan_layer.hpp" -extern "C" -{ +extern "C" { // #include "device/device.h" // #include "graph/subgraph.h" @@ -55,19 +54,21 @@ extern "C" #include "utility/vector.h" #include "utility/log.h" - #include "convolution_param.h" namespace TEngine { class VulkanDevice; -class VulkanGraph { - -friend VulkanDevice; +class VulkanGraph +{ + friend VulkanDevice; public: - const std::string& GetName(void) const {return name_;} + const std::string& GetName(void) const + { + return name_; + } VulkanGraph(const std::string& name); VulkanGraph(struct subgraph* graph); @@ -82,14 +83,14 @@ friend VulkanDevice; bool CreatePoolingPipeline(ir_node_t* node); std::unordered_map tensor_map_; // tengine lite cpu tensor list - std::unordered_map tensor_map; // vulkan cpu tensor list - std::unordered_map vktensor_map_; // vulkan gpu tensor list + std::unordered_map tensor_map; // vulkan cpu tensor list + std::unordered_map vktensor_map_; // vulkan gpu tensor list bool OpSupported(const std::string& name); Option opt; Pipeline* pipeline_convolution; - + int record_graph_pipeline(); int upload_model(); @@ -106,23 +107,21 @@ friend VulkanDevice; VkAllocator* weight_vkallocator; VkAllocator* weight_staging_vkallocator; - -private: +private: VkAllocator* local_blob_vkallocator; VkAllocator* local_staging_vkallocator; - + std::string name_; - std::vector gpu_mem_vector_; - std::vector mem_buf_vector_; + std::vector gpu_mem_vector_; + std::vector mem_buf_vector_; std::map iotensor_map_; }; } //namespace TEngine - int vulkan_dev_init(struct device* dev); int vulkan_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options); int vulkan_dev_run(struct device* dev, struct subgraph* subgraph); @@ -130,7 +129,6 @@ int vulkan_dev_postrun(struct device* dev, struct subgraph* subgraph); int vulkan_dev_release(struct device* dev); } - /* diff --git a/source/device/vulkan/vulkan_helper.hpp b/source/device/vulkan/vulkan_helper.hpp index 3955be7bb..a273a3b25 100644 --- a/source/device/vulkan/vulkan_helper.hpp +++ b/source/device/vulkan/vulkan_helper.hpp @@ -32,8 +32,7 @@ #include #include -extern "C" -{ +extern "C" { #include "api/c_api.h" #include "graph/tensor.h" #include "graph/node.h" @@ -49,7 +48,7 @@ extern "C" // bool CHECK_ENQUEUE_BUFFER_STATUS(cl_int status); /** convert the kernel file into a string */ -int convertToString(const char *filename, std::string& s); +int convertToString(const char* filename, std::string& s); /**Getting platforms and choose an available one.*/ // int getPlatform(cl_platform_id &platform); @@ -60,4 +59,3 @@ int convertToString(const char *filename, std::string& s); void get_device_message(); void dump_sub_graph(struct subgraph* sub_graph); - diff --git a/source/device/vulkan/vulkan_layer.cpp b/source/device/vulkan/vulkan_layer.cpp index a4c7e4dab..84f2b9de2 100644 --- a/source/device/vulkan/vulkan_layer.cpp +++ b/source/device/vulkan/vulkan_layer.cpp @@ -81,4 +81,4 @@ int Layer::record_pipeline(const std::vector& bottom_blobs, std::vecto return 0; } -} // TEngine \ No newline at end of file +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/vulkan_layer.hpp b/source/device/vulkan/vulkan_layer.hpp index 526ca148b..2c2be9710 100644 --- a/source/device/vulkan/vulkan_layer.hpp +++ b/source/device/vulkan/vulkan_layer.hpp @@ -44,8 +44,7 @@ #include "vulkan_command.hpp" #include "vulkan_pipeline.hpp" -extern "C" -{ +extern "C" { #include "api/c_api.h" #include "device/device.h" #include "graph/tensor.h" @@ -114,6 +113,6 @@ class Layer Layer* create_layer(std::string type); -} // TEngine +} // namespace TEngine #endif // VULKAN_LAYER_HPP diff --git a/source/device/vulkan/vulkan_limit.hpp b/source/device/vulkan/vulkan_limit.hpp index 741786fae..fbb45e089 100644 --- a/source/device/vulkan/vulkan_limit.hpp +++ b/source/device/vulkan/vulkan_limit.hpp @@ -22,139 +22,134 @@ * Author: hhchen@openailab.com */ - #pragma once -extern "C" -{ +extern "C" { #include "operator/op.h" } - const int vulkan_supported_ops[] = { - OP_CLIP, - OP_CONCAT, - OP_CONST, - OP_CONV, - OP_DROPOUT, - OP_ELTWISE, - OP_FC, - OP_FLATTEN, - OP_INPUT, -//// OP_PERMUTE, - OP_POOL, - OP_RELU, - OP_RESHAPE, - OP_SLICE, -//// OP_SOFTMAX - - -// OP_BIAS, + OP_CLIP, + OP_CONCAT, + OP_CONST, + OP_CONV, + OP_DROPOUT, + OP_ELTWISE, + OP_FC, + OP_FLATTEN, + OP_INPUT, + //// OP_PERMUTE, + OP_POOL, + OP_RELU, + OP_RESHAPE, + OP_SLICE, + //// OP_SOFTMAX -//// OP_ABSVAL, -//// OP_ADD_N, -//// OP_ARGMAX, -//// OP_ARGMIN, -//// OP_BATCHNORM, -//// OP_BATCHTOSPACEND, -//// OP_BIAS, -//// OP_BROADMUL, -// -//// OP_CAST, -//// OP_CEIL, -//// OP_CLIP, -//// OP_COMPARISON, -//// OP_CONCAT, -// OP_CONST, -// OP_CONV, -//// OP_CROP, -//// OP_DECONV, -//// OP_DEPTHTOSPACE, -//// OP_DETECTION_OUTPUT, -//// OP_DETECTION_POSTPROCESS, -// -//// OP_DROPOUT, -//// OP_ELTWISE, -//// OP_ELU, -//// OP_EMBEDDING, -//// OP_EXPANDDIMS, -//// OP_FC, -//// OP_FLATTEN, -//// OP_GATHER, -//// OP_GEMM, -//// OP_GRU, -//// OP_HARDSIGMOID, -//// OP_HARDSWISH, -// OP_INPUT, -//// OP_INSTANCENORM, -//// OP_INTERP, -//// OP_LOGICAL, -//// OP_LOGISTIC, -//// OP_LRN, -//// OP_LSTM, -//// OP_MATMUL, -//// OP_MAXIMUM, -//// OP_MEAN, -//// OP_MINIMUM, -//// OP_MVN, -//// OP_NOOP, -//// OP_NORMALIZE, -// -//// OP_PAD, -//// OP_PERMUTE, -// OP_POOL, -//// OP_PRELU, -//// OP_PRIORBOX, -//// OP_PSROIPOOLING, -//// OP_REDUCEL2, -//// OP_REDUCTION, -//// OP_REGION, -// OP_RELU, -// -//// OP_RELU6, -//// OP_REORG, -//// OP_RESHAPE, -//// OP_RESIZE, -//// OP_REVERSE, -//// OP_RNN, -//// OP_ROIALIGN, -//// OP_ROIPOOLING, -//// OP_ROUND, -//// OP_RPN, -//// OP_SCALE, -//// OP_SELU, -//// OP_SHUFFLECHANNEL, -//// OP_SIGMOID, -// -//// OP_SLICE, -//// OP_SOFTMAX, -//// OP_SPACETOBATCHND, -//// OP_SPACETODEPTH, -//// OP_SPARSETODENSE, -//// OP_SPLIT, -//// OP_SQUAREDDIFFERENCE, -//// OP_SQUEEZE, -//// OP_STRIDED_SLICE, -//// OP_SWAP_AXIS, -//// OP_TANH, -//// OP_THRESHOLD, -//// OP_TOPKV2, -//// OP_TRANSPOSE, -//// OP_UNARY, -//// OP_UNSQUEEZE, -//// OP_UPSAMPLE, -//// OP_ZEROSLIKE, -//// OP_MISH, -//// OP_LOGSOFTMAX, -//// OP_RELU1, -//// OP_L2NORMALIZATION, -//// OP_L2POOL, -//// OP_TILE, -//// OP_SHAPE, -//// OP_SCATTER, -//// OP_WHERE, -//// OP_BUILTIN_LAST + // OP_BIAS, + //// OP_ABSVAL, + //// OP_ADD_N, + //// OP_ARGMAX, + //// OP_ARGMIN, + //// OP_BATCHNORM, + //// OP_BATCHTOSPACEND, + //// OP_BIAS, + //// OP_BROADMUL, + // + //// OP_CAST, + //// OP_CEIL, + //// OP_CLIP, + //// OP_COMPARISON, + //// OP_CONCAT, + // OP_CONST, + // OP_CONV, + //// OP_CROP, + //// OP_DECONV, + //// OP_DEPTHTOSPACE, + //// OP_DETECTION_OUTPUT, + //// OP_DETECTION_POSTPROCESS, + // + //// OP_DROPOUT, + //// OP_ELTWISE, + //// OP_ELU, + //// OP_EMBEDDING, + //// OP_EXPANDDIMS, + //// OP_FC, + //// OP_FLATTEN, + //// OP_GATHER, + //// OP_GEMM, + //// OP_GRU, + //// OP_HARDSIGMOID, + //// OP_HARDSWISH, + // OP_INPUT, + //// OP_INSTANCENORM, + //// OP_INTERP, + //// OP_LOGICAL, + //// OP_LOGISTIC, + //// OP_LRN, + //// OP_LSTM, + //// OP_MATMUL, + //// OP_MAXIMUM, + //// OP_MEAN, + //// OP_MINIMUM, + //// OP_MVN, + //// OP_NOOP, + //// OP_NORMALIZE, + // + //// OP_PAD, + //// OP_PERMUTE, + // OP_POOL, + //// OP_PRELU, + //// OP_PRIORBOX, + //// OP_PSROIPOOLING, + //// OP_REDUCEL2, + //// OP_REDUCTION, + //// OP_REGION, + // OP_RELU, + // + //// OP_RELU6, + //// OP_REORG, + //// OP_RESHAPE, + //// OP_RESIZE, + //// OP_REVERSE, + //// OP_RNN, + //// OP_ROIALIGN, + //// OP_ROIPOOLING, + //// OP_ROUND, + //// OP_RPN, + //// OP_SCALE, + //// OP_SELU, + //// OP_SHUFFLECHANNEL, + //// OP_SIGMOID, + // + //// OP_SLICE, + //// OP_SOFTMAX, + //// OP_SPACETOBATCHND, + //// OP_SPACETODEPTH, + //// OP_SPARSETODENSE, + //// OP_SPLIT, + //// OP_SQUAREDDIFFERENCE, + //// OP_SQUEEZE, + //// OP_STRIDED_SLICE, + //// OP_SWAP_AXIS, + //// OP_TANH, + //// OP_THRESHOLD, + //// OP_TOPKV2, + //// OP_TRANSPOSE, + //// OP_UNARY, + //// OP_UNSQUEEZE, + //// OP_UPSAMPLE, + //// OP_ZEROSLIKE, + //// OP_MISH, + //// OP_LOGSOFTMAX, + //// OP_RELU1, + //// OP_L2NORMALIZATION, + //// OP_L2POOL, + //// OP_TILE, + //// OP_SHAPE, + //// OP_SCATTER, + //// OP_WHERE, + //// OP_BUILTIN_LAST }; diff --git a/source/device/vulkan/vulkan_option.cpp b/source/device/vulkan/vulkan_option.cpp index d57440411..e61d37a13 100644 --- a/source/device/vulkan/vulkan_option.cpp +++ b/source/device/vulkan/vulkan_option.cpp @@ -58,7 +58,7 @@ Option::Option() use_int8_inference = true; use_vulkan_compute = true; - use_fp16_packed = true; + use_fp16_packed = true; use_fp16_storage = true; use_fp16_arithmetic = false; use_int8_storage = false; diff --git a/source/device/vulkan/vulkan_pipeline.cpp b/source/device/vulkan/vulkan_pipeline.cpp index 6935c76b5..d604db1f6 100644 --- a/source/device/vulkan/vulkan_pipeline.cpp +++ b/source/device/vulkan/vulkan_pipeline.cpp @@ -46,7 +46,8 @@ namespace TEngine { -Pipeline::Pipeline(const GPUDevice* _vkdev) : vkdev(_vkdev) +Pipeline::Pipeline(const GPUDevice* _vkdev) + : vkdev(_vkdev) { local_shader_module = 0; @@ -92,7 +93,7 @@ int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std:: local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size); } -// TLOG_INFO("local_shader_module %p created", local_shader_module); + // TLOG_INFO("local_shader_module %p created", local_shader_module); return create(local_shader_module, si, specializations); } @@ -198,12 +199,12 @@ void Pipeline::destroy() { vkdev->vkDestroyDescriptorUpdateTemplateKHR(vkdev->vkdevice(), descriptor_update_template, 0); descriptor_update_template = 0; - } + } } if (pipeline) { - vkDestroyPipeline(vkdev->vkdevice(), pipeline, 0); + vkDestroyPipeline(vkdev->vkdevice(), pipeline, 0); pipeline = 0; } @@ -307,7 +308,7 @@ void Pipeline::set_local_size_xyz(int w, int h, int c) local_size_y = h; local_size_z = c; -// TLOG_INFO("local size = %d %d %d", local_size_x, local_size_y, local_size_z); + // TLOG_INFO("local size = %d %d %d", local_size_x, local_size_y, local_size_z); } int Pipeline::create_descriptorset_layout() @@ -321,7 +322,7 @@ int Pipeline::create_descriptorset_layout() } std::vector descriptorSetLayoutBindings(binding_count); - for (int i=0; i 0) { - pipelineLayoutCreateInfo.pushConstantRangeCount = 1; - pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange; + pipelineLayoutCreateInfo.pushConstantRangeCount = 1; + pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange; } else { - pipelineLayoutCreateInfo.pushConstantRangeCount = 0; - pipelineLayoutCreateInfo.pPushConstantRanges = 0; + pipelineLayoutCreateInfo.pushConstantRangeCount = 0; + pipelineLayoutCreateInfo.pPushConstantRanges = 0; } VkResult ret = vkCreatePipelineLayout(vkdev->vkdevice(), &pipelineLayoutCreateInfo, 0, &pipeline_layout); @@ -418,7 +419,6 @@ int Pipeline::create_pipeline_layout() return 0; } - int Pipeline::create_pipeline(VkShaderModule shader_module, const std::vector& specializations) { const int specialization_count = specializations.size(); @@ -427,7 +427,7 @@ int Pipeline::create_pipeline(VkShaderModule shader_module, const std::vector specializationMapEntries; specializationMapEntries.resize(specialization_count + 3); - for (int i=0; i descriptorUpdateTemplateEntries(binding_count); size_t offset = 0; - for (int i=0; iinfo.support_VK_KHR_push_descriptor) { - descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR; + descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR; } else { - descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR; + descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR; } // descriptorSetLayout should be ignored if VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR // FIXME HACK WARNING TODO NOTE but crash on radv if set NULL :( @@ -564,5 +564,4 @@ int Pipeline::create_descriptor_update_template() return 0; } - } // namespace TEngine diff --git a/source/device/vulkan/vulkan_pipeline.hpp b/source/device/vulkan/vulkan_pipeline.hpp index 9980d2e43..a2c349901 100644 --- a/source/device/vulkan/vulkan_pipeline.hpp +++ b/source/device/vulkan/vulkan_pipeline.hpp @@ -57,7 +57,7 @@ class Pipeline public: void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4); - + void set_optimal_local_size_xyz(const VkTensor& local_size_xyz); void set_optimal_local_size_xyz(const Tensor& local_size_xyz); void set_local_size_xyz(int w, int h, int c); diff --git a/source/device/vulkan/vulkan_platform.hpp b/source/device/vulkan/vulkan_platform.hpp index cc03681a7..97f588246 100644 --- a/source/device/vulkan/vulkan_platform.hpp +++ b/source/device/vulkan/vulkan_platform.hpp @@ -47,10 +47,23 @@ namespace TEngine { class Mutex { public: - Mutex() { pthread_mutex_init(&mutex, 0); } - ~Mutex() { pthread_mutex_destroy(&mutex); } - void lock() { pthread_mutex_lock(&mutex); } - void unlock() { pthread_mutex_unlock(&mutex); } + Mutex() + { + pthread_mutex_init(&mutex, 0); + } + ~Mutex() + { + pthread_mutex_destroy(&mutex); + } + void lock() + { + pthread_mutex_lock(&mutex); + } + void unlock() + { + pthread_mutex_unlock(&mutex); + } + private: friend class ConditionVariable; pthread_mutex_t mutex; @@ -59,8 +72,16 @@ class Mutex class MutexLockGuard { public: - MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); } - ~MutexLockGuard() { mutex.unlock(); } + MutexLockGuard(Mutex& _mutex) + : mutex(_mutex) + { + mutex.lock(); + } + ~MutexLockGuard() + { + mutex.unlock(); + } + private: Mutex& mutex; }; @@ -68,11 +89,27 @@ class MutexLockGuard class ConditionVariable { public: - ConditionVariable() { pthread_cond_init(&cond, 0); } - ~ConditionVariable() { pthread_cond_destroy(&cond); } - void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); } - void broadcast() { pthread_cond_broadcast(&cond); } - void signal() { pthread_cond_signal(&cond); } + ConditionVariable() + { + pthread_cond_init(&cond, 0); + } + ~ConditionVariable() + { + pthread_cond_destroy(&cond); + } + void wait(Mutex& mutex) + { + pthread_cond_wait(&cond, &mutex.mutex); + } + void broadcast() + { + pthread_cond_broadcast(&cond); + } + void signal() + { + pthread_cond_signal(&cond); + } + private: pthread_cond_t cond; }; @@ -80,9 +117,18 @@ class ConditionVariable class Thread { public: - Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); } - ~Thread() {} - void join() { pthread_join(t, 0); } + Thread(void* (*start)(void*), void* args = 0) + { + pthread_create(&t, 0, start, args); + } + ~Thread() + { + } + void join() + { + pthread_join(t, 0); + } + private: pthread_t t; }; diff --git a/source/device/vulkan/vulkan_tensor.cpp b/source/device/vulkan/vulkan_tensor.cpp index 38f588502..8beff0cc8 100644 --- a/source/device/vulkan/vulkan_tensor.cpp +++ b/source/device/vulkan/vulkan_tensor.cpp @@ -98,7 +98,7 @@ void convert_packing(const Tensor& src, Tensor& dst, int _elempack, const Option if (dst.empty()) return; - #pragma omp parallel for +#pragma omp parallel for for (int i = 0; i < outh; i++) { unsigned char* outptr = (unsigned char*)dst + i * w * out_elemsize; @@ -135,7 +135,7 @@ void convert_packing(const Tensor& src, Tensor& dst, int _elempack, const Option if (dst.empty()) return; - #pragma omp parallel for +#pragma omp parallel for for (int q = 0; q < outc; q++) { Tensor out = dst.channel(q); @@ -309,11 +309,11 @@ void cast_float32_to_float16(const Tensor& src, Tensor& dst, const Option& opt) dst.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); } if (dst.empty()) - return ; + return; int size = w * h * elempack; - #pragma omp parallel for +#pragma omp parallel for for (int q = 0; q < channels; q++) { const float* ptr = src.channel(q); @@ -324,7 +324,6 @@ void cast_float32_to_float16(const Tensor& src, Tensor& dst, const Option& opt) outptr[i] = float32_to_float16(ptr[i]); } } - } void cast_float16_to_float32(const Tensor& src, Tensor& dst, const Option& opt) @@ -353,11 +352,11 @@ void cast_float16_to_float32(const Tensor& src, Tensor& dst, const Option& opt) dst.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); } if (dst.empty()) - return ; + return; int size = w * h * elempack; - #pragma omp parallel for +#pragma omp parallel for for (int q = 0; q < channels; q++) { const unsigned short* ptr = src.channel(q); @@ -368,7 +367,6 @@ void cast_float16_to_float32(const Tensor& src, Tensor& dst, const Option& opt) outptr[i] = float16_to_float32(ptr[i]); } } - } -} // namespace TEngine +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_tensor.hpp b/source/device/vulkan/vulkan_tensor.hpp index a0ef5a9bd..f10868c8c 100644 --- a/source/device/vulkan/vulkan_tensor.hpp +++ b/source/device/vulkan/vulkan_tensor.hpp @@ -44,8 +44,7 @@ #include // #include "tengine_ir.h" -extern "C" -{ +extern "C" { #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" @@ -169,8 +168,10 @@ class Tensor const float* row(int y) const; // access raw data - template operator T*(); - template operator const T*() const; + template + operator T*(); + template + operator const T*() const; // pointer to the data void* data; @@ -205,8 +206,6 @@ class Tensor size_t cstep; }; - - class VkTensor { public: @@ -242,7 +241,7 @@ class VkTensor ~VkTensor(); // assign VkTensor& operator=(const VkTensor& m); - // reshape vec + // reshape vec VkTensor reshape(int w, Allocator* allocator = 0) const; // reshape image VkTensor reshape(int w, int h, Allocator* allocator = 0) const; @@ -290,7 +289,7 @@ class VkTensor // shape only // Mat shape() const; - + // low-level reference VkBuffer buffer() const; size_t buffer_offset() const; @@ -388,7 +387,6 @@ class VkImageTensor // allocate like void create_like(const VkImageTensor& im, VkAllocator* allocator); - // mapped ///Mat mapped() const; void* mapped_ptr() const; @@ -418,7 +416,7 @@ class VkImageTensor // pointer to the reference counter // when points to user-allocated data, the pointer is NULL - + int* refcount; // element size in bytes @@ -1139,7 +1137,6 @@ inline void VkImageTensor::create_like(const tensor* m, VkAllocator* _allocator) create(_w, _h, _c, _elemsize, _elempack, _allocator); } - inline void VkImageTensor::create_like(const VkTensor& m, VkAllocator* _allocator) { int _dims = m.dims; @@ -1248,23 +1245,25 @@ inline VkImageView VkImageTensor::imageview() const return data->imageview; } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////// //Tensor defination inline Tensor::Tensor() : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) { -} +} inline Tensor::Tensor(int _w, size_t _elemsize, Allocator* _allocator) : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) { create(_w, _elemsize, _allocator); -} +} -inline Tensor::Tensor(int _w, int _h, size_t _elemsize, Allocator* _allocator) : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0){ - create(_w, _h, _elemsize, _allocator);} +inline Tensor::Tensor(int _w, int _h, size_t _elemsize, Allocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _elemsize, _allocator); +} inline Tensor::Tensor(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator) : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) { @@ -1299,7 +1298,7 @@ inline Tensor::Tensor(const Tensor& m) inline Tensor::Tensor(struct tensor* m) : data(m->data), refcount(0), elemsize(0), elempack(1), allocator(0), dims(0), w(0), h(0), c(0) { - if(m->layout == 0) + if (m->layout == 0) { c = m->dims[1]; h = m->dims[2]; @@ -1398,7 +1397,7 @@ inline Tensor Tensor::reshape(int _w, Allocator* _allocator) const m.create(_w, elemsize, elempack, _allocator); // flatten - for (int i=0; i 0) { @@ -1792,26 +1791,23 @@ inline const float* Tensor::row(int y) const return (const float*)((unsigned char*)data + w * y * elemsize); } -template +template inline Tensor::operator T*() { return (T*)data; } -template +template inline Tensor::operator const T*() const { return (const T*)data; } void convert_packing(const Tensor& src, Tensor& dst, int elempack, const Option& opt = Option()); -void convert_packing(tensor* src, Tensor&dst, int elempack, const Option& opt = Option()); +void convert_packing(tensor* src, Tensor& dst, int elempack, const Option& opt = Option()); void cast_float32_to_float16(const Tensor& src, Tensor& dst, const Option& opt = Option()); void cast_float16_to_float32(const Tensor& src, Tensor& dst, const Option& opt = Option()); - } // namespace TEngine - #endif // VULKAN_TENSOR_HPP - diff --git a/source/executer/executer.c b/source/executer/executer.c index 28fb4e513..7350cc4c7 100644 --- a/source/executer/executer.c +++ b/source/executer/executer.c @@ -30,19 +30,17 @@ #include - void init_attribute(ir_attribute_t* attribute, ir_context_t* context) { - attribute->status = GRAPH_STAT_CREATED; - attribute->priority = 0; - attribute->policy = DEFAULT_POLICY; - attribute->private_context = 0; - attribute->context = context; - attribute->device_privacy = NULL; - attribute->scheduler_privacy = NULL; + attribute->status = GRAPH_STAT_CREATED; + attribute->priority = 0; + attribute->policy = DEFAULT_POLICY; + attribute->private_context = 0; + attribute->context = context; + attribute->device_privacy = NULL; + attribute->scheduler_privacy = NULL; } - void destroy_attribute(struct graph* graph, ir_attribute_t* attribute) { if (NULL != attribute->device_privacy) @@ -58,14 +56,12 @@ void destroy_attribute(struct graph* graph, ir_attribute_t* attribute) sys_free(attribute); } - int release_device_mem(struct device* dev, ir_memory_t* dev_mem) { // TODO: return -1; } - void init_ir_context(ir_context_t* context, const char* name) { if (NULL != name) @@ -76,11 +72,11 @@ void init_ir_context(ir_context_t* context, const char* name) } else { - context->name = NULL; + context->name = NULL; } - context->scheduler = NULL; - context->device = NULL; - context->default_options = NULL; - context->device_options = NULL; + context->scheduler = NULL; + context->device = NULL; + context->default_options = NULL; + context->device_options = NULL; } diff --git a/source/executer/executer.h b/source/executer/executer.h index 1d86dc2d9..fe381f4c2 100644 --- a/source/executer/executer.h +++ b/source/executer/executer.h @@ -32,7 +32,6 @@ struct device; struct graph; struct scheduler; - /*! * @struct ir_context_t * @brief Abstract neural network runnable execution context @@ -40,42 +39,39 @@ struct scheduler; typedef struct context { char* name; - struct scheduler* scheduler; //!< binding scheduler of this context - struct device* device; //!< binding device of this context - void* default_options; // - ir_graph_t* create_ir_graph(struct context* context) { ir_graph_t* ir_graph = (ir_graph_t*)sys_malloc(sizeof(ir_graph_t)); @@ -55,37 +53,35 @@ ir_graph_t* create_ir_graph(struct context* context) return ir_graph; } - void init_ir_graph(ir_graph_t* graph, struct context* context) { - graph->tensor_list = NULL; - graph->node_list = NULL; - graph->input_nodes = NULL; - graph->output_nodes = NULL; + graph->tensor_list = NULL; + graph->node_list = NULL; + graph->input_nodes = NULL; + graph->output_nodes = NULL; - graph->tensor_num = 0; - graph->node_num = 0; - graph->input_num = 0; - graph->output_num = 0; + graph->tensor_num = 0; + graph->node_num = 0; + graph->input_num = 0; + graph->output_num = 0; - graph->subgraph_list = create_vector(sizeof(struct subgraph*), NULL); + graph->subgraph_list = create_vector(sizeof(struct subgraph*), NULL); - graph->graph_layout = TENGINE_LAYOUT_NCHW; - graph->model_layout = TENGINE_LAYOUT_NCHW; - graph->model_format = MODEL_FORMAT_TENGINE; + graph->graph_layout = TENGINE_LAYOUT_NCHW; + graph->model_layout = TENGINE_LAYOUT_NCHW; + graph->model_format = MODEL_FORMAT_TENGINE; - graph->serializer = NULL; - graph->serializer_privacy = NULL; + graph->serializer = NULL; + graph->serializer_privacy = NULL; - graph->device = NULL; - graph->device_privacy = NULL; + graph->device = NULL; + graph->device_privacy = NULL; - graph->status = GRAPH_STAT_CREATED; + graph->status = GRAPH_STAT_CREATED; init_attribute(graph->attribute, context); } - void destroy_ir_graph(ir_graph_t* graph) { //!< 1, destroy subgraph @@ -134,7 +130,6 @@ void destroy_ir_graph(ir_graph_t* graph) sys_free(graph); } - int set_ir_graph_input_node(ir_graph_t* graph, int16_t input_nodes[], int input_number) { if (0 >= input_number) @@ -142,7 +137,7 @@ int set_ir_graph_input_node(ir_graph_t* graph, int16_t input_nodes[], int input_ return -1; } - int16_t* new_input_nodes = ( int16_t* )sys_malloc(input_number * sizeof(int16_t)); + int16_t* new_input_nodes = (int16_t*)sys_malloc(input_number * sizeof(int16_t)); if (NULL == new_input_nodes) { return -1; @@ -200,25 +195,21 @@ int set_ir_graph_output_node(ir_graph_t* graph, int16_t output_nodes[], int outp return 0; } - struct tensor* get_ir_graph_tensor(ir_graph_t* graph, int index) { return graph->tensor_list[index]; } - struct node* get_ir_graph_node(ir_graph_t* graph, int index) { return graph->node_list[index]; } - struct subgraph* get_ir_graph_subgraph(ir_graph_t* graph, int index) { return *(struct subgraph**)get_vector_data(graph->subgraph_list, index); } - int infer_ir_graph_shape(ir_graph_t* graph) { const int node_num = graph->node_num; @@ -281,7 +272,6 @@ int infer_ir_graph_shape(ir_graph_t* graph) return 0; } - void dump_ir_graph(ir_graph_t* graph) { TLOG_INFO("graph node_num %u tensor_num: %u subgraph_num: %u\n", graph->node_num, graph->tensor_num, diff --git a/source/graph/graph.h b/source/graph/graph.h index e5d746138..a336ef837 100644 --- a/source/graph/graph.h +++ b/source/graph/graph.h @@ -33,41 +33,39 @@ struct tensor; struct device; struct attribute; - /*! * @struct ir_graph_t * @brief Abstract graph intermediate representation */ typedef struct graph { - struct tensor** tensor_list; //!< the tensor list of a graph - struct node** node_list; //!< the node list of a graph - int16_t* input_nodes; //!< input nodes index array of a graph - int16_t* output_nodes; //!< output nodes index array of a graph + struct tensor** tensor_list; //!< the tensor list of a graph + struct node** node_list; //!< the node list of a graph + int16_t* input_nodes; //!< input nodes index array of a graph + int16_t* output_nodes; //!< output nodes index array of a graph - uint16_t tensor_num; //!< the count of all graph tensor - uint16_t node_num; //!< the count of all graph node - uint16_t input_num; //!< input nodes index count of a graph - uint16_t output_num; //!< input nodes index count of a graph + uint16_t tensor_num; //!< the count of all graph tensor + uint16_t node_num; //!< the count of all graph node + uint16_t input_num; //!< input nodes index count of a graph + uint16_t output_num; //!< input nodes index count of a graph - int8_t graph_layout; //!< the data layout of a graph - int8_t model_layout; //!< model layout of graph source model - int8_t model_format; //!< model format of graph source model + int8_t graph_layout; //!< the data layout of a graph + int8_t model_layout; //!< model layout of graph source model + int8_t model_format; //!< model format of graph source model - uint8_t status; //!< the status of graph + uint8_t status; //!< the status of graph - struct serializer* serializer; //!< serializer of graph - void* serializer_privacy; //!< privacy data of serializer + struct serializer* serializer; //!< serializer of graph + void* serializer_privacy; //!< privacy data of serializer - struct device* device; //!< assigned nn_device for this graph - void* device_privacy; //!< privacy data of device + struct device* device; //!< assigned nn_device for this graph + void* device_privacy; //!< privacy data of device - struct attribute* attribute; //index = node_index; - ir_node->dynamic_shape = 0; - ir_node->input_num = 0; - ir_node->output_num = 0; - ir_node->node_type = TE_NODE_TYPE_INTER; - ir_node->input_tensors = NULL; - ir_node->output_tensors = NULL; - ir_node->name = NULL; - ir_node->op.type = op_type; - ir_node->op.version = op_version; - ir_node->op.same_shape = 1; - ir_node->op.param_size = 0; - ir_node->op.param_mem = NULL; - ir_node->op.infer_shape = NULL; - ir_node->subgraph_idx = -1; + ir_node->index = node_index; + ir_node->dynamic_shape = 0; + ir_node->input_num = 0; + ir_node->output_num = 0; + ir_node->node_type = TE_NODE_TYPE_INTER; + ir_node->input_tensors = NULL; + ir_node->output_tensors = NULL; + ir_node->name = NULL; + ir_node->op.type = op_type; + ir_node->op.version = op_version; + ir_node->op.same_shape = 1; + ir_node->op.param_size = 0; + ir_node->op.param_mem = NULL; + ir_node->op.infer_shape = NULL; + ir_node->subgraph_idx = -1; } - ir_node_t* create_ir_node(struct graph* ir_graph, const char* node_name, int op_type, int op_version) { ir_node_t* node = (ir_node_t*)sys_malloc(sizeof(ir_node_t)); @@ -69,7 +67,7 @@ ir_node_t* create_ir_node(struct graph* ir_graph, const char* node_name, int op_ init_ir_node(node, op_type, op_version, ir_graph->node_num); // check if any op param should be set - ir_method_t * method = find_op_method(op_type, op_version); + ir_method_t* method = find_op_method(op_type, op_version); if ((NULL != method) && (NULL != method->init) && (method->init(&node->op) < 0)) { sys_free(node); @@ -98,7 +96,6 @@ ir_node_t* create_ir_node(struct graph* ir_graph, const char* node_name, int op_ return node; } - void destroy_ir_node(struct graph* ir_graph, ir_node_t* ir_node) { if (NULL != ir_node->name) @@ -129,10 +126,9 @@ void destroy_ir_node(struct graph* ir_graph, ir_node_t* ir_node) sys_free(ir_node); } - char* create_ir_node_name_from_index(int index) { - char* name = ( char* )sys_malloc(16); + char* name = (char*)sys_malloc(16); if (NULL == name) { return NULL; @@ -141,7 +137,6 @@ char* create_ir_node_name_from_index(int index) return name; } - int get_ir_node_index_from_name(struct graph* ir_graph, const char* node_name) { ir_node_t* ir_node; @@ -177,12 +172,11 @@ int get_ir_node_index_from_name(struct graph* ir_graph, const char* node_name) return -1; } - int set_ir_node_input_tensor(ir_node_t* node, int input_idx, ir_tensor_t* tensor) { if (input_idx >= node->input_num) { - int16_t* new_tensor = ( int16_t* )sys_realloc(node->input_tensors, sizeof(int16_t) * (input_idx + 1)); + int16_t* new_tensor = (int16_t*)sys_realloc(node->input_tensors, sizeof(int16_t) * (input_idx + 1)); if (NULL == new_tensor) { @@ -206,7 +200,6 @@ int set_ir_node_input_tensor(ir_node_t* node, int input_idx, ir_tensor_t* tensor return 0; } - int set_ir_node_output_tensor(ir_node_t* node, int output_idx, ir_tensor_t* tensor) { if (output_idx >= node->output_num) @@ -228,7 +221,6 @@ int set_ir_node_output_tensor(ir_node_t* node, int output_idx, ir_tensor_t* tens return 0; } - void dump_ir_node(struct graph* ir_graph, ir_node_t* ir_node) { if (NULL != ir_node->name) diff --git a/source/graph/node.h b/source/graph/node.h index 7f8f8b74a..507737e32 100644 --- a/source/graph/node.h +++ b/source/graph/node.h @@ -31,35 +31,32 @@ #include - struct node; struct tensor; struct graph; - /*! * @struct ir_node_t * @brief Abstract node intermediate representation */ typedef struct node { - uint16_t index; //!< the index of a node - uint8_t dynamic_shape; //!< flag of dynamic shape - uint8_t input_num; //!< count of input tensor - uint8_t output_num; //!< count of output tensor - uint8_t node_type; //!< type of node: { input, output, intermediate } - int8_t subgraph_idx; //!< id of the owner subgraph + uint16_t index; //!< the index of a node + uint8_t dynamic_shape; //!< flag of dynamic shape + uint8_t input_num; //!< count of input tensor + uint8_t output_num; //!< count of output tensor + uint8_t node_type; //!< type of node: { input, output, intermediate } + int8_t subgraph_idx; //!< id of the owner subgraph - uint16_t* input_tensors; //!< id array of input tensor - uint16_t* output_tensors; //!< id array of output tensor + uint16_t* input_tensors; //!< id array of input tensor + uint16_t* output_tensors; //!< id array of output tensor - char* name; //!< name of a node + char* name; //!< name of a node - struct op op; //!< operator of a node - struct graph* graph; //!< pointer of the related graph + struct op op; //!< operator of a node + struct graph* graph; //!< pointer of the related graph } ir_node_t; - /*! * @brief Create a node for a graph. * @@ -72,7 +69,6 @@ typedef struct node */ ir_node_t* create_ir_node(struct graph* ir_graph, const char* node_name, int op_type, int op_version); - /*! * @brief Destroy a node. * @@ -83,7 +79,6 @@ ir_node_t* create_ir_node(struct graph* ir_graph, const char* node_name, int op_ */ void destroy_ir_node(struct graph* ir_graph, ir_node_t* ir_node); - /*! * @brief Set node name from id, for anonymity ones. * @@ -93,7 +88,6 @@ void destroy_ir_node(struct graph* ir_graph, ir_node_t* ir_node); */ char* create_ir_node_name_from_index(int index); - /*! * @brief Get node id from name, for anonymity ones. * @@ -108,7 +102,6 @@ char* create_ir_node_name_from_index(int index); */ int get_ir_node_index_from_name(struct graph* ir_graph, const char* node_name); - /*! * @brief Mark a tensor as node a specific input tensor. * @@ -120,7 +113,6 @@ int get_ir_node_index_from_name(struct graph* ir_graph, const char* node_name); */ int set_ir_node_input_tensor(ir_node_t* ir_node, int input_idx, struct tensor* tensor); - /*! * @brief Mark a tensor as node a specific output tensor. * @@ -132,7 +124,6 @@ int set_ir_node_input_tensor(ir_node_t* ir_node, int input_idx, struct tensor* t */ int set_ir_node_output_tensor(ir_node_t* ir_node, int output_idx, struct tensor* tensor); - /*! * @brief Dump the node. * diff --git a/source/graph/subgraph.c b/source/graph/subgraph.c index f0b619dab..41387e1ce 100644 --- a/source/graph/subgraph.c +++ b/source/graph/subgraph.c @@ -29,25 +29,23 @@ #include "device/device.h" #include "api/c_api.h" - void init_ir_subgraph(struct graph* graph, struct subgraph* subgraph, int index) { - subgraph->index = index; - subgraph->input_ready_count = 0; - subgraph->input_wait_count = 0; - subgraph->input_num = 0; - subgraph->output_num = 0; - subgraph->node_num = 0; - subgraph->node_list = NULL; - subgraph->input_tensor_list = NULL; - subgraph->output_tensor_list = NULL; - subgraph->graph = graph; - subgraph->device = NULL; - subgraph->device_graph = NULL; - subgraph->status = GRAPH_STAT_CREATED; + subgraph->index = index; + subgraph->input_ready_count = 0; + subgraph->input_wait_count = 0; + subgraph->input_num = 0; + subgraph->output_num = 0; + subgraph->node_num = 0; + subgraph->node_list = NULL; + subgraph->input_tensor_list = NULL; + subgraph->output_tensor_list = NULL; + subgraph->graph = graph; + subgraph->device = NULL; + subgraph->device_graph = NULL; + subgraph->status = GRAPH_STAT_CREATED; } - void release_ir_subgraph(struct graph* graph, struct subgraph* subgraph) { struct device* device = subgraph->device; diff --git a/source/graph/subgraph.h b/source/graph/subgraph.h index 9f7936833..1ae252c3b 100644 --- a/source/graph/subgraph.h +++ b/source/graph/subgraph.h @@ -30,33 +30,31 @@ struct graph; struct device; - /*! * @struct ir_subgraph_t * @brief Abstract subgraph intermediate representation */ typedef struct subgraph { - uint8_t index; //!< the index of a subgraph - uint8_t input_ready_count; //!< the count of all in ready input tensors - uint8_t input_wait_count; //!< the count of all out of ready input tensors - uint8_t input_num; //!< the count of input tensors - uint8_t output_num; //!< the count of output tensors - uint8_t status; //!< the execution status of subgraph + uint8_t index; //!< the index of a subgraph + uint8_t input_ready_count; //!< the count of all in ready input tensors + uint8_t input_wait_count; //!< the count of all out of ready input tensors + uint8_t input_num; //!< the count of input tensors + uint8_t output_num; //!< the count of output tensors + uint8_t status; //!< the execution status of subgraph - uint16_t node_num; //!< the count of nodes in subgraph - uint16_t* node_list; //!< all nodes index list of subgraph + uint16_t node_num; //!< the count of nodes in subgraph + uint16_t* node_list; //!< all nodes index list of subgraph - uint16_t* input_tensor_list; //!< input tensors index list of subgraph - uint16_t* output_tensor_list; //!< output tensors index list of subgraph + uint16_t* input_tensor_list; //!< input tensors index list of subgraph + uint16_t* output_tensor_list; //!< output tensors index list of subgraph - struct graph* graph; //!< the pointer of the related graph + struct graph* graph; //!< the pointer of the related graph - struct device* device; //!< the device which will the subgraph running on - void* device_graph; //!< the related device graph + struct device* device; //!< the device which will the subgraph running on + void* device_graph; //!< the related device graph } ir_subgraph_t; - /*! * @brief Init a subgraph. * @@ -66,7 +64,6 @@ typedef struct subgraph */ void init_ir_subgraph(struct graph* graph, ir_subgraph_t* subgraph, int index); - /*! * @brief Release a subgraph. * diff --git a/source/graph/tensor.c b/source/graph/tensor.c index c5d049edf..5b065a458 100644 --- a/source/graph/tensor.c +++ b/source/graph/tensor.c @@ -38,46 +38,43 @@ #include #include - void init_ir_tensor(ir_tensor_t* ir_tensor, int tensor_index, int data_type) { + ir_tensor->index = tensor_index; + ir_tensor->producer = -1; - ir_tensor->index = tensor_index; - ir_tensor->producer = -1; - - ir_tensor->consumer = ( int16_t* )sys_malloc(sizeof(int16_t) * TE_MAX_CONSUMER_NUM); + ir_tensor->consumer = (int16_t*)sys_malloc(sizeof(int16_t) * TE_MAX_CONSUMER_NUM); for (int i = 0; i < TE_MAX_CONSUMER_NUM; i++) { ir_tensor->consumer[i] = -1; } - ir_tensor->reshaped = 0; + ir_tensor->reshaped = 0; ir_tensor->consumer_num = 0; - ir_tensor->tensor_type = TENSOR_TYPE_VAR; - ir_tensor->data_type = data_type; - ir_tensor->dim_num = 0; - ir_tensor->elem_size = get_tenser_element_size(data_type); - ir_tensor->subgraph_num = 0; - ir_tensor->free_host_mem = 0; - ir_tensor->internal_allocated = 1; - ir_tensor->layout = TENGINE_LAYOUT_NCHW; - ir_tensor->quant_param_num = 0; - ir_tensor->elem_num = 0; + ir_tensor->tensor_type = TENSOR_TYPE_VAR; + ir_tensor->data_type = data_type; + ir_tensor->dim_num = 0; + ir_tensor->elem_size = get_tenser_element_size(data_type); + ir_tensor->subgraph_num = 0; + ir_tensor->free_host_mem = 0; + ir_tensor->internal_allocated = 1; + ir_tensor->layout = TENGINE_LAYOUT_NCHW; + ir_tensor->quant_param_num = 0; + ir_tensor->elem_num = 0; for (int i = 0; i < MAX_SHAPE_DIM_NUM; i++) { ir_tensor->dims[i] = 0; } - ir_tensor->data = NULL; - ir_tensor->name = NULL; - ir_tensor->scale_list = NULL; - ir_tensor->zp_list = NULL; - ir_tensor->dev_mem = NULL; - ir_tensor->subgraph_list = NULL; + ir_tensor->data = NULL; + ir_tensor->name = NULL; + ir_tensor->scale_list = NULL; + ir_tensor->zp_list = NULL; + ir_tensor->dev_mem = NULL; + ir_tensor->subgraph_list = NULL; } - ir_tensor_t* create_ir_tensor(ir_graph_t* ir_graph, const char* tensor_name, int data_type) { ir_tensor_t* ir_tensor = (ir_tensor_t*)sys_malloc(sizeof(ir_tensor_t)); @@ -122,7 +119,6 @@ ir_tensor_t* create_ir_tensor(ir_graph_t* ir_graph, const char* tensor_name, int return ir_tensor; } - void destroy_ir_tensor(ir_graph_t* ir_graph, ir_tensor_t* ir_tensor) { if (ir_tensor->quant_param_num > 1) @@ -166,7 +162,6 @@ void destroy_ir_tensor(ir_graph_t* ir_graph, ir_tensor_t* ir_tensor) sys_free(ir_tensor); } - int set_ir_tensor_shape(ir_tensor_t* tensor, const int dims[], int dim_number) { if (MAX_SHAPE_DIM_NUM + 1 < dim_number) @@ -194,7 +189,6 @@ int set_ir_tensor_shape(ir_tensor_t* tensor, const int dims[], int dim_number) return 0; } - char* create_ir_tensor_name_from_index(int index) { char* name = (char*)sys_malloc(TE_COMMON_ALIGN_SIZE * 2); @@ -208,7 +202,6 @@ char* create_ir_tensor_name_from_index(int index) return name; } - int get_ir_tensor_index_from_name(ir_graph_t* graph, const char* tensor_name) { const char* last_symbol_ptr = strrchr(tensor_name, '_'); @@ -242,7 +235,6 @@ int get_ir_tensor_index_from_name(ir_graph_t* graph, const char* tensor_name) return -1; } - int set_ir_tensor_quantization_parameter(ir_tensor_t* tensor, const float* scale, const int* zero_point, int number) { if (NULL == scale || NULL == zero_point) @@ -284,7 +276,6 @@ int set_ir_tensor_quantization_parameter(ir_tensor_t* tensor, const float* scale return 0; } - int get_ir_tensor_quantization_parameter(ir_tensor_t* tensor, float* scale, int* zero_point, int number) { if (number < tensor->quant_param_num) @@ -306,7 +297,6 @@ int get_ir_tensor_quantization_parameter(ir_tensor_t* tensor, float* scale, int* return tensor->quant_param_num; } - void dump_ir_tensor(ir_graph_t* g, ir_tensor_t* t) { if (NULL != t->name) @@ -355,7 +345,7 @@ int set_ir_tensor_consumer(ir_tensor_t* ir_tensor, const int index) { if (TE_MAX_CONSUMER_NUM <= ir_tensor->consumer_num) { - int16_t* new_consumer = ( int16_t* )sys_realloc(ir_tensor->consumer, sizeof(int16_t) * (ir_tensor->consumer_num + 1)); + int16_t* new_consumer = (int16_t*)sys_realloc(ir_tensor->consumer, sizeof(int16_t) * (ir_tensor->consumer_num + 1)); if (NULL == new_consumer) { return -1; @@ -368,4 +358,4 @@ int set_ir_tensor_consumer(ir_tensor_t* ir_tensor, const int index) ir_tensor->consumer_num++; return 0; -} +} diff --git a/source/graph/tensor.h b/source/graph/tensor.h index c92a7943e..601d23467 100644 --- a/source/graph/tensor.h +++ b/source/graph/tensor.h @@ -36,31 +36,30 @@ extern "C" { struct node; struct graph; - /*! * @struct ir_tensor_t * @brief Abstract tensor intermediate representation */ typedef struct tensor { - uint16_t index; //!< the index of a tensor - int16_t producer; //!< node id, '-1' means no producer - int16_t* consumer; //!< consumer nodes array - - uint8_t reshaped; //!< the tensor's shape has changed - uint8_t consumer_num; //!< count of consumer nodes - uint8_t tensor_type; //!< tensor_type: { const, input, var, dep } - uint8_t data_type; //!< data_type: { int8, uint8, fp32, fp16, int32 } - uint8_t dim_num; //!< count of dimensions - uint8_t elem_size; //!< size of single element - uint8_t subgraph_num; //!< count of all subgraph those will waiting this tensor ready - uint8_t free_host_mem; //!< should free host memory? - uint8_t internal_allocated; //!< how memory is allocated? - uint8_t layout; //!< tensor layout: { TENGINE_LAYOUT_NCHW, TENGINE_LAYOUT_NHWC } - - uint16_t quant_param_num; //!< quantization dimension - uint32_t elem_num; //!< count of total elements - int dims[TE_MAX_SHAPE_DIM_NUM]; //!< shape dimensions + uint16_t index; //!< the index of a tensor + int16_t producer; //!< node id, '-1' means no producer + int16_t* consumer; //!< consumer nodes array + + uint8_t reshaped; //!< the tensor's shape has changed + uint8_t consumer_num; //!< count of consumer nodes + uint8_t tensor_type; //!< tensor_type: { const, input, var, dep } + uint8_t data_type; //!< data_type: { int8, uint8, fp32, fp16, int32 } + uint8_t dim_num; //!< count of dimensions + uint8_t elem_size; //!< size of single element + uint8_t subgraph_num; //!< count of all subgraph those will waiting this tensor ready + uint8_t free_host_mem; //!< should free host memory? + uint8_t internal_allocated; //!< how memory is allocated? + uint8_t layout; //!< tensor layout: { TENGINE_LAYOUT_NCHW, TENGINE_LAYOUT_NHWC } + + uint16_t quant_param_num; //!< quantization dimension + uint32_t elem_num; //!< count of total elements + int dims[TE_MAX_SHAPE_DIM_NUM]; //!< shape dimensions /*! * @union anonymity data pointer @@ -68,15 +67,15 @@ typedef struct tensor */ union { - void* data; - int8_t* i8; - uint8_t* u8; - float* f32; - uint16_t* f16; - int32_t* i32; + void* data; + int8_t* i8; + uint8_t* u8; + float* f32; + uint16_t* f16; + int32_t* i32; }; - char* name; //!< tensor name + char* name; //!< tensor name /*! * @union anonymity quantization scale union @@ -85,7 +84,7 @@ typedef struct tensor union { float* scale_list; - float scale; + float scale; }; /*! @@ -94,15 +93,14 @@ typedef struct tensor */ union { - int zero_point; + int zero_point; int* zp_list; }; struct dev_mem* dev_mem; - uint8_t* subgraph_list; //!< subgraph index list of those subgraph will waiting this tensor ready + uint8_t* subgraph_list; //!< subgraph index list of those subgraph will waiting this tensor ready } ir_tensor_t; - /*! * @brief Create a tensor for a graph. * @@ -114,7 +112,6 @@ typedef struct tensor */ ir_tensor_t* create_ir_tensor(struct graph* graph, const char* tensor_name, int data_type); - /*! * @brief Destroy a tensor. * @@ -125,7 +122,6 @@ ir_tensor_t* create_ir_tensor(struct graph* graph, const char* tensor_name, int */ void destroy_ir_tensor(struct graph* ir_graph, ir_tensor_t* ir_tensor); - /*! * @brief Set shape for a tensor. * @@ -137,7 +133,6 @@ void destroy_ir_tensor(struct graph* ir_graph, ir_tensor_t* ir_tensor); */ int set_ir_tensor_shape(ir_tensor_t* ir_tensor, const int dims[], int dim_number); - /*! * @brief Set tensor name from id, for anonymity ones. * @@ -147,7 +142,6 @@ int set_ir_tensor_shape(ir_tensor_t* ir_tensor, const int dims[], int dim_number */ char* create_ir_tensor_name_from_index(int index); - /*! * @brief Get tensor id from name, for anonymity ones. * @@ -158,7 +152,6 @@ char* create_ir_tensor_name_from_index(int index); */ int get_ir_tensor_index_from_name(struct graph* ir_graph, const char* tensor_name); - /*! * @brief Set tensor quantization parameter. * @@ -171,7 +164,6 @@ int get_ir_tensor_index_from_name(struct graph* ir_graph, const char* tensor_nam */ int set_ir_tensor_quantization_parameter(ir_tensor_t* ir_tensor, const float* scale, const int* zero_point, int number); - /*! * @brief Get tensor quantization parameter. * @@ -184,7 +176,6 @@ int set_ir_tensor_quantization_parameter(ir_tensor_t* ir_tensor, const float* sc */ int get_ir_tensor_quantization_parameter(ir_tensor_t* ir_tensor, float* scale, int* zero_point, int number); - /*! * @brief Dump the tensor. * @@ -201,7 +192,7 @@ void dump_ir_tensor(struct graph* ir_graph, ir_tensor_t* ir_tensor); * * @return statue value, 0 success, other value failure. */ -int set_ir_tensor_consumer(ir_tensor_t* ir_tensor, const int index); +int set_ir_tensor_consumer(ir_tensor_t* ir_tensor, const int index); #ifdef __cplusplus } diff --git a/source/module/module.c b/source/module/module.c index 2a945a285..f3e4398bb 100644 --- a/source/module/module.c +++ b/source/module/module.c @@ -35,12 +35,10 @@ #include #include - -static vector_t* internal_serializer_registry = NULL; //!< registry of model serializer -static vector_t* internal_device_registry = NULL; //!< registry of runnable neural network device -static vector_t* internal_op_method_registry = NULL; //!< registry of operators -static vector_t* internal_op_name_registry = NULL; //!< registry of operators name - +static vector_t* internal_serializer_registry = NULL; //!< registry of model serializer +static vector_t* internal_device_registry = NULL; //!< registry of runnable neural network device +static vector_t* internal_op_method_registry = NULL; //!< registry of operators +static vector_t* internal_op_name_registry = NULL; //!< registry of operators name /*! * @struct ir_op_map_t @@ -48,14 +46,12 @@ static vector_t* internal_op_name_registry = NULL; //!< registry of operato */ typedef struct op_name_entry { - int type; //!< the type of a operator - const char* name; //!< the name of a operator + int type; //!< the type of a operator + const char* name; //!< the name of a operator } ir_op_name_entry_t; - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - static int initialize_serializer_registry(const char* name) { if (NULL == internal_serializer_registry) @@ -71,7 +67,6 @@ static int initialize_serializer_registry(const char* name) return 0; } - int register_serializer(serializer_t* serializer) { initialize_serializer_registry(serializer->get_name(serializer)); @@ -101,7 +96,6 @@ int register_serializer(serializer_t* serializer) return 0; } - serializer_t* find_serializer_via_name(const char* name) { if (NULL == internal_serializer_registry) @@ -131,7 +125,6 @@ serializer_t* find_serializer_via_name(const char* name) return NULL; } - serializer_t* find_serializer_via_index(int index) { int count = get_serializer_count(); @@ -147,7 +140,6 @@ serializer_t* find_serializer_via_index(int index) } } - int get_serializer_count() { if (NULL == internal_serializer_registry) @@ -160,7 +152,6 @@ int get_serializer_count() } } - int unregister_serializer(serializer_t* serializer) { if (NULL == serializer) @@ -194,7 +185,6 @@ int unregister_serializer(serializer_t* serializer) return remove_vector_via_pointer(internal_serializer_registry, &serializer); } - int release_serializer_registry() { while (get_vector_num(internal_serializer_registry) > 0) @@ -209,10 +199,8 @@ int release_serializer_registry() return 0; } - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - static int initialize_device_registry(const char* name) { if (NULL == internal_device_registry) @@ -228,7 +216,6 @@ static int initialize_device_registry(const char* name) return 0; } - ir_device_t* find_device_via_name(const char* name) { if (NULL == internal_device_registry) @@ -258,13 +245,11 @@ ir_device_t* find_device_via_name(const char* name) return NULL; } - struct device* find_default_device() { return find_device_via_name("CPU"); } - ir_device_t* find_device_via_index(int index) { int count = get_device_count(); @@ -280,7 +265,6 @@ ir_device_t* find_device_via_index(int index) } } - int get_device_count() { if (NULL == internal_device_registry) @@ -293,7 +277,6 @@ int get_device_count() } } - int register_device(ir_device_t* device) { initialize_device_registry(device->name); @@ -323,7 +306,6 @@ int register_device(ir_device_t* device) return 0; } - int unregister_device(ir_device_t* device) { if (NULL == find_device_via_name(device->name)) @@ -339,7 +321,6 @@ int unregister_device(ir_device_t* device) return remove_vector_via_pointer(internal_device_registry, &device); } - int release_device_registry() { while (get_vector_num(internal_device_registry) > 0) @@ -354,10 +335,8 @@ int release_device_registry() return 0; } - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - int initialize_op_name_registry(const char* name) { if (NULL == internal_op_name_registry) @@ -374,7 +353,6 @@ int initialize_op_name_registry(const char* name) return 0; } - int register_op_name(int type, const char* name) { initialize_op_name_registry(name); @@ -392,7 +370,6 @@ int register_op_name(int type, const char* name) return push_vector_data(internal_op_name_registry, &op_map); } - int unregister_op_name(int type) { int i; @@ -415,7 +392,6 @@ int unregister_op_name(int type) return 0; } - int release_op_name_registry() { while (get_vector_num(internal_op_name_registry) > 0) @@ -430,7 +406,6 @@ int release_op_name_registry() return 0; } - static int initialize_op_registry(const char* name) { if (NULL == internal_op_method_registry) @@ -446,7 +421,6 @@ static int initialize_op_registry(const char* name) return 0; } - static int register_op_registry(ir_method_t* method) { if (find_op_method(method->type, method->version)) @@ -457,7 +431,6 @@ static int register_op_registry(ir_method_t* method) return push_vector_data(internal_op_method_registry, method); } - int register_op(int type, const char* name, ir_method_t* method) { initialize_op_registry(name); @@ -485,7 +458,6 @@ int register_op(int type, const char* name, ir_method_t* method) return 0; } - ir_method_t* find_op_method(int type, int version) { int op_count = get_vector_num(internal_op_method_registry); @@ -503,7 +475,6 @@ ir_method_t* find_op_method(int type, int version) return NULL; } - ir_method_t* find_op_method_via_index(int index) { int count = get_op_method_count(); @@ -519,7 +490,6 @@ ir_method_t* find_op_method_via_index(int index) } } - const char* find_op_name(int type) { int count = get_vector_num(internal_op_name_registry); @@ -535,7 +505,6 @@ const char* find_op_name(int type) return NULL; } - int get_op_method_count() { if (NULL == internal_op_method_registry) @@ -548,7 +517,6 @@ int get_op_method_count() } } - int unregister_op(int type, int version) { int matched_count = 0; @@ -587,7 +555,6 @@ int unregister_op(int type, int version) return 0; } - int release_op_registry(void) { while (get_vector_num(internal_op_method_registry) > 0) diff --git a/source/module/module.h b/source/module/module.h index 03f5560ef..36a12bf19 100644 --- a/source/module/module.h +++ b/source/module/module.h @@ -29,7 +29,6 @@ struct op; struct method; struct device; - /*! * @brief Register a serializer. * @@ -39,7 +38,6 @@ struct device; */ int register_serializer(struct serializer* serializer); - /*! * @brief Find the serializer via its name. * @@ -49,7 +47,6 @@ int register_serializer(struct serializer* serializer); */ struct serializer* find_serializer_via_name(const char* name); - /*! * @brief Find the serializer via its registered index. * @@ -59,7 +56,6 @@ struct serializer* find_serializer_via_name(const char* name); */ struct serializer* find_serializer_via_index(int index); - /*! * @brief Get count of all registered serializer. * @@ -67,7 +63,6 @@ struct serializer* find_serializer_via_index(int index); */ int get_serializer_count(); - /*! * @brief Unregister a serializer. * @@ -77,7 +72,6 @@ int get_serializer_count(); */ int unregister_serializer(struct serializer* serializer); - /*! * @brief Release all serializer. * @@ -85,7 +79,6 @@ int unregister_serializer(struct serializer* serializer); */ int release_serializer_registry(); - /*! * @brief Register a device. * @@ -95,7 +88,6 @@ int release_serializer_registry(); */ int register_device(struct device* device); - /*! * @brief Find the device via its name. * @@ -105,7 +97,6 @@ int register_device(struct device* device); */ struct device* find_device_via_name(const char* name); - /*! * @brief Find the default device. * @@ -113,7 +104,6 @@ struct device* find_device_via_name(const char* name); */ struct device* find_default_device(); - /*! * @brief Find the device via its registered index. * @@ -123,7 +113,6 @@ struct device* find_default_device(); */ struct device* find_device_via_index(int index); - /*! * @brief Get count of all registered device. * @@ -131,7 +120,6 @@ struct device* find_device_via_index(int index); */ int get_device_count(); - /*! * @brief Register a device. * @@ -141,7 +129,6 @@ int get_device_count(); */ int unregister_device(struct device* device); - /*! * @brief Release all device. * @@ -149,7 +136,6 @@ int unregister_device(struct device* device); */ int release_device_registry(); - /*! * @brief Register an operator method. * @@ -161,7 +147,6 @@ int release_device_registry(); */ int register_op(int type, const char* name, struct method* method); - /*! * @brief Find an operator method. * @@ -172,7 +157,6 @@ int register_op(int type, const char* name, struct method* method); */ struct method* find_op_method(int type, int version); - /*! * @brief Find an operator method via its registered index. * @@ -182,7 +166,6 @@ struct method* find_op_method(int type, int version); */ struct method* find_op_method_via_index(int index); - /*! * @brief Find an operator name. * @@ -192,7 +175,6 @@ struct method* find_op_method_via_index(int index); */ const char* find_op_name(int type); - /*! * @brief Get count of all registered operator method. * @@ -200,7 +182,6 @@ const char* find_op_name(int type); */ int get_op_method_count(); - /*! * @brief Register an operator. * @@ -211,7 +192,6 @@ int get_op_method_count(); */ int unregister_op(int type, int version); - /*! * @brief Release all operator. * diff --git a/source/operator/op.c b/source/operator/op.c index 051339885..fb29d670c 100644 --- a/source/operator/op.c +++ b/source/operator/op.c @@ -29,13 +29,11 @@ #include - void init_op_struct(ir_op_t* op) { memset(op, 0, sizeof(ir_node_t)); } - void init_method_struct(ir_method_t* method) { memset(method, 0, sizeof(ir_method_t)); diff --git a/source/operator/op.h b/source/operator/op.h index 36b55fe15..129fae65f 100644 --- a/source/operator/op.h +++ b/source/operator/op.h @@ -29,10 +29,8 @@ #include "op_name.h" - struct node; - /*! * @enum op_type * @brief Enumeration of supported operators @@ -53,7 +51,7 @@ enum OP_CLIP, OP_COMPARISON, OP_CONCAT, - OP_CONST, + OP_CONST, OP_CONV, OP_CROP, OP_DECONV, @@ -144,37 +142,32 @@ enum OP_BUILTIN_LAST }; - /*! * @struct ir_op_t * @brief Abstract operator intermediate representation */ typedef struct op { - uint16_t type; //!< the type of a operator - uint8_t version; //!< the version of a operator - uint8_t same_shape; //!< the flag of whether the operator will keep shape - uint16_t param_size; //!< size of parameter memory buffer - void* param_mem; //!< parameter memory buffer - int (*infer_shape)(struct node*); //!< infer(or broadcast) the shape from input to output(s) + uint16_t type; //!< the type of a operator + uint8_t version; //!< the version of a operator + uint8_t same_shape; //!< the flag of whether the operator will keep shape + uint16_t param_size; //!< size of parameter memory buffer + void* param_mem; //!< parameter memory buffer + int (*infer_shape)(struct node*); //!< infer(or broadcast) the shape from input to output(s) } ir_op_t; - /*! * @struct ir_op_method_t * @brief Abstract method of operator intermediate representation */ typedef struct method { - - int type; //!< the type of a operator - int version; //!< the version of a operator - int (*init)(ir_op_t* op); + int type; //!< the type of a operator + int version; //!< the version of a operator + int (*init)(ir_op_t* op); void (*release)(ir_op_t* op); } ir_method_t; - void init_op_struct(ir_op_t* op); - void init_method_struct(ir_method_t* method); diff --git a/source/operator/op_name.h b/source/operator/op_name.h index f90c431c6..068045dd2 100644 --- a/source/operator/op_name.h +++ b/source/operator/op_name.h @@ -24,105 +24,105 @@ #pragma once -#define OP_GENERIC_NAME "Generic" -#define OP_ABSVAL_NAME "Absval" -#define OP_ADD_N_NAME "Add_n" -#define OP_ARGMAX_NAME "ArgMax" -#define OP_ARGMIN_NAME "ArgMin" -#define OP_BATCHNORM_NAME "BatchNormalize" -#define OP_BATCHTOSPACEND_NAME "Batchtospacend" -#define OP_BIAS_NAME "Bias" -#define OP_BROADMUL_NAME "BroadMul" -#define OP_CAST_NAME "Cast" -#define OP_CEIL_NAME "Ceil" -#define OP_CLIP_NAME "Clip" -#define OP_COMPARISON_NAME "Comparison" -#define OP_CONCAT_NAME "Concat" -#define OP_CONV_NAME "Convolution" -#define OP_CONST_NAME "Const" -#define OP_CROP_NAME "Crop" -#define OP_DECONV_NAME "Deconvolution" -#define OP_DEPTHTOSPACE_NAME "Depthtospace" -#define OP_DETECTION_OUTPUT_NAME "DetectionOutput" -#define OP_DETECTION_POSTPROCESS_NAME "DetectionPostProcess" -#define OP_DROPOUT_NAME "Dropout" -#define OP_ELTWISE_NAME "Eltwise" -#define OP_ELU_NAME "Elu" -#define OP_EMBEDDING_NAME "Embedding" -#define OP_EXPANDDIMS_NAME "Expanddims" -#define OP_FC_NAME "FullyConnected" -#define OP_FLATTEN_NAME "Flatten" -#define OP_GATHER_NAME "Gather" -#define OP_GEMM_NAME "Gemm" -#define OP_GRU_NAME "Gru" -#define OP_HARDSIGMOID_NAME "HardSigmoid" -#define OP_HARDSWISH_NAME "Hardswish" -#define OP_INPUT_NAME "InputOp" -#define OP_INSTANCENORM_NAME "InstanceNorm" -#define OP_INTERP_NAME "Interp" -#define OP_LOGICAL_NAME "Logical" -#define OP_LOGISTIC_NAME "Logistic" -#define OP_LRN_NAME "Lrn" -#define OP_LSTM_NAME "Lstm" -#define OP_MATMUL_NAME "Matmul" -#define OP_MAXIMUM_NAME "Maximum" -#define OP_MEAN_NAME "Mean" -#define OP_MINIMUM_NAME "Minimum" -#define OP_MVN_NAME "Mvn" -#define OP_NOOP_NAME "Noop" -#define OP_NORMALIZE_NAME "Normalize" -#define OP_PAD_NAME "Pad" -#define OP_PERMUTE_NAME "Permute" -#define OP_POOL_NAME "Pooling" -#define OP_PRELU_NAME "PReLU" -#define OP_PRIORBOX_NAME "PriorBox" -#define OP_PSROIPOOLING_NAME "Psroipooling" -#define OP_REDUCEL2_NAME "ReduceL2" -#define OP_REDUCTION_NAME "Reduction" -#define OP_REGION_NAME "Region" -#define OP_RELU_NAME "ReLU" -#define OP_RELU6_NAME "ReLU6" -#define OP_REORG_NAME "Reorg" -#define OP_RESHAPE_NAME "Reshape" -#define OP_RESIZE_NAME "Resize" -#define OP_REVERSE_NAME "Reverse" -#define OP_RNN_NAME "RNN" -#define OP_ROIALIGN_NAME "Roialign" -#define OP_ROIPOOLING_NAME "RoiPooling" -#define OP_ROUND_NAME "Round" -#define OP_RPN_NAME "Rpn" -#define OP_SCALE_NAME "Scale" -#define OP_SELU_NAME "Selu" -#define OP_SHUFFLECHANNEL_NAME "ShuffleChannel" -#define OP_SIGMOID_NAME "Sigmoid" -#define OP_SLICE_NAME "Slice" -#define OP_SOFTMAX_NAME "Softmax" -#define OP_SPACETOBATCHND_NAME "Spacetobatchnd" -#define OP_SPACETODEPTH_NAME "Spacetodepth" -#define OP_SPARSETODENSE_NAME "SparseToDense" -#define OP_SPLIT_NAME "Split" -#define OP_SQUAREDDIFFERENCE_NAME "SquaredDifference" -#define OP_SQUEEZE_NAME "Squeeze" -#define OP_STRIDEDSLICE_NAME "StridedSlice" -#define OP_SWAP_AXIS_NAME "SwapAxis" -#define OP_TANH_NAME "Tanh" -#define OP_THRESHOLD_NAME "Threshold" -#define OP_TOPKV2_NAME "Topkv2" -#define OP_TRANSPOSE_NAME "Transpose" -#define OP_UNARY_NAME "Unary" -#define OP_UNSQUEEZE_NAME "Unsqueeze" -#define OP_UPSAMPLE_NAME "Upsample" -#define OP_ZEROSLIKE_NAME "ZerosLike" -#define OP_MISH_NAME "Mish" -#define OP_LOGSOFTMAX_NAME "LogSoftmax" -#define OP_RELU1_NAME "ReLU1" -#define OP_L2NORMALIZATION_NAME "L2Normalization" -#define OP_L2POOL_NAME "L2Pool" -#define OP_TILE_NAME "Tile" -#define OP_SHAPE_NAME "Shape" -#define OP_SCATTER_NAME "Scatter" -#define OP_WHERE_NAME "Where" -#define OP_SOFTPLUS_NAME "Softplus" -#define OP_RECIPROCAL_NAME "Reciprocal" -#define OP_SPATIALTRANSFORMER_NAME "SpatialTransformer" -#define OP_EXPAND_NAME "Expand" +#define OP_GENERIC_NAME "Generic" +#define OP_ABSVAL_NAME "Absval" +#define OP_ADD_N_NAME "Add_n" +#define OP_ARGMAX_NAME "ArgMax" +#define OP_ARGMIN_NAME "ArgMin" +#define OP_BATCHNORM_NAME "BatchNormalize" +#define OP_BATCHTOSPACEND_NAME "Batchtospacend" +#define OP_BIAS_NAME "Bias" +#define OP_BROADMUL_NAME "BroadMul" +#define OP_CAST_NAME "Cast" +#define OP_CEIL_NAME "Ceil" +#define OP_CLIP_NAME "Clip" +#define OP_COMPARISON_NAME "Comparison" +#define OP_CONCAT_NAME "Concat" +#define OP_CONV_NAME "Convolution" +#define OP_CONST_NAME "Const" +#define OP_CROP_NAME "Crop" +#define OP_DECONV_NAME "Deconvolution" +#define OP_DEPTHTOSPACE_NAME "Depthtospace" +#define OP_DETECTION_OUTPUT_NAME "DetectionOutput" +#define OP_DETECTION_POSTPROCESS_NAME "DetectionPostProcess" +#define OP_DROPOUT_NAME "Dropout" +#define OP_ELTWISE_NAME "Eltwise" +#define OP_ELU_NAME "Elu" +#define OP_EMBEDDING_NAME "Embedding" +#define OP_EXPANDDIMS_NAME "Expanddims" +#define OP_FC_NAME "FullyConnected" +#define OP_FLATTEN_NAME "Flatten" +#define OP_GATHER_NAME "Gather" +#define OP_GEMM_NAME "Gemm" +#define OP_GRU_NAME "Gru" +#define OP_HARDSIGMOID_NAME "HardSigmoid" +#define OP_HARDSWISH_NAME "Hardswish" +#define OP_INPUT_NAME "InputOp" +#define OP_INSTANCENORM_NAME "InstanceNorm" +#define OP_INTERP_NAME "Interp" +#define OP_LOGICAL_NAME "Logical" +#define OP_LOGISTIC_NAME "Logistic" +#define OP_LRN_NAME "Lrn" +#define OP_LSTM_NAME "Lstm" +#define OP_MATMUL_NAME "Matmul" +#define OP_MAXIMUM_NAME "Maximum" +#define OP_MEAN_NAME "Mean" +#define OP_MINIMUM_NAME "Minimum" +#define OP_MVN_NAME "Mvn" +#define OP_NOOP_NAME "Noop" +#define OP_NORMALIZE_NAME "Normalize" +#define OP_PAD_NAME "Pad" +#define OP_PERMUTE_NAME "Permute" +#define OP_POOL_NAME "Pooling" +#define OP_PRELU_NAME "PReLU" +#define OP_PRIORBOX_NAME "PriorBox" +#define OP_PSROIPOOLING_NAME "Psroipooling" +#define OP_REDUCEL2_NAME "ReduceL2" +#define OP_REDUCTION_NAME "Reduction" +#define OP_REGION_NAME "Region" +#define OP_RELU_NAME "ReLU" +#define OP_RELU6_NAME "ReLU6" +#define OP_REORG_NAME "Reorg" +#define OP_RESHAPE_NAME "Reshape" +#define OP_RESIZE_NAME "Resize" +#define OP_REVERSE_NAME "Reverse" +#define OP_RNN_NAME "RNN" +#define OP_ROIALIGN_NAME "Roialign" +#define OP_ROIPOOLING_NAME "RoiPooling" +#define OP_ROUND_NAME "Round" +#define OP_RPN_NAME "Rpn" +#define OP_SCALE_NAME "Scale" +#define OP_SELU_NAME "Selu" +#define OP_SHUFFLECHANNEL_NAME "ShuffleChannel" +#define OP_SIGMOID_NAME "Sigmoid" +#define OP_SLICE_NAME "Slice" +#define OP_SOFTMAX_NAME "Softmax" +#define OP_SPACETOBATCHND_NAME "Spacetobatchnd" +#define OP_SPACETODEPTH_NAME "Spacetodepth" +#define OP_SPARSETODENSE_NAME "SparseToDense" +#define OP_SPLIT_NAME "Split" +#define OP_SQUAREDDIFFERENCE_NAME "SquaredDifference" +#define OP_SQUEEZE_NAME "Squeeze" +#define OP_STRIDEDSLICE_NAME "StridedSlice" +#define OP_SWAP_AXIS_NAME "SwapAxis" +#define OP_TANH_NAME "Tanh" +#define OP_THRESHOLD_NAME "Threshold" +#define OP_TOPKV2_NAME "Topkv2" +#define OP_TRANSPOSE_NAME "Transpose" +#define OP_UNARY_NAME "Unary" +#define OP_UNSQUEEZE_NAME "Unsqueeze" +#define OP_UPSAMPLE_NAME "Upsample" +#define OP_ZEROSLIKE_NAME "ZerosLike" +#define OP_MISH_NAME "Mish" +#define OP_LOGSOFTMAX_NAME "LogSoftmax" +#define OP_RELU1_NAME "ReLU1" +#define OP_L2NORMALIZATION_NAME "L2Normalization" +#define OP_L2POOL_NAME "L2Pool" +#define OP_TILE_NAME "Tile" +#define OP_SHAPE_NAME "Shape" +#define OP_SCATTER_NAME "Scatter" +#define OP_WHERE_NAME "Where" +#define OP_SOFTPLUS_NAME "Softplus" +#define OP_RECIPROCAL_NAME "Reciprocal" +#define OP_SPATIALTRANSFORMER_NAME "SpatialTransformer" +#define OP_EXPAND_NAME "Expand" diff --git a/source/operator/prototype/absval.c b/source/operator/prototype/absval.c index bd3367f01..2565ecfbf 100644 --- a/source/operator/prototype/absval.c +++ b/source/operator/prototype/absval.c @@ -28,7 +28,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -40,7 +39,6 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { op->param_mem = NULL; @@ -51,13 +49,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_absval_op() { struct method m; @@ -66,11 +62,9 @@ int register_absval_op() m.init = init_op; m.release = release_op; - return register_op(OP_ABSVAL, OP_ABSVAL_NAME, &m); } - int unregister_absval_op() { return unregister_op(OP_ABSVAL, 1); diff --git a/source/operator/prototype/add_n.c b/source/operator/prototype/add_n.c index 6b6d48ad5..a33ae1f28 100644 --- a/source/operator/prototype/add_n.c +++ b/source/operator/prototype/add_n.c @@ -29,7 +29,6 @@ #include - static int init_op(struct op* op) { op->same_shape = 1; @@ -37,7 +36,6 @@ static int init_op(struct op* op) return 0; } - int register_add_n_op() { struct method m; @@ -45,11 +43,9 @@ int register_add_n_op() m.init = init_op; m.release = NULL; - return register_op(OP_ADD_N, OP_ADD_N_NAME, &m); } - int unregister_add_n_op() { return unregister_op(OP_ADD_N, 1); diff --git a/source/operator/prototype/argmax.c b/source/operator/prototype/argmax.c index e5eabc44d..25733f6f9 100644 --- a/source/operator/prototype/argmax.c +++ b/source/operator/prototype/argmax.c @@ -30,14 +30,13 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct argmax_param* argmax_param = ( struct argmax_param* )(node->op.param_mem); + struct argmax_param* argmax_param = (struct argmax_param*)(node->op.param_mem); int axis = argmax_param->axis; @@ -55,7 +54,7 @@ static int infer_shape(struct node* node) input->dims[0] = tmp; input->dims[3] = 1; - if (input->dims[0] != 1) // input 3 keepdimss + if (input->dims[0] != 1) // input 3 keepdimss { for (int i = 0, j = 0; i < 3; i++) { @@ -63,7 +62,7 @@ static int infer_shape(struct node* node) outdims[j++] = input->dims[i]; } } - else // input 2 keepdimss + else // input 2 keepdimss { for (int i = 0, j = 0; i < 4; i++) outdims[j++] = input->dims[i]; @@ -85,7 +84,6 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { struct argmax_param* argmax_param = (struct argmax_param*)sys_malloc(sizeof(struct argmax_param)); @@ -107,13 +105,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_argmax_op() { struct method m; @@ -124,7 +120,6 @@ int register_argmax_op() return register_op(OP_ARGMAX, OP_ARGMAX_NAME, &m); } - int unregister_argmax_op() { return unregister_op(OP_ARGMAX, 1); diff --git a/source/operator/prototype/argmin.c b/source/operator/prototype/argmin.c index 670415fe0..bf8844c93 100644 --- a/source/operator/prototype/argmin.c +++ b/source/operator/prototype/argmin.c @@ -31,14 +31,13 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct argmin_param* argmin_param = ( struct argmin_param* )(node->op.param_mem); + struct argmin_param* argmin_param = (struct argmin_param*)(node->op.param_mem); int axis = argmin_param->axis; @@ -56,7 +55,7 @@ static int infer_shape(struct node* node) input->dims[0] = tmp; input->dims[3] = 1; - if (input->dims[0] != 1) // input 3 keepdimss + if (input->dims[0] != 1) // input 3 keepdimss { for (int i = 0, j = 0; i < 3; i++) { @@ -64,7 +63,7 @@ static int infer_shape(struct node* node) outdims[j++] = input->dims[i]; } } - else // input 2 keepdimss + else // input 2 keepdimss { for (int i = 0, j = 0; i < 4; i++) outdims[j++] = input->dims[i]; @@ -86,10 +85,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct argmin_param* argmin_param = ( struct argmin_param* )sys_malloc(sizeof(struct argmin_param)); + struct argmin_param* argmin_param = (struct argmin_param*)sys_malloc(sizeof(struct argmin_param)); if (argmin_param == NULL) { @@ -108,13 +106,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_argmin_op() { struct method m; @@ -125,7 +121,6 @@ int register_argmin_op() return register_op(OP_ARGMIN, OP_ARGMIN_NAME, &m); } - int unregister_argmin_op() { return unregister_op(OP_ARGMIN, 1); diff --git a/source/operator/prototype/batchnorm.c b/source/operator/prototype/batchnorm.c index f905fed0a..888844873 100644 --- a/source/operator/prototype/batchnorm.c +++ b/source/operator/prototype/batchnorm.c @@ -30,7 +30,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -40,10 +39,9 @@ static int infer_shape(struct node* node) return set_ir_tensor_shape(output, input->dims, input->dim_num); } - static int init_op(struct op* op) { - batchnorm_param_t* batchnorm_param = ( batchnorm_param_t* )sys_malloc(sizeof(batchnorm_param_t)); + batchnorm_param_t* batchnorm_param = (batchnorm_param_t*)sys_malloc(sizeof(batchnorm_param_t)); if (batchnorm_param == NULL) { @@ -62,13 +60,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_batchnorm_op() { struct method m; @@ -77,7 +73,6 @@ int register_batchnorm_op() m.init = init_op; m.release = release_op; - return register_op(OP_BATCHNORM, OP_BATCHNORM_NAME, &m); } diff --git a/source/operator/prototype/batchtospacend.c b/source/operator/prototype/batchtospacend.c index bced4cb27..e307de16d 100644 --- a/source/operator/prototype/batchtospacend.c +++ b/source/operator/prototype/batchtospacend.c @@ -30,22 +30,19 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct batchtospacend_param* batchtospacend_param = ( struct batchtospacend_param* )(node->op.param_mem); + struct batchtospacend_param* batchtospacend_param = (struct batchtospacend_param*)(node->op.param_mem); int out_dim[4]; out_dim[0] = input->dims[0] / (batchtospacend_param->dilation_x * batchtospacend_param->dilation_y); - out_dim[1] = input->dims[1] * batchtospacend_param->dilation_y - batchtospacend_param->crop_top - - batchtospacend_param->crop_bottom; - out_dim[2] = input->dims[2] * batchtospacend_param->dilation_x - batchtospacend_param->crop_left - - batchtospacend_param->crop_right; + out_dim[1] = input->dims[1] * batchtospacend_param->dilation_y - batchtospacend_param->crop_top - batchtospacend_param->crop_bottom; + out_dim[2] = input->dims[2] * batchtospacend_param->dilation_x - batchtospacend_param->crop_left - batchtospacend_param->crop_right; out_dim[3] = input->dims[3]; set_ir_tensor_shape(output, out_dim, 4); @@ -53,11 +50,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct batchtospacend_param* batchtospacend_param = - ( struct batchtospacend_param* )sys_malloc(sizeof(struct batchtospacend_param)); + struct batchtospacend_param* batchtospacend_param = (struct batchtospacend_param*)sys_malloc(sizeof(struct batchtospacend_param)); if (batchtospacend_param == NULL) { @@ -80,13 +75,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_batchtospacend_op() { struct method m; @@ -95,11 +88,9 @@ int register_batchtospacend_op() m.init = init_op; m.release = release_op; - return register_op(OP_BATCHTOSPACEND, OP_BATCHTOSPACEND_NAME, &m); } - int unregister_batchtospacend_op() { return unregister_op(OP_BATCHTOSPACEND, 1); diff --git a/source/operator/prototype/bias.c b/source/operator/prototype/bias.c index f2b9ed6e2..f2b0b01b9 100644 --- a/source/operator/prototype/bias.c +++ b/source/operator/prototype/bias.c @@ -27,7 +27,6 @@ #include "graph/graph.h" #include "module/module.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* ir_graph = node->graph; @@ -39,7 +38,6 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { op->same_shape = 0; @@ -48,9 +46,9 @@ static int init_op(ir_op_t* op) return 0; } - -static void release_op(ir_op_t* op) {} - +static void release_op(ir_op_t* op) +{ +} int register_bias_op() { @@ -63,7 +61,6 @@ int register_bias_op() return register_op(OP_BIAS, OP_BIAS_NAME, &m); } - int unregister_bias_op() { return unregister_op(OP_BIAS, 1); diff --git a/source/operator/prototype/broadmul.c b/source/operator/prototype/broadmul.c index 26012172e..42272e19b 100644 --- a/source/operator/prototype/broadmul.c +++ b/source/operator/prototype/broadmul.c @@ -28,7 +28,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; @@ -40,7 +39,6 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { /*set the param default value */ @@ -52,9 +50,9 @@ static int init_op(struct op* op) return 0; } - -static void release_op(struct op* op) {} - +static void release_op(struct op* op) +{ +} int register_broadmul_op() { @@ -67,7 +65,6 @@ int register_broadmul_op() return register_op(OP_BROADMUL, OP_BROADMUL_NAME, &m); } - int unregister_broadmul_op() { return unregister_op(OP_BROADMUL, 1); diff --git a/source/operator/prototype/cast.c b/source/operator/prototype/cast.c index 87b7311f4..87b480440 100644 --- a/source/operator/prototype/cast.c +++ b/source/operator/prototype/cast.c @@ -29,7 +29,6 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int init_op(ir_op_t* op) { struct cast_param* cast_param = (struct cast_param*)sys_malloc(sizeof(struct cast_param)); @@ -50,13 +49,11 @@ static int init_op(ir_op_t* op) return 0; } - static void release_op(ir_op_t* op) { sys_free(op->param_mem); } - int register_cast_op() { ir_method_t m; @@ -68,7 +65,6 @@ int register_cast_op() return register_op(OP_CAST, OP_CAST_NAME, &m); } - int unregister_cast_op() { return unregister_op(OP_CAST, 1); diff --git a/source/operator/prototype/ceil.c b/source/operator/prototype/ceil.c index 2a894afd9..f28c58416 100644 --- a/source/operator/prototype/ceil.c +++ b/source/operator/prototype/ceil.c @@ -27,7 +27,6 @@ #include "graph/graph.h" #include "module/module.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -39,7 +38,6 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { op->same_shape = 0; @@ -48,8 +46,9 @@ static int init_op(struct op* op) return 0; } -static void release_op(struct op* op) {} - +static void release_op(struct op* op) +{ +} int register_ceil_op() { @@ -62,7 +61,6 @@ int register_ceil_op() return register_op(OP_CEIL, OP_CEIL_NAME, &m); } - int unregister_ceil_op() { return unregister_op(OP_CEIL, 1); diff --git a/source/operator/prototype/clip.c b/source/operator/prototype/clip.c index cbcbc94a7..663f0ff7e 100644 --- a/source/operator/prototype/clip.c +++ b/source/operator/prototype/clip.c @@ -32,7 +32,6 @@ #include "float.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -44,9 +43,9 @@ static int infer_shape(struct node* node) struct tensor* clip_min = get_ir_graph_tensor(ir_graph, node->input_tensors[1]); struct tensor* clip_max = get_ir_graph_tensor(ir_graph, node->input_tensors[2]); - struct clip_param* clip_param = ( struct clip_param* )node->op.param_mem; - float* min = (float *)clip_min->data; - float* max = (float *)clip_max->data; + struct clip_param* clip_param = (struct clip_param*)node->op.param_mem; + float* min = (float*)clip_min->data; + float* max = (float*)clip_max->data; clip_param->min = min[0]; clip_param->max = max[0]; } @@ -58,7 +57,7 @@ static int infer_shape(struct node* node) static int init_op(struct op* op) { - struct clip_param* clip_param = ( struct clip_param* )sys_malloc(sizeof(struct clip_param)); + struct clip_param* clip_param = (struct clip_param*)sys_malloc(sizeof(struct clip_param)); if (clip_param == NULL) { @@ -77,13 +76,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_clip_op() { struct method m; @@ -95,7 +92,6 @@ int register_clip_op() return register_op(OP_CLIP, OP_CLIP_NAME, &m); } - int unregister_clip_op() { return unregister_op(OP_CLIP, 1); diff --git a/source/operator/prototype/comparison.c b/source/operator/prototype/comparison.c index 4fc33ab5b..8468c7e3d 100644 --- a/source/operator/prototype/comparison.c +++ b/source/operator/prototype/comparison.c @@ -33,7 +33,6 @@ #include - #define CALC_TENSOR_SHAPE_SIZE(outval, IR_TENSOR) \ { \ outval = 1; \ @@ -43,7 +42,6 @@ } \ } - static int infer_shape(struct node* node) { struct graph* graph = node->graph; @@ -75,10 +73,9 @@ static int infer_shape(struct node* node) } } - static int init_op(struct op* op) { - struct comparison_param* param = ( struct comparison_param* )sys_malloc(sizeof(struct comparison_param)); + struct comparison_param* param = (struct comparison_param*)sys_malloc(sizeof(struct comparison_param)); if (param == NULL) { @@ -95,13 +92,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_comparison_op() { struct method m; @@ -110,11 +105,9 @@ int register_comparison_op() m.init = init_op; m.release = release_op; - return register_op(OP_COMPARISON, OP_COMPARISON_NAME, &m); } - int unregister_comparison_op() { return unregister_op(OP_COMPARISON, 1); diff --git a/source/operator/prototype/concat.c b/source/operator/prototype/concat.c index 7d8c802b8..478cd797d 100644 --- a/source/operator/prototype/concat.c +++ b/source/operator/prototype/concat.c @@ -31,13 +31,12 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* graph = node->graph; ir_tensor_t* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct concat_param* concat_param = ( struct concat_param* )(node->op.param_mem); + struct concat_param* concat_param = (struct concat_param*)(node->op.param_mem); int concat_shape = 0; int axis = concat_param->axis; @@ -104,10 +103,9 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { - struct concat_param* concat_param = ( struct concat_param* )sys_malloc(sizeof(struct concat_param)); + struct concat_param* concat_param = (struct concat_param*)sys_malloc(sizeof(struct concat_param)); if (concat_param == NULL) { @@ -125,13 +123,11 @@ static int init_op(ir_op_t* op) return 0; } - static void release_op(ir_op_t* op) { sys_free(op->param_mem); } - int register_concat_op() { ir_method_t m; @@ -143,7 +139,6 @@ int register_concat_op() return register_op(OP_CONCAT, OP_CONCAT_NAME, &m); } - int unregister_concat_op() { return unregister_op(OP_CONCAT, 1); diff --git a/source/operator/prototype/const.c b/source/operator/prototype/const.c index e4d5c8bd7..56d4fd203 100644 --- a/source/operator/prototype/const.c +++ b/source/operator/prototype/const.c @@ -27,15 +27,14 @@ #include "graph/graph.h" #include "module/module.h" - static int init_op(ir_op_t* op) { return 0; } - -static void release_op(ir_op_t* op) {} - +static void release_op(ir_op_t* op) +{ +} int register_const_op() { @@ -45,10 +44,9 @@ int register_const_op() m.init = init_op; m.release = release_op; - return register_op(OP_CONST, OP_CONST_NAME , &m); + return register_op(OP_CONST, OP_CONST_NAME, &m); } - int unregister_const_op() { return unregister_op(OP_CONST, 1); diff --git a/source/operator/prototype/convolution.c b/source/operator/prototype/convolution.c index 5bada8581..9ae31d787 100644 --- a/source/operator/prototype/convolution.c +++ b/source/operator/prototype/convolution.c @@ -32,14 +32,13 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* graph = node->graph; ir_tensor_t* input = get_ir_graph_tensor(graph, node->input_tensors[0]); ir_tensor_t* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct conv_param* conv_param = ( struct conv_param* )(node->op.param_mem); + struct conv_param* conv_param = (struct conv_param*)(node->op.param_mem); int n = input->dims[0]; int h, w; @@ -97,8 +96,7 @@ static int infer_shape(ir_node_t* node) } else { - out_h = (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) / - conv_param->stride_h + 1; + out_h = (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) / conv_param->stride_h + 1; } if (conv_param->pad_w0 < 0) @@ -122,8 +120,7 @@ static int infer_shape(ir_node_t* node) } else { - out_w = (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) / - conv_param->stride_w + 1; + out_w = (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) / conv_param->stride_w + 1; } int dims[4]; @@ -133,7 +130,7 @@ static int infer_shape(ir_node_t* node) dims[2] = out_h; dims[3] = out_w; - for (int i=0; i<4; i++) + for (int i = 0; i < 4; i++) { if (dims[i] == 0) { @@ -146,10 +143,9 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { - struct conv_param* conv_param = ( struct conv_param* )sys_malloc(sizeof(struct conv_param)); + struct conv_param* conv_param = (struct conv_param*)sys_malloc(sizeof(struct conv_param)); if (conv_param == NULL) { @@ -180,13 +176,11 @@ static int init_op(ir_op_t* op) return 0; } - static void release_op(ir_op_t* op) { sys_free(op->param_mem); } - int register_convolution_op() { ir_method_t m; @@ -198,7 +192,6 @@ int register_convolution_op() return register_op(OP_CONV, OP_CONV_NAME, &m); } - int unregister_convolution_op() { return unregister_op(OP_CONV, 1); diff --git a/source/operator/prototype/convolution_param.h b/source/operator/prototype/convolution_param.h index 22fb7e34c..602e6d135 100644 --- a/source/operator/prototype/convolution_param.h +++ b/source/operator/prototype/convolution_param.h @@ -46,22 +46,22 @@ struct conv_param struct conv_priv_info { - void* interleave_buffer; // kernel transform buffer - void* interleave_buffer_pack4; // kernel pack4 - void* im2col_buffer; // input data transform buffer - void* im2col_buffer_pack4; // input data transform buffer pack4 + void* interleave_buffer; // kernel transform buffer + void* interleave_buffer_pack4; // kernel pack4 + void* im2col_buffer; // input data transform buffer + void* im2col_buffer_pack4; // input data transform buffer pack4 void* input_pad; void* dot_block; void* transform_input; void* output_bordered; - int im2col_buffer_size; // kernel transform buffer size - int im2col_buffer_pack4_size; // kernel transform buffer size - int interleave_buffer_size; // input data transform buffer size + int im2col_buffer_size; // kernel transform buffer size + int im2col_buffer_pack4_size; // kernel transform buffer size + int interleave_buffer_size; // input data transform buffer size int interleave_buffer_pack4_size; - int external_im2col_mem; // flag - int external_im2col_pack4_mem; // flag - int external_interleave_mem; // flag - int external_interleave_pack4_mem; // flag + int external_im2col_mem; // flag + int external_im2col_pack4_mem; // flag + int external_interleave_mem; // flag + int external_interleave_pack4_mem; // flag int cpu_type; int winograd; int wino_off; diff --git a/source/operator/prototype/crop.c b/source/operator/prototype/crop.c index 4cab29e09..0a1fb106f 100644 --- a/source/operator/prototype/crop.c +++ b/source/operator/prototype/crop.c @@ -31,13 +31,12 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[1]); // Don't try to modify ! struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct crop_param* crop_param = ( struct crop_param* )(node->op.param_mem); + struct crop_param* crop_param = (struct crop_param*)(node->op.param_mem); int input_h = input->dims[2]; int input_w = input->dims[3]; @@ -78,10 +77,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct crop_param* crop_param = ( struct crop_param* )sys_malloc(sizeof(struct crop_param)); + struct crop_param* crop_param = (struct crop_param*)sys_malloc(sizeof(struct crop_param)); if (crop_param == NULL) { @@ -107,13 +105,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_crop_op() { struct method m; @@ -122,11 +118,9 @@ int register_crop_op() m.init = init_op; m.release = release_op; - return register_op(OP_CROP, OP_CROP_NAME, &m); } - int unregister_crop_op() { return unregister_op(OP_CROP, 1); diff --git a/source/operator/prototype/deconvolution.c b/source/operator/prototype/deconvolution.c index 3257a1e74..a030ee506 100644 --- a/source/operator/prototype/deconvolution.c +++ b/source/operator/prototype/deconvolution.c @@ -32,14 +32,13 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct deconv_param* deconv_param = ( struct deconv_param* )(node->op.param_mem); + struct deconv_param* deconv_param = (struct deconv_param*)(node->op.param_mem); int n = input->dims[0]; int h, w; @@ -99,7 +98,7 @@ static int infer_shape(struct node* node) static int init_op(struct op* op) { - struct deconv_param* deconv_param = ( struct deconv_param* )sys_malloc(sizeof(struct deconv_param)); + struct deconv_param* deconv_param = (struct deconv_param*)sys_malloc(sizeof(struct deconv_param)); if (deconv_param == NULL) { @@ -144,7 +143,6 @@ int register_deconvolution_op() m.init = init_op; m.release = release_op; - return register_op(OP_DECONV, OP_DECONV_NAME, &m); } diff --git a/source/operator/prototype/depthtospace.c b/source/operator/prototype/depthtospace.c index 424e42236..3af00fdd1 100644 --- a/source/operator/prototype/depthtospace.c +++ b/source/operator/prototype/depthtospace.c @@ -32,34 +32,31 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct depthtospace_param* depthtospace_param = ( struct depthtospace_param* )(node->op.param_mem); + struct depthtospace_param* depthtospace_param = (struct depthtospace_param*)(node->op.param_mem); /* todo reshape */ int dims[4]; int block_size = depthtospace_param->block_size; - dims[0] = input->dims[0]; // batch - dims[1] = input->dims[1] / (block_size * block_size); // channel - dims[2] = input->dims[2] * block_size; // height - dims[3] = input->dims[3] * block_size; // width + dims[0] = input->dims[0]; // batch + dims[1] = input->dims[1] / (block_size * block_size); // channel + dims[2] = input->dims[2] * block_size; // height + dims[3] = input->dims[3] * block_size; // width set_ir_tensor_shape(output, dims, 4); return 0; } - static int init_op(struct op* op) { - struct depthtospace_param* depthtospace_param = - ( struct depthtospace_param* )sys_malloc(sizeof(struct depthtospace_param)); + struct depthtospace_param* depthtospace_param = (struct depthtospace_param*)sys_malloc(sizeof(struct depthtospace_param)); if (depthtospace_param == NULL) { @@ -77,13 +74,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_depthtospace_op() { struct method m; @@ -92,11 +87,9 @@ int register_depthtospace_op() m.init = init_op; m.release = release_op; - return register_op(OP_DEPTHTOSPACE, OP_DEPTHTOSPACE_NAME, &m); } - int unregister_depthtospace_op() { return unregister_op(OP_DEPTHTOSPACE, 1); diff --git a/source/operator/prototype/detection_output.c b/source/operator/prototype/detection_output.c index 05a02a5c2..cc49e3028 100644 --- a/source/operator/prototype/detection_output.c +++ b/source/operator/prototype/detection_output.c @@ -31,13 +31,12 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); - struct detection_output_param* param = ( struct detection_output_param* )node->op.param_mem; + struct detection_output_param* param = (struct detection_output_param*)node->op.param_mem; int dims[TE_MAX_SHAPE_DIM_NUM] = {0}; @@ -52,11 +51,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct detection_output_param* detection_output_param = - ( struct detection_output_param* )sys_malloc(sizeof(struct detection_output_param)); + struct detection_output_param* detection_output_param = (struct detection_output_param*)sys_malloc(sizeof(struct detection_output_param)); if (detection_output_param == NULL) { @@ -77,13 +74,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_detection_output_op() { struct method m; @@ -92,11 +87,9 @@ int register_detection_output_op() m.init = init_op; m.release = release_op; - return register_op(OP_DETECTION_OUTPUT, OP_DETECTION_OUTPUT_NAME, &m); } - int unregister_detection_output_op() { return unregister_op(OP_DETECTION_OUTPUT, 1); diff --git a/source/operator/prototype/detection_postprocess.c b/source/operator/prototype/detection_postprocess.c index a0aed51fb..35c29cf56 100644 --- a/source/operator/prototype/detection_postprocess.c +++ b/source/operator/prototype/detection_postprocess.c @@ -32,7 +32,6 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -44,8 +43,7 @@ static int infer_shape(struct node* node) struct tensor* output2 = get_ir_graph_tensor(ir_graph, node->output_tensors[2]); struct tensor* output3 = get_ir_graph_tensor(ir_graph, node->output_tensors[3]); - struct detection_postprocess_param* detection_postprocess_param = - ( struct detection_postprocess_param* )(node->op.param_mem); + struct detection_postprocess_param* detection_postprocess_param = (struct detection_postprocess_param*)(node->op.param_mem); int max_detections = detection_postprocess_param->max_detections; int max_classes_per_detection = detection_postprocess_param->max_classes_per_detection; int num_classes = detection_postprocess_param->num_classes; @@ -54,8 +52,7 @@ static int infer_shape(struct node* node) int* in_dim2 = &input1->dims[TE_MAX_SHAPE_DIM_NUM]; // Only support: batch_size == 1 && num_coord == 4 - if (input0->dims[0] != 1 || input0->dims[1] != 4 || input1->dims[0] != 1 || input1->dims[2] != input0->dims[2] || - input1->dims[1] != num_classes + 1) + if (input0->dims[0] != 1 || input0->dims[1] != 4 || input1->dims[0] != 1 || input1->dims[2] != input0->dims[2] || input1->dims[1] != num_classes + 1) { TLOG_ERR("Not Support.\n"); return -1; @@ -73,11 +70,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct detection_postprocess_param* detection_postprocess_param = - ( struct detection_postprocess_param* )sys_malloc(sizeof(struct detection_postprocess_param)); + struct detection_postprocess_param* detection_postprocess_param = (struct detection_postprocess_param*)sys_malloc(sizeof(struct detection_postprocess_param)); if (detection_postprocess_param == NULL) { @@ -94,11 +89,9 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { - struct detection_postprocess_param* detection_postprocess_param = - ( struct detection_postprocess_param* )op->param_mem; + struct detection_postprocess_param* detection_postprocess_param = (struct detection_postprocess_param*)op->param_mem; if (detection_postprocess_param->scales) sys_free(detection_postprocess_param->scales); @@ -106,7 +99,6 @@ static void release_op(struct op* op) sys_free(op->param_mem); } - int register_detection_postprocess_op() { struct method m; @@ -118,7 +110,6 @@ int register_detection_postprocess_op() return register_op(OP_DETECTION_POSTPROCESS, OP_DETECTION_POSTPROCESS_NAME, &m); } - int unregister_detection_postprocess_op() { return unregister_op(OP_DETECTION_POSTPROCESS, 1); diff --git a/source/operator/prototype/detection_postprocess_param.h b/source/operator/prototype/detection_postprocess_param.h index 77a751071..3c53cc022 100644 --- a/source/operator/prototype/detection_postprocess_param.h +++ b/source/operator/prototype/detection_postprocess_param.h @@ -32,7 +32,7 @@ struct detection_postprocess_param float nms_score_threshold; float nms_iou_threshold; int num_classes; - float* scales; // y_scale, x_scale, h_scale, w_scale + float* scales; // y_scale, x_scale, h_scale, w_scale }; #endif diff --git a/source/operator/prototype/dropout.c b/source/operator/prototype/dropout.c index 348a8666a..4fda96805 100644 --- a/source/operator/prototype/dropout.c +++ b/source/operator/prototype/dropout.c @@ -28,7 +28,6 @@ #include "graph/graph.h" #include "module/module.h" - static int init_op(struct op* op) { op->same_shape = 1; @@ -37,9 +36,9 @@ static int init_op(struct op* op) return 0; } - -static void release_op(struct op* op) {} - +static void release_op(struct op* op) +{ +} int register_dropout_op() { @@ -52,7 +51,6 @@ int register_dropout_op() return register_op(OP_DROPOUT, OP_DROPOUT_NAME, &m); } - int unregister_dropout_op() { return unregister_op(OP_DROPOUT, 1); diff --git a/source/operator/prototype/eltwise.c b/source/operator/prototype/eltwise.c index 8266b7c00..4288b8935 100644 --- a/source/operator/prototype/eltwise.c +++ b/source/operator/prototype/eltwise.c @@ -34,14 +34,13 @@ #include - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input0 = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct eltwise_param* eltwise_param = ( struct eltwise_param* )(node->op.param_mem); + struct eltwise_param* eltwise_param = (struct eltwise_param*)(node->op.param_mem); if (node->input_num == 1) { @@ -77,10 +76,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct eltwise_param* eltwise_param = ( struct eltwise_param* )sys_malloc(sizeof(struct eltwise_param)); + struct eltwise_param* eltwise_param = (struct eltwise_param*)sys_malloc(sizeof(struct eltwise_param)); if (eltwise_param == NULL) { @@ -98,13 +96,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_eltwise_op() { struct method m; @@ -113,11 +109,9 @@ int register_eltwise_op() m.init = init_op; m.release = release_op; - return register_op(OP_ELTWISE, OP_ELTWISE_NAME, &m); } - int unregister_eltwise_op() { return unregister_op(OP_ELTWISE, 1); diff --git a/source/operator/prototype/elu.c b/source/operator/prototype/elu.c index 23ad7d4bd..8f0698983 100644 --- a/source/operator/prototype/elu.c +++ b/source/operator/prototype/elu.c @@ -31,7 +31,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* ir_graph = node->graph; @@ -43,10 +42,9 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { - struct elu_param* elu_param = ( struct elu_param* )sys_malloc(sizeof(struct elu_param)); + struct elu_param* elu_param = (struct elu_param*)sys_malloc(sizeof(struct elu_param)); if (elu_param == NULL) { @@ -64,13 +62,11 @@ static int init_op(ir_op_t* op) return 0; } - static void release_op(ir_op_t* op) { sys_free(op->param_mem); } - int register_elu_op() { ir_method_t m; @@ -82,7 +78,6 @@ int register_elu_op() return register_op(OP_ELU, OP_ELU_NAME, &m); } - int unregister_elu_op() { return unregister_op(OP_ELU, 1); diff --git a/source/operator/prototype/embedding.c b/source/operator/prototype/embedding.c index b87b08b89..a8db23069 100644 --- a/source/operator/prototype/embedding.c +++ b/source/operator/prototype/embedding.c @@ -32,7 +32,6 @@ #include - static int infer_shape(struct node* node) { struct graph* graph = node->graph; @@ -46,7 +45,7 @@ static int infer_shape(struct node* node) dims[0] *= input->dims[ii]; } - struct embedding_param* param = ( struct embedding_param* )node->op.param_mem; + struct embedding_param* param = (struct embedding_param*)node->op.param_mem; dims[1] = param->num_output; @@ -55,10 +54,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct embedding_param* param = ( struct embedding_param* )sys_malloc(sizeof(struct embedding_param)); + struct embedding_param* param = (struct embedding_param*)sys_malloc(sizeof(struct embedding_param)); if (param == NULL) { @@ -75,13 +73,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_embedding_op() { struct method m; @@ -90,11 +86,9 @@ int register_embedding_op() m.init = init_op; m.release = release_op; - return register_op(OP_EMBEDDING, OP_EMBEDDING_NAME, &m); } - int unregister_embedding_op() { return unregister_op(OP_EMBEDDING, 1); diff --git a/source/operator/prototype/embedding_param.h b/source/operator/prototype/embedding_param.h index 3489e9f46..d2f268375 100644 --- a/source/operator/prototype/embedding_param.h +++ b/source/operator/prototype/embedding_param.h @@ -28,7 +28,7 @@ struct embedding_param { int num_output; int input_dim; - int bias_term; // if use bias + int bias_term; // if use bias int weight_data_size; }; diff --git a/source/operator/prototype/expand.c b/source/operator/prototype/expand.c index de22eb949..521c86173 100644 --- a/source/operator/prototype/expand.c +++ b/source/operator/prototype/expand.c @@ -40,8 +40,8 @@ static int infer_shape(struct node* node) struct vector* dims = create_vector(sizeof(int), NULL); struct vector* dims1 = create_vector(sizeof(int), NULL); struct vector* dims2 = create_vector(sizeof(int), NULL); - - expand_param_t* param = ( struct expand_param* )(node->op.param_mem); + + expand_param_t* param = (struct expand_param*)(node->op.param_mem); struct graph* graph = node->graph; struct tensor* input1 = get_ir_graph_tensor(graph, node->input_tensors[0]); @@ -49,82 +49,86 @@ static int infer_shape(struct node* node) struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); int flag = 1; - int32_t * input2_data = (int32_t*)input2->data; - for(int i = 0; i < input2->elem_num; i++) + int32_t* input2_data = (int32_t*)input2->data; + for (int i = 0; i < input2->elem_num; i++) { - if(input2_data[i] == 0){ + if (input2_data[i] == 0) + { flag = 0; } } - if(flag == 1) + if (flag == 1) { - for(int i = 0; i < input2->elem_num; i++) + for (int i = 0; i < input2->elem_num; i++) param->ex_shape[i] = input2_data[i]; } - - for(int i = 0; i < (int)param->dim_num; i++) + + for (int i = 0; i < (int)param->dim_num; i++) { int temp = param->ex_shape[i]; push_vector_data(dims2, (void*)&temp); } int num = get_vector_num(dims2); - int input1_dim_size = input1->dim_num; int input2_dim_size = param->dim_num; - - if(input1_dim_size == input2_dim_size) + + if (input1_dim_size == input2_dim_size) { - for(int i = 0; i < input2_dim_size; i++) + for (int i = 0; i < input2_dim_size; i++) { - if(input1->dims[i] >= param->ex_shape[i]) + if (input1->dims[i] >= param->ex_shape[i]) { int temp = input1->dims[i]; push_vector_data(dims, (void*)&temp); - } + } else { int temp = param->ex_shape[i]; push_vector_data(dims, (void*)&temp); } } - } else { + } + else + { int diff = fabs(input1_dim_size - input2_dim_size); - if(input1_dim_size > input2_dim_size) + if (input1_dim_size > input2_dim_size) { - for(int i = 0; i < input1_dim_size; i++) + for (int i = 0; i < input1_dim_size; i++) { int temp = input1->dims[i]; push_vector_data(dims, (void*)&temp); } - for(int i = 0; i < input1_dim_size - diff; i++) + for (int i = 0; i < input1_dim_size - diff; i++) { - if(input1->dims[i+diff] > param->ex_shape[i]) + if (input1->dims[i + diff] > param->ex_shape[i]) { - int temp = input1->dims[i+diff]; + int temp = input1->dims[i + diff]; push_vector_data(dims, (void*)&temp); - } - else + } + else { int temp = param->ex_shape[i]; push_vector_data(dims, (void*)&temp); } } - } else { - for(int i = 0; i < input2_dim_size; i++) + } + else + { + for (int i = 0; i < input2_dim_size; i++) { int temp = param->ex_shape[i]; push_vector_data(dims, (void*)&temp); } - for(int i = 0; i < input2_dim_size - diff; i++) + for (int i = 0; i < input2_dim_size - diff; i++) { - if(param->ex_shape[i+diff] > input1->dims[i]) + if (param->ex_shape[i + diff] > input1->dims[i]) { - int temp = param->ex_shape[i+diff]; + int temp = param->ex_shape[i + diff]; push_vector_data(dims, (void*)&temp); - } - else + } + else { int temp = input1->dims[i]; push_vector_data(dims, (void*)&temp); @@ -133,8 +137,8 @@ static int infer_shape(struct node* node) } } int new_size = 1; - int* new_shape_temp = (int*)sys_malloc(get_vector_num(dims)*sizeof(int)); - for(int i = 0; i < get_vector_num(dims); i++) + int* new_shape_temp = (int*)sys_malloc(get_vector_num(dims) * sizeof(int)); + for (int i = 0; i < get_vector_num(dims); i++) { int* a = (int*)get_vector_data(dims, i); new_shape_temp[i] = *a; @@ -150,10 +154,9 @@ static int infer_shape(struct node* node) return ret; } - static int init_op(struct op* op) { - struct expand_param* expand_param = ( struct expand_param* )sys_malloc(sizeof(struct expand_param)); + struct expand_param* expand_param = (struct expand_param*)sys_malloc(sizeof(struct expand_param)); if (expand_param == NULL) { @@ -170,10 +173,9 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { - struct expand_param* expand_param = ( struct expand_param* )op->param_mem; + struct expand_param* expand_param = (struct expand_param*)op->param_mem; if (expand_param->ex_shape) sys_free(expand_param->ex_shape); @@ -181,7 +183,6 @@ static void release_op(struct op* op) sys_free(op->param_mem); } - int register_expand_op() { struct method m; @@ -193,7 +194,6 @@ int register_expand_op() return register_op(OP_EXPAND, OP_EXPAND_NAME, &m); } - int unregister_expand_op() { return unregister_op(OP_EXPAND, 1); diff --git a/source/operator/prototype/expanddims.c b/source/operator/prototype/expanddims.c index 7b56952b3..c488aa84c 100644 --- a/source/operator/prototype/expanddims.c +++ b/source/operator/prototype/expanddims.c @@ -31,14 +31,13 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct expanddims_param* expanddims_param = ( struct expanddims_param* )(node->op.param_mem); + struct expanddims_param* expanddims_param = (struct expanddims_param*)(node->op.param_mem); int axis = expanddims_param->axis; int in_size = input->dim_num; @@ -66,10 +65,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct expanddims_param* expanddims_param = ( struct expanddims_param* )sys_malloc(sizeof(struct expanddims_param)); + struct expanddims_param* expanddims_param = (struct expanddims_param*)sys_malloc(sizeof(struct expanddims_param)); if (expanddims_param == NULL) { @@ -87,13 +85,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_expanddims_op() { struct method m; @@ -105,7 +101,6 @@ int register_expanddims_op() return register_op(OP_EXPANDDIMS, OP_EXPANDDIMS_NAME, &m); } - int unregister_expanddims_op() { return unregister_op(OP_EXPANDDIMS, 1); diff --git a/source/operator/prototype/fc.c b/source/operator/prototype/fc.c index c96860578..4c4fc3c9f 100644 --- a/source/operator/prototype/fc.c +++ b/source/operator/prototype/fc.c @@ -32,7 +32,6 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* graph = node->graph; @@ -105,7 +104,7 @@ static int infer_shape(ir_node_t* node) static int init_op(ir_op_t* op) { - struct fc_param* fc_param = ( struct fc_param* )sys_malloc(sizeof(struct fc_param)); + struct fc_param* fc_param = (struct fc_param*)sys_malloc(sizeof(struct fc_param)); if (fc_param == NULL) { diff --git a/source/operator/prototype/flatten.c b/source/operator/prototype/flatten.c index 9bfbc2777..5354dfce2 100644 --- a/source/operator/prototype/flatten.c +++ b/source/operator/prototype/flatten.c @@ -31,14 +31,13 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct flatten_param* flatten_param = ( struct flatten_param* )(node->op.param_mem); + struct flatten_param* flatten_param = (struct flatten_param*)(node->op.param_mem); int new_channel = 1; for (int i = flatten_param->axis; i <= flatten_param->end_axis && i < input->dim_num; i++) @@ -59,10 +58,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct flatten_param* flatten_param = ( struct flatten_param* )sys_malloc(sizeof(struct flatten_param)); + struct flatten_param* flatten_param = (struct flatten_param*)sys_malloc(sizeof(struct flatten_param)); if (flatten_param == NULL) { @@ -81,13 +79,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_flatten_op() { struct method m; @@ -96,11 +92,9 @@ int register_flatten_op() m.init = init_op; m.release = release_op; - return register_op(OP_FLATTEN, OP_FLATTEN_NAME, &m); } - int unregister_flatten_op() { return unregister_op(OP_FLATTEN, 1); diff --git a/source/operator/prototype/gather.c b/source/operator/prototype/gather.c index 4dd72a5b5..0027a57c4 100644 --- a/source/operator/prototype/gather.c +++ b/source/operator/prototype/gather.c @@ -32,43 +32,42 @@ #include "utility/sys_port.h" #include "utility/vector.h" - static int infer_shape(struct node* node) { - struct graph* graph = node->graph; - struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct graph* graph = node->graph; + struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct gather_param* _param = ( struct gather_param* )(node->op.param_mem); - + struct gather_param* _param = (struct gather_param*)(node->op.param_mem); + int indices_size = _param->indices_num; - + struct vector* new_shape_temp = create_vector(sizeof(int), NULL); - if(_param->is_onnx) + if (_param->is_onnx) { - if(_param->axis == 0) + if (_param->axis == 0) { - for(int i = 0; i < input->dim_num - 1; i++) + for (int i = 0; i < input->dim_num - 1; i++) { - push_vector_data(new_shape_temp, (void* )&input->dims[i+1]); + push_vector_data(new_shape_temp, (void*)&input->dims[i + 1]); } } else { - for(int i = 0; i < input->dim_num; i++) + for (int i = 0; i < input->dim_num; i++) { - if(i == _param->axis) - push_vector_data(new_shape_temp, (void* )&indices_size); + if (i == _param->axis) + push_vector_data(new_shape_temp, (void*)&indices_size); else - push_vector_data(new_shape_temp, (void* )&input->dims[i]); + push_vector_data(new_shape_temp, (void*)&input->dims[i]); } } - int* shape_temp = (int *)sys_malloc(get_vector_num(new_shape_temp) * sizeof(int)); + int* shape_temp = (int*)sys_malloc(get_vector_num(new_shape_temp) * sizeof(int)); - for (int i=0; idims[0]; dims[1] = input->dims[1]; dims[2] = input->dims[2]; dims[3] = input->dims[3]; - if( _param->axis > ( int )input->dim_num) + if (_param->axis > (int)input->dim_num) { return -1; - } - dims[_param->axis] = indices_size; + } + dims[_param->axis] = indices_size; set_ir_tensor_shape(output, dims, 4); } - + release_vector(new_shape_temp); return 0; } - static int init_op(struct op* op) { - struct gather_param* gather_param = ( struct gather_param* )sys_malloc(sizeof(struct gather_param)); + struct gather_param* gather_param = (struct gather_param*)sys_malloc(sizeof(struct gather_param)); if (gather_param == NULL) { @@ -117,13 +115,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_gather_op() { struct method m; @@ -132,11 +128,9 @@ int register_gather_op() m.init = init_op; m.release = release_op; - return register_op(OP_GATHER, OP_GATHER_NAME, &m); } - int unregister_gather_op() { return unregister_op(OP_GATHER, 1); diff --git a/source/operator/prototype/gemm.c b/source/operator/prototype/gemm.c index f1b299f54..5bb0151e8 100644 --- a/source/operator/prototype/gemm.c +++ b/source/operator/prototype/gemm.c @@ -32,7 +32,6 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; @@ -40,7 +39,7 @@ static int infer_shape(struct node* node) struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); struct tensor* weight = get_ir_graph_tensor(graph, node->input_tensors[1]); - struct gemm_param* gemm_param = ( struct gemm_param* )(node->op.param_mem); + struct gemm_param* gemm_param = (struct gemm_param*)(node->op.param_mem); int dims[2]; if (gemm_param->transA) @@ -58,16 +57,15 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct gemm_param* gemm_param = ( struct gemm_param* )sys_malloc(sizeof(struct gemm_param)); + struct gemm_param* gemm_param = (struct gemm_param*)sys_malloc(sizeof(struct gemm_param)); if (gemm_param == NULL) { return -1; } - + /*set the param default value */ gemm_param->transA = 0; gemm_param->transB = 0; @@ -80,13 +78,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_gemm_op() { struct method m; @@ -98,7 +94,6 @@ int register_gemm_op() return register_op(OP_GEMM, OP_GEMM_NAME, &m); } - int unregister_gemm_op() { return unregister_op(OP_GEMM, 1); diff --git a/source/operator/prototype/generic.c b/source/operator/prototype/generic.c index 2e6b70980..bf504d6d2 100644 --- a/source/operator/prototype/generic.c +++ b/source/operator/prototype/generic.c @@ -32,7 +32,6 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; @@ -62,10 +61,9 @@ static int infer_shape(struct node* node) return -1; } - static int init_op(struct op* op) { - struct generic_param* generic_param = ( struct generic_param* )sys_malloc(sizeof(struct generic_param)); + struct generic_param* generic_param = (struct generic_param*)sys_malloc(sizeof(struct generic_param)); if (generic_param == NULL) { @@ -80,13 +78,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_generic_op() { struct method m; @@ -98,7 +94,6 @@ int register_generic_op() return register_op(OP_GENERIC, OP_GENERIC_NAME, &m); } - int unregister_generic_op() { return unregister_op(OP_GENERIC, 1); diff --git a/source/operator/prototype/generic_param.h b/source/operator/prototype/generic_param.h index 7d04091b8..c63206af8 100644 --- a/source/operator/prototype/generic_param.h +++ b/source/operator/prototype/generic_param.h @@ -27,7 +27,7 @@ struct generic_param { - const char* op_name; // what real action? + const char* op_name; // what real action? int max_input_num; int max_output_num; }; diff --git a/source/operator/prototype/gru.c b/source/operator/prototype/gru.c index 0b8f5a89d..bfca1de3e 100644 --- a/source/operator/prototype/gru.c +++ b/source/operator/prototype/gru.c @@ -32,14 +32,13 @@ #include "utility/sys_port.h" #include "utility/vector.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); struct tensor* weight = get_ir_graph_tensor(ir_graph, node->input_tensors[1]); struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); - struct gru_param* gru_param = ( struct gru_param* )(node->op.param_mem); + struct gru_param* gru_param = (struct gru_param*)(node->op.param_mem); int batch_size = input->dims[1]; int dims[4]; dims[0] = input->dims[0]; @@ -51,10 +50,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - gru_param_t* gru_param = ( gru_param_t* )sys_malloc(sizeof(gru_param_t)); + gru_param_t* gru_param = (gru_param_t*)sys_malloc(sizeof(gru_param_t)); if (gru_param == NULL) { @@ -79,13 +77,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_gru_op() { struct method m; @@ -94,11 +90,9 @@ int register_gru_op() m.init = init_op; m.release = release_op; - return register_op(OP_GRU, OP_GRU_NAME, &m); } - int unregister_gru_op() { return unregister_op(OP_GRU, 1); diff --git a/source/operator/prototype/gru_param.h b/source/operator/prototype/gru_param.h index d1ba5266b..ae85273a1 100644 --- a/source/operator/prototype/gru_param.h +++ b/source/operator/prototype/gru_param.h @@ -26,7 +26,7 @@ #define __GRU_PARAM_H__ #define GRU_ACT_SIGMOID 1 -#define GRU_ACT_TANH 2 +#define GRU_ACT_TANH 2 typedef struct gru_param { float clip; diff --git a/source/operator/prototype/hardsigmoid.c b/source/operator/prototype/hardsigmoid.c index 1f44c4655..77d779a82 100644 --- a/source/operator/prototype/hardsigmoid.c +++ b/source/operator/prototype/hardsigmoid.c @@ -32,7 +32,6 @@ #include - static int infer_shape(struct node* node) { struct graph* graph = node->graph; @@ -44,10 +43,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct hard_sigmoid_param* param = ( struct hard_sigmoid_param* )sys_malloc(sizeof(struct hard_sigmoid_param)); + struct hard_sigmoid_param* param = (struct hard_sigmoid_param*)sys_malloc(sizeof(struct hard_sigmoid_param)); if (param == NULL) { @@ -64,13 +62,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_hardsigmoid_op() { struct method m; @@ -82,7 +78,6 @@ int register_hardsigmoid_op() return register_op(OP_HARDSIGMOID, OP_HARDSIGMOID_NAME, &m); } - int unregister_hardsigmoid_op() { return unregister_op(OP_HARDSIGMOID, 1); diff --git a/source/operator/prototype/hardswish.c b/source/operator/prototype/hardswish.c index 49819c71b..04216b9dd 100644 --- a/source/operator/prototype/hardswish.c +++ b/source/operator/prototype/hardswish.c @@ -32,10 +32,9 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int init_op(struct op* op) { - struct hardswish_param* hardswish_param = ( struct hardswish_param* )sys_malloc(sizeof(struct hardswish_param)); + struct hardswish_param* hardswish_param = (struct hardswish_param*)sys_malloc(sizeof(struct hardswish_param)); if (hardswish_param == NULL) { @@ -54,13 +53,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_hardswish_op() { struct method m; @@ -72,7 +69,6 @@ int register_hardswish_op() return register_op(OP_HARDSWISH, OP_HARDSWISH_NAME, &m); } - int unregister_hardswish_op() { return unregister_op(OP_HARDSWISH, 1); diff --git a/source/operator/prototype/input.c b/source/operator/prototype/input.c index a9166e60a..551d18ee3 100644 --- a/source/operator/prototype/input.c +++ b/source/operator/prototype/input.c @@ -28,7 +28,6 @@ #include "graph/graph.h" #include "module/module.h" - static int init_op(ir_op_t* op) { op->same_shape = 1; @@ -37,9 +36,9 @@ static int init_op(ir_op_t* op) return 0; } - -static void release_op(ir_op_t* op) {} - +static void release_op(ir_op_t* op) +{ +} int register_input_op() { @@ -49,10 +48,9 @@ int register_input_op() m.init = init_op; m.release = release_op; - return register_op(OP_INPUT, OP_INPUT_NAME , &m); + return register_op(OP_INPUT, OP_INPUT_NAME, &m); } - int unregister_input_op() { return unregister_op(OP_INPUT, 1); diff --git a/source/operator/prototype/instancenorm.c b/source/operator/prototype/instancenorm.c index 279e45adc..001f474ae 100644 --- a/source/operator/prototype/instancenorm.c +++ b/source/operator/prototype/instancenorm.c @@ -34,7 +34,6 @@ #include - static int infer_shape(struct node* node) { struct graph* graph = node->graph; @@ -46,10 +45,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct instancenorm_Param* param = ( struct instancenorm_Param* )sys_malloc(sizeof(struct instancenorm_Param)); + struct instancenorm_Param* param = (struct instancenorm_Param*)sys_malloc(sizeof(struct instancenorm_Param)); if (param == NULL) { @@ -66,13 +64,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_instancenorm_op() { struct method m; @@ -84,7 +80,6 @@ int register_instancenorm_op() return register_op(OP_INSTANCENORM, OP_INSTANCENORM_NAME, &m); } - int unregister_instancenorm_op() { return unregister_op(OP_INSTANCENORM, 1); diff --git a/source/operator/prototype/interp.c b/source/operator/prototype/interp.c index 63815c5dd..88c36499b 100644 --- a/source/operator/prototype/interp.c +++ b/source/operator/prototype/interp.c @@ -31,7 +31,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; @@ -42,7 +41,7 @@ static int infer_shape(struct node* node) int in_h = input->dims[2]; int in_w = input->dims[3]; - struct interp_param* param = ( struct interp_param* )(node->op.param_mem); + struct interp_param* param = (struct interp_param*)(node->op.param_mem); if (param == NULL) { @@ -56,8 +55,8 @@ static int infer_shape(struct node* node) } else { - param->height_scale = (float )param->output_height / (float )in_h; - param->width_scale = (float )param->output_width / (float )in_w; + param->height_scale = (float)param->output_height / (float)in_h; + param->width_scale = (float)param->output_width / (float)in_w; } int dim[4] = {0}; @@ -72,10 +71,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct interp_param* interp_param = ( struct interp_param* )sys_malloc(sizeof(struct interp_param)); + struct interp_param* interp_param = (struct interp_param*)sys_malloc(sizeof(struct interp_param)); if (interp_param == NULL) { @@ -97,13 +95,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_interp_op() { struct method m; @@ -115,7 +111,6 @@ int register_interp_op() return register_op(OP_INTERP, OP_INTERP_NAME, &m); } - int unregister_interp_op() { return unregister_op(OP_INTERP, 1); diff --git a/source/operator/prototype/l2normalization.c b/source/operator/prototype/l2normalization.c index 8089cc4bd..b6dfed267 100644 --- a/source/operator/prototype/l2normalization.c +++ b/source/operator/prototype/l2normalization.c @@ -30,7 +30,6 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -42,7 +41,6 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { op->same_shape = 0; @@ -51,9 +49,9 @@ static int init_op(struct op* op) return 0; } - -static void release_op(struct op* op) {} - +static void release_op(struct op* op) +{ +} int register_l2normalization_op() { @@ -66,7 +64,6 @@ int register_l2normalization_op() return register_op(OP_L2NORMALIZATION, OP_L2NORMALIZATION_NAME, &m); } - int unregister_l2normalization_op() { return unregister_op(OP_L2NORMALIZATION, 1); diff --git a/source/operator/prototype/l2pool.c b/source/operator/prototype/l2pool.c index b4e24f355..d5c79cbe6 100644 --- a/source/operator/prototype/l2pool.c +++ b/source/operator/prototype/l2pool.c @@ -31,26 +31,28 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct l2pool_param* l2pool_param = (struct l2pool_param* )(node->op.param_mem); + struct l2pool_param* l2pool_param = (struct l2pool_param*)(node->op.param_mem); int input_h = input_tensor->dims[1]; int input_w = input_tensor->dims[2]; int output_h = 0; int output_w = 0; - if(l2pool_param->paddingType == 1){ - output_h = (input_h + l2pool_param->stride_h -1 )/l2pool_param->stride_h; - output_w = (input_w + l2pool_param->stride_w -1 )/l2pool_param->stride_w; - } else { - output_h = (input_h + l2pool_param->stride_h - l2pool_param->kernel_h)/l2pool_param->stride_h; - output_w = (input_w + l2pool_param->stride_w - l2pool_param->kernel_w)/l2pool_param->stride_w; + if (l2pool_param->paddingType == 1) + { + output_h = (input_h + l2pool_param->stride_h - 1) / l2pool_param->stride_h; + output_w = (input_w + l2pool_param->stride_w - 1) / l2pool_param->stride_w; + } + else + { + output_h = (input_h + l2pool_param->stride_h - l2pool_param->kernel_h) / l2pool_param->stride_h; + output_w = (input_w + l2pool_param->stride_w - l2pool_param->kernel_w) / l2pool_param->stride_w; } int dims[4]; dims[0] = input_tensor->dims[0]; @@ -61,13 +63,11 @@ static int infer_shape(struct node* node) set_ir_tensor_shape(output_tensor, dims, 4); return 0; - } - static int init_op(struct op* op) { - struct l2pool_param* l2pool_param = ( struct l2pool_param* )sys_malloc(sizeof(struct l2pool_param)); + struct l2pool_param* l2pool_param = (struct l2pool_param*)sys_malloc(sizeof(struct l2pool_param)); if (l2pool_param == NULL) { @@ -82,13 +82,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_l2pool_op() { struct method m; @@ -96,13 +94,10 @@ int register_l2pool_op() m.init = init_op; m.release = release_op; - return register_op(OP_L2POOL, OP_L2POOL_NAME, &m); - } - int unregister_l2pool_op() { - return unregister_op(OP_L2POOL,1); + return unregister_op(OP_L2POOL, 1); } diff --git a/source/operator/prototype/l2pool_param.h b/source/operator/prototype/l2pool_param.h index 477242241..57eef5352 100644 --- a/source/operator/prototype/l2pool_param.h +++ b/source/operator/prototype/l2pool_param.h @@ -25,7 +25,8 @@ #ifndef __L2POOL_H__ #define __L2POOL_H__ -enum{ +enum +{ kNone = 0, kSame, kValid diff --git a/source/operator/prototype/logical.c b/source/operator/prototype/logical.c index 4091ff40f..ddb66ead4 100644 --- a/source/operator/prototype/logical.c +++ b/source/operator/prototype/logical.c @@ -32,7 +32,6 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { if (node->input_num == 1) @@ -64,10 +63,9 @@ static int infer_shape(struct node* node) return -1; } - static int init_op(struct op* op) { - struct logical_param* logical_param = ( struct logical_param* )sys_malloc(sizeof(struct logical_param)); + struct logical_param* logical_param = (struct logical_param*)sys_malloc(sizeof(struct logical_param)); if (logical_param == NULL) { @@ -85,13 +83,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_logical_op() { struct method m; @@ -103,7 +99,6 @@ int register_logical_op() return register_op(OP_LOGICAL, OP_LOGICAL_NAME, &m); } - int unregister_logical_op() { return unregister_op(OP_LOGICAL, 1); diff --git a/source/operator/prototype/logsoftmax.c b/source/operator/prototype/logsoftmax.c index 73b44db13..3fc1946ba 100644 --- a/source/operator/prototype/logsoftmax.c +++ b/source/operator/prototype/logsoftmax.c @@ -32,7 +32,6 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -44,10 +43,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct logsoftmax_param* logsoftmax_param = ( struct logsoftmax_param* )sys_malloc(sizeof(struct logsoftmax_param)); + struct logsoftmax_param* logsoftmax_param = (struct logsoftmax_param*)sys_malloc(sizeof(struct logsoftmax_param)); if (logsoftmax_param == NULL) { @@ -64,13 +62,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_logsoftmax_op() { struct method m; @@ -81,8 +77,7 @@ int register_logsoftmax_op() return register_op(OP_LOGSOFTMAX, OP_LOGSOFTMAX_NAME, &m); } - int unregister_logsoftmax_op() { - return unregister_op(OP_LOGSOFTMAX,1); + return unregister_op(OP_LOGSOFTMAX, 1); } diff --git a/source/operator/prototype/lrn.c b/source/operator/prototype/lrn.c index a45e9655e..b24d6efed 100644 --- a/source/operator/prototype/lrn.c +++ b/source/operator/prototype/lrn.c @@ -31,7 +31,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -43,10 +42,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct lrn_param* lrn_param = ( struct lrn_param* )sys_malloc(sizeof(struct lrn_param)); + struct lrn_param* lrn_param = (struct lrn_param*)sys_malloc(sizeof(struct lrn_param)); if (lrn_param == NULL) { @@ -68,13 +66,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_lrn_op() { struct method m; @@ -83,11 +79,9 @@ int register_lrn_op() m.init = init_op; m.release = release_op; - return register_op(OP_LRN, OP_LRN_NAME, &m); } - int unregister_lrn_op() { return unregister_op(OP_LRN, 1); diff --git a/source/operator/prototype/lstm.c b/source/operator/prototype/lstm.c index af55daccd..280a86013 100644 --- a/source/operator/prototype/lstm.c +++ b/source/operator/prototype/lstm.c @@ -32,13 +32,12 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); - struct lstm_param* lstm_param = ( struct lstm_param* )(node->op.param_mem); + struct lstm_param* lstm_param = (struct lstm_param*)(node->op.param_mem); int batch_size = input->dims[1]; if (lstm_param->mxnet_flag == 0) { @@ -64,10 +63,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - lstm_param_t* lstm_param = ( lstm_param_t* )sys_malloc(sizeof(lstm_param_t)); + lstm_param_t* lstm_param = (lstm_param_t*)sys_malloc(sizeof(lstm_param_t)); if (lstm_param == NULL) { @@ -95,13 +93,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_lstm_op() { struct method m; @@ -113,7 +109,6 @@ int register_lstm_op() return register_op(OP_LSTM, OP_LSTM_NAME, &m); } - int unregister_lstm_op() { return unregister_op(OP_LSTM, 1); diff --git a/source/operator/prototype/lstm_param.h b/source/operator/prototype/lstm_param.h index 0aa111974..9fc8ff60c 100644 --- a/source/operator/prototype/lstm_param.h +++ b/source/operator/prototype/lstm_param.h @@ -26,7 +26,7 @@ #define __LSTM_PARAM_H__ #define LSTM_ACT_SIGMOID 1 -#define LSTM_ACT_TANH 2 +#define LSTM_ACT_TANH 2 typedef struct lstm_param { float forget_bias; diff --git a/source/operator/prototype/matmul.c b/source/operator/prototype/matmul.c index 262a4e742..74a93c0d1 100644 --- a/source/operator/prototype/matmul.c +++ b/source/operator/prototype/matmul.c @@ -29,7 +29,6 @@ #include "module/module.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; @@ -77,7 +76,6 @@ static int infer_shape(struct node* node) return -1; } - static int init_op(struct op* op) { op->same_shape = 0; @@ -85,7 +83,6 @@ static int init_op(struct op* op) return 0; } - int register_matmul_op() { struct method m; @@ -94,11 +91,9 @@ int register_matmul_op() m.init = init_op; m.release = NULL; - return register_op(OP_MATMUL, OP_MATMUL_NAME, &m); } - int unregister_matmul_op() { return unregister_op(OP_MATMUL, 1); diff --git a/source/operator/prototype/maximum.c b/source/operator/prototype/maximum.c index 2d72a812c..d0bf587c9 100644 --- a/source/operator/prototype/maximum.c +++ b/source/operator/prototype/maximum.c @@ -28,7 +28,6 @@ #include "graph/graph.h" #include "module/module.h" - static int init_op(struct op* op) { op->same_shape = 1; @@ -36,7 +35,6 @@ static int init_op(struct op* op) return 0; } - int register_maximum_op() { struct method m; @@ -47,7 +45,6 @@ int register_maximum_op() return register_op(OP_MAXIMUM, OP_MAXIMUM_NAME, &m); } - int unregister_maximum_op() { return unregister_op(OP_MAXIMUM, 1); diff --git a/source/operator/prototype/mean.c b/source/operator/prototype/mean.c index 345f2473e..f4d9ec962 100644 --- a/source/operator/prototype/mean.c +++ b/source/operator/prototype/mean.c @@ -28,7 +28,6 @@ #include "graph/graph.h" #include "module/module.h" - static int init_op(struct op* op) { op->same_shape = 1; @@ -36,7 +35,6 @@ static int init_op(struct op* op) return 0; } - int register_mean_op() { struct method m; @@ -47,7 +45,6 @@ int register_mean_op() return register_op(OP_MEAN, OP_MEAN_NAME, &m); } - int unregister_mean_op() { return unregister_op(OP_MEAN, 1); diff --git a/source/operator/prototype/minimum.c b/source/operator/prototype/minimum.c index fb83dcbbd..9dc624feb 100644 --- a/source/operator/prototype/minimum.c +++ b/source/operator/prototype/minimum.c @@ -30,7 +30,6 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int init_op(struct op* op) { op->same_shape = 1; @@ -38,7 +37,6 @@ static int init_op(struct op* op) return 0; } - int register_minimum_op() { struct method m; @@ -49,7 +47,6 @@ int register_minimum_op() return register_op(OP_MINIMUM, OP_MINIMUM_NAME, &m); } - int unregister_minimum_op() { return unregister_op(OP_MINIMUM, 1); diff --git a/source/operator/prototype/mish.c b/source/operator/prototype/mish.c index a47e0e83a..bd45c69e8 100644 --- a/source/operator/prototype/mish.c +++ b/source/operator/prototype/mish.c @@ -27,7 +27,6 @@ #include "graph/graph.h" #include "module/module.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -39,7 +38,6 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { op->same_shape = 0; @@ -48,9 +46,9 @@ static int init_op(struct op* op) return 0; } - -static void release_op(struct op* op) {} - +static void release_op(struct op* op) +{ +} int register_mish_op() { @@ -63,7 +61,6 @@ int register_mish_op() return register_op(OP_MISH, OP_MISH_NAME, &m); } - int unregister_mish_op() { return unregister_op(OP_MISH, 1); diff --git a/source/operator/prototype/mvn.c b/source/operator/prototype/mvn.c index 4be7fe36a..b200b2e7a 100644 --- a/source/operator/prototype/mvn.c +++ b/source/operator/prototype/mvn.c @@ -32,10 +32,9 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int init_op(struct op* op) { - struct mvn_param* param = ( struct mvn_param* )sys_malloc(sizeof(struct mvn_param)); + struct mvn_param* param = (struct mvn_param*)sys_malloc(sizeof(struct mvn_param)); if (param == NULL) { @@ -51,13 +50,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_mvn_op() { struct method m; @@ -69,7 +66,6 @@ int register_mvn_op() return register_op(OP_MVN, OP_MVN_NAME, &m); } - int unregister_mvn_op() { return unregister_op(OP_MVN, 1); diff --git a/source/operator/prototype/noop.c b/source/operator/prototype/noop.c index 3df6c4e9a..1ad30a9bd 100644 --- a/source/operator/prototype/noop.c +++ b/source/operator/prototype/noop.c @@ -28,7 +28,6 @@ #include "graph/graph.h" #include "module/module.h" - static int init_op(struct op* op) { op->same_shape = 1; @@ -37,13 +36,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { // sys_free(op->param_mem); } - int register_noop_op() { struct method m; @@ -55,7 +52,6 @@ int register_noop_op() return register_op(OP_NOOP, OP_NOOP_NAME, &m); } - int unregister_noop_op() { return unregister_op(OP_NOOP, 1); diff --git a/source/operator/prototype/normalize.c b/source/operator/prototype/normalize.c index efbfbabe4..ec3068ce2 100644 --- a/source/operator/prototype/normalize.c +++ b/source/operator/prototype/normalize.c @@ -31,10 +31,9 @@ #include "module/module.h" #include "utility/sys_port.h" - static int init_op(struct op* op) { - normalize_param_t* normalize_param = ( normalize_param_t* )sys_malloc(sizeof(normalize_param_t)); + normalize_param_t* normalize_param = (normalize_param_t*)sys_malloc(sizeof(normalize_param_t)); if (normalize_param == NULL) { @@ -52,13 +51,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_normalize_op() { struct method m; @@ -70,7 +67,6 @@ int register_normalize_op() return register_op(OP_NORMALIZE, OP_NORMALIZE_NAME, &m); } - int unregister_normalize_op() { return unregister_op(OP_NORMALIZE, 1); diff --git a/source/operator/prototype/pad.c b/source/operator/prototype/pad.c index a04c6a968..c96ad367e 100644 --- a/source/operator/prototype/pad.c +++ b/source/operator/prototype/pad.c @@ -31,18 +31,16 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* graph = node->graph; ir_tensor_t* input = get_ir_graph_tensor(graph, node->input_tensors[0]); ir_tensor_t* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct pad_param* pad_param = ( struct pad_param* )(node->op.param_mem); + struct pad_param* pad_param = (struct pad_param*)(node->op.param_mem); int dims[TE_MAX_SHAPE_DIM_NUM] = {0}; - if (pad_param->pad_0_h != -1 && pad_param->pad_0_w != -1 && pad_param->pad_1_h != -1 && pad_param->pad_1_w != -1 && - pad_param->pad_2_h != -1 && pad_param->pad_2_w != -1 && pad_param->pad_3_h != -1 && pad_param->pad_3_w != -1) + if (pad_param->pad_0_h != -1 && pad_param->pad_0_w != -1 && pad_param->pad_1_h != -1 && pad_param->pad_1_w != -1 && pad_param->pad_2_h != -1 && pad_param->pad_2_w != -1 && pad_param->pad_3_h != -1 && pad_param->pad_3_w != -1) { dims[0] = input->dims[0] + pad_param->pad_0_h + pad_param->pad_0_w; dims[1] = input->dims[1] + pad_param->pad_1_h + pad_param->pad_1_w; @@ -59,10 +57,9 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { - struct pad_param* pad_param = ( struct pad_param* )sys_malloc(sizeof(struct pad_param)); + struct pad_param* pad_param = (struct pad_param*)sys_malloc(sizeof(struct pad_param)); if (pad_param == NULL) { @@ -70,13 +67,13 @@ static int init_op(ir_op_t* op) } pad_param->mode = 0; - pad_param->pad_0_h = -1; // n + pad_param->pad_0_h = -1; // n pad_param->pad_0_w = -1; - pad_param->pad_1_h = -1; // c + pad_param->pad_1_h = -1; // c pad_param->pad_1_w = -1; - pad_param->pad_2_h = -1; // h + pad_param->pad_2_h = -1; // h pad_param->pad_2_w = -1; - pad_param->pad_3_h = -1; // w + pad_param->pad_3_h = -1; // w pad_param->pad_3_w = -1; pad_param->value = 0; @@ -89,13 +86,11 @@ static int init_op(ir_op_t* op) return 0; } - static void release_op(ir_op_t* op) { sys_free(op->param_mem); } - int register_pad_op() { ir_method_t m; @@ -107,7 +102,6 @@ int register_pad_op() return register_op(OP_PAD, OP_PAD_NAME, &m); } - int unregister_pad_op() { return unregister_op(OP_PAD, 1); diff --git a/source/operator/prototype/pad_param.h b/source/operator/prototype/pad_param.h index 87228a054..166a90a9f 100644 --- a/source/operator/prototype/pad_param.h +++ b/source/operator/prototype/pad_param.h @@ -28,13 +28,13 @@ struct pad_param { // mode : 0: CONSTANT; 1: REFLECT; 2: SYMMETRIC. int mode; - int pad_0_h; // n + int pad_0_h; // n int pad_0_w; - int pad_1_h; // c + int pad_1_h; // c int pad_1_w; - int pad_2_h; // h + int pad_2_h; // h int pad_2_w; - int pad_3_h; // w + int pad_3_h; // w int pad_3_w; - float value; // pad value + float value; // pad value }; diff --git a/source/operator/prototype/permute.c b/source/operator/prototype/permute.c index 49568214b..234071341 100644 --- a/source/operator/prototype/permute.c +++ b/source/operator/prototype/permute.c @@ -31,13 +31,12 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - permute_param_t* param = ( struct permute_param* )(node->op.param_mem); + permute_param_t* param = (struct permute_param*)(node->op.param_mem); int dims[TE_MAX_SHAPE_DIM_NUM] = {0}; int dim_size = input->dim_num; @@ -67,10 +66,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct permute_param* permute_param = ( struct permute_param* )sys_malloc(sizeof(struct permute_param)); + struct permute_param* permute_param = (struct permute_param*)sys_malloc(sizeof(struct permute_param)); if (permute_param == NULL) { @@ -91,13 +89,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_permute_op() { struct method m; @@ -109,7 +105,6 @@ int register_permute_op() return register_op(OP_PERMUTE, OP_PERMUTE_NAME, &m); } - int unregister_permute_op() { return unregister_op(OP_PERMUTE, 1); diff --git a/source/operator/prototype/pooling.c b/source/operator/prototype/pooling.c index 0ca3d1b7a..7a4a80e2d 100644 --- a/source/operator/prototype/pooling.c +++ b/source/operator/prototype/pooling.c @@ -31,13 +31,12 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* ir_graph = node->graph; ir_tensor_t* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); ir_tensor_t* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); - struct pool_param* pool_param = ( struct pool_param* )node->op.param_mem; + struct pool_param* pool_param = (struct pool_param*)node->op.param_mem; int batch = input->dims[0]; int channel = input->dims[1]; @@ -45,9 +44,7 @@ static int infer_shape(ir_node_t* node) int input_w = input->dims[3]; int output_h, output_w; - if (pool_param->kernel_h == input_h && pool_param->kernel_w == input_w && - pool_param->pad_w0 == 0 && pool_param->pad_w1 == 0 && - pool_param->pad_h0 == 0 && pool_param->pad_h1 == 0) + if (pool_param->kernel_h == input_h && pool_param->kernel_w == input_w && pool_param->pad_w0 == 0 && pool_param->pad_w1 == 0 && pool_param->pad_h0 == 0 && pool_param->pad_h1 == 0) { pool_param->global = 1; } @@ -102,10 +99,9 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { - struct pool_param* pool_param = ( struct pool_param* )sys_malloc(sizeof(struct pool_param)); + struct pool_param* pool_param = (struct pool_param*)sys_malloc(sizeof(struct pool_param)); if (pool_param == NULL) { @@ -137,13 +133,11 @@ static int init_op(ir_op_t* op) return 0; } - static void release_op(ir_op_t* op) { sys_free(op->param_mem); } - int register_pooling_op() { ir_method_t m; @@ -155,7 +149,6 @@ int register_pooling_op() return register_op(OP_POOL, OP_POOL_NAME, &m); } - int unregister_pooling_op() { return unregister_op(OP_POOL, 1); diff --git a/source/operator/prototype/pooling_param.h b/source/operator/prototype/pooling_param.h index 6bd28d9ea..214c3df33 100644 --- a/source/operator/prototype/pooling_param.h +++ b/source/operator/prototype/pooling_param.h @@ -35,7 +35,7 @@ enum struct pool_param { - int pool_method; // 0:max 1:avg + int pool_method; // 0:max 1:avg int kernel_h; int kernel_w; int stride_h; @@ -44,7 +44,7 @@ struct pool_param int pad_h1; int pad_w0; int pad_w1; - int global; // 0:general 1:global + int global; // 0:general 1:global int caffe_flavor; void* funct; diff --git a/source/operator/prototype/prelu.c b/source/operator/prototype/prelu.c index 1496333d3..8fd689a45 100644 --- a/source/operator/prototype/prelu.c +++ b/source/operator/prototype/prelu.c @@ -27,7 +27,6 @@ #include "graph/graph.h" #include "module/module.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -39,7 +38,6 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { op->same_shape = 0; @@ -48,9 +46,9 @@ static int init_op(struct op* op) return 0; } - -static void release_op(struct op* op) {} - +static void release_op(struct op* op) +{ +} int register_prelu_op() { @@ -60,11 +58,9 @@ int register_prelu_op() m.init = init_op; m.release = release_op; - return register_op(OP_PRELU, OP_PRELU_NAME, &m); } - int unregister_prelu_op() { return unregister_op(OP_PRELU, 1); diff --git a/source/operator/prototype/priorbox.c b/source/operator/prototype/priorbox.c index 1ef13f0d8..79251ef35 100644 --- a/source/operator/prototype/priorbox.c +++ b/source/operator/prototype/priorbox.c @@ -31,10 +31,9 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { - priorbox_param_t* priorbox_param = ( priorbox_param_t* )node->op.param_mem; + priorbox_param_t* priorbox_param = (priorbox_param_t*)node->op.param_mem; struct graph* ir_graph = node->graph; struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); @@ -76,10 +75,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct priorbox_param* priorbox_param = ( struct priorbox_param* )sys_malloc(sizeof(struct priorbox_param)); + struct priorbox_param* priorbox_param = (struct priorbox_param*)sys_malloc(sizeof(struct priorbox_param)); if (priorbox_param == NULL) { @@ -96,10 +94,9 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { - struct priorbox_param* priorbox_param = ( struct priorbox_param* )op->param_mem; + struct priorbox_param* priorbox_param = (struct priorbox_param*)op->param_mem; if (priorbox_param->aspect_ratio) sys_free(priorbox_param->aspect_ratio); @@ -113,7 +110,6 @@ static void release_op(struct op* op) sys_free(op->param_mem); } - int register_priorbox_op() { struct method m; @@ -122,11 +118,9 @@ int register_priorbox_op() m.init = init_op; m.release = release_op; - return register_op(OP_PRIORBOX, OP_PRIORBOX_NAME, &m); } - int unregister_priorbox_op() { return unregister_op(OP_PRIORBOX, 1); diff --git a/source/operator/prototype/psroipooling.c b/source/operator/prototype/psroipooling.c index 30d597d1e..3508571b8 100644 --- a/source/operator/prototype/psroipooling.c +++ b/source/operator/prototype/psroipooling.c @@ -32,14 +32,13 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct psroipooling_param* psroipooling_param = ( struct psroipooling_param* )(node->op.param_mem); + struct psroipooling_param* psroipooling_param = (struct psroipooling_param*)(node->op.param_mem); int output_n = input->dims[0]; int output_c = psroipooling_param->output_dim; @@ -58,7 +57,6 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { struct psroipooling_param* psroipooling_param = (struct psroipooling_param*)sys_malloc(sizeof(struct psroipooling_param)); @@ -82,13 +80,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_psroipooling_op() { struct method m; @@ -100,7 +96,6 @@ int register_psroipooling_op() return register_op(OP_PSROIPOOLING, OP_PSROIPOOLING_NAME, &m); } - int unregister_psroipooling_op() { return unregister_op(OP_PSROIPOOLING, 1); diff --git a/source/operator/prototype/reciprocal.c b/source/operator/prototype/reciprocal.c index 74724f5ca..91a49689a 100644 --- a/source/operator/prototype/reciprocal.c +++ b/source/operator/prototype/reciprocal.c @@ -48,7 +48,9 @@ static int init_op(struct op* op) return 0; } -static void release_op(struct op* op) {} +static void release_op(struct op* op) +{ +} int register_reciprocal_op() { diff --git a/source/operator/prototype/reducel2.c b/source/operator/prototype/reducel2.c index 107c02475..64871f9df 100644 --- a/source/operator/prototype/reducel2.c +++ b/source/operator/prototype/reducel2.c @@ -31,23 +31,22 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); - struct reducel2_param* reducel2_param = ( struct reducel2_param* )node->op.param_mem; + struct reducel2_param* reducel2_param = (struct reducel2_param*)node->op.param_mem; int kd = reducel2_param->keepdim; int axis = reducel2_param->axis; - int* out_dim = ( int* )sys_malloc(input->dim_num * sizeof(int)); + int* out_dim = (int*)sys_malloc(input->dim_num * sizeof(int)); if (axis < 0) axis = axis + input->dim_num; - for (unsigned int i = 0; i < input->dim_num && i < ( unsigned int )axis; i++) + for (unsigned int i = 0; i < input->dim_num && i < (unsigned int)axis; i++) { out_dim[i] = input->dims[i]; } @@ -67,10 +66,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct reducel2_param* reducel2_param = ( struct reducel2_param* )sys_malloc(sizeof(struct reducel2_param)); + struct reducel2_param* reducel2_param = (struct reducel2_param*)sys_malloc(sizeof(struct reducel2_param)); if (reducel2_param == NULL) { @@ -88,13 +86,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_reducel2_op() { struct method m; @@ -106,7 +102,6 @@ int register_reducel2_op() return register_op(OP_REDUCEL2, OP_REDUCEL2_NAME, &m); } - int unregister_reducel2_op() { return unregister_op(OP_REDUCEL2, 1); diff --git a/source/operator/prototype/reduction.c b/source/operator/prototype/reduction.c index cea60886b..128c0d668 100644 --- a/source/operator/prototype/reduction.c +++ b/source/operator/prototype/reduction.c @@ -31,7 +31,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct reduction_param* reduction_param = (struct reduction_param*)node->op.param_mem; @@ -41,7 +40,7 @@ static int infer_shape(struct node* node) struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); int kd = reduction_param->keepdim; - int* in_dim = ( int* )sys_malloc(input->dim_num * sizeof(int)); + int* in_dim = (int*)sys_malloc(input->dim_num * sizeof(int)); for (int i = 0; i < input->dim_num; i++) { @@ -64,7 +63,7 @@ static int infer_shape(struct node* node) { count++; } - int* new_shape = ( int* )sys_malloc(count * sizeof(int)); + int* new_shape = (int*)sys_malloc(count * sizeof(int)); int size = 0; if (reduction_param->dim_0 != -2) { @@ -87,7 +86,7 @@ static int infer_shape(struct node* node) size++; } - int8_t should_reduced[5] = { 0, 0, 0, 0, 0}; + int8_t should_reduced[5] = {0, 0, 0, 0, 0}; int reduceddim = 0; int real_shape[5] = {0, 1, 2, 3, 4}; @@ -138,7 +137,7 @@ static int infer_shape(struct node* node) } else { - int* odim = ( int* )sys_malloc(input->dim_num * sizeof(int)); + int* odim = (int*)sys_malloc(input->dim_num * sizeof(int)); for (int i_idx = 0, o_idx = 0; i_idx < input->dim_num; i_idx++) { odim[o_idx++] = 1; @@ -163,7 +162,7 @@ static int infer_shape(struct node* node) { o_size = input->dim_num; } - int* odim = ( int* )sys_malloc(o_size * sizeof(int)); + int* odim = (int*)sys_malloc(o_size * sizeof(int)); for (int i_idx = 0, o_idx = 0; i_idx < input->dim_num; i_idx++) { if (!should_reduced[i_idx]) @@ -184,10 +183,9 @@ static int infer_shape(struct node* node) } } - static int init_op(struct op* op) { - struct reduction_param* reduction_param = ( struct reduction_param* )sys_malloc(sizeof(struct reduction_param)); + struct reduction_param* reduction_param = (struct reduction_param*)sys_malloc(sizeof(struct reduction_param)); if (reduction_param == NULL) { @@ -209,13 +207,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_reduction_op() { struct method m; @@ -224,11 +220,9 @@ int register_reduction_op() m.init = init_op; m.release = release_op; - return register_op(OP_REDUCTION, OP_REDUCTION_NAME, &m); } - int unregister_reduction_op() { return unregister_op(OP_REDUCTION, 1); diff --git a/source/operator/prototype/region.c b/source/operator/prototype/region.c index 8d2c23704..28490cc2f 100644 --- a/source/operator/prototype/region.c +++ b/source/operator/prototype/region.c @@ -31,10 +31,9 @@ #include "module/module.h" #include "utility/sys_port.h" - static int init_op(struct op* op) { - struct region_param* region_param = ( struct region_param* )sys_malloc(sizeof(struct region_param)); + struct region_param* region_param = (struct region_param*)sys_malloc(sizeof(struct region_param)); if (region_param == NULL) { @@ -52,13 +51,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_region_op() { struct method m; @@ -70,7 +67,6 @@ int register_region_op() return register_op(OP_REGION, OP_REGION_NAME, &m); } - int unregister_region_op() { return unregister_op(OP_REGION, 1); diff --git a/source/operator/prototype/relu.c b/source/operator/prototype/relu.c index 859c5e461..57b179de2 100644 --- a/source/operator/prototype/relu.c +++ b/source/operator/prototype/relu.c @@ -31,7 +31,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* ir_graph = node->graph; @@ -43,10 +42,9 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { - struct relu_param* relu_param = ( struct relu_param* )sys_malloc(sizeof(struct relu_param)); + struct relu_param* relu_param = (struct relu_param*)sys_malloc(sizeof(struct relu_param)); if (relu_param == NULL) { @@ -64,13 +62,11 @@ static int init_op(ir_op_t* op) return 0; } - static void release_op(ir_op_t* op) { sys_free(op->param_mem); } - int register_relu_op() { ir_method_t m; @@ -82,7 +78,6 @@ int register_relu_op() return register_op(OP_RELU, OP_RELU_NAME, &m); } - int unregister_relu_op() { return unregister_op(OP_RELU, 1); diff --git a/source/operator/prototype/relu1.c b/source/operator/prototype/relu1.c index 96c151888..8ef7f40e1 100644 --- a/source/operator/prototype/relu1.c +++ b/source/operator/prototype/relu1.c @@ -22,7 +22,6 @@ * Author: bzhang@openailab.com */ - #include "api/c_api.h" #include "graph/tensor.h" #include "graph/node.h" @@ -30,7 +29,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* ir_graph = node->graph; @@ -42,7 +40,6 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { op->same_shape = 0; @@ -51,9 +48,9 @@ static int init_op(ir_op_t* op) return 0; } - -static void release_op(ir_op_t* op) {} - +static void release_op(ir_op_t* op) +{ +} int register_relu1_op() { @@ -66,7 +63,6 @@ int register_relu1_op() return register_op(OP_RELU1, OP_RELU1_NAME, &m); } - int unregister_relu1_op() { return unregister_op(OP_RELU1, 1); diff --git a/source/operator/prototype/relu6.c b/source/operator/prototype/relu6.c index b78a28d39..e2ff0d269 100644 --- a/source/operator/prototype/relu6.c +++ b/source/operator/prototype/relu6.c @@ -31,7 +31,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* ir_graph = node->graph; @@ -43,7 +42,6 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { op->same_shape = 0; @@ -52,9 +50,9 @@ static int init_op(ir_op_t* op) return 0; } - -static void release_op(ir_op_t* op) {} - +static void release_op(ir_op_t* op) +{ +} int register_relu6_op() { @@ -67,7 +65,6 @@ int register_relu6_op() return register_op(OP_RELU6, OP_RELU6_NAME, &m); } - int unregister_relu6_op() { return unregister_op(OP_RELU6, 1); diff --git a/source/operator/prototype/reorg.c b/source/operator/prototype/reorg.c index d61dc30c3..b526ab224 100644 --- a/source/operator/prototype/reorg.c +++ b/source/operator/prototype/reorg.c @@ -31,13 +31,12 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct reorg_param* reorg_param = ( struct reorg_param* )(node->op.param_mem); + struct reorg_param* reorg_param = (struct reorg_param*)(node->op.param_mem); int stride = reorg_param->stride; @@ -58,10 +57,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct reorg_param* reorg_param = ( struct reorg_param* )sys_malloc(sizeof(struct reorg_param)); + struct reorg_param* reorg_param = (struct reorg_param*)sys_malloc(sizeof(struct reorg_param)); if (reorg_param == NULL) { @@ -79,13 +77,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_reorg_op() { struct method m; @@ -97,7 +93,6 @@ int register_reorg_op() return register_op(OP_REORG, OP_REORG_NAME, &m); } - int unregister_reorg_op() { return unregister_op(OP_REORG, 1); diff --git a/source/operator/prototype/reshape.c b/source/operator/prototype/reshape.c index 7b2f31303..9c9252153 100644 --- a/source/operator/prototype/reshape.c +++ b/source/operator/prototype/reshape.c @@ -34,10 +34,9 @@ #include - static int infer_shape(struct node* node) { - reshape_param_t* param = ( struct reshape_param* )(node->op.param_mem); + reshape_param_t* param = (struct reshape_param*)(node->op.param_mem); struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); @@ -59,13 +58,13 @@ static int infer_shape(struct node* node) if (param->is_mxnet) { int temp = input->dims[in_idx]; - push_vector_data(new_shape, ( void* )&temp); + push_vector_data(new_shape, (void*)&temp); } else { int temp = 1; if (i == 0) - push_vector_data(new_shape, ( void* )&temp); + push_vector_data(new_shape, (void*)&temp); } in_idx++; @@ -73,20 +72,20 @@ static int infer_shape(struct node* node) else if (-1 == param->re_shape[i]) { int temp = -1; - push_vector_data(new_shape, ( void* )&temp); + push_vector_data(new_shape, (void*)&temp); in_idx++; } else if (-2 == param->re_shape[i]) { for (; in_idx < input_dim_size; ++in_idx) { - push_vector_data(new_shape, ( void* )&input->dims[in_idx]); + push_vector_data(new_shape, (void*)&input->dims[in_idx]); } } else if (-3 == param->re_shape[i]) { int temp = input->dims[in_idx] * input->dims[in_idx + 1]; - push_vector_data(new_shape, ( void* )&temp); + push_vector_data(new_shape, (void*)&temp); in_idx = in_idx + 2; } else if (-4 == param->re_shape[i]) @@ -94,14 +93,14 @@ static int infer_shape(struct node* node) int muti_val = param->re_shape[i + 1]; if (muti_val == -1) muti_val = 1; - push_vector_data(new_shape, ( void* )&muti_val); - push_vector_data(new_shape, ( void* )¶m->re_shape[i + 2]); + push_vector_data(new_shape, (void*)&muti_val); + push_vector_data(new_shape, (void*)¶m->re_shape[i + 2]); i = i + 2; in_idx++; } else { - push_vector_data(new_shape, ( void* )¶m->re_shape[i]); + push_vector_data(new_shape, (void*)¶m->re_shape[i]); in_idx++; } } @@ -110,7 +109,7 @@ static int infer_shape(struct node* node) int dim_size = get_vector_num(new_shape); for (int i = 0; i < dim_size; i++) { - int temp = (( int* )get_vector_data(new_shape, i))[0]; + int temp = ((int*)get_vector_data(new_shape, i))[0]; if (temp == -1) idx = i; else @@ -120,12 +119,12 @@ static int infer_shape(struct node* node) if (idx >= 0) { int temp = size / new_size; - set_vector_data(new_shape, idx, ( void* )&temp); + set_vector_data(new_shape, idx, (void*)&temp); } - if ((( int* )get_vector_data(new_shape, 0))[0] == -1 && get_vector_num(new_shape) == 1) + if (((int*)get_vector_data(new_shape, 0))[0] == -1 && get_vector_num(new_shape) == 1) { - set_vector_data(new_shape, 0, ( void* )&size); + set_vector_data(new_shape, 0, (void*)&size); } if (param->reverse) @@ -145,16 +144,17 @@ static int infer_shape(struct node* node) } new_size = 1; - int* new_shape_temp = ( int* )sys_malloc(get_vector_num(new_shape) * sizeof(int)); + int* new_shape_temp = (int*)sys_malloc(get_vector_num(new_shape) * sizeof(int)); for (int i = 0; i < get_vector_num(new_shape); i++) { - int* a = ( int* )get_vector_data(new_shape, i); + int* a = (int*)get_vector_data(new_shape, i); new_shape_temp[i] = *a; new_size *= new_shape_temp[i]; } // check input and reshaped size - if (new_size != size) { + if (new_size != size) + { TLOG_ERR("Error: input elem num(%d) != reshaped elem num(%d)\n", size, new_size); return -1; } @@ -169,10 +169,9 @@ static int infer_shape(struct node* node) return ret; } - static int init_op(struct op* op) { - struct reshape_param* reshape_param = ( struct reshape_param* )sys_malloc(sizeof(struct reshape_param)); + struct reshape_param* reshape_param = (struct reshape_param*)sys_malloc(sizeof(struct reshape_param)); if (reshape_param == NULL) { @@ -189,10 +188,9 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { - struct reshape_param* reshape_param = ( struct reshape_param* )op->param_mem; + struct reshape_param* reshape_param = (struct reshape_param*)op->param_mem; if (reshape_param->re_shape) sys_free(reshape_param->re_shape); @@ -200,7 +198,6 @@ static void release_op(struct op* op) sys_free(op->param_mem); } - int register_reshape_op() { struct method m; @@ -212,7 +209,6 @@ int register_reshape_op() return register_op(OP_RESHAPE, OP_RESHAPE_NAME, &m); } - int unregister_reshape_op() { return unregister_op(OP_RESHAPE, 1); diff --git a/source/operator/prototype/resize.c b/source/operator/prototype/resize.c index df40dd8d2..676691c6b 100644 --- a/source/operator/prototype/resize.c +++ b/source/operator/prototype/resize.c @@ -32,26 +32,25 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct resize_param* resize_param = ( struct resize_param* )(node->op.param_mem); + struct resize_param* resize_param = (struct resize_param*)(node->op.param_mem); int dims[4]; dims[0] = input->dims[0]; if (graph->graph_layout == TENGINE_LAYOUT_NCHW) { dims[1] = input->dims[1]; - dims[2] = ( int )(input->dims[2] * resize_param->scale_h); - dims[3] = ( int )(input->dims[3] * resize_param->scale_w); + dims[2] = (int)(input->dims[2] * resize_param->scale_h); + dims[3] = (int)(input->dims[3] * resize_param->scale_w); } else if (graph->graph_layout == TENGINE_LAYOUT_NHWC) { - dims[1] = ( int )(input->dims[1] * resize_param->scale_h); - dims[2] = ( int )(input->dims[2] * resize_param->scale_w); + dims[1] = (int)(input->dims[1] * resize_param->scale_h); + dims[2] = (int)(input->dims[2] * resize_param->scale_w); dims[3] = input->dims[3]; } else @@ -65,10 +64,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct resize_param* resize_param = ( struct resize_param* )sys_malloc(sizeof(struct resize_param)); + struct resize_param* resize_param = (struct resize_param*)sys_malloc(sizeof(struct resize_param)); if (resize_param == NULL) { @@ -88,13 +86,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_resize_op() { struct method m; @@ -106,7 +102,6 @@ int register_resize_op() return register_op(OP_RESIZE, OP_RESIZE_NAME, &m); } - int unregister_resize_op() { return unregister_op(OP_RESIZE, 1); diff --git a/source/operator/prototype/resize_param.h b/source/operator/prototype/resize_param.h index 49c069bfb..d0748eef7 100644 --- a/source/operator/prototype/resize_param.h +++ b/source/operator/prototype/resize_param.h @@ -29,7 +29,7 @@ struct resize_param { float scale_w; float scale_h; - int type; // 0 for NEAREST_NEIGHBOR // 1 for BILIEAR + int type; // 0 for NEAREST_NEIGHBOR // 1 for BILIEAR }; #endif diff --git a/source/operator/prototype/reverse.c b/source/operator/prototype/reverse.c index 4098491d7..31fc1406f 100644 --- a/source/operator/prototype/reverse.c +++ b/source/operator/prototype/reverse.c @@ -27,7 +27,6 @@ #include "graph/graph.h" #include "module/module.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; @@ -39,7 +38,6 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { op->same_shape = 0; @@ -48,13 +46,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { // sys_free(op->param_mem); } - int register_reverse_op() { struct method m; @@ -66,7 +62,6 @@ int register_reverse_op() return register_op(OP_REVERSE, OP_REVERSE_NAME, &m); } - int unregister_reverse_op() { return unregister_op(OP_REVERSE, 1); diff --git a/source/operator/prototype/rnn.c b/source/operator/prototype/rnn.c index f037f95f9..0973a9f24 100644 --- a/source/operator/prototype/rnn.c +++ b/source/operator/prototype/rnn.c @@ -31,13 +31,12 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct rnn_param* rnn_param = ( struct rnn_param* )(node->op.param_mem); + struct rnn_param* rnn_param = (struct rnn_param*)(node->op.param_mem); int dims[3]; // input tensors: @@ -57,10 +56,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct rnn_param* rnn_param = ( struct rnn_param* )sys_malloc(sizeof(struct rnn_param)); + struct rnn_param* rnn_param = (struct rnn_param*)sys_malloc(sizeof(struct rnn_param)); if (rnn_param == NULL) { @@ -77,13 +75,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_rnn_op() { struct method m; @@ -95,7 +91,6 @@ int register_rnn_op() return register_op(OP_RNN, OP_RNN_NAME, &m); } - int unregister_rnn_op() { return unregister_op(OP_RNN, 1); diff --git a/source/operator/prototype/roialign.c b/source/operator/prototype/roialign.c index 999ec7afa..74b6b37ed 100644 --- a/source/operator/prototype/roialign.c +++ b/source/operator/prototype/roialign.c @@ -31,14 +31,13 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct roialign_param* roialign_param = ( struct roialign_param* )(node->op.param_mem); + struct roialign_param* roialign_param = (struct roialign_param*)(node->op.param_mem); int out_dim[4]; @@ -52,10 +51,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct roialign_param* roialign_param = ( struct roialign_param* )sys_malloc(sizeof(struct roialign_param)); + struct roialign_param* roialign_param = (struct roialign_param*)sys_malloc(sizeof(struct roialign_param)); if (roialign_param == NULL) { @@ -75,13 +73,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_roialign_op() { struct method m; @@ -90,11 +86,9 @@ int register_roialign_op() m.init = init_op; m.release = release_op; - return register_op(OP_ROIALIGN, OP_ROIALIGN_NAME, &m); } - int unregister_roialign_op() { return unregister_op(OP_ROIALIGN, 1); diff --git a/source/operator/prototype/roipooling.c b/source/operator/prototype/roipooling.c index 16cf9de0d..b2e13791c 100644 --- a/source/operator/prototype/roipooling.c +++ b/source/operator/prototype/roipooling.c @@ -31,13 +31,12 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); - struct roipooling_param* roipooling_param = ( struct roipooling_param* )node->op.param_mem; + struct roipooling_param* roipooling_param = (struct roipooling_param*)node->op.param_mem; int dims[4]; @@ -51,10 +50,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct roipooling_param* roipooling_param = ( struct roipooling_param* )sys_malloc(sizeof(struct roipooling_param)); + struct roipooling_param* roipooling_param = (struct roipooling_param*)sys_malloc(sizeof(struct roipooling_param)); if (roipooling_param == NULL) { @@ -72,13 +70,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_roipooling_op() { struct method m; diff --git a/source/operator/prototype/round.c b/source/operator/prototype/round.c index a938b2f58..491637253 100644 --- a/source/operator/prototype/round.c +++ b/source/operator/prototype/round.c @@ -27,7 +27,6 @@ #include "graph/graph.h" #include "module/module.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -39,7 +38,6 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { op->same_shape = 0; @@ -48,13 +46,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { // sys_free(op->param_mem); } - int register_round_op() { struct method m; @@ -66,7 +62,6 @@ int register_round_op() return register_op(OP_ROUND, OP_ROUND_NAME, &m); } - int unregister_round_op() { return unregister_op(OP_ROUND, 1); diff --git a/source/operator/prototype/rpn.c b/source/operator/prototype/rpn.c index 424416a67..762653b65 100644 --- a/source/operator/prototype/rpn.c +++ b/source/operator/prototype/rpn.c @@ -34,7 +34,6 @@ #include - void mkanchor(float w, float h, float x_ctr, float y_ctr, Anchor_t* tmp) { tmp->x0 = (x_ctr - 0.5f * (w - 1)); @@ -43,7 +42,6 @@ void mkanchor(float w, float h, float x_ctr, float y_ctr, Anchor_t* tmp) tmp->y1 = (y_ctr + 0.5f * (h - 1)); } - void whctrs(const Anchor_t anchor, Box_t* result) { result->w = (anchor.x1 - anchor.x0 + 1); @@ -52,41 +50,38 @@ void whctrs(const Anchor_t anchor, Box_t* result) result->cy = ((anchor.y1 + anchor.y0) * 0.5f); } - void scale_enum(const Anchor_t anchor, const struct vector* anchor_scales_, struct vector* result) { Box_t tmp_box; whctrs(anchor, &tmp_box); - for (int i = 0; i < ( int )anchor_scales_->elem_num; ++i) + for (int i = 0; i < (int)anchor_scales_->elem_num; ++i) { Anchor_t tmp; - float as_val = *( float* )(get_vector_data(( struct vector* )anchor_scales_, i)); + float as_val = *(float*)(get_vector_data((struct vector*)anchor_scales_, i)); mkanchor(tmp_box.w * as_val, tmp_box.h * as_val, tmp_box.cx, tmp_box.cy, &tmp); push_vector_data(result, &tmp); } } - void ratio_enum(const Anchor_t anchor, const struct vector* ratios_, struct vector* result) { Box_t tmp_box; whctrs(anchor, &tmp_box); float area = tmp_box.h * tmp_box.w; - for (int i = 0; i < ( int )ratios_->elem_num; ++i) + for (int i = 0; i < (int)ratios_->elem_num; ++i) { - float size_ratio = area / *( float* )(get_vector_data(( struct vector* )ratios_, i)); + float size_ratio = area / *(float*)(get_vector_data((struct vector*)ratios_, i)); Anchor_t tmp; float new_w = roundf(sqrt(size_ratio)); - float new_h = roundf(new_w * *( float* )(get_vector_data(( struct vector* )ratios_, i))); + float new_h = roundf(new_w * *(float*)(get_vector_data((struct vector*)ratios_, i))); mkanchor(new_w, new_h, tmp_box.cx, tmp_box.cy, &tmp); push_vector_data(result, &tmp); } } - void generate_anchors(const int base_size, const struct vector* ratios_, const struct vector* scales_, struct vector* gen_anchors_) { @@ -99,14 +94,14 @@ void generate_anchors(const int base_size, const struct vector* ratios_, const s struct vector* ratio_anchors = create_vector(sizeof(struct Anchor), NULL); ratio_enum(base_anchor, ratios_, ratio_anchors); - for (int i = 0; i < ( int )ratio_anchors->elem_num; ++i) + for (int i = 0; i < (int)ratio_anchors->elem_num; ++i) { struct vector* scale_anchors = create_vector(sizeof(struct Anchor), NULL); - scale_enum(*( Anchor_t* )get_vector_data(ratio_anchors, i), scales_, scale_anchors); + scale_enum(*(Anchor_t*)get_vector_data(ratio_anchors, i), scales_, scale_anchors); for (int j = 0; j < scale_anchors->elem_num; j++) { - Anchor_t tmp_s = *( Anchor_t* )get_vector_data(scale_anchors, j); + Anchor_t tmp_s = *(Anchor_t*)get_vector_data(scale_anchors, j); push_vector_data(gen_anchors_, &tmp_s); } @@ -121,7 +116,7 @@ static int infer_shape(struct node* node) struct graph* ir_graph = node->graph; struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); - rpn_param_t* rpn_param = ( rpn_param_t* )node->op.param_mem; + rpn_param_t* rpn_param = (rpn_param_t*)node->op.param_mem; rpn_param->anchors_ = create_vector(sizeof(struct Anchor), NULL); generate_anchors(rpn_param->basesize, rpn_param->ratios, rpn_param->anchor_scales, rpn_param->anchors_); @@ -136,10 +131,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct rpn_param* rpn_param = ( struct rpn_param* )sys_malloc(sizeof(struct rpn_param)); + struct rpn_param* rpn_param = (struct rpn_param*)sys_malloc(sizeof(struct rpn_param)); if (rpn_param == NULL) { @@ -160,10 +154,9 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { - struct rpn_param* rpn_param = ( struct rpn_param* )op->param_mem; + struct rpn_param* rpn_param = (struct rpn_param*)op->param_mem; if (rpn_param->anchors_) release_vector(rpn_param->anchors_); @@ -175,7 +168,6 @@ static void release_op(struct op* op) sys_free(op->param_mem); } - int register_rpn_op() { struct method m; @@ -184,11 +176,9 @@ int register_rpn_op() m.init = init_op; m.release = release_op; - return register_op(OP_RPN, OP_RPN_NAME, &m); } - int unregister_rpn_op() { return unregister_op(OP_RPN, 1); diff --git a/source/operator/prototype/scale.c b/source/operator/prototype/scale.c index b51ad1e2e..0eaa7fc5a 100644 --- a/source/operator/prototype/scale.c +++ b/source/operator/prototype/scale.c @@ -31,7 +31,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* ir_graph = node->graph; @@ -43,10 +42,9 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { - struct scale_param* scale_param = ( struct scale_param* )sys_malloc(sizeof(struct scale_param)); + struct scale_param* scale_param = (struct scale_param*)sys_malloc(sizeof(struct scale_param)); if (scale_param == NULL) { @@ -66,13 +64,11 @@ static int init_op(ir_op_t* op) return 0; } - static void release_op(ir_op_t* op) { sys_free(op->param_mem); } - int register_scale_op() { ir_method_t m; @@ -84,7 +80,6 @@ int register_scale_op() return register_op(OP_SCALE, OP_SCALE_NAME, &m); } - int unregister_scale_op() { return unregister_op(OP_SCALE, 1); diff --git a/source/operator/prototype/scatter.c b/source/operator/prototype/scatter.c index 24b7baa8a..1c98cf9e6 100644 --- a/source/operator/prototype/scatter.c +++ b/source/operator/prototype/scatter.c @@ -31,7 +31,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -42,7 +41,6 @@ static int infer_shape(struct node* node) return ret; } - static int init_op(struct op* op) { struct scatter_param* scatter_param = (struct scatter_param*)sys_malloc(sizeof(struct scatter_param)); @@ -63,13 +61,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_scatter_op() { struct method m; @@ -78,11 +74,9 @@ int register_scatter_op() m.release = release_op; return register_op(OP_SCATTER, OP_SCATTER_NAME, &m); - } - int unregister_scatter_op() { - return unregister_op(OP_SCATTER,1); + return unregister_op(OP_SCATTER, 1); } diff --git a/source/operator/prototype/selu.c b/source/operator/prototype/selu.c index d4bd5e479..4422d4ced 100644 --- a/source/operator/prototype/selu.c +++ b/source/operator/prototype/selu.c @@ -31,7 +31,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* ir_graph = node->graph; @@ -43,10 +42,9 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(struct op* op) { - struct selu_param* selu_param = ( struct selu_param* )sys_malloc(sizeof(struct selu_param)); + struct selu_param* selu_param = (struct selu_param*)sys_malloc(sizeof(struct selu_param)); if (selu_param == NULL) { @@ -65,13 +63,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_selu_op() { struct method m; @@ -83,7 +79,6 @@ int register_selu_op() return register_op(OP_SELU, OP_SELU_NAME, &m); } - int unregister_selu_op() { return unregister_op(OP_SELU, 1); diff --git a/source/operator/prototype/shape.c b/source/operator/prototype/shape.c index 365ae6a5e..68dcde0dd 100644 --- a/source/operator/prototype/shape.c +++ b/source/operator/prototype/shape.c @@ -27,7 +27,6 @@ #include "graph/graph.h" #include "module/module.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* ir_graph = node->graph; @@ -39,7 +38,6 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { op->same_shape = 0; @@ -48,9 +46,9 @@ static int init_op(ir_op_t* op) return 0; } - -static void release_op(ir_op_t* op) {} - +static void release_op(ir_op_t* op) +{ +} int register_shape_op() { @@ -63,7 +61,6 @@ int register_shape_op() return register_op(OP_SHAPE, OP_SHAPE_NAME, &m); } - int unregister_shape_op() { return unregister_op(OP_SHAPE, 1); diff --git a/source/operator/prototype/shuffle_channel.c b/source/operator/prototype/shuffle_channel.c index ba0898575..bd23d739b 100644 --- a/source/operator/prototype/shuffle_channel.c +++ b/source/operator/prototype/shuffle_channel.c @@ -31,7 +31,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -43,11 +42,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct shuffle_channel_param* param = - ( struct shuffle_channel_param* )sys_malloc(sizeof(struct shuffle_channel_param)); + struct shuffle_channel_param* param = (struct shuffle_channel_param*)sys_malloc(sizeof(struct shuffle_channel_param)); if (param == NULL) { @@ -66,13 +63,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_shuffle_channel_op() { struct method m; @@ -84,7 +79,6 @@ int register_shuffle_channel_op() return register_op(OP_SHUFFLECHANNEL, OP_SHUFFLECHANNEL_NAME, &m); } - int unregister_shuffle_channel_op() { return unregister_op(OP_SHUFFLECHANNEL, 1); diff --git a/source/operator/prototype/sigmoid.c b/source/operator/prototype/sigmoid.c index bbb249777..13eabcc7f 100644 --- a/source/operator/prototype/sigmoid.c +++ b/source/operator/prototype/sigmoid.c @@ -28,7 +28,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* ir_graph = node->graph; @@ -40,7 +39,6 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { op->same_shape = 0; @@ -49,13 +47,11 @@ static int init_op(ir_op_t* op) return 0; } - static void release_op(ir_op_t* op) { // sys_free(op->param_mem); } - int register_sigmoid_op() { ir_method_t m; @@ -68,7 +64,6 @@ int register_sigmoid_op() return register_op(OP_SIGMOID, OP_SIGMOID_NAME, &m); } - int unregister_sigmoid_op() { // sys_free(GET_PARAM_PARSE_MAP(sigmoid_param)); diff --git a/source/operator/prototype/slice.c b/source/operator/prototype/slice.c index 67a61eab5..018369094 100644 --- a/source/operator/prototype/slice.c +++ b/source/operator/prototype/slice.c @@ -31,19 +31,20 @@ #include "module/module.h" #include "utility/vector.h" #include "utility/sys_port.h" -#include "utility/log.h" // for: TLOG_ERR +#include "utility/log.h" // for: TLOG_ERR static int infer_shape(ir_node_t* node) { ir_graph_t* ir_graph = node->graph; ir_tensor_t* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); - struct slice_param* slice_param = ( struct slice_param* )(node->op.param_mem); + struct slice_param* slice_param = (struct slice_param*)(node->op.param_mem); int dims_len = input->dim_num; int dims_in[TE_MAX_SHAPE_DIM_NUM * 2]; // Check: axis must be in the range: [-input->dim_num, input->dim_num) // Note: Here we always assume 0 <= input->dim_num - if (slice_param->axis < -input->dim_num || input->dim_num <= slice_param->axis) { + if (slice_param->axis < -input->dim_num || input->dim_num <= slice_param->axis) + { TLOG_ERR("Input slice axis %d not to be supported.\n", slice_param->axis); return -1; } @@ -67,8 +68,8 @@ static int infer_shape(ir_node_t* node) unsigned int i = 0; for (; i < slice_param->slice_point_->elem_num; ++i) { - dims_in[slice_axis] = (*( int* )get_vector_data(slice_param->slice_point_, i) - prev); - prev = *( int* )get_vector_data(slice_param->slice_point_, i); + dims_in[slice_axis] = (*(int*)get_vector_data(slice_param->slice_point_, i) - prev); + prev = *(int*)get_vector_data(slice_param->slice_point_, i); set_ir_tensor_shape(get_ir_graph_tensor(ir_graph, node->output_tensors[i]), dims_in, dims_len); } // The last one @@ -80,7 +81,7 @@ static int infer_shape(ir_node_t* node) int out_num = node->output_num; if (dims_in[slice_axis] % out_num != 0) return -1; - if (slice_axis > ( int )dims_len) + if (slice_axis > (int)dims_len) return -1; dims_in[slice_axis] = dims_in[slice_axis] / out_num; for (int i = 0; i < out_num; i++) @@ -158,22 +159,20 @@ static int infer_shape(ir_node_t* node) int dim_len = input->dim_num; int out_dims[TE_MAX_SHAPE_DIM_NUM * 2]; // input shape size must be equal to begin and size's size; - if ((slice_param->size_->elem_num != slice_param->begin_->elem_num) || - (slice_param->size_->elem_num != dim_len)) + if ((slice_param->size_->elem_num != slice_param->begin_->elem_num) || (slice_param->size_->elem_num != dim_len)) return -1; for (unsigned int i = 0; i < dim_len; i++) { - out_dims[i] = *( int* )get_vector_data(slice_param->size_, i); + out_dims[i] = *(int*)get_vector_data(slice_param->size_, i); } set_ir_tensor_shape(get_ir_graph_tensor(ir_graph, node->output_tensors[0]), out_dims, dim_len); } return 0; } - static int init_op(ir_op_t* op) { - slice_param_t* slice_param = ( slice_param_t* )sys_malloc(sizeof(slice_param_t)); + slice_param_t* slice_param = (slice_param_t*)sys_malloc(sizeof(slice_param_t)); if (slice_param == NULL) { @@ -194,10 +193,9 @@ static int init_op(ir_op_t* op) return 0; } - static void release_op(ir_op_t* op) { - slice_param_t* slice_param = ( slice_param_t* )op->param_mem; + slice_param_t* slice_param = (slice_param_t*)op->param_mem; if (slice_param->slice_point_) release_vector(slice_param->slice_point_); @@ -209,7 +207,6 @@ static void release_op(ir_op_t* op) sys_free(op->param_mem); } - int register_slice_op() { ir_method_t m; @@ -221,7 +218,6 @@ int register_slice_op() return register_op(OP_SLICE, OP_SLICE_NAME, &m); } - int unregister_slice_op() { return unregister_op(OP_SLICE, 1); diff --git a/source/operator/prototype/slice_param.h b/source/operator/prototype/slice_param.h index dad3a6f96..1e1e8c605 100644 --- a/source/operator/prototype/slice_param.h +++ b/source/operator/prototype/slice_param.h @@ -26,7 +26,6 @@ #include "stdint.h" - typedef struct slice_param { struct vector* slice_point_; diff --git a/source/operator/prototype/softmax.c b/source/operator/prototype/softmax.c index e0cf47d63..e9e3a3a65 100644 --- a/source/operator/prototype/softmax.c +++ b/source/operator/prototype/softmax.c @@ -31,7 +31,6 @@ #include "utility/vector.h" #include "utility/sys_port.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* ir_graph = node->graph; @@ -45,10 +44,9 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { - struct softmax_param* softmax_param = ( struct softmax_param* )sys_malloc(sizeof(struct softmax_param)); + struct softmax_param* softmax_param = (struct softmax_param*)sys_malloc(sizeof(struct softmax_param)); if (softmax_param == NULL) { @@ -66,13 +64,11 @@ static int init_op(ir_op_t* op) return 0; } - static void release_op(ir_op_t* op) { sys_free(op->param_mem); } - int register_softmax_op() { ir_method_t m; @@ -84,7 +80,6 @@ int register_softmax_op() return register_op(OP_SOFTMAX, OP_SOFTMAX_NAME, &m); } - int unregister_softmax_op() { return unregister_op(OP_SOFTMAX, 1); diff --git a/source/operator/prototype/softplus.c b/source/operator/prototype/softplus.c index 8c5754015..0ffc08336 100644 --- a/source/operator/prototype/softplus.c +++ b/source/operator/prototype/softplus.c @@ -29,7 +29,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -49,7 +48,9 @@ static int init_op(struct op* op) return 0; } -static void release_op(struct op* op) {} +static void release_op(struct op* op) +{ +} int register_softplus_op() { diff --git a/source/operator/prototype/spacetobatchnd.c b/source/operator/prototype/spacetobatchnd.c index aeb4fa5b7..5e94c9397 100644 --- a/source/operator/prototype/spacetobatchnd.c +++ b/source/operator/prototype/spacetobatchnd.c @@ -31,7 +31,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct spacetobatchnd_param* spacetobatchnd_param = (struct spacetobatchnd_param*)(node->op.param_mem); @@ -43,10 +42,8 @@ static int infer_shape(struct node* node) int out_dim[4]; out_dim[0] = input->dims[0] * (spacetobatchnd_param->dilation_x) * (spacetobatchnd_param->dilation_y); - out_dim[1] = (input->dims[1] + spacetobatchnd_param->pad_top + spacetobatchnd_param->pad_bottom) / - spacetobatchnd_param->dilation_y; - out_dim[2] = (input->dims[2] + spacetobatchnd_param->pad_left + spacetobatchnd_param->pad_right) / - spacetobatchnd_param->dilation_x; + out_dim[1] = (input->dims[1] + spacetobatchnd_param->pad_top + spacetobatchnd_param->pad_bottom) / spacetobatchnd_param->dilation_y; + out_dim[2] = (input->dims[2] + spacetobatchnd_param->pad_left + spacetobatchnd_param->pad_right) / spacetobatchnd_param->dilation_x; out_dim[3] = input->dims[3]; set_ir_tensor_shape(output, out_dim, 4); @@ -54,11 +51,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct spacetobatchnd_param* spacetobatchnd_param = - ( struct spacetobatchnd_param* )sys_malloc(sizeof(struct spacetobatchnd_param)); + struct spacetobatchnd_param* spacetobatchnd_param = (struct spacetobatchnd_param*)sys_malloc(sizeof(struct spacetobatchnd_param)); if (spacetobatchnd_param == NULL) { @@ -81,13 +76,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_spacetobatchnd_op() { struct method m; @@ -99,7 +92,6 @@ int register_spacetobatchnd_op() return register_op(OP_SPACETOBATCHND, OP_SPACETOBATCHND_NAME, &m); } - int unregister_spacetobatchnd_op() { return unregister_op(OP_SPACETOBATCHND, 1); diff --git a/source/operator/prototype/spacetodepth.c b/source/operator/prototype/spacetodepth.c index 5680da626..85e4a5e9e 100644 --- a/source/operator/prototype/spacetodepth.c +++ b/source/operator/prototype/spacetodepth.c @@ -31,10 +31,9 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { - struct spacetodepth_param* spacetodepth_param = ( struct spacetodepth_param* )(node->op.param_mem); + struct spacetodepth_param* spacetodepth_param = (struct spacetodepth_param*)(node->op.param_mem); struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); @@ -44,21 +43,19 @@ static int infer_shape(struct node* node) int dims[4]; int block_size = spacetodepth_param->block_size; - dims[0] = input->dims[0]; // batch - dims[1] = input->dims[1] * (block_size * block_size); // channel - dims[2] = input->dims[2] / block_size; // height - dims[3] = input->dims[3] / block_size; // width + dims[0] = input->dims[0]; // batch + dims[1] = input->dims[1] * (block_size * block_size); // channel + dims[2] = input->dims[2] / block_size; // height + dims[3] = input->dims[3] / block_size; // width set_ir_tensor_shape(output, dims, 4); return 0; } - static int init_op(struct op* op) { - struct spacetodepth_param* spacetodepth_param = - ( struct spacetodepth_param* )sys_malloc(sizeof(struct spacetodepth_param)); + struct spacetodepth_param* spacetodepth_param = (struct spacetodepth_param*)sys_malloc(sizeof(struct spacetodepth_param)); if (spacetodepth_param == NULL) { @@ -76,13 +73,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_spacetodepth_op() { struct method m; @@ -91,11 +86,9 @@ int register_spacetodepth_op() m.init = init_op; m.release = release_op; - return register_op(OP_SPACETODEPTH, OP_SPACETODEPTH_NAME, &m); } - int unregister_spacetodepth_op() { return unregister_op(OP_SPACETODEPTH, 1); diff --git a/source/operator/prototype/sparsetodense.c b/source/operator/prototype/sparsetodense.c index e9539802c..af8dffa45 100644 --- a/source/operator/prototype/sparsetodense.c +++ b/source/operator/prototype/sparsetodense.c @@ -31,10 +31,9 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { - struct sparsetodense_param* sparsetodense_param = ( struct sparsetodense_param* )(node->op.param_mem); + struct sparsetodense_param* sparsetodense_param = (struct sparsetodense_param*)(node->op.param_mem); struct graph* graph = node->graph; struct tensor* input0 = get_ir_graph_tensor(graph, node->input_tensors[0]); @@ -67,7 +66,6 @@ static int infer_shape(struct node* node) } } - static int init_op(struct op* op) { struct sparsetodense_param* sparsetodense_param = (struct sparsetodense_param*)sys_malloc(sizeof(struct sparsetodense_param)); @@ -90,13 +88,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_sparsetodense_op() { struct method m; @@ -108,7 +104,6 @@ int register_sparsetodense_op() return register_op(OP_SPARSETODENSE, OP_SPARSETODENSE_NAME, &m); } - int unregister_sparsetodense_op() { return unregister_op(OP_SPARSETODENSE, 1); diff --git a/source/operator/prototype/spatialtransformer.c b/source/operator/prototype/spatialtransformer.c index 41b089e84..f3962bd1d 100644 --- a/source/operator/prototype/spatialtransformer.c +++ b/source/operator/prototype/spatialtransformer.c @@ -34,10 +34,9 @@ #include - static int infer_shape(struct node* node) { - struct spatialtransformer_param* param = ( struct spatialtransformer_param* )(node->op.param_mem); + struct spatialtransformer_param* param = (struct spatialtransformer_param*)(node->op.param_mem); struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); @@ -45,25 +44,27 @@ static int infer_shape(struct node* node) struct vector* new_shape = create_vector(sizeof(int), NULL); int dim_size = 2; - for(int i = 0; i < dim_size; i++ ){ + for (int i = 0; i < dim_size; i++) + { int shape = param->target_shape[i]; push_vector_data(new_shape, (void*)&shape); } - int out_dim_size =4; - int* new_shape_temp = ( int* )sys_malloc(out_dim_size * sizeof(int)); + int out_dim_size = 4; + int* new_shape_temp = (int*)sys_malloc(out_dim_size * sizeof(int)); - if(dim_size == 2){ + if (dim_size == 2) + { for (int i = 0; i < get_vector_num(new_shape); i++) { - int* a = ( int* )get_vector_data(new_shape, i); - new_shape_temp[i+dim_size] = *a; + int* a = (int*)get_vector_data(new_shape, i); + new_shape_temp[i + dim_size] = *a; } new_shape_temp[0] = 1; new_shape_temp[1] = input->dims[1]; } - output->layout = input->layout; + output->layout = input->layout; int ret = set_ir_tensor_shape(output, new_shape_temp, out_dim_size); sys_free(new_shape_temp); @@ -71,7 +72,6 @@ static int infer_shape(struct node* node) return ret; } - static int init_op(struct op* op) { struct spatialtransformer_param* spatialtransformer_param = (struct spatialtransformer_param*)sys_malloc(sizeof(struct spatialtransformer_param)); @@ -95,20 +95,16 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { - - struct spatialtransformer_param* param = ( struct spatialtransformer_param* )op->param_mem; + struct spatialtransformer_param* param = (struct spatialtransformer_param*)op->param_mem; if (param->target_shape) sys_free(param->target_shape); sys_free(op->param_mem); - } - int register_spatialtransformer_op() { struct method m; @@ -120,7 +116,6 @@ int register_spatialtransformer_op() return register_op(OP_SPATIALTRANSFORMER, OP_SPATIALTRANSFORMER_NAME, &m); } - int unregister_spatialtransformer_op() { return unregister_op(OP_SPATIALTRANSFORMER, 1); diff --git a/source/operator/prototype/split.c b/source/operator/prototype/split.c index 778d4e888..295a83821 100644 --- a/source/operator/prototype/split.c +++ b/source/operator/prototype/split.c @@ -32,12 +32,11 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* graph = node->graph; ir_tensor_t* input = get_ir_graph_tensor(graph, node->input_tensors[0]); - struct split_param* split_param = ( struct split_param* )(node->op.param_mem); + struct split_param* split_param = (struct split_param*)(node->op.param_mem); int axis = split_param->axis; @@ -62,7 +61,7 @@ static int infer_shape(ir_node_t* node) for (int i = 0; i < get_vector_num(split_param->split_sizes_); i++) { - sum_check += (( int* )get_vector_data(split_param->split_sizes_, i))[0]; + sum_check += ((int*)get_vector_data(split_param->split_sizes_, i))[0]; } if (sum_check != input_slice_num) @@ -73,7 +72,7 @@ static int infer_shape(ir_node_t* node) for (int i = 0; i < get_vector_num(split_param->split_sizes_); i++) { - input_dim[axis] = (( int* )get_vector_data(split_param->split_sizes_, i))[0]; + input_dim[axis] = ((int*)get_vector_data(split_param->split_sizes_, i))[0]; ir_tensor_t* output = get_ir_graph_tensor(graph, node->output_tensors[i]); set_ir_tensor_shape(output, input_dim, input->dim_num); } @@ -121,10 +120,9 @@ static int infer_shape(ir_node_t* node) return 0; } - static int init_op(ir_op_t* op) { - struct split_param* split_param = ( struct split_param* )sys_malloc(sizeof(struct split_param)); + struct split_param* split_param = (struct split_param*)sys_malloc(sizeof(struct split_param)); if (split_param == NULL) { @@ -146,10 +144,9 @@ static int init_op(ir_op_t* op) return 0; } - static void release_op(ir_op_t* op) { - struct split_param* split_param = ( struct split_param* )op->param_mem; + struct split_param* split_param = (struct split_param*)op->param_mem; if (split_param->split_sizes_) release_vector(split_param->split_sizes_); @@ -157,7 +154,6 @@ static void release_op(ir_op_t* op) sys_free(op->param_mem); } - int register_split_op() { ir_method_t m; @@ -169,7 +165,6 @@ int register_split_op() return register_op(OP_SPLIT, OP_SPLIT_NAME, &m); } - int unregister_split_op() { return unregister_op(OP_SPLIT, 1); diff --git a/source/operator/prototype/squareddifference.c b/source/operator/prototype/squareddifference.c index e7595a2d3..d4753d5db 100644 --- a/source/operator/prototype/squareddifference.c +++ b/source/operator/prototype/squareddifference.c @@ -27,7 +27,6 @@ #include "graph/graph.h" #include "module/module.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; @@ -51,7 +50,6 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { op->same_shape = 0; @@ -60,9 +58,9 @@ static int init_op(struct op* op) return 0; } - -static void release_op(struct op* op) {} - +static void release_op(struct op* op) +{ +} int register_squareddifference_op() { @@ -75,7 +73,6 @@ int register_squareddifference_op() return register_op(OP_SQUAREDDIFFERENCE, OP_SQUAREDDIFFERENCE_NAME, &m); } - int unregister_squareddifference_op() { return unregister_op(OP_SQUAREDDIFFERENCE, 1); diff --git a/source/operator/prototype/squeeze.c b/source/operator/prototype/squeeze.c index d2a027925..36767ebd9 100644 --- a/source/operator/prototype/squeeze.c +++ b/source/operator/prototype/squeeze.c @@ -32,13 +32,12 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); - struct squeeze_param* squeeze_param = ( struct squeeze_param* )node->op.param_mem; + struct squeeze_param* squeeze_param = (struct squeeze_param*)node->op.param_mem; int in_size = input->dim_num; @@ -65,7 +64,7 @@ static int infer_shape(struct node* node) dim_size++; } - int8_t should_squeeze[4] = { 0 }; + int8_t should_squeeze[4] = {0}; int squeezeddim = 0; int newshape_size = dim_size; int real_shape[4] = {0, 2, 3, 1}; @@ -111,7 +110,7 @@ static int infer_shape(struct node* node) } } - int* odim = ( int* )sys_malloc((in_size - squeezeddim) * sizeof(int)); + int* odim = (int*)sys_malloc((in_size - squeezeddim) * sizeof(int)); int o_idx = 0; for (int i_idx = 0; i_idx < in_size; i_idx++) { @@ -125,10 +124,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct squeeze_param* squeeze_param = ( struct squeeze_param* )sys_malloc(sizeof(struct squeeze_param)); + struct squeeze_param* squeeze_param = (struct squeeze_param*)sys_malloc(sizeof(struct squeeze_param)); if (squeeze_param == NULL) { @@ -148,13 +146,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_squeeze_op() { struct method m; @@ -166,7 +162,6 @@ int register_squeeze_op() return register_op(OP_SQUEEZE, OP_SQUEEZE_NAME, &m); } - int unregister_squeeze_op() { return unregister_op(OP_SQUEEZE, 1); diff --git a/source/operator/prototype/strided_slice.c b/source/operator/prototype/strided_slice.c index 8bf798699..1a1cac9b4 100644 --- a/source/operator/prototype/strided_slice.c +++ b/source/operator/prototype/strided_slice.c @@ -33,30 +33,25 @@ #include - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); - struct strided_slice_param* param_ = ( struct strided_slice_param* )(node->op.param_mem); + struct strided_slice_param* param_ = (struct strided_slice_param*)(node->op.param_mem); - int delta_0 = (-param_->begin[0] + param_->end[0]) < 0 ? param_->begin[0] - param_->end[0] : - -param_->begin[0] + param_->end[0]; - int delta_1 = (-param_->begin[1] + param_->end[1]) < 0 ? param_->begin[1] - param_->end[1] : - -param_->begin[1] + param_->end[1]; - int delta_2 = (-param_->begin[2] + param_->end[2]) < 0 ? param_->begin[2] - param_->end[2] : - -param_->begin[2] + param_->end[2]; - int delta_3 = (-param_->begin[3] + param_->end[3]) < 0 ? param_->begin[3] - param_->end[3] : - -param_->begin[3] + param_->end[3]; + int delta_0 = (-param_->begin[0] + param_->end[0]) < 0 ? param_->begin[0] - param_->end[0] : -param_->begin[0] + param_->end[0]; + int delta_1 = (-param_->begin[1] + param_->end[1]) < 0 ? param_->begin[1] - param_->end[1] : -param_->begin[1] + param_->end[1]; + int delta_2 = (-param_->begin[2] + param_->end[2]) < 0 ? param_->begin[2] - param_->end[2] : -param_->begin[2] + param_->end[2]; + int delta_3 = (-param_->begin[3] + param_->end[3]) < 0 ? param_->begin[3] - param_->end[3] : -param_->begin[3] + param_->end[3]; int dims[4] = {0}; - dims[0] = ceil((( float )input->dims[0] - ( float )delta_0) / ( float )param_->stride[0]); - dims[1] = ceil((( float )input->dims[1] - ( float )delta_1) / ( float )param_->stride[1]); - dims[2] = ceil((( float )input->dims[2] - ( float )delta_2) / ( float )param_->stride[2]); - dims[3] = ceil((( float )input->dims[3] - ( float )delta_3) / ( float )param_->stride[3]); + dims[0] = ceil(((float)input->dims[0] - (float)delta_0) / (float)param_->stride[0]); + dims[1] = ceil(((float)input->dims[1] - (float)delta_1) / (float)param_->stride[1]); + dims[2] = ceil(((float)input->dims[2] - (float)delta_2) / (float)param_->stride[2]); + dims[3] = ceil(((float)input->dims[3] - (float)delta_3) / (float)param_->stride[3]); - for (int i=0; i<4; i++) + for (int i = 0; i < 4; i++) { if (dims[i] == 0) dims[i] = 1; @@ -67,11 +62,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct strided_slice_param* strided_slice_param = - ( struct strided_slice_param* )sys_malloc(sizeof(struct strided_slice_param)); + struct strided_slice_param* strided_slice_param = (struct strided_slice_param*)sys_malloc(sizeof(struct strided_slice_param)); if (strided_slice_param == NULL) { @@ -93,13 +86,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_strided_slice_op() { struct method m; @@ -111,7 +102,6 @@ int register_strided_slice_op() return register_op(OP_STRIDED_SLICE, OP_STRIDEDSLICE_NAME, &m); } - int unregister_strided_slice_op() { return unregister_op(OP_STRIDED_SLICE, 1); diff --git a/source/operator/prototype/swap_axis.c b/source/operator/prototype/swap_axis.c index 66d9561ca..22f4ba96b 100644 --- a/source/operator/prototype/swap_axis.c +++ b/source/operator/prototype/swap_axis.c @@ -31,13 +31,12 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); - struct swap_axis_param* swap_axis_param = ( struct swap_axis_param* )node->op.param_mem; + struct swap_axis_param* swap_axis_param = (struct swap_axis_param*)node->op.param_mem; if (swap_axis_param->dim_0 == swap_axis_param->dim_1) { @@ -59,7 +58,7 @@ static int infer_shape(struct node* node) if (swap_axis_param->dim_0 >= in_size || swap_axis_param->dim_1 >= in_size) return -1; - int* newdim = ( int* )sys_malloc(in_size * sizeof(int)); + int* newdim = (int*)sys_malloc(in_size * sizeof(int)); for (int i = 0; i < in_size; i++) { newdim[i] = input->dims[i]; @@ -72,10 +71,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct swap_axis_param* swap_axis_param = ( struct swap_axis_param* )sys_malloc(sizeof(struct swap_axis_param)); + struct swap_axis_param* swap_axis_param = (struct swap_axis_param*)sys_malloc(sizeof(struct swap_axis_param)); if (swap_axis_param == NULL) { @@ -93,13 +91,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_swap_axis_op() { struct method m; @@ -111,7 +107,6 @@ int register_swap_axis_op() return register_op(OP_SWAP_AXIS, OP_SWAP_AXIS_NAME, &m); } - int unregister_swap_axis_op() { return unregister_op(OP_SWAP_AXIS, 1); diff --git a/source/operator/prototype/tanh.c b/source/operator/prototype/tanh.c index 96cda2e32..aa40d591c 100644 --- a/source/operator/prototype/tanh.c +++ b/source/operator/prototype/tanh.c @@ -28,7 +28,6 @@ #include "module/module.h" #include "utility/vector.h" - static int infer_shape(ir_node_t* node) { ir_graph_t* ir_graph = node->graph; @@ -36,13 +35,12 @@ static int infer_shape(ir_node_t* node) ir_tensor_t* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); output->layout = input->layout; - + set_ir_tensor_shape(output, input->dims, input->dim_num); return 0; } - static int init_op(ir_op_t* op) { op->same_shape = 0; @@ -51,13 +49,11 @@ static int init_op(ir_op_t* op) return 0; } - static void release_op(ir_op_t* op) { // sys_free(op->param_mem); } - int register_tanh_op() { ir_method_t m; @@ -70,7 +66,6 @@ int register_tanh_op() return register_op(OP_TANH, OP_TANH_NAME, &m); } - int unregister_tanh_op() { // sys_free(GET_PARAM_PARSE_MAP(tanh_param)); diff --git a/source/operator/prototype/threshold.c b/source/operator/prototype/threshold.c index 3af483dd8..8d98c24af 100644 --- a/source/operator/prototype/threshold.c +++ b/source/operator/prototype/threshold.c @@ -32,7 +32,6 @@ #include - static int infer_shape(struct node* node) { struct graph* graph = node->graph; @@ -44,10 +43,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct threshold_param* param = ( struct threshold_param* )sys_malloc(sizeof(struct threshold_param)); + struct threshold_param* param = (struct threshold_param*)sys_malloc(sizeof(struct threshold_param)); if (param == NULL) { @@ -64,13 +62,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_threshold_op() { struct method m; @@ -82,7 +78,6 @@ int register_threshold_op() return register_op(OP_THRESHOLD, OP_THRESHOLD_NAME, &m); } - int unregister_threshold_op() { return unregister_op(OP_THRESHOLD, 1); diff --git a/source/operator/prototype/tile.c b/source/operator/prototype/tile.c index 10ba81625..faf647837 100644 --- a/source/operator/prototype/tile.c +++ b/source/operator/prototype/tile.c @@ -37,7 +37,6 @@ #include #endif - static int infer_shape(struct node* node) { struct tile_param* param = (struct tile_param*)node->op.param_mem; @@ -55,20 +54,20 @@ static int infer_shape(struct node* node) struct vector* reps_vector = create_vector(sizeof(int), NULL); - for(int i = 0; i < param->reps_size; i++) + for (int i = 0; i < param->reps_size; i++) { push_vector_data(reps_vector, (void*)¶m->reps[i]); } - if(frame == 0) // caffe + if (frame == 0) // caffe { int param_size = get_vector_num(reps_vector); - if(param_size != 0) + if (param_size != 0) { - for(int i = 0; i < param_size / 2; i++) + for (int i = 0; i < param_size / 2; i++) { - int temp = ((int*)get_vector_data(reps_vector,0))[0]; - int ori_reps = ((int*)get_vector_data(reps_vector, param_size -i -1))[0]; + int temp = ((int*)get_vector_data(reps_vector, 0))[0]; + int ori_reps = ((int*)get_vector_data(reps_vector, param_size - i - 1))[0]; set_vector_data(reps_vector, i, (void*)&ori_reps); } } @@ -77,45 +76,45 @@ static int infer_shape(struct node* node) return -1; } int push_data = 1; - switch(param_size) + switch (param_size) { - case 0: - for(int i = 0; i < 4; i++) - { - push_vector_data(reps_vector, (void*)&push_data); - } - break; - case 1: - for(int i = 0; i < 3; i++) - { - push_vector_data(reps_vector, (void*)&push_data); - }; - break; - case 2: - for(int i = 0; i < 2; i++) - { - push_vector_data(reps_vector, (void*)&push_data); - } - break; - case 3: + case 0: + for (int i = 0; i < 4; i++) + { + push_vector_data(reps_vector, (void*)&push_data); + } + break; + case 1: + for (int i = 0; i < 3; i++) + { + push_vector_data(reps_vector, (void*)&push_data); + }; + break; + case 2: + for (int i = 0; i < 2; i++) + { push_vector_data(reps_vector, (void*)&push_data); - break; - default: - break; + } + break; + case 3: + push_vector_data(reps_vector, (void*)&push_data); + break; + default: + break; } - output_n = input_tensor->dims[0]*(( int* )get_vector_data(reps_vector, 3))[0]; - output_c = input_tensor->dims[1]*(( int* )get_vector_data(reps_vector, 2))[0]; - output_h = input_tensor->dims[2]*(( int* )get_vector_data(reps_vector, 1))[0]; - output_w = input_tensor->dims[3]*(( int* )get_vector_data(reps_vector, 0))[0]; - } - else if (frame == 1) + output_n = input_tensor->dims[0] * ((int*)get_vector_data(reps_vector, 3))[0]; + output_c = input_tensor->dims[1] * ((int*)get_vector_data(reps_vector, 2))[0]; + output_h = input_tensor->dims[2] * ((int*)get_vector_data(reps_vector, 1))[0]; + output_w = input_tensor->dims[3] * ((int*)get_vector_data(reps_vector, 0))[0]; + } + else if (frame == 1) { printf("Tile::InferShape onnx\n"); } - int* new_shape = (int*)sys_malloc(get_vector_num(reps_vector)*sizeof(int)); - for(int i = 0; i < get_vector_num(reps_vector); i++) + int* new_shape = (int*)sys_malloc(get_vector_num(reps_vector) * sizeof(int)); + for (int i = 0; i < get_vector_num(reps_vector); i++) { int* a = (int*)get_vector_data(reps_vector, i); new_shape[i] = *a; @@ -127,17 +126,16 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct tile_param* tile_param = ( struct tile_param* )sys_malloc(sizeof(struct tile_param)); + struct tile_param* tile_param = (struct tile_param*)sys_malloc(sizeof(struct tile_param)); if (tile_param == NULL) { return -1; } - memset(tile_param,0,sizeof(struct tile_param)); + memset(tile_param, 0, sizeof(struct tile_param)); op->param_mem = tile_param; op->param_size = sizeof(struct tile_param); op->same_shape = 0; @@ -146,16 +144,14 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { struct tile_param* tile_param = (struct tile_param*)op->param_mem; - if(tile_param->reps) + if (tile_param->reps) sys_free(tile_param->reps); sys_free(op->param_mem); } - int register_tile_op() { struct method m; @@ -163,13 +159,10 @@ int register_tile_op() m.init = init_op; m.release = release_op; - return register_op(OP_TILE, OP_TILE_NAME, &m); - } - int unregister_tile_op() { - return unregister_op(OP_TILE,1); + return unregister_op(OP_TILE, 1); } diff --git a/source/operator/prototype/topkv2.c b/source/operator/prototype/topkv2.c index 50ddfbbb2..a98e773e2 100644 --- a/source/operator/prototype/topkv2.c +++ b/source/operator/prototype/topkv2.c @@ -31,7 +31,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct topkv2_param* topkv2_param = (struct topkv2_param*)node->op.param_mem; @@ -42,7 +41,7 @@ static int infer_shape(struct node* node) struct tensor* output1 = get_ir_graph_tensor(ir_graph, node->output_tensors[1]); int in_size = input->dim_num; - int* in_dim = ( int* )sys_malloc((in_size) * sizeof(int)); + int* in_dim = (int*)sys_malloc((in_size) * sizeof(int)); if (topkv2_param->k > input->dims[in_size - 1]) { @@ -61,10 +60,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct topkv2_param* topkv2_param = ( struct topkv2_param* )sys_malloc(sizeof(struct topkv2_param)); + struct topkv2_param* topkv2_param = (struct topkv2_param*)sys_malloc(sizeof(struct topkv2_param)); if (topkv2_param == NULL) { @@ -82,13 +80,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_topkv2_op() { struct method m; @@ -97,11 +93,9 @@ int register_topkv2_op() m.init = init_op; m.release = release_op; - return register_op(OP_TOPKV2, OP_TOPKV2_NAME, &m); } - int unregister_topkv2_op() { return unregister_op(OP_TOPKV2, 1); diff --git a/source/operator/prototype/transpose.c b/source/operator/prototype/transpose.c index 50024f101..5c211d453 100644 --- a/source/operator/prototype/transpose.c +++ b/source/operator/prototype/transpose.c @@ -32,16 +32,15 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); - struct transpose_param* param = ( struct transpose_param* )(node->op.param_mem); + struct transpose_param* param = (struct transpose_param*)(node->op.param_mem); int new_shape_size = param->tr_shape_size; - int* out_dims = ( int* )sys_malloc(new_shape_size * sizeof(int)); + int* out_dims = (int*)sys_malloc(new_shape_size * sizeof(int)); for (int i = 0; i < new_shape_size; i++) { @@ -54,10 +53,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct transpose_param* transpose_param = ( struct transpose_param* )sys_malloc(sizeof(struct transpose_param)); + struct transpose_param* transpose_param = (struct transpose_param*)sys_malloc(sizeof(struct transpose_param)); if (transpose_param == NULL) { @@ -78,10 +76,9 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { - struct transpose_param* transpose_param = ( struct transpose_param* )op->param_mem; + struct transpose_param* transpose_param = (struct transpose_param*)op->param_mem; if (transpose_param->tr_shape) sys_free(transpose_param->tr_shape); @@ -89,7 +86,6 @@ static void release_op(struct op* op) sys_free(op->param_mem); } - int register_transpose_op() { struct method m; @@ -98,11 +94,9 @@ int register_transpose_op() m.init = init_op; m.release = release_op; - return register_op(OP_TRANSPOSE, OP_TRANSPOSE_NAME, &m); } - int unregister_transpose_op() { return unregister_op(OP_TRANSPOSE, 1); diff --git a/source/operator/prototype/unary.c b/source/operator/prototype/unary.c index cb72cfd7f..8c0c64196 100644 --- a/source/operator/prototype/unary.c +++ b/source/operator/prototype/unary.c @@ -31,7 +31,6 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -45,10 +44,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct unary_param* unary_param = ( struct unary_param* )sys_malloc(sizeof(struct unary_param)); + struct unary_param* unary_param = (struct unary_param*)sys_malloc(sizeof(struct unary_param)); if (unary_param == NULL) { @@ -66,13 +64,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_unary_op() { struct method m; @@ -81,11 +77,9 @@ int register_unary_op() m.init = init_op; m.release = release_op; - return register_op(OP_UNARY, OP_UNARY_NAME, &m); } - int unregister_unary_op() { return unregister_op(OP_UNARY, 1); diff --git a/source/operator/prototype/unsqueeze.c b/source/operator/prototype/unsqueeze.c index 196e6795f..77b75f8d5 100644 --- a/source/operator/prototype/unsqueeze.c +++ b/source/operator/prototype/unsqueeze.c @@ -31,16 +31,15 @@ #include "module/module.h" #include "utility/sys_port.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]); struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]); - struct unsqueeze_param* unsqueeze_param = ( struct unsqueeze_param* )node->op.param_mem; + struct unsqueeze_param* unsqueeze_param = (struct unsqueeze_param*)node->op.param_mem; int axises_size = unsqueeze_param->axises_size; - int* out_dim = ( int* )sys_malloc((input->dim_num + axises_size) * sizeof(int)); + int* out_dim = (int*)sys_malloc((input->dim_num + axises_size) * sizeof(int)); if (axises_size == 1) { @@ -90,7 +89,6 @@ static int infer_shape(struct node* node) out_dim[i] = input->dims[k]; k++; } - } } @@ -101,10 +99,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct unsqueeze_param* unsqueeze_param = ( struct unsqueeze_param* )sys_malloc(sizeof(struct unsqueeze_param)); + struct unsqueeze_param* unsqueeze_param = (struct unsqueeze_param*)sys_malloc(sizeof(struct unsqueeze_param)); if (unsqueeze_param == NULL) { @@ -121,7 +118,6 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { struct unsqueeze_param* unsqueeze_param = (struct unsqueeze_param*)op->param_mem; @@ -130,7 +126,6 @@ static void release_op(struct op* op) sys_free(op->param_mem); } - int register_unsqueeze_op() { struct method m; @@ -142,7 +137,6 @@ int register_unsqueeze_op() return register_op(OP_UNSQUEEZE, OP_UNSQUEEZE_NAME, &m); } - int unregister_unsqueeze_op() { return unregister_op(OP_UNSQUEEZE, 1); diff --git a/source/operator/prototype/upsample.c b/source/operator/prototype/upsample.c index a026e79c0..fc7e29ebc 100644 --- a/source/operator/prototype/upsample.c +++ b/source/operator/prototype/upsample.c @@ -32,10 +32,9 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { - struct upsample_param* upsample_param = ( struct upsample_param* )(node->op.param_mem); + struct upsample_param* upsample_param = (struct upsample_param*)(node->op.param_mem); struct graph* graph = node->graph; struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]); @@ -55,10 +54,9 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { - struct upsample_param* upsample_param = ( struct upsample_param* )sys_malloc(sizeof(struct upsample_param)); + struct upsample_param* upsample_param = (struct upsample_param*)sys_malloc(sizeof(struct upsample_param)); if (upsample_param == NULL) { @@ -76,13 +74,11 @@ static int init_op(struct op* op) return 0; } - static void release_op(struct op* op) { sys_free(op->param_mem); } - int register_upsample_op() { struct method m; @@ -94,7 +90,6 @@ int register_upsample_op() return register_op(OP_UPSAMPLE, OP_UPSAMPLE_NAME, &m); } - int unregister_upsample_op() { return unregister_op(OP_UPSAMPLE, 1); diff --git a/source/operator/prototype/where.c b/source/operator/prototype/where.c index 8b9285657..4a97d6b22 100644 --- a/source/operator/prototype/where.c +++ b/source/operator/prototype/where.c @@ -30,7 +30,6 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -42,7 +41,6 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { op->same_shape = 0; @@ -51,9 +49,9 @@ static int init_op(struct op* op) return 0; } - -static void release_op(struct op* op) {} - +static void release_op(struct op* op) +{ +} int register_where_op() { @@ -66,7 +64,6 @@ int register_where_op() return register_op(OP_WHERE, OP_WHERE_NAME, &m); } - int unregister_where_op() { return unregister_op(OP_WHERE, 1); diff --git a/source/operator/prototype/zeroslike.c b/source/operator/prototype/zeroslike.c index 251adab6c..a6ae52644 100644 --- a/source/operator/prototype/zeroslike.c +++ b/source/operator/prototype/zeroslike.c @@ -27,7 +27,6 @@ #include "graph/graph.h" #include "module/module.h" - static int infer_shape(struct node* node) { struct graph* ir_graph = node->graph; @@ -39,7 +38,6 @@ static int infer_shape(struct node* node) return 0; } - static int init_op(struct op* op) { op->same_shape = 0; @@ -48,9 +46,9 @@ static int init_op(struct op* op) return 0; } - -static void release_op(struct op* op) {} - +static void release_op(struct op* op) +{ +} int register_zeroslike_op() { @@ -63,7 +61,6 @@ int register_zeroslike_op() return register_op(OP_ZEROSLIKE, OP_ZEROSLIKE_NAME, &m); } - int unregister_zeroslike_op() { return unregister_op(OP_ZEROSLIKE, 1); diff --git a/source/optimizer/estimation.c b/source/optimizer/estimation.c index 7e8e604ea..2fba81972 100644 --- a/source/optimizer/estimation.c +++ b/source/optimizer/estimation.c @@ -39,21 +39,19 @@ #include #endif - void init_memory_block(memory_block_t* memory_block, uint16_t index) { if (NULL != memory_block) { memory_block->index = index; - memory_block->size = 0; + memory_block->size = 0; memory_block->tensor_count = 0; - memory_block->tensor_list = NULL; + memory_block->tensor_list = NULL; memory_block->tensor_index = 0; memory_block->inuse = 0; } } - memory_block_t* find_unused_memory_block(struct vector* memory_blocks) { int memory_blocks_count = get_vector_num(memory_blocks); @@ -69,7 +67,6 @@ memory_block_t* find_unused_memory_block(struct vector* memory_blocks) return NULL; } - memory_block_t* get_usable_memory_block(struct vector* memory_blocks) { memory_block_t* memory_block = find_unused_memory_block(memory_blocks); @@ -89,13 +86,12 @@ memory_block_t* get_usable_memory_block(struct vector* memory_blocks) return memory_block; } - int mark_memory_block_with_tensor(ir_graph_t* graph, memory_block_t* memory_block, uint16_t index) { ir_tensor_t* tensor = get_ir_graph_tensor(graph, index); memory_block->tensor_count += 1; - memory_block->tensor_list = (uint16_t*)sys_realloc(memory_block->tensor_list, memory_block->tensor_count * sizeof(uint16_t)); + memory_block->tensor_list = (uint16_t*)sys_realloc(memory_block->tensor_list, memory_block->tensor_count * sizeof(uint16_t)); memory_block->inuse = 1; uint32_t tensor_buffer_size = tensor->elem_num * tensor->elem_size; @@ -108,7 +104,6 @@ int mark_memory_block_with_tensor(ir_graph_t* graph, memory_block_t* memory_bloc return 0; } - int estimate_subgraph_memory_blocks(struct subgraph* subgraph, struct vector* memory_blocks) { if (NULL == subgraph || NULL == memory_blocks) diff --git a/source/optimizer/estimation.h b/source/optimizer/estimation.h index b2f1580bd..3cc51026d 100644 --- a/source/optimizer/estimation.h +++ b/source/optimizer/estimation.h @@ -30,22 +30,20 @@ struct subgraph; struct vector; - /*! * @struct ir_subgraph_t * @brief Abstract subgraph intermediate representation */ typedef struct memory_block { - uint16_t index; //!< the index of a memory_block - uint32_t size; //!< final estimated memory size - uint16_t tensor_count; //!< referenced tensor count - uint16_t* tensor_list; //!< referenced tensor list - uint16_t tensor_index; //!< referenced tensor index, which is largest one - uint8_t inuse; //!< flag mark if this block is inuse + uint16_t index; //!< the index of a memory_block + uint32_t size; //!< final estimated memory size + uint16_t tensor_count; //!< referenced tensor count + uint16_t* tensor_list; //!< referenced tensor list + uint16_t tensor_index; //!< referenced tensor index, which is largest one + uint8_t inuse; //!< flag mark if this block is inuse } memory_block_t; - /*! * @brief Init tensor quantization parameter. * @@ -54,7 +52,6 @@ typedef struct memory_block */ void init_memory_block(memory_block_t* memory_block, uint16_t index); - /*! * @brief Set tensor quantization parameter. * diff --git a/source/optimizer/helper.c b/source/optimizer/helper.c index ac2439d68..e325fc22b 100644 --- a/source/optimizer/helper.c +++ b/source/optimizer/helper.c @@ -30,7 +30,6 @@ #include "graph/subgraph.h" #include "operator/op.h" - int is_index_in_array(const uint16_t* array, const uint16_t array_size, const uint16_t index) { for (uint16_t i = 0; i < array_size; i++) @@ -46,19 +45,16 @@ int is_index_in_array(const uint16_t* array, const uint16_t array_size, const ui return 0; } - int is_subgraph_input_tensor(const struct subgraph* subgraph, const uint16_t tensor_index) { return is_index_in_array(subgraph->input_tensor_list, (uint16_t)subgraph->input_num, tensor_index); } - int is_subgraph_output_tensor(const struct subgraph* subgraph, const uint16_t tensor_index) { return is_index_in_array(subgraph->output_tensor_list, (uint16_t)subgraph->input_num, tensor_index); } - int is_variable_tensor_in_subgraph(const ir_subgraph_t* subgraph, const uint16_t tensor_index) { // only each node outputs need to be checked next diff --git a/source/optimizer/helper.h b/source/optimizer/helper.h index d684c317e..f626ed705 100644 --- a/source/optimizer/helper.h +++ b/source/optimizer/helper.h @@ -30,7 +30,6 @@ struct subgraph; struct vector; - int is_subgraph_input_tensor(const struct subgraph* subgraph, uint16_t tensor_index); int is_subgraph_output_tensor(const struct subgraph* subgraph, uint16_t tensor_index); diff --git a/source/optimizer/split.c b/source/optimizer/split.c index 2aaafedb9..75004e189 100644 --- a/source/optimizer/split.c +++ b/source/optimizer/split.c @@ -39,7 +39,6 @@ #define MODEL_COMPLEX_COUNT 3 - int check_sub_info(struct graph* ir_graph) { int subgraph_num = get_vector_num(ir_graph->subgraph_list); @@ -51,7 +50,6 @@ int check_sub_info(struct graph* ir_graph) return -1; } - int tensor_in_precision(const struct tensor* tensor, struct vector* allowed_precision) { int count = get_vector_num(allowed_precision); @@ -67,7 +65,6 @@ int tensor_in_precision(const struct tensor* tensor, struct vector* allowed_prec return -1; } - int node_in_precision(const struct graph* ir_graph, uint16_t node_id, struct vector* allowed_precision) { if (node_id > ir_graph->node_num) @@ -100,7 +97,6 @@ int node_in_precision(const struct graph* ir_graph, uint16_t node_id, struct vec return -1; } - int node_in_list(const struct graph* ir_graph, struct vector* ops_list, const uint16_t node_id) { if (NULL == ir_graph || NULL == ops_list) @@ -122,7 +118,6 @@ int node_in_list(const struct graph* ir_graph, struct vector* ops_list, const ui return -1; } - struct vector* get_graph_blocked_nodes(const struct graph* ir_graph, struct vector* blocked_ops, struct vector* allowed_precision) { struct vector* blocked_nodes_list = create_vector(sizeof(uint16_t), NULL); @@ -141,7 +136,6 @@ struct vector* get_graph_blocked_nodes(const struct graph* ir_graph, struct vect return blocked_nodes_list; } - // policy has some issue, must be fixed void split_graph_node_to_sub_graph(struct graph* ir_graph, struct vector* allowed_ops, struct vector* blocked_ops, struct vector* allowed_precision) { @@ -156,7 +150,6 @@ void split_graph_node_to_sub_graph(struct graph* ir_graph, struct vector* allowe // scan from back to front for (int i = blocked_nodes_count - 1; i >= 0; i--) { - // start node id (the blocked one) uint16_t first_node_id = *((uint16_t*)get_vector_data(blocked_nodes_list, i)); // end node id (not including its self; the next blocked one, or the last one) @@ -186,7 +179,7 @@ void split_graph_node_to_sub_graph(struct graph* ir_graph, struct vector* allowe } } - if (children_nodes_is_complicated < MODEL_COMPLEX_COUNT) // directly add these nodes to sub graph list + if (children_nodes_is_complicated < MODEL_COMPLEX_COUNT) // directly add these nodes to sub graph list { struct subgraph* sub_graph = (struct subgraph*)sys_malloc(sizeof(struct subgraph)); init_ir_subgraph((struct graph*)ir_graph, sub_graph, 0); @@ -318,7 +311,6 @@ void split_graph_node_to_sub_graph(struct graph* ir_graph, struct vector* allowe } } - void generate_sub_graph_io(struct graph* ir_graph) { int sub_graph_count = get_vector_num(ir_graph->subgraph_list); @@ -541,8 +533,6 @@ void generate_sub_graph_io(struct graph* ir_graph) } } - - void add_sub_graph_to_ir_graph(struct graph* ir_graph) { const int sub_graphs_count = get_vector_num(ir_graph->subgraph_list); @@ -750,7 +740,6 @@ void add_sub_graph_to_ir_graph(struct graph* ir_graph) } } - void dump_sub_graph(struct subgraph* sub_graph) { TLOG_INFO("Sub graph[%d]: {%8s } has %d nodes, %d input tensors, %d output tensors.\n", sub_graph->index, sub_graph->device->name, sub_graph->node_num, sub_graph->input_num, sub_graph->output_num); diff --git a/source/optimizer/split.h b/source/optimizer/split.h index ace1f27f0..1e65a733d 100644 --- a/source/optimizer/split.h +++ b/source/optimizer/split.h @@ -28,7 +28,6 @@ struct graph; struct subgraph; struct vector; - int check_sub_info(struct graph* ir_graph); struct vector* get_graph_blocked_nodes(const struct graph* ir_graph, struct vector* blocked_ops, struct vector* allowed_precision); diff --git a/source/scheduler/scheduler.c b/source/scheduler/scheduler.c index fdcb60a56..d352be39e 100644 --- a/source/scheduler/scheduler.c +++ b/source/scheduler/scheduler.c @@ -36,7 +36,6 @@ #include - static int sched_prerun(ir_scheduler_t* scheduler, ir_graph_t* ir_graph) { int subgraph_num = get_vector_num(ir_graph->subgraph_list); @@ -71,7 +70,6 @@ static int sched_prerun(ir_scheduler_t* scheduler, ir_graph_t* ir_graph) return 0; } - static int sched_run(ir_scheduler_t* scheduler, ir_graph_t* ir_graph, int block) { if (block == 0) @@ -114,7 +112,7 @@ static int sched_run(ir_scheduler_t* scheduler, ir_graph_t* ir_graph, int block) for (int i = 0; i < wait_num; i++) { - struct subgraph* subgraph = *( struct subgraph** )get_vector_data(wait_list, i); + struct subgraph* subgraph = *(struct subgraph**)get_vector_data(wait_list, i); if (subgraph->input_ready_count == subgraph->input_wait_count) ready_list[ready_num++] = i; @@ -128,7 +126,7 @@ static int sched_run(ir_scheduler_t* scheduler, ir_graph_t* ir_graph, int block) for (int i = 0; i < ready_num; i++) { - struct subgraph* subgraph = *( struct subgraph** )get_vector_data(wait_list, ready_list[i]); + struct subgraph* subgraph = *(struct subgraph**)get_vector_data(wait_list, ready_list[i]); ir_device_t* nn_dev = subgraph->device; subgraph->status = GRAPH_STAT_RUNNING; @@ -183,13 +181,11 @@ static int sched_run(ir_scheduler_t* scheduler, ir_graph_t* ir_graph, int block) return 0; } - static int sched_wait(ir_scheduler_t* scheduler, ir_graph_t* ir_graph) { return -1; } - static int sched_postrun(ir_scheduler_t* scheduler, ir_graph_t* ir_graph) { int subgraph_num = get_vector_num(ir_graph->subgraph_list); @@ -216,17 +212,15 @@ static int sched_postrun(ir_scheduler_t* scheduler, ir_graph_t* ir_graph) return 0; } - static ir_scheduler_t sync_scheduler = { - .name = "sync", - .prerun = sched_prerun, - .run = sched_run, - .wait = sched_wait, - .postrun = sched_postrun, - .release = NULL, + .name = "sync", + .prerun = sched_prerun, + .run = sched_run, + .wait = sched_wait, + .postrun = sched_postrun, + .release = NULL, }; - ir_scheduler_t* find_default_scheduler(void) { return &sync_scheduler; diff --git a/source/scheduler/scheduler.h b/source/scheduler/scheduler.h index cca04c830..3ab8f619e 100644 --- a/source/scheduler/scheduler.h +++ b/source/scheduler/scheduler.h @@ -28,7 +28,6 @@ struct graph; struct vector; - /*! * @struct ir_scheduler_t * @brief Abstract scheduler intermediate representation @@ -37,14 +36,13 @@ typedef struct scheduler { const char* name; - int (*prerun)(struct scheduler*, struct graph*); - int (*run)(struct scheduler*, struct graph*, int block); - int (*wait)(struct scheduler*, struct graph*); - int (*postrun)(struct scheduler*, struct graph*); + int (*prerun)(struct scheduler*, struct graph*); + int (*run)(struct scheduler*, struct graph*, int block); + int (*wait)(struct scheduler*, struct graph*); + int (*postrun)(struct scheduler*, struct graph*); void (*release)(struct scheduler*); } ir_scheduler_t; - /*! * @brief Dump the node. * diff --git a/source/serializer/serializer.c b/source/serializer/serializer.c index 47c13affe..a6e90b13e 100644 --- a/source/serializer/serializer.c +++ b/source/serializer/serializer.c @@ -27,7 +27,6 @@ #include - void init_serializer(struct serializer* serializer) { memset(serializer, 0, sizeof(serializer_t)); diff --git a/source/serializer/serializer.h b/source/serializer/serializer.h index 21c1b1b30..592a803ad 100644 --- a/source/serializer/serializer.h +++ b/source/serializer/serializer.h @@ -29,7 +29,6 @@ struct graph; - /*! * @struct serializer_t * @brief Abstract serializer @@ -60,7 +59,6 @@ typedef struct serializer int (*release)(struct serializer*); } serializer_t; - /*! * @brief Initialize serializer * diff --git a/source/serializer/tmfile/op/tm2_add_n.c b/source/serializer/tmfile/op/tm2_add_n.c index ba606cc66..441d38ed4 100644 --- a/source/serializer/tmfile/op/tm2_add_n.c +++ b/source/serializer/tmfile/op/tm2_add_n.c @@ -31,19 +31,16 @@ #include "serializer/tmfile/tm2_serializer.h" #include "utility/log.h" - static int add_n_op_map(int op) { return OP_ADD_N; } - static int tm2_load_add_n(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_add_n_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_argmax.c b/source/serializer/tmfile/op/tm2_argmax.c index 22c6603b3..3bfeb8665 100644 --- a/source/serializer/tmfile/op/tm2_argmax.c +++ b/source/serializer/tmfile/op/tm2_argmax.c @@ -34,19 +34,17 @@ #include "device/device.h" #include "utility/log.h" - static int argmax_op_map(int op) { return OP_ARGMAX; } - static int tm2_load_argmax(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { struct argmax_param* argmax_param = (struct argmax_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ArgMaxParam* tm_param = ( TM2_ArgMaxParam* )(mem_base + tm_op->offset_t_param); + const TM2_ArgMaxParam* tm_param = (TM2_ArgMaxParam*)(mem_base + tm_op->offset_t_param); argmax_param->axis = tm_param->axis; argmax_param->keepdims = tm_param->keepdims; @@ -54,7 +52,6 @@ static int tm2_load_argmax(struct graph* ir_graph, struct node* ir_node, const T return 0; } - int register_tm2_argmax_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_argmax_op() return 0; } - int unregister_tm2_argmax_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_argmin.c b/source/serializer/tmfile/op/tm2_argmin.c index ab34d8920..782bd9dd3 100644 --- a/source/serializer/tmfile/op/tm2_argmin.c +++ b/source/serializer/tmfile/op/tm2_argmin.c @@ -34,19 +34,17 @@ #include "device/device.h" #include "utility/log.h" - static int argmin_op_map(int op) { return OP_ARGMIN; } - static int tm2_load_argmin(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct argmin_param* argmin_param = ( struct argmin_param* )ir_node->op.param_mem; + struct argmin_param* argmin_param = (struct argmin_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ArgMaxParam* tm_param = ( TM2_ArgMaxParam* )(mem_base + tm_op->offset_t_param); + const TM2_ArgMaxParam* tm_param = (TM2_ArgMaxParam*)(mem_base + tm_op->offset_t_param); argmin_param->axis = tm_param->axis; argmin_param->keepdims = tm_param->keepdims; @@ -54,7 +52,6 @@ static int tm2_load_argmin(struct graph* ir_graph, struct node* ir_node, const T return 0; } - int register_tm2_argmin_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_argmin_op() return 0; } - int unregister_tm2_argmin_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_batchnorm.c b/source/serializer/tmfile/op/tm2_batchnorm.c index 17acd0467..a56447aec 100644 --- a/source/serializer/tmfile/op/tm2_batchnorm.c +++ b/source/serializer/tmfile/op/tm2_batchnorm.c @@ -34,19 +34,17 @@ #include "device/device.h" #include "utility/log.h" - static int batchnorm_op_map(int op) { return OP_BATCHNORM; } - static int tm2_load_batchnorm(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct batchnorm_param* batchnorm_param = ( struct batchnorm_param* )ir_node->op.param_mem; + struct batchnorm_param* batchnorm_param = (struct batchnorm_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_BatchNormParam* tm_param = ( TM2_BatchNormParam* )(mem_base + tm_op->offset_t_param); + const TM2_BatchNormParam* tm_param = (TM2_BatchNormParam*)(mem_base + tm_op->offset_t_param); batchnorm_param->rescale_factor = tm_param->rescale_factor; batchnorm_param->eps = tm_param->eps; @@ -55,7 +53,6 @@ static int tm2_load_batchnorm(struct graph* ir_graph, struct node* ir_node, cons return 0; } - int register_tm2_batchnorm_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -71,7 +68,6 @@ int register_tm2_batchnorm_op() return 0; } - int unregister_tm2_batchnorm_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_batchtospacend.c b/source/serializer/tmfile/op/tm2_batchtospacend.c index 2f4077820..dd68d30e3 100644 --- a/source/serializer/tmfile/op/tm2_batchtospacend.c +++ b/source/serializer/tmfile/op/tm2_batchtospacend.c @@ -34,19 +34,17 @@ #include "device/device.h" #include "utility/log.h" - static int batchtospacend_op_map(int op) { return OP_BATCHTOSPACEND; } - static int tm2_load_batchtospacend(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct batchtospacend_param* batchtospacend_param = ( struct batchtospacend_param* )ir_node->op.param_mem; + struct batchtospacend_param* batchtospacend_param = (struct batchtospacend_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_BatchToSpaceNDParam* tm_param = ( TM2_BatchToSpaceNDParam* )(mem_base + tm_op->offset_t_param); + const TM2_BatchToSpaceNDParam* tm_param = (TM2_BatchToSpaceNDParam*)(mem_base + tm_op->offset_t_param); batchtospacend_param->dilation_x = tm_param->dilation_x; batchtospacend_param->dilation_y = tm_param->dilation_y; @@ -58,7 +56,6 @@ static int tm2_load_batchtospacend(struct graph* ir_graph, struct node* ir_node, return 0; } - int register_tm2_batchtospacend_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -74,7 +71,6 @@ int register_tm2_batchtospacend_op() return 0; } - int unregister_tm2_batchtospacend_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_bias.c b/source/serializer/tmfile/op/tm2_bias.c index 64294cc91..6764fa10b 100644 --- a/source/serializer/tmfile/op/tm2_bias.c +++ b/source/serializer/tmfile/op/tm2_bias.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int bias_op_map(int op) { return OP_BIAS; } - static int tm2_load_bias(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_bias_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_bias_op() return 0; } - int unregister_tm2_bias_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_broadmul.c b/source/serializer/tmfile/op/tm2_broadmul.c index 6172a2cb4..33df417bc 100644 --- a/source/serializer/tmfile/op/tm2_broadmul.c +++ b/source/serializer/tmfile/op/tm2_broadmul.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int broadmul_op_map(int op) { return OP_BROADMUL; } - static int tm2_load_broadmul(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_broadmul_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_broadmul_op() return 0; } - int unregister_tm2_broadmul_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_cast.c b/source/serializer/tmfile/op/tm2_cast.c index 3a256fbf7..19c4fbe80 100644 --- a/source/serializer/tmfile/op/tm2_cast.c +++ b/source/serializer/tmfile/op/tm2_cast.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int op_map(int op) { return OP_CAST; } - static int tm2_load_cast(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct cast_param* param = ( struct cast_param* )ir_node->op.param_mem; + struct cast_param* param = (struct cast_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_CastParam* tm_param = ( TM2_CastParam* )(mem_base + tm_op->offset_t_param); + const TM2_CastParam* tm_param = (TM2_CastParam*)(mem_base + tm_op->offset_t_param); param->type_from = tm_param->type_from; param->type_to = tm_param->type_to; @@ -57,7 +55,6 @@ static int tm2_load_cast(struct graph* ir_graph, struct node* ir_node, const TM2 return 0; } - int register_tm2_cast_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -73,7 +70,6 @@ int register_tm2_cast_op() return 0; } - int unregister_tm2_cast_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_ceil.c b/source/serializer/tmfile/op/tm2_ceil.c index f88e790b5..0d5abe606 100644 --- a/source/serializer/tmfile/op/tm2_ceil.c +++ b/source/serializer/tmfile/op/tm2_ceil.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int ceil_op_map(int op) { return OP_CEIL; } - static int tm2_load_ceil(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_ceil_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_ceil_op() return 0; } - int unregister_tm2_ceil_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_clip.c b/source/serializer/tmfile/op/tm2_clip.c index 80a54e072..1e0478ff1 100644 --- a/source/serializer/tmfile/op/tm2_clip.c +++ b/source/serializer/tmfile/op/tm2_clip.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int clip_op_map(int op) { return OP_CLIP; } - static int tm2_load_clip(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct clip_param* clip_param = ( struct clip_param* )ir_node->op.param_mem; + struct clip_param* clip_param = (struct clip_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ClipParam* tm_param = ( TM2_ClipParam* )(mem_base + tm_op->offset_t_param); + const TM2_ClipParam* tm_param = (TM2_ClipParam*)(mem_base + tm_op->offset_t_param); clip_param->max = tm_param->max; clip_param->min = tm_param->min; @@ -55,7 +53,6 @@ static int tm2_load_clip(struct graph* ir_graph, struct node* ir_node, const TM2 return 0; } - int register_tm2_clip_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -71,7 +68,6 @@ int register_tm2_clip_op() return 0; } - int unregister_tm2_clip_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_comparison.c b/source/serializer/tmfile/op/tm2_comparison.c index 3e71ed91a..05220ab5c 100644 --- a/source/serializer/tmfile/op/tm2_comparison.c +++ b/source/serializer/tmfile/op/tm2_comparison.c @@ -34,27 +34,24 @@ #include "device/device.h" #include "utility/log.h" - static int comparison_op_map(int op) { return OP_COMPARISON; } - static int tm2_load_comparison(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct comparison_param* param = ( struct comparison_param* )ir_node->op.param_mem; + struct comparison_param* param = (struct comparison_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ComparisonParam* tm_param = ( TM2_ComparisonParam* )(mem_base + tm_op->offset_t_param); + const TM2_ComparisonParam* tm_param = (TM2_ComparisonParam*)(mem_base + tm_op->offset_t_param); param->type = tm_param->type; return 0; } - int register_tm2_comparison_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_comparison_op() return 0; } - int unregister_tm2_comparison_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_concat.c b/source/serializer/tmfile/op/tm2_concat.c index 4e7cc6324..44949f093 100644 --- a/source/serializer/tmfile/op/tm2_concat.c +++ b/source/serializer/tmfile/op/tm2_concat.c @@ -34,27 +34,24 @@ #include "device/device.h" #include "utility/log.h" - static int concat_op_map(int op) { return OP_CONCAT; } - static int tm2_load_concat(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct concat_param* concat_param = ( struct concat_param* )ir_node->op.param_mem; + struct concat_param* concat_param = (struct concat_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ConcatParam* tm_param = ( TM2_ConcatParam* )(mem_base + tm_op->offset_t_param); + const TM2_ConcatParam* tm_param = (TM2_ConcatParam*)(mem_base + tm_op->offset_t_param); concat_param->axis = tm_param->axis; return 0; } - int register_tm2_concat_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_concat_op() return 0; } - int unregister_tm2_concat_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_conv.c b/source/serializer/tmfile/op/tm2_conv.c index e8aa3144c..8397206ba 100644 --- a/source/serializer/tmfile/op/tm2_conv.c +++ b/source/serializer/tmfile/op/tm2_conv.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int conv_op_map(int op) { return OP_CONV; } - static int tm2_load_conv(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ConvParam* tm_param = ( TM2_ConvParam* )(mem_base + tm_op->offset_t_param); + const TM2_ConvParam* tm_param = (TM2_ConvParam*)(mem_base + tm_op->offset_t_param); conv_param->kernel_h = tm_param->kernel_h; conv_param->kernel_w = tm_param->kernel_w; @@ -85,7 +83,6 @@ static int tm2_load_conv(struct graph* ir_graph, struct node* ir_node, const TM2 return 0; } - int register_tm2_conv_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -101,7 +98,6 @@ int register_tm2_conv_op() return 0; } - int unregister_tm2_conv_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_crop.c b/source/serializer/tmfile/op/tm2_crop.c index ce4b21212..1098bd48d 100644 --- a/source/serializer/tmfile/op/tm2_crop.c +++ b/source/serializer/tmfile/op/tm2_crop.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int crop_op_map(int op) { return OP_CROP; } - static int tm2_load_crop(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct crop_param* crop_param = ( struct crop_param* )ir_node->op.param_mem; + struct crop_param* crop_param = (struct crop_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_CropParam* tm_param = ( TM2_CropParam* )(mem_base + tm_op->offset_t_param); + const TM2_CropParam* tm_param = (TM2_CropParam*)(mem_base + tm_op->offset_t_param); crop_param->num_args = tm_param->num_args; crop_param->offset_c = tm_param->offset_c; @@ -62,7 +60,6 @@ static int tm2_load_crop(struct graph* ir_graph, struct node* ir_node, const TM2 return 0; } - int register_tm2_crop_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -78,7 +75,6 @@ int register_tm2_crop_op() return 0; } - int unregister_tm2_crop_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_deconv.c b/source/serializer/tmfile/op/tm2_deconv.c index 153216caf..286cad77e 100644 --- a/source/serializer/tmfile/op/tm2_deconv.c +++ b/source/serializer/tmfile/op/tm2_deconv.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int deconv_op_map(int op) { return OP_DECONV; } - static int tm2_load_deconv(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct deconv_param* deconv_param = ( struct deconv_param* )ir_node->op.param_mem; + struct deconv_param* deconv_param = (struct deconv_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_DeconvParam* tm_param = ( TM2_DeconvParam* )(mem_base + tm_op->offset_t_param); + const TM2_DeconvParam* tm_param = (TM2_DeconvParam*)(mem_base + tm_op->offset_t_param); deconv_param->kernel_h = tm_param->kernel_h; deconv_param->kernel_w = tm_param->kernel_w; @@ -64,17 +62,16 @@ static int tm2_load_deconv(struct graph* ir_graph, struct node* ir_node, const T deconv_param->dilation_h = tm_param->dilation_h; deconv_param->dilation_w = tm_param->dilation_w; - deconv_param->group = tm_param->group ; - deconv_param->num_output = tm_param->num_output ; - deconv_param->activation = tm_param->activation ; - + deconv_param->group = tm_param->group; + deconv_param->num_output = tm_param->num_output; + deconv_param->activation = tm_param->activation; + deconv_param->output_pad_h0 = tm_param->output_pad_h0; deconv_param->output_pad_w0 = tm_param->output_pad_w0; return 0; } - int register_tm2_deconv_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -90,7 +87,6 @@ int register_tm2_deconv_op() return 0; } - int unregister_tm2_deconv_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_depthtospace.c b/source/serializer/tmfile/op/tm2_depthtospace.c index f66248d1a..d7282f275 100644 --- a/source/serializer/tmfile/op/tm2_depthtospace.c +++ b/source/serializer/tmfile/op/tm2_depthtospace.c @@ -34,27 +34,24 @@ #include "device/device.h" #include "utility/log.h" - static int depthtospace_op_map(int op) { return OP_DEPTHTOSPACE; } - static int tm2_load_depthtospace(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct depthtospace_param* depthtospace_param = ( struct depthtospace_param* )ir_node->op.param_mem; + struct depthtospace_param* depthtospace_param = (struct depthtospace_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_DepthToSpaceParam* tm_param = ( TM2_DepthToSpaceParam* )(mem_base + tm_op->offset_t_param); + const TM2_DepthToSpaceParam* tm_param = (TM2_DepthToSpaceParam*)(mem_base + tm_op->offset_t_param); depthtospace_param->block_size = tm_param->block_size; return 0; } - int register_tm2_depthtospace_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_depthtospace_op() return 0; } - int unregister_tm2_depthtospace_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_detection_output.c b/source/serializer/tmfile/op/tm2_detection_output.c index 152c8a1a6..50e1edd4c 100644 --- a/source/serializer/tmfile/op/tm2_detection_output.c +++ b/source/serializer/tmfile/op/tm2_detection_output.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int detection_op_map(int op) { return OP_DETECTION_OUTPUT; } - static int tm2_load_detection(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct detection_output_param* detection_output_param = ( struct detection_output_param* )ir_node->op.param_mem; + struct detection_output_param* detection_output_param = (struct detection_output_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_DetectionOutputParam* tm_param = ( TM2_DetectionOutputParam* )(mem_base + tm_op->offset_t_param); + const TM2_DetectionOutputParam* tm_param = (TM2_DetectionOutputParam*)(mem_base + tm_op->offset_t_param); detection_output_param->num_classes = tm_param->num_classes; detection_output_param->keep_top_k = tm_param->keep_top_k; @@ -58,7 +56,6 @@ static int tm2_load_detection(struct graph* ir_graph, struct node* ir_node, cons return 0; } - int register_tm2_detection_output_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -74,7 +71,6 @@ int register_tm2_detection_output_op() return 0; } - int unregister_tm2_detection_output_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_detection_postprocess.c b/source/serializer/tmfile/op/tm2_detection_postprocess.c index 371554288..0a06c2698 100644 --- a/source/serializer/tmfile/op/tm2_detection_postprocess.c +++ b/source/serializer/tmfile/op/tm2_detection_postprocess.c @@ -35,22 +35,18 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int detection_postprocess_op_map(int op) { return OP_DETECTION_POSTPROCESS; } - static int tm2_load_detection_postprocess(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct detection_postprocess_param* detection_postprocess_param = - ( struct detection_postprocess_param* )ir_node->op.param_mem; + struct detection_postprocess_param* detection_postprocess_param = (struct detection_postprocess_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_DetectionPostProcessParam* tm_param = - ( TM2_DetectionPostProcessParam* )(mem_base + tm_op->offset_t_param); + const TM2_DetectionPostProcessParam* tm_param = (TM2_DetectionPostProcessParam*)(mem_base + tm_op->offset_t_param); detection_postprocess_param->max_detections = tm_param->max_detections; detection_postprocess_param->max_classes_per_detection = tm_param->max_classes_per_detection; @@ -62,13 +58,12 @@ static int tm2_load_detection_postprocess(struct graph* ir_graph, struct node* i detection_postprocess_param->scales = (float*)sys_malloc(vf_scales->v_num * sizeof(float)); for (unsigned int i = 0; i < vf_scales->v_num; - i++) // TODO : need to check v_num .Next called in run function(detection_postprocess) default as 4 ? + i++) // TODO : need to check v_num .Next called in run function(detection_postprocess) default as 4 ? detection_postprocess_param->scales[i] = vf_scales->data[i]; return 0; } - int register_tm2_detection_postprocess_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -85,7 +80,6 @@ int register_tm2_detection_postprocess_op() return 0; } - int unregister_tm2_detection_postprocess_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_dropout.c b/source/serializer/tmfile/op/tm2_dropout.c index 1efea5674..3faeb9907 100644 --- a/source/serializer/tmfile/op/tm2_dropout.c +++ b/source/serializer/tmfile/op/tm2_dropout.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int dropout_op_map(int op) { return OP_DROPOUT; } - static int tm2_load_dropout(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_dropout_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_dropout_op() return 0; } - int unregister_tm2_dropout_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_eltwise.c b/source/serializer/tmfile/op/tm2_eltwise.c index 7c2b168d7..274e711dd 100644 --- a/source/serializer/tmfile/op/tm2_eltwise.c +++ b/source/serializer/tmfile/op/tm2_eltwise.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int eltwise_op_map(int op) { return OP_ELTWISE; } - static int tm2_load_eltwise(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct eltwise_param* eltwise_param = ( struct eltwise_param* )ir_node->op.param_mem; + struct eltwise_param* eltwise_param = (struct eltwise_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_EltwiseParam* tm_param = ( TM2_EltwiseParam* )(mem_base + tm_op->offset_t_param); + const TM2_EltwiseParam* tm_param = (TM2_EltwiseParam*)(mem_base + tm_op->offset_t_param); eltwise_param->type = tm_param->type; eltwise_param->caffe_flavor = tm_param->caffe_flavor; @@ -58,7 +56,6 @@ static int tm2_load_eltwise(struct graph* ir_graph, struct node* ir_node, const return 0; } - int register_tm2_eltwise_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -74,7 +71,6 @@ int register_tm2_eltwise_op() return 0; } - int unregister_tm2_eltwise_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_elu.c b/source/serializer/tmfile/op/tm2_elu.c index 1cb72d337..5a4147542 100644 --- a/source/serializer/tmfile/op/tm2_elu.c +++ b/source/serializer/tmfile/op/tm2_elu.c @@ -34,27 +34,24 @@ #include "device/device.h" #include "utility/log.h" - static int elu_op_map(int op) { return OP_ELU; } - static int tm2_load_elu(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct elu_param* param = ( struct elu_param* )ir_node->op.param_mem; + struct elu_param* param = (struct elu_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_EluParam* tm_param = ( TM2_EluParam* )(mem_base + tm_op->offset_t_param); + const TM2_EluParam* tm_param = (TM2_EluParam*)(mem_base + tm_op->offset_t_param); param->alpha = tm_param->alpha; return 0; } - int register_tm2_elu_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_elu_op() return 0; } - int unregister_tm2_elu_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_embedding.c b/source/serializer/tmfile/op/tm2_embedding.c index f91165d53..0cb838dab 100644 --- a/source/serializer/tmfile/op/tm2_embedding.c +++ b/source/serializer/tmfile/op/tm2_embedding.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int gather_op_map(int op) { return OP_EMBEDDING; } - static int tm2_load_embedding(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct embedding_param* gather_param = ( struct embedding_param* )ir_node->op.param_mem; + struct embedding_param* gather_param = (struct embedding_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_EmbedParam* tm_param = ( TM2_EmbedParam* )(mem_base + tm_op->offset_t_param); + const TM2_EmbedParam* tm_param = (TM2_EmbedParam*)(mem_base + tm_op->offset_t_param); // gather_param->bias_term = tm_param->bias_term; gather_param->input_dim = tm_param->input_dim; @@ -57,7 +55,6 @@ static int tm2_load_embedding(struct graph* ir_graph, struct node* ir_node, cons return 0; } - int register_tm2_embedding_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -73,7 +70,6 @@ int register_tm2_embedding_op() return 0; } - int unregister_tm2_embedding_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_expand.c b/source/serializer/tmfile/op/tm2_expand.c index 441d0671e..11090dc18 100644 --- a/source/serializer/tmfile/op/tm2_expand.c +++ b/source/serializer/tmfile/op/tm2_expand.c @@ -35,29 +35,26 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int expand_op_map(int op) { return OP_EXPAND; } - static int tm2_load_expand(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, - const TM2_Operator* tm_op) + const TM2_Operator* tm_op) { - struct expand_param* param = ( struct expand_param* )ir_node->op.param_mem; + struct expand_param* param = (struct expand_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ExpandParam* tm_param = ( TM2_ExpandParam* )(mem_base + tm_op->offset_t_param); + const TM2_ExpandParam* tm_param = (TM2_ExpandParam*)(mem_base + tm_op->offset_t_param); if (tm_param->offset_ex_shape != TM2_NOT_SET) { - const TM2_Vector_dims* v_ex_shape = ( TM2_Vector_dims* )(mem_base + tm_param->offset_ex_shape); - param->ex_shape = ( int* )sys_malloc(v_ex_shape->v_num * sizeof(int)); + const TM2_Vector_dims* v_ex_shape = (TM2_Vector_dims*)(mem_base + tm_param->offset_ex_shape); + param->ex_shape = (int*)sys_malloc(v_ex_shape->v_num * sizeof(int)); for (unsigned int i = 0; i < v_ex_shape->v_num; i++) { param->ex_shape[i] = v_ex_shape->dims[i]; - } } param->dim_num = tm_param->dim_num; @@ -65,7 +62,6 @@ static int tm2_load_expand(struct graph* ir_graph, struct node* ir_node, const T return 0; } - int register_tm2_expand_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -81,7 +77,6 @@ int register_tm2_expand_op() return 0; } - int unregister_tm2_expand_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_expanddims.c b/source/serializer/tmfile/op/tm2_expanddims.c index d0a8a7a6f..6826d62a4 100644 --- a/source/serializer/tmfile/op/tm2_expanddims.c +++ b/source/serializer/tmfile/op/tm2_expanddims.c @@ -34,27 +34,24 @@ #include "device/device.h" #include "utility/log.h" - static int expanddims_op_map(int op) { return OP_EXPANDDIMS; } - static int tm2_load_expanddims(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct expanddims_param* expanddims_param = ( struct expanddims_param* )ir_node->op.param_mem; + struct expanddims_param* expanddims_param = (struct expanddims_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ExpanddimsParam* tm_param = ( TM2_ExpanddimsParam* )(mem_base + tm_op->offset_t_param); + const TM2_ExpanddimsParam* tm_param = (TM2_ExpanddimsParam*)(mem_base + tm_op->offset_t_param); expanddims_param->axis = tm_param->axis; return 0; } - int register_tm2_expanddims_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_expanddims_op() return 0; } - int unregister_tm2_expanddims_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_fc.c b/source/serializer/tmfile/op/tm2_fc.c index f61d49277..6a8920bf9 100644 --- a/source/serializer/tmfile/op/tm2_fc.c +++ b/source/serializer/tmfile/op/tm2_fc.c @@ -34,29 +34,26 @@ #include "device/device.h" #include "utility/log.h" - static int fc_op_map(int op) { return OP_FC; } - static int tm2_load_fc(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { /* todo: using new TM2 model definition*/ /* TODO: get input_channel from tm_param */ - struct fc_param* fc_param = ( struct fc_param* )ir_node->op.param_mem; + struct fc_param* fc_param = (struct fc_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_FCParam* tm_param = ( TM2_FCParam* )(mem_base + tm_op->offset_t_param); + const TM2_FCParam* tm_param = (TM2_FCParam*)(mem_base + tm_op->offset_t_param); fc_param->num_output = tm_param->num_output; return 0; } - int register_tm2_fc_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -72,7 +69,6 @@ int register_tm2_fc_op() return 0; } - int unregister_tm2_fc_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_flatten.c b/source/serializer/tmfile/op/tm2_flatten.c index 7401139ca..9a95a0c40 100644 --- a/source/serializer/tmfile/op/tm2_flatten.c +++ b/source/serializer/tmfile/op/tm2_flatten.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int flatten_op_map(int op) { return OP_FLATTEN; } - static int tm2_load_flatten(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct flatten_param* flatten_param = ( struct flatten_param* )ir_node->op.param_mem; + struct flatten_param* flatten_param = (struct flatten_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_FlattenParam* tm_param = ( TM2_FlattenParam* )(mem_base + tm_op->offset_t_param); + const TM2_FlattenParam* tm_param = (TM2_FlattenParam*)(mem_base + tm_op->offset_t_param); flatten_param->end_axis = tm_param->end_axis; flatten_param->axis = tm_param->axis; @@ -55,7 +53,6 @@ static int tm2_load_flatten(struct graph* ir_graph, struct node* ir_node, const return 0; } - int register_tm2_flatten_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -71,7 +68,6 @@ int register_tm2_flatten_op() return 0; } - int unregister_tm2_flatten_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_gather.c b/source/serializer/tmfile/op/tm2_gather.c index 092fabf0d..6f63b532c 100644 --- a/source/serializer/tmfile/op/tm2_gather.c +++ b/source/serializer/tmfile/op/tm2_gather.c @@ -34,24 +34,22 @@ #include "device/device.h" #include "utility/log.h" - static int gather_op_map(int op) { return OP_GATHER; } - static int tm2_load_gather(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct gather_param* gather_param = ( struct gather_param* )ir_node->op.param_mem; + struct gather_param* gather_param = (struct gather_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_GatherParam* tm_param = ( TM2_GatherParam* )(mem_base + tm_op->offset_t_param); + const TM2_GatherParam* tm_param = (TM2_GatherParam*)(mem_base + tm_op->offset_t_param); - gather_param->axis = tm_param->axis; - gather_param->indices_num = tm_param->indices_num ; - if(tm_param->is_onnx) + gather_param->axis = tm_param->axis; + gather_param->indices_num = tm_param->indices_num; + if (tm_param->is_onnx) gather_param->is_onnx = true; else gather_param->is_onnx = false; @@ -59,7 +57,6 @@ static int tm2_load_gather(struct graph* ir_graph, struct node* ir_node, const T return 0; } - int register_tm2_gather_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -75,7 +72,6 @@ int register_tm2_gather_op() return 0; } - int unregister_tm2_gather_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_gemm.c b/source/serializer/tmfile/op/tm2_gemm.c index 574f6cf02..491e4ccfa 100644 --- a/source/serializer/tmfile/op/tm2_gemm.c +++ b/source/serializer/tmfile/op/tm2_gemm.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int gemm_op_map(int op) { return OP_GEMM; } - static int tm2_load_gemm(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct gemm_param* gemm_param = ( struct gemm_param* )ir_node->op.param_mem; + struct gemm_param* gemm_param = (struct gemm_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_GemmParam* tm_param = ( TM2_GemmParam* )(mem_base + tm_op->offset_t_param); + const TM2_GemmParam* tm_param = (TM2_GemmParam*)(mem_base + tm_op->offset_t_param); gemm_param->alpha = tm_param->alpha; gemm_param->beta = tm_param->beta; @@ -57,7 +55,6 @@ static int tm2_load_gemm(struct graph* ir_graph, struct node* ir_node, const TM2 return 0; } - int register_tm2_gemm_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -73,7 +70,6 @@ int register_tm2_gemm_op() return 0; } - int unregister_tm2_gemm_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_generic.c b/source/serializer/tmfile/op/tm2_generic.c index 5cdf3ebe3..0258d2e2c 100644 --- a/source/serializer/tmfile/op/tm2_generic.c +++ b/source/serializer/tmfile/op/tm2_generic.c @@ -34,29 +34,26 @@ #include "device/device.h" #include "utility/log.h" - static int generic_op_map(int op) { return OP_GENERIC; } - static int tm2_load_generic(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct generic_param* generic_param = ( struct generic_param* )ir_node->op.param_mem; + struct generic_param* generic_param = (struct generic_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_GenericParam* tm_param = ( TM2_GenericParam* )(mem_base + tm_op->offset_t_param); + const TM2_GenericParam* tm_param = (TM2_GenericParam*)(mem_base + tm_op->offset_t_param); generic_param->max_input_num = tm_param->max_input_num; generic_param->max_output_num = tm_param->max_output_num; - generic_param->op_name = ( char* )&tm_param->offset_s_opname; // TODO: Need to check . + generic_param->op_name = (char*)&tm_param->offset_s_opname; // TODO: Need to check . return 0; } - int register_tm2_generic_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -72,7 +69,6 @@ int register_tm2_generic_op() return 0; } - int unregister_tm2_generic_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_gru.c b/source/serializer/tmfile/op/tm2_gru.c index f34cc0533..09db3c174 100644 --- a/source/serializer/tmfile/op/tm2_gru.c +++ b/source/serializer/tmfile/op/tm2_gru.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int gru_op_map(int op) { return OP_GRU; } - static int tm2_load_gru(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct gru_param* gru_param = ( struct gru_param* )ir_node->op.param_mem; + struct gru_param* gru_param = (struct gru_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_GRUParam* tm_param = ( TM2_GRUParam* )(mem_base + tm_op->offset_t_param); + const TM2_GRUParam* tm_param = (TM2_GRUParam*)(mem_base + tm_op->offset_t_param); gru_param->clip = tm_param->clip; gru_param->output_len = tm_param->output_len; @@ -63,7 +61,6 @@ static int tm2_load_gru(struct graph* ir_graph, struct node* ir_node, const TM2_ return 0; } - int register_tm2_gru_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -79,7 +76,6 @@ int register_tm2_gru_op() return 0; } - int unregister_tm2_gru_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_hardsigmoid.c b/source/serializer/tmfile/op/tm2_hardsigmoid.c index 0dcb40dcc..c7e681845 100644 --- a/source/serializer/tmfile/op/tm2_hardsigmoid.c +++ b/source/serializer/tmfile/op/tm2_hardsigmoid.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int gather_op_map(int op) { return OP_HARDSIGMOID; } - static int tm2_load_hard_sigmoid(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct hard_sigmoid_param* gather_param = ( struct hard_sigmoid_param* )ir_node->op.param_mem; + struct hard_sigmoid_param* gather_param = (struct hard_sigmoid_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_HardsigmoidParam* tm_param = ( TM2_HardsigmoidParam* )(mem_base + tm_op->offset_t_param); + const TM2_HardsigmoidParam* tm_param = (TM2_HardsigmoidParam*)(mem_base + tm_op->offset_t_param); gather_param->alpha = tm_param->alpha; gather_param->beta = tm_param->beta; @@ -55,7 +53,6 @@ static int tm2_load_hard_sigmoid(struct graph* ir_graph, struct node* ir_node, c return 0; } - int register_tm2_hardsigmoid_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -71,7 +68,6 @@ int register_tm2_hardsigmoid_op() return 0; } - int unregister_tm2_hardsigmoid_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_hardswish.c b/source/serializer/tmfile/op/tm2_hardswish.c index 03ebf3260..4f42636be 100644 --- a/source/serializer/tmfile/op/tm2_hardswish.c +++ b/source/serializer/tmfile/op/tm2_hardswish.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int gather_op_map(int op) { return OP_HARDSWISH; } - static int tm2_load_hardswish(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct hardswish_param* gather_param = ( struct hardswish_param* )ir_node->op.param_mem; + struct hardswish_param* gather_param = (struct hardswish_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_HardSwishParam* tm_param = ( TM2_HardSwishParam* )(mem_base + tm_op->offset_t_param); + const TM2_HardSwishParam* tm_param = (TM2_HardSwishParam*)(mem_base + tm_op->offset_t_param); gather_param->alpha = tm_param->alpha; gather_param->beta = tm_param->beta; @@ -55,7 +53,6 @@ static int tm2_load_hardswish(struct graph* ir_graph, struct node* ir_node, cons return 0; } - int register_tm2_hardswish_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -71,7 +68,6 @@ int register_tm2_hardswish_op() return 0; } - int unregister_tm2_hardswish_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_instancenorm.c b/source/serializer/tmfile/op/tm2_instancenorm.c index 6f9c17bf4..526987c59 100644 --- a/source/serializer/tmfile/op/tm2_instancenorm.c +++ b/source/serializer/tmfile/op/tm2_instancenorm.c @@ -34,27 +34,24 @@ #include "device/device.h" #include "utility/log.h" - static int instancenorm_op_map(int op) { return OP_INSTANCENORM; } - static int tm2_load_instancenorm(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct instancenorm_Param* gather_param = ( struct instancenorm_Param* )ir_node->op.param_mem; + struct instancenorm_Param* gather_param = (struct instancenorm_Param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_InstanceNormParam* tm_param = ( TM2_InstanceNormParam* )(mem_base + tm_op->offset_t_param); + const TM2_InstanceNormParam* tm_param = (TM2_InstanceNormParam*)(mem_base + tm_op->offset_t_param); gather_param->eps = tm_param->eps; return 0; } - int register_tm2_instancenorm_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_instancenorm_op() return 0; } - int unregister_tm2_instancenorm_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_interp.c b/source/serializer/tmfile/op/tm2_interp.c index aea2786ee..992a0c069 100644 --- a/source/serializer/tmfile/op/tm2_interp.c +++ b/source/serializer/tmfile/op/tm2_interp.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int interp_op_map(int op) { return OP_INTERP; } - static int tm2_load_interp(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct interp_param* param = ( struct interp_param* )ir_node->op.param_mem; + struct interp_param* param = (struct interp_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_InterpParam* tm_param = ( TM2_InterpParam* )(mem_base + tm_op->offset_t_param); + const TM2_InterpParam* tm_param = (TM2_InterpParam*)(mem_base + tm_op->offset_t_param); param->resize_type = tm_param->resize_type; param->width_scale = tm_param->width_scale; @@ -58,7 +56,6 @@ static int tm2_load_interp(struct graph* ir_graph, struct node* ir_node, const T return 0; } - int register_tm2_interp_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -74,7 +71,6 @@ int register_tm2_interp_op() return 0; } - int unregister_tm2_interp_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_l2normalization.c b/source/serializer/tmfile/op/tm2_l2normalization.c index 94c777c12..52727ffd4 100644 --- a/source/serializer/tmfile/op/tm2_l2normalization.c +++ b/source/serializer/tmfile/op/tm2_l2normalization.c @@ -32,19 +32,16 @@ #include "device/device.h" #include "utility/log.h" - static int l2normalization_op_map(int op) { return OP_L2NORMALIZATION; } - static int tm2_load_l2normalization(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_l2normalization_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -60,7 +57,6 @@ int register_tm2_l2normalization_op() return 0; } - int unregister_tm2_l2normalization_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_l2pool.c b/source/serializer/tmfile/op/tm2_l2pool.c index 1ff98e4e4..e569383f3 100644 --- a/source/serializer/tmfile/op/tm2_l2pool.c +++ b/source/serializer/tmfile/op/tm2_l2pool.c @@ -34,15 +34,13 @@ #include "device/device.h" #include "utility/log.h" - static int l2pool_op_map(int op) { return OP_L2POOL; } - static int tm2_load_l2pool(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, - const TM2_Operator* tm_op) + const TM2_Operator* tm_op) { struct l2pool_param* l2pool_param = (struct l2pool_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; @@ -56,7 +54,6 @@ static int tm2_load_l2pool(struct graph* ir_graph, struct node* ir_node, const T return 0; } - int register_tm2_l2pool_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -72,7 +69,6 @@ int register_tm2_l2pool_op() return 0; } - int unregister_tm2_l2pool_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_logical.c b/source/serializer/tmfile/op/tm2_logical.c index d6fd3079f..ad2b77f59 100644 --- a/source/serializer/tmfile/op/tm2_logical.c +++ b/source/serializer/tmfile/op/tm2_logical.c @@ -34,27 +34,24 @@ #include "device/device.h" #include "utility/log.h" - static int logical_op_map(int op) { return OP_LOGICAL; } - static int tm2_load_logical(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct logical_param* logical_param = ( struct logical_param* )ir_node->op.param_mem; + struct logical_param* logical_param = (struct logical_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_LogicalParam* tm_param = ( TM2_LogicalParam* )(mem_base + tm_op->offset_t_param); + const TM2_LogicalParam* tm_param = (TM2_LogicalParam*)(mem_base + tm_op->offset_t_param); logical_param->type = tm_param->type; return 0; } - int register_tm2_logical_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_logical_op() return 0; } - int unregister_tm2_logical_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_logistic.c b/source/serializer/tmfile/op/tm2_logistic.c index b1a585d20..b5a815db4 100644 --- a/source/serializer/tmfile/op/tm2_logistic.c +++ b/source/serializer/tmfile/op/tm2_logistic.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int logistic_op_map(int op) { return OP_LOGISTIC; } - static int tm2_load_logistic(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_logistic_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_logistic_op() return 0; } - int unregister_tm2_logistic_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_logsoftmax.c b/source/serializer/tmfile/op/tm2_logsoftmax.c index 0dcc23c2b..afc0d0303 100644 --- a/source/serializer/tmfile/op/tm2_logsoftmax.c +++ b/source/serializer/tmfile/op/tm2_logsoftmax.c @@ -34,13 +34,11 @@ #include "device/device.h" #include "utility/log.h" - static int logsoftmax_op_map(int op) { return OP_LOGSOFTMAX; } - static int tm2_load_logsoftmax(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { struct logsoftmax_param* logsoftmax_param = (struct logsoftmax_param*)ir_node->op.param_mem; @@ -53,7 +51,6 @@ static int tm2_load_logsoftmax(struct graph* ir_graph, struct node* ir_node, con return 0; } - int register_tm2_logsoftmax_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -69,7 +66,6 @@ int register_tm2_logsoftmax_op() return 0; } - int unregister_tm2_logsoftmax_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_lrn.c b/source/serializer/tmfile/op/tm2_lrn.c index 9a8a463a3..f5b536ac7 100644 --- a/source/serializer/tmfile/op/tm2_lrn.c +++ b/source/serializer/tmfile/op/tm2_lrn.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int lrn_op_map(int op) { return OP_LRN; } - static int tm2_load_lrn(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct lrn_param* lrn_param = ( struct lrn_param* )ir_node->op.param_mem; + struct lrn_param* lrn_param = (struct lrn_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_LRNParam* tm_param = ( TM2_LRNParam* )(mem_base + tm_op->offset_t_param); + const TM2_LRNParam* tm_param = (TM2_LRNParam*)(mem_base + tm_op->offset_t_param); lrn_param->local_size = tm_param->local_size; lrn_param->alpha = tm_param->alpha; @@ -58,7 +56,6 @@ static int tm2_load_lrn(struct graph* ir_graph, struct node* ir_node, const TM2_ return 0; } - int register_tm2_lrn_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -74,7 +71,6 @@ int register_tm2_lrn_op() return 0; } - int unregister_tm2_lrn_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_lstm.c b/source/serializer/tmfile/op/tm2_lstm.c index f8143802a..cdd982998 100644 --- a/source/serializer/tmfile/op/tm2_lstm.c +++ b/source/serializer/tmfile/op/tm2_lstm.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int lstm_op_map(int op) { return OP_LSTM; } - static int tm2_load_lstm(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct lstm_param* lstm_param = ( struct lstm_param* )ir_node->op.param_mem; + struct lstm_param* lstm_param = (struct lstm_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_LstmParam* tm_param = ( TM2_LstmParam* )(mem_base + tm_op->offset_t_param); + const TM2_LstmParam* tm_param = (TM2_LstmParam*)(mem_base + tm_op->offset_t_param); lstm_param->forget_bias = tm_param->forget_bias; lstm_param->clip = tm_param->clip; @@ -71,7 +69,6 @@ static int tm2_load_lstm(struct graph* ir_graph, struct node* ir_node, const TM2 return 0; } - int register_tm2_lstm_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -87,7 +84,6 @@ int register_tm2_lstm_op() return 0; } - int unregister_tm2_lstm_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_matmul.c b/source/serializer/tmfile/op/tm2_matmul.c index 3ccb1221d..11efabd7e 100644 --- a/source/serializer/tmfile/op/tm2_matmul.c +++ b/source/serializer/tmfile/op/tm2_matmul.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int matmul_op_map(int op) { return OP_MATMUL; } - static int tm2_load_matmul(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_matmul_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_matmul_op() return 0; } - int unregister_tm2_matmul_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_maximum.c b/source/serializer/tmfile/op/tm2_maximum.c index 4319e73f3..ca19f22e9 100644 --- a/source/serializer/tmfile/op/tm2_maximum.c +++ b/source/serializer/tmfile/op/tm2_maximum.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int max_op_map(int op) { return OP_MAXIMUM; } - static int tm2_load_max(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, - const TM2_Operator* tm_op) + const TM2_Operator* tm_op) { return 0; } - int register_tm2_maximum_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_maximum_op() return 0; } - int unregister_tm2_maximum_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_mean.c b/source/serializer/tmfile/op/tm2_mean.c index d6e7888d0..9fa66927c 100644 --- a/source/serializer/tmfile/op/tm2_mean.c +++ b/source/serializer/tmfile/op/tm2_mean.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int mean_op_map(int op) { return OP_MEAN; } - static int tm2_load_mean(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_mean_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_mean_op() return 0; } - int unregister_tm2_mean_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_mish.c b/source/serializer/tmfile/op/tm2_mish.c index 986169940..a3a46d932 100644 --- a/source/serializer/tmfile/op/tm2_mish.c +++ b/source/serializer/tmfile/op/tm2_mish.c @@ -32,19 +32,16 @@ #include "device/device.h" #include "utility/log.h" - static int mish_op_map(int op) { return OP_MISH; } - static int tm2_load_mish(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_mish_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -60,7 +57,6 @@ int register_tm2_mish_op() return 0; } - int unregister_tm2_mish_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_mvn.c b/source/serializer/tmfile/op/tm2_mvn.c index 49383d5b0..4d8435dad 100644 --- a/source/serializer/tmfile/op/tm2_mvn.c +++ b/source/serializer/tmfile/op/tm2_mvn.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int mvn_op_map(int op) { return OP_MVN; } - static int tm2_load_mvn(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct mvn_param* gather_param = ( struct mvn_param* )ir_node->op.param_mem; + struct mvn_param* gather_param = (struct mvn_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_MVNParam* tm_param = ( TM2_MVNParam* )(mem_base + tm_op->offset_t_param); + const TM2_MVNParam* tm_param = (TM2_MVNParam*)(mem_base + tm_op->offset_t_param); gather_param->across_channels = tm_param->across_channels; gather_param->eps = tm_param->eps; @@ -56,7 +54,6 @@ static int tm2_load_mvn(struct graph* ir_graph, struct node* ir_node, const TM2_ return 0; } - int register_tm2_mvn_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -72,7 +69,6 @@ int register_tm2_mvn_op() return 0; } - int unregister_tm2_mvn_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_noop.c b/source/serializer/tmfile/op/tm2_noop.c index 342ee76f6..e033835e3 100644 --- a/source/serializer/tmfile/op/tm2_noop.c +++ b/source/serializer/tmfile/op/tm2_noop.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int noop_op_map(int op) { return OP_NOOP; } - static int tm2_load_noop(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_noop_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_noop_op() return 0; } - int unregister_tm2_noop_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_normalize.c b/source/serializer/tmfile/op/tm2_normalize.c index 349052c69..a533212ed 100644 --- a/source/serializer/tmfile/op/tm2_normalize.c +++ b/source/serializer/tmfile/op/tm2_normalize.c @@ -34,19 +34,17 @@ #include "device/device.h" #include "utility/log.h" - static int normalize_op_map(int op) { return OP_NORMALIZE; } - static int tm2_load_normalize(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct normalize_param* normalize_param = ( struct normalize_param* )ir_node->op.param_mem; + struct normalize_param* normalize_param = (struct normalize_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_NormalizeParam* tm_param = ( TM2_NormalizeParam* )(mem_base + tm_op->offset_t_param); + const TM2_NormalizeParam* tm_param = (TM2_NormalizeParam*)(mem_base + tm_op->offset_t_param); normalize_param->across_spatial = tm_param->across_spatial; normalize_param->channel_shared = tm_param->channel_shared; @@ -54,7 +52,6 @@ static int tm2_load_normalize(struct graph* ir_graph, struct node* ir_node, cons return 0; } - int register_tm2_normalize_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_normalize_op() return 0; } - int unregister_tm2_normalize_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_pad.c b/source/serializer/tmfile/op/tm2_pad.c index 6bcf48d39..1c6b36ce6 100644 --- a/source/serializer/tmfile/op/tm2_pad.c +++ b/source/serializer/tmfile/op/tm2_pad.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int pad_op_map(int op) { return OP_PAD; } - static int tm2_load_pad(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct pad_param* pad_param = ( struct pad_param* )ir_node->op.param_mem; + struct pad_param* pad_param = (struct pad_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_PadParam* tm_param = ( TM2_PadParam* )(mem_base + tm_op->offset_t_param); + const TM2_PadParam* tm_param = (TM2_PadParam*)(mem_base + tm_op->offset_t_param); pad_param->mode = tm_param->mode; pad_param->value = tm_param->value; @@ -63,7 +61,6 @@ static int tm2_load_pad(struct graph* ir_graph, struct node* ir_node, const TM2_ return 0; } - int register_tm2_pad_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -79,7 +76,6 @@ int register_tm2_pad_op() return 0; } - int unregister_tm2_pad_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_permute.c b/source/serializer/tmfile/op/tm2_permute.c index 7ed874ffe..d51fd8681 100644 --- a/source/serializer/tmfile/op/tm2_permute.c +++ b/source/serializer/tmfile/op/tm2_permute.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int permute_op_map(int op) { return OP_PERMUTE; } - static int tm2_load_permute(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct permute_param* permute_param = ( struct permute_param* )ir_node->op.param_mem; + struct permute_param* permute_param = (struct permute_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_PermuteParam* tm_param = ( TM2_PermuteParam* )(mem_base + tm_op->offset_t_param); + const TM2_PermuteParam* tm_param = (TM2_PermuteParam*)(mem_base + tm_op->offset_t_param); permute_param->flag = tm_param->flag; permute_param->order0 = tm_param->order0; @@ -58,7 +56,6 @@ static int tm2_load_permute(struct graph* ir_graph, struct node* ir_node, const return 0; } - int register_tm2_permute_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -74,7 +71,6 @@ int register_tm2_permute_op() return 0; } - int unregister_tm2_permute_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_pool.c b/source/serializer/tmfile/op/tm2_pool.c index 572bf5cf6..fb08f6b2b 100644 --- a/source/serializer/tmfile/op/tm2_pool.c +++ b/source/serializer/tmfile/op/tm2_pool.c @@ -34,7 +34,6 @@ #include "device/device.h" #include "utility/log.h" - static int pooling_op_map(int op) { return OP_POOL; @@ -42,10 +41,10 @@ static int pooling_op_map(int op) static int tm2_load_pooling(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem; + struct pool_param* pool_param = (struct pool_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_PoolParam* tm_param = ( TM2_PoolParam* )(mem_base + tm_op->offset_t_param); + const TM2_PoolParam* tm_param = (TM2_PoolParam*)(mem_base + tm_op->offset_t_param); pool_param->kernel_h = tm_param->kernel_h; pool_param->kernel_w = tm_param->kernel_w; @@ -69,7 +68,6 @@ static int tm2_load_pooling(struct graph* ir_graph, struct node* ir_node, const return 0; } - int register_tm2_pool_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -85,7 +83,6 @@ int register_tm2_pool_op() return 0; } - int unregister_tm2_pool_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_prelu.c b/source/serializer/tmfile/op/tm2_prelu.c index c7c934e43..a43223b37 100644 --- a/source/serializer/tmfile/op/tm2_prelu.c +++ b/source/serializer/tmfile/op/tm2_prelu.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int prelu_op_map(int op) { return OP_PRELU; } - static int tm2_load_prelu(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_prelu_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_prelu_op() return 0; } - int unregister_tm2_prelu_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_priorbox.c b/source/serializer/tmfile/op/tm2_priorbox.c index 6328f2bd5..76e21f261 100644 --- a/source/serializer/tmfile/op/tm2_priorbox.c +++ b/source/serializer/tmfile/op/tm2_priorbox.c @@ -35,13 +35,11 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int priorbox_op_map(int op) { return OP_PRIORBOX; } - static int tm2_load_priorbox(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { struct priorbox_param* priorbox_param = (struct priorbox_param*)ir_node->op.param_mem; @@ -86,10 +84,8 @@ static int tm2_load_priorbox(struct graph* ir_graph, struct node* ir_node, const return 0; } - // TODO: add unload op - int register_tm2_priorbox_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -105,7 +101,6 @@ int register_tm2_priorbox_op() return 0; } - int unregister_tm2_priorbox_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_psroipooling.c b/source/serializer/tmfile/op/tm2_psroipooling.c index ed86af268..6548e8319 100644 --- a/source/serializer/tmfile/op/tm2_psroipooling.c +++ b/source/serializer/tmfile/op/tm2_psroipooling.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int psroipooling_op_map(int op) { return OP_PSROIPOOLING; } - static int tm2_load_psroipooling(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct psroipooling_param* psroipooling_param = ( struct psroipooling_param* )ir_node->op.param_mem; + struct psroipooling_param* psroipooling_param = (struct psroipooling_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_PsroipoolingParam* tm_param = ( TM2_PsroipoolingParam* )(mem_base + tm_op->offset_t_param); + const TM2_PsroipoolingParam* tm_param = (TM2_PsroipoolingParam*)(mem_base + tm_op->offset_t_param); psroipooling_param->pooled_w = tm_param->pooled_w; psroipooling_param->pooled_h = tm_param->pooled_h; @@ -57,7 +55,6 @@ static int tm2_load_psroipooling(struct graph* ir_graph, struct node* ir_node, c return 0; } - int register_tm2_psroipooling_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -73,7 +70,6 @@ int register_tm2_psroipooling_op() return 0; } - int unregister_tm2_psroipooling_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_reciprocal.c b/source/serializer/tmfile/op/tm2_reciprocal.c index ff3da06df..384d70406 100644 --- a/source/serializer/tmfile/op/tm2_reciprocal.c +++ b/source/serializer/tmfile/op/tm2_reciprocal.c @@ -38,7 +38,7 @@ static int reciprocal_op_map(int op) } static int tm2_load_reciprocal(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, - const TM2_Operator* tm_op) + const TM2_Operator* tm_op) { return 0; } diff --git a/source/serializer/tmfile/op/tm2_reducel2.c b/source/serializer/tmfile/op/tm2_reducel2.c index 5c75bf27c..942780629 100644 --- a/source/serializer/tmfile/op/tm2_reducel2.c +++ b/source/serializer/tmfile/op/tm2_reducel2.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int reducel2_op_map(int op) { return OP_REDUCEL2; } - static int tm2_load_reducel2(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct reducel2_param* reducel2_param = ( struct reducel2_param* )ir_node->op.param_mem; + struct reducel2_param* reducel2_param = (struct reducel2_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ReduceL2Param* tm_param = ( TM2_ReduceL2Param* )(mem_base + tm_op->offset_t_param); + const TM2_ReduceL2Param* tm_param = (TM2_ReduceL2Param*)(mem_base + tm_op->offset_t_param); reducel2_param->axis = tm_param->axis; reducel2_param->keepdim = tm_param->keepdim; @@ -55,7 +53,6 @@ static int tm2_load_reducel2(struct graph* ir_graph, struct node* ir_node, const return 0; } - int register_tm2_reducel2_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -71,7 +68,6 @@ int register_tm2_reducel2_op() return 0; } - int unregister_tm2_reducel2_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_reduction.c b/source/serializer/tmfile/op/tm2_reduction.c index a7d68cb67..e7b548003 100644 --- a/source/serializer/tmfile/op/tm2_reduction.c +++ b/source/serializer/tmfile/op/tm2_reduction.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int reduction_op_map(int op) { return OP_REDUCTION; } - static int tm2_load_reduction(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct reduction_param* reduction_param = ( struct reduction_param* )ir_node->op.param_mem; + struct reduction_param* reduction_param = (struct reduction_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ReductionParam* tm_param = ( TM2_ReductionParam* )(mem_base + tm_op->offset_t_param); + const TM2_ReductionParam* tm_param = (TM2_ReductionParam*)(mem_base + tm_op->offset_t_param); reduction_param->dim_0 = tm_param->dim_0; reduction_param->dim_1 = tm_param->dim_1; @@ -59,7 +57,6 @@ static int tm2_load_reduction(struct graph* ir_graph, struct node* ir_node, cons return 0; } - int register_tm2_reduction_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -75,7 +72,6 @@ int register_tm2_reduction_op() return 0; } - int unregister_tm2_reduction_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_region.c b/source/serializer/tmfile/op/tm2_region.c index 0effa5d3b..15d55646e 100644 --- a/source/serializer/tmfile/op/tm2_region.c +++ b/source/serializer/tmfile/op/tm2_region.c @@ -35,20 +35,18 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int region_op_map(int op) { return OP_REGION; } - static int tm2_load_region(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct region_param* region_param = ( struct region_param* )ir_node->op.param_mem; + struct region_param* region_param = (struct region_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_RegionParam* tm_param = ( TM2_RegionParam* )(mem_base + tm_op->offset_t_param); + const TM2_RegionParam* tm_param = (TM2_RegionParam*)(mem_base + tm_op->offset_t_param); const TM2_Vector_floats* v_biases = (TM2_Vector_floats*)(mem_base + tm_param->offset_vf_biases); region_param->num_classes = tm_param->num_classes; @@ -67,7 +65,6 @@ static int tm2_load_region(struct graph* ir_graph, struct node* ir_node, const T return 0; } - int register_tm2_region_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -83,7 +80,6 @@ int register_tm2_region_op() return 0; } - int unregister_tm2_region_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_relu.c b/source/serializer/tmfile/op/tm2_relu.c index 22282f3f9..d6fb24a57 100644 --- a/source/serializer/tmfile/op/tm2_relu.c +++ b/source/serializer/tmfile/op/tm2_relu.c @@ -34,27 +34,24 @@ #include "device/device.h" #include "utility/log.h" - static int relu_op_map(int op) { return OP_RELU; } - static int tm2_load_relu(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct relu_param* relu_param = ( struct relu_param* )ir_node->op.param_mem; + struct relu_param* relu_param = (struct relu_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ReLuParam* tm_param = ( TM2_ReLuParam* )(mem_base + tm_op->offset_t_param); + const TM2_ReLuParam* tm_param = (TM2_ReLuParam*)(mem_base + tm_op->offset_t_param); relu_param->negative_slope = tm_param->negative_slope; return 0; } - int register_tm2_relu_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_relu_op() return 0; } - int unregister_tm2_relu_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_relu1.c b/source/serializer/tmfile/op/tm2_relu1.c index cba2ea20c..732cfc8e2 100644 --- a/source/serializer/tmfile/op/tm2_relu1.c +++ b/source/serializer/tmfile/op/tm2_relu1.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int relu1_op_map(int op) { return OP_RELU1; } - static int tm2_load_relu1(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_relu1_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_relu1_op() return 0; } - int unregister_tm2_relu1_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_relu6.c b/source/serializer/tmfile/op/tm2_relu6.c index 46686be27..74faff826 100644 --- a/source/serializer/tmfile/op/tm2_relu6.c +++ b/source/serializer/tmfile/op/tm2_relu6.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int relu6_op_map(int op) { return OP_RELU6; } - static int tm2_load_relu6(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_relu6_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_relu6_op() return 0; } - int unregister_tm2_relu6_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_reorg.c b/source/serializer/tmfile/op/tm2_reorg.c index 02e801945..c28667bdc 100644 --- a/source/serializer/tmfile/op/tm2_reorg.c +++ b/source/serializer/tmfile/op/tm2_reorg.c @@ -34,26 +34,23 @@ #include "device/device.h" #include "utility/log.h" - static int reorg_op_map(int op) { return OP_REORG; } - static int tm2_load_reorg(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct reorg_param* reorg_param = ( struct reorg_param* )ir_node->op.param_mem; + struct reorg_param* reorg_param = (struct reorg_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ReorgParam* tm_param = ( TM2_ReorgParam* )(mem_base + tm_op->offset_t_param); + const TM2_ReorgParam* tm_param = (TM2_ReorgParam*)(mem_base + tm_op->offset_t_param); reorg_param->stride = tm_param->stride; return 0; } - int register_tm2_reorg_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -69,7 +66,6 @@ int register_tm2_reorg_op() return 0; } - int unregister_tm2_reorg_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_reshape.c b/source/serializer/tmfile/op/tm2_reshape.c index e890cfad3..4be1bfe08 100644 --- a/source/serializer/tmfile/op/tm2_reshape.c +++ b/source/serializer/tmfile/op/tm2_reshape.c @@ -35,20 +35,18 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int reshape_op_map(int op) { return OP_RESHAPE; } - static int tm2_load_reshape(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct reshape_param* param = ( struct reshape_param* )ir_node->op.param_mem; + struct reshape_param* param = (struct reshape_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ReshapeParam* tm_param = ( TM2_ReshapeParam* )(mem_base + tm_op->offset_t_param); + const TM2_ReshapeParam* tm_param = (TM2_ReshapeParam*)(mem_base + tm_op->offset_t_param); // set the reverse if (tm_param->reverse) param->reverse = true; @@ -62,10 +60,10 @@ static int tm2_load_reshape(struct graph* ir_graph, struct node* ir_node, const if (tm_param->offset_re_shape != TM2_NOT_SET) { - const TM2_Vector_dims* v_re_shape = ( TM2_Vector_dims* )(mem_base + tm_param->offset_re_shape); + const TM2_Vector_dims* v_re_shape = (TM2_Vector_dims*)(mem_base + tm_param->offset_re_shape); param->dim_size = v_re_shape->v_num; - param->re_shape = ( int* )sys_malloc(v_re_shape->v_num * sizeof(int)); + param->re_shape = (int*)sys_malloc(v_re_shape->v_num * sizeof(int)); for (unsigned int i = 0; i < v_re_shape->v_num; i++) { @@ -76,7 +74,6 @@ static int tm2_load_reshape(struct graph* ir_graph, struct node* ir_node, const return 0; } - int register_tm2_reshape_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -92,7 +89,6 @@ int register_tm2_reshape_op() return 0; } - int unregister_tm2_reshape_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_resize.c b/source/serializer/tmfile/op/tm2_resize.c index 4e7a2fe1f..75fb2c9b6 100644 --- a/source/serializer/tmfile/op/tm2_resize.c +++ b/source/serializer/tmfile/op/tm2_resize.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int resize_op_map(int op) { return OP_RESIZE; } - static int tm2_load_resize(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct resize_param* resize_param = ( struct resize_param* )ir_node->op.param_mem; + struct resize_param* resize_param = (struct resize_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ResizeParam* tm_param = ( TM2_ResizeParam* )(mem_base + tm_op->offset_t_param); + const TM2_ResizeParam* tm_param = (TM2_ResizeParam*)(mem_base + tm_op->offset_t_param); resize_param->scale_h = tm_param->scale_x; resize_param->scale_w = tm_param->scale_y; @@ -55,7 +53,6 @@ static int tm2_load_resize(struct graph* ir_graph, struct node* ir_node, const T return 0; } - int register_tm2_resize_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -71,7 +68,6 @@ int register_tm2_resize_op() return 0; } - int unregister_tm2_resize_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_reverse.c b/source/serializer/tmfile/op/tm2_reverse.c index 6107d44e1..ec37b4fd1 100644 --- a/source/serializer/tmfile/op/tm2_reverse.c +++ b/source/serializer/tmfile/op/tm2_reverse.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int reverse_op_map(int op) { return OP_REVERSE; } - static int tm2_load_reverse(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_reverse_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_reverse_op() return 0; } - int unregister_tm2_reverse_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_rnn.c b/source/serializer/tmfile/op/tm2_rnn.c index 82ee818ca..0c8a94c49 100644 --- a/source/serializer/tmfile/op/tm2_rnn.c +++ b/source/serializer/tmfile/op/tm2_rnn.c @@ -1,4 +1,4 @@ - /* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int rnn_op_map(int op) { return OP_RNN; } - static int tm2_load_rnn(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, - const TM2_Operator* tm_op) + const TM2_Operator* tm_op) { - struct rnn_param* rnn_param = (struct rnn_param* )ir_node->op.param_mem; - const struct tm2_priv* tm2_priv = (struct tm2_priv* )ir_graph->serializer_privacy; + struct rnn_param* rnn_param = (struct rnn_param*)ir_node->op.param_mem; + const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_RnnParam* tm_param = (TM2_RnnParam* )(mem_base + tm_op->offset_t_param); + const TM2_RnnParam* tm_param = (TM2_RnnParam*)(mem_base + tm_op->offset_t_param); rnn_param->clip = tm_param->clip; rnn_param->output_len = tm_param->output_len; @@ -62,12 +60,11 @@ static int tm2_load_rnn(struct graph* ir_graph, struct node* ir_node, const TM2_ return 0; } - int register_tm2_rnn_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); - if(tm2_s == NULL) + if (tm2_s == NULL) { TLOG_ERR("tengine serializer has not been registered yet\n"); return -1; @@ -78,7 +75,6 @@ int register_tm2_rnn_op() return 0; } - int unregister_tm2_rnn_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_roialign.c b/source/serializer/tmfile/op/tm2_roialign.c index 746436fa9..44626fa7d 100644 --- a/source/serializer/tmfile/op/tm2_roialign.c +++ b/source/serializer/tmfile/op/tm2_roialign.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int roialign_op_map(int op) { return OP_ROIALIGN; } - static int tm2_load_roialign(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct roialign_param* roialign_param = ( struct roialign_param* )ir_node->op.param_mem; + struct roialign_param* roialign_param = (struct roialign_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_RoialignParam* tm_param = ( TM2_RoialignParam* )(mem_base + tm_op->offset_t_param); + const TM2_RoialignParam* tm_param = (TM2_RoialignParam*)(mem_base + tm_op->offset_t_param); roialign_param->pooled_width = tm_param->pooled_width; roialign_param->pooled_height = tm_param->pooled_height; @@ -56,7 +54,6 @@ static int tm2_load_roialign(struct graph* ir_graph, struct node* ir_node, const return 0; } - int register_tm2_roialign_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -72,7 +69,6 @@ int register_tm2_roialign_op() return 0; } - int unregister_tm2_roialign_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_roipooling.c b/source/serializer/tmfile/op/tm2_roipooling.c index 53d8ec007..b1d617f98 100644 --- a/source/serializer/tmfile/op/tm2_roipooling.c +++ b/source/serializer/tmfile/op/tm2_roipooling.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int roi_pooling_op_map(int op) { return OP_ROIPOOLING; } - static int tm2_load_roi_pooling(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct roipooling_param* roi_pooling_param = ( struct roipooling_param* )ir_node->op.param_mem; + struct roipooling_param* roi_pooling_param = (struct roipooling_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ROIPoolingParam* tm_param = ( TM2_ROIPoolingParam* )(mem_base + tm_op->offset_t_param); + const TM2_ROIPoolingParam* tm_param = (TM2_ROIPoolingParam*)(mem_base + tm_op->offset_t_param); roi_pooling_param->pooled_h = tm_param->pooled_h; roi_pooling_param->pooled_w = tm_param->pooled_w; @@ -56,7 +54,6 @@ static int tm2_load_roi_pooling(struct graph* ir_graph, struct node* ir_node, co return 0; } - int register_tm2_roipooling_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -72,7 +69,6 @@ int register_tm2_roipooling_op() return 0; } - int unregister_tm2_roipooling_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_round.c b/source/serializer/tmfile/op/tm2_round.c index 757430933..f7bcebc58 100644 --- a/source/serializer/tmfile/op/tm2_round.c +++ b/source/serializer/tmfile/op/tm2_round.c @@ -32,24 +32,21 @@ #include "device/device.h" #include "utility/log.h" - static int round_op_map(int op) { return OP_ROUND; } - static int tm2_load_round(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_round_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); - if(tm2_s == NULL) + if (tm2_s == NULL) { TLOG_ERR("tengine serializer has not been registered yet\n"); return -1; @@ -60,7 +57,6 @@ int register_tm2_round_op() return 0; } - int unregister_tm2_round_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_rpn.c b/source/serializer/tmfile/op/tm2_rpn.c index e43ec3f3b..a662076b9 100644 --- a/source/serializer/tmfile/op/tm2_rpn.c +++ b/source/serializer/tmfile/op/tm2_rpn.c @@ -35,20 +35,18 @@ #include "utility/vector.h" #include "utility/log.h" - static int rpn_op_map(int op) { return OP_RPN; } - static int tm2_load_rpn(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct rpn_param* rpn_param = ( struct rpn_param* )ir_node->op.param_mem; + struct rpn_param* rpn_param = (struct rpn_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_RPNParam* tm_param = ( TM2_RPNParam* )(mem_base + tm_op->offset_t_param); + const TM2_RPNParam* tm_param = (TM2_RPNParam*)(mem_base + tm_op->offset_t_param); rpn_param->basesize = tm_param->basesize; rpn_param->feat_stride = tm_param->feat_stride; @@ -65,7 +63,7 @@ static int tm2_load_rpn(struct graph* ir_graph, struct node* ir_node, const TM2_ for (unsigned int i = 0; i < v_anchor_scales->v_num; i++) { - push_vector_data(rpn_param->anchor_scales, ( void* )&v_anchor_scales->data[i]); + push_vector_data(rpn_param->anchor_scales, (void*)&v_anchor_scales->data[i]); } } @@ -77,14 +75,13 @@ static int tm2_load_rpn(struct graph* ir_graph, struct node* ir_node, const TM2_ for (unsigned int i = 0; i < v_ratios->v_num; i++) { - push_vector_data(rpn_param->ratios, ( void* )&v_ratios->data[i]); + push_vector_data(rpn_param->ratios, (void*)&v_ratios->data[i]); } } return 0; } - int register_tm2_rpn_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -100,7 +97,6 @@ int register_tm2_rpn_op() return 0; } - int unregister_tm2_rpn_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_scale.c b/source/serializer/tmfile/op/tm2_scale.c index 287a5001e..00d11b25f 100644 --- a/source/serializer/tmfile/op/tm2_scale.c +++ b/source/serializer/tmfile/op/tm2_scale.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int scale_op_map(int op) { return OP_SCALE; } - static int tm2_load_scale(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct scale_param* scale_param = ( struct scale_param* )ir_node->op.param_mem; + struct scale_param* scale_param = (struct scale_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ScaleParam* tm_param = ( TM2_ScaleParam* )(mem_base + tm_op->offset_t_param); + const TM2_ScaleParam* tm_param = (TM2_ScaleParam*)(mem_base + tm_op->offset_t_param); scale_param->axis = tm_param->axis; scale_param->num_axes = tm_param->num_axes; @@ -56,7 +54,6 @@ static int tm2_load_scale(struct graph* ir_graph, struct node* ir_node, const TM return 0; } - int register_tm2_scale_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -72,7 +69,6 @@ int register_tm2_scale_op() return 0; } - int unregister_tm2_scale_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_scatter.c b/source/serializer/tmfile/op/tm2_scatter.c index a4e416c19..a7add086b 100644 --- a/source/serializer/tmfile/op/tm2_scatter.c +++ b/source/serializer/tmfile/op/tm2_scatter.c @@ -34,15 +34,13 @@ #include "device/device.h" #include "utility/log.h" - static int scatter_op_map(int op) { return OP_SCATTER; } - static int tm2_load_scatter(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, - const TM2_Operator* tm_op) + const TM2_Operator* tm_op) { struct scatter_param* scatter_param = (struct scatter_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; @@ -54,7 +52,6 @@ static int tm2_load_scatter(struct graph* ir_graph, struct node* ir_node, const return 0; } - int register_tm2_scatter_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_scatter_op() return 0; } - int unregister_tm2_scatter_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_selu.c b/source/serializer/tmfile/op/tm2_selu.c index 34f1e5f4a..daae951da 100644 --- a/source/serializer/tmfile/op/tm2_selu.c +++ b/source/serializer/tmfile/op/tm2_selu.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int selu_op_map(int op) { return OP_SELU; } - static int tm2_load_selu(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct selu_param* selu_param = ( struct selu_param* )ir_node->op.param_mem; + struct selu_param* selu_param = (struct selu_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_SeluParam* tm_param = ( TM2_SeluParam* )(mem_base + tm_op->offset_t_param); + const TM2_SeluParam* tm_param = (TM2_SeluParam*)(mem_base + tm_op->offset_t_param); selu_param->alpha = tm_param->alpha; selu_param->lambda = tm_param->lambda; @@ -55,7 +53,6 @@ static int tm2_load_selu(struct graph* ir_graph, struct node* ir_node, const TM2 return 0; } - int register_tm2_selu_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -71,7 +68,6 @@ int register_tm2_selu_op() return 0; } - int unregister_tm2_selu_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_shape.c b/source/serializer/tmfile/op/tm2_shape.c index fc8394679..ae2770821 100644 --- a/source/serializer/tmfile/op/tm2_shape.c +++ b/source/serializer/tmfile/op/tm2_shape.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int shape_op_map(int op) { return OP_SHAPE; } - static int tm2_load_shape(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_shape_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_shape_op() return 0; } - int unregister_tm2_shape_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_shuffle_channel.c b/source/serializer/tmfile/op/tm2_shuffle_channel.c index f4e3086f4..b2323e3b7 100644 --- a/source/serializer/tmfile/op/tm2_shuffle_channel.c +++ b/source/serializer/tmfile/op/tm2_shuffle_channel.c @@ -34,27 +34,24 @@ #include "device/device.h" #include "utility/log.h" - static int shuffle_channel_op_map(int op) { return OP_SHUFFLECHANNEL; } - static int tm2_load_shuffle_channel(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct shuffle_channel_param* param = ( struct shuffle_channel_param* )ir_node->op.param_mem; + struct shuffle_channel_param* param = (struct shuffle_channel_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ShuffleChannelParam* tm_param = ( TM2_ShuffleChannelParam* )(mem_base + tm_op->offset_t_param); + const TM2_ShuffleChannelParam* tm_param = (TM2_ShuffleChannelParam*)(mem_base + tm_op->offset_t_param); param->group = tm_param->group; return 0; } - int register_tm2_shuffle_channel_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -71,7 +68,6 @@ int register_tm2_shuffle_channel_op() return 0; } - int unregister_tm2_shuffle_channel_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_sigmoid.c b/source/serializer/tmfile/op/tm2_sigmoid.c index 51709da66..6cd020db2 100644 --- a/source/serializer/tmfile/op/tm2_sigmoid.c +++ b/source/serializer/tmfile/op/tm2_sigmoid.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int sigmoid_op_map(int op) { return OP_SIGMOID; } - static int tm2_load_sigmoid(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_sigmoid_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_sigmoid_op() return 0; } - int unregister_tm2_sigmoid_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_slice.c b/source/serializer/tmfile/op/tm2_slice.c index e2c6c80a5..42a011a2c 100644 --- a/source/serializer/tmfile/op/tm2_slice.c +++ b/source/serializer/tmfile/op/tm2_slice.c @@ -35,17 +35,15 @@ #include "utility/vector.h" #include "utility/log.h" - static int slice_op_map(int op) { return OP_SLICE; } - static int tm2_load_slice(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct slice_param* slice_param = ( struct slice_param* )ir_node->op.param_mem; + struct slice_param* slice_param = (struct slice_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; const TM2_SliceParam* tm_param = (TM2_SliceParam*)(mem_base + tm_op->offset_t_param); @@ -64,35 +62,34 @@ static int tm2_load_slice(struct graph* ir_graph, struct node* ir_node, const TM if (tm_param->offset_vi_begins != TM2_NOT_SET) { - const TM2_Vector_indices* v_begins = ( TM2_Vector_indices* )(mem_base + tm_param->offset_vi_begins); + const TM2_Vector_indices* v_begins = (TM2_Vector_indices*)(mem_base + tm_param->offset_vi_begins); for (unsigned int i = 0; i < v_begins->v_num; i++) { - push_vector_data(slice_param->begin_, ( void* )&v_begins->indices[i]); + push_vector_data(slice_param->begin_, (void*)&v_begins->indices[i]); } } if (tm_param->offset_vi_sizes != TM2_NOT_SET) { - const TM2_Vector_indices* v_size = ( TM2_Vector_indices* )(mem_base + tm_param->offset_vi_sizes); + const TM2_Vector_indices* v_size = (TM2_Vector_indices*)(mem_base + tm_param->offset_vi_sizes); for (unsigned int i = 0; i < v_size->v_num; i++) { - push_vector_data(slice_param->size_, ( void* )&v_size->indices[i]); + push_vector_data(slice_param->size_, (void*)&v_size->indices[i]); } } if (tm_param->offset_vi_slice_points != TM2_NOT_SET) { - const TM2_Vector_indices* v_slice_point = ( TM2_Vector_indices* )(mem_base + tm_param->offset_vi_slice_points); + const TM2_Vector_indices* v_slice_point = (TM2_Vector_indices*)(mem_base + tm_param->offset_vi_slice_points); for (unsigned int i = 0; i < v_slice_point->v_num; i++) { - push_vector_data(slice_param->slice_point_, ( void* )&v_slice_point->indices[i]); + push_vector_data(slice_param->slice_point_, (void*)&v_slice_point->indices[i]); } } return 0; } - int register_tm2_slice_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -108,7 +105,6 @@ int register_tm2_slice_op() return 0; } - int unregister_tm2_slice_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_softmax.c b/source/serializer/tmfile/op/tm2_softmax.c index 0eb832202..9cf340b07 100644 --- a/source/serializer/tmfile/op/tm2_softmax.c +++ b/source/serializer/tmfile/op/tm2_softmax.c @@ -34,27 +34,24 @@ #include "device/device.h" #include "utility/log.h" - static int softmax_op_map(int op) { return OP_SOFTMAX; } - static int tm2_load_softmax(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct softmax_param* softmax_param = ( struct softmax_param* )ir_node->op.param_mem; + struct softmax_param* softmax_param = (struct softmax_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_SoftmaxParam* tm_param = ( TM2_SoftmaxParam* )(mem_base + tm_op->offset_t_param); + const TM2_SoftmaxParam* tm_param = (TM2_SoftmaxParam*)(mem_base + tm_op->offset_t_param); softmax_param->axis = tm_param->axis; return 0; } - int register_tm2_softmax_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_softmax_op() return 0; } - int unregister_tm2_softmax_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_softplus.c b/source/serializer/tmfile/op/tm2_softplus.c index 5e0f10f16..9f35786fa 100644 --- a/source/serializer/tmfile/op/tm2_softplus.c +++ b/source/serializer/tmfile/op/tm2_softplus.c @@ -38,7 +38,7 @@ static int softplus_op_map(int op) } static int tm2_load_softplus(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, - const TM2_Operator* tm_op) + const TM2_Operator* tm_op) { return 0; } diff --git a/source/serializer/tmfile/op/tm2_spacetobatchnd.c b/source/serializer/tmfile/op/tm2_spacetobatchnd.c index 2163d2fee..7f6f5aa3e 100644 --- a/source/serializer/tmfile/op/tm2_spacetobatchnd.c +++ b/source/serializer/tmfile/op/tm2_spacetobatchnd.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int spacetobatchnd_op_map(int op) { return OP_SPACETOBATCHND; } - static int tm2_load_spacetobatchnd(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct spacetobatchnd_param* spacetobatchnd_param = ( struct spacetobatchnd_param* )ir_node->op.param_mem; + struct spacetobatchnd_param* spacetobatchnd_param = (struct spacetobatchnd_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_SpaceToBatchNDParam* tm_param = ( TM2_SpaceToBatchNDParam* )(mem_base + tm_op->offset_t_param); + const TM2_SpaceToBatchNDParam* tm_param = (TM2_SpaceToBatchNDParam*)(mem_base + tm_op->offset_t_param); spacetobatchnd_param->dilation_x = tm_param->dilation_x; spacetobatchnd_param->dilation_y = tm_param->dilation_y; @@ -59,7 +57,6 @@ static int tm2_load_spacetobatchnd(struct graph* ir_graph, struct node* ir_node, return 0; } - int register_tm2_spacetobatchnd_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -75,7 +72,6 @@ int register_tm2_spacetobatchnd_op() return 0; } - int unregister_tm2_spacetobatchnd_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_spacetodepth.c b/source/serializer/tmfile/op/tm2_spacetodepth.c index 5eae73a50..d305df490 100644 --- a/source/serializer/tmfile/op/tm2_spacetodepth.c +++ b/source/serializer/tmfile/op/tm2_spacetodepth.c @@ -32,19 +32,16 @@ #include "device/device.h" #include "utility/log.h" - static int spacetodepth_op_map(int op) { return OP_SPACETODEPTH; } - static int tm2_load_spacetodepth(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_spacetodepth_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -60,7 +57,6 @@ int register_tm2_spacetodepth_op() return 0; } - int unregister_tm2_spacetodepth_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_sparsetodense.c b/source/serializer/tmfile/op/tm2_sparsetodense.c index 0a73bac7b..e0504c69e 100644 --- a/source/serializer/tmfile/op/tm2_sparsetodense.c +++ b/source/serializer/tmfile/op/tm2_sparsetodense.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int sparsetodense_op_map(int op) { return OP_SPARSETODENSE; } - static int tm2_load_sparsetodense(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct sparsetodense_param* sparsetodense_param = ( struct sparsetodense_param* )ir_node->op.param_mem; + struct sparsetodense_param* sparsetodense_param = (struct sparsetodense_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_SparseToDenseParam* tm_param = ( TM2_SparseToDenseParam* )(mem_base + tm_op->offset_t_param); + const TM2_SparseToDenseParam* tm_param = (TM2_SparseToDenseParam*)(mem_base + tm_op->offset_t_param); sparsetodense_param->default_value = tm_param->default_value; sparsetodense_param->output_shape_size0 = tm_param->output_shape_size0; @@ -56,7 +54,6 @@ static int tm2_load_sparsetodense(struct graph* ir_graph, struct node* ir_node, return 0; } - int register_tm2_sparsetodense_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -72,7 +69,6 @@ int register_tm2_sparsetodense_op() return 0; } - int unregister_tm2_sparsetodense_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_spatialtransformer.c b/source/serializer/tmfile/op/tm2_spatialtransformer.c index f5ba0273b..4537ddb64 100644 --- a/source/serializer/tmfile/op/tm2_spatialtransformer.c +++ b/source/serializer/tmfile/op/tm2_spatialtransformer.c @@ -35,29 +35,27 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int spatialtransformer_op_map(int op) { return OP_SPATIALTRANSFORMER; } - static int tm2_load_spatialtransformer(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, - const TM2_Operator* tm_op) + const TM2_Operator* tm_op) { - struct spatialtransformer_param* param = ( struct spatialtransformer_param* )ir_node->op.param_mem; + struct spatialtransformer_param* param = (struct spatialtransformer_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_SpatialTransformerParam* tm_param = ( TM2_SpatialTransformerParam* )(mem_base + tm_op->offset_t_param); + const TM2_SpatialTransformerParam* tm_param = (TM2_SpatialTransformerParam*)(mem_base + tm_op->offset_t_param); param->sampler_type = tm_param->sampler_type; param->transformer_type = tm_param->transformer_type; int index = 0; if (tm_param->offset_ta_shape != TM2_NOT_SET) { - const TM2_Vector_dims* v_ta_shape = ( TM2_Vector_dims* )(mem_base + tm_param->offset_ta_shape); + const TM2_Vector_dims* v_ta_shape = (TM2_Vector_dims*)(mem_base + tm_param->offset_ta_shape); - param->target_shape = ( int* )sys_malloc(v_ta_shape->v_num * sizeof(int)); + param->target_shape = (int*)sys_malloc(v_ta_shape->v_num * sizeof(int)); for (unsigned int i = 0; i < v_ta_shape->v_num; i++) { param->target_shape[i] = v_ta_shape->dims[i]; @@ -66,7 +64,6 @@ static int tm2_load_spatialtransformer(struct graph* ir_graph, struct node* ir_n return 0; } - int register_tm2_spatialtransformer_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -82,7 +79,6 @@ int register_tm2_spatialtransformer_op() return 0; } - int unregister_tm2_spatialtransformer_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_split.c b/source/serializer/tmfile/op/tm2_split.c index 850ecaebb..a96abe453 100644 --- a/source/serializer/tmfile/op/tm2_split.c +++ b/source/serializer/tmfile/op/tm2_split.c @@ -35,20 +35,18 @@ #include "utility/vector.h" #include "utility/log.h" - static int split_op_map(int op) { return OP_SPLIT; } - static int tm2_load_split(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct split_param* split_param = ( struct split_param* )ir_node->op.param_mem; + struct split_param* split_param = (struct split_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_SplitParam* tm_param = ( TM2_SplitParam* )(mem_base + tm_op->offset_t_param); + const TM2_SplitParam* tm_param = (TM2_SplitParam*)(mem_base + tm_op->offset_t_param); if (tm_param->is_caffe) split_param->is_caffe = true; @@ -67,13 +65,13 @@ static int tm2_load_split(struct graph* ir_graph, struct node* ir_node, const TM split_param->split_dim = tm_param->split_dim; if (tm_param->offset_split_sizes != TM2_NOT_SET) { - const TM2_Vector_dims* v_split_sizes = ( TM2_Vector_dims* )(mem_base + tm_param->offset_split_sizes); + const TM2_Vector_dims* v_split_sizes = (TM2_Vector_dims*)(mem_base + tm_param->offset_split_sizes); split_param->split_sizes_ = create_vector(sizeof(int), NULL); for (int i = 0; i < v_split_sizes->v_num; i++) { int dim = v_split_sizes->dims[i]; - push_vector_data(split_param->split_sizes_, ( void* )(&dim)); + push_vector_data(split_param->split_sizes_, (void*)(&dim)); } } } @@ -81,7 +79,6 @@ static int tm2_load_split(struct graph* ir_graph, struct node* ir_node, const TM return 0; } - int register_tm2_split_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -97,7 +94,6 @@ int register_tm2_split_op() return 0; } - int unregister_tm2_split_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_squareddifference.c b/source/serializer/tmfile/op/tm2_squareddifference.c index 8dbe175de..51f27c99f 100644 --- a/source/serializer/tmfile/op/tm2_squareddifference.c +++ b/source/serializer/tmfile/op/tm2_squareddifference.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int squareddifference_op_map(int op) { return OP_SQUAREDDIFFERENCE; } - static int tm2_load_squareddifference(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_squareddifference_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -62,7 +59,6 @@ int register_tm2_squareddifference_op() return 0; } - int unregister_tm2_squareddifference_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_squeeze.c b/source/serializer/tmfile/op/tm2_squeeze.c index a59f821e2..f3cec3b5b 100644 --- a/source/serializer/tmfile/op/tm2_squeeze.c +++ b/source/serializer/tmfile/op/tm2_squeeze.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int squeeze_op_map(int op) { return OP_SQUEEZE; } - static int tm2_load_squeeze(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct squeeze_param* squeeze_param = ( struct squeeze_param* )ir_node->op.param_mem; + struct squeeze_param* squeeze_param = (struct squeeze_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_SqueezeParam* tm_param = ( TM2_SqueezeParam* )(mem_base + tm_op->offset_t_param); + const TM2_SqueezeParam* tm_param = (TM2_SqueezeParam*)(mem_base + tm_op->offset_t_param); squeeze_param->dim_0 = tm_param->dim_0; squeeze_param->dim_1 = tm_param->dim_1; @@ -57,7 +55,6 @@ static int tm2_load_squeeze(struct graph* ir_graph, struct node* ir_node, const return 0; } - int register_tm2_squeeze_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -73,7 +70,6 @@ int register_tm2_squeeze_op() return 0; } - int unregister_tm2_squeeze_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_strided_slice.c b/source/serializer/tmfile/op/tm2_strided_slice.c index a11e8b8b7..4fbf03df2 100644 --- a/source/serializer/tmfile/op/tm2_strided_slice.c +++ b/source/serializer/tmfile/op/tm2_strided_slice.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int strided_slice_op_map(int op) { return OP_STRIDED_SLICE; } - static int tm2_load_strided_slice(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct strided_slice_param* strided_slice_param = ( struct strided_slice_param* )ir_node->op.param_mem; + struct strided_slice_param* strided_slice_param = (struct strided_slice_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_StridedSliceParam* tm_param = ( TM2_StridedSliceParam* )(mem_base + tm_op->offset_t_param); + const TM2_StridedSliceParam* tm_param = (TM2_StridedSliceParam*)(mem_base + tm_op->offset_t_param); strided_slice_param->begin[0] = tm_param->begin_n; strided_slice_param->begin[1] = tm_param->begin_c; @@ -65,7 +63,6 @@ static int tm2_load_strided_slice(struct graph* ir_graph, struct node* ir_node, return 0; } - int register_tm2_strided_slice_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -81,7 +78,6 @@ int register_tm2_strided_slice_op() return 0; } - int unregister_tm2_strided_slice_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_swap_axis.c b/source/serializer/tmfile/op/tm2_swap_axis.c index 8860b30cf..3332e144c 100644 --- a/source/serializer/tmfile/op/tm2_swap_axis.c +++ b/source/serializer/tmfile/op/tm2_swap_axis.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int swap_axis_op_map(int op) { return OP_SWAP_AXIS; } - static int tm2_load_swap_axis(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct swap_axis_param* swap_axis_param = ( struct swap_axis_param* )ir_node->op.param_mem; + struct swap_axis_param* swap_axis_param = (struct swap_axis_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_SwapAxisParam* tm_param = ( TM2_SwapAxisParam* )(mem_base + tm_op->offset_t_param); + const TM2_SwapAxisParam* tm_param = (TM2_SwapAxisParam*)(mem_base + tm_op->offset_t_param); swap_axis_param->dim_0 = tm_param->dim_0; swap_axis_param->dim_1 = tm_param->dim_1; @@ -55,7 +53,6 @@ static int tm2_load_swap_axis(struct graph* ir_graph, struct node* ir_node, cons return 0; } - int register_tm2_swap_axis_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -71,7 +68,6 @@ int register_tm2_swap_axis_op() return 0; } - int unregister_tm2_swap_axis_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_tanh.c b/source/serializer/tmfile/op/tm2_tanh.c index 5428ef3d6..42e4b8a8f 100644 --- a/source/serializer/tmfile/op/tm2_tanh.c +++ b/source/serializer/tmfile/op/tm2_tanh.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int tanh_op_map(int op) { return OP_TANH; } - static int tm2_load_tanh(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_tanh_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_tanh_op() return 0; } - int unregister_tm2_tanh_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_threshold.c b/source/serializer/tmfile/op/tm2_threshold.c index 7c42ca455..a24b83050 100644 --- a/source/serializer/tmfile/op/tm2_threshold.c +++ b/source/serializer/tmfile/op/tm2_threshold.c @@ -34,26 +34,23 @@ #include "device/device.h" #include "utility/log.h" - static int threshold_op_map(int op) { return OP_THRESHOLD; } - static int tm2_load_threshold(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct threshold_param* param = ( struct threshold_param* )ir_node->op.param_mem; + struct threshold_param* param = (struct threshold_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_ThresholdParam* tm_param = ( TM2_ThresholdParam* )(mem_base + tm_op->offset_t_param); + const TM2_ThresholdParam* tm_param = (TM2_ThresholdParam*)(mem_base + tm_op->offset_t_param); param->threshold = tm_param->threshold; return 0; } - int register_tm2_threshold_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -69,7 +66,6 @@ int register_tm2_threshold_op() return 0; } - int unregister_tm2_threshold_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_tile.c b/source/serializer/tmfile/op/tm2_tile.c index 84cc7ca62..a128bc4f8 100644 --- a/source/serializer/tmfile/op/tm2_tile.c +++ b/source/serializer/tmfile/op/tm2_tile.c @@ -35,13 +35,11 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int tile_op_map(int op) { return OP_TILE; } - static int tm2_load_tile(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { struct tile_param* tile_param = (struct tile_param*)ir_node->op.param_mem; @@ -51,21 +49,20 @@ static int tm2_load_tile(struct graph* ir_graph, struct node* ir_node, const TM2 tile_param->frame_flag = tm_param->frame_flag; if (tm_param->offset_reps != TM2_NOT_SET) { - const TM2_Vector_dims* v_re_shape = ( TM2_Vector_dims* )(mem_base + tm_param->offset_reps); + const TM2_Vector_dims* v_re_shape = (TM2_Vector_dims*)(mem_base + tm_param->offset_reps); tile_param->reps_size = v_re_shape->v_num; - tile_param->reps = ( int* )sys_malloc(v_re_shape->v_num * sizeof(int)); + tile_param->reps = (int*)sys_malloc(v_re_shape->v_num * sizeof(int)); for (unsigned int i = 0; i < v_re_shape->v_num; i++) { tile_param->reps[i] = v_re_shape->dims[i]; } - } + } return 0; } - int register_tm2_tile_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -81,7 +78,6 @@ int register_tm2_tile_op() return 0; } - int unregister_tm2_tile_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_topkv2.c b/source/serializer/tmfile/op/tm2_topkv2.c index c2287949f..8ae0dda27 100644 --- a/source/serializer/tmfile/op/tm2_topkv2.c +++ b/source/serializer/tmfile/op/tm2_topkv2.c @@ -34,20 +34,18 @@ #include "device/device.h" #include "utility/log.h" - static int topkv2_op_map(int op) { return OP_TOPKV2; } - static int tm2_load_topkv2(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct topkv2_param* topkv2_param = ( struct topkv2_param* )ir_node->op.param_mem; + struct topkv2_param* topkv2_param = (struct topkv2_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_TopKV2Param* tm_param = ( TM2_TopKV2Param* )(mem_base + tm_op->offset_t_param); + const TM2_TopKV2Param* tm_param = (TM2_TopKV2Param*)(mem_base + tm_op->offset_t_param); topkv2_param->k = tm_param->k; if (tm_param->sorted) @@ -58,7 +56,6 @@ static int tm2_load_topkv2(struct graph* ir_graph, struct node* ir_node, const T return 0; } - int register_tm2_topkv2_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -74,7 +71,6 @@ int register_tm2_topkv2_op() return 0; } - int unregister_tm2_topkv2_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_transpose.c b/source/serializer/tmfile/op/tm2_transpose.c index c97077317..da5fd4a7b 100644 --- a/source/serializer/tmfile/op/tm2_transpose.c +++ b/source/serializer/tmfile/op/tm2_transpose.c @@ -35,16 +35,14 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int transpose_op_map(int op) { return OP_TRANSPOSE; } - static int tm2_load_transpose(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct transpose_param* transpose_param = ( struct transpose_param* )ir_node->op.param_mem; + struct transpose_param* transpose_param = (struct transpose_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; const TM2_TransposeParam* tm_param = (TM2_TransposeParam*)(mem_base + tm_op->offset_t_param); @@ -64,7 +62,6 @@ static int tm2_load_transpose(struct graph* ir_graph, struct node* ir_node, cons return 0; } - int register_tm2_transpose_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -80,7 +77,6 @@ int register_tm2_transpose_op() return 0; } - int unregister_tm2_transpose_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_unary.c b/source/serializer/tmfile/op/tm2_unary.c index a25e0e121..aced83cf9 100644 --- a/source/serializer/tmfile/op/tm2_unary.c +++ b/source/serializer/tmfile/op/tm2_unary.c @@ -34,26 +34,23 @@ #include "device/device.h" #include "utility/log.h" - static int unary_op_map(int op) { return OP_UNARY; } - static int tm2_load_unary(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct unary_param* unary_param = ( struct unary_param* )ir_node->op.param_mem; + struct unary_param* unary_param = (struct unary_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_UnaryParam* tm_param = ( TM2_UnaryParam* )(mem_base + tm_op->offset_t_param); + const TM2_UnaryParam* tm_param = (TM2_UnaryParam*)(mem_base + tm_op->offset_t_param); unary_param->type = tm_param->type; return 0; } - int register_tm2_unary_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -69,7 +66,6 @@ int register_tm2_unary_op() return 0; } - int unregister_tm2_unary_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_unsqueeze.c b/source/serializer/tmfile/op/tm2_unsqueeze.c index 3e3fbb61f..3458bca7c 100644 --- a/source/serializer/tmfile/op/tm2_unsqueeze.c +++ b/source/serializer/tmfile/op/tm2_unsqueeze.c @@ -35,26 +35,24 @@ #include "utility/sys_port.h" #include "utility/log.h" - static int unsqueeze_op_map(int op) { return OP_UNSQUEEZE; } - static int tm2_load_unsqueeze(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct unsqueeze_param* unsqueeze_param = ( struct unsqueeze_param* )ir_node->op.param_mem; + struct unsqueeze_param* unsqueeze_param = (struct unsqueeze_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_UnsqueezeParam* tm_param = ( TM2_UnsqueezeParam* )(mem_base + tm_op->offset_t_param); + const TM2_UnsqueezeParam* tm_param = (TM2_UnsqueezeParam*)(mem_base + tm_op->offset_t_param); if (tm_param->offset_vi_axises != TM2_NOT_SET) { - const TM2_Vector_dims* v_axises = ( TM2_Vector_dims* )(mem_base + tm_param->offset_vi_axises); + const TM2_Vector_dims* v_axises = (TM2_Vector_dims*)(mem_base + tm_param->offset_vi_axises); unsqueeze_param->axises_size = v_axises->v_num; - unsqueeze_param->axises = ( int* )sys_malloc(v_axises->v_num * sizeof(int)); + unsqueeze_param->axises = (int*)sys_malloc(v_axises->v_num * sizeof(int)); for (unsigned int i = 0; i < v_axises->v_num; i++) unsqueeze_param->axises[i] = v_axises->dims[i]; } @@ -62,7 +60,6 @@ static int tm2_load_unsqueeze(struct graph* ir_graph, struct node* ir_node, cons return 0; } - int register_tm2_unsqueeze_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -78,7 +75,6 @@ int register_tm2_unsqueeze_op() return 0; } - int unregister_tm2_unsqueeze_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_upsample.c b/source/serializer/tmfile/op/tm2_upsample.c index ef1418ce3..1edb7ec4a 100644 --- a/source/serializer/tmfile/op/tm2_upsample.c +++ b/source/serializer/tmfile/op/tm2_upsample.c @@ -34,27 +34,24 @@ #include "device/device.h" #include "utility/log.h" - static int upsample_op_map(int op) { return OP_UPSAMPLE; } - static int tm2_load_upsample(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { - struct upsample_param* upsample_param = ( struct upsample_param* )ir_node->op.param_mem; + struct upsample_param* upsample_param = (struct upsample_param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; const char* mem_base = tm2_priv->base; - const TM2_UpsampleParam* tm_param = ( TM2_UpsampleParam* )(mem_base + tm_op->offset_t_param); + const TM2_UpsampleParam* tm_param = (TM2_UpsampleParam*)(mem_base + tm_op->offset_t_param); upsample_param->scale = tm_param->scale; return 0; } - int register_tm2_upsample_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -70,7 +67,6 @@ int register_tm2_upsample_op() return 0; } - int unregister_tm2_upsample_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_where.c b/source/serializer/tmfile/op/tm2_where.c index 866895086..80d34e049 100644 --- a/source/serializer/tmfile/op/tm2_where.c +++ b/source/serializer/tmfile/op/tm2_where.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int where_op_map(int op) { return OP_RELU1; } - static int tm2_load_where(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_where_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_where_op() return 0; } - int unregister_tm2_where_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/op/tm2_zeroslike.c b/source/serializer/tmfile/op/tm2_zeroslike.c index e1e735644..830ce2707 100644 --- a/source/serializer/tmfile/op/tm2_zeroslike.c +++ b/source/serializer/tmfile/op/tm2_zeroslike.c @@ -32,20 +32,17 @@ #include "device/device.h" #include "utility/log.h" - static int zeroslike_op_map(int op) { return OP_ZEROSLIKE; } - static int tm2_load_zeroslike(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op) { return 0; } - int register_tm2_zeroslike_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); @@ -61,7 +58,6 @@ int register_tm2_zeroslike_op() return 0; } - int unregister_tm2_zeroslike_op() { struct serializer* tm2_s = find_serializer_via_name("tengine"); diff --git a/source/serializer/tmfile/tm2_format.h b/source/serializer/tmfile/tm2_format.h index f303e4dee..670715f95 100644 --- a/source/serializer/tmfile/tm2_format.h +++ b/source/serializer/tmfile/tm2_format.h @@ -32,271 +32,271 @@ extern "C" { #endif -#define TM2_FILE_VER_MAIN 2 -#define TM2_FILE_VER_SUB 0 -#define TM2_FILE_VER_COMPILE 0 +#define TM2_FILE_VER_MAIN 2 +#define TM2_FILE_VER_SUB 0 +#define TM2_FILE_VER_COMPILE 0 -#define TM2_OP_VER 1 +#define TM2_OP_VER 1 -#define TM2_NOT_SET 0x00 +#define TM2_NOT_SET 0x00 /* Type define */ -typedef uint32_t tm_uoffset_t; /* offset is 4-byte unsigned integer */ -typedef uint32_t tm_size_t; /* size is 4-byte unsigned integer */ -typedef uint8_t tm_bool_t; /* bool is 1-byte unsigned integer */ +typedef uint32_t tm_uoffset_t; /* offset is 4-byte unsigned integer */ +typedef uint32_t tm_size_t; /* size is 4-byte unsigned integer */ +typedef uint8_t tm_bool_t; /* bool is 1-byte unsigned integer */ /* Operator strings */ -#define TM2_OPSTR_ACCURACY "Accuracy" -#define TM2_OPSTR_BATCHNORMALIZATION "BatchNormalization" -#define TM2_OPSTR_BILINEARRESIZE "Resize" -#define TM2_OPSTR_CONCAT "Concat" -#define TM2_OPSTR_CONST "Const" -#define TM2_OPSTR_CONVOLUTION "Convolution" -#define TM2_OPSTR_DECONVOLUTION "Deconvolution" -#define TM2_OPSTR_DETECTIONOUTPUT "DetectionOutput" -#define TM2_OPSTR_DROPOUT "Dropout" -#define TM2_OPSTR_ELTWISE "Eltwise" -#define TM2_OPSTR_FLATTEN "Flatten" -#define TM2_OPSTR_FULLYCONNECTED "FullyConnected" -#define TM2_OPSTR_INPUTOP "InputOp" -#define TM2_OPSTR_LRN "LRN" -#define TM2_OPSTR_NORMALIZE "Normalize" -#define TM2_OPSTR_PERMUTE "Permute" -#define TM2_OPSTR_POOLING "Pooling" -#define TM2_OPSTR_PRELU "PReLU" -#define TM2_OPSTR_PRIORBOX "PriorBox" -#define TM2_OPSTR_REGION "Region" -#define TM2_OPSTR_RELU "ReLu" -#define TM2_OPSTR_RELU6 "ReLu6" -#define TM2_OPSTR_REORG "Reorg" -#define TM2_OPSTR_RESHAPE "Reshape" -#define TM2_OPSTR_ROIPOOLING "ROIPooling" -#define TM2_OPSTR_RPN "RPN" -#define TM2_OPSTR_SCALE "Scale" -#define TM2_OPSTR_SLICE "Slice" -#define TM2_OPSTR_SOFTMAX "Softmax" -#define TM2_OPSTR_SPLIT "Split" -#define TM2_OPSTR_DETECTIONPOSTPROCESS "DetectionPostProcess" -#define TM2_OPSTR_GEMM "Gemm" -#define TM2_OPSTR_GENERIC "Generic" -#define TM2_OPSTR_LOGISTIC "Logistic" -#define TM2_OPSTR_LSTM "LSTM" -#define TM2_OPSTR_RNN "RNN" -#define TM2_OPSTR_TANH "Tanh" -#define TM2_OPSTR_SIGMOID "Sigmoid" -#define TM2_OPSTR_SQUEEZE "Squeeze" -#define TM2_OPSTR_PAD "Pad" -#define TM2_OPSTR_STRIDEDSLICE "StridedSlice" -#define TM2_OPSTR_REDUCTION "Reduction" -#define TM2_OPSTR_ARGMAX "ArgMax" -#define TM2_OPSTR_ARGMIN "ArgMin" -#define TM2_OPSTR_TOPKV2 "TopKV2" -#define TM2_OPSTR_MAX "Maximum" -#define TM2_OPSTR_MIN "Minimum" -#define TM2_OPSTR_ADDN "Addn" -#define TM2_OPSTR_SWAPAXIS "SwapAxis" -#define TM2_OPSTR_GRU "GRU" -#define TM2_OPSTR_FUSEDBNSCALERELU "Fused.BNScaleReLu" -#define TM2_OPSTR_UPSAMPLE "Upsample" -#define TM2_OPSTR_SHUFFLECHANNEL "ShuffleChannel" -#define TM2_OPSTR_RESIZE "Resize" -#define TM2_OPSTR_SPACETOBATCHND "SpaceToBatchND" -#define TM2_OPSTR_BATCHTOSPACEND "BatchToSpaceND" -#define TM2_OPSTR_CROP "Crop" -#define TM2_OPSTR_PSROIPOOLING "Psroipooling" -#define TM2_OPSTR_ROIALIGN "Roialign" -#define TM2_OPSTR_EXPANDDIMS "Expanddims" -#define TM2_OPSTR_UNARY "Unary" -#define TM2_OPSTR_BIAS "Bias" -#define TM2_OPSTR_NOOP "Noop" -#define TM2_OPSTR_THRESHOLD "Threshold" -#define TM2_OPSTR_HARDSIGMOID "Hardsigmoid" -#define TM2_OPSTR_EMBED "Embedding" -#define TM2_OPSTR_INSTANCENORM "InstanceNorm" -#define TM2_OPSTR_MVN "MVN" -#define TM2_OPSTR_ABSVAL "Absval" -#define TM2_OPSTR_CAST "Cast" -#define TM2_OPSTR_HARDSWISH "HardSwish" -#define TM2_OPSTR_INTERP "Interp" -#define TM2_OPSTR_SELU "Selu" -#define TM2_OPSTR_ELU "Elu" -#define TM2_OPSTR_BROADMUL "BroadMul" -#define TM2_OPSTR_LOGICAL "Logical" -#define TM2_OPSTR_GATHER "Gather" -#define TM2_OPSTR_TRANSPOSE "Transpose" -#define TM2_OPSTR_REVERSE "Reverse" -#define TM2_OPSTR_COMPARISON "Comparison" -#define TM2_OPSTR_SPACETODEPTH "SpaceToDepth" -#define TM2_OPSTR_DEPTHTOSPACE "DepthToSpace" -#define TM2_OPSTR_SQUAREDDIFFERENCE "SquaredDifference" -#define TM2_OPSTR_SPARSETODENSE "SparseToDense" -#define TM2_OPSTR_CEIL "Ceil" -#define TM2_OPSTR_ROUND "Round" -#define TM2_OPSTR_ZEROSLIKE "ZerosLike" -#define TM2_OPSTR_CLIP "Clip" -#define TM2_OPSTR_UNSQUEEZE "Unsqueeze" -#define TM2_OPSTR_REDUCEL2 "ReduceL2" -#define TM2_OPSTR_MEAN "Mean" -#define TM2_OPSTR_MATMUL "MatMul" -#define TM2_OPSTR_MISH "Mish" -#define TM2_OPSTR_L2NORMALIZATION "L2Normalization" -#define TM2_OPSTR_RELU1 "ReLU1" -#define TM2_OPSTR_SHAPE "Shape" -#define TM2_OPSTR_LOGSOFTMAX "LogSoftmax" -#define TM2_OPSTR_SCATTER "Scatter" -#define TM2_OPSTR_TILE "Tile" -#define TM2_OPSTR_L2POOL "L2Pool" -#define TM2_OPSTR_SOFTPLUS "Softplus" -#define TM2_OPSTR_RECIPROCAL "Reciprocal" -#define TM2_OPSTR_SPATIALTRANSFORMER "SpatialTransformer" -#define TM2_OPSTR_EXPAND "Expand" +#define TM2_OPSTR_ACCURACY "Accuracy" +#define TM2_OPSTR_BATCHNORMALIZATION "BatchNormalization" +#define TM2_OPSTR_BILINEARRESIZE "Resize" +#define TM2_OPSTR_CONCAT "Concat" +#define TM2_OPSTR_CONST "Const" +#define TM2_OPSTR_CONVOLUTION "Convolution" +#define TM2_OPSTR_DECONVOLUTION "Deconvolution" +#define TM2_OPSTR_DETECTIONOUTPUT "DetectionOutput" +#define TM2_OPSTR_DROPOUT "Dropout" +#define TM2_OPSTR_ELTWISE "Eltwise" +#define TM2_OPSTR_FLATTEN "Flatten" +#define TM2_OPSTR_FULLYCONNECTED "FullyConnected" +#define TM2_OPSTR_INPUTOP "InputOp" +#define TM2_OPSTR_LRN "LRN" +#define TM2_OPSTR_NORMALIZE "Normalize" +#define TM2_OPSTR_PERMUTE "Permute" +#define TM2_OPSTR_POOLING "Pooling" +#define TM2_OPSTR_PRELU "PReLU" +#define TM2_OPSTR_PRIORBOX "PriorBox" +#define TM2_OPSTR_REGION "Region" +#define TM2_OPSTR_RELU "ReLu" +#define TM2_OPSTR_RELU6 "ReLu6" +#define TM2_OPSTR_REORG "Reorg" +#define TM2_OPSTR_RESHAPE "Reshape" +#define TM2_OPSTR_ROIPOOLING "ROIPooling" +#define TM2_OPSTR_RPN "RPN" +#define TM2_OPSTR_SCALE "Scale" +#define TM2_OPSTR_SLICE "Slice" +#define TM2_OPSTR_SOFTMAX "Softmax" +#define TM2_OPSTR_SPLIT "Split" +#define TM2_OPSTR_DETECTIONPOSTPROCESS "DetectionPostProcess" +#define TM2_OPSTR_GEMM "Gemm" +#define TM2_OPSTR_GENERIC "Generic" +#define TM2_OPSTR_LOGISTIC "Logistic" +#define TM2_OPSTR_LSTM "LSTM" +#define TM2_OPSTR_RNN "RNN" +#define TM2_OPSTR_TANH "Tanh" +#define TM2_OPSTR_SIGMOID "Sigmoid" +#define TM2_OPSTR_SQUEEZE "Squeeze" +#define TM2_OPSTR_PAD "Pad" +#define TM2_OPSTR_STRIDEDSLICE "StridedSlice" +#define TM2_OPSTR_REDUCTION "Reduction" +#define TM2_OPSTR_ARGMAX "ArgMax" +#define TM2_OPSTR_ARGMIN "ArgMin" +#define TM2_OPSTR_TOPKV2 "TopKV2" +#define TM2_OPSTR_MAX "Maximum" +#define TM2_OPSTR_MIN "Minimum" +#define TM2_OPSTR_ADDN "Addn" +#define TM2_OPSTR_SWAPAXIS "SwapAxis" +#define TM2_OPSTR_GRU "GRU" +#define TM2_OPSTR_FUSEDBNSCALERELU "Fused.BNScaleReLu" +#define TM2_OPSTR_UPSAMPLE "Upsample" +#define TM2_OPSTR_SHUFFLECHANNEL "ShuffleChannel" +#define TM2_OPSTR_RESIZE "Resize" +#define TM2_OPSTR_SPACETOBATCHND "SpaceToBatchND" +#define TM2_OPSTR_BATCHTOSPACEND "BatchToSpaceND" +#define TM2_OPSTR_CROP "Crop" +#define TM2_OPSTR_PSROIPOOLING "Psroipooling" +#define TM2_OPSTR_ROIALIGN "Roialign" +#define TM2_OPSTR_EXPANDDIMS "Expanddims" +#define TM2_OPSTR_UNARY "Unary" +#define TM2_OPSTR_BIAS "Bias" +#define TM2_OPSTR_NOOP "Noop" +#define TM2_OPSTR_THRESHOLD "Threshold" +#define TM2_OPSTR_HARDSIGMOID "Hardsigmoid" +#define TM2_OPSTR_EMBED "Embedding" +#define TM2_OPSTR_INSTANCENORM "InstanceNorm" +#define TM2_OPSTR_MVN "MVN" +#define TM2_OPSTR_ABSVAL "Absval" +#define TM2_OPSTR_CAST "Cast" +#define TM2_OPSTR_HARDSWISH "HardSwish" +#define TM2_OPSTR_INTERP "Interp" +#define TM2_OPSTR_SELU "Selu" +#define TM2_OPSTR_ELU "Elu" +#define TM2_OPSTR_BROADMUL "BroadMul" +#define TM2_OPSTR_LOGICAL "Logical" +#define TM2_OPSTR_GATHER "Gather" +#define TM2_OPSTR_TRANSPOSE "Transpose" +#define TM2_OPSTR_REVERSE "Reverse" +#define TM2_OPSTR_COMPARISON "Comparison" +#define TM2_OPSTR_SPACETODEPTH "SpaceToDepth" +#define TM2_OPSTR_DEPTHTOSPACE "DepthToSpace" +#define TM2_OPSTR_SQUAREDDIFFERENCE "SquaredDifference" +#define TM2_OPSTR_SPARSETODENSE "SparseToDense" +#define TM2_OPSTR_CEIL "Ceil" +#define TM2_OPSTR_ROUND "Round" +#define TM2_OPSTR_ZEROSLIKE "ZerosLike" +#define TM2_OPSTR_CLIP "Clip" +#define TM2_OPSTR_UNSQUEEZE "Unsqueeze" +#define TM2_OPSTR_REDUCEL2 "ReduceL2" +#define TM2_OPSTR_MEAN "Mean" +#define TM2_OPSTR_MATMUL "MatMul" +#define TM2_OPSTR_MISH "Mish" +#define TM2_OPSTR_L2NORMALIZATION "L2Normalization" +#define TM2_OPSTR_RELU1 "ReLU1" +#define TM2_OPSTR_SHAPE "Shape" +#define TM2_OPSTR_LOGSOFTMAX "LogSoftmax" +#define TM2_OPSTR_SCATTER "Scatter" +#define TM2_OPSTR_TILE "Tile" +#define TM2_OPSTR_L2POOL "L2Pool" +#define TM2_OPSTR_SOFTPLUS "Softplus" +#define TM2_OPSTR_RECIPROCAL "Reciprocal" +#define TM2_OPSTR_SPATIALTRANSFORMER "SpatialTransformer" +#define TM2_OPSTR_EXPAND "Expand" /* Operator types */ -#define TM2_OPTYPE_ACCURACY 0 /* No Param */ -#define TM2_OPTYPE_BATCHNORMALIZATION 1 /* TM2_BatchNormParam */ -#define TM2_OPTYPE_BILINEARRESIZE 2 /* TM2_ResizeParam */ -#define TM2_OPTYPE_CONCAT 3 /* TM2_ConcatParam */ -#define TM2_OPTYPE_CONST 4 /* No Param */ -#define TM2_OPTYPE_CONVOLUTION 5 /* TM2_ConvParam */ -#define TM2_OPTYPE_DECONVOLUTION 6 /* TM2_DeconvParam */ -#define TM2_OPTYPE_DETECTIONOUTPUT 7 /* TM2_DetectionOutputParam */ -#define TM2_OPTYPE_DROPOUT 8 /* No Param */ -#define TM2_OPTYPE_ELTWISE 9 /* TM2_EltwiseParam */ -#define TM2_OPTYPE_FLATTEN 10 /* TM2_FlattenParam */ -#define TM2_OPTYPE_FULLYCONNECTED 11 /* TM2_FCParam */ -#define TM2_OPTYPE_INPUTOP 12 /* No Param */ -#define TM2_OPTYPE_LRN 13 /* TM2_LRNParam */ -#define TM2_OPTYPE_NORMALIZE 14 /* TM2_NormalizeParam */ -#define TM2_OPTYPE_PERMUTE 15 /* TM2_PermuteParam */ -#define TM2_OPTYPE_POOLING 16 /* TM2_PoolParam */ -#define TM2_OPTYPE_PRELU 17 /* No Param */ -#define TM2_OPTYPE_PRIORBOX 18 /* TM2_PriorBoxParam */ -#define TM2_OPTYPE_REGION 19 /* TM2_RegionParam */ -#define TM2_OPTYPE_RELU 20 /* TM2_ReLuParam */ -#define TM2_OPTYPE_RELU6 21 /* No Param */ -#define TM2_OPTYPE_REORG 22 /* TM2_ReorgParam */ -#define TM2_OPTYPE_RESHAPE 23 /* TM2_ReshapeParam */ -#define TM2_OPTYPE_ROIPOOLING 24 /* TM2_ROIPoolingParam */ -#define TM2_OPTYPE_RPN 25 /* TM2_RPNParam */ -#define TM2_OPTYPE_SCALE 26 /* TM2_ScaleParam */ -#define TM2_OPTYPE_SLICE 27 /* TM2_SliceParam */ -#define TM2_OPTYPE_SOFTMAX 28 /* TM2_SoftmaxParam */ -#define TM2_OPTYPE_SPLIT 29 /* No Param */ -#define TM2_OPTYPE_DETECTIONPOSTPROCESS 30 /* TM2_DetectionPostProcessParam */ -#define TM2_OPTYPE_GEMM 31 /* TM2_GemmParam */ -#define TM2_OPTYPE_GENERIC 32 /* TM2_GenericParam */ -#define TM2_OPTYPE_LOGISTIC 33 /* No Param */ -#define TM2_OPTYPE_LSTM 34 /* TM2_LstmParam */ -#define TM2_OPTYPE_RNN 35 /* TM2_RnnParam */ -#define TM2_OPTYPE_TANH 36 /* No Param */ -#define TM2_OPTYPE_SIGMOID 37 /* No Param */ -#define TM2_OPTYPE_SQUEEZE 38 /* TM2_SqueezeParam */ -#define TM2_OPTYPE_FUSEDBNSCALERELU 39 /* No Param */ -#define TM2_OPTYPE_PAD 40 /* TM2_PadParam */ -#define TM2_OPTYPE_STRIDEDSLICE 41 /* TM2_StrideSliceParam */ -#define TM2_OPTYPE_ARGMAX 42 /* TM2_ArgmaxParam */ -#define TM2_OPTYPE_ARGMIN 43 /* TM2_ArgminParam */ -#define TM2_OPTYPE_TOPKV2 44 /* TM2_TopkV2Param */ -#define TM2_OPTYPE_REDUCTION 45 /* TM2_ReductionParam */ -#define TM2_OPTYPE_MAX 46 /* No Param */ -#define TM2_OPTYPE_MIN 47 /* No Param */ -#define TM2_OPTYPE_GRU 48 /* TM2_GruParam */ -#define TM2_OPTYPE_ADDN 49 /* TM2_AddNParam */ -#define TM2_OPTYPE_SWAPAXIS 50 /* TM2_SwapAixsParam */ -#define TM2_OPTYPE_UPSAMPLE 51 /* TM2_UpsampleParam */ -#define TM2_OPTYPE_SPACETOBATCHND 52 -#define TM2_OPTYPE_BATCHTOSPACEND 53 -#define TM2_OPTYPE_RESIZE 54 -#define TM2_OPTYPE_SHUFFLECHANNEL 55 /* TM2_ShuffleChannelPara */ -#define TM2_OPTYPE_CROP 56 /* TM2_CropParam */ -#define TM2_OPTYPE_ROIALIGN 57 -#define TM2_OPTYPE_PSROIPOOLING 58 -#define TM2_OPTYPE_UNARY 59 -#define TM2_OPTYPE_EXPANDDIMS 60 -#define TM2_OPTYPE_BIAS 61 -#define TM2_OPTYPE_NOOP 62 -#define TM2_OPTYPE_THRESHOLD 63 -#define TM2_OPTYPE_HARDSIGMOID 64 -#define TM2_OPTYPE_EMBED 65 -#define TM2_OPTYPE_INSTANCENORM 66 -#define TM2_OPTYPE_MVN 67 -#define TM2_OPTYPE_ABSVAL 68 -#define TM2_OPTYPE_CAST 69 -#define TM2_OPTYPE_HARDSWISH 70 -#define TM2_OPTYPE_INTERP 71 -#define TM2_OPTYPE_SELU 72 -#define TM2_OPTYPE_ELU 73 -#define TM2_OPTYPE_BROADMUL 74 -#define TM2_OPTYPE_LOGICAL 75 -#define TM2_OPTYPE_GATHER 76 -#define TM2_OPTYPE_TRANSPOSE 77 -#define TM2_OPTYPE_COMPARISON 78 -#define TM2_OPTYPE_SPACETODEPTH 79 -#define TM2_OPTYPE_DEPTHTOSPACE 80 -#define TM2_OPTYPE_REVERSE 81 -#define TM2_OPTYPE_SPARSETODENSE 82 -#define TM2_OPTYPE_CEIL 83 -#define TM2_OPTYPE_SQUAREDDIFFERENCE 84 -#define TM2_OPTYPE_ROUND 85 -#define TM2_OPTYPE_ZEROSLIKE 86 -#define TM2_OPTYPE_CLIP 87 -#define TM2_OPTYPE_UNSQUEEZE 88 -#define TM2_OPTYPE_REDUCEL2 89 -#define TM2_OPTYPE_MEAN 90 -#define TM2_OPTYPE_MATMUL 91 -#define TM2_OPTYPE_EXPAND 92 -#define TM2_OPTYPE_SCATTER 93 -#define TM2_OPTYPE_SHAPE 94 -#define TM2_OPTYPE_WHERE 95 -#define TM2_OPTYPE_TILE 96 -#define TM2_OPTYPE_MISH 97 -#define TM2_OPTYPE_L2POOL 98 -#define TM2_OPTYPE_LOGSOFTMAX 99 -#define TM2_OPTYPE_RELU1 100 -#define TM2_OPTYPE_L2NORMALIZATION 101 -#define TM2_OPTYPE_SOFTPLUS 102 -#define TM2_OPTYPE_RECIPROCAL 103 -#define TM2_OPTYPE_SPATIALTRANSFORMER 105 -#define TM2_OPTYPE_NUM 106 +#define TM2_OPTYPE_ACCURACY 0 /* No Param */ +#define TM2_OPTYPE_BATCHNORMALIZATION 1 /* TM2_BatchNormParam */ +#define TM2_OPTYPE_BILINEARRESIZE 2 /* TM2_ResizeParam */ +#define TM2_OPTYPE_CONCAT 3 /* TM2_ConcatParam */ +#define TM2_OPTYPE_CONST 4 /* No Param */ +#define TM2_OPTYPE_CONVOLUTION 5 /* TM2_ConvParam */ +#define TM2_OPTYPE_DECONVOLUTION 6 /* TM2_DeconvParam */ +#define TM2_OPTYPE_DETECTIONOUTPUT 7 /* TM2_DetectionOutputParam */ +#define TM2_OPTYPE_DROPOUT 8 /* No Param */ +#define TM2_OPTYPE_ELTWISE 9 /* TM2_EltwiseParam */ +#define TM2_OPTYPE_FLATTEN 10 /* TM2_FlattenParam */ +#define TM2_OPTYPE_FULLYCONNECTED 11 /* TM2_FCParam */ +#define TM2_OPTYPE_INPUTOP 12 /* No Param */ +#define TM2_OPTYPE_LRN 13 /* TM2_LRNParam */ +#define TM2_OPTYPE_NORMALIZE 14 /* TM2_NormalizeParam */ +#define TM2_OPTYPE_PERMUTE 15 /* TM2_PermuteParam */ +#define TM2_OPTYPE_POOLING 16 /* TM2_PoolParam */ +#define TM2_OPTYPE_PRELU 17 /* No Param */ +#define TM2_OPTYPE_PRIORBOX 18 /* TM2_PriorBoxParam */ +#define TM2_OPTYPE_REGION 19 /* TM2_RegionParam */ +#define TM2_OPTYPE_RELU 20 /* TM2_ReLuParam */ +#define TM2_OPTYPE_RELU6 21 /* No Param */ +#define TM2_OPTYPE_REORG 22 /* TM2_ReorgParam */ +#define TM2_OPTYPE_RESHAPE 23 /* TM2_ReshapeParam */ +#define TM2_OPTYPE_ROIPOOLING 24 /* TM2_ROIPoolingParam */ +#define TM2_OPTYPE_RPN 25 /* TM2_RPNParam */ +#define TM2_OPTYPE_SCALE 26 /* TM2_ScaleParam */ +#define TM2_OPTYPE_SLICE 27 /* TM2_SliceParam */ +#define TM2_OPTYPE_SOFTMAX 28 /* TM2_SoftmaxParam */ +#define TM2_OPTYPE_SPLIT 29 /* No Param */ +#define TM2_OPTYPE_DETECTIONPOSTPROCESS 30 /* TM2_DetectionPostProcessParam */ +#define TM2_OPTYPE_GEMM 31 /* TM2_GemmParam */ +#define TM2_OPTYPE_GENERIC 32 /* TM2_GenericParam */ +#define TM2_OPTYPE_LOGISTIC 33 /* No Param */ +#define TM2_OPTYPE_LSTM 34 /* TM2_LstmParam */ +#define TM2_OPTYPE_RNN 35 /* TM2_RnnParam */ +#define TM2_OPTYPE_TANH 36 /* No Param */ +#define TM2_OPTYPE_SIGMOID 37 /* No Param */ +#define TM2_OPTYPE_SQUEEZE 38 /* TM2_SqueezeParam */ +#define TM2_OPTYPE_FUSEDBNSCALERELU 39 /* No Param */ +#define TM2_OPTYPE_PAD 40 /* TM2_PadParam */ +#define TM2_OPTYPE_STRIDEDSLICE 41 /* TM2_StrideSliceParam */ +#define TM2_OPTYPE_ARGMAX 42 /* TM2_ArgmaxParam */ +#define TM2_OPTYPE_ARGMIN 43 /* TM2_ArgminParam */ +#define TM2_OPTYPE_TOPKV2 44 /* TM2_TopkV2Param */ +#define TM2_OPTYPE_REDUCTION 45 /* TM2_ReductionParam */ +#define TM2_OPTYPE_MAX 46 /* No Param */ +#define TM2_OPTYPE_MIN 47 /* No Param */ +#define TM2_OPTYPE_GRU 48 /* TM2_GruParam */ +#define TM2_OPTYPE_ADDN 49 /* TM2_AddNParam */ +#define TM2_OPTYPE_SWAPAXIS 50 /* TM2_SwapAixsParam */ +#define TM2_OPTYPE_UPSAMPLE 51 /* TM2_UpsampleParam */ +#define TM2_OPTYPE_SPACETOBATCHND 52 +#define TM2_OPTYPE_BATCHTOSPACEND 53 +#define TM2_OPTYPE_RESIZE 54 +#define TM2_OPTYPE_SHUFFLECHANNEL 55 /* TM2_ShuffleChannelPara */ +#define TM2_OPTYPE_CROP 56 /* TM2_CropParam */ +#define TM2_OPTYPE_ROIALIGN 57 +#define TM2_OPTYPE_PSROIPOOLING 58 +#define TM2_OPTYPE_UNARY 59 +#define TM2_OPTYPE_EXPANDDIMS 60 +#define TM2_OPTYPE_BIAS 61 +#define TM2_OPTYPE_NOOP 62 +#define TM2_OPTYPE_THRESHOLD 63 +#define TM2_OPTYPE_HARDSIGMOID 64 +#define TM2_OPTYPE_EMBED 65 +#define TM2_OPTYPE_INSTANCENORM 66 +#define TM2_OPTYPE_MVN 67 +#define TM2_OPTYPE_ABSVAL 68 +#define TM2_OPTYPE_CAST 69 +#define TM2_OPTYPE_HARDSWISH 70 +#define TM2_OPTYPE_INTERP 71 +#define TM2_OPTYPE_SELU 72 +#define TM2_OPTYPE_ELU 73 +#define TM2_OPTYPE_BROADMUL 74 +#define TM2_OPTYPE_LOGICAL 75 +#define TM2_OPTYPE_GATHER 76 +#define TM2_OPTYPE_TRANSPOSE 77 +#define TM2_OPTYPE_COMPARISON 78 +#define TM2_OPTYPE_SPACETODEPTH 79 +#define TM2_OPTYPE_DEPTHTOSPACE 80 +#define TM2_OPTYPE_REVERSE 81 +#define TM2_OPTYPE_SPARSETODENSE 82 +#define TM2_OPTYPE_CEIL 83 +#define TM2_OPTYPE_SQUAREDDIFFERENCE 84 +#define TM2_OPTYPE_ROUND 85 +#define TM2_OPTYPE_ZEROSLIKE 86 +#define TM2_OPTYPE_CLIP 87 +#define TM2_OPTYPE_UNSQUEEZE 88 +#define TM2_OPTYPE_REDUCEL2 89 +#define TM2_OPTYPE_MEAN 90 +#define TM2_OPTYPE_MATMUL 91 +#define TM2_OPTYPE_EXPAND 92 +#define TM2_OPTYPE_SCATTER 93 +#define TM2_OPTYPE_SHAPE 94 +#define TM2_OPTYPE_WHERE 95 +#define TM2_OPTYPE_TILE 96 +#define TM2_OPTYPE_MISH 97 +#define TM2_OPTYPE_L2POOL 98 +#define TM2_OPTYPE_LOGSOFTMAX 99 +#define TM2_OPTYPE_RELU1 100 +#define TM2_OPTYPE_L2NORMALIZATION 101 +#define TM2_OPTYPE_SOFTPLUS 102 +#define TM2_OPTYPE_RECIPROCAL 103 +#define TM2_OPTYPE_SPATIALTRANSFORMER 105 +#define TM2_OPTYPE_NUM 106 /* --------------------- -------- TM objects -------------------------------- */ typedef struct { - uint16_t ver_main; /* main version of Tengine model file format */ - uint16_t ver_sub; /* sub version of Tengine model file format */ - uint16_t ver_compile; /* compile version of Tengine model file format */ + uint16_t ver_main; /* main version of Tengine model file format */ + uint16_t ver_sub; /* sub version of Tengine model file format */ + uint16_t ver_compile; /* compile version of Tengine model file format */ tm_uoffset_t offset_root; /* offset of root table (TM2_Model) */ } TM2_Header; /* Root table of Tengine model */ typedef struct { - int32_t orig_format; /* format of original model */ - int32_t sub_format; /* sub format for DLA model */ + int32_t orig_format; /* format of original model */ + int32_t sub_format; /* sub format for DLA model */ tm_uoffset_t offset_vo_subgraphs; /* offset of TM2_Vector_offsets */ - tm_uoffset_t offset_s_mname; /* offset of string */ + tm_uoffset_t offset_s_mname; /* offset of string */ } TM2_Model; /* Only 1 subgraph is supported currently */ typedef struct { - uint32_t subgraph_id; /* subgraph id */ - int32_t graph_layout; /* actual data layout */ - int32_t model_layout; /* data layout of original model */ - tm_uoffset_t offset_vi_input_indices; /* offset of TM2_Vector_indices */ + uint32_t subgraph_id; /* subgraph id */ + int32_t graph_layout; /* actual data layout */ + int32_t model_layout; /* data layout of original model */ + tm_uoffset_t offset_vi_input_indices; /* offset of TM2_Vector_indices */ tm_uoffset_t offset_vi_output_indices; /* offset of TM2_Vector_indices */ - tm_uoffset_t offset_vo_seq_nodes; /* offset of TM2_Vector_offsets */ - tm_uoffset_t offset_vo_tensors; /* offset of TM2_Vector_offsets */ - tm_uoffset_t offset_vo_buffers; /* offset of TM2_Vector_offsets */ - tm_uoffset_t offset_s_sname; /* offset of string */ - tm_uoffset_t offset_vo_sub_info; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_vo_seq_nodes; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_vo_tensors; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_vo_buffers; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_s_sname; /* offset of string */ + tm_uoffset_t offset_vo_sub_info; /* offset of TM2_Vector_offsets */ } TM2_Subgraph; typedef struct { - uint32_t subgraph_id; /* sub graph idx */ - uint32_t input_wait_count; /* input wait count */ - int32_t data_type; /* FP32 FP16 U8 INT8 */ + uint32_t subgraph_id; /* sub graph idx */ + uint32_t input_wait_count; /* input wait count */ + int32_t data_type; /* FP32 FP16 U8 INT8 */ tm_uoffset_t offset_vi_node_list; /* offset of TM2_Vector_indices */ tm_uoffset_t offset_vi_input_tensor; /* offset of TM2_Vector_indices */ tm_uoffset_t offset_vi_output_tensor; /* offset of TM2_Vector_indices */ @@ -306,25 +306,25 @@ typedef struct typedef struct { tm_uoffset_t offset_s_attrname; /* offset of string */ - tm_uoffset_t offset_s_attrval; /* offset of string */ + tm_uoffset_t offset_s_attrval; /* offset of string */ int32_t attr_type; } TM2_Attr; typedef struct { - uint32_t node_id; /* node id */ - tm_uoffset_t offset_vi_input_tensors; /* offset of TM2_Vector_indices */ + uint32_t node_id; /* node id */ + tm_uoffset_t offset_vi_input_tensors; /* offset of TM2_Vector_indices */ tm_uoffset_t offset_vi_output_tensors; /* offset of TM2_Vector_indices */ - tm_uoffset_t offset_t_operator; /* offset of table */ - tm_uoffset_t offset_s_nname; /* offset of string */ - tm_uoffset_t offset_vo_attrs; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_t_operator; /* offset of table */ + tm_uoffset_t offset_s_nname; /* offset of string */ + tm_uoffset_t offset_vo_attrs; /* offset of TM2_Vector_offsets */ tm_bool_t dynamic_shape; } TM2_Node; typedef struct { - uint32_t op_ver; /* version of operator */ - uint32_t operator_type; /* operator type */ + uint32_t op_ver; /* version of operator */ + uint32_t operator_type; /* operator type */ tm_uoffset_t offset_t_param; /* offset of table */ } TM2_Operator; @@ -339,8 +339,8 @@ typedef struct { uint32_t tensor_id; uint32_t buffer_id; - tm_uoffset_t offset_vd_dims; /* offset of TM2_Vector_dims */ - tm_uoffset_t offset_s_tname; /* offset of string */ + tm_uoffset_t offset_vd_dims; /* offset of TM2_Vector_dims */ + tm_uoffset_t offset_s_tname; /* offset of string */ tm_uoffset_t offect_vo_quantparams; /* offset of TM2_Vector_offsets */ int32_t layout; int32_t type; @@ -349,13 +349,13 @@ typedef struct typedef struct { - tm_size_t size; /* buffer size */ + tm_size_t size; /* buffer size */ tm_uoffset_t offset_data; /* offset of buffer data */ } TM2_Buffer; typedef struct { - tm_size_t size; /* string size */ + tm_size_t size; /* string size */ tm_uoffset_t offset_data; /* offset of string data */ } TM2_String; @@ -387,7 +387,7 @@ typedef struct typedef struct { - tm_size_t v_num; /* number of vector elements */ + tm_size_t v_num; /* number of vector elements */ float data[0][4]; /* x0, y0, x1, y1 */ } TM2_Vector_anchors; @@ -521,9 +521,9 @@ typedef struct typedef struct { - tm_uoffset_t offset_vf_min_size; /* offset of TM2_Vector_floats */ - tm_uoffset_t offset_vf_max_size; /* offset of TM2_Vector_floats */ - tm_uoffset_t offset_vf_variance; /* offset of TM2_Vector_floats */ + tm_uoffset_t offset_vf_min_size; /* offset of TM2_Vector_floats */ + tm_uoffset_t offset_vf_max_size; /* offset of TM2_Vector_floats */ + tm_uoffset_t offset_vf_variance; /* offset of TM2_Vector_floats */ tm_uoffset_t offset_vf_aspect_ratio; /* offset of TM2_Vector_floats */ int32_t flip; int32_t clip; @@ -581,7 +581,7 @@ typedef struct typedef struct { - tm_uoffset_t offset_vf_ratios; /* pointer to TM2_Vector_floats */ + tm_uoffset_t offset_vf_ratios; /* pointer to TM2_Vector_floats */ tm_uoffset_t offset_vf_anchor_scales; /* pointer to TM2_Vector_floats */ int32_t feat_stride; int32_t basesize; @@ -603,8 +603,8 @@ typedef struct { int32_t axis; tm_uoffset_t offset_vi_slice_points; /* offset of TM2_Vector_dims */ - tm_uoffset_t offset_vi_begins; /* offset of TM2_Vector_dims */ - tm_uoffset_t offset_vi_sizes; /* offset of TM2_Vector_dims */ + tm_uoffset_t offset_vi_begins; /* offset of TM2_Vector_dims */ + tm_uoffset_t offset_vi_sizes; /* offset of TM2_Vector_dims */ int32_t iscaffe; int32_t ismxnet; int32_t isonnx; @@ -892,7 +892,7 @@ typedef struct typedef struct { - int32_t resize_type; // 1=nearest 2=bilinear 3=bicubic + int32_t resize_type; // 1=nearest 2=bilinear 3=bicubic float width_scale; float height_scale; int32_t output_width; @@ -990,20 +990,19 @@ typedef struct tm_uoffset_t offset_reps; } TM2_TileParam; -typedef struct +typedef struct { int sampler_type; int transformer_type; int shape_size; tm_uoffset_t offset_ta_shape; -}TM2_SpatialTransformerParam; +} TM2_SpatialTransformerParam; -typedef struct +typedef struct { tm_uoffset_t offset_ex_shape; int dim_num; -}TM2_ExpandParam; - +} TM2_ExpandParam; #ifdef __cplusplus } diff --git a/source/serializer/tmfile/tm2_serializer.c b/source/serializer/tmfile/tm2_serializer.c index b5d6f5953..3fc87d660 100644 --- a/source/serializer/tmfile/tm2_serializer.c +++ b/source/serializer/tmfile/tm2_serializer.c @@ -51,7 +51,6 @@ #include - struct op_loader_entry { int op_type; @@ -83,18 +82,18 @@ static char* strdup_name(char* buf, int size) static inline const TM2_Header* get_tm_file_header(const char* base) { - return ( const TM2_Header* )(base); + return (const TM2_Header*)(base); } static inline const TM2_Model* get_tm_file_model(const char* base, const TM2_Header* header) { - return ( const TM2_Model* )(base + header->offset_root); + return (const TM2_Model*)(base + header->offset_root); } static inline const TM2_Subgraph* get_tm_file_subgraph(const char* base, const TM2_Model* model) { - const TM2_Vector_offsets* v_graphs = ( TM2_Vector_offsets* )(base + model->offset_vo_subgraphs); - const TM2_Subgraph* tm_graph = ( TM2_Subgraph* )(base + v_graphs->offsets[0]); + const TM2_Vector_offsets* v_graphs = (TM2_Vector_offsets*)(base + model->offset_vo_subgraphs); + const TM2_Subgraph* tm_graph = (TM2_Subgraph*)(base + v_graphs->offsets[0]); return tm_graph; } @@ -105,7 +104,7 @@ static struct op_loader_entry* find_op_loader(struct tm2_serializer* s, int op_t for (int i = 0; i < loader_num; i++) { - struct op_loader_entry* e = ( struct op_loader_entry* )get_vector_data(s->loader_list, i); + struct op_loader_entry* e = (struct op_loader_entry*)get_vector_data(s->loader_list, i); if (e->op_type == op_type) return e; @@ -143,7 +142,7 @@ static int unregister_tm2_op_loader(struct tm2_serializer* s, int op_type, int o for (int i = 0; i < n; i++) { - struct op_loader_entry* e = ( struct op_loader_entry* )get_vector_data(s->loader_list, i); + struct op_loader_entry* e = (struct op_loader_entry*)get_vector_data(s->loader_list, i); if (e->op_type == op_type && e->loader == op_loader) { @@ -157,11 +156,11 @@ static int unregister_tm2_op_loader(struct tm2_serializer* s, int op_type, int o static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph, struct tm2_priv* priv) { - char* mem_base = ( char* )priv->base; + char* mem_base = (char*)priv->base; const TM2_Subgraph* tm_graph = priv->subgraph; - const TM2_Vector_offsets* v_tensors = ( TM2_Vector_offsets* )(mem_base + tm_graph->offset_vo_tensors); - const TM2_Vector_offsets* v_buffers = ( TM2_Vector_offsets* )(mem_base + tm_graph->offset_vo_buffers); + const TM2_Vector_offsets* v_tensors = (TM2_Vector_offsets*)(mem_base + tm_graph->offset_vo_tensors); + const TM2_Vector_offsets* v_buffers = (TM2_Vector_offsets*)(mem_base + tm_graph->offset_vo_buffers); graph->graph_layout = tm_graph->graph_layout; graph->model_layout = tm_graph->model_layout; @@ -175,8 +174,8 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph, for (int i = 0; i < v_tensors->v_num; i++) { - const TM2_Tensor* tm_tensor = ( TM2_Tensor* )(mem_base + v_tensors->offsets[i]); - int flag_permute = 0; // flag the tensor has to be permute + const TM2_Tensor* tm_tensor = (TM2_Tensor*)(mem_base + v_tensors->offsets[i]); + int flag_permute = 0; // flag the tensor has to be permute int dims_org[8] = {0}; /* TODO: check type definition */ @@ -193,14 +192,14 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph, if (tm_tensor->offset_s_tname != TM2_NOT_SET) { // TODO: using update the TM2 model - const TM2_String* tm_str = ( TM2_String* )(mem_base + tm_tensor->offset_s_tname); + const TM2_String* tm_str = (TM2_String*)(mem_base + tm_tensor->offset_s_tname); ir_tensor->name = strdup_name(mem_base + tm_str->offset_data, tm_str->size); } /* shape */ if (tm_tensor->offset_vd_dims != TM2_NOT_SET) { - const TM2_Vector_dims* v_dims = ( TM2_Vector_dims* )(mem_base + tm_tensor->offset_vd_dims); + const TM2_Vector_dims* v_dims = (TM2_Vector_dims*)(mem_base + tm_tensor->offset_vd_dims); if (tm_graph->model_layout == TENGINE_LAYOUT_NCHW) { @@ -217,10 +216,10 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph, dims_org[2] = v_dims->dims[2]; dims_org[3] = v_dims->dims[3]; - dims[0] = v_dims->dims[0]; // c_out - dims[1] = v_dims->dims[3]; // c_in - dims[2] = v_dims->dims[1]; // h - dims[3] = v_dims->dims[2]; // w + dims[0] = v_dims->dims[0]; // c_out + dims[1] = v_dims->dims[3]; // c_in + dims[2] = v_dims->dims[1]; // h + dims[3] = v_dims->dims[2]; // w set_ir_tensor_shape(ir_tensor, dims, v_dims->v_num); @@ -236,7 +235,7 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph, /* load const type of tensor, such as the weight or bias for convolution node */ if (ir_tensor->tensor_type == TENSOR_TYPE_CONST) { - const TM2_Buffer* tm_buf = ( TM2_Buffer* )(mem_base + v_buffers->offsets[tm_tensor->buffer_id]); + const TM2_Buffer* tm_buf = (TM2_Buffer*)(mem_base + v_buffers->offsets[tm_tensor->buffer_id]); /* fill temp data buffer to benchmark */ if (tm_buf->offset_data == TM2_NOT_SET) @@ -348,7 +347,7 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph, if (type == TENGINE_DT_UINT8 || type == TENGINE_DT_INT8) { - unsigned char* tensor_data_org = ( unsigned char* )sys_malloc(size * sizeof(unsigned char)); + unsigned char* tensor_data_org = (unsigned char*)sys_malloc(size * sizeof(unsigned char)); unsigned char* original_date = (unsigned char*)ir_tensor->data; for (int n = 0; n < size; n++) @@ -363,9 +362,9 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph, dims[3] = ir_tensor->dims[3]; /* nhwc to nchw */ -// fprintf(stderr, "%s:\n", ir_tensor->name); -// fprintf(stderr, "original %d, %d, %d, %d\n", dims_org[0], dims_org[1], dims_org[2], dims_org[3]); -// fprintf(stderr, "permute %d, %d, %d, %d\n", dims[0], dims[1], dims[2], dims[3]); + // fprintf(stderr, "%s:\n", ir_tensor->name); + // fprintf(stderr, "original %d, %d, %d, %d\n", dims_org[0], dims_org[1], dims_org[2], dims_org[3]); + // fprintf(stderr, "permute %d, %d, %d, %d\n", dims[0], dims[1], dims[2], dims[3]); unsigned char* input = tensor_data_org; unsigned char* output = (unsigned char*)ir_tensor->data; @@ -436,28 +435,27 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph, /* load vector type of tensor */ if (tm_tensor->offect_vo_quantparams != TM2_NOT_SET) { - const TM2_Vector_offsets* v_quantparams = - ( TM2_Vector_offsets* )(mem_base + tm_tensor->offect_vo_quantparams); + const TM2_Vector_offsets* v_quantparams = (TM2_Vector_offsets*)(mem_base + tm_tensor->offect_vo_quantparams); /* currently only support one quant param */ ir_tensor->quant_param_num = v_quantparams->v_num; if (v_quantparams->v_num == 1) { - const TM2_QuantParam* tm_qtparam = ( TM2_QuantParam* )(mem_base + v_quantparams->offsets[0]); + const TM2_QuantParam* tm_qtparam = (TM2_QuantParam*)(mem_base + v_quantparams->offsets[0]); ir_tensor->scale = tm_qtparam->scale; ir_tensor->zero_point = tm_qtparam->zero_point; -// printf("name %s, scale %f, zero %d\n", ir_tensor->name, ir_tensor->scale, ir_tensor->zero_point); + // printf("name %s, scale %f, zero %d\n", ir_tensor->name, ir_tensor->scale, ir_tensor->zero_point); } else if (v_quantparams->v_num > 1) { // to do : need to be updated - ir_tensor->scale_list = ( float* )sys_malloc(sizeof(float) * v_quantparams->v_num); - ir_tensor->zp_list = ( int* )sys_malloc(sizeof(int) * v_quantparams->v_num); + ir_tensor->scale_list = (float*)sys_malloc(sizeof(float) * v_quantparams->v_num); + ir_tensor->zp_list = (int*)sys_malloc(sizeof(int) * v_quantparams->v_num); for (int j = 0; j < v_quantparams->v_num; j++) { - const TM2_QuantParam* tm_qtparam = ( TM2_QuantParam* )(mem_base + v_quantparams->offsets[j]); + const TM2_QuantParam* tm_qtparam = (TM2_QuantParam*)(mem_base + v_quantparams->offsets[j]); ir_tensor->scale_list[j] = tm_qtparam->scale; ir_tensor->zp_list[j] = tm_qtparam->zero_point; } @@ -469,16 +467,16 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph, static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph, struct tm2_priv* priv) { - char* mem_base = ( char* )priv->base; + char* mem_base = (char*)priv->base; const TM2_Subgraph* tm_graph = priv->subgraph; - const TM2_Vector_offsets* v_nodes = ( TM2_Vector_offsets* )(mem_base + tm_graph->offset_vo_seq_nodes); + const TM2_Vector_offsets* v_nodes = (TM2_Vector_offsets*)(mem_base + tm_graph->offset_vo_seq_nodes); unsigned int i; for (i = 0; i < v_nodes->v_num; i++) { - const TM2_Node* tm_node = ( TM2_Node* )(mem_base + v_nodes->offsets[i]); - const TM2_Operator* tm_operator = ( TM2_Operator* )(mem_base + tm_node->offset_t_operator); + const TM2_Node* tm_node = (TM2_Node*)(mem_base + v_nodes->offsets[i]); + const TM2_Operator* tm_operator = (TM2_Operator*)(mem_base + tm_node->offset_t_operator); int op_type = tm_operator->operator_type; int op_version = tm_operator->op_ver; @@ -509,7 +507,7 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph if (tm_node->offset_s_nname != TM2_NOT_SET) { - const TM2_String* str = ( TM2_String* )(mem_base + tm_node->offset_s_nname); + const TM2_String* str = (TM2_String*)(mem_base + tm_node->offset_s_nname); // TODO: update with new tm2 ir_node->name = strdup_name(mem_base + str->offset_data, str->size); } @@ -517,8 +515,7 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph /* node inputs */ if (tm_node->offset_vi_input_tensors != TM2_NOT_SET) { - const TM2_Vector_indices* v_input_tensors = - ( TM2_Vector_indices* )(mem_base + tm_node->offset_vi_input_tensors); + const TM2_Vector_indices* v_input_tensors = (TM2_Vector_indices*)(mem_base + tm_node->offset_vi_input_tensors); for (int j = 0; j < v_input_tensors->v_num; j++) { @@ -542,8 +539,7 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph break; } - const TM2_Vector_indices* v_output_tensors = - ( TM2_Vector_indices* )(mem_base + tm_node->offset_vi_output_tensors); + const TM2_Vector_indices* v_output_tensors = (TM2_Vector_indices*)(mem_base + tm_node->offset_vi_output_tensors); for (int j = 0; j < v_output_tensors->v_num; j++) { @@ -565,7 +561,7 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph { if (op_type == TM2_OPTYPE_SOFTMAX) { - TM2_SoftmaxParam* tm_param = ( TM2_SoftmaxParam* )(mem_base + tm_operator->offset_t_param); + TM2_SoftmaxParam* tm_param = (TM2_SoftmaxParam*)(mem_base + tm_operator->offset_t_param); if (tm_param->axis == 3) tm_param->axis = 1; @@ -587,14 +583,14 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph if (op_type == TM2_OPTYPE_REDUCTION) { - TM2_ReductionParam* tm_param = ( TM2_ReductionParam* )(mem_base + tm_operator->offset_t_param); + TM2_ReductionParam* tm_param = (TM2_ReductionParam*)(mem_base + tm_operator->offset_t_param); if (tm_param->dim_0 == 1 && tm_param->dim_1 == 2) { tm_param->dim_0 = 2; tm_param->dim_1 = 3; } - else if(tm_param->dim_0 == -1) + else if (tm_param->dim_0 == -1) { tm_param->dim_0 = 4; } @@ -606,35 +602,35 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph if (op_type == TM2_OPTYPE_PAD) { - TM2_PadParam* tm_param = ( TM2_PadParam* )(mem_base + tm_operator->offset_t_param); + TM2_PadParam* tm_param = (TM2_PadParam*)(mem_base + tm_operator->offset_t_param); int pads[8] = {0}; - pads[0] = tm_param->pad_n_0; // n + pads[0] = tm_param->pad_n_0; // n pads[1] = tm_param->pad_n_1; - pads[2] = tm_param->pad_c_0; // h + pads[2] = tm_param->pad_c_0; // h pads[3] = tm_param->pad_c_1; - pads[4] = tm_param->pad_h_0; // w + pads[4] = tm_param->pad_h_0; // w pads[5] = tm_param->pad_h_1; - pads[6] = tm_param->pad_w_0; // c + pads[6] = tm_param->pad_w_0; // c pads[7] = tm_param->pad_w_1; /* nhwc to nchw */ - tm_param->pad_c_0 = pads[6]; // c + tm_param->pad_c_0 = pads[6]; // c tm_param->pad_c_1 = pads[7]; - tm_param->pad_h_0 = pads[2]; // h + tm_param->pad_h_0 = pads[2]; // h tm_param->pad_h_1 = pads[3]; - tm_param->pad_w_0 = pads[4]; // w + tm_param->pad_w_0 = pads[4]; // w tm_param->pad_w_1 = pads[5]; } if (op_type == TM2_OPTYPE_STRIDEDSLICE) { - TM2_StridedSliceParam* tm_param = ( TM2_StridedSliceParam* )(mem_base + tm_operator->offset_t_param); + TM2_StridedSliceParam* tm_param = (TM2_StridedSliceParam*)(mem_base + tm_operator->offset_t_param); int begin[4] = {0}; int end[4] = {0}; @@ -673,8 +669,8 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph if (op_type == TM2_OPTYPE_RESHAPE) { - TM2_ReshapeParam* tm_param = ( TM2_ReshapeParam* )(mem_base + tm_operator->offset_t_param); - TM2_Vector_dims* v_reshape = ( TM2_Vector_dims* )(mem_base + tm_param->offset_re_shape); + TM2_ReshapeParam* tm_param = (TM2_ReshapeParam*)(mem_base + tm_operator->offset_t_param); + TM2_Vector_dims* v_reshape = (TM2_Vector_dims*)(mem_base + tm_param->offset_re_shape); if (tm_param->offset_re_shape != TM2_NOT_SET) { @@ -737,12 +733,12 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph static int set_graph_io_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph, struct tm2_priv* priv) { - char* mem_base = ( char* )priv->base; + char* mem_base = (char*)priv->base; const TM2_Subgraph* tm_graph = priv->subgraph; - const TM2_Vector_indices* v_input_nodes = ( TM2_Vector_indices* )(mem_base + tm_graph->offset_vi_input_indices); - const TM2_Vector_indices* v_output_nodes = ( TM2_Vector_indices* )(mem_base + tm_graph->offset_vi_output_indices); + const TM2_Vector_indices* v_input_nodes = (TM2_Vector_indices*)(mem_base + tm_graph->offset_vi_input_indices); + const TM2_Vector_indices* v_output_nodes = (TM2_Vector_indices*)(mem_base + tm_graph->offset_vi_output_indices); - int16_t* node_idx = ( int16_t* )sys_malloc(sizeof(int16_t) * v_input_nodes->v_num); + int16_t* node_idx = (int16_t*)sys_malloc(sizeof(int16_t) * v_input_nodes->v_num); if (node_idx == NULL) { @@ -758,7 +754,7 @@ static int set_graph_io_nodes(struct tm2_serializer* tm2_s, struct graph* ir_gra sys_free(node_idx); - node_idx = ( int16_t* )sys_malloc(sizeof(int16_t) * v_output_nodes->v_num); + node_idx = (int16_t*)sys_malloc(sizeof(int16_t) * v_output_nodes->v_num); for (unsigned int i = 0; i < v_output_nodes->v_num; i++) { @@ -774,10 +770,10 @@ static int set_graph_io_nodes(struct tm2_serializer* tm2_s, struct graph* ir_gra static int load_graph_sub_info(struct tm2_serializer* s, struct graph* graph, struct tm2_priv* priv) { - char* mem_base = ( char* )priv->base; - const TM2_Vector_offsets* v_graphs = ( TM2_Vector_offsets* )(mem_base + priv->model->offset_vo_subgraphs); + char* mem_base = (char*)priv->base; + const TM2_Vector_offsets* v_graphs = (TM2_Vector_offsets*)(mem_base + priv->model->offset_vo_subgraphs); const TM2_Subgraph* tm_graph = priv->subgraph; - const TM2_Vector_offsets* v_sub_info = ( TM2_Vector_offsets* )(mem_base + tm_graph->offset_vo_sub_info); + const TM2_Vector_offsets* v_sub_info = (TM2_Vector_offsets*)(mem_base + tm_graph->offset_vo_sub_info); if (v_sub_info == TM2_NOT_SET || v_graphs->v_num == 1) { @@ -789,10 +785,10 @@ static int load_graph_sub_info(struct tm2_serializer* s, struct graph* graph, st int sub_graph_num = v_sub_info->v_num; for (int i = 0; i < sub_graph_num; i++) { - struct subgraph* subgraph = ( struct subgraph* )sys_malloc(sizeof(struct subgraph)); + struct subgraph* subgraph = (struct subgraph*)sys_malloc(sizeof(struct subgraph)); init_ir_subgraph(graph, subgraph, i); - TM2_Sub_Info* sub_info = ( TM2_Sub_Info* )(mem_base + v_sub_info->offsets[i]); + TM2_Sub_Info* sub_info = (TM2_Sub_Info*)(mem_base + v_sub_info->offsets[i]); subgraph->index = sub_info->subgraph_id; subgraph->input_wait_count = sub_info->input_wait_count; @@ -802,7 +798,7 @@ static int load_graph_sub_info(struct tm2_serializer* s, struct graph* graph, st // subgraph->nn_dev->name = strdup_name(mem_base + device_name->offset_data, device_name->size); char* name = (char*)(mem_base + device_name->offset_data); - TM2_Vector_indices* v_node_list = ( TM2_Vector_indices* )(mem_base + sub_info->offset_vi_node_list); + TM2_Vector_indices* v_node_list = (TM2_Vector_indices*)(mem_base + sub_info->offset_vi_node_list); subgraph->node_num = v_node_list->v_num; subgraph->node_list = (uint16_t*)sys_malloc(sizeof(uint16_t) * subgraph->node_num); for (int j = 0; j < v_node_list->v_num; j++) @@ -810,7 +806,7 @@ static int load_graph_sub_info(struct tm2_serializer* s, struct graph* graph, st subgraph->node_list[j] = v_node_list->indices[j]; } - TM2_Vector_indices* v_input_tensor = ( TM2_Vector_indices* )(mem_base + sub_info->offset_vi_input_tensor); + TM2_Vector_indices* v_input_tensor = (TM2_Vector_indices*)(mem_base + sub_info->offset_vi_input_tensor); subgraph->input_num = v_input_tensor->v_num; subgraph->input_tensor_list = (uint16_t*)sys_malloc(sizeof(uint16_t) * subgraph->input_num); for (int j = 0; j < v_input_tensor->v_num; j++) @@ -818,7 +814,7 @@ static int load_graph_sub_info(struct tm2_serializer* s, struct graph* graph, st subgraph->input_tensor_list[j] = v_input_tensor->indices[j]; } - TM2_Vector_indices* v_output_tensor = ( TM2_Vector_indices* )(mem_base + sub_info->offset_vi_output_tensor); + TM2_Vector_indices* v_output_tensor = (TM2_Vector_indices*)(mem_base + sub_info->offset_vi_output_tensor); subgraph->output_num = v_output_tensor->v_num; subgraph->output_tensor_list = (uint16_t*)sys_malloc(sizeof(uint16_t) * subgraph->output_num); for (int j = 0; j < v_output_tensor->v_num; j++) @@ -838,7 +834,7 @@ static int load_graph_sub_info(struct tm2_serializer* s, struct graph* graph, st static int load_graph(struct serializer* s, struct graph* graph, struct tm2_priv* priv) { - struct tm2_serializer* tm2_s = ( struct tm2_serializer* )s; + struct tm2_serializer* tm2_s = (struct tm2_serializer*)s; /* version check */ if (priv->header->ver_main != TM2_FILE_VER_MAIN) @@ -861,7 +857,7 @@ static int load_graph(struct serializer* s, struct graph* graph, struct tm2_priv return 0; - error: +error: unload_graph(s, graph, priv, NULL); return -1; } @@ -891,10 +887,10 @@ static int load_model(struct serializer* s, struct graph* graph, const char* fna // return -1; // } - void* mem_base = ( void* )sys_malloc(file_len); + void* mem_base = (void*)sys_malloc(file_len); int ret = read(fd, mem_base, file_len); - struct tm2_priv* priv = ( struct tm2_priv* )sys_malloc(sizeof(struct tm2_priv)); + struct tm2_priv* priv = (struct tm2_priv*)sys_malloc(sizeof(struct tm2_priv)); if (priv == NULL) { @@ -918,7 +914,7 @@ static int load_model(struct serializer* s, struct graph* graph, const char* fna static int load_mem(struct serializer* s, struct graph* graph, const void* addr, int size, va_list ap) { - struct tm2_priv* priv = ( struct tm2_priv* )sys_malloc(sizeof(struct tm2_priv)); + struct tm2_priv* priv = (struct tm2_priv*)sys_malloc(sizeof(struct tm2_priv)); if (priv == NULL) { @@ -941,7 +937,7 @@ static int load_mem(struct serializer* s, struct graph* graph, const void* addr, static int unload_graph(struct serializer* s, struct graph* graph, void* s_priv, void* dev_priv) { - struct tm2_priv* priv = ( struct tm2_priv* )s_priv; + struct tm2_priv* priv = (struct tm2_priv*)s_priv; if (priv->fd >= 0) { @@ -952,7 +948,7 @@ static int unload_graph(struct serializer* s, struct graph* graph, void* s_priv, if (priv->base) { - sys_free(( void* )priv->base); + sys_free((void*)priv->base); priv->base = NULL; } @@ -969,7 +965,7 @@ static int unload_graph(struct serializer* s, struct graph* graph, void* s_priv, static int register_op_loader(struct serializer* s, int op_type, int op_ver, void* op_load_func, void* op_map_func, void* ver_map_func) { - struct tm2_serializer* tm2_s = ( struct tm2_serializer* )s; + struct tm2_serializer* tm2_s = (struct tm2_serializer*)s; tm2_op_loader_t op_load = (tm2_op_loader_t)op_load_func; tm2_map_t op_map = (tm2_map_t)op_map_func; tm2_map_t ver_map = (tm2_map_t)ver_map_func; @@ -979,7 +975,7 @@ static int register_op_loader(struct serializer* s, int op_type, int op_ver, voi static int unregister_op_loader(struct serializer* s, int op_type, int op_ver, void* op_load_func) { - struct tm2_serializer* tm2_s = ( struct tm2_serializer* )s; + struct tm2_serializer* tm2_s = (struct tm2_serializer*)s; tm2_op_loader_t op_load = (tm2_op_loader_t)op_load_func; return unregister_tm2_op_loader(tm2_s, op_type, op_ver, op_load); @@ -1002,7 +998,7 @@ static int input_op_map(int op) static int init_tm2_serializer(struct serializer* s) { - struct tm2_serializer* tm2_s = ( struct tm2_serializer* )s; + struct tm2_serializer* tm2_s = (struct tm2_serializer*)s; tm2_s->loader_list = create_vector(sizeof(struct op_loader_entry), NULL); @@ -1028,27 +1024,24 @@ static int release_tm2_serializer(struct serializer* s) } static struct tm2_serializer tm2_serializer = { - .base = - { - .get_name = get_name, - .load_model = load_model, - .load_mem = load_mem, - .unload_graph = unload_graph, - .register_op_loader = register_op_loader, - .unregister_op_loader = unregister_op_loader, - .init = init_tm2_serializer, - .release = release_tm2_serializer, - }, - .loader_list = NULL, + .base = { + .get_name = get_name, + .load_model = load_model, + .load_mem = load_mem, + .unload_graph = unload_graph, + .register_op_loader = register_op_loader, + .unregister_op_loader = unregister_op_loader, + .init = init_tm2_serializer, + .release = release_tm2_serializer, + }, + .loader_list = NULL, }; - int register_tm2_serializer() { return register_serializer((struct serializer*)&tm2_serializer); } - int unregister_tm2_serializer() { return unregister_serializer((struct serializer*)&tm2_serializer); diff --git a/source/serializer/tmfile/tm2_serializer.h b/source/serializer/tmfile/tm2_serializer.h index 23e2edf21..1e0887fe7 100644 --- a/source/serializer/tmfile/tm2_serializer.h +++ b/source/serializer/tmfile/tm2_serializer.h @@ -31,19 +31,16 @@ struct node; struct graph; - struct tm2_priv { int fd; /* for file load */ int mem_len; - const char* base; /* mem base for model */ - const TM2_Header* header; /* file header */ - const TM2_Model* model; /* model header */ + const char* base; /* mem base for model */ + const TM2_Header* header; /* file header */ + const TM2_Model* model; /* model header */ const TM2_Subgraph* subgraph; /* subgraph */ }; - typedef int (*tm2_op_loader_t)(struct graph*, struct node*, const TM2_Node*, const TM2_Operator* tm_op); - typedef int (*tm2_map_t)(int); diff --git a/source/system/cpu.c b/source/system/cpu.c index 7feffa8ac..87a6007a0 100644 --- a/source/system/cpu.c +++ b/source/system/cpu.c @@ -174,7 +174,7 @@ static int get_max_freq_khz(int cpuid) fclose(fp); - if (max_freq_khz <=0 && EOF == ret) + if (max_freq_khz <= 0 && EOF == ret) return -1; else return max_freq_khz; @@ -206,7 +206,7 @@ static int set_sched_affinity(size_t thread_affinity_mask) #define CPU_SETSIZE 1024 #endif #ifndef __NCPUBITS -#define __NCPUBITS (8 * sizeof (unsigned long)) +#define __NCPUBITS (8 * sizeof(unsigned long)) #endif typedef struct @@ -222,7 +222,7 @@ static int set_sched_affinity(size_t thread_affinity_mask) #if (defined __GLIBC__) || (defined _OHOS_) || (defined V831) pid_t pid = syscall(SYS_gettid); #else - #ifdef PI3 +#ifdef PI3 pid_t pid = getpid(); #else @@ -237,7 +237,7 @@ static int set_sched_affinity(size_t thread_affinity_mask) #endif cpu_set_t mask; CPU_ZERO(&mask); -// for (int i = 0; i < ( int )sizeof(size_t) * 8; i++) + // for (int i = 0; i < ( int )sizeof(size_t) * 8; i++) for (int i = 0; i < core_count; i++) { if (thread_affinity_mask & (1 << i)) @@ -361,13 +361,13 @@ int set_cpu_affine(size_t mask) #elif __APPLE_IOS__ || _MSC_VER // threads affinity not supported on ios - ( void )mask; + (void)mask; return -1; #else int status = set_sched_affinity(mask); if (0 != status) return -1; - return 0; + return 0; #endif return 0; @@ -377,20 +377,20 @@ size_t get_cpu_cluster_mask(int cluster) { switch (cluster) { - case TENGINE_CLUSTER_BIG: - if (0 != affinity_mask_big_cluster) - return affinity_mask_big_cluster; - break; - case TENGINE_CLUSTER_MEDIUM: - if (0 != affinity_mask_medium_cluster) - return affinity_mask_medium_cluster; - break; - case TENGINE_CLUSTER_LITTLE: - if (0 != affinity_mask_little_cluster) - return affinity_mask_little_cluster; - break; - default: - break; + case TENGINE_CLUSTER_BIG: + if (0 != affinity_mask_big_cluster) + return affinity_mask_big_cluster; + break; + case TENGINE_CLUSTER_MEDIUM: + if (0 != affinity_mask_medium_cluster) + return affinity_mask_medium_cluster; + break; + case TENGINE_CLUSTER_LITTLE: + if (0 != affinity_mask_little_cluster) + return affinity_mask_little_cluster; + break; + default: + break; } return affinity_mask_all_cluster; diff --git a/source/utility/float.c b/source/utility/float.c index e8bd5e6f2..0496a2d7a 100644 --- a/source/utility/float.c +++ b/source/utility/float.c @@ -24,18 +24,17 @@ #include "utility/float.h" -#define BF16_EXP_MAX ( 256 - 1) // 2^8 - 1 -#define FP16_EXP_MAX ( 32 - 1) // 2^5 - 1 -#define FP32_EXP_MAX ( 256 - 1) // 2^8 - 1 -#define FP64_EXP_MAX (2048 - 1) // 2^11 - 1 - -#define FP16_NAN ((FP16_EXP_MAX << 10) + 1) -#define FP16_INF ((FP16_EXP_MAX << 10) + 0) -#define BF16_NAN ((BF16_EXP_MAX << 7) + 1) -#define BF16_INF ((BF16_EXP_MAX << 7) + 0) -#define FP32_NAN ((FP32_EXP_MAX << 23) + 1) -#define FP32_INF ((FP32_EXP_MAX << 23) + 0) - +#define BF16_EXP_MAX (256 - 1) // 2^8 - 1 +#define FP16_EXP_MAX (32 - 1) // 2^5 - 1 +#define FP32_EXP_MAX (256 - 1) // 2^8 - 1 +#define FP64_EXP_MAX (2048 - 1) // 2^11 - 1 + +#define FP16_NAN ((FP16_EXP_MAX << 10) + 1) +#define FP16_INF ((FP16_EXP_MAX << 10) + 0) +#define BF16_NAN ((BF16_EXP_MAX << 7) + 1) +#define BF16_INF ((BF16_EXP_MAX << 7) + 0) +#define FP32_NAN ((FP32_EXP_MAX << 23) + 1) +#define FP32_INF ((FP32_EXP_MAX << 23) + 0) #ifndef __ARM_ARCH fp32_t fp16_to_fp32(fp16_t package) @@ -55,7 +54,7 @@ fp32_t fp16_to_fp32(fp16_t package) if (FP16_EXP_MAX != package.exp && 0 != package.exp && 0 != package.frac) { data.frac = package.frac << 13; - data.exp = package.exp + (- 15 + 127); + data.exp = package.exp + (-15 + 127); data.sign = package.sign; return data.value; @@ -65,7 +64,7 @@ fp32_t fp16_to_fp32(fp16_t package) if (FP16_EXP_MAX == package.exp && 0 == package.frac) { data.frac = 0; - data.exp = FP32_EXP_MAX; + data.exp = FP32_EXP_MAX; data.sign = package.sign; return data.value; @@ -75,7 +74,7 @@ fp32_t fp16_to_fp32(fp16_t package) if (FP16_EXP_MAX == package.exp && 0 != package.frac) { data.frac = 1; - data.exp = FP32_EXP_MAX; + data.exp = FP32_EXP_MAX; data.sign = package.sign; return data.value; @@ -85,7 +84,7 @@ fp32_t fp16_to_fp32(fp16_t package) if (0 == package.exp && 0 != package.frac) { uint16_t frac = package.frac; - uint16_t exp = 0; + uint16_t exp = 0; while (0 == (frac & (uint16_t)0x200)) { @@ -94,7 +93,7 @@ fp32_t fp16_to_fp32(fp16_t package) } data.frac = (frac << 1) & (uint16_t)0x3FF; - data.exp = -exp + (-15 + 127); + data.exp = -exp + (-15 + 127); data.sign = package.sign; return data.value; @@ -103,7 +102,6 @@ fp32_t fp16_to_fp32(fp16_t package) return data.value; } - fp16_t fp32_to_fp16(fp32_t value) { fp32_pack_t* package = (fp32_pack_t*)(&value); @@ -113,7 +111,7 @@ fp16_t fp32_to_fp16(fp32_t value) if (0 == package->exp) { data.value = 0; - data.sign = package->sign; + data.sign = package->sign; return data; } @@ -121,13 +119,13 @@ fp16_t fp32_to_fp16(fp32_t value) // means normalized value if (FP32_EXP_MAX != package->exp && 0 != package->exp && 0 != package->frac) { - int16_t exp = package->exp + (-15 + 127); + int16_t exp = package->exp + (-15 + 127); // means overflow if (31 <= exp) { data.frac = 0; - data.exp = FP16_EXP_MAX; + data.exp = FP16_EXP_MAX; data.sign = package->sign; } else if (0 >= exp) @@ -135,21 +133,21 @@ fp16_t fp32_to_fp16(fp32_t value) // means subnormal numbers if (-10 <= exp) { - data.frac = (package->frac | 0x800000) >> (14 - exp); - data.exp = 0; - data.sign = package->sign; + data.frac = (package->frac | 0x800000) >> (14 - exp); + data.exp = 0; + data.sign = package->sign; } // means underflow else { data.value = 0; - data.sign = package->sign; + data.sign = package->sign; } } else { data.frac = package->frac >> 13; - data.exp = exp; + data.exp = exp; data.sign = package->sign; } @@ -160,7 +158,7 @@ fp16_t fp32_to_fp16(fp32_t value) if (FP32_EXP_MAX == package->exp && 0 == package->frac) { data.frac = 0; - data.exp = FP16_EXP_MAX; + data.exp = FP16_EXP_MAX; data.sign = package->sign; return data; @@ -170,7 +168,7 @@ fp16_t fp32_to_fp16(fp32_t value) if (FP32_EXP_MAX == package->exp && 0 != package->frac) { data.frac = 1; - data.exp = FP16_EXP_MAX; + data.exp = FP16_EXP_MAX; data.sign = package->sign; return data; @@ -181,7 +179,6 @@ fp16_t fp32_to_fp16(fp32_t value) } #endif - fp32_t bf16_to_fp32(bf16_t package) { fp32_pack_t data; @@ -189,7 +186,6 @@ fp32_t bf16_to_fp32(bf16_t package) return data.value; } - bf16_t fp32_to_bf16(fp32_t value) { fp32_pack_t* package = (fp32_pack_t*)(&value); @@ -198,7 +194,6 @@ bf16_t fp32_to_bf16(fp32_t value) return data; } - #ifndef _MSC_VER fp32_t pxr24_to_fp32(pxr24_pack_t package) { @@ -210,7 +205,6 @@ fp32_t pxr24_to_fp32(pxr24_pack_t package) return data.value; } - pxr24_pack_t fp32_to_pxr24(fp32_t value) { fp32_pack_t* package = (fp32_pack_t*)(&value); @@ -220,7 +214,7 @@ pxr24_pack_t fp32_to_pxr24(fp32_t value) pxr24_pack_t* ptr = (pxr24_pack_t*)((uint8_t*)(&pxr24_val)); data.frac = ptr->frac; - data.exp = ptr->exp; + data.exp = ptr->exp; data.sign = ptr->sign; return data; diff --git a/source/utility/float.h b/source/utility/float.h index a02f1a518..e7fdef127 100644 --- a/source/utility/float.h +++ b/source/utility/float.h @@ -39,64 +39,59 @@ // IEEE 754 // ISO/IEC/IEEE FDIS 60559:2010 - #ifdef _MSC_VER -#pragma pack (push,1) +#pragma pack(push, 1) #endif typedef union fp16_pack { struct { uint16_t frac : 10; - uint16_t exp : 5; - uint16_t sign : 1; + uint16_t exp : 5; + uint16_t sign : 1; } PACKAGE_MARK; uint16_t value; } PACKAGE_MARK fp16_pack_t; - typedef union bf16_pack { struct { - uint16_t frac : 7; - uint16_t exp : 8; - uint16_t sign : 1; + uint16_t frac : 7; + uint16_t exp : 8; + uint16_t sign : 1; } PACKAGE_MARK; uint16_t value; } PACKAGE_MARK bf16_pack_t; - #ifdef _MSC_VER typedef struct afp24_pack { uint16_t frac : 16; - uint8_t exp : 7; - uint8_t sign : 1; + uint8_t exp : 7; + uint8_t sign : 1; } afp24_pack_t; - typedef struct pxr24_pack { uint16_t frac : 15; - uint16_t : 1; - uint8_t : 7; - uint8_t sign : 1; + uint16_t : 1; + uint8_t : 7; + uint8_t sign : 1; } pxr24_pack_t; #else typedef struct afp24_pack { uint32_t frac : 16; - uint32_t exp : 7; - uint32_t sign : 1; + uint32_t exp : 7; + uint32_t sign : 1; } PACKAGE_MARK afp24_pack_t; - typedef struct pxr24_pack { uint32_t frac : 15; - uint32_t exp : 8; - uint32_t sign : 1; + uint32_t exp : 8; + uint32_t sign : 1; } PACKAGE_MARK pxr24_pack_t; #endif @@ -105,20 +100,19 @@ typedef union fp32_pack struct { uint32_t frac : 23; - uint32_t exp : 8; - uint32_t sign : 1; + uint32_t exp : 8; + uint32_t sign : 1; } PACKAGE_MARK; float value; } PACKAGE_MARK fp32_pack_t; - typedef union fp64_pack { struct { uint64_t frac : 52; - uint64_t exp : 11; - uint64_t sign : 1; + uint64_t exp : 11; + uint64_t sign : 1; } PACKAGE_MARK; double value; } PACKAGE_MARK fp64_pack_t; @@ -126,16 +120,14 @@ typedef union fp64_pack #pragma pack(pop) #endif - #ifdef __ARM_ARCH -typedef __fp16 fp16_t; +typedef __fp16 fp16_t; #else typedef fp16_pack_t fp16_t; #endif typedef bf16_pack_t bf16_t; -typedef float fp32_t; -typedef double fp64_t; - +typedef float fp32_t; +typedef double fp64_t; #ifndef __ARM_ARCH /*! @@ -147,7 +139,6 @@ typedef double fp64_t; */ fp32_t fp16_to_fp32(fp16_t package); - /*! * @brief Convert a number from float32 to float16. * @@ -158,7 +149,6 @@ fp32_t fp16_to_fp32(fp16_t package); fp16_t fp32_to_fp16(fp32_t package); #endif - /*! * @brief Convert a number from float16 to float32. * @@ -168,7 +158,6 @@ fp16_t fp32_to_fp16(fp32_t package); */ fp32_t bf16_to_fp32(bf16_t package); - /*! * @brief Convert a number from float32 to float16. * @@ -178,7 +167,6 @@ fp32_t bf16_to_fp32(bf16_t package); */ bf16_t fp32_to_bf16(fp32_t package); - #ifdef __ARM_ARCH #define fp16_to_fp32(data) ({ float f = data; f; }) #define fp32_to_fp16(data) ({ __fp16 f = data; f; }) diff --git a/source/utility/lock.c b/source/utility/lock.c index 6d299b6d3..f3de78577 100644 --- a/source/utility/lock.c +++ b/source/utility/lock.c @@ -27,26 +27,22 @@ #include "defines.h" #include "utility/sys_port.h" - static inline void bare_metal_mutex_init(mutex_t* mutex) { mutex->locker = sys_malloc(sizeof(mutex->locker)); *((int*)(mutex->locker)) = 0; } - static inline void bare_metal_mutex_lock(mutex_t* mutex) { *((int*)(mutex->locker)) = 1; } - static inline void bare_metal_mutex_unlock(mutex_t* mutex) { *((int*)(mutex->locker)) = 0; } - static inline void bare_metal_mutex_free(mutex_t* mutex) { if (NULL != mutex->locker) @@ -57,43 +53,34 @@ static inline void bare_metal_mutex_free(mutex_t* mutex) mutex->locker = NULL; } - // for WIN MSVC - - - #ifdef TENGINE_HAS_LIB_POSIX_THREAD #include typedef pthread_mutex_t lock_t; - static inline void posix_thread_mutex_init(mutex_t* mutex) { mutex->locker = sys_malloc(sizeof(lock_t)); pthread_mutex_init((lock_t*)mutex->locker, NULL); } - static inline void posix_thread_mutex_lock(mutex_t* mutex) { pthread_mutex_lock((lock_t*)mutex->locker); } - static inline void posix_thread_mutex_unlock(mutex_t* mutex) { pthread_mutex_unlock((lock_t*)mutex->locker); } - static inline void posix_thread_mutex_free(mutex_t* mutex) { return bare_metal_mutex_free(mutex); } - void init_mutex(mutex_t* mutex) { mutex->init = posix_thread_mutex_init; @@ -108,14 +95,12 @@ void init_mutex(mutex_t* mutex) typedef CRITICAL_SECTION lock_t; - -static inline void win_mutex_init(mutex_t* mutex) +static inline void win_mutex_init(mutex_t* mutex) { mutex->locker = sys_malloc(sizeof(lock_t)); InitializeCriticalSection((lock_t*)mutex->locker); } - static inline void win_mutex_lock(mutex_t* mutex) { if (NULL != mutex->locker) @@ -124,7 +109,6 @@ static inline void win_mutex_lock(mutex_t* mutex) } } - static inline void win_mutex_unlock(mutex_t* mutex) { if (NULL != mutex->locker) @@ -133,13 +117,11 @@ static inline void win_mutex_unlock(mutex_t* mutex) } } - static inline void win_mutex_free(mutex_t* mutex) { return bare_metal_mutex_free(mutex); } - void init_mutex(mutex_t* mutex) { mutex->init = win_mutex_init; @@ -161,27 +143,17 @@ void init_mutex(mutex_t* mutex) } #endif // end TENGINE_HAS_LIB_POSIX_THREAD - void lock_mutex(mutex_t* mutex) { return mutex->lock(mutex); } - void unlock_mutex(mutex_t* mutex) { return mutex->unlock(mutex); } - void free_mutex(mutex_t* mutex) { return mutex->free(mutex); } - - - - - - - diff --git a/source/utility/lock.h b/source/utility/lock.h index b4d7ee4c3..502fc99a4 100644 --- a/source/utility/lock.h +++ b/source/utility/lock.h @@ -24,21 +24,19 @@ #pragma once - /*! * @struct abstract_mutex * @brief Abstract mutex_t, platform independence */ typedef struct abstract_mutex { - void* locker; //!< platform dependence mutex impl - void (*init)(struct abstract_mutex* mutex); //!< init this mutex - void (*lock)(struct abstract_mutex* mutex); //!< lock this mutex - void (*unlock)(struct abstract_mutex* mutex); //!< unlock this mutex - void (*free)(struct abstract_mutex* mutex); //!< destroy this mutex + void* locker; //!< platform dependence mutex impl + void (*init)(struct abstract_mutex* mutex); //!< init this mutex + void (*lock)(struct abstract_mutex* mutex); //!< lock this mutex + void (*unlock)(struct abstract_mutex* mutex); //!< unlock this mutex + void (*free)(struct abstract_mutex* mutex); //!< destroy this mutex } mutex_t; - /*! * @brief Init a abstract mutex. * @@ -46,7 +44,6 @@ typedef struct abstract_mutex */ void init_mutex(mutex_t* mutex); - /*! * @brief Init a abstract mutex. * @@ -54,7 +51,6 @@ void init_mutex(mutex_t* mutex); */ void lock_mutex(mutex_t* mutex); - /*! * @brief Init a abstract mutex. * @@ -62,7 +58,6 @@ void lock_mutex(mutex_t* mutex); */ void unlock_mutex(mutex_t* mutex); - /*! * @brief Init a abstract mutex. * diff --git a/source/utility/log.c b/source/utility/log.c index 317a2cd14..1382d42cc 100644 --- a/source/utility/log.c +++ b/source/utility/log.c @@ -29,7 +29,6 @@ #include "api/c_api.h" #include "utility/lock.h" - #include #include #include @@ -38,11 +37,9 @@ #include #endif - static mutex_t log_locker; static const char* map_table[] = {"EMERG", "ALERT", "CRIT", "ERROR", "WARN", "NOTICE", "INFO", "DEBUG"}; - static void safety_log(struct logger* logger, char* message) { if (0 != message[TE_MAX_LOG_LENGTH - 1]) @@ -55,7 +52,6 @@ static void safety_log(struct logger* logger, char* message) unlock_mutex(&log_locker); } - static void do_log(struct logger* logger, enum log_level level, const char* fmt, ...) { if (logger->log_level < level || level > LOG_DEBUG) @@ -68,47 +64,47 @@ static void do_log(struct logger* logger, enum log_level level, const char* fmt, switch (level) { - case LOG_EMERG: - case LOG_ALERT: - case LOG_CRIT: - { - __android_log_print(ANDROID_LOG_FATAL, "Tengine", fmt, _ap); - break; - } - case LOG_ERR: - { - __android_log_print(ANDROID_LOG_ERROR, "Tengine", fmt, _ap); - break; - } - case LOG_WARNING: - { - __android_log_print(ANDROID_LOG_WARN, "Tengine", fmt, _ap); - break; - } - case LOG_NOTICE: - case LOG_INFO: - { - __android_log_print(ANDROID_LOG_INFO, "Tengine", fmt, _ap); - break; - } - case LOG_DEBUG: - { - __android_log_print(ANDROID_LOG_DEBUG, "Tengine", fmt, _ap); - break; - } - default: - { - __android_log_print(ANDROID_LOG_VERBOSE, "Tengine", fmt, _ap); - } + case LOG_EMERG: + case LOG_ALERT: + case LOG_CRIT: + { + __android_log_print(ANDROID_LOG_FATAL, "Tengine", fmt, _ap); + break; + } + case LOG_ERR: + { + __android_log_print(ANDROID_LOG_ERROR, "Tengine", fmt, _ap); + break; + } + case LOG_WARNING: + { + __android_log_print(ANDROID_LOG_WARN, "Tengine", fmt, _ap); + break; + } + case LOG_NOTICE: + case LOG_INFO: + { + __android_log_print(ANDROID_LOG_INFO, "Tengine", fmt, _ap); + break; + } + case LOG_DEBUG: + { + __android_log_print(ANDROID_LOG_DEBUG, "Tengine", fmt, _ap); + break; + } + default: + { + __android_log_print(ANDROID_LOG_VERBOSE, "Tengine", fmt, _ap); + } } va_end(_ap); return; #else va_list ap; - char msg[TE_MAX_LOG_LENGTH] = { 0 }; - int max_len = TE_MAX_LOG_LENGTH; - int left = max_len; + char msg[TE_MAX_LOG_LENGTH] = {0}; + int max_len = TE_MAX_LOG_LENGTH; + int left = max_len; char* p = msg; int ret; @@ -157,7 +153,6 @@ static void do_log(struct logger* logger, enum log_level level, const char* fmt, #endif } - static void change_log_level(struct logger* logger, int level) { if (level < 0 || level > LOG_DEBUG) @@ -168,19 +163,16 @@ static void change_log_level(struct logger* logger, int level) logger->log_level = level; } - static void set_output_func(struct logger* logger, void (*func)(const char*)) { logger->output_func = func; } - static void output_stderr(const char* msg) { fprintf(stderr, "%s", msg); } - struct logger* get_default_logger(void) { static int inited = 0; diff --git a/source/utility/log.h b/source/utility/log.h index 7126950d2..993a5e4e8 100644 --- a/source/utility/log.h +++ b/source/utility/log.h @@ -27,7 +27,6 @@ #include "api/c_api.h" - struct log_option { int print_prefix; @@ -35,7 +34,6 @@ struct log_option int print_level; }; - struct logger { const char* prefix; @@ -49,68 +47,66 @@ struct logger void (*set_output_func)(struct logger*, void (*func)(const char*)); }; - struct logger* get_default_logger(void); - -#define SET_LOG_OUTPUT(func) \ - do \ - { \ - struct logger* logger = get_default_logger(); \ - logger->set_output_func(logger, func); \ - } while(0) - -#define SET_LOG_LEVEL(level) \ - do \ - { \ - struct logger* logger = get_default_logger(); \ - logger->set_log_level(logger, level); \ - } while(0) - -#define SET_LOG_PRINT_TIME(val) \ - do \ - { \ - struct logger* logger = get_default_logger(); \ - logger->option.print_time = val; \ - } while(0) - -#define SET_LOG_PRINT_LEVEL(val) \ - do \ - { \ - struct logger* logger = get_default_logger(); \ - logger->option.print_level = val; \ - } while(0) - -#define SET_LOG_PRINT_PREFIX(val) \ - do \ - { \ - struct logger* logger = get_default_logger(); \ - logger->option.print_prefix = val; \ - } while(0) - -#define SET_LOG_PREFIX(prefix) \ - do \ - { \ - struct logger* logger = get_default_logger(); \ - logger->prefix = prefix; \ - } while(0) - -#define LOG(level, fmt, ...) \ - do \ - { \ - struct logger* logger = get_default_logger(); \ - logger->log(logger, level, fmt, ##__VA_ARGS__); \ - } while(0) - -#define TLOG_EMERG(fmt, ...) LOG(LOG_EMERG, fmt, ##__VA_ARGS__) -#define TLOG_ALERT(fmt, ...) LOG(LOG_ALERT, fmt, ##__VA_ARGS__) -#define TLOG_CRIT(fmt, ...) LOG(LOG_CRIT, fmt, ##__VA_ARGS__) -#define TLOG_ERR(fmt, ...) LOG(LOG_ERR, fmt, ##__VA_ARGS__) -#define TLOG_WARNING(fmt, ...) LOG(LOG_WARNING, fmt, ##__VA_ARGS__) -#define TLOG_NOTICE(fmt, ...) LOG(LOG_NOTICE, fmt, ##__VA_ARGS__) -#define TLOG_INFO(fmt, ...) LOG(LOG_INFO, fmt, ##__VA_ARGS__) -#define TLOG_DEBUG(fmt, ...) LOG(LOG_DEBUG, fmt, ##__VA_ARGS__) - -#define XLOG(level, fmt, ...) \ - LOG(level, "%s:%d ", __FILE__, __LINE__); \ +#define SET_LOG_OUTPUT(func) \ + do \ + { \ + struct logger* logger = get_default_logger(); \ + logger->set_output_func(logger, func); \ + } while (0) + +#define SET_LOG_LEVEL(level) \ + do \ + { \ + struct logger* logger = get_default_logger(); \ + logger->set_log_level(logger, level); \ + } while (0) + +#define SET_LOG_PRINT_TIME(val) \ + do \ + { \ + struct logger* logger = get_default_logger(); \ + logger->option.print_time = val; \ + } while (0) + +#define SET_LOG_PRINT_LEVEL(val) \ + do \ + { \ + struct logger* logger = get_default_logger(); \ + logger->option.print_level = val; \ + } while (0) + +#define SET_LOG_PRINT_PREFIX(val) \ + do \ + { \ + struct logger* logger = get_default_logger(); \ + logger->option.print_prefix = val; \ + } while (0) + +#define SET_LOG_PREFIX(prefix) \ + do \ + { \ + struct logger* logger = get_default_logger(); \ + logger->prefix = prefix; \ + } while (0) + +#define LOG(level, fmt, ...) \ + do \ + { \ + struct logger* logger = get_default_logger(); \ + logger->log(logger, level, fmt, ##__VA_ARGS__); \ + } while (0) + +#define TLOG_EMERG(fmt, ...) LOG(LOG_EMERG, fmt, ##__VA_ARGS__) +#define TLOG_ALERT(fmt, ...) LOG(LOG_ALERT, fmt, ##__VA_ARGS__) +#define TLOG_CRIT(fmt, ...) LOG(LOG_CRIT, fmt, ##__VA_ARGS__) +#define TLOG_ERR(fmt, ...) LOG(LOG_ERR, fmt, ##__VA_ARGS__) +#define TLOG_WARNING(fmt, ...) LOG(LOG_WARNING, fmt, ##__VA_ARGS__) +#define TLOG_NOTICE(fmt, ...) LOG(LOG_NOTICE, fmt, ##__VA_ARGS__) +#define TLOG_INFO(fmt, ...) LOG(LOG_INFO, fmt, ##__VA_ARGS__) +#define TLOG_DEBUG(fmt, ...) LOG(LOG_DEBUG, fmt, ##__VA_ARGS__) + +#define XLOG(level, fmt, ...) \ + LOG(level, "%s:%d ", __FILE__, __LINE__); \ LOG(level, fmt, ##__VA_ARGS__) diff --git a/source/utility/math.c b/source/utility/math.c index 1d7ab8c9d..117d343fa 100644 --- a/source/utility/math.c +++ b/source/utility/math.c @@ -27,38 +27,32 @@ #include - int imin(int a, int b) { return a <= b ? a : b; } - int imax(int a, int b) { return a >= b ? a : b; } - int min_abs(int a, int b) { return imin(abs(a), abs(b)); } - int max_abs(int a, int b) { return imax(abs(a), abs(b)); } - static int solve_gcd(int large, int small) { int val = large % small; return 0 == val ? small : gcd(small, val); } - int gcd(int a, int b) { if (0 == a || 0 == b) @@ -67,7 +61,6 @@ int gcd(int a, int b) return solve_gcd(max_abs(a, b), min_abs(a, b)); } - int lcm(int a, int b) { if (0 == a || 0 == b) @@ -76,14 +69,12 @@ int lcm(int a, int b) return abs(a * b) / solve_gcd(max_abs(a, b), min_abs(a, b)); } - int align(int value, int step) { const int mask = ~(abs(step) - 1); return (value + step) & mask; } - void* align_address(void* address, int step) { const size_t mask = ~(abs(step) - 1); diff --git a/source/utility/math.h b/source/utility/math.h index 672ddcdc1..16a7c5d9d 100644 --- a/source/utility/math.h +++ b/source/utility/math.h @@ -25,7 +25,6 @@ #pragma once - /*! * @brief Solve min value * @@ -36,7 +35,6 @@ */ int imin(int a, int b); - /*! * @brief Solve max value * @@ -47,7 +45,6 @@ int imin(int a, int b); */ int imax(int a, int b); - /*! * @brief Solve min absolute value * @@ -58,7 +55,6 @@ int imax(int a, int b); */ int min_abs(int a, int b); - /*! * @brief Solve max absolute value * @@ -69,7 +65,6 @@ int min_abs(int a, int b); */ int max_abs(int a, int b); - /*! * @brief Solve greatest common divisor * @@ -80,7 +75,6 @@ int max_abs(int a, int b); */ int gcd(int a, int b); - /*! * @brief Solve lowest common multiple * @@ -91,7 +85,6 @@ int gcd(int a, int b); */ int lcm(int a, int b); - /*! * @brief Solve min aligned value with the step length * @@ -102,7 +95,6 @@ int lcm(int a, int b); */ int align(int value, int step); - /*! * @brief Get aligned pointer * diff --git a/source/utility/mem_stat.c b/source/utility/mem_stat.c index c1106bba6..44e2212f5 100644 --- a/source/utility/mem_stat.c +++ b/source/utility/mem_stat.c @@ -69,7 +69,7 @@ static int find_block_list(void* ptr) for (i = 0; i < n; i++) { - struct block_stat* block_stat = ( struct block_stat* )get_vector_data(block_list, i); + struct block_stat* block_stat = (struct block_stat*)get_vector_data(block_list, i); if (block_stat->ptr == ptr) break; @@ -178,7 +178,7 @@ void stat_free(void* ptr) return; } - struct block_stat* block_stat = ( struct block_stat* )get_vector_data(block_list, idx); + struct block_stat* block_stat = (struct block_stat*)get_vector_data(block_list, idx); mem_stat.free_count++; mem_stat.cur_mem_size -= block_stat->size; @@ -204,7 +204,7 @@ void* stat_realloc(void* ptr, size_t size) void* new_ptr = realloc(ptr, size); - struct block_stat* block_stat = ( struct block_stat* )get_vector_data(block_list, idx); + struct block_stat* block_stat = (struct block_stat*)get_vector_data(block_list, idx); if (new_ptr == NULL) { diff --git a/source/utility/sys_port.c b/source/utility/sys_port.c index 783009568..a2887a929 100644 --- a/source/utility/sys_port.c +++ b/source/utility/sys_port.c @@ -82,7 +82,7 @@ char* strdup(const char* src) int n = strlen(src); - char* new_str = ( char* )sys_malloc(n + 1); + char* new_str = (char*)sys_malloc(n + 1); if (new_str == NULL) return NULL; diff --git a/source/utility/sys_port.h b/source/utility/sys_port.h index 83f4a5f72..151043663 100644 --- a/source/utility/sys_port.h +++ b/source/utility/sys_port.h @@ -52,8 +52,8 @@ void* sys_realloc(void* ptr, size_t size); #ifdef CONFIG_INTERN_ALLOCATOR -#define malloc buddy_malloc -#define free buddy_free +#define malloc buddy_malloc +#define free buddy_free #define realloc buddy_realloc void* buddy_malloc(size_t size); diff --git a/source/utility/utils.c b/source/utility/utils.c index d3e86838b..8079d1336 100644 --- a/source/utility/utils.c +++ b/source/utility/utils.c @@ -23,7 +23,6 @@ * Revised: lswang@openailab.com */ - #include "utility/utils.h" #include "defines.h" @@ -35,25 +34,23 @@ #include #include - const char* get_tensor_type_string(int tensor_type) { switch (tensor_type) { - case TENSOR_TYPE_VAR: - return "var"; - case TENSOR_TYPE_CONST: - return "const"; - case TENSOR_TYPE_INPUT: - return "input"; - case TENSOR_TYPE_DEP: - return "dep"; - default: - return "unknown"; + case TENSOR_TYPE_VAR: + return "var"; + case TENSOR_TYPE_CONST: + return "const"; + case TENSOR_TYPE_INPUT: + return "input"; + case TENSOR_TYPE_DEP: + return "dep"; + default: + return "unknown"; } } - const char* get_tensor_layout_string(int layout) { if (layout == TENGINE_LAYOUT_NHWC) @@ -62,31 +59,29 @@ const char* get_tensor_layout_string(int layout) return "NCHW"; } - const char* get_model_format_string(int model_format) { switch (model_format) { - case MODEL_FORMAT_TENGINE: - return "tengine"; - case MODEL_FORMAT_CAFFE: - return "caffe"; - case MODEL_FORMAT_ONNX: - return "onnx"; - case MODEL_FORMAT_MXNET: - return "mxnet"; - case MODEL_FORMAT_TENSORFLOW: - return "tensorflow"; - case MODEL_FORMAT_TFLITE: - return "tflite"; - case MODEL_FORMAT_DLA: - return "dla"; - default: - return "unknown"; + case MODEL_FORMAT_TENGINE: + return "tengine"; + case MODEL_FORMAT_CAFFE: + return "caffe"; + case MODEL_FORMAT_ONNX: + return "onnx"; + case MODEL_FORMAT_MXNET: + return "mxnet"; + case MODEL_FORMAT_TENSORFLOW: + return "tensorflow"; + case MODEL_FORMAT_TFLITE: + return "tflite"; + case MODEL_FORMAT_DLA: + return "dla"; + default: + return "unknown"; } } - int get_op_type_from_name(const char* name) { int count = get_op_method_count(); @@ -104,68 +99,63 @@ int get_op_type_from_name(const char* name) return -1; } - const char* get_op_name_from_type(int op_type) { return find_op_name(op_type); } - int get_tenser_element_size(int data_type) { switch (data_type) { - case TENGINE_DT_FP32: - case TENGINE_DT_INT32: - return 4; - case TENGINE_DT_FP16: - case TENGINE_DT_INT16: - return 2; - case TENGINE_DT_INT8: - case TENGINE_DT_UINT8: - return 1; - default: - return 0; + case TENGINE_DT_FP32: + case TENGINE_DT_INT32: + return 4; + case TENGINE_DT_FP16: + case TENGINE_DT_INT16: + return 2; + case TENGINE_DT_INT8: + case TENGINE_DT_UINT8: + return 1; + default: + return 0; } } - const char* get_tensor_data_type_string(int data_type) { switch (data_type) { - case TENGINE_DT_FP32: - return "fp32"; - case TENGINE_DT_FP16: - return "fp16"; - case TENGINE_DT_INT8: - return "int8"; - case TENGINE_DT_UINT8: - return "uint8"; - case TENGINE_DT_INT32: - return "int32"; - case TENGINE_DT_INT16: - return "int16"; - default: - return "unknown"; + case TENGINE_DT_FP32: + return "fp32"; + case TENGINE_DT_FP16: + return "fp16"; + case TENGINE_DT_INT8: + return "int8"; + case TENGINE_DT_UINT8: + return "uint8"; + case TENGINE_DT_INT32: + return "int32"; + case TENGINE_DT_INT16: + return "int16"; + default: + return "unknown"; } } - const char* data_type_typeinfo_name(int data_type) { switch (data_type) { - case TENGINE_DT_INT32: - return "i"; - case TENGINE_DT_FP32: - return "f"; - default: - return NULL; + case TENGINE_DT_INT32: + return "i"; + case TENGINE_DT_FP32: + return "f"; + default: + return NULL; } } - void dump_float(const char* file_name, float* data, int number) { FILE* fp = fopen(file_name, "w"); @@ -184,7 +174,6 @@ void dump_float(const char* file_name, float* data, int number) fclose(fp); } - int get_mask_count(size_t mask) { int count = 0; @@ -196,7 +185,6 @@ int get_mask_count(size_t mask) return count; } - int get_mask_index(size_t mask) { if (get_mask_count(mask) > 1) diff --git a/source/utility/utils.h b/source/utility/utils.h index c52ccfdbd..59ad75aff 100644 --- a/source/utility/utils.h +++ b/source/utility/utils.h @@ -27,7 +27,6 @@ #include - /*! * @brief Convert tensor type to char array. * @@ -37,7 +36,6 @@ */ const char* get_tensor_type_string(int tensor_type); - /*! * @brief Convert tensor layout to char array. * @@ -47,7 +45,6 @@ const char* get_tensor_type_string(int tensor_type); */ const char* get_tensor_layout_string(int tensor_layout); - /*! * @brief Convert model format to char array. * @@ -57,7 +54,6 @@ const char* get_tensor_layout_string(int tensor_layout); */ const char* get_model_format_string(int model_format); - /*! * @brief Convert operator name char array to enumeration value. * @@ -67,7 +63,6 @@ const char* get_model_format_string(int model_format); */ int get_op_type_from_name(const char* name); - /*! * @brief Convert operator enumeration value to char array. * @@ -77,7 +72,6 @@ int get_op_type_from_name(const char* name); */ const char* get_op_name_from_type(int op_type); - /*! * @brief Get single element size of the tensor data type. * @@ -87,7 +81,6 @@ const char* get_op_name_from_type(int op_type); */ int get_tenser_element_size(int data_type); - /*! * @brief Convert tensor data type to char array. * @@ -97,7 +90,6 @@ int get_tenser_element_size(int data_type); */ const char* get_tensor_data_type_string(int data_type); - /*! * @brief Convert tensor data type single letter char array. * @@ -107,11 +99,8 @@ const char* get_tensor_data_type_string(int data_type); */ const char* data_type_typeinfo_name(int data_type); - void dump_float(const char* file_name, float* data, int number); - int get_mask_count(size_t mask); - int get_mask_index(size_t mask); diff --git a/source/utility/vector.c b/source/utility/vector.c index be8b1f01d..887d4d385 100644 --- a/source/utility/vector.c +++ b/source/utility/vector.c @@ -31,25 +31,22 @@ #include - typedef struct vector_entry { int valid; unsigned char data[]; } vector_entry_t; - static inline vector_entry_t* get_vector_entry(vector_t* v, int idx) { return (vector_entry_t*)((char*)v->mem + v->entry_size * idx); } - static inline void free_vector_data_resource(vector_t* v, int idx) { vector_entry_t* e = get_vector_entry(v, idx); - if(e->valid && v->free_func) + if (e->valid && v->free_func) { v->free_func(e->data); } @@ -57,7 +54,6 @@ static inline void free_vector_data_resource(vector_t* v, int idx) e->valid = 0; } - static inline void remove_vector_data_not_tail(vector_t* v, int idx) { vector_entry_t* entry_ptr = NULL; @@ -78,7 +74,6 @@ static inline void remove_vector_data_not_tail(vector_t* v, int idx) entry_ptr->valid = 0; } - vector_t* create_vector(int elem_size, void (*free_data)(void*)) { vector_t* v = (vector_t*)sys_malloc(sizeof(vector_t)); @@ -109,7 +104,6 @@ vector_t* create_vector(int elem_size, void (*free_data)(void*)) return v; } - void release_vector(vector_t* v) { for (int i = 0; i < v->elem_num; i++) @@ -121,7 +115,6 @@ void release_vector(vector_t* v) sys_free(v); } - int get_vector_num(vector_t* v) { if (NULL != v) @@ -132,7 +125,6 @@ int get_vector_num(vector_t* v) return 0; } - int resize_vector(vector_t* v, int new_size) { void* new_mem = NULL; @@ -162,7 +154,7 @@ int resize_vector(vector_t* v, int new_size) } v->real_mem = new_mem; - v->mem = ( void* )(((size_t)(v->real_mem)) & (~(TE_VECTOR_ALIGN_SIZE - 1))); + v->mem = (void*)(((size_t)(v->real_mem)) & (~(TE_VECTOR_ALIGN_SIZE - 1))); for (int i = v->space_num; i < new_size; i++) { @@ -175,10 +167,9 @@ int resize_vector(vector_t* v, int new_size) return 0; } - int push_vector_data(vector_t* v, void* data) { - if(v->elem_num == v->space_num && resize_vector(v, v->elem_num + v->ahead_num) < 0) + if (v->elem_num == v->space_num && resize_vector(v, v->elem_num + v->ahead_num) < 0) { return -1; } @@ -189,12 +180,11 @@ int push_vector_data(vector_t* v, void* data) return 0; } - int set_vector_data(vector_t* v, int idx, void* data) { vector_entry_t* e = NULL; - if(idx >= v->elem_num) + if (idx >= v->elem_num) return -1; free_vector_data_resource(v, idx); @@ -207,10 +197,9 @@ int set_vector_data(vector_t* v, int idx, void* data) return 0; } - void* get_vector_data(vector_t* v, int index) { - if(index >= v->elem_num) + if (index >= v->elem_num) { return NULL; } @@ -220,7 +209,6 @@ void* get_vector_data(vector_t* v, int index) return e->data; } - int remove_vector_via_pointer(vector_t* v, void* data) { const int count = v->elem_num; @@ -245,11 +233,10 @@ int remove_vector_via_pointer(vector_t* v, void* data) return 0; } - void remove_vector_via_index(vector_t* v, int idx) { // the last one - if(idx == v->elem_num - 1) + if (idx == v->elem_num - 1) { free_vector_data_resource(v, idx); v->elem_num--; diff --git a/source/utility/vector.h b/source/utility/vector.h index 73153b3f1..45ab70f03 100644 --- a/source/utility/vector.h +++ b/source/utility/vector.h @@ -35,18 +35,17 @@ extern "C" { */ typedef struct vector { - int elem_size; //!< elements size which will be pushed into vector - int elem_num; //!< current counter of inserted elements - - int entry_size; //!< size of inside vector header entry - int space_num; //!< the allocated elements counter, which should greater equal to 'elem_num' - int ahead_num; //!< allocated step when vector is full - void* real_mem; //!< real aligned memory address which point to vector entry - void* mem; //!< visual aligned address which point to the very begging of elements - void (*free_func)(void*); //!< elements free function, will be called when release elements or vector + int elem_size; //!< elements size which will be pushed into vector + int elem_num; //!< current counter of inserted elements + + int entry_size; //!< size of inside vector header entry + int space_num; //!< the allocated elements counter, which should greater equal to 'elem_num' + int ahead_num; //!< allocated step when vector is full + void* real_mem; //!< real aligned memory address which point to vector entry + void* mem; //!< visual aligned address which point to the very begging of elements + void (*free_func)(void*); //!< elements free function, will be called when release elements or vector } vector_t; - /*! * @brief Create a vector for a struct(or something else). * @@ -59,7 +58,6 @@ typedef struct vector */ vector_t* create_vector(int elem_size, void (*free_func)(void*)); - /*! * @brief Release a vector. * @@ -67,7 +65,6 @@ vector_t* create_vector(int elem_size, void (*free_func)(void*)); */ void release_vector(vector_t* v); - /*! * @brief Get the count of elements. * @@ -77,7 +74,6 @@ void release_vector(vector_t* v); */ int get_vector_num(vector_t* v); - /*! * @brief Resize a vector. * @@ -88,7 +84,6 @@ int get_vector_num(vector_t* v); */ int resize_vector(vector_t* v, int new_size); - /*! * @brief Push a element into vector from its pointer. * @@ -99,7 +94,6 @@ int resize_vector(vector_t* v, int new_size); */ int push_vector_data(vector_t* v, void* data); - /*! * @brief Set a element via its index. * @@ -111,7 +105,6 @@ int push_vector_data(vector_t* v, void* data); */ int set_vector_data(vector_t* v, int index, void* data); - /*! * @brief Get a element via its index. * @@ -122,7 +115,6 @@ int set_vector_data(vector_t* v, int index, void* data); */ void* get_vector_data(vector_t* v, int index); - /*! * @brief Remove a element via its pointer. * @@ -133,7 +125,6 @@ void* get_vector_data(vector_t* v, int index); */ int remove_vector_via_pointer(vector_t* v, void* data); - /*! * @brief Remove a element via its index. * diff --git a/tests/common/common.h b/tests/common/common.h index 40a263aba..9ab861855 100644 --- a/tests/common/common.h +++ b/tests/common/common.h @@ -42,9 +42,9 @@ #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN #include -#else // _WIN32 +#else // _WIN32 #include -#endif // _WIN32 +#endif // _WIN32 #ifdef _WIN32 static double get_current_time() @@ -56,7 +56,7 @@ static double get_current_time() return pc.QuadPart * 1000.0 / freq.QuadPart; } -#else // _WIN32 +#else // _WIN32 static double get_current_time() { @@ -65,7 +65,7 @@ static double get_current_time() return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0; } -#endif // _WIN32 +#endif // _WIN32 static void split(float* array, char* str, const char* del) { @@ -78,4 +78,4 @@ static void split(float* array, char* str, const char* del) } } -#endif // __COMMON_H__ +#endif // __COMMON_H__ diff --git a/tests/common/compiler_fp16.h b/tests/common/compiler_fp16.h index 1857d7eec..d770707c2 100644 --- a/tests/common/compiler_fp16.h +++ b/tests/common/compiler_fp16.h @@ -48,7 +48,7 @@ extern "C" { #else #ifdef _MSC_VER -#pragma pack (push,1) +#pragma pack(push, 1) struct fp16_pack { unsigned short frac : 10; @@ -84,12 +84,12 @@ typedef struct fp16_pack __fp16; static inline float fp16_to_fp32(__fp16 data) { float f; - struct fp32_pack* fp32 = ( struct fp32_pack* )&f; + struct fp32_pack* fp32 = (struct fp32_pack*)&f; struct fp16_pack* fp16 = &data; int exp = fp16->exp; - if(exp == 31 && fp16->frac != 0) + if (exp == 31 && fp16->frac != 0) { // return __builtin_inf()-__builtin_inf(); fp32->sign = fp16->sign; @@ -99,28 +99,28 @@ static inline float fp16_to_fp32(__fp16 data) return f; } - if(exp == 31) + if (exp == 31) exp = 255; - if(exp == 0) + if (exp == 0) exp = 0; else exp = (exp - 15) + 127; fp32->exp = exp; fp32->sign = fp16->sign; - fp32->frac = (( int )fp16->frac) << 13; + fp32->frac = ((int)fp16->frac) << 13; return f; } static inline __fp16 fp32_to_fp16(float data) { - struct fp32_pack* fp32 = ( struct fp32_pack* )&data; + struct fp32_pack* fp32 = (struct fp32_pack*)&data; struct fp16_pack fp16; int exp = fp32->exp; - if(fp32->exp == 255 && fp32->frac != 0) + if (fp32->exp == 255 && fp32->frac != 0) { // NaN fp16.exp = 31; @@ -130,9 +130,9 @@ static inline __fp16 fp32_to_fp16(float data) return fp16; } - if((exp - 127) < -14) + if ((exp - 127) < -14) exp = 0; - else if((exp - 127) > 15) + else if ((exp - 127) > 15) exp = 31; else exp = exp - 127 + 15; diff --git a/tests/common/stb_image.h b/tests/common/stb_image.h index aa445aadf..142610cf4 100644 --- a/tests/common/stb_image.h +++ b/tests/common/stb_image.h @@ -3,13 +3,13 @@ #ifndef STBI_NO_STDIO #include -#endif // STBI_NO_STDIO +#endif // STBI_NO_STDIO #define STBI_VERSION 1 enum { - STBI_default = 0, // only used for desired_channels + STBI_default = 0, // only used for desired_channels STBI_grey = 1, STBI_grey_alpha = 2, @@ -36,9 +36,9 @@ extern "C" { typedef struct { int (*read)(void* user, char* data, - int size); // fill 'data' with 'size' bytes. return number of bytes actually read - void (*skip)(void* user, int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative - int (*eof)(void* user); // returns nonzero if we are at end of file/data + int size); // fill 'data' with 'size' bytes. return number of bytes actually read + void (*skip)(void* user, int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative + int (*eof)(void* user); // returns nonzero if we are at end of file/data } stbi_io_callbacks; //////////////////////////////////// @@ -95,12 +95,12 @@ extern float* stbi_loadf_from_file(FILE* f, int* x, int* y, int* channels_in_fil #ifndef STBI_NO_HDR extern void stbi_hdr_to_ldr_gamma(float gamma); extern void stbi_hdr_to_ldr_scale(float scale); -#endif // STBI_NO_HDR +#endif // STBI_NO_HDR #ifndef STBI_NO_LINEAR extern void stbi_ldr_to_hdr_gamma(float gamma); extern void stbi_ldr_to_hdr_scale(float scale); -#endif // STBI_NO_LINEAR +#endif // STBI_NO_LINEAR // stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR extern int stbi_is_hdr_from_callbacks(stbi_io_callbacks const* clbk, void* user); @@ -108,7 +108,7 @@ extern int stbi_is_hdr_from_memory(stbi_uc const* buffer, int len); #ifndef STBI_NO_STDIO extern int stbi_is_hdr(char const* filename); extern int stbi_is_hdr_from_file(FILE* f); -#endif // STBI_NO_STDIO +#endif // STBI_NO_STDIO // get a VERY brief reason for failure // NOT THREADSAFE @@ -160,14 +160,12 @@ extern int stbi_zlib_decode_noheader_buffer(char* obuffer, int olen, const char* // // //// end header file ///////////////////////////////////////////////////// -#endif // STBI_INCLUDE_STB_IMAGE_H +#endif // STBI_INCLUDE_STB_IMAGE_H #define STB_IMAGE_IMPLEMENTATION #ifdef STB_IMAGE_IMPLEMENTATION -#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) || \ - defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || \ - defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB) +#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB) #ifndef STBI_ONLY_JPEG #define STBI_NO_JPEG #endif @@ -202,13 +200,13 @@ extern int stbi_zlib_decode_noheader_buffer(char* obuffer, int olen, const char* #endif #include -#include // ptrdiff_t on osx +#include // ptrdiff_t on osx #include #include #include #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) -#include // ldexp, pow +#include // ldexp, pow #endif #ifndef STBI_NO_STDIO @@ -247,9 +245,9 @@ typedef int32_t stbi__int32; typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1]; #ifdef _MSC_VER -#define STBI_NOTUSED(v) ( void )(v) +#define STBI_NOTUSED(v) (void)(v) #else -#define STBI_NOTUSED(v) ( void )sizeof(v) +#define STBI_NOTUSED(v) (void)sizeof(v) #endif #ifdef _MSC_VER @@ -271,9 +269,9 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1]; #endif #ifndef STBI_MALLOC -#define STBI_MALLOC(sz) malloc(sz) +#define STBI_MALLOC(sz) malloc(sz) #define STBI_REALLOC(p, newsz) realloc(p, newsz) -#define STBI_FREE(p) free(p) +#define STBI_FREE(p) free(p) #endif #ifndef STBI_REALLOC_SIZED @@ -319,8 +317,8 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1]; #ifdef _MSC_VER -#if _MSC_VER >= 1400 // not VC6 -#include // __cpuid +#if _MSC_VER >= 1400 // not VC6 +#include // __cpuid static int stbi__cpuid3(void) { int info[4]; @@ -347,7 +345,7 @@ static int stbi__sse2_available(void) int info3 = stbi__cpuid3(); return ((info3 >> 26) & 1) != 0; } -#else // assume GCC-style if not VC++ +#else // assume GCC-style if not VC++ #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) static int stbi__sse2_available(void) @@ -404,8 +402,8 @@ static void stbi__start_mem(stbi__context* s, stbi_uc const* buffer, int len) { s->io.read = NULL; s->read_from_callbacks = 0; - s->img_buffer = s->img_buffer_original = ( stbi_uc* )buffer; - s->img_buffer_end = s->img_buffer_original_end = ( stbi_uc* )buffer + len; + s->img_buffer = s->img_buffer_original = (stbi_uc*)buffer; + s->img_buffer_end = s->img_buffer_original_end = (stbi_uc*)buffer + len; } // initialize a callback-based context @@ -424,17 +422,17 @@ static void stbi__start_callbacks(stbi__context* s, stbi_io_callbacks* c, void* static int stbi__stdio_read(void* user, char* data, int size) { - return ( int )fread(data, 1, size, ( FILE* )user); + return (int)fread(data, 1, size, (FILE*)user); } static void stbi__stdio_skip(void* user, int n) { - fseek(( FILE* )user, n, SEEK_CUR); + fseek((FILE*)user, n, SEEK_CUR); } static int stbi__stdio_eof(void* user) { - return feof(( FILE* )user); + return feof((FILE*)user); } static stbi_io_callbacks stbi__stdio_callbacks = { @@ -445,12 +443,12 @@ static stbi_io_callbacks stbi__stdio_callbacks = { static void stbi__start_file(stbi__context* s, FILE* f) { - stbi__start_callbacks(s, &stbi__stdio_callbacks, ( void* )f); + stbi__start_callbacks(s, &stbi__stdio_callbacks, (void*)f); } // static void stop_file(stbi__context *s) { } -#endif // !STBI_NO_STDIO +#endif // !STBI_NO_STDIO static void stbi__rewind(stbi__context* s) { @@ -564,7 +562,7 @@ static void* stbi__malloc(size_t size) // negative terms are considered invalid. static int stbi__addsizes_valid(int a, int b) { - if(b < 0) + if (b < 0) return 0; // now 0 <= b <= INT_MAX, hence also // 0 <= INT_MAX - b <= INTMAX. @@ -577,10 +575,10 @@ static int stbi__addsizes_valid(int a, int b) // negative factors are considered invalid. static int stbi__mul2sizes_valid(int a, int b) { - if(a < 0 || b < 0) + if (a < 0 || b < 0) return 0; - if(b == 0) - return 1; // mul-by-0 is always safe + if (b == 0) + return 1; // mul-by-0 is always safe // portable way to check for no overflows in a*b return a <= INT_MAX / b; } @@ -601,22 +599,21 @@ static int stbi__mad3sizes_valid(int a, int b, int c, int add) #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) { - return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__mul2sizes_valid(a * b * c, d) && - stbi__addsizes_valid(a * b * c * d, add); + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__mul2sizes_valid(a * b * c, d) && stbi__addsizes_valid(a * b * c * d, add); } #endif // mallocs with size overflow checking static void* stbi__malloc_mad2(int a, int b, int add) { - if(!stbi__mad2sizes_valid(a, b, add)) + if (!stbi__mad2sizes_valid(a, b, add)) return NULL; return stbi__malloc(a * b + add); } static void* stbi__malloc_mad3(int a, int b, int c, int add) { - if(!stbi__mad3sizes_valid(a, b, c, add)) + if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL; return stbi__malloc(a * b * c + add); } @@ -624,7 +621,7 @@ static void* stbi__malloc_mad3(int a, int b, int c, int add) #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) static void* stbi__malloc_mad4(int a, int b, int c, int d, int add) { - if(!stbi__mad4sizes_valid(a, b, c, d, add)) + if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL; return stbi__malloc(a * b * c * d + add); } @@ -642,8 +639,8 @@ static void* stbi__malloc_mad4(int a, int b, int c, int d, int add) #define stbi__err(x, y) stbi__err(x) #endif -#define stbi__errpf(x, y) (( float* )(size_t)(stbi__err(x, y) ? NULL : NULL)) -#define stbi__errpuc(x, y) (( unsigned char* )(size_t)(stbi__err(x, y) ? NULL : NULL)) +#define stbi__errpf(x, y) ((float*)(size_t)(stbi__err(x, y) ? NULL : NULL)) +#define stbi__errpuc(x, y) ((unsigned char*)(size_t)(stbi__err(x, y) ? NULL : NULL)) extern void stbi_image_free(void* retval_from_stbi_load) { @@ -667,43 +664,42 @@ extern void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) static void* stbi__load_main(stbi__context* s, int* x, int* y, int* comp, int req_comp, stbi__result_info* ri, int bpc) { - memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields - ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed - ri->channel_order = - STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order + memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields + ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed + ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order ri->num_channels = 0; #ifndef STBI_NO_JPEG - if(stbi__jpeg_test(s)) + if (stbi__jpeg_test(s)) return stbi__jpeg_load(s, x, y, comp, req_comp, ri); #endif #ifndef STBI_NO_PNG - if(stbi__png_test(s)) + if (stbi__png_test(s)) return stbi__png_load(s, x, y, comp, req_comp, ri); #endif #ifndef STBI_NO_BMP - if(stbi__bmp_test(s)) + if (stbi__bmp_test(s)) return stbi__bmp_load(s, x, y, comp, req_comp, ri); #endif #ifndef STBI_NO_GIF - if(stbi__gif_test(s)) + if (stbi__gif_test(s)) return stbi__gif_load(s, x, y, comp, req_comp, ri); #endif #ifndef STBI_NO_PSD - if(stbi__psd_test(s)) + if (stbi__psd_test(s)) return stbi__psd_load(s, x, y, comp, req_comp, ri, bpc); #endif #ifndef STBI_NO_PIC - if(stbi__pic_test(s)) + if (stbi__pic_test(s)) return stbi__pic_load(s, x, y, comp, req_comp, ri); #endif #ifndef STBI_NO_PNM - if(stbi__pnm_test(s)) + if (stbi__pnm_test(s)) return stbi__pnm_load(s, x, y, comp, req_comp, ri); #endif #ifndef STBI_NO_HDR - if(stbi__hdr_test(s)) + if (stbi__hdr_test(s)) { float* hdr = stbi__hdr_load(s, x, y, comp, req_comp, ri); return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); @@ -712,7 +708,7 @@ static void* stbi__load_main(stbi__context* s, int* x, int* y, int* comp, int re #ifndef STBI_NO_TGA // test tga last because it's a crappy test! - if(stbi__tga_test(s)) + if (stbi__tga_test(s)) return stbi__tga_load(s, x, y, comp, req_comp, ri); #endif @@ -725,13 +721,12 @@ static stbi_uc* stbi__convert_16_to_8(stbi__uint16* orig, int w, int h, int chan int img_len = w * h * channels; stbi_uc* reduced; - reduced = ( stbi_uc* )stbi__malloc(img_len); - if(reduced == NULL) + reduced = (stbi_uc*)stbi__malloc(img_len); + if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory"); - for(i = 0; i < img_len; ++i) - reduced[i] = - (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling + for (i = 0; i < img_len; ++i) + reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling STBI_FREE(orig); return reduced; @@ -743,13 +738,12 @@ static stbi__uint16* stbi__convert_8_to_16(stbi_uc* orig, int w, int h, int chan int img_len = w * h * channels; stbi__uint16* enlarged; - enlarged = ( stbi__uint16* )stbi__malloc(img_len * 2); - if(enlarged == NULL) - return ( stbi__uint16* )stbi__errpuc("outofmem", "Out of memory"); + enlarged = (stbi__uint16*)stbi__malloc(img_len * 2); + if (enlarged == NULL) + return (stbi__uint16*)stbi__errpuc("outofmem", "Out of memory"); - for(i = 0; i < img_len; ++i) - enlarged[i] = - (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff + for (i = 0; i < img_len; ++i) + enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff STBI_FREE(orig); return enlarged; @@ -758,17 +752,17 @@ static stbi__uint16* stbi__convert_8_to_16(stbi_uc* orig, int w, int h, int chan static void stbi__vertical_flip(void* image, int w, int h, int bytes_per_pixel) { int row; - size_t bytes_per_row = ( size_t )w * bytes_per_pixel; + size_t bytes_per_row = (size_t)w * bytes_per_pixel; stbi_uc temp[2048]; - stbi_uc* bytes = ( stbi_uc* )image; + stbi_uc* bytes = (stbi_uc*)image; - for(row = 0; row < (h >> 1); row++) + for (row = 0; row < (h >> 1); row++) { stbi_uc* row0 = bytes + row * bytes_per_row; stbi_uc* row1 = bytes + (h - row - 1) * bytes_per_row; // swap row0 with row1 size_t bytes_left = bytes_per_row; - while(bytes_left) + while (bytes_left) { size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp); memcpy(temp, row0, bytes_copy); @@ -786,8 +780,8 @@ static void stbi__vertical_flip_slices(void* image, int w, int h, int z, int byt int slice; int slice_size = w * h * bytes_per_pixel; - stbi_uc* bytes = ( stbi_uc* )image; - for(slice = 0; slice < z; ++slice) + stbi_uc* bytes = (stbi_uc*)image; + for (slice = 0; slice < z; ++slice) { stbi__vertical_flip(bytes, w, h, bytes_per_pixel); bytes += slice_size; @@ -799,25 +793,25 @@ static unsigned char* stbi__load_and_postprocess_8bit(stbi__context* s, int* x, stbi__result_info ri; void* result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8); - if(result == NULL) + if (result == NULL) return NULL; - if(ri.bits_per_channel != 8) + if (ri.bits_per_channel != 8) { STBI_ASSERT(ri.bits_per_channel == 16); - result = stbi__convert_16_to_8(( stbi__uint16* )result, *x, *y, req_comp == 0 ? *comp : req_comp); + result = stbi__convert_16_to_8((stbi__uint16*)result, *x, *y, req_comp == 0 ? *comp : req_comp); ri.bits_per_channel = 8; } // @TODO: move stbi__convert_format to here - if(stbi__vertically_flip_on_load) + if (stbi__vertically_flip_on_load) { int channels = req_comp ? req_comp : *comp; stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc)); } - return ( unsigned char* )result; + return (unsigned char*)result; } static stbi__uint16* stbi__load_and_postprocess_16bit(stbi__context* s, int* x, int* y, int* comp, int req_comp) @@ -825,32 +819,32 @@ static stbi__uint16* stbi__load_and_postprocess_16bit(stbi__context* s, int* x, stbi__result_info ri; void* result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16); - if(result == NULL) + if (result == NULL) return NULL; - if(ri.bits_per_channel != 16) + if (ri.bits_per_channel != 16) { STBI_ASSERT(ri.bits_per_channel == 8); - result = stbi__convert_8_to_16(( stbi_uc* )result, *x, *y, req_comp == 0 ? *comp : req_comp); + result = stbi__convert_8_to_16((stbi_uc*)result, *x, *y, req_comp == 0 ? *comp : req_comp); ri.bits_per_channel = 16; } // @TODO: move stbi__convert_format16 to here // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision - if(stbi__vertically_flip_on_load) + if (stbi__vertically_flip_on_load) { int channels = req_comp ? req_comp : *comp; stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16)); } - return ( stbi__uint16* )result; + return (stbi__uint16*)result; } #if !defined(STBI_NO_HDR) || !defined(STBI_NO_LINEAR) static void stbi__float_postprocess(float* result, int* x, int* y, int* comp, int req_comp) { - if(stbi__vertically_flip_on_load && result != NULL) + if (stbi__vertically_flip_on_load && result != NULL) { int channels = req_comp ? req_comp : *comp; stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); @@ -864,7 +858,7 @@ static FILE* stbi__fopen(char const* filename, char const* mode) { FILE* f; #if defined(_MSC_VER) && _MSC_VER >= 1400 - if(0 != fopen_s(&f, filename, mode)) + if (0 != fopen_s(&f, filename, mode)) f = 0; #else f = fopen(filename, mode); @@ -876,7 +870,7 @@ extern stbi_uc* stbi_load(const char* filename, int* x, int* y, int* comp, int r { FILE* f = stbi__fopen(filename, "rb"); unsigned char* result; - if(!f) + if (!f) return stbi__errpuc("can't fopen", "Unable to open file"); result = stbi_load_from_file(f, x, y, comp, req_comp); fclose(f); @@ -889,10 +883,10 @@ extern stbi_uc* stbi_load_from_file(FILE* f, int* x, int* y, int* comp, int req_ stbi__context s; stbi__start_file(&s, f); result = stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp); - if(result) + if (result) { // need to 'unget' all the characters in the IO buffer - fseek(f, -( int )(s.img_buffer_end - s.img_buffer), SEEK_CUR); + fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR); } return result; } @@ -903,10 +897,10 @@ extern stbi__uint16* stbi_load_from_file_16(FILE* f, int* x, int* y, int* comp, stbi__context s; stbi__start_file(&s, f); result = stbi__load_and_postprocess_16bit(&s, x, y, comp, req_comp); - if(result) + if (result) { // need to 'unget' all the characters in the IO buffer - fseek(f, -( int )(s.img_buffer_end - s.img_buffer), SEEK_CUR); + fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR); } return result; } @@ -915,14 +909,14 @@ extern stbi_us* stbi_load_16(char const* filename, int* x, int* y, int* comp, in { FILE* f = stbi__fopen(filename, "rb"); stbi__uint16* result; - if(!f) - return ( stbi_us* )stbi__errpuc("can't fopen", "Unable to open file"); + if (!f) + return (stbi_us*)stbi__errpuc("can't fopen", "Unable to open file"); result = stbi_load_from_file_16(f, x, y, comp, req_comp); fclose(f); return result; } -#endif //! STBI_NO_STDIO +#endif //! STBI_NO_STDIO extern stbi_us* stbi_load_16_from_memory(stbi_uc const* buffer, int len, int* x, int* y, int* channels_in_file, int desired_channels) @@ -936,7 +930,7 @@ extern stbi_us* stbi_load_16_from_callbacks(stbi_io_callbacks const* clbk, void* int* channels_in_file, int desired_channels) { stbi__context s; - stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user); + stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user); return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file, desired_channels); } @@ -951,7 +945,7 @@ extern stbi_uc* stbi_load_from_callbacks(stbi_io_callbacks const* clbk, void* us int req_comp) { stbi__context s; - stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user); + stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user); return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp); } @@ -963,8 +957,8 @@ extern stbi_uc* stbi_load_gif_from_memory(stbi_uc const* buffer, int len, int** stbi__context s; stbi__start_mem(&s, buffer, len); - result = ( unsigned char* )stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp); - if(stbi__vertically_flip_on_load) + result = (unsigned char*)stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp); + if (stbi__vertically_flip_on_load) { stbi__vertical_flip_slices(result, *x, *y, *z, *comp); } @@ -978,17 +972,17 @@ static float* stbi__loadf_main(stbi__context* s, int* x, int* y, int* comp, int { unsigned char* data; #ifndef STBI_NO_HDR - if(stbi__hdr_test(s)) + if (stbi__hdr_test(s)) { stbi__result_info ri; float* hdr_data = stbi__hdr_load(s, x, y, comp, req_comp, &ri); - if(hdr_data) + if (hdr_data) stbi__float_postprocess(hdr_data, x, y, comp, req_comp); return hdr_data; } #endif data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp); - if(data) + if (data) return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp); return stbi__errpf("unknown image type", "Image not of any known type, or corrupt"); } @@ -1004,7 +998,7 @@ extern float* stbi_loadf_from_callbacks(stbi_io_callbacks const* clbk, void* use int req_comp) { stbi__context s; - stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user); + stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user); return stbi__loadf_main(&s, x, y, comp, req_comp); } @@ -1013,7 +1007,7 @@ extern float* stbi_loadf(char const* filename, int* x, int* y, int* comp, int re { float* result; FILE* f = stbi__fopen(filename, "rb"); - if(!f) + if (!f) return stbi__errpf("can't fopen", "Unable to open file"); result = stbi_loadf_from_file(f, x, y, comp, req_comp); fclose(f); @@ -1026,9 +1020,9 @@ extern float* stbi_loadf_from_file(FILE* f, int* x, int* y, int* comp, int req_c stbi__start_file(&s, f); return stbi__loadf_main(&s, x, y, comp, req_comp); } -#endif // !STBI_NO_STDIO +#endif // !STBI_NO_STDIO -#endif // !STBI_NO_LINEAR +#endif // !STBI_NO_LINEAR // these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is // defined, for API simplicity; if STBI_NO_LINEAR is defined, it always @@ -1052,7 +1046,7 @@ extern int stbi_is_hdr(char const* filename) { FILE* f = stbi__fopen(filename, "rb"); int result = 0; - if(f) + if (f) { result = stbi_is_hdr_from_file(f); fclose(f); @@ -1075,13 +1069,13 @@ extern int stbi_is_hdr_from_file(FILE* f) return 0; #endif } -#endif // !STBI_NO_STDIO +#endif // !STBI_NO_STDIO extern int stbi_is_hdr_from_callbacks(stbi_io_callbacks const* clbk, void* user) { #ifndef STBI_NO_HDR stbi__context s; - stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user); + stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user); return stbi__hdr_test(&s); #else STBI_NOTUSED(clbk); @@ -1128,8 +1122,8 @@ enum static void stbi__refill_buffer(stbi__context* s) { - int n = (s->io.read)(s->io_user_data, ( char* )s->buffer_start, s->buflen); - if(n == 0) + int n = (s->io.read)(s->io_user_data, (char*)s->buffer_start, s->buflen); + if (n == 0) { // at end of file, treat same as if from memory, but need to handle case // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file @@ -1147,9 +1141,9 @@ static void stbi__refill_buffer(stbi__context* s) stbi_inline static stbi_uc stbi__get8(stbi__context* s) { - if(s->img_buffer < s->img_buffer_end) + if (s->img_buffer < s->img_buffer_end) return *s->img_buffer++; - if(s->read_from_callbacks) + if (s->read_from_callbacks) { stbi__refill_buffer(s); return *s->img_buffer++; @@ -1159,13 +1153,13 @@ stbi_inline static stbi_uc stbi__get8(stbi__context* s) stbi_inline static int stbi__at_eof(stbi__context* s) { - if(s->io.read) + if (s->io.read) { - if(!(s->io.eof)(s->io_user_data)) + if (!(s->io.eof)(s->io_user_data)) return 0; // if feof() is true, check if buffer = end // special case: we've only got the special 0 character at the end - if(s->read_from_callbacks == 0) + if (s->read_from_callbacks == 0) return 1; } @@ -1174,15 +1168,15 @@ stbi_inline static int stbi__at_eof(stbi__context* s) static void stbi__skip(stbi__context* s, int n) { - if(n < 0) + if (n < 0) { s->img_buffer = s->img_buffer_end; return; } - if(s->io.read) + if (s->io.read) { - int blen = ( int )(s->img_buffer_end - s->img_buffer); - if(blen < n) + int blen = (int)(s->img_buffer_end - s->img_buffer); + if (blen < n) { s->img_buffer = s->img_buffer_end; (s->io.skip)(s->io_user_data, n - blen); @@ -1194,23 +1188,23 @@ static void stbi__skip(stbi__context* s, int n) static int stbi__getn(stbi__context* s, stbi_uc* buffer, int n) { - if(s->io.read) + if (s->io.read) { - int blen = ( int )(s->img_buffer_end - s->img_buffer); - if(blen < n) + int blen = (int)(s->img_buffer_end - s->img_buffer); + if (blen < n) { int res, count; memcpy(buffer, s->img_buffer, blen); - count = (s->io.read)(s->io_user_data, ( char* )buffer + blen, n - blen); + count = (s->io.read)(s->io_user_data, (char*)buffer + blen, n - blen); res = (count == (n - blen)); s->img_buffer = s->img_buffer_end; return res; } } - if(s->img_buffer + n <= s->img_buffer_end) + if (s->img_buffer + n <= s->img_buffer_end) { memcpy(buffer, s->img_buffer, n); s->img_buffer += n; @@ -1250,7 +1244,7 @@ static stbi__uint32 stbi__get32le(stbi__context* s) } #endif -#define STBI__BYTECAST(x) ((stbi_uc)(( x )&255)) // truncate int to byte without warnings +#define STBI__BYTECAST(x) ((stbi_uc)((x)&255)) // truncate int to byte without warnings ////////////////////////////////////////////////////////////////////////////// // @@ -1273,29 +1267,29 @@ static unsigned char* stbi__convert_format(unsigned char* data, int img_n, int r int i, j; unsigned char* good; - if(req_comp == img_n) + if (req_comp == img_n) return data; STBI_ASSERT(req_comp >= 1 && req_comp <= 4); - good = ( unsigned char* )stbi__malloc_mad3(req_comp, x, y, 0); - if(good == NULL) + good = (unsigned char*)stbi__malloc_mad3(req_comp, x, y, 0); + if (good == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); } - for(j = 0; j < ( int )y; ++j) + for (j = 0; j < (int)y; ++j) { unsigned char* src = data + j * x * img_n; unsigned char* dest = good + j * x * req_comp; -#define STBI__COMBO(a, b) (( a )*8 + (b)) +#define STBI__COMBO(a, b) ((a)*8 + (b)) #define STBI__CASE(a, b) \ case STBI__COMBO(a, b): \ - for(i = x - 1; i >= 0; --i, src += a, dest += b) + for (i = x - 1; i >= 0; --i, src += a, dest += b) // convert source image with img_n components to one with req_comp components; // avoid switch per pixel, so use switch per scanline and massive macros - switch(STBI__COMBO(img_n, req_comp)) + switch (STBI__COMBO(img_n, req_comp)) { STBI__CASE(1, 2) { @@ -1357,8 +1351,8 @@ static unsigned char* stbi__convert_format(unsigned char* data, int img_n, int r dest[0] = src[0], dest[1] = src[1], dest[2] = src[2]; } break; - default: - STBI_ASSERT(0); + default: + STBI_ASSERT(0); } #undef STBI__CASE } @@ -1377,29 +1371,29 @@ static stbi__uint16* stbi__convert_format16(stbi__uint16* data, int img_n, int r int i, j; stbi__uint16* good; - if(req_comp == img_n) + if (req_comp == img_n) return data; STBI_ASSERT(req_comp >= 1 && req_comp <= 4); - good = ( stbi__uint16* )stbi__malloc((size_t)req_comp * x * y * 2); - if(good == NULL) + good = (stbi__uint16*)stbi__malloc((size_t)req_comp * x * y * 2); + if (good == NULL) { STBI_FREE(data); - return ( stbi__uint16* )stbi__errpuc("outofmem", "Out of memory"); + return (stbi__uint16*)stbi__errpuc("outofmem", "Out of memory"); } - for(j = 0; j < ( int )y; ++j) + for (j = 0; j < (int)y; ++j) { stbi__uint16* src = data + j * x * img_n; stbi__uint16* dest = good + j * x * req_comp; -#define STBI__COMBO(a, b) (( a )*8 + (b)) +#define STBI__COMBO(a, b) ((a)*8 + (b)) #define STBI__CASE(a, b) \ case STBI__COMBO(a, b): \ - for(i = x - 1; i >= 0; --i, src += a, dest += b) + for (i = x - 1; i >= 0; --i, src += a, dest += b) // convert source image with img_n components to one with req_comp components; // avoid switch per pixel, so use switch per scanline and massive macros - switch(STBI__COMBO(img_n, req_comp)) + switch (STBI__COMBO(img_n, req_comp)) { STBI__CASE(1, 2) { @@ -1461,8 +1455,8 @@ static stbi__uint16* stbi__convert_format16(stbi__uint16* data, int img_n, int r dest[0] = src[0], dest[1] = src[1], dest[2] = src[2]; } break; - default: - STBI_ASSERT(0); + default: + STBI_ASSERT(0); } #undef STBI__CASE } @@ -1476,26 +1470,26 @@ static float* stbi__ldr_to_hdr(stbi_uc* data, int x, int y, int comp) { int i, k, n; float* output; - if(!data) + if (!data) return NULL; - output = ( float* )stbi__malloc_mad4(x, y, comp, sizeof(float), 0); - if(output == NULL) + output = (float*)stbi__malloc_mad4(x, y, comp, sizeof(float), 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); } // compute number of non-alpha components - if(comp & 1) + if (comp & 1) n = comp; else n = comp - 1; - for(i = 0; i < x * y; ++i) + for (i = 0; i < x * y; ++i) { - for(k = 0; k < n; ++k) + for (k = 0; k < n; ++k) { - output[i * comp + k] = ( float )(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) * stbi__l2h_scale); + output[i * comp + k] = (float)(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) * stbi__l2h_scale); } - if(k < comp) + if (k < comp) output[i * comp + k] = data[i * comp + k] / 255.0f; } STBI_FREE(data); @@ -1504,43 +1498,43 @@ static float* stbi__ldr_to_hdr(stbi_uc* data, int x, int y, int comp) #endif #ifndef STBI_NO_HDR -#define stbi__float2int(x) (( int )(x)) +#define stbi__float2int(x) ((int)(x)) static stbi_uc* stbi__hdr_to_ldr(float* data, int x, int y, int comp) { int i, k, n; stbi_uc* output; - if(!data) + if (!data) return NULL; - output = ( stbi_uc* )stbi__malloc_mad3(x, y, comp, 0); - if(output == NULL) + output = (stbi_uc*)stbi__malloc_mad3(x, y, comp, 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); } // compute number of non-alpha components - if(comp & 1) + if (comp & 1) n = comp; else n = comp - 1; - for(i = 0; i < x * y; ++i) + for (i = 0; i < x * y; ++i) { - for(k = 0; k < n; ++k) + for (k = 0; k < n; ++k) { - float z = ( float )pow((double)data[i * comp + k] * stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f; - if(z < 0) + float z = (float)pow((double)data[i * comp + k] * stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f; + if (z < 0) z = 0; - if(z > 255) + if (z > 255) z = 255; - output[i * comp + k] = ( stbi_uc )stbi__float2int(z); + output[i * comp + k] = (stbi_uc)stbi__float2int(z); } - if(k < comp) + if (k < comp) { float z = data[i * comp + k] * 255 + 0.5f; - if(z < 0) + if (z < 0) z = 0; - if(z > 255) + if (z > 255) z = 255; - output[i * comp + k] = ( stbi_uc )stbi__float2int(z); + output[i * comp + k] = (stbi_uc)stbi__float2int(z); } } STBI_FREE(data); @@ -1572,7 +1566,7 @@ static stbi_uc* stbi__hdr_to_ldr(float* data, int x, int y, int comp) #ifndef STBI_NO_JPEG // huffman decoding acceleration -#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache +#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache typedef struct { @@ -1582,7 +1576,7 @@ typedef struct stbi_uc values[256]; stbi_uc size[257]; unsigned int maxcode[18]; - int delta[17]; // old 'firstsymbol' - old 'firstcode' + int delta[17]; // old 'firstsymbol' - old 'firstcode' } stbi__huffman; typedef struct @@ -1611,14 +1605,14 @@ typedef struct stbi_uc* data; void *raw_data, *raw_coeff; stbi_uc* linebuf; - short* coeff; // progressive only - int coeff_w, coeff_h; // number of 8x8 coefficient blocks + short* coeff; // progressive only + int coeff_w, coeff_h; // number of 8x8 coefficient blocks } img_comp[4]; - stbi__uint32 code_buffer; // jpeg entropy-coded buffer - int code_bits; // number of valid bits - unsigned char marker; // marker seen while filling entropy buffer - int nomore; // flag if we saw a marker so must stop + stbi__uint32 code_buffer; // jpeg entropy-coded buffer + int code_bits; // number of valid bits + unsigned char marker; // marker seen while filling entropy buffer + int nomore; // flag if we saw a marker so must stop int progressive; int spec_start; @@ -1627,7 +1621,7 @@ typedef struct int succ_low; int eob_run; int jfif; - int app14_color_transform; // Adobe APP14 tag + int app14_color_transform; // Adobe APP14 tag int rgb; int scan_n, order[4]; @@ -1645,23 +1639,23 @@ static int stbi__build_huffman(stbi__huffman* h, int* count) int i, j, k = 0; unsigned int code; // build size list for each symbol (from JPEG spec) - for(i = 0; i < 16; ++i) - for(j = 0; j < count[i]; ++j) + for (i = 0; i < 16; ++i) + for (j = 0; j < count[i]; ++j) h->size[k++] = (stbi_uc)(i + 1); h->size[k] = 0; // compute actual symbols (from jpeg spec) code = 0; k = 0; - for(j = 1; j <= 16; ++j) + for (j = 1; j <= 16; ++j) { // compute delta to add to code to compute symbol id h->delta[j] = k - code; - if(h->size[k] == j) + if (h->size[k] == j) { - while(h->size[k] == j) + while (h->size[k] == j) h->code[k++] = (stbi__uint16)(code++); - if(code - 1 >= (1u << j)) + if (code - 1 >= (1u << j)) return stbi__err("bad code lengths", "Corrupt JPEG"); } // compute largest code + 1 for this size, preshifted as needed later @@ -1672,16 +1666,16 @@ static int stbi__build_huffman(stbi__huffman* h, int* count) // build non-spec acceleration table; 255 is flag for not-accelerated memset(h->fast, 255, 1 << FAST_BITS); - for(i = 0; i < k; ++i) + for (i = 0; i < k; ++i) { int s = h->size[i]; - if(s <= FAST_BITS) + if (s <= FAST_BITS) { int c = h->code[i] << (FAST_BITS - s); int m = 1 << (FAST_BITS - s); - for(j = 0; j < m; ++j) + for (j = 0; j < m; ++j) { - h->fast[c + j] = ( stbi_uc )i; + h->fast[c + j] = (stbi_uc)i; } } } @@ -1693,26 +1687,26 @@ static int stbi__build_huffman(stbi__huffman* h, int* count) static void stbi__build_fast_ac(stbi__int16* fast_ac, stbi__huffman* h) { int i; - for(i = 0; i < (1 << FAST_BITS); ++i) + for (i = 0; i < (1 << FAST_BITS); ++i) { stbi_uc fast = h->fast[i]; fast_ac[i] = 0; - if(fast < 255) + if (fast < 255) { int rs = h->values[fast]; int run = (rs >> 4) & 15; int magbits = rs & 15; int len = h->size[fast]; - if(magbits && len + magbits <= FAST_BITS) + if (magbits && len + magbits <= FAST_BITS) { // magnitude code followed by receive_extend code int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); int m = 1 << (magbits - 1); - if(k < m) + if (k < m) k += (~0U << magbits) + 1; // if the result is small enough, we can fit it in fast_ac table - if(k >= -128 && k <= 127) + if (k >= -128 && k <= 127) fast_ac[i] = (stbi__int16)((k * 256) + (run * 16) + (len + magbits)); } } @@ -1724,25 +1718,25 @@ static void stbi__grow_buffer_unsafe(stbi__jpeg* j) do { unsigned int b = j->nomore ? 0 : stbi__get8(j->s); - if(b == 0xff) + if (b == 0xff) { int c = stbi__get8(j->s); - while(c == 0xff) - c = stbi__get8(j->s); // consume fill bytes - if(c != 0) + while (c == 0xff) + c = stbi__get8(j->s); // consume fill bytes + if (c != 0) { - j->marker = ( unsigned char )c; + j->marker = (unsigned char)c; j->nomore = 1; return; } } j->code_buffer |= b << (24 - j->code_bits); j->code_bits += 8; - } while(j->code_bits <= 24); + } while (j->code_bits <= 24); } // (1 << n) - 1 -static const stbi__uint32 stbi__bmask[17] = {0, 1, 3, 7, 15, 31, 63, 127, 255, +static const stbi__uint32 stbi__bmask[17] = {0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767, 65535}; // decode a jpeg huffman value from the bitstream @@ -1751,17 +1745,17 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg* j, stbi__huffman* h) unsigned int temp; int c, k; - if(j->code_bits < 16) + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); // look at the top FAST_BITS and determine what symbol ID it is, // if the code is <= FAST_BITS c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); k = h->fast[c]; - if(k < 255) + if (k < 255) { int s = h->size[k]; - if(s > j->code_bits) + if (s > j->code_bits) return -1; j->code_buffer <<= s; j->code_bits -= s; @@ -1775,17 +1769,17 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg* j, stbi__huffman* h) // wants to be compared against something shifted to have 16; // that way we don't need to shift inside the loop. temp = j->code_buffer >> 16; - for(k = FAST_BITS + 1;; ++k) - if(temp < h->maxcode[k]) + for (k = FAST_BITS + 1;; ++k) + if (temp < h->maxcode[k]) break; - if(k == 17) + if (k == 17) { // error! code not found j->code_bits -= 16; return -1; } - if(k > j->code_bits) + if (k > j->code_bits) return -1; // convert the huffman code to the symbol id @@ -1799,7 +1793,7 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg* j, stbi__huffman* h) } // bias[n] = (-1<code_bits < n) + if (j->code_bits < n) stbi__grow_buffer_unsafe(j); - sgn = ( stbi__int32 )j->code_buffer >> 31; // sign bit is always in MSB + sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB k = stbi_lrot(j->code_buffer, n); - STBI_ASSERT(n >= 0 && n < ( int )(sizeof(stbi__bmask) / sizeof(*stbi__bmask))); + STBI_ASSERT(n >= 0 && n < (int)(sizeof(stbi__bmask) / sizeof(*stbi__bmask))); j->code_buffer = k & ~stbi__bmask[n]; k &= stbi__bmask[n]; j->code_bits -= n; @@ -1824,7 +1818,7 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg* j, int n) stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg* j, int n) { unsigned int k; - if(j->code_bits < n) + if (j->code_bits < n) stbi__grow_buffer_unsafe(j); k = stbi_lrot(j->code_buffer, n); j->code_buffer = k & ~stbi__bmask[n]; @@ -1836,7 +1830,7 @@ stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg* j, int n) stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg* j) { unsigned int k; - if(j->code_bits < 1) + if (j->code_bits < 1) stbi__grow_buffer_unsafe(j); k = j->code_buffer; j->code_buffer <<= 1; @@ -1860,10 +1854,10 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman* int diff, dc, k; int t; - if(j->code_bits < 16) + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); t = stbi__jpeg_huff_decode(j, hdc); - if(t < 0) + if (t < 0) return stbi__err("bad huffman code", "Corrupt JPEG"); // 0 all the ac values now so we can do it 32-bits at a time @@ -1872,7 +1866,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman* diff = t ? stbi__extend_receive(j, t) : 0; dc = j->img_comp[b].dc_pred + diff; j->img_comp[b].dc_pred = dc; - data[0] = ( short )(dc * dequant[0]); + data[0] = (short)(dc * dequant[0]); // decode AC components, see JPEG spec k = 1; @@ -1880,31 +1874,31 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman* { unsigned int zig; int c, r, s; - if(j->code_bits < 16) + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); r = fac[c]; - if(r) - { // fast-AC path - k += (r >> 4) & 15; // run - s = r & 15; // combined length + if (r) + { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length j->code_buffer <<= s; j->code_bits -= s; // decode into unzigzag'd location zig = stbi__jpeg_dezigzag[k++]; - data[zig] = ( short )((r >> 8) * dequant[zig]); + data[zig] = (short)((r >> 8) * dequant[zig]); } else { int rs = stbi__jpeg_huff_decode(j, hac); - if(rs < 0) + if (rs < 0) return stbi__err("bad huffman code", "Corrupt JPEG"); s = rs & 15; r = rs >> 4; - if(s == 0) + if (s == 0) { - if(rs != 0xf0) - break; // end block + if (rs != 0xf0) + break; // end block k += 16; } else @@ -1912,10 +1906,10 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman* k += r; // decode into unzigzag'd location zig = stbi__jpeg_dezigzag[k++]; - data[zig] = ( short )(stbi__extend_receive(j, s) * dequant[zig]); + data[zig] = (short)(stbi__extend_receive(j, s) * dequant[zig]); } } - } while(k < 64); + } while (k < 64); return 1; } @@ -1923,28 +1917,28 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg* j, short data[64], stbi__ { int diff, dc; int t; - if(j->spec_end != 0) + if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); - if(j->code_bits < 16) + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); - if(j->succ_high == 0) + if (j->succ_high == 0) { // first scan for DC coefficient, must be first - memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now + memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now t = stbi__jpeg_huff_decode(j, hdc); diff = t ? stbi__extend_receive(j, t) : 0; dc = j->img_comp[b].dc_pred + diff; j->img_comp[b].dc_pred = dc; - data[0] = ( short )(dc << j->succ_low); + data[0] = (short)(dc << j->succ_low); } else { // refinement scan for DC coefficient - if(stbi__jpeg_get_bit(j)) - data[0] += ( short )(1 << j->succ_low); + if (stbi__jpeg_get_bit(j)) + data[0] += (short)(1 << j->succ_low); } return 1; } @@ -1954,14 +1948,14 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg* j, short data[64], stbi__ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__huffman* hac, stbi__int16* fac) { int k; - if(j->spec_start == 0) + if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); - if(j->succ_high == 0) + if (j->succ_high == 0) { int shift = j->succ_low; - if(j->eob_run) + if (j->eob_run) { --j->eob_run; return 1; @@ -1972,32 +1966,32 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__ { unsigned int zig; int c, r, s; - if(j->code_bits < 16) + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); r = fac[c]; - if(r) - { // fast-AC path - k += (r >> 4) & 15; // run - s = r & 15; // combined length + if (r) + { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length j->code_buffer <<= s; j->code_bits -= s; zig = stbi__jpeg_dezigzag[k++]; - data[zig] = ( short )((r >> 8) << shift); + data[zig] = (short)((r >> 8) << shift); } else { int rs = stbi__jpeg_huff_decode(j, hac); - if(rs < 0) + if (rs < 0) return stbi__err("bad huffman code", "Corrupt JPEG"); s = rs & 15; r = rs >> 4; - if(s == 0) + if (s == 0) { - if(r < 15) + if (r < 15) { j->eob_run = (1 << r); - if(r) + if (r) j->eob_run += stbi__jpeg_get_bits(j, r); --j->eob_run; break; @@ -2008,28 +2002,28 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__ { k += r; zig = stbi__jpeg_dezigzag[k++]; - data[zig] = ( short )(stbi__extend_receive(j, s) << shift); + data[zig] = (short)(stbi__extend_receive(j, s) << shift); } } - } while(k <= j->spec_end); + } while (k <= j->spec_end); } else { // refinement scan for these AC coefficients - short bit = ( short )(1 << j->succ_low); + short bit = (short)(1 << j->succ_low); - if(j->eob_run) + if (j->eob_run) { --j->eob_run; - for(k = j->spec_start; k <= j->spec_end; ++k) + for (k = j->spec_start; k <= j->spec_end; ++k) { short* p = &data[stbi__jpeg_dezigzag[k]]; - if(*p != 0) - if(stbi__jpeg_get_bit(j)) - if((*p & bit) == 0) + if (*p != 0) + if (stbi__jpeg_get_bit(j)) + if ((*p & bit) == 0) { - if(*p > 0) + if (*p > 0) *p += bit; else *p -= bit; @@ -2043,19 +2037,19 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__ { int r, s; int rs = stbi__jpeg_huff_decode( - j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh - if(rs < 0) + j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh + if (rs < 0) return stbi__err("bad huffman code", "Corrupt JPEG"); s = rs & 15; r = rs >> 4; - if(s == 0) + if (s == 0) { - if(r < 15) + if (r < 15) { j->eob_run = (1 << r) - 1; - if(r) + if (r) j->eob_run += stbi__jpeg_get_bits(j, r); - r = 64; // force end of block + r = 64; // force end of block } else { @@ -2066,25 +2060,25 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__ } else { - if(s != 1) + if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG"); // sign bit - if(stbi__jpeg_get_bit(j)) + if (stbi__jpeg_get_bit(j)) s = bit; else s = -bit; } // advance by r - while(k <= j->spec_end) + while (k <= j->spec_end) { short* p = &data[stbi__jpeg_dezigzag[k++]]; - if(*p != 0) + if (*p != 0) { - if(stbi__jpeg_get_bit(j)) - if((*p & bit) == 0) + if (stbi__jpeg_get_bit(j)) + if ((*p & bit) == 0) { - if(*p > 0) + if (*p > 0) *p += bit; else *p -= bit; @@ -2092,15 +2086,15 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__ } else { - if(r == 0) + if (r == 0) { - *p = ( short )s; + *p = (short)s; break; } --r; } } - } while(k <= j->spec_end); + } while (k <= j->spec_end); } } return 1; @@ -2110,18 +2104,18 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__ stbi_inline static stbi_uc stbi__clamp(int x) { // trick to use a single test to catch both cases - if(( unsigned int )x > 255) + if ((unsigned int)x > 255) { - if(x < 0) + if (x < 0) return 0; - if(x > 255) + if (x > 255) return 255; } - return ( stbi_uc )x; + return (stbi_uc)x; } -#define stbi__f2f(x) (( int )((( x )*4096 + 0.5))) -#define stbi__fsh(x) (( x )*4096) +#define stbi__f2f(x) ((int)(((x)*4096 + 0.5))) +#define stbi__fsh(x) ((x)*4096) // derived from jidctint -- DCT_ISLOW #define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7) \ @@ -2168,10 +2162,10 @@ static void stbi__idct_block(stbi_uc* out, int out_stride, short data[64]) short* d = data; // columns - for(i = 0; i < 8; ++i, ++d, ++v) + for (i = 0; i < 8; ++i, ++d, ++v) { // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing - if(d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 && d[48] == 0 && d[56] == 0) + if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 && d[48] == 0 && d[56] == 0) { // no shortcut 0 seconds // (1|2|3|4|5|6|7)==0 0 seconds @@ -2200,7 +2194,7 @@ static void stbi__idct_block(stbi_uc* out, int out_stride, short data[64]) } } - for(i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) + for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) { // no fast case since the first 1D IDCT spread components out STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]) @@ -2330,14 +2324,14 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64]) __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17)); // load - row0 = _mm_load_si128(( const __m128i* )(data + 0 * 8)); - row1 = _mm_load_si128(( const __m128i* )(data + 1 * 8)); - row2 = _mm_load_si128(( const __m128i* )(data + 2 * 8)); - row3 = _mm_load_si128(( const __m128i* )(data + 3 * 8)); - row4 = _mm_load_si128(( const __m128i* )(data + 4 * 8)); - row5 = _mm_load_si128(( const __m128i* )(data + 5 * 8)); - row6 = _mm_load_si128(( const __m128i* )(data + 6 * 8)); - row7 = _mm_load_si128(( const __m128i* )(data + 7 * 8)); + row0 = _mm_load_si128((const __m128i*)(data + 0 * 8)); + row1 = _mm_load_si128((const __m128i*)(data + 1 * 8)); + row2 = _mm_load_si128((const __m128i*)(data + 2 * 8)); + row3 = _mm_load_si128((const __m128i*)(data + 3 * 8)); + row4 = _mm_load_si128((const __m128i*)(data + 4 * 8)); + row5 = _mm_load_si128((const __m128i*)(data + 5 * 8)); + row6 = _mm_load_si128((const __m128i*)(data + 6 * 8)); + row7 = _mm_load_si128((const __m128i*)(data + 7 * 8)); // column pass dct_pass(bias_0, 10); @@ -2367,39 +2361,39 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64]) { // pack - __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 + __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 __m128i p1 = _mm_packus_epi16(row2, row3); __m128i p2 = _mm_packus_epi16(row4, row5); __m128i p3 = _mm_packus_epi16(row6, row7); // 8bit 8x8 transpose pass 1 - dct_interleave8(p0, p2); // a0e0a1e1... - dct_interleave8(p1, p3); // c0g0c1g1... + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... // transpose pass 2 - dct_interleave8(p0, p1); // a0c0e0g0... - dct_interleave8(p2, p3); // b0d0f0h0... + dct_interleave8(p0, p1); // a0c0e0g0... + dct_interleave8(p2, p3); // b0d0f0h0... // transpose pass 3 - dct_interleave8(p0, p2); // a0b0c0d0... - dct_interleave8(p1, p3); // a4b4c4d4... + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... // store - _mm_storel_epi64(( __m128i* )out, p0); + _mm_storel_epi64((__m128i*)out, p0); out += out_stride; - _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p0, 0x4e)); + _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride; - _mm_storel_epi64(( __m128i* )out, p2); + _mm_storel_epi64((__m128i*)out, p2); out += out_stride; - _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p2, 0x4e)); + _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride; - _mm_storel_epi64(( __m128i* )out, p1); + _mm_storel_epi64((__m128i*)out, p1); out += out_stride; - _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p1, 0x4e)); + _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride; - _mm_storel_epi64(( __m128i* )out, p3); + _mm_storel_epi64((__m128i*)out, p3); out += out_stride; - _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p3, 0x4e)); + _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p3, 0x4e)); } #undef dct_const @@ -2413,7 +2407,7 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64]) #undef dct_pass } -#endif // STBI_SSE2 +#endif // STBI_SSE2 #ifdef STBI_NEON @@ -2548,19 +2542,19 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64]) } // pass 1 - dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 + dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 dct_trn16(row2, row3); dct_trn16(row4, row5); dct_trn16(row6, row7); // pass 2 - dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 + dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 dct_trn32(row1, row3); dct_trn32(row4, row6); dct_trn32(row5, row7); // pass 3 - dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 + dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 dct_trn64(row1, row5); dct_trn64(row2, row6); dct_trn64(row3, row7); @@ -2659,7 +2653,7 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64]) #undef dct_pass } -#endif // STBI_NEON +#endif // STBI_NEON #define STBI__MARKER_none 0xff // if there's a pending marker from the entropy stream, return that @@ -2668,17 +2662,17 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64]) static stbi_uc stbi__get_marker(stbi__jpeg* j) { stbi_uc x; - if(j->marker != STBI__MARKER_none) + if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; } x = stbi__get8(j->s); - if(x != 0xff) + if (x != 0xff) return STBI__MARKER_none; - while(x == 0xff) - x = stbi__get8(j->s); // consume repeated 0xff fill bytes + while (x == 0xff) + x = stbi__get8(j->s); // consume repeated 0xff fill bytes return x; } @@ -2704,9 +2698,9 @@ static void stbi__jpeg_reset(stbi__jpeg* j) static int stbi__parse_entropy_coded_data(stbi__jpeg* z) { stbi__jpeg_reset(z); - if(!z->progressive) + if (!z->progressive) { - if(z->scan_n == 1) + if (z->scan_n == 1) { int i, j; STBI_SIMD_ALIGN(short, data[64]); @@ -2717,24 +2711,24 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) // component has, independent of interleaved MCU blocking and such int w = (z->img_comp[n].x + 7) >> 3; int h = (z->img_comp[n].y + 7) >> 3; - for(j = 0; j < h; ++j) + for (j = 0; j < h; ++j) { - for(i = 0; i < w; ++i) + for (i = 0; i < w; ++i) { int ha = z->img_comp[n].ha; - if(!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, - z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) + if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, + z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data); // every data block is an MCU, so countdown the restart interval - if(--z->todo <= 0) + if (--z->todo <= 0) { - if(z->code_bits < 24) + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); // if it's NOT a restart, then just bail, so we get corrupt data // rather than no data - if(!STBI__RESTART(z->marker)) + if (!STBI__RESTART(z->marker)) return 1; stbi__jpeg_reset(z); } @@ -2743,28 +2737,28 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) return 1; } else - { // interleaved + { // interleaved int i, j, k, x, y; STBI_SIMD_ALIGN(short, data[64]); - for(j = 0; j < z->img_mcu_y; ++j) + for (j = 0; j < z->img_mcu_y; ++j) { - for(i = 0; i < z->img_mcu_x; ++i) + for (i = 0; i < z->img_mcu_x; ++i) { // scan an interleaved mcu... process scan_n components in order - for(k = 0; k < z->scan_n; ++k) + for (k = 0; k < z->scan_n; ++k) { int n = z->order[k]; // scan out an mcu's worth of this component; that's just determined // by the basic H and V specified for the component - for(y = 0; y < z->img_comp[n].v; ++y) + for (y = 0; y < z->img_comp[n].v; ++y) { - for(x = 0; x < z->img_comp[n].h; ++x) + for (x = 0; x < z->img_comp[n].h; ++x) { int x2 = (i * z->img_comp[n].h + x) * 8; int y2 = (j * z->img_comp[n].v + y) * 8; int ha = z->img_comp[n].ha; - if(!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, - z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) + if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, + z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2, z->img_comp[n].w2, data); @@ -2773,11 +2767,11 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) } // after all interleaved components, that's an interleaved MCU, // so now count down the restart interval - if(--z->todo <= 0) + if (--z->todo <= 0) { - if(z->code_bits < 24) + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); - if(!STBI__RESTART(z->marker)) + if (!STBI__RESTART(z->marker)) return 1; stbi__jpeg_reset(z); } @@ -2788,7 +2782,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) } else { - if(z->scan_n == 1) + if (z->scan_n == 1) { int i, j; int n = z->order[0]; @@ -2798,28 +2792,28 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) // component has, independent of interleaved MCU blocking and such int w = (z->img_comp[n].x + 7) >> 3; int h = (z->img_comp[n].y + 7) >> 3; - for(j = 0; j < h; ++j) + for (j = 0; j < h; ++j) { - for(i = 0; i < w; ++i) + for (i = 0; i < w; ++i) { short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); - if(z->spec_start == 0) + if (z->spec_start == 0) { - if(!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) return 0; } else { int ha = z->img_comp[n].ha; - if(!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) + if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) return 0; } // every data block is an MCU, so countdown the restart interval - if(--z->todo <= 0) + if (--z->todo <= 0) { - if(z->code_bits < 24) + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); - if(!STBI__RESTART(z->marker)) + if (!STBI__RESTART(z->marker)) return 1; stbi__jpeg_reset(z); } @@ -2828,37 +2822,37 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) return 1; } else - { // interleaved + { // interleaved int i, j, k, x, y; - for(j = 0; j < z->img_mcu_y; ++j) + for (j = 0; j < z->img_mcu_y; ++j) { - for(i = 0; i < z->img_mcu_x; ++i) + for (i = 0; i < z->img_mcu_x; ++i) { // scan an interleaved mcu... process scan_n components in order - for(k = 0; k < z->scan_n; ++k) + for (k = 0; k < z->scan_n; ++k) { int n = z->order[k]; // scan out an mcu's worth of this component; that's just determined // by the basic H and V specified for the component - for(y = 0; y < z->img_comp[n].v; ++y) + for (y = 0; y < z->img_comp[n].v; ++y) { - for(x = 0; x < z->img_comp[n].h; ++x) + for (x = 0; x < z->img_comp[n].h; ++x) { int x2 = (i * z->img_comp[n].h + x); int y2 = (j * z->img_comp[n].v + y); short* data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); - if(!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) return 0; } } } // after all interleaved components, that's an interleaved MCU, // so now count down the restart interval - if(--z->todo <= 0) + if (--z->todo <= 0) { - if(z->code_bits < 24) + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); - if(!STBI__RESTART(z->marker)) + if (!STBI__RESTART(z->marker)) return 1; stbi__jpeg_reset(z); } @@ -2872,23 +2866,23 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z) static void stbi__jpeg_dequantize(short* data, stbi__uint16* dequant) { int i; - for(i = 0; i < 64; ++i) + for (i = 0; i < 64; ++i) data[i] *= dequant[i]; } static void stbi__jpeg_finish(stbi__jpeg* z) { - if(z->progressive) + if (z->progressive) { // dequantize and idct the data int i, j, n; - for(n = 0; n < z->s->img_n; ++n) + for (n = 0; n < z->s->img_n; ++n) { int w = (z->img_comp[n].x + 7) >> 3; int h = (z->img_comp[n].y + 7) >> 3; - for(j = 0; j < h; ++j) + for (j = 0; j < h; ++j) { - for(i = 0; i < w; ++i) + for (i = 0; i < w; ++i) { short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); @@ -2903,114 +2897,113 @@ static void stbi__jpeg_finish(stbi__jpeg* z) static int stbi__process_marker(stbi__jpeg* z, int m) { int L; - switch(m) + switch (m) { - case STBI__MARKER_none: // no marker found - return stbi__err("expected marker", "Corrupt JPEG"); + case STBI__MARKER_none: // no marker found + return stbi__err("expected marker", "Corrupt JPEG"); - case 0xDD: // DRI - specify restart interval - if(stbi__get16be(z->s) != 4) - return stbi__err("bad DRI len", "Corrupt JPEG"); - z->restart_interval = stbi__get16be(z->s); - return 1; + case 0xDD: // DRI - specify restart interval + if (stbi__get16be(z->s) != 4) + return stbi__err("bad DRI len", "Corrupt JPEG"); + z->restart_interval = stbi__get16be(z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = stbi__get16be(z->s) - 2; + while (L > 0) + { + int q = stbi__get8(z->s); + int p = q >> 4, sixteen = (p != 0); + int t = q & 15, i; + if (p != 0 && p != 1) + return stbi__err("bad DQT type", "Corrupt JPEG"); + if (t > 3) + return stbi__err("bad DQT table", "Corrupt JPEG"); + + for (i = 0; i < 64; ++i) + z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); + L -= (sixteen ? 129 : 65); + } + return L == 0; - case 0xDB: // DQT - define quantization table - L = stbi__get16be(z->s) - 2; - while(L > 0) + case 0xC4: // DHT - define huffman table + L = stbi__get16be(z->s) - 2; + while (L > 0) + { + stbi_uc* v; + int sizes[16], i, n = 0; + int q = stbi__get8(z->s); + int tc = q >> 4; + int th = q & 15; + if (tc > 1 || th > 3) + return stbi__err("bad DHT header", "Corrupt JPEG"); + for (i = 0; i < 16; ++i) { - int q = stbi__get8(z->s); - int p = q >> 4, sixteen = (p != 0); - int t = q & 15, i; - if(p != 0 && p != 1) - return stbi__err("bad DQT type", "Corrupt JPEG"); - if(t > 3) - return stbi__err("bad DQT table", "Corrupt JPEG"); - - for(i = 0; i < 64; ++i) - z->dequant[t][stbi__jpeg_dezigzag[i]] = - (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); - L -= (sixteen ? 129 : 65); + sizes[i] = stbi__get8(z->s); + n += sizes[i]; } - return L == 0; - - case 0xC4: // DHT - define huffman table - L = stbi__get16be(z->s) - 2; - while(L > 0) + L -= 17; + if (tc == 0) { - stbi_uc* v; - int sizes[16], i, n = 0; - int q = stbi__get8(z->s); - int tc = q >> 4; - int th = q & 15; - if(tc > 1 || th > 3) - return stbi__err("bad DHT header", "Corrupt JPEG"); - for(i = 0; i < 16; ++i) - { - sizes[i] = stbi__get8(z->s); - n += sizes[i]; - } - L -= 17; - if(tc == 0) - { - if(!stbi__build_huffman(z->huff_dc + th, sizes)) - return 0; - v = z->huff_dc[th].values; - } - else - { - if(!stbi__build_huffman(z->huff_ac + th, sizes)) - return 0; - v = z->huff_ac[th].values; - } - for(i = 0; i < n; ++i) - v[i] = stbi__get8(z->s); - if(tc != 0) - stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); - L -= n; + if (!stbi__build_huffman(z->huff_dc + th, sizes)) + return 0; + v = z->huff_dc[th].values; } - return L == 0; + else + { + if (!stbi__build_huffman(z->huff_ac + th, sizes)) + return 0; + v = z->huff_ac[th].values; + } + for (i = 0; i < n; ++i) + v[i] = stbi__get8(z->s); + if (tc != 0) + stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); + L -= n; + } + return L == 0; } // check for comment block or APP blocks - if((m >= 0xE0 && m <= 0xEF) || m == 0xFE) + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { L = stbi__get16be(z->s); - if(L < 2) + if (L < 2) { - if(m == 0xFE) + if (m == 0xFE) return stbi__err("bad COM len", "Corrupt JPEG"); else return stbi__err("bad APP len", "Corrupt JPEG"); } L -= 2; - if(m == 0xE0 && L >= 5) - { // JFIF APP0 segment + if (m == 0xE0 && L >= 5) + { // JFIF APP0 segment static const unsigned char tag[5] = {'J', 'F', 'I', 'F', '\0'}; int ok = 1; int i; - for(i = 0; i < 5; ++i) - if(stbi__get8(z->s) != tag[i]) + for (i = 0; i < 5; ++i) + if (stbi__get8(z->s) != tag[i]) ok = 0; L -= 5; - if(ok) + if (ok) z->jfif = 1; } - else if(m == 0xEE && L >= 12) - { // Adobe APP14 segment + else if (m == 0xEE && L >= 12) + { // Adobe APP14 segment static const unsigned char tag[6] = {'A', 'd', 'o', 'b', 'e', '\0'}; int ok = 1; int i; - for(i = 0; i < 6; ++i) - if(stbi__get8(z->s) != tag[i]) + for (i = 0; i < 6; ++i) + if (stbi__get8(z->s) != tag[i]) ok = 0; L -= 6; - if(ok) + if (ok) { - stbi__get8(z->s); // version - stbi__get16be(z->s); // flags0 - stbi__get16be(z->s); // flags1 - z->app14_color_transform = stbi__get8(z->s); // color transform + stbi__get8(z->s); // version + stbi__get16be(z->s); // flags0 + stbi__get16be(z->s); // flags1 + z->app14_color_transform = stbi__get8(z->s); // color transform L -= 6; } } @@ -3028,24 +3021,24 @@ static int stbi__process_scan_header(stbi__jpeg* z) int i; int Ls = stbi__get16be(z->s); z->scan_n = stbi__get8(z->s); - if(z->scan_n < 1 || z->scan_n > 4 || z->scan_n > ( int )z->s->img_n) + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n) return stbi__err("bad SOS component count", "Corrupt JPEG"); - if(Ls != 6 + 2 * z->scan_n) + if (Ls != 6 + 2 * z->scan_n) return stbi__err("bad SOS len", "Corrupt JPEG"); - for(i = 0; i < z->scan_n; ++i) + for (i = 0; i < z->scan_n; ++i) { int id = stbi__get8(z->s), which; int q = stbi__get8(z->s); - for(which = 0; which < z->s->img_n; ++which) - if(z->img_comp[which].id == id) + for (which = 0; which < z->s->img_n; ++which) + if (z->img_comp[which].id == id) break; - if(which == z->s->img_n) - return 0; // no match + if (which == z->s->img_n) + return 0; // no match z->img_comp[which].hd = q >> 4; - if(z->img_comp[which].hd > 3) + if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff", "Corrupt JPEG"); z->img_comp[which].ha = q & 15; - if(z->img_comp[which].ha > 3) + if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff", "Corrupt JPEG"); z->order[i] = which; } @@ -3053,21 +3046,20 @@ static int stbi__process_scan_header(stbi__jpeg* z) { int aa; z->spec_start = stbi__get8(z->s); - z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 + z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 aa = stbi__get8(z->s); z->succ_high = (aa >> 4); z->succ_low = (aa & 15); - if(z->progressive) + if (z->progressive) { - if(z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || - z->succ_low > 13) + if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) return stbi__err("bad SOS", "Corrupt JPEG"); } else { - if(z->spec_start != 0) + if (z->spec_start != 0) return stbi__err("bad SOS", "Corrupt JPEG"); - if(z->succ_high != 0 || z->succ_low != 0) + if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS", "Corrupt JPEG"); z->spec_end = 63; } @@ -3079,21 +3071,21 @@ static int stbi__process_scan_header(stbi__jpeg* z) static int stbi__free_jpeg_components(stbi__jpeg* z, int ncomp, int why) { int i; - for(i = 0; i < ncomp; ++i) + for (i = 0; i < ncomp; ++i) { - if(z->img_comp[i].raw_data) + if (z->img_comp[i].raw_data) { STBI_FREE(z->img_comp[i].raw_data); z->img_comp[i].raw_data = NULL; z->img_comp[i].data = NULL; } - if(z->img_comp[i].raw_coeff) + if (z->img_comp[i].raw_coeff) { STBI_FREE(z->img_comp[i].raw_coeff); z->img_comp[i].raw_coeff = 0; z->img_comp[i].coeff = 0; } - if(z->img_comp[i].linebuf) + if (z->img_comp[i].linebuf) { STBI_FREE(z->img_comp[i].linebuf); z->img_comp[i].linebuf = NULL; @@ -3107,62 +3099,62 @@ static int stbi__process_frame_header(stbi__jpeg* z, int scan) stbi__context* s = z->s; int Lf, p, i, q, h_max = 1, v_max = 1, c; Lf = stbi__get16be(s); - if(Lf < 11) - return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG + if (Lf < 11) + return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG p = stbi__get8(s); - if(p != 8) - return stbi__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline + if (p != 8) + return stbi__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline s->img_y = stbi__get16be(s); - if(s->img_y == 0) + if (s->img_y == 0) return stbi__err( "no header height", - "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG s->img_x = stbi__get16be(s); - if(s->img_x == 0) - return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires + if (s->img_x == 0) + return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires c = stbi__get8(s); - if(c != 3 && c != 1 && c != 4) + if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count", "Corrupt JPEG"); s->img_n = c; - for(i = 0; i < c; ++i) + for (i = 0; i < c; ++i) { z->img_comp[i].data = NULL; z->img_comp[i].linebuf = NULL; } - if(Lf != 8 + 3 * s->img_n) + if (Lf != 8 + 3 * s->img_n) return stbi__err("bad SOF len", "Corrupt JPEG"); z->rgb = 0; - for(i = 0; i < s->img_n; ++i) + for (i = 0; i < s->img_n; ++i) { static const unsigned char rgb[3] = {'R', 'G', 'B'}; z->img_comp[i].id = stbi__get8(s); - if(s->img_n == 3 && z->img_comp[i].id == rgb[i]) + if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) ++z->rgb; q = stbi__get8(s); z->img_comp[i].h = (q >> 4); - if(!z->img_comp[i].h || z->img_comp[i].h > 4) + if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H", "Corrupt JPEG"); z->img_comp[i].v = q & 15; - if(!z->img_comp[i].v || z->img_comp[i].v > 4) + if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V", "Corrupt JPEG"); z->img_comp[i].tq = stbi__get8(s); - if(z->img_comp[i].tq > 3) + if (z->img_comp[i].tq > 3) return stbi__err("bad TQ", "Corrupt JPEG"); } - if(scan != STBI__SCAN_load) + if (scan != STBI__SCAN_load) return 1; - if(!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) + if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode"); - for(i = 0; i < s->img_n; ++i) + for (i = 0; i < s->img_n; ++i) { - if(z->img_comp[i].h > h_max) + if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h; - if(z->img_comp[i].v > v_max) + if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; } @@ -3175,7 +3167,7 @@ static int stbi__process_frame_header(stbi__jpeg* z, int scan) z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w; z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h; - for(i = 0; i < s->img_n; ++i) + for (i = 0; i < s->img_n; ++i) { // number of effective pixels (e.g. for non-interleaved MCU) z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max; @@ -3193,19 +3185,19 @@ static int stbi__process_frame_header(stbi__jpeg* z, int scan) z->img_comp[i].raw_coeff = 0; z->img_comp[i].linebuf = NULL; z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); - if(z->img_comp[i].raw_data == NULL) + if (z->img_comp[i].raw_data == NULL) return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory")); // align blocks for idct using mmx/sse - z->img_comp[i].data = ( stbi_uc* )((( size_t )z->img_comp[i].raw_data + 15) & ~15); - if(z->progressive) + z->img_comp[i].data = (stbi_uc*)(((size_t)z->img_comp[i].raw_data + 15) & ~15); + if (z->progressive) { // w2, h2 are multiples of 8 (see above) z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); - if(z->img_comp[i].raw_coeff == NULL) + if (z->img_comp[i].raw_coeff == NULL) return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory")); - z->img_comp[i].coeff = ( short* )((( size_t )z->img_comp[i].raw_coeff + 15) & ~15); + z->img_comp[i].coeff = (short*)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15); } } @@ -3225,29 +3217,29 @@ static int stbi__decode_jpeg_header(stbi__jpeg* z, int scan) { int m; z->jfif = 0; - z->app14_color_transform = -1; // valid values are 0,1,2 - z->marker = STBI__MARKER_none; // initialize cached marker to empty + z->app14_color_transform = -1; // valid values are 0,1,2 + z->marker = STBI__MARKER_none; // initialize cached marker to empty m = stbi__get_marker(z); - if(!stbi__SOI(m)) + if (!stbi__SOI(m)) return stbi__err("no SOI", "Corrupt JPEG"); - if(scan == STBI__SCAN_type) + if (scan == STBI__SCAN_type) return 1; m = stbi__get_marker(z); - while(!stbi__SOF(m)) + while (!stbi__SOF(m)) { - if(!stbi__process_marker(z, m)) + if (!stbi__process_marker(z, m)) return 0; m = stbi__get_marker(z); - while(m == STBI__MARKER_none) + while (m == STBI__MARKER_none) { // some files have extra padding after their blocks, so ok, we'll scan - if(stbi__at_eof(z->s)) + if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG"); m = stbi__get_marker(z); } } z->progressive = stbi__SOF_progressive(m); - if(!stbi__process_frame_header(z, scan)) + if (!stbi__process_frame_header(z, scan)) return 0; return 1; } @@ -3256,30 +3248,30 @@ static int stbi__decode_jpeg_header(stbi__jpeg* z, int scan) static int stbi__decode_jpeg_image(stbi__jpeg* j) { int m; - for(m = 0; m < 4; m++) + for (m = 0; m < 4; m++) { j->img_comp[m].raw_data = NULL; j->img_comp[m].raw_coeff = NULL; } j->restart_interval = 0; - if(!stbi__decode_jpeg_header(j, STBI__SCAN_load)) + if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0; m = stbi__get_marker(j); - while(!stbi__EOI(m)) + while (!stbi__EOI(m)) { - if(stbi__SOS(m)) + if (stbi__SOS(m)) { - if(!stbi__process_scan_header(j)) + if (!stbi__process_scan_header(j)) return 0; - if(!stbi__parse_entropy_coded_data(j)) + if (!stbi__parse_entropy_coded_data(j)) return 0; - if(j->marker == STBI__MARKER_none) + if (j->marker == STBI__MARKER_none) { // handle 0s at the end of image data from IP Kamera 9060 - while(!stbi__at_eof(j->s)) + while (!stbi__at_eof(j->s)) { int x = stbi__get8(j->s); - if(x == 255) + if (x == 255) { j->marker = stbi__get8(j->s); break; @@ -3289,23 +3281,23 @@ static int stbi__decode_jpeg_image(stbi__jpeg* j) // return 0 } } - else if(stbi__DNL(m)) + else if (stbi__DNL(m)) { int Ld = stbi__get16be(j->s); stbi__uint32 NL = stbi__get16be(j->s); - if(Ld != 4) + if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG"); - if(NL != j->s->img_y) + if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG"); } else { - if(!stbi__process_marker(j, m)) + if (!stbi__process_marker(j, m)) return 0; } m = stbi__get_marker(j); } - if(j->progressive) + if (j->progressive) stbi__jpeg_finish(j); return 1; } @@ -3330,7 +3322,7 @@ static stbi_uc* stbi__resample_row_v_2(stbi_uc* out, stbi_uc* in_near, stbi_uc* // need to generate two samples vertically for every one in input int i; STBI_NOTUSED(hs); - for(i = 0; i < w; ++i) + for (i = 0; i < w; ++i) out[i] = stbi__div4(3 * in_near[i] + in_far[i] + 2); return out; } @@ -3341,7 +3333,7 @@ static stbi_uc* stbi__resample_row_h_2(stbi_uc* out, stbi_uc* in_near, stbi_uc* int i; stbi_uc* input = in_near; - if(w == 1) + if (w == 1) { // if only one sample, can't do any interpolation out[0] = out[1] = input[0]; @@ -3350,7 +3342,7 @@ static stbi_uc* stbi__resample_row_h_2(stbi_uc* out, stbi_uc* in_near, stbi_uc* out[0] = input[0]; out[1] = stbi__div4(input[0] * 3 + input[1] + 2); - for(i = 1; i < w - 1; ++i) + for (i = 1; i < w - 1; ++i) { int n = 3 * input[i] + 2; out[i * 2 + 0] = stbi__div4(n + input[i - 1]); @@ -3371,7 +3363,7 @@ static stbi_uc* stbi__resample_row_hv_2(stbi_uc* out, stbi_uc* in_near, stbi_uc* { // need to generate 2x2 samples for every one in input int i, t0, t1; - if(w == 1) + if (w == 1) { out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2); return out; @@ -3379,7 +3371,7 @@ static stbi_uc* stbi__resample_row_hv_2(stbi_uc* out, stbi_uc* in_near, stbi_uc* t1 = 3 * in_near[0] + in_far[0]; out[0] = stbi__div4(t1 + 2); - for(i = 1; i < w; ++i) + for (i = 1; i < w; ++i) { t0 = t1; t1 = 3 * in_near[i] + in_far[i]; @@ -3399,7 +3391,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb // need to generate 2x2 samples for every one in input int i = 0, t0, t1; - if(w == 1) + if (w == 1) { out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2); return out; @@ -3409,19 +3401,19 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb // process groups of 8 pixels for as long as we can. // note we can't handle the last pixel in a row in this loop // because we need to handle the filter boundary conditions. - for(; i < ((w - 1) & ~7); i += 8) + for (; i < ((w - 1) & ~7); i += 8) { #if defined(STBI_SSE2) // load and perform the vertical filtering pass // this uses 3*x + y = 4*x + (y - x) __m128i zero = _mm_setzero_si128(); - __m128i farb = _mm_loadl_epi64(( __m128i* )(in_far + i)); - __m128i nearb = _mm_loadl_epi64(( __m128i* )(in_near + i)); + __m128i farb = _mm_loadl_epi64((__m128i*)(in_far + i)); + __m128i nearb = _mm_loadl_epi64((__m128i*)(in_near + i)); __m128i farw = _mm_unpacklo_epi8(farb, zero); __m128i nearw = _mm_unpacklo_epi8(nearb, zero); __m128i diff = _mm_sub_epi16(farw, nearw); __m128i nears = _mm_slli_epi16(nearw, 2); - __m128i curr = _mm_add_epi16(nears, diff); // current row + __m128i curr = _mm_add_epi16(nears, diff); // current row // horizontal filter works the same based on shifted vers of current // row. "prev" is current row shifted right by 1 pixel; we need to @@ -3453,7 +3445,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb // pack and write output __m128i outv = _mm_packus_epi16(de0, de1); - _mm_storeu_si128(( __m128i* )(out + i * 2), outv); + _mm_storeu_si128((__m128i*)(out + i * 2), outv); #elif defined(STBI_NEON) // load and perform the vertical filtering pass // this uses 3*x + y = 4*x + (y - x) @@ -3461,7 +3453,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb uint8x8_t nearb = vld1_u8(in_near + i); int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); - int16x8_t curr = vaddq_s16(nears, diff); // current row + int16x8_t curr = vaddq_s16(nears, diff); // current row // horizontal filter works the same based on shifted vers of current // row. "prev" is current row shifted right by 1 pixel; we need to @@ -3498,7 +3490,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb t1 = 3 * in_near[i] + in_far[i]; out[i * 2] = stbi__div16(3 * t1 + t0 + 8); - for(++i; i < w; ++i) + for (++i; i < w; ++i) { t0 = t1; t1 = 3 * in_near[i] + in_far[i]; @@ -3518,22 +3510,22 @@ static stbi_uc* stbi__resample_row_generic(stbi_uc* out, stbi_uc* in_near, stbi_ // resample with nearest-neighbor int i, j; STBI_NOTUSED(in_far); - for(i = 0; i < w; ++i) - for(j = 0; j < hs; ++j) + for (i = 0; i < w; ++i) + for (j = 0; j < hs; ++j) out[i * hs + j] = in_near[i]; return out; } // this is a reduced-precision calculation of YCbCr-to-RGB introduced // to make sure the code produces the same results in both SIMD and scalar -#define stbi__float2fixed(x) ((( int )(( x )*4096.0f + 0.5f)) << 8) +#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8) static void stbi__YCbCr_to_RGB_row(stbi_uc* out, const stbi_uc* y, const stbi_uc* pcb, const stbi_uc* pcr, int count, int step) { int i; - for(i = 0; i < count; ++i) + for (i = 0; i < count; ++i) { - int y_fixed = (y[i] << 20) + (1 << 19); // rounding + int y_fixed = (y[i] << 20) + (1 << 19); // rounding int r, g, b; int cr = pcr[i] - 128; int cb = pcb[i] - 128; @@ -3543,30 +3535,30 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc* out, const stbi_uc* y, const stbi_uc r >>= 20; g >>= 20; b >>= 20; - if(( unsigned )r > 255) + if ((unsigned)r > 255) { - if(r < 0) + if (r < 0) r = 0; else r = 255; } - if(( unsigned )g > 255) + if ((unsigned)g > 255) { - if(g < 0) + if (g < 0) g = 0; else g = 255; } - if(( unsigned )b > 255) + if ((unsigned)b > 255) { - if(b < 0) + if (b < 0) b = 0; else b = 255; } - out[0] = ( stbi_uc )r; - out[1] = ( stbi_uc )g; - out[2] = ( stbi_uc )b; + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; out[3] = 255; out += step; } @@ -3582,25 +3574,25 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons // step == 3 is pretty ugly on the final interleave, and i'm not convinced // it's useful in practice (you wouldn't use it for textures, for example). // so just accelerate step == 4 case. - if(step == 4) + if (step == 4) { // this is a fairly straightforward implementation and not super-optimized. __m128i signflip = _mm_set1_epi8(-0x80); - __m128i cr_const0 = _mm_set1_epi16(( short )(1.40200f * 4096.0f + 0.5f)); - __m128i cr_const1 = _mm_set1_epi16(-( short )(0.71414f * 4096.0f + 0.5f)); - __m128i cb_const0 = _mm_set1_epi16(-( short )(0.34414f * 4096.0f + 0.5f)); - __m128i cb_const1 = _mm_set1_epi16(( short )(1.77200f * 4096.0f + 0.5f)); - __m128i y_bias = _mm_set1_epi8(( char )( unsigned char )128); - __m128i xw = _mm_set1_epi16(255); // alpha channel - - for(; i + 7 < count; i += 8) + __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f)); + __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f)); + __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f)); + __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f)); + __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128); + __m128i xw = _mm_set1_epi16(255); // alpha channel + + for (; i + 7 < count; i += 8) { // load - __m128i y_bytes = _mm_loadl_epi64(( __m128i* )(y + i)); - __m128i cr_bytes = _mm_loadl_epi64(( __m128i* )(pcr + i)); - __m128i cb_bytes = _mm_loadl_epi64(( __m128i* )(pcb + i)); - __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 - __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 + __m128i y_bytes = _mm_loadl_epi64((__m128i*)(y + i)); + __m128i cr_bytes = _mm_loadl_epi64((__m128i*)(pcr + i)); + __m128i cb_bytes = _mm_loadl_epi64((__m128i*)(pcb + i)); + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 // unpack to short (and left-shift cr, cb by 8) __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); @@ -3634,8 +3626,8 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons __m128i o1 = _mm_unpackhi_epi16(t0, t1); // store - _mm_storeu_si128(( __m128i* )(out + 0), o0); - _mm_storeu_si128(( __m128i* )(out + 16), o1); + _mm_storeu_si128((__m128i*)(out + 0), o0); + _mm_storeu_si128((__m128i*)(out + 16), o1); out += 32; } } @@ -3643,16 +3635,16 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons #ifdef STBI_NEON // in this version, step=3 support would be easy to add. but is there demand? - if(step == 4) + if (step == 4) { // this is a fairly straightforward implementation and not super-optimized. uint8x8_t signflip = vdup_n_u8(0x80); - int16x8_t cr_const0 = vdupq_n_s16(( short )(1.40200f * 4096.0f + 0.5f)); - int16x8_t cr_const1 = vdupq_n_s16(-( short )(0.71414f * 4096.0f + 0.5f)); - int16x8_t cb_const0 = vdupq_n_s16(-( short )(0.34414f * 4096.0f + 0.5f)); - int16x8_t cb_const1 = vdupq_n_s16(( short )(1.77200f * 4096.0f + 0.5f)); + int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f)); + int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f)); + int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f)); + int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f)); - for(; i + 7 < count; i += 8) + for (; i + 7 < count; i += 8) { // load uint8x8_t y_bytes = vld1_u8(y + i); @@ -3689,9 +3681,9 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons } #endif - for(; i < count; ++i) + for (; i < count; ++i) { - int y_fixed = (y[i] << 20) + (1 << 19); // rounding + int y_fixed = (y[i] << 20) + (1 << 19); // rounding int r, g, b; int cr = pcr[i] - 128; int cb = pcb[i] - 128; @@ -3701,30 +3693,30 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons r >>= 20; g >>= 20; b >>= 20; - if(( unsigned )r > 255) + if ((unsigned)r > 255) { - if(r < 0) + if (r < 0) r = 0; else r = 255; } - if(( unsigned )g > 255) + if ((unsigned)g > 255) { - if(g < 0) + if (g < 0) g = 0; else g = 255; } - if(( unsigned )b > 255) + if ((unsigned)b > 255) { - if(b < 0) + if (b < 0) b = 0; else b = 255; } - out[0] = ( stbi_uc )r; - out[1] = ( stbi_uc )g; - out[2] = ( stbi_uc )b; + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; out[3] = 255; out += step; } @@ -3739,7 +3731,7 @@ static void stbi__setup_jpeg(stbi__jpeg* j) j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; #ifdef STBI_SSE2 - if(stbi__sse2_available()) + if (stbi__sse2_available()) { j->idct_block_kernel = stbi__idct_simd; j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; @@ -3764,9 +3756,9 @@ typedef struct { resample_row_func resample; stbi_uc *line0, *line1; - int hs, vs; // expansion factor in each axis - int w_lores; // horizontal pixels pre-expansion - int ystep; // how far through vertical expansion we are + int hs, vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int ystep; // how far through vertical expansion we are int ypos; // which pre-expansion row we're on } stbi__resample; @@ -3780,25 +3772,26 @@ static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp, int req_comp) { int n, decode_n, is_rgb; - z->s->img_n = 0; // make stbi__cleanup_jpeg safe + z->s->img_n = 0; // make stbi__cleanup_jpeg safe // validate req_comp - if(req_comp < 0 || req_comp > 4) + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); // load a jpeg image from whichever source, but leave in YCbCr format - if(!stbi__decode_jpeg_image(z)) + if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; } // determine actual number of components to generate - n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1; + n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 + : 1; is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); - if(z->s->img_n == 3 && n < 3 && !is_rgb) + if (z->s->img_n == 3 && n < 3 && !is_rgb) decode_n = 1; else decode_n = z->s->img_n; @@ -3812,14 +3805,14 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp stbi__resample res_comp[4]; - for(k = 0; k < decode_n; ++k) + for (k = 0; k < decode_n; ++k) { stbi__resample* r = &res_comp[k]; // allocate line buffer big enough for upsampling off the edges // with upsample factor of 4 - z->img_comp[k].linebuf = ( stbi_uc* )stbi__malloc(z->s->img_x + 3); - if(!z->img_comp[k].linebuf) + z->img_comp[k].linebuf = (stbi_uc*)stbi__malloc(z->s->img_x + 3); + if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); @@ -3832,52 +3825,52 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp r->ypos = 0; r->line0 = r->line1 = z->img_comp[k].data; - if(r->hs == 1 && r->vs == 1) + if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1; - else if(r->hs == 1 && r->vs == 2) + else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2; - else if(r->hs == 2 && r->vs == 1) + else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2; - else if(r->hs == 2 && r->vs == 2) + else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel; else r->resample = stbi__resample_row_generic; } // can't error after this so, this is safe - output = ( stbi_uc* )stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); - if(!output) + output = (stbi_uc*)stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); + if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } // now go ahead and resample - for(j = 0; j < z->s->img_y; ++j) + for (j = 0; j < z->s->img_y; ++j) { stbi_uc* out = output + n * z->s->img_x * j; - for(k = 0; k < decode_n; ++k) + for (k = 0; k < decode_n; ++k) { stbi__resample* r = &res_comp[k]; int y_bot = r->ystep >= (r->vs >> 1); coutput[k] = r->resample(z->img_comp[k].linebuf, y_bot ? r->line1 : r->line0, y_bot ? r->line0 : r->line1, r->w_lores, r->hs); - if(++r->ystep >= r->vs) + if (++r->ystep >= r->vs) { r->ystep = 0; r->line0 = r->line1; - if(++r->ypos < z->img_comp[k].y) + if (++r->ypos < z->img_comp[k].y) r->line1 += z->img_comp[k].w2; } } - if(n >= 3) + if (n >= 3) { stbi_uc* y = coutput[0]; - if(z->s->img_n == 3) + if (z->s->img_n == 3) { - if(is_rgb) + if (is_rgb) { - for(i = 0; i < z->s->img_x; ++i) + for (i = 0; i < z->s->img_x; ++i) { out[0] = y[i]; out[1] = coutput[1][i]; @@ -3891,11 +3884,11 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); } } - else if(z->s->img_n == 4) + else if (z->s->img_n == 4) { - if(z->app14_color_transform == 0) - { // CMYK - for(i = 0; i < z->s->img_x; ++i) + if (z->app14_color_transform == 0) + { // CMYK + for (i = 0; i < z->s->img_x; ++i) { stbi_uc m = coutput[3][i]; out[0] = stbi__blinn_8x8(coutput[0][i], m); @@ -3905,10 +3898,10 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp out += n; } } - else if(z->app14_color_transform == 2) - { // YCCK + else if (z->app14_color_transform == 2) + { // YCCK z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); - for(i = 0; i < z->s->img_x; ++i) + for (i = 0; i < z->s->img_x; ++i) { stbi_uc m = coutput[3][i]; out[0] = stbi__blinn_8x8(255 - out[0], m); @@ -3918,37 +3911,37 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp } } else - { // YCbCr + alpha? Ignore the fourth channel for now + { // YCbCr + alpha? Ignore the fourth channel for now z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); } } else - for(i = 0; i < z->s->img_x; ++i) + for (i = 0; i < z->s->img_x; ++i) { out[0] = out[1] = out[2] = y[i]; - out[3] = 255; // not used if n==3 + out[3] = 255; // not used if n==3 out += n; } } else { - if(is_rgb) + if (is_rgb) { - if(n == 1) - for(i = 0; i < z->s->img_x; ++i) + if (n == 1) + for (i = 0; i < z->s->img_x; ++i) *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); else { - for(i = 0; i < z->s->img_x; ++i, out += 2) + for (i = 0; i < z->s->img_x; ++i, out += 2) { out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); out[1] = 255; } } } - else if(z->s->img_n == 4 && z->app14_color_transform == 0) + else if (z->s->img_n == 4 && z->app14_color_transform == 0) { - for(i = 0; i < z->s->img_x; ++i) + for (i = 0; i < z->s->img_x; ++i) { stbi_uc m = coutput[3][i]; stbi_uc r = stbi__blinn_8x8(coutput[0][i], m); @@ -3959,9 +3952,9 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp out += n; } } - else if(z->s->img_n == 4 && z->app14_color_transform == 2) + else if (z->s->img_n == 4 && z->app14_color_transform == 2) { - for(i = 0; i < z->s->img_x; ++i) + for (i = 0; i < z->s->img_x; ++i) { out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]); out[1] = 255; @@ -3971,11 +3964,11 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp else { stbi_uc* y = coutput[0]; - if(n == 1) - for(i = 0; i < z->s->img_x; ++i) + if (n == 1) + for (i = 0; i < z->s->img_x; ++i) out[i] = y[i]; else - for(i = 0; i < z->s->img_x; ++i) + for (i = 0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255; } } @@ -3983,8 +3976,8 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp stbi__cleanup_jpeg(z); *out_x = z->s->img_x; *out_y = z->s->img_y; - if(comp) - *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output + if (comp) + *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output return output; } } @@ -3992,7 +3985,7 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp static void* stbi__jpeg_load(stbi__context* s, int* x, int* y, int* comp, int req_comp, stbi__result_info* ri) { unsigned char* result; - stbi__jpeg* j = ( stbi__jpeg* )stbi__malloc(sizeof(stbi__jpeg)); + stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg)); STBI_NOTUSED(ri); j->s = s; stbi__setup_jpeg(j); @@ -4004,7 +3997,7 @@ static void* stbi__jpeg_load(stbi__context* s, int* x, int* y, int* comp, int re static int stbi__jpeg_test(stbi__context* s) { int r; - stbi__jpeg* j = ( stbi__jpeg* )stbi__malloc(sizeof(stbi__jpeg)); + stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg)); j->s = s; stbi__setup_jpeg(j); r = stbi__decode_jpeg_header(j, STBI__SCAN_type); @@ -4015,16 +4008,16 @@ static int stbi__jpeg_test(stbi__context* s) static int stbi__jpeg_info_raw(stbi__jpeg* j, int* x, int* y, int* comp) { - if(!stbi__decode_jpeg_header(j, STBI__SCAN_header)) + if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) { stbi__rewind(j->s); return 0; } - if(x) + if (x) *x = j->s->img_x; - if(y) + if (y) *y = j->s->img_y; - if(comp) + if (comp) *comp = j->s->img_n >= 3 ? 3 : 1; return 1; } @@ -4032,7 +4025,7 @@ static int stbi__jpeg_info_raw(stbi__jpeg* j, int* x, int* y, int* comp) static int stbi__jpeg_info(stbi__context* s, int* x, int* y, int* comp) { int result; - stbi__jpeg* j = ( stbi__jpeg* )(stbi__malloc(sizeof(stbi__jpeg))); + stbi__jpeg* j = (stbi__jpeg*)(stbi__malloc(sizeof(stbi__jpeg))); j->s = s; result = stbi__jpeg_info_raw(j, x, y, comp); STBI_FREE(j); @@ -4050,7 +4043,7 @@ static int stbi__jpeg_info(stbi__context* s, int* x, int* y, int* comp) #ifndef STBI_NO_ZLIB // fast-way is faster to check than jpeg huffman, but slow way is slower -#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables +#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables #define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) // zlib-style huffman encoding @@ -4090,40 +4083,40 @@ static int stbi__zbuild_huffman(stbi__zhuffman* z, const stbi_uc* sizelist, int // DEFLATE spec for generating codes memset(sizes, 0, sizeof(sizes)); memset(z->fast, 0, sizeof(z->fast)); - for(i = 0; i < num; ++i) + for (i = 0; i < num; ++i) ++sizes[sizelist[i]]; sizes[0] = 0; - for(i = 1; i < 16; ++i) - if(sizes[i] > (1 << i)) + for (i = 1; i < 16; ++i) + if (sizes[i] > (1 << i)) return stbi__err("bad sizes", "Corrupt PNG"); code = 0; - for(i = 1; i < 16; ++i) + for (i = 1; i < 16; ++i) { next_code[i] = code; - z->firstcode[i] = ( stbi__uint16 )code; - z->firstsymbol[i] = ( stbi__uint16 )k; + z->firstcode[i] = (stbi__uint16)code; + z->firstsymbol[i] = (stbi__uint16)k; code = (code + sizes[i]); - if(sizes[i]) - if(code - 1 >= (1 << i)) + if (sizes[i]) + if (code - 1 >= (1 << i)) return stbi__err("bad codelengths", "Corrupt PNG"); - z->maxcode[i] = code << (16 - i); // preshift for inner loop + z->maxcode[i] = code << (16 - i); // preshift for inner loop code <<= 1; k += sizes[i]; } - z->maxcode[16] = 0x10000; // sentinel - for(i = 0; i < num; ++i) + z->maxcode[16] = 0x10000; // sentinel + for (i = 0; i < num; ++i) { int s = sizelist[i]; - if(s) + if (s) { int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; stbi__uint16 fastv = (stbi__uint16)((s << 9) | i); - z->size[c] = ( stbi_uc )s; - z->value[c] = ( stbi__uint16 )i; - if(s <= STBI__ZFAST_BITS) + z->size[c] = (stbi_uc)s; + z->value[c] = (stbi__uint16)i; + if (s <= STBI__ZFAST_BITS) { int j = stbi__bit_reverse(next_code[s], s); - while(j < (1 << STBI__ZFAST_BITS)) + while (j < (1 << STBI__ZFAST_BITS)) { z->fast[j] = fastv; j += (1 << s); @@ -4157,7 +4150,7 @@ typedef struct stbi_inline static stbi_uc stbi__zget8(stbi__zbuf* z) { - if(z->zbuffer >= z->zbuffer_end) + if (z->zbuffer >= z->zbuffer_end) return 0; return *z->zbuffer++; } @@ -4167,15 +4160,15 @@ static void stbi__fill_bits(stbi__zbuf* z) do { STBI_ASSERT(z->code_buffer < (1U << z->num_bits)); - z->code_buffer |= ( unsigned int )stbi__zget8(z) << z->num_bits; + z->code_buffer |= (unsigned int)stbi__zget8(z) << z->num_bits; z->num_bits += 8; - } while(z->num_bits <= 24); + } while (z->num_bits <= 24); } stbi_inline static unsigned int stbi__zreceive(stbi__zbuf* z, int n) { unsigned int k; - if(z->num_bits < n) + if (z->num_bits < n) stbi__fill_bits(z); k = z->code_buffer & ((1 << n) - 1); z->code_buffer >>= n; @@ -4189,11 +4182,11 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf* a, stbi__zhuffman* z) // not resolved by fast table, so compute it the slow way // use jpeg approach, which requires MSbits at top k = stbi__bit_reverse(a->code_buffer, 16); - for(s = STBI__ZFAST_BITS + 1;; ++s) - if(k < z->maxcode[s]) + for (s = STBI__ZFAST_BITS + 1;; ++s) + if (k < z->maxcode[s]) break; - if(s == 16) - return -1; // invalid code! + if (s == 16) + return -1; // invalid code! // code size is s, so: b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s]; STBI_ASSERT(z->size[b] == s); @@ -4205,10 +4198,10 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf* a, stbi__zhuffman* z) stbi_inline static int stbi__zhuffman_decode(stbi__zbuf* a, stbi__zhuffman* z) { int b, s; - if(a->num_bits < 16) + if (a->num_bits < 16) stbi__fill_bits(a); b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; - if(b) + if (b) { s = b >> 9; a->code_buffer >>= s; @@ -4218,20 +4211,20 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf* a, stbi__zhuffman* z) return stbi__zhuffman_decode_slowpath(a, z); } -static int stbi__zexpand(stbi__zbuf* z, char* zout, int n) // need to make room for n bytes +static int stbi__zexpand(stbi__zbuf* z, char* zout, int n) // need to make room for n bytes { char* q; int cur, limit, old_limit; z->zout = zout; - if(!z->z_expandable) + if (!z->z_expandable) return stbi__err("output buffer limit", "Corrupt PNG"); - cur = ( int )(z->zout - z->zout_start); - limit = old_limit = ( int )(z->zout_end - z->zout_start); - while(cur + n > limit) + cur = (int)(z->zout - z->zout_start); + limit = old_limit = (int)(z->zout_end - z->zout_start); + while (cur + n > limit) limit *= 2; - q = ( char* )STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); + q = (char*)STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); STBI_NOTUSED(old_limit); - if(q == NULL) + if (q == NULL) return stbi__err("outofmem", "Out of memory"); z->zout_start = q; z->zout = q + cur; @@ -4239,82 +4232,82 @@ static int stbi__zexpand(stbi__zbuf* z, char* zout, int n) // need to make ro return 1; } -static const int stbi__zlength_base[31] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, - 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; +static const int stbi__zlength_base[31] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; static const int stbi__zlength_extra[31] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 0, 0}; -static const int stbi__zdist_base[32] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, - 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, - 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0, 0}; +static const int stbi__zdist_base[32] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, + 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, + 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0, 0}; -static const int stbi__zdist_extra[32] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, +static const int stbi__zdist_extra[32] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; static int stbi__parse_huffman_block(stbi__zbuf* a) { char* zout = a->zout; - for(;;) + for (;;) { int z = stbi__zhuffman_decode(a, &a->z_length); - if(z < 256) + if (z < 256) { - if(z < 0) - return stbi__err("bad huffman code", "Corrupt PNG"); // error in huffman codes - if(zout >= a->zout_end) + if (z < 0) + return stbi__err("bad huffman code", "Corrupt PNG"); // error in huffman codes + if (zout >= a->zout_end) { - if(!stbi__zexpand(a, zout, 1)) + if (!stbi__zexpand(a, zout, 1)) return 0; zout = a->zout; } - *zout++ = ( char )z; + *zout++ = (char)z; } else { stbi_uc* p; int len, dist; - if(z == 256) + if (z == 256) { a->zout = zout; return 1; } z -= 257; len = stbi__zlength_base[z]; - if(stbi__zlength_extra[z]) + if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]); z = stbi__zhuffman_decode(a, &a->z_distance); - if(z < 0) + if (z < 0) return stbi__err("bad huffman code", "Corrupt PNG"); dist = stbi__zdist_base[z]; - if(stbi__zdist_extra[z]) + if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]); - if(zout - a->zout_start < dist) + if (zout - a->zout_start < dist) return stbi__err("bad dist", "Corrupt PNG"); - if(zout + len > a->zout_end) + if (zout + len > a->zout_end) { - if(!stbi__zexpand(a, zout, len)) + if (!stbi__zexpand(a, zout, len)) return 0; zout = a->zout; } - p = ( stbi_uc* )(zout - dist); - if(dist == 1) - { // run of one byte; common in images. + p = (stbi_uc*)(zout - dist); + if (dist == 1) + { // run of one byte; common in images. stbi_uc v = *p; - if(len) + if (len) { do *zout++ = v; - while(--len); + while (--len); } } else { - if(len) + if (len) { do *zout++ = *p++; - while(--len); + while (--len); } } } @@ -4325,7 +4318,7 @@ static int stbi__compute_huffman_codes(stbi__zbuf* a) { static const stbi_uc length_dezigzag[19] = {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; stbi__zhuffman z_codelength; - stbi_uc lencodes[286 + 32 + 137]; // padding for maximum single op + stbi_uc lencodes[286 + 32 + 137]; // padding for maximum single op stbi_uc codelength_sizes[19]; int i, n; @@ -4335,50 +4328,50 @@ static int stbi__compute_huffman_codes(stbi__zbuf* a) int ntot = hlit + hdist; memset(codelength_sizes, 0, sizeof(codelength_sizes)); - for(i = 0; i < hclen; ++i) + for (i = 0; i < hclen; ++i) { int s = stbi__zreceive(a, 3); - codelength_sizes[length_dezigzag[i]] = ( stbi_uc )s; + codelength_sizes[length_dezigzag[i]] = (stbi_uc)s; } - if(!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) + if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0; n = 0; - while(n < ntot) + while (n < ntot) { int c = stbi__zhuffman_decode(a, &z_codelength); - if(c < 0 || c >= 19) + if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG"); - if(c < 16) - lencodes[n++] = ( stbi_uc )c; + if (c < 16) + lencodes[n++] = (stbi_uc)c; else { stbi_uc fill = 0; - if(c == 16) + if (c == 16) { c = stbi__zreceive(a, 2) + 3; - if(n == 0) + if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG"); fill = lencodes[n - 1]; } - else if(c == 17) + else if (c == 17) c = stbi__zreceive(a, 3) + 3; else { STBI_ASSERT(c == 18); c = stbi__zreceive(a, 7) + 11; } - if(ntot - n < c) + if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG"); memset(lencodes + n, fill, c); n += c; } } - if(n != ntot) + if (n != ntot) return stbi__err("bad codelengths", "Corrupt PNG"); - if(!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) + if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; - if(!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist)) + if (!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist)) return 0; return 1; } @@ -4387,28 +4380,28 @@ static int stbi__parse_uncompressed_block(stbi__zbuf* a) { stbi_uc header[4]; int len, nlen, k; - if(a->num_bits & 7) - stbi__zreceive(a, a->num_bits & 7); // discard + if (a->num_bits & 7) + stbi__zreceive(a, a->num_bits & 7); // discard // drain the bit-packed data into header k = 0; - while(a->num_bits > 0) + while (a->num_bits > 0) { - header[k++] = (stbi_uc)(a->code_buffer & 255); // suppress MSVC run-time check + header[k++] = (stbi_uc)(a->code_buffer & 255); // suppress MSVC run-time check a->code_buffer >>= 8; a->num_bits -= 8; } STBI_ASSERT(a->num_bits == 0); // now fill header the normal way - while(k < 4) + while (k < 4) header[k++] = stbi__zget8(a); len = header[1] * 256 + header[0]; nlen = header[3] * 256 + header[2]; - if(nlen != (len ^ 0xffff)) + if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt", "Corrupt PNG"); - if(a->zbuffer + len > a->zbuffer_end) + if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer", "Corrupt PNG"); - if(a->zout + len > a->zout_end) - if(!stbi__zexpand(a, a->zout, len)) + if (a->zout + len > a->zout_end) + if (!stbi__zexpand(a, a->zout, len)) return 0; memcpy(a->zout, a->zbuffer, len); a->zbuffer += len; @@ -4422,12 +4415,12 @@ static int stbi__parse_zlib_header(stbi__zbuf* a) int cm = cmf & 15; /* int cinfo = cmf >> 4; */ int flg = stbi__zget8(a); - if((cmf * 256 + flg) % 31 != 0) - return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec - if(flg & 32) - return stbi__err("no preset dict", "Corrupt PNG"); // preset dictionary not allowed in png - if(cm != 8) - return stbi__err("bad compression", "Corrupt PNG"); // DEFLATE required for png + if ((cmf * 256 + flg) % 31 != 0) + return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec + if (flg & 32) + return stbi__err("no preset dict", "Corrupt PNG"); // preset dictionary not allowed in png + if (cm != 8) + return stbi__err("bad compression", "Corrupt PNG"); // DEFLATE required for png // window = 1 << (8 + cinfo)... but who cares, we fully buffer output return 1; } @@ -4459,8 +4452,8 @@ Init algorithm: static int stbi__parse_zlib(stbi__zbuf* a, int parse_header) { int final, type; - if(parse_header) - if(!stbi__parse_zlib_header(a)) + if (parse_header) + if (!stbi__parse_zlib_header(a)) return 0; a->num_bits = 0; a->code_buffer = 0; @@ -4468,34 +4461,34 @@ static int stbi__parse_zlib(stbi__zbuf* a, int parse_header) { final = stbi__zreceive(a, 1); type = stbi__zreceive(a, 2); - if(type == 0) + if (type == 0) { - if(!stbi__parse_uncompressed_block(a)) + if (!stbi__parse_uncompressed_block(a)) return 0; } - else if(type == 3) + else if (type == 3) { return 0; } else { - if(type == 1) + if (type == 1) { // use fixed code lengths - if(!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, 288)) + if (!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, 288)) return 0; - if(!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) + if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) return 0; } else { - if(!stbi__compute_huffman_codes(a)) + if (!stbi__compute_huffman_codes(a)) return 0; } - if(!stbi__parse_huffman_block(a)) + if (!stbi__parse_huffman_block(a)) return 0; } - } while(!final); + } while (!final); return 1; } @@ -4512,15 +4505,15 @@ static int stbi__do_zlib(stbi__zbuf* a, char* obuf, int olen, int exp, int parse extern char* stbi_zlib_decode_malloc_guesssize(const char* buffer, int len, int initial_size, int* outlen) { stbi__zbuf a; - char* p = ( char* )stbi__malloc(initial_size); - if(p == NULL) + char* p = (char*)stbi__malloc(initial_size); + if (p == NULL) return NULL; - a.zbuffer = ( stbi_uc* )buffer; - a.zbuffer_end = ( stbi_uc* )buffer + len; - if(stbi__do_zlib(&a, p, initial_size, 1, 1)) + a.zbuffer = (stbi_uc*)buffer; + a.zbuffer_end = (stbi_uc*)buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, 1)) { - if(outlen) - *outlen = ( int )(a.zout - a.zout_start); + if (outlen) + *outlen = (int)(a.zout - a.zout_start); return a.zout_start; } else @@ -4539,15 +4532,15 @@ extern char* stbi_zlib_decode_malloc_guesssize_headerflag(const char* buffer, in int parse_header) { stbi__zbuf a; - char* p = ( char* )stbi__malloc(initial_size); - if(p == NULL) + char* p = (char*)stbi__malloc(initial_size); + if (p == NULL) return NULL; - a.zbuffer = ( stbi_uc* )buffer; - a.zbuffer_end = ( stbi_uc* )buffer + len; - if(stbi__do_zlib(&a, p, initial_size, 1, parse_header)) + a.zbuffer = (stbi_uc*)buffer; + a.zbuffer_end = (stbi_uc*)buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) { - if(outlen) - *outlen = ( int )(a.zout - a.zout_start); + if (outlen) + *outlen = (int)(a.zout - a.zout_start); return a.zout_start; } else @@ -4560,10 +4553,10 @@ extern char* stbi_zlib_decode_malloc_guesssize_headerflag(const char* buffer, in extern int stbi_zlib_decode_buffer(char* obuffer, int olen, char const* ibuffer, int ilen) { stbi__zbuf a; - a.zbuffer = ( stbi_uc* )ibuffer; - a.zbuffer_end = ( stbi_uc* )ibuffer + ilen; - if(stbi__do_zlib(&a, obuffer, olen, 0, 1)) - return ( int )(a.zout - a.zout_start); + a.zbuffer = (stbi_uc*)ibuffer; + a.zbuffer_end = (stbi_uc*)ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 1)) + return (int)(a.zout - a.zout_start); else return -1; } @@ -4571,15 +4564,15 @@ extern int stbi_zlib_decode_buffer(char* obuffer, int olen, char const* ibuffer, extern char* stbi_zlib_decode_noheader_malloc(char const* buffer, int len, int* outlen) { stbi__zbuf a; - char* p = ( char* )stbi__malloc(16384); - if(p == NULL) + char* p = (char*)stbi__malloc(16384); + if (p == NULL) return NULL; - a.zbuffer = ( stbi_uc* )buffer; - a.zbuffer_end = ( stbi_uc* )buffer + len; - if(stbi__do_zlib(&a, p, 16384, 1, 0)) + a.zbuffer = (stbi_uc*)buffer; + a.zbuffer_end = (stbi_uc*)buffer + len; + if (stbi__do_zlib(&a, p, 16384, 1, 0)) { - if(outlen) - *outlen = ( int )(a.zout - a.zout_start); + if (outlen) + *outlen = (int)(a.zout - a.zout_start); return a.zout_start; } else @@ -4592,10 +4585,10 @@ extern char* stbi_zlib_decode_noheader_malloc(char const* buffer, int len, int* extern int stbi_zlib_decode_noheader_buffer(char* obuffer, int olen, const char* ibuffer, int ilen) { stbi__zbuf a; - a.zbuffer = ( stbi_uc* )ibuffer; - a.zbuffer_end = ( stbi_uc* )ibuffer + ilen; - if(stbi__do_zlib(&a, obuffer, olen, 0, 0)) - return ( int )(a.zout - a.zout_start); + a.zbuffer = (stbi_uc*)ibuffer; + a.zbuffer_end = (stbi_uc*)ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 0)) + return (int)(a.zout - a.zout_start); else return -1; } @@ -4630,8 +4623,8 @@ static int stbi__check_png_header(stbi__context* s) { static const stbi_uc png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10}; int i; - for(i = 0; i < 8; ++i) - if(stbi__get8(s) != png_sig[i]) + for (i = 0; i < 8; ++i) + if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig", "Not a PNG"); return 1; } @@ -4663,9 +4656,9 @@ static int stbi__paeth(int a, int b, int c) int pa = abs(p - a); int pb = abs(p - b); int pc = abs(p - c); - if(pa <= pb && pa <= pc) + if (pa <= pb && pa <= pc) return a; - if(pb <= pc) + if (pb <= pc) return b; return c; } @@ -4681,18 +4674,18 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r stbi__uint32 i, j, stride = x * out_n * bytes; stbi__uint32 img_len, img_width_bytes; int k; - int img_n = s->img_n; // copy it into a local for later + int img_n = s->img_n; // copy it into a local for later int output_bytes = out_n * bytes; int filter_bytes = img_n * bytes; int width = x; STBI_ASSERT(out_n == s->img_n || out_n == s->img_n + 1); - a->out = ( stbi_uc* )stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into - if(!a->out) + a->out = (stbi_uc*)stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) return stbi__err("outofmem", "Out of memory"); - if(!stbi__mad3sizes_valid(img_n, x, depth, 7)) + if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG"); img_width_bytes = (((img_n * x * depth) + 7) >> 3); img_len = (img_width_bytes + 1) * y; @@ -4700,75 +4693,74 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), // so just check for raw_len < img_len always. - if(raw_len < img_len) + if (raw_len < img_len) return stbi__err("not enough pixels", "Corrupt PNG"); - for(j = 0; j < y; ++j) + for (j = 0; j < y; ++j) { stbi_uc* cur = a->out + stride * j; stbi_uc* prior; int filter = *raw++; - if(filter > 4) + if (filter > 4) return stbi__err("invalid filter", "Corrupt PNG"); - if(depth < 8) + if (depth < 8) { STBI_ASSERT(img_width_bytes <= x); - cur += x * out_n - - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place + cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place filter_bytes = 1; width = img_width_bytes; } - prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above + prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above // if first row, use special filter that doesn't sample previous row - if(j == 0) + if (j == 0) filter = first_row_filter[filter]; // handle first byte explicitly - for(k = 0; k < filter_bytes; ++k) + for (k = 0; k < filter_bytes; ++k) { - switch(filter) + switch (filter) { - case STBI__F_none: - cur[k] = raw[k]; - break; - case STBI__F_sub: - cur[k] = raw[k]; - break; - case STBI__F_up: - cur[k] = STBI__BYTECAST(raw[k] + prior[k]); - break; - case STBI__F_avg: - cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1)); - break; - case STBI__F_paeth: - cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0)); - break; - case STBI__F_avg_first: - cur[k] = raw[k]; - break; - case STBI__F_paeth_first: - cur[k] = raw[k]; - break; + case STBI__F_none: + cur[k] = raw[k]; + break; + case STBI__F_sub: + cur[k] = raw[k]; + break; + case STBI__F_up: + cur[k] = STBI__BYTECAST(raw[k] + prior[k]); + break; + case STBI__F_avg: + cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1)); + break; + case STBI__F_paeth: + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0)); + break; + case STBI__F_avg_first: + cur[k] = raw[k]; + break; + case STBI__F_paeth_first: + cur[k] = raw[k]; + break; } } - if(depth == 8) + if (depth == 8) { - if(img_n != out_n) - cur[img_n] = 255; // first pixel + if (img_n != out_n) + cur[img_n] = 255; // first pixel raw += img_n; cur += out_n; prior += out_n; } - else if(depth == 16) + else if (depth == 16) { - if(img_n != out_n) + if (img_n != out_n) { - cur[filter_bytes] = 255; // first pixel top byte - cur[filter_bytes + 1] = 255; // first pixel bottom byte + cur[filter_bytes] = 255; // first pixel top byte + cur[filter_bytes + 1] = 255; // first pixel bottom byte } raw += filter_bytes; cur += output_bytes; @@ -4782,49 +4774,48 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r } // this is a little gross, so that we don't switch per-pixel or per-component - if(depth < 8 || img_n == out_n) + if (depth < 8 || img_n == out_n) { int nk = (width - 1) * filter_bytes; #define STBI__CASE(f) \ case f: \ - for(k = 0; k < nk; ++k) - switch(filter) + for (k = 0; k < nk; ++k) + switch (filter) { - // "none" filter turns into a memcpy here; make that explicit. - case STBI__F_none: - memcpy(cur, raw, nk); - break; - STBI__CASE(STBI__F_sub) - { - cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]); - } - break; - STBI__CASE(STBI__F_up) - { - cur[k] = STBI__BYTECAST(raw[k] + prior[k]); - } - break; - STBI__CASE(STBI__F_avg) - { - cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); - } - break; - STBI__CASE(STBI__F_paeth) - { - cur[k] = STBI__BYTECAST(raw[k] + - stbi__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); - } - break; - STBI__CASE(STBI__F_avg_first) - { - cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); - } - break; - STBI__CASE(STBI__F_paeth_first) - { - cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0)); - } - break; + // "none" filter turns into a memcpy here; make that explicit. + case STBI__F_none: + memcpy(cur, raw, nk); + break; + STBI__CASE(STBI__F_sub) + { + cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]); + } + break; + STBI__CASE(STBI__F_up) + { + cur[k] = STBI__BYTECAST(raw[k] + prior[k]); + } + break; + STBI__CASE(STBI__F_avg) + { + cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); + } + break; + STBI__CASE(STBI__F_paeth) + { + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); + } + break; + STBI__CASE(STBI__F_avg_first) + { + cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); + } + break; + STBI__CASE(STBI__F_paeth_first) + { + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0)); + } + break; } #undef STBI__CASE raw += nk; @@ -4832,12 +4823,12 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r else { STBI_ASSERT(img_n + 1 == out_n); -#define STBI__CASE(f) \ - case f: \ - for(i = x - 1; i >= 1; \ - --i, cur[filter_bytes] = 255, raw += filter_bytes, cur += output_bytes, prior += output_bytes) \ - for(k = 0; k < filter_bytes; ++k) - switch(filter) +#define STBI__CASE(f) \ + case f: \ + for (i = x - 1; i >= 1; \ + --i, cur[filter_bytes] = 255, raw += filter_bytes, cur += output_bytes, prior += output_bytes) \ + for (k = 0; k < filter_bytes; ++k) + switch (filter) { STBI__CASE(STBI__F_none) { @@ -4861,8 +4852,7 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r break; STBI__CASE(STBI__F_paeth) { - cur[k] = - STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); } break; STBI__CASE(STBI__F_avg_first) @@ -4880,10 +4870,10 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r // the loop above sets the high byte of the pixels' alpha, but for // 16 bit png files we also need the low byte set. we'll do that here. - if(depth == 16) + if (depth == 16) { - cur = a->out + stride * j; // start at the beginning of the row again - for(i = 0; i < x; ++i, cur += output_bytes) + cur = a->out + stride * j; // start at the beginning of the row again + for (i = 0; i < x; ++i, cur += output_bytes) { cur[filter_bytes + 1] = 255; } @@ -4894,17 +4884,16 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r // we make a separate pass to expand bits to pixels; for performance, // this could run two scanlines behind the above code, so it won't // intefere with filtering but will still be in the cache. - if(depth < 8) + if (depth < 8) { - for(j = 0; j < y; ++j) + for (j = 0; j < y; ++j) { stbi_uc* cur = a->out + stride * j; stbi_uc* in = a->out + stride * j + x * out_n - img_width_bytes; // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for // 1/2/4-bit png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data // that will be skipped in the later loop - stbi_uc scale = - (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range // note that the final byte might overshoot and write more data than desired. // we can allocate enough data that this never writes out of memory, but it @@ -4912,35 +4901,35 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel. // so we need to explicitly clamp the final ones - if(depth == 4) + if (depth == 4) { - for(k = x * img_n; k >= 2; k -= 2, ++in) + for (k = x * img_n; k >= 2; k -= 2, ++in) { *cur++ = scale * ((*in >> 4)); *cur++ = scale * ((*in) & 0x0f); } - if(k > 0) + if (k > 0) *cur++ = scale * ((*in >> 4)); } - else if(depth == 2) + else if (depth == 2) { - for(k = x * img_n; k >= 4; k -= 4, ++in) + for (k = x * img_n; k >= 4; k -= 4, ++in) { *cur++ = scale * ((*in >> 6)); *cur++ = scale * ((*in >> 4) & 0x03); *cur++ = scale * ((*in >> 2) & 0x03); *cur++ = scale * ((*in) & 0x03); } - if(k > 0) + if (k > 0) *cur++ = scale * ((*in >> 6)); - if(k > 1) + if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03); - if(k > 2) + if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03); } - else if(depth == 1) + else if (depth == 1) { - for(k = x * img_n; k >= 8; k -= 8, ++in) + for (k = x * img_n; k >= 8; k -= 8, ++in) { *cur++ = scale * ((*in >> 7)); *cur++ = scale * ((*in >> 6) & 0x01); @@ -4951,29 +4940,29 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r *cur++ = scale * ((*in >> 1) & 0x01); *cur++ = scale * ((*in) & 0x01); } - if(k > 0) + if (k > 0) *cur++ = scale * ((*in >> 7)); - if(k > 1) + if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); - if(k > 2) + if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); - if(k > 3) + if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); - if(k > 4) + if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); - if(k > 5) + if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); - if(k > 6) + if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); } - if(img_n != out_n) + if (img_n != out_n) { int q; // insert alpha = 255 cur = a->out + stride * j; - if(img_n == 1) + if (img_n == 1) { - for(q = x - 1; q >= 0; --q) + for (q = x - 1; q >= 0; --q) { cur[q * 2 + 1] = 255; cur[q * 2 + 0] = cur[q]; @@ -4982,7 +4971,7 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r else { STBI_ASSERT(img_n == 3); - for(q = x - 1; q >= 0; --q) + for (q = x - 1; q >= 0; --q) { cur[q * 4 + 3] = 255; cur[q * 4 + 2] = cur[q * 3 + 2]; @@ -4993,16 +4982,16 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r } } } - else if(depth == 16) + else if (depth == 16) { // force the image data from big-endian to platform-native. // this is done in a separate pass due to the decoding relying // on the data being untouched, but could probably be done // per-line during decode if care is taken. stbi_uc* cur = a->out; - stbi__uint16* cur16 = ( stbi__uint16* )cur; + stbi__uint16* cur16 = (stbi__uint16*)cur; - for(i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) + for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) { *cur16 = (cur[0] << 8) | cur[1]; } @@ -5018,12 +5007,12 @@ static int stbi__create_png_image(stbi__png* a, stbi_uc* image_data, stbi__uint3 int out_bytes = out_n * bytes; stbi_uc* final; int p; - if(!interlaced) + if (!interlaced) return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); // de-interlacing - final = ( stbi_uc* )stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); - for(p = 0; p < 7; ++p) + final = (stbi_uc*)stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + for (p = 0; p < 7; ++p) { int xorig[] = {0, 4, 0, 2, 0, 1, 0}; int yorig[] = {0, 0, 4, 0, 2, 0, 1}; @@ -5033,17 +5022,17 @@ static int stbi__create_png_image(stbi__png* a, stbi_uc* image_data, stbi__uint3 // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p]; y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p]; - if(x && y) + if (x && y) { stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; - if(!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) + if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { STBI_FREE(final); return 0; } - for(j = 0; j < y; ++j) + for (j = 0; j < y; ++j) { - for(i = 0; i < x; ++i) + for (i = 0; i < x; ++i) { int out_y = j * yspc[p] + yorig[p]; int out_x = i * xspc[p] + xorig[p]; @@ -5071,9 +5060,9 @@ static int stbi__compute_transparency(stbi__png* z, stbi_uc tc[3], int out_n) // already got 255 as the alpha value in the output STBI_ASSERT(out_n == 2 || out_n == 4); - if(out_n == 2) + if (out_n == 2) { - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { p[1] = (p[0] == tc[0] ? 0 : 255); p += 2; @@ -5081,9 +5070,9 @@ static int stbi__compute_transparency(stbi__png* z, stbi_uc tc[3], int out_n) } else { - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { - if(p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) p[3] = 0; p += 4; } @@ -5095,15 +5084,15 @@ static int stbi__compute_transparency16(stbi__png* z, stbi__uint16 tc[3], int ou { stbi__context* s = z->s; stbi__uint32 i, pixel_count = s->img_x * s->img_y; - stbi__uint16* p = ( stbi__uint16* )z->out; + stbi__uint16* p = (stbi__uint16*)z->out; // compute color-based transparency, assuming we've // already got 65535 as the alpha value in the output STBI_ASSERT(out_n == 2 || out_n == 4); - if(out_n == 2) + if (out_n == 2) { - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { p[1] = (p[0] == tc[0] ? 0 : 65535); p += 2; @@ -5111,9 +5100,9 @@ static int stbi__compute_transparency16(stbi__png* z, stbi__uint16 tc[3], int ou } else { - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { - if(p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) p[3] = 0; p += 4; } @@ -5126,16 +5115,16 @@ static int stbi__expand_png_palette(stbi__png* a, stbi_uc* palette, int len, int stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y; stbi_uc *p, *temp_out, *orig = a->out; - p = ( stbi_uc* )stbi__malloc_mad2(pixel_count, pal_img_n, 0); - if(p == NULL) + p = (stbi_uc*)stbi__malloc_mad2(pixel_count, pal_img_n, 0); + if (p == NULL) return stbi__err("outofmem", "Out of memory"); // between here and free(out) below, exitting would leak temp_out = p; - if(pal_img_n == 3) + if (pal_img_n == 3) { - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { int n = orig[i] * 4; p[0] = palette[n]; @@ -5146,7 +5135,7 @@ static int stbi__expand_png_palette(stbi__png* a, stbi_uc* palette, int len, int } else { - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { int n = orig[i] * 4; p[0] = palette[n]; @@ -5183,9 +5172,9 @@ static void stbi__de_iphone(stbi__png* z) stbi__uint32 i, pixel_count = s->img_x * s->img_y; stbi_uc* p = z->out; - if(s->img_out_n == 3) - { // convert bgr to rgb - for(i = 0; i < pixel_count; ++i) + if (s->img_out_n == 3) + { // convert bgr to rgb + for (i = 0; i < pixel_count; ++i) { stbi_uc t = p[0]; p[0] = p[2]; @@ -5196,14 +5185,14 @@ static void stbi__de_iphone(stbi__png* z) else { STBI_ASSERT(s->img_out_n == 4); - if(stbi__unpremultiply_on_load) + if (stbi__unpremultiply_on_load) { // convert bgr to rgb and unpremultiply - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { stbi_uc a = p[3]; stbi_uc t = p[0]; - if(a) + if (a) { stbi_uc half = a / 2; p[0] = (p[2] * 255 + half) / a; @@ -5221,7 +5210,7 @@ static void stbi__de_iphone(stbi__png* z) else { // convert bgr to rgb - for(i = 0; i < pixel_count; ++i) + for (i = 0; i < pixel_count; ++i) { stbi_uc t = p[0]; p[0] = p[2]; @@ -5233,7 +5222,7 @@ static void stbi__de_iphone(stbi__png* z) } #define STBI__PNG_TYPE(a, b, c, d) \ - ((( unsigned )(a) << 24) + (( unsigned )(b) << 16) + (( unsigned )(c) << 8) + ( unsigned )(d)) + (((unsigned)(a) << 24) + ((unsigned)(b) << 16) + ((unsigned)(c) << 8) + (unsigned)(d)) static int stbi__parse_png_file(stbi__png* z, int scan, int req_comp) { @@ -5248,250 +5237,249 @@ static int stbi__parse_png_file(stbi__png* z, int scan, int req_comp) z->idata = NULL; z->out = NULL; - if(!stbi__check_png_header(s)) + if (!stbi__check_png_header(s)) return 0; - if(scan == STBI__SCAN_type) + if (scan == STBI__SCAN_type) return 1; - for(;;) + for (;;) { stbi__pngchunk c = stbi__get_chunk_header(s); - switch(c.type) + switch (c.type) { - case STBI__PNG_TYPE('C', 'g', 'B', 'I'): - is_iphone = 1; - stbi__skip(s, c.length); - break; - case STBI__PNG_TYPE('I', 'H', 'D', 'R'): + case STBI__PNG_TYPE('C', 'g', 'B', 'I'): + is_iphone = 1; + stbi__skip(s, c.length); + break; + case STBI__PNG_TYPE('I', 'H', 'D', 'R'): + { + int comp, filter; + if (!first) + return stbi__err("multiple IHDR", "Corrupt PNG"); + first = 0; + if (c.length != 13) + return stbi__err("bad IHDR len", "Corrupt PNG"); + s->img_x = stbi__get32be(s); + if (s->img_x > (1 << 24)) + return stbi__err("too large", "Very large image (corrupt?)"); + s->img_y = stbi__get32be(s); + if (s->img_y > (1 << 24)) + return stbi__err("too large", "Very large image (corrupt?)"); + z->depth = stbi__get8(s); + if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) + return stbi__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only"); + color = stbi__get8(s); + if (color > 6) + return stbi__err("bad ctype", "Corrupt PNG"); + if (color == 3 && z->depth == 16) + return stbi__err("bad ctype", "Corrupt PNG"); + if (color == 3) + pal_img_n = 3; + else if (color & 1) + return stbi__err("bad ctype", "Corrupt PNG"); + comp = stbi__get8(s); + if (comp) + return stbi__err("bad comp method", "Corrupt PNG"); + filter = stbi__get8(s); + if (filter) + return stbi__err("bad filter method", "Corrupt PNG"); + interlace = stbi__get8(s); + if (interlace > 1) + return stbi__err("bad interlace method", "Corrupt PNG"); + if (!s->img_x || !s->img_y) + return stbi__err("0-pixel image", "Corrupt PNG"); + if (!pal_img_n) { - int comp, filter; - if(!first) - return stbi__err("multiple IHDR", "Corrupt PNG"); - first = 0; - if(c.length != 13) - return stbi__err("bad IHDR len", "Corrupt PNG"); - s->img_x = stbi__get32be(s); - if(s->img_x > (1 << 24)) - return stbi__err("too large", "Very large image (corrupt?)"); - s->img_y = stbi__get32be(s); - if(s->img_y > (1 << 24)) - return stbi__err("too large", "Very large image (corrupt?)"); - z->depth = stbi__get8(s); - if(z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) - return stbi__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only"); - color = stbi__get8(s); - if(color > 6) - return stbi__err("bad ctype", "Corrupt PNG"); - if(color == 3 && z->depth == 16) - return stbi__err("bad ctype", "Corrupt PNG"); - if(color == 3) - pal_img_n = 3; - else if(color & 1) - return stbi__err("bad ctype", "Corrupt PNG"); - comp = stbi__get8(s); - if(comp) - return stbi__err("bad comp method", "Corrupt PNG"); - filter = stbi__get8(s); - if(filter) - return stbi__err("bad filter method", "Corrupt PNG"); - interlace = stbi__get8(s); - if(interlace > 1) - return stbi__err("bad interlace method", "Corrupt PNG"); - if(!s->img_x || !s->img_y) - return stbi__err("0-pixel image", "Corrupt PNG"); - if(!pal_img_n) - { - s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); - if((1 << 30) / s->img_x / s->img_n < s->img_y) - return stbi__err("too large", "Image too large to decode"); - if(scan == STBI__SCAN_header) - return 1; - } - else - { - // if paletted, then pal_n is our final components, and - // img_n is # components to decompress/filter. - s->img_n = 1; - if((1 << 30) / s->img_x / 4 < s->img_y) - return stbi__err("too large", "Corrupt PNG"); - // if SCAN_header, have to scan to see if we have a tRNS - } - break; + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) + return stbi__err("too large", "Image too large to decode"); + if (scan == STBI__SCAN_header) + return 1; + } + else + { + // if paletted, then pal_n is our final components, and + // img_n is # components to decompress/filter. + s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) + return stbi__err("too large", "Corrupt PNG"); + // if SCAN_header, have to scan to see if we have a tRNS } + break; + } - case STBI__PNG_TYPE('P', 'L', 'T', 'E'): + case STBI__PNG_TYPE('P', 'L', 'T', 'E'): + { + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (c.length > 256 * 3) + return stbi__err("invalid PLTE", "Corrupt PNG"); + pal_len = c.length / 3; + if (pal_len * 3 != c.length) + return stbi__err("invalid PLTE", "Corrupt PNG"); + for (i = 0; i < pal_len; ++i) + { + palette[i * 4 + 0] = stbi__get8(s); + palette[i * 4 + 1] = stbi__get8(s); + palette[i * 4 + 2] = stbi__get8(s); + palette[i * 4 + 3] = 255; + } + break; + } + + case STBI__PNG_TYPE('t', 'R', 'N', 'S'): + { + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (z->idata) + return stbi__err("tRNS after IDAT", "Corrupt PNG"); + if (pal_img_n) { - if(first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if(c.length > 256 * 3) - return stbi__err("invalid PLTE", "Corrupt PNG"); - pal_len = c.length / 3; - if(pal_len * 3 != c.length) - return stbi__err("invalid PLTE", "Corrupt PNG"); - for(i = 0; i < pal_len; ++i) + if (scan == STBI__SCAN_header) { - palette[i * 4 + 0] = stbi__get8(s); - palette[i * 4 + 1] = stbi__get8(s); - palette[i * 4 + 2] = stbi__get8(s); - palette[i * 4 + 3] = 255; + s->img_n = 4; + return 1; } - break; + if (pal_len == 0) + return stbi__err("tRNS before PLTE", "Corrupt PNG"); + if (c.length > pal_len) + return stbi__err("bad tRNS len", "Corrupt PNG"); + pal_img_n = 4; + for (i = 0; i < c.length; ++i) + palette[i * 4 + 3] = stbi__get8(s); } - - case STBI__PNG_TYPE('t', 'R', 'N', 'S'): + else { - if(first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if(z->idata) - return stbi__err("tRNS after IDAT", "Corrupt PNG"); - if(pal_img_n) + if (!(s->img_n & 1)) + return stbi__err("tRNS with alpha", "Corrupt PNG"); + if (c.length != (stbi__uint32)s->img_n * 2) + return stbi__err("bad tRNS len", "Corrupt PNG"); + has_trans = 1; + if (z->depth == 16) { - if(scan == STBI__SCAN_header) - { - s->img_n = 4; - return 1; - } - if(pal_len == 0) - return stbi__err("tRNS before PLTE", "Corrupt PNG"); - if(c.length > pal_len) - return stbi__err("bad tRNS len", "Corrupt PNG"); - pal_img_n = 4; - for(i = 0; i < c.length; ++i) - palette[i * 4 + 3] = stbi__get8(s); + for (k = 0; k < s->img_n; ++k) + tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is } else { - if(!(s->img_n & 1)) - return stbi__err("tRNS with alpha", "Corrupt PNG"); - if(c.length != ( stbi__uint32 )s->img_n * 2) - return stbi__err("bad tRNS len", "Corrupt PNG"); - has_trans = 1; - if(z->depth == 16) - { - for(k = 0; k < s->img_n; ++k) - tc16[k] = ( stbi__uint16 )stbi__get16be(s); // copy the values as-is - } - else - { - for(k = 0; k < s->img_n; ++k) - tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * - stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger - } + for (k = 0; k < s->img_n; ++k) + tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger } - break; } + break; + } - case STBI__PNG_TYPE('I', 'D', 'A', 'T'): + case STBI__PNG_TYPE('I', 'D', 'A', 'T'): + { + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) + return stbi__err("no PLTE", "Corrupt PNG"); + if (scan == STBI__SCAN_header) { - if(first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if(pal_img_n && !pal_len) - return stbi__err("no PLTE", "Corrupt PNG"); - if(scan == STBI__SCAN_header) - { - s->img_n = pal_img_n; - return 1; - } - if(( int )(ioff + c.length) < ( int )ioff) - return 0; - if(ioff + c.length > idata_limit) - { - stbi__uint32 idata_limit_old = idata_limit; - stbi_uc* p; - if(idata_limit == 0) - idata_limit = c.length > 4096 ? c.length : 4096; - while(ioff + c.length > idata_limit) - idata_limit *= 2; - STBI_NOTUSED(idata_limit_old); - p = ( stbi_uc* )STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); - if(p == NULL) - return stbi__err("outofmem", "Out of memory"); - z->idata = p; - } - if(!stbi__getn(s, z->idata + ioff, c.length)) - return stbi__err("outofdata", "Corrupt PNG"); - ioff += c.length; - break; + s->img_n = pal_img_n; + return 1; + } + if ((int)(ioff + c.length) < (int)ioff) + return 0; + if (ioff + c.length > idata_limit) + { + stbi__uint32 idata_limit_old = idata_limit; + stbi_uc* p; + if (idata_limit == 0) + idata_limit = c.length > 4096 ? c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + STBI_NOTUSED(idata_limit_old); + p = (stbi_uc*)STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); + if (p == NULL) + return stbi__err("outofmem", "Out of memory"); + z->idata = p; } + if (!stbi__getn(s, z->idata + ioff, c.length)) + return stbi__err("outofdata", "Corrupt PNG"); + ioff += c.length; + break; + } - case STBI__PNG_TYPE('I', 'E', 'N', 'D'): + case STBI__PNG_TYPE('I', 'E', 'N', 'D'): + { + stbi__uint32 raw_len, bpl; + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (scan != STBI__SCAN_load) + return 1; + if (z->idata == NULL) + return stbi__err("no IDAT", "Corrupt PNG"); + // initial guess for decoded data size to avoid unnecessary reallocs + bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component + raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; + z->expanded = (stbi_uc*)stbi_zlib_decode_malloc_guesssize_headerflag((char*)z->idata, ioff, raw_len, + (int*)&raw_len, !is_iphone); + if (z->expanded == NULL) + return 0; // zlib should set error + STBI_FREE(z->idata); + z->idata = NULL; + if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n + 1; + else + s->img_out_n = s->img_n; + if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) + return 0; + if (has_trans) { - stbi__uint32 raw_len, bpl; - if(first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if(scan != STBI__SCAN_load) - return 1; - if(z->idata == NULL) - return stbi__err("no IDAT", "Corrupt PNG"); - // initial guess for decoded data size to avoid unnecessary reallocs - bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component - raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; - z->expanded = ( stbi_uc* )stbi_zlib_decode_malloc_guesssize_headerflag(( char* )z->idata, ioff, raw_len, - ( int* )&raw_len, !is_iphone); - if(z->expanded == NULL) - return 0; // zlib should set error - STBI_FREE(z->idata); - z->idata = NULL; - if((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans) - s->img_out_n = s->img_n + 1; - else - s->img_out_n = s->img_n; - if(!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) - return 0; - if(has_trans) + if (z->depth == 16) { - if(z->depth == 16) - { - if(!stbi__compute_transparency16(z, tc16, s->img_out_n)) - return 0; - } - else - { - if(!stbi__compute_transparency(z, tc, s->img_out_n)) - return 0; - } - } - if(is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) - stbi__de_iphone(z); - if(pal_img_n) - { - // pal_img_n == 3 or 4 - s->img_n = pal_img_n; // record the actual colors we had - s->img_out_n = pal_img_n; - if(req_comp >= 3) - s->img_out_n = req_comp; - if(!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) + if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0; } - else if(has_trans) + else { - // non-paletted image with tRNS -> source image has (constant) alpha - ++s->img_n; + if (!stbi__compute_transparency(z, tc, s->img_out_n)) + return 0; } - STBI_FREE(z->expanded); - z->expanded = NULL; - return 1; } + if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) + stbi__de_iphone(z); + if (pal_img_n) + { + // pal_img_n == 3 or 4 + s->img_n = pal_img_n; // record the actual colors we had + s->img_out_n = pal_img_n; + if (req_comp >= 3) + s->img_out_n = req_comp; + if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) + return 0; + } + else if (has_trans) + { + // non-paletted image with tRNS -> source image has (constant) alpha + ++s->img_n; + } + STBI_FREE(z->expanded); + z->expanded = NULL; + return 1; + } - default: - // if critical, fail - if(first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if((c.type & (1 << 29)) == 0) - { + default: + // if critical, fail + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if ((c.type & (1 << 29)) == 0) + { #ifndef STBI_NO_FAILURE_STRINGS - // not threadsafe - static char invalid_chunk[] = "XXXX PNG chunk not known"; - invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); - invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); - invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); - invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); + // not threadsafe + static char invalid_chunk[] = "XXXX PNG chunk not known"; + invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); + invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); + invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); + invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); #endif - return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); - } - stbi__skip(s, c.length); - break; + return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); + } + stbi__skip(s, c.length); + break; } // end of PNG chunk, read and skip CRC stbi__get32be(s); @@ -5501,31 +5489,30 @@ static int stbi__parse_png_file(stbi__png* z, int scan, int req_comp) static void* stbi__do_png(stbi__png* p, int* x, int* y, int* n, int req_comp, stbi__result_info* ri) { void* result = NULL; - if(req_comp < 0 || req_comp > 4) + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); - if(stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) + if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { - if(p->depth < 8) + if (p->depth < 8) ri->bits_per_channel = 8; else ri->bits_per_channel = p->depth; result = p->out; p->out = NULL; - if(req_comp && req_comp != p->s->img_out_n) + if (req_comp && req_comp != p->s->img_out_n) { - if(ri->bits_per_channel == 8) - result = - stbi__convert_format(( unsigned char* )result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + if (ri->bits_per_channel == 8) + result = stbi__convert_format((unsigned char*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); else - result = stbi__convert_format16(( stbi__uint16* )result, p->s->img_out_n, req_comp, p->s->img_x, + result = stbi__convert_format16((stbi__uint16*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); p->s->img_out_n = req_comp; - if(result == NULL) + if (result == NULL) return result; } *x = p->s->img_x; *y = p->s->img_y; - if(n) + if (n) *n = p->s->img_n; } STBI_FREE(p->out); @@ -5555,16 +5542,16 @@ static int stbi__png_test(stbi__context* s) static int stbi__png_info_raw(stbi__png* p, int* x, int* y, int* comp) { - if(!stbi__parse_png_file(p, STBI__SCAN_header, 0)) + if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) { stbi__rewind(p->s); return 0; } - if(x) + if (x) *x = p->s->img_x; - if(y) + if (y) *y = p->s->img_y; - if(comp) + if (comp) *comp = p->s->img_n; return 1; } @@ -5580,9 +5567,9 @@ static int stbi__png_is16(stbi__context* s) { stbi__png p; p.s = s; - if(!stbi__png_info_raw(&p, NULL, NULL, NULL)) + if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) return 0; - if(p.depth != 16) + if (p.depth != 16) { stbi__rewind(p.s); return 0; @@ -5598,14 +5585,14 @@ static int stbi__bmp_test_raw(stbi__context* s) { int r; int sz; - if(stbi__get8(s) != 'B') + if (stbi__get8(s) != 'B') return 0; - if(stbi__get8(s) != 'M') + if (stbi__get8(s) != 'M') return 0; - stbi__get32le(s); // discard filesize - stbi__get16le(s); // discard reserved - stbi__get16le(s); // discard reserved - stbi__get32le(s); // discard data offset + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + stbi__get32le(s); // discard data offset sz = stbi__get32le(s); r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124); return r; @@ -5622,28 +5609,28 @@ static int stbi__bmp_test(stbi__context* s) static int stbi__high_bit(unsigned int z) { int n = 0; - if(z == 0) + if (z == 0) return -1; - if(z >= 0x10000) + if (z >= 0x10000) n += 16, z >>= 16; - if(z >= 0x00100) + if (z >= 0x00100) n += 8, z >>= 8; - if(z >= 0x00010) + if (z >= 0x00010) n += 4, z >>= 4; - if(z >= 0x00004) + if (z >= 0x00004) n += 2, z >>= 2; - if(z >= 0x00002) + if (z >= 0x00002) n += 1, z >>= 1; return n; } static int stbi__bitcount(unsigned int a) { - a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2 - a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4 - a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits - a = (a + (a >> 8)); // max 16 per 8 bits - a = (a + (a >> 16)); // max 32 per 8 bits + a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2 + a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4 + a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits + a = (a + (a >> 8)); // max 16 per 8 bits + a = (a + (a >> 16)); // max 32 per 8 bits return a & 0xff; } @@ -5664,16 +5651,24 @@ static int stbi__shiftsigned(int v, int shift, int bits) 0x01 /*0b00000001*/, }; static unsigned int shift_table[9] = { - 0, 0, 0, 1, 0, 2, 4, 6, 0, + 0, + 0, + 0, + 1, + 0, + 2, + 4, + 6, + 0, }; - if(shift < 0) + if (shift < 0) v <<= -shift; else v >>= shift; STBI_ASSERT(v >= 0 && v < 256); v >>= (8 - bits); STBI_ASSERT(bits >= 0 && bits <= 8); - return ( int )(( unsigned )v * mul_table[bits]) >> shift_table[bits]; + return (int)((unsigned)v * mul_table[bits]) >> shift_table[bits]; } typedef struct @@ -5685,18 +5680,18 @@ typedef struct static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info) { int hsz; - if(stbi__get8(s) != 'B' || stbi__get8(s) != 'M') + if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP"); - stbi__get32le(s); // discard filesize - stbi__get16le(s); // discard reserved - stbi__get16le(s); // discard reserved + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved info->offset = stbi__get32le(s); info->hsz = hsz = stbi__get32le(s); info->mr = info->mg = info->mb = info->ma = 0; - if(hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) + if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown"); - if(hsz == 12) + if (hsz == 12) { s->img_x = stbi__get16le(s); s->img_y = stbi__get16le(s); @@ -5706,39 +5701,39 @@ static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info) s->img_x = stbi__get32le(s); s->img_y = stbi__get32le(s); } - if(stbi__get16le(s) != 1) + if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP"); info->bpp = stbi__get16le(s); - if(hsz != 12) + if (hsz != 12) { int compress = stbi__get32le(s); - if(compress == 1 || compress == 2) + if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); - stbi__get32le(s); // discard sizeof - stbi__get32le(s); // discard hres - stbi__get32le(s); // discard vres - stbi__get32le(s); // discard colorsused - stbi__get32le(s); // discard max important - if(hsz == 40 || hsz == 56) - { - if(hsz == 56) + stbi__get32le(s); // discard sizeof + stbi__get32le(s); // discard hres + stbi__get32le(s); // discard vres + stbi__get32le(s); // discard colorsused + stbi__get32le(s); // discard max important + if (hsz == 40 || hsz == 56) + { + if (hsz == 56) { stbi__get32le(s); stbi__get32le(s); stbi__get32le(s); stbi__get32le(s); } - if(info->bpp == 16 || info->bpp == 32) + if (info->bpp == 16 || info->bpp == 32) { - if(compress == 0) + if (compress == 0) { - if(info->bpp == 32) + if (info->bpp == 32) { info->mr = 0xffu << 16; info->mg = 0xffu << 8; info->mb = 0xffu << 0; info->ma = 0xffu << 24; - info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 + info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 } else { @@ -5747,13 +5742,13 @@ static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info) info->mb = 31u << 0; } } - else if(compress == 3) + else if (compress == 3) { info->mr = stbi__get32le(s); info->mg = stbi__get32le(s); info->mb = stbi__get32le(s); // not documented, but generated by photoshop and handled by mspaint - if(info->mr == info->mg && info->mg == info->mb) + if (info->mr == info->mg && info->mg == info->mb) { // ?!?!? return stbi__errpuc("bad BMP", "bad BMP"); @@ -5766,25 +5761,25 @@ static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info) else { int i; - if(hsz != 108 && hsz != 124) + if (hsz != 108 && hsz != 124) return stbi__errpuc("bad BMP", "bad BMP"); info->mr = stbi__get32le(s); info->mg = stbi__get32le(s); info->mb = stbi__get32le(s); info->ma = stbi__get32le(s); - stbi__get32le(s); // discard color space - for(i = 0; i < 12; ++i) - stbi__get32le(s); // discard color space parameters - if(hsz == 124) + stbi__get32le(s); // discard color space + for (i = 0; i < 12; ++i) + stbi__get32le(s); // discard color space parameters + if (hsz == 124) { - stbi__get32le(s); // discard rendering intent - stbi__get32le(s); // discard offset of profile data - stbi__get32le(s); // discard size of profile data - stbi__get32le(s); // discard reserved + stbi__get32le(s); // discard rendering intent + stbi__get32le(s); // discard offset of profile data + stbi__get32le(s); // discard size of profile data + stbi__get32le(s); // discard reserved } } } - return ( void* )1; + return (void*)1; } static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req_comp, stbi__result_info* ri) @@ -5798,11 +5793,11 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req STBI_NOTUSED(ri); info.all_a = 255; - if(stbi__bmp_parse_header(s, &info) == NULL) - return NULL; // error code already set + if (stbi__bmp_parse_header(s, &info) == NULL) + return NULL; // error code already set - flip_vertically = (( int )s->img_y) > 0; - s->img_y = abs(( int )s->img_y); + flip_vertically = ((int)s->img_y) > 0; + s->img_y = abs((int)s->img_y); mr = info.mr; mg = info.mg; @@ -5810,53 +5805,53 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req ma = info.ma; all_a = info.all_a; - if(info.hsz == 12) + if (info.hsz == 12) { - if(info.bpp < 24) + if (info.bpp < 24) psize = (info.offset - 14 - 24) / 3; } else { - if(info.bpp < 16) + if (info.bpp < 16) psize = (info.offset - 14 - info.hsz) >> 2; } s->img_n = ma ? 4 : 3; - if(req_comp && req_comp >= 3) // we can directly decode 3 or 4 + if (req_comp && req_comp >= 3) // we can directly decode 3 or 4 target = req_comp; else - target = s->img_n; // if they want monochrome, we'll post-convert + target = s->img_n; // if they want monochrome, we'll post-convert // sanity-check size - if(!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) + if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) return stbi__errpuc("too large", "Corrupt BMP"); - out = ( stbi_uc* )stbi__malloc_mad3(target, s->img_x, s->img_y, 0); - if(!out) + out = (stbi_uc*)stbi__malloc_mad3(target, s->img_x, s->img_y, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); - if(info.bpp < 16) + if (info.bpp < 16) { int z = 0; - if(psize == 0 || psize > 256) + if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); } - for(i = 0; i < psize; ++i) + for (i = 0; i < psize; ++i) { pal[i][2] = stbi__get8(s); pal[i][1] = stbi__get8(s); pal[i][0] = stbi__get8(s); - if(info.hsz != 12) + if (info.hsz != 12) stbi__get8(s); pal[i][3] = 255; } stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4)); - if(info.bpp == 1) + if (info.bpp == 1) width = (s->img_x + 7) >> 3; - else if(info.bpp == 4) + else if (info.bpp == 4) width = (s->img_x + 1) >> 1; - else if(info.bpp == 8) + else if (info.bpp == 8) width = s->img_x; else { @@ -5864,18 +5859,18 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req return stbi__errpuc("bad bpp", "Corrupt BMP"); } pad = (-width) & 3; - if(info.bpp == 1) + if (info.bpp == 1) { - for(j = 0; j < ( int )s->img_y; ++j) + for (j = 0; j < (int)s->img_y; ++j) { int bit_offset = 7, v = stbi__get8(s); - for(i = 0; i < ( int )s->img_x; ++i) + for (i = 0; i < (int)s->img_x; ++i) { int color = (v >> bit_offset) & 0x1; out[z++] = pal[color][0]; out[z++] = pal[color][1]; out[z++] = pal[color][2]; - if((--bit_offset) < 0) + if ((--bit_offset) < 0) { bit_offset = 7; v = stbi__get8(s); @@ -5886,12 +5881,12 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req } else { - for(j = 0; j < ( int )s->img_y; ++j) + for (j = 0; j < (int)s->img_y; ++j) { - for(i = 0; i < ( int )s->img_x; i += 2) + for (i = 0; i < (int)s->img_x; i += 2) { int v = stbi__get8(s), v2 = 0; - if(info.bpp == 4) + if (info.bpp == 4) { v2 = v & 15; v >>= 4; @@ -5899,15 +5894,15 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req out[z++] = pal[v][0]; out[z++] = pal[v][1]; out[z++] = pal[v][2]; - if(target == 4) + if (target == 4) out[z++] = 255; - if(i + 1 == ( int )s->img_x) + if (i + 1 == (int)s->img_x) break; v = (info.bpp == 8) ? stbi__get8(s) : v2; out[z++] = pal[v][0]; out[z++] = pal[v][1]; out[z++] = pal[v][2]; - if(target == 4) + if (target == 4) out[z++] = 255; } stbi__skip(s, pad); @@ -5920,25 +5915,25 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req int z = 0; int easy = 0; stbi__skip(s, info.offset - 14 - info.hsz); - if(info.bpp == 24) + if (info.bpp == 24) width = 3 * s->img_x; - else if(info.bpp == 16) + else if (info.bpp == 16) width = 2 * s->img_x; else /* bpp = 32 and pad = 0 */ width = 0; pad = (-width) & 3; - if(info.bpp == 24) + if (info.bpp == 24) { easy = 1; } - else if(info.bpp == 32) + else if (info.bpp == 32) { - if(mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) + if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) easy = 2; } - if(!easy) + if (!easy) { - if(!mr || !mg || !mb) + if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); @@ -5953,11 +5948,11 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req ashift = stbi__high_bit(ma) - 7; acount = stbi__bitcount(ma); } - for(j = 0; j < ( int )s->img_y; ++j) + for (j = 0; j < (int)s->img_y; ++j) { - if(easy) + if (easy) { - for(i = 0; i < ( int )s->img_x; ++i) + for (i = 0; i < (int)s->img_x; ++i) { unsigned char a; out[z + 2] = stbi__get8(s); @@ -5966,23 +5961,23 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req z += 3; a = (easy == 2 ? stbi__get8(s) : 255); all_a |= a; - if(target == 4) + if (target == 4) out[z++] = a; } } else { int bpp = info.bpp; - for(i = 0; i < ( int )s->img_x; ++i) + for (i = 0; i < (int)s->img_x; ++i) { - stbi__uint32 v = (bpp == 16 ? ( stbi__uint32 )stbi__get16le(s) : stbi__get32le(s)); + stbi__uint32 v = (bpp == 16 ? (stbi__uint32)stbi__get16le(s) : stbi__get32le(s)); unsigned int a; out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount)); out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount)); out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount)); a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255); all_a |= a; - if(target == 4) + if (target == 4) out[z++] = STBI__BYTECAST(a); } } @@ -5991,34 +5986,34 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req } // if alpha channel is all 0s, replace with all 255s - if(target == 4 && all_a == 0) - for(i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4) + if (target == 4 && all_a == 0) + for (i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4) out[i] = 255; - if(flip_vertically) + if (flip_vertically) { stbi_uc t; - for(j = 0; j<( int )s->img_y>> 1; ++j) + for (j = 0; j < (int)s->img_y >> 1; ++j) { stbi_uc* p1 = out + j * s->img_x * target; stbi_uc* p2 = out + (s->img_y - 1 - j) * s->img_x * target; - for(i = 0; i < ( int )s->img_x * target; ++i) + for (i = 0; i < (int)s->img_x * target; ++i) { t = p1[i], p1[i] = p2[i], p2[i] = t; } } } - if(req_comp && req_comp != target) + if (req_comp && req_comp != target) { out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y); - if(out == NULL) - return out; // stbi__convert_format frees input on failure + if (out == NULL) + return out; // stbi__convert_format frees input on failure } *x = s->img_x; *y = s->img_y; - if(comp) + if (comp) *comp = s->img_n; return out; } @@ -6031,25 +6026,25 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16) { // only RGB or RGBA (incl. 16bit) or grey allowed - if(is_rgb16) + if (is_rgb16) *is_rgb16 = 0; - switch(bits_per_pixel) - { - case 8: - return STBI_grey; - case 16: - if(is_grey) - return STBI_grey_alpha; - // fallthrough - case 15: - if(is_rgb16) - *is_rgb16 = 1; - return STBI_rgb; - case 24: // fallthrough - case 32: - return bits_per_pixel / 8; - default: - return 0; + switch (bits_per_pixel) + { + case 8: + return STBI_grey; + case 16: + if (is_grey) + return STBI_grey_alpha; + // fallthrough + case 15: + if (is_rgb16) + *is_rgb16 = 1; + return STBI_rgb; + case 24: // fallthrough + case 32: + return bits_per_pixel / 8; + default: + return 0; } } @@ -6057,58 +6052,58 @@ static int stbi__tga_info(stbi__context* s, int* x, int* y, int* comp) { int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp; int sz, tga_colormap_type; - stbi__get8(s); // discard Offset - tga_colormap_type = stbi__get8(s); // colormap type - if(tga_colormap_type > 1) + stbi__get8(s); // discard Offset + tga_colormap_type = stbi__get8(s); // colormap type + if (tga_colormap_type > 1) { stbi__rewind(s); - return 0; // only RGB or indexed allowed + return 0; // only RGB or indexed allowed } - tga_image_type = stbi__get8(s); // image type - if(tga_colormap_type == 1) - { // colormapped (paletted) image - if(tga_image_type != 1 && tga_image_type != 9) + tga_image_type = stbi__get8(s); // image type + if (tga_colormap_type == 1) + { // colormapped (paletted) image + if (tga_image_type != 1 && tga_image_type != 9) { stbi__rewind(s); return 0; } - stbi__skip(s, 4); // skip index of first colormap entry and number of entries - sz = stbi__get8(s); // check bits per palette color entry - if((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) + stbi__skip(s, 4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) { stbi__rewind(s); return 0; } - stbi__skip(s, 4); // skip image x and y origin + stbi__skip(s, 4); // skip image x and y origin tga_colormap_bpp = sz; } else - { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE - if((tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11)) + { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE + if ((tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11)) { stbi__rewind(s); - return 0; // only RGB or grey allowed, +/- RLE + return 0; // only RGB or grey allowed, +/- RLE } - stbi__skip(s, 9); // skip colormap specification and image x/y origin + stbi__skip(s, 9); // skip colormap specification and image x/y origin tga_colormap_bpp = 0; } tga_w = stbi__get16le(s); - if(tga_w < 1) + if (tga_w < 1) { stbi__rewind(s); - return 0; // test width + return 0; // test width } tga_h = stbi__get16le(s); - if(tga_h < 1) + if (tga_h < 1) { stbi__rewind(s); - return 0; // test height + return 0; // test height } - tga_bits_per_pixel = stbi__get8(s); // bits per pixel - stbi__get8(s); // ignore alpha bits - if(tga_colormap_bpp != 0) + tga_bits_per_pixel = stbi__get8(s); // bits per pixel + stbi__get8(s); // ignore alpha bits + if (tga_colormap_bpp != 0) { - if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) + if ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) { // when using a colormap, tga_bits_per_pixel is the size of the indexes // I don't think anything but 8 or 16bit indexes makes sense @@ -6121,56 +6116,56 @@ static int stbi__tga_info(stbi__context* s, int* x, int* y, int* comp) { tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL); } - if(!tga_comp) + if (!tga_comp) { stbi__rewind(s); return 0; } - if(x) + if (x) *x = tga_w; - if(y) + if (y) *y = tga_h; - if(comp) + if (comp) *comp = tga_comp; - return 1; // seems to have passed everything + return 1; // seems to have passed everything } static int stbi__tga_test(stbi__context* s) { int res = 0; int sz, tga_color_type; - stbi__get8(s); // discard Offset - tga_color_type = stbi__get8(s); // color type - if(tga_color_type > 1) - goto errorEnd; // only RGB or indexed allowed - sz = stbi__get8(s); // image type - if(tga_color_type == 1) - { // colormapped (paletted) image - if(sz != 1 && sz != 9) - goto errorEnd; // colortype 1 demands image type 1 or 9 - stbi__skip(s, 4); // skip index of first colormap entry and number of entries - sz = stbi__get8(s); // check bits per palette color entry - if((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) + stbi__get8(s); // discard Offset + tga_color_type = stbi__get8(s); // color type + if (tga_color_type > 1) + goto errorEnd; // only RGB or indexed allowed + sz = stbi__get8(s); // image type + if (tga_color_type == 1) + { // colormapped (paletted) image + if (sz != 1 && sz != 9) + goto errorEnd; // colortype 1 demands image type 1 or 9 + stbi__skip(s, 4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) goto errorEnd; - stbi__skip(s, 4); // skip image x and y origin + stbi__skip(s, 4); // skip image x and y origin } else - { // "normal" image w/o colormap - if((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11)) - goto errorEnd; // only RGB or grey allowed, +/- RLE - stbi__skip(s, 9); // skip colormap specification and image x/y origin - } - if(stbi__get16le(s) < 1) - goto errorEnd; // test width - if(stbi__get16le(s) < 1) - goto errorEnd; // test height - sz = stbi__get8(s); // bits per pixel - if((tga_color_type == 1) && (sz != 8) && (sz != 16)) - goto errorEnd; // for colormapped images, bpp is size of an index - if((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) + { // "normal" image w/o colormap + if ((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11)) + goto errorEnd; // only RGB or grey allowed, +/- RLE + stbi__skip(s, 9); // skip colormap specification and image x/y origin + } + if (stbi__get16le(s) < 1) + goto errorEnd; // test width + if (stbi__get16le(s) < 1) + goto errorEnd; // test height + sz = stbi__get8(s); // bits per pixel + if ((tga_color_type == 1) && (sz != 8) && (sz != 16)) + goto errorEnd; // for colormapped images, bpp is size of an index + if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) goto errorEnd; - res = 1; // if we got this far, everything's good and we can return 1 instead of 0 + res = 1; // if we got this far, everything's good and we can return 1 instead of 0 errorEnd: stbi__rewind(s); @@ -6180,7 +6175,7 @@ static int stbi__tga_test(stbi__context* s) // read 16bit value and convert to 24bit RGB static void stbi__tga_read_rgb16(stbi__context* s, stbi_uc* out) { - stbi__uint16 px = ( stbi__uint16 )stbi__get16le(s); + stbi__uint16 px = (stbi__uint16)stbi__get16le(s); stbi__uint16 fiveBitMask = 31; // we have 3 channels with 5bits each int r = (px >> 10) & fiveBitMask; @@ -6226,7 +6221,7 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req STBI_NOTUSED(ri); // do a tiny bit of precessing - if(tga_image_type >= 8) + if (tga_image_type >= 8) { tga_image_type -= 8; tga_is_RLE = 1; @@ -6234,33 +6229,33 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req tga_inverted = 1 - ((tga_inverted >> 5) & 1); // If I'm paletted, then I'll use the number of bits from the palette - if(tga_indexed) + if (tga_indexed) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16); else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16); - if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency + if (!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency return stbi__errpuc("bad format", "Can't find out TGA pixelformat"); // tga info *x = tga_width; *y = tga_height; - if(comp) + if (comp) *comp = tga_comp; - if(!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0)) + if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0)) return stbi__errpuc("too large", "Corrupt TGA"); - tga_data = ( unsigned char* )stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0); - if(!tga_data) + tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0); + if (!tga_data) return stbi__errpuc("outofmem", "Out of memory"); // skip to the data's starting position (offset usually = 0) stbi__skip(s, tga_offset); - if(!tga_indexed && !tga_is_RLE && !tga_rgb16) + if (!tga_indexed && !tga_is_RLE && !tga_rgb16) { - for(i = 0; i < tga_height; ++i) + for (i = 0; i < tga_height; ++i) { int row = tga_inverted ? tga_height - i - 1 : i; stbi_uc* tga_row = tga_data + row * tga_width * tga_comp; @@ -6270,28 +6265,28 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req else { // do I need to load a palette? - if(tga_indexed) + if (tga_indexed) { // any data to skip? (offset usually = 0) stbi__skip(s, tga_palette_start); // load the palette - tga_palette = ( unsigned char* )stbi__malloc_mad2(tga_palette_len, tga_comp, 0); - if(!tga_palette) + tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0); + if (!tga_palette) { STBI_FREE(tga_data); return stbi__errpuc("outofmem", "Out of memory"); } - if(tga_rgb16) + if (tga_rgb16) { stbi_uc* pal_entry = tga_palette; STBI_ASSERT(tga_comp == STBI_rgb); - for(i = 0; i < tga_palette_len; ++i) + for (i = 0; i < tga_palette_len; ++i) { stbi__tga_read_rgb16(s, pal_entry); pal_entry += tga_comp; } } - else if(!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) + else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) { STBI_FREE(tga_data); STBI_FREE(tga_palette); @@ -6299,12 +6294,12 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req } } // load the data - for(i = 0; i < tga_width * tga_height; ++i) + for (i = 0; i < tga_width * tga_height; ++i) { // if I'm in RLE mode, do I need to get a RLE stbi__pngchunk? - if(tga_is_RLE) + if (tga_is_RLE) { - if(RLE_count == 0) + if (RLE_count == 0) { // yep, get the next byte as a RLE command int RLE_cmd = stbi__get8(s); @@ -6312,7 +6307,7 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req RLE_repeating = RLE_cmd >> 7; read_next_pixel = 1; } - else if(!RLE_repeating) + else if (!RLE_repeating) { read_next_pixel = 1; } @@ -6322,25 +6317,25 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req read_next_pixel = 1; } // OK, if I need to read a pixel, do it now - if(read_next_pixel) + if (read_next_pixel) { // load however much data we did have - if(tga_indexed) + if (tga_indexed) { // read in index, then perform the lookup int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s); - if(pal_idx >= tga_palette_len) + if (pal_idx >= tga_palette_len) { // invalid index pal_idx = 0; } pal_idx *= tga_comp; - for(j = 0; j < tga_comp; ++j) + for (j = 0; j < tga_comp; ++j) { raw_data[j] = tga_palette[pal_idx + j]; } } - else if(tga_rgb16) + else if (tga_rgb16) { STBI_ASSERT(tga_comp == STBI_rgb); stbi__tga_read_rgb16(s, raw_data); @@ -6348,30 +6343,30 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req else { // read in the data raw - for(j = 0; j < tga_comp; ++j) + for (j = 0; j < tga_comp; ++j) { raw_data[j] = stbi__get8(s); } } // clear the reading flag for the next pixel read_next_pixel = 0; - } // end of reading a pixel + } // end of reading a pixel // copy data - for(j = 0; j < tga_comp; ++j) + for (j = 0; j < tga_comp; ++j) tga_data[i * tga_comp + j] = raw_data[j]; // in case we're in RLE mode, keep counting down --RLE_count; } // do I need to invert the image? - if(tga_inverted) + if (tga_inverted) { - for(j = 0; j * 2 < tga_height; ++j) + for (j = 0; j * 2 < tga_height; ++j) { int index1 = j * tga_width * tga_comp; int index2 = (tga_height - 1 - j) * tga_width * tga_comp; - for(i = tga_width * tga_comp; i > 0; --i) + for (i = tga_width * tga_comp; i > 0; --i) { unsigned char temp = tga_data[index1]; tga_data[index1] = tga_data[index2]; @@ -6382,17 +6377,17 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req } } // clear my palette, if I had one - if(tga_palette != NULL) + if (tga_palette != NULL) { STBI_FREE(tga_palette); } } // swap RGB - if the source data was RGB16, it already is in the right order - if(tga_comp >= 3 && !tga_rgb16) + if (tga_comp >= 3 && !tga_rgb16) { unsigned char* tga_pixel = tga_data; - for(i = 0; i < tga_width * tga_height; ++i) + for (i = 0; i < tga_width * tga_height; ++i) { unsigned char temp = tga_pixel[0]; tga_pixel[0] = tga_pixel[2]; @@ -6402,7 +6397,7 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req } // convert to target component count - if(req_comp && req_comp != tga_comp) + if (req_comp && req_comp != tga_comp) tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height); // the things I do to get rid of an error message, and yet keep @@ -6429,38 +6424,38 @@ static int stbi__psd_decode_rle(stbi__context* s, stbi_uc* p, int pixelCount) int count, nleft, len; count = 0; - while((nleft = pixelCount - count) > 0) + while ((nleft = pixelCount - count) > 0) { len = stbi__get8(s); - if(len == 128) + if (len == 128) { // No-op. } - else if(len < 128) + else if (len < 128) { // Copy next len+1 bytes literally. len++; - if(len > nleft) - return 0; // corrupt data + if (len > nleft) + return 0; // corrupt data count += len; - while(len) + while (len) { *p = stbi__get8(s); p += 4; len--; } } - else if(len > 128) + else if (len > 128) { stbi_uc val; // Next -len+1 bytes in the dest are replicated from next source byte. // (Interpret len as a negative 8-bit int.) len = 257 - len; - if(len > nleft) - return 0; // corrupt data + if (len > nleft) + return 0; // corrupt data val = stbi__get8(s); count += len; - while(len) + while (len) { *p = val; p += 4; @@ -6483,11 +6478,11 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req STBI_NOTUSED(ri); // Check identifier - if(stbi__get32be(s) != 0x38425053) // "8BPS" + if (stbi__get32be(s) != 0x38425053) // "8BPS" return stbi__errpuc("not PSD", "Corrupt PSD image"); // Check file type version. - if(stbi__get16be(s) != 1) + if (stbi__get16be(s) != 1) return stbi__errpuc("wrong version", "Unsupported version of PSD image"); // Skip 6 reserved bytes. @@ -6495,7 +6490,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req // Read the number of channels (R, G, B, A, etc). channelCount = stbi__get16be(s); - if(channelCount < 0 || channelCount > 16) + if (channelCount < 0 || channelCount > 16) return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image"); // Read the rows and columns of the image. @@ -6504,7 +6499,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req // Make sure the depth is 8 bits. bitdepth = stbi__get16be(s); - if(bitdepth != 8 && bitdepth != 16) + if (bitdepth != 8 && bitdepth != 16) return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit"); // Make sure the color mode is RGB. @@ -6517,7 +6512,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req // 7: Multichannel // 8: Duotone // 9: Lab color - if(stbi__get16be(s) != 3) + if (stbi__get16be(s) != 3) return stbi__errpuc("wrong color format", "PSD is not in RGB color format"); // Skip the Mode Data. (It's the palette for indexed color; other info for other modes.) @@ -6534,24 +6529,24 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req // 0: no compression // 1: RLE compressed compression = stbi__get16be(s); - if(compression > 1) + if (compression > 1) return stbi__errpuc("bad compression", "PSD has an unknown compression format"); // Check size - if(!stbi__mad3sizes_valid(4, w, h, 0)) + if (!stbi__mad3sizes_valid(4, w, h, 0)) return stbi__errpuc("too large", "Corrupt PSD"); // Create the destination image. - if(!compression && bitdepth == 16 && bpc == 16) + if (!compression && bitdepth == 16 && bpc == 16) { - out = ( stbi_uc* )stbi__malloc_mad3(8, w, h, 0); + out = (stbi_uc*)stbi__malloc_mad3(8, w, h, 0); ri->bits_per_channel = 16; } else - out = ( stbi_uc* )stbi__malloc(4 * (size_t)w * h); + out = (stbi_uc*)stbi__malloc(4 * (size_t)w * h); - if(!out) + if (!out) return stbi__errpuc("outofmem", "Out of memory"); pixelCount = w * h; @@ -6559,7 +6554,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req // memset( out, 0, pixelCount * 4 ); // Finally, the image data. - if(compression) + if (compression) { // RLE as used by .PSD and .TIFF // Loop until you get the number of unpacked bytes you are expecting: @@ -6574,21 +6569,21 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req stbi__skip(s, h * channelCount * 2); // Read the RLE data by channel. - for(channel = 0; channel < 4; channel++) + for (channel = 0; channel < 4; channel++) { stbi_uc* p; p = out + channel; - if(channel >= channelCount) + if (channel >= channelCount) { // Fill this channel with default data. - for(i = 0; i < pixelCount; i++, p += 4) + for (i = 0; i < pixelCount; i++, p += 4) *p = (channel == 3 ? 255 : 0); } else { // Read the RLE data. - if(!stbi__psd_decode_rle(s, p, pixelCount)) + if (!stbi__psd_decode_rle(s, p, pixelCount)) { STBI_FREE(out); return stbi__errpuc("corrupt", "bad RLE data"); @@ -6602,45 +6597,45 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image. // Read the data by channel. - for(channel = 0; channel < 4; channel++) + for (channel = 0; channel < 4; channel++) { - if(channel >= channelCount) + if (channel >= channelCount) { // Fill this channel with default data. - if(bitdepth == 16 && bpc == 16) + if (bitdepth == 16 && bpc == 16) { - stbi__uint16* q = (( stbi__uint16* )out) + channel; + stbi__uint16* q = ((stbi__uint16*)out) + channel; stbi__uint16 val = channel == 3 ? 65535 : 0; - for(i = 0; i < pixelCount; i++, q += 4) + for (i = 0; i < pixelCount; i++, q += 4) *q = val; } else { stbi_uc* p = out + channel; stbi_uc val = channel == 3 ? 255 : 0; - for(i = 0; i < pixelCount; i++, p += 4) + for (i = 0; i < pixelCount; i++, p += 4) *p = val; } } else { - if(ri->bits_per_channel == 16) - { // output bpc - stbi__uint16* q = (( stbi__uint16* )out) + channel; - for(i = 0; i < pixelCount; i++, q += 4) - *q = ( stbi__uint16 )stbi__get16be(s); + if (ri->bits_per_channel == 16) + { // output bpc + stbi__uint16* q = ((stbi__uint16*)out) + channel; + for (i = 0; i < pixelCount; i++, q += 4) + *q = (stbi__uint16)stbi__get16be(s); } else { stbi_uc* p = out + channel; - if(bitdepth == 16) - { // input bpc - for(i = 0; i < pixelCount; i++, p += 4) + if (bitdepth == 16) + { // input bpc + for (i = 0; i < pixelCount; i++, p += 4) *p = (stbi_uc)(stbi__get16be(s) >> 8); } else { - for(i = 0; i < pixelCount; i++, p += 4) + for (i = 0; i < pixelCount; i++, p += 4) *p = stbi__get8(s); } } @@ -6649,14 +6644,14 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req } // remove weird white matte from PSD - if(channelCount >= 4) + if (channelCount >= 4) { - if(ri->bits_per_channel == 16) + if (ri->bits_per_channel == 16) { - for(i = 0; i < w * h; ++i) + for (i = 0; i < w * h; ++i) { - stbi__uint16* pixel = ( stbi__uint16* )out + 4 * i; - if(pixel[3] != 0 && pixel[3] != 65535) + stbi__uint16* pixel = (stbi__uint16*)out + 4 * i; + if (pixel[3] != 0 && pixel[3] != 65535) { float a = pixel[3] / 65535.0f; float ra = 1.0f / a; @@ -6669,34 +6664,34 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req } else { - for(i = 0; i < w * h; ++i) + for (i = 0; i < w * h; ++i) { unsigned char* pixel = out + 4 * i; - if(pixel[3] != 0 && pixel[3] != 255) + if (pixel[3] != 0 && pixel[3] != 255) { float a = pixel[3] / 255.0f; float ra = 1.0f / a; float inv_a = 255.0f * (1 - ra); - pixel[0] = ( unsigned char )(pixel[0] * ra + inv_a); - pixel[1] = ( unsigned char )(pixel[1] * ra + inv_a); - pixel[2] = ( unsigned char )(pixel[2] * ra + inv_a); + pixel[0] = (unsigned char)(pixel[0] * ra + inv_a); + pixel[1] = (unsigned char)(pixel[1] * ra + inv_a); + pixel[2] = (unsigned char)(pixel[2] * ra + inv_a); } } } } // convert to desired output format - if(req_comp && req_comp != 4) + if (req_comp && req_comp != 4) { - if(ri->bits_per_channel == 16) - out = ( stbi_uc* )stbi__convert_format16(( stbi__uint16* )out, 4, req_comp, w, h); + if (ri->bits_per_channel == 16) + out = (stbi_uc*)stbi__convert_format16((stbi__uint16*)out, 4, req_comp, w, h); else out = stbi__convert_format(out, 4, req_comp, w, h); - if(out == NULL) - return out; // stbi__convert_format frees input on failure + if (out == NULL) + return out; // stbi__convert_format frees input on failure } - if(comp) + if (comp) *comp = 4; *y = h; *x = w; @@ -6716,8 +6711,8 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req static int stbi__pic_is4(stbi__context* s, const char* str) { int i; - for(i = 0; i < 4; ++i) - if(stbi__get8(s) != ( stbi_uc )str[i]) + for (i = 0; i < 4; ++i) + if (stbi__get8(s) != (stbi_uc)str[i]) return 0; return 1; @@ -6727,13 +6722,13 @@ static int stbi__pic_test_core(stbi__context* s) { int i; - if(!stbi__pic_is4(s, "\x53\x80\xF6\x34")) + if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) return 0; - for(i = 0; i < 84; ++i) + for (i = 0; i < 84; ++i) stbi__get8(s); - if(!stbi__pic_is4(s, "PICT")) + if (!stbi__pic_is4(s, "PICT")) return 0; return 1; @@ -6748,11 +6743,11 @@ static stbi_uc* stbi__readval(stbi__context* s, int channel, stbi_uc* dest) { int mask = 0x80, i; - for(i = 0; i < 4; ++i, mask >>= 1) + for (i = 0; i < 4; ++i, mask >>= 1) { - if(channel & mask) + if (channel & mask) { - if(stbi__at_eof(s)) + if (stbi__at_eof(s)) return stbi__errpuc("bad file", "PIC file too short"); dest[i] = stbi__get8(s); } @@ -6765,8 +6760,8 @@ static void stbi__copyval(int channel, stbi_uc* dest, const stbi_uc* src) { int mask = 0x80, i; - for(i = 0; i < 4; ++i, mask >>= 1) - if(channel & mask) + for (i = 0; i < 4; ++i, mask >>= 1) + if (channel & mask) dest[i] = src[i]; } @@ -6781,7 +6776,7 @@ static stbi_uc* stbi__pic_load_core(stbi__context* s, int width, int height, int { stbi__pic_packet* packet; - if(num_packets == sizeof(packets) / sizeof(packets[0])) + if (num_packets == sizeof(packets) / sizeof(packets[0])) return stbi__errpuc("bad format", "too many packets"); packet = &packets[num_packets++]; @@ -6793,103 +6788,103 @@ static stbi_uc* stbi__pic_load_core(stbi__context* s, int width, int height, int act_comp |= packet->channel; - if(stbi__at_eof(s)) + if (stbi__at_eof(s)) return stbi__errpuc("bad file", "file too short (reading packets)"); - if(packet->size != 8) + if (packet->size != 8) return stbi__errpuc("bad format", "packet isn't 8bpp"); - } while(chained); + } while (chained); - *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel? + *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel? - for(y = 0; y < height; ++y) + for (y = 0; y < height; ++y) { int packet_idx; - for(packet_idx = 0; packet_idx < num_packets; ++packet_idx) + for (packet_idx = 0; packet_idx < num_packets; ++packet_idx) { stbi__pic_packet* packet = &packets[packet_idx]; stbi_uc* dest = result + y * width * 4; - switch(packet->type) + switch (packet->type) { - default: - return stbi__errpuc("bad format", "packet has bad compression type"); + default: + return stbi__errpuc("bad format", "packet has bad compression type"); - case 0: - { // uncompressed - int x; + case 0: + { // uncompressed + int x; - for(x = 0; x < width; ++x, dest += 4) - if(!stbi__readval(s, packet->channel, dest)) - return 0; - break; - } + for (x = 0; x < width; ++x, dest += 4) + if (!stbi__readval(s, packet->channel, dest)) + return 0; + break; + } - case 1: // Pure RLE - { - int left = width, i; + case 1: // Pure RLE + { + int left = width, i; - while(left > 0) - { - stbi_uc count, value[4]; + while (left > 0) + { + stbi_uc count, value[4]; - count = stbi__get8(s); - if(stbi__at_eof(s)) - return stbi__errpuc("bad file", "file too short (pure read count)"); + count = stbi__get8(s); + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "file too short (pure read count)"); - if(count > left) - count = ( stbi_uc )left; + if (count > left) + count = (stbi_uc)left; - if(!stbi__readval(s, packet->channel, value)) - return 0; + if (!stbi__readval(s, packet->channel, value)) + return 0; - for(i = 0; i < count; ++i, dest += 4) - stbi__copyval(packet->channel, dest, value); - left -= count; - } + for (i = 0; i < count; ++i, dest += 4) + stbi__copyval(packet->channel, dest, value); + left -= count; } - break; + } + break; - case 2: - { // Mixed RLE - int left = width; - while(left > 0) - { - int count = stbi__get8(s), i; - if(stbi__at_eof(s)) - return stbi__errpuc("bad file", "file too short (mixed read count)"); + case 2: + { // Mixed RLE + int left = width; + while (left > 0) + { + int count = stbi__get8(s), i; + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "file too short (mixed read count)"); - if(count >= 128) - { // Repeated - stbi_uc value[4]; + if (count >= 128) + { // Repeated + stbi_uc value[4]; - if(count == 128) - count = stbi__get16be(s); - else - count -= 127; - if(count > left) - return stbi__errpuc("bad file", "scanline overrun"); + if (count == 128) + count = stbi__get16be(s); + else + count -= 127; + if (count > left) + return stbi__errpuc("bad file", "scanline overrun"); - if(!stbi__readval(s, packet->channel, value)) - return 0; + if (!stbi__readval(s, packet->channel, value)) + return 0; - for(i = 0; i < count; ++i, dest += 4) - stbi__copyval(packet->channel, dest, value); - } - else - { // Raw - ++count; - if(count > left) - return stbi__errpuc("bad file", "scanline overrun"); + for (i = 0; i < count; ++i, dest += 4) + stbi__copyval(packet->channel, dest, value); + } + else + { // Raw + ++count; + if (count > left) + return stbi__errpuc("bad file", "scanline overrun"); - for(i = 0; i < count; ++i, dest += 4) - if(!stbi__readval(s, packet->channel, dest)) - return 0; - } - left -= count; + for (i = 0; i < count; ++i, dest += 4) + if (!stbi__readval(s, packet->channel, dest)) + return 0; } - break; + left -= count; } + break; + } } } } @@ -6903,35 +6898,35 @@ static void* stbi__pic_load(stbi__context* s, int* px, int* py, int* comp, int r int i, x, y, internal_comp; STBI_NOTUSED(ri); - if(!comp) + if (!comp) comp = &internal_comp; - for(i = 0; i < 92; ++i) + for (i = 0; i < 92; ++i) stbi__get8(s); x = stbi__get16be(s); y = stbi__get16be(s); - if(stbi__at_eof(s)) + if (stbi__at_eof(s)) return stbi__errpuc("bad file", "file too short (pic header)"); - if(!stbi__mad3sizes_valid(x, y, 4, 0)) + if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode"); - stbi__get32be(s); // skip `ratio' - stbi__get16be(s); // skip `fields' - stbi__get16be(s); // skip `pad' + stbi__get32be(s); // skip `ratio' + stbi__get16be(s); // skip `fields' + stbi__get16be(s); // skip `pad' // intermediate buffer is RGBA - result = ( stbi_uc* )stbi__malloc_mad3(x, y, 4, 0); + result = (stbi_uc*)stbi__malloc_mad3(x, y, 4, 0); memset(result, 0xff, (size_t)x * y * 4); - if(!stbi__pic_load_core(s, x, y, comp, result)) + if (!stbi__pic_load_core(s, x, y, comp, result)) { STBI_FREE(result); result = 0; } *px = x; *py = y; - if(req_comp == 0) + if (req_comp == 0) req_comp = *comp; result = stbi__convert_format(result, 4, req_comp, x, y); @@ -6960,8 +6955,8 @@ typedef struct typedef struct { int w, h; - stbi_uc* out; // output buffer (always 4 components) - stbi_uc* background; // The current "background" as far as a gif is concerned + stbi_uc* out; // output buffer (always 4 components) + stbi_uc* background; // The current "background" as far as a gif is concerned stbi_uc* history; int flags, bgindex, ratio, transparent, eflags; stbi_uc pal[256][4]; @@ -6980,12 +6975,12 @@ typedef struct static int stbi__gif_test_raw(stbi__context* s) { int sz; - if(stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0; sz = stbi__get8(s); - if(sz != '9' && sz != '7') + if (sz != '9' && sz != '7') return 0; - if(stbi__get8(s) != 'a') + if (stbi__get8(s) != 'a') return 0; return 1; } @@ -7000,7 +6995,7 @@ static int stbi__gif_test(stbi__context* s) static void stbi__gif_parse_colortable(stbi__context* s, stbi_uc pal[256][4], int num_entries, int transp) { int i; - for(i = 0; i < num_entries; ++i) + for (i = 0; i < num_entries; ++i) { pal[i][2] = stbi__get8(s); pal[i][1] = stbi__get8(s); @@ -7012,13 +7007,13 @@ static void stbi__gif_parse_colortable(stbi__context* s, stbi_uc pal[256][4], in static int stbi__gif_header(stbi__context* s, stbi__gif* g, int* comp, int is_info) { stbi_uc version; - if(stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return stbi__err("not GIF", "Corrupt GIF"); version = stbi__get8(s); - if(version != '7' && version != '9') + if (version != '7' && version != '9') return stbi__err("not GIF", "Corrupt GIF"); - if(stbi__get8(s) != 'a') + if (stbi__get8(s) != 'a') return stbi__err("not GIF", "Corrupt GIF"); stbi__g_failure_reason = ""; @@ -7029,13 +7024,13 @@ static int stbi__gif_header(stbi__context* s, stbi__gif* g, int* comp, int is_in g->ratio = stbi__get8(s); g->transparent = -1; - if(comp != 0) - *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments + if (comp != 0) + *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments - if(is_info) + if (is_info) return 1; - if(g->flags & 0x80) + if (g->flags & 0x80) stbi__gif_parse_colortable(s, g->pal, 2 << (g->flags & 7), -1); return 1; @@ -7043,16 +7038,16 @@ static int stbi__gif_header(stbi__context* s, stbi__gif* g, int* comp, int is_in static int stbi__gif_info_raw(stbi__context* s, int* x, int* y, int* comp) { - stbi__gif* g = ( stbi__gif* )stbi__malloc(sizeof(stbi__gif)); - if(!stbi__gif_header(s, g, comp, 1)) + stbi__gif* g = (stbi__gif*)stbi__malloc(sizeof(stbi__gif)); + if (!stbi__gif_header(s, g, comp, 1)) { STBI_FREE(g); stbi__rewind(s); return 0; } - if(x) + if (x) *x = g->w; - if(y) + if (y) *y = g->h; STBI_FREE(g); return 1; @@ -7065,10 +7060,10 @@ static void stbi__out_gif_code(stbi__gif* g, stbi__uint16 code) // recurse to decode the prefixes, since the linked-list is backwards, // and working backwards through an interleaved image would be nasty - if(g->codes[code].prefix >= 0) + if (g->codes[code].prefix >= 0) stbi__out_gif_code(g, g->codes[code].prefix); - if(g->cur_y >= g->max_y) + if (g->cur_y >= g->max_y) return; idx = g->cur_x + g->cur_y; @@ -7076,8 +7071,8 @@ static void stbi__out_gif_code(stbi__gif* g, stbi__uint16 code) g->history[idx / 4] = 1; c = &g->color_table[g->codes[code].suffix * 4]; - if(c[3] > 128) - { // don't render transparent pixels; + if (c[3] > 128) + { // don't render transparent pixels; p[0] = c[2]; p[1] = c[1]; p[2] = c[0]; @@ -7085,12 +7080,12 @@ static void stbi__out_gif_code(stbi__gif* g, stbi__uint16 code) } g->cur_x += 4; - if(g->cur_x >= g->max_x) + if (g->cur_x >= g->max_x) { g->cur_x = g->start_x; g->cur_y += g->step; - while(g->cur_y >= g->max_y && g->parse > 0) + while (g->cur_y >= g->max_y && g->parse > 0) { g->step = (1 << g->parse) * g->line_size; g->cur_y = g->start_y + (g->step >> 1); @@ -7108,7 +7103,7 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g) stbi__gif_lzw* p; lzw_cs = stbi__get8(s); - if(lzw_cs > 12) + if (lzw_cs > 12) return NULL; clear = 1 << lzw_cs; first = 1; @@ -7116,11 +7111,11 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g) codemask = (1 << codesize) - 1; bits = 0; valid_bits = 0; - for(init_code = 0; init_code < clear; init_code++) + for (init_code = 0; init_code < clear; init_code++) { g->codes[init_code].prefix = -1; - g->codes[init_code].first = ( stbi_uc )init_code; - g->codes[init_code].suffix = ( stbi_uc )init_code; + g->codes[init_code].first = (stbi_uc)init_code; + g->codes[init_code].suffix = (stbi_uc)init_code; } // support no starting clear code @@ -7128,18 +7123,18 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g) oldcode = -1; len = 0; - for(;;) + for (;;) { - if(valid_bits < codesize) + if (valid_bits < codesize) { - if(len == 0) + if (len == 0) { - len = stbi__get8(s); // start new block - if(len == 0) + len = stbi__get8(s); // start new block + if (len == 0) return g->out; } --len; - bits |= ( stbi__int32 )stbi__get8(s) << valid_bits; + bits |= (stbi__int32)stbi__get8(s) << valid_bits; valid_bits += 8; } else @@ -7148,46 +7143,46 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g) bits >>= codesize; valid_bits -= codesize; // @OPTIMIZE: is there some way we can accelerate the non-clear path? - if(code == clear) - { // clear code + if (code == clear) + { // clear code codesize = lzw_cs + 1; codemask = (1 << codesize) - 1; avail = clear + 2; oldcode = -1; first = 0; } - else if(code == clear + 1) - { // end of stream code + else if (code == clear + 1) + { // end of stream code stbi__skip(s, len); - while((len = stbi__get8(s)) > 0) + while ((len = stbi__get8(s)) > 0) stbi__skip(s, len); return g->out; } - else if(code <= avail) + else if (code <= avail) { - if(first) + if (first) { return stbi__errpuc("no clear code", "Corrupt GIF"); } - if(oldcode >= 0) + if (oldcode >= 0) { p = &g->codes[avail++]; - if(avail > 8192) + if (avail > 8192) { return stbi__errpuc("too many codes", "Corrupt GIF"); } - p->prefix = ( stbi__int16 )oldcode; + p->prefix = (stbi__int16)oldcode; p->first = g->codes[oldcode].first; p->suffix = (code == avail) ? p->first : g->codes[code].first; } - else if(code == avail) + else if (code == avail) return stbi__errpuc("illegal code in raster", "Corrupt GIF"); - stbi__out_gif_code(g, ( stbi__uint16 )code); + stbi__out_gif_code(g, (stbi__uint16)code); - if((avail & codemask) == 0 && avail <= 0x0FFF) + if ((avail & codemask) == 0 && avail <= 0x0FFF) { codesize++; codemask = (1 << codesize) - 1; @@ -7214,22 +7209,22 @@ static stbi_uc* stbi__gif_load_next(stbi__context* s, stbi__gif* g, int* comp, i // on first frame, any non-written pixels get the background colour (non-transparent) first_frame = 0; - if(g->out == 0) - { - if(!stbi__gif_header(s, g, comp, 0)) - return 0; // stbi__g_failure_reason set by stbi__gif_header - g->out = ( stbi_uc* )stbi__malloc(4 * (size_t)(g->w) * g->h); - g->background = ( stbi_uc* )stbi__malloc(4 * (size_t)(g->w) * g->h); - g->history = ( stbi_uc* )stbi__malloc((size_t)(g->w) * g->h); - if(g->out == 0) + if (g->out == 0) + { + if (!stbi__gif_header(s, g, comp, 0)) + return 0; // stbi__g_failure_reason set by stbi__gif_header + g->out = (stbi_uc*)stbi__malloc(4 * (size_t)(g->w) * g->h); + g->background = (stbi_uc*)stbi__malloc(4 * (size_t)(g->w) * g->h); + g->history = (stbi_uc*)stbi__malloc((size_t)(g->w) * g->h); + if (g->out == 0) return stbi__errpuc("outofmem", "Out of memory"); // image is treated as "tranparent" at the start - ie, nothing overwrites the current background; // background colour is only used for pixels that are not rendered first frame, after that "background" // color refers to teh color that was there the previous frame. memset(g->out, 0x00, 4 * (size_t)(g->w) * g->h); - memset(g->background, 0x00, 4 * (size_t)(g->w) * g->h); // state of the background (starts transparent) - memset(g->history, 0x00, (size_t)(g->w) * g->h); // pixels that were affected previous frame + memset(g->background, 0x00, 4 * (size_t)(g->w) * g->h); // state of the background (starts transparent) + memset(g->history, 0x00, (size_t)(g->w) * g->h); // pixels that were affected previous frame first_frame = 1; } else @@ -7238,27 +7233,27 @@ static stbi_uc* stbi__gif_load_next(stbi__context* s, stbi__gif* g, int* comp, i dispose = (g->eflags & 0x1C) >> 2; pcount = g->w * g->h; - if((dispose == 3) && (two_back == 0)) + if ((dispose == 3) && (two_back == 0)) { - dispose = 2; // if I don't have an image to revert back to, default to the old background + dispose = 2; // if I don't have an image to revert back to, default to the old background } - if(dispose == 3) - { // use previous graphic - for(pi = 0; pi < pcount; ++pi) + if (dispose == 3) + { // use previous graphic + for (pi = 0; pi < pcount; ++pi) { - if(g->history[pi]) + if (g->history[pi]) { memcpy(&g->out[pi * 4], &two_back[pi * 4], 4); } } } - else if(dispose == 2) + else if (dispose == 2) { // restore what was changed last frame to background before that frame; - for(pi = 0; pi < pcount; ++pi) + for (pi = 0; pi < pcount; ++pi) { - if(g->history[pi]) + if (g->history[pi]) { memcpy(&g->out[pi * 4], &g->background[pi * 4], 4); } @@ -7277,139 +7272,139 @@ static stbi_uc* stbi__gif_load_next(stbi__context* s, stbi__gif* g, int* comp, i } // clear my history; - memset(g->history, 0x00, (size_t)(g->w) * g->h); // pixels that were affected previous frame + memset(g->history, 0x00, (size_t)(g->w) * g->h); // pixels that were affected previous frame - for(;;) + for (;;) { int tag = stbi__get8(s); - switch(tag) + switch (tag) + { + case 0x2C: /* Image Descriptor */ { - case 0x2C: /* Image Descriptor */ + stbi__int32 x, y, w, h; + stbi_uc* o; + + x = stbi__get16le(s); + y = stbi__get16le(s); + w = stbi__get16le(s); + h = stbi__get16le(s); + if (((x + w) > (g->w)) || ((y + h) > (g->h))) + return stbi__errpuc("bad Image Descriptor", "Corrupt GIF"); + + g->line_size = g->w * 4; + g->start_x = x * 4; + g->start_y = y * g->line_size; + g->max_x = g->start_x + w * 4; + g->max_y = g->start_y + h * g->line_size; + g->cur_x = g->start_x; + g->cur_y = g->start_y; + + g->lflags = stbi__get8(s); + + if (g->lflags & 0x40) { - stbi__int32 x, y, w, h; - stbi_uc* o; - - x = stbi__get16le(s); - y = stbi__get16le(s); - w = stbi__get16le(s); - h = stbi__get16le(s); - if(((x + w) > (g->w)) || ((y + h) > (g->h))) - return stbi__errpuc("bad Image Descriptor", "Corrupt GIF"); - - g->line_size = g->w * 4; - g->start_x = x * 4; - g->start_y = y * g->line_size; - g->max_x = g->start_x + w * 4; - g->max_y = g->start_y + h * g->line_size; - g->cur_x = g->start_x; - g->cur_y = g->start_y; - - g->lflags = stbi__get8(s); - - if(g->lflags & 0x40) - { - g->step = 8 * g->line_size; // first interlaced spacing - g->parse = 3; - } - else - { - g->step = g->line_size; - g->parse = 0; - } + g->step = 8 * g->line_size; // first interlaced spacing + g->parse = 3; + } + else + { + g->step = g->line_size; + g->parse = 0; + } - if(g->lflags & 0x80) - { - stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7), - g->eflags & 0x01 ? g->transparent : -1); - g->color_table = ( stbi_uc* )g->lpal; - } - else if(g->flags & 0x80) - { - g->color_table = ( stbi_uc* )g->pal; - } - else - return stbi__errpuc("missing color table", "Corrupt GIF"); + if (g->lflags & 0x80) + { + stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7), + g->eflags & 0x01 ? g->transparent : -1); + g->color_table = (stbi_uc*)g->lpal; + } + else if (g->flags & 0x80) + { + g->color_table = (stbi_uc*)g->pal; + } + else + return stbi__errpuc("missing color table", "Corrupt GIF"); - o = stbi__process_gif_raster(s, g); - if(o == NULL) - return NULL; + o = stbi__process_gif_raster(s, g); + if (o == NULL) + return NULL; - // if this was the first frame, - pcount = g->w * g->h; - if(first_frame && (g->bgindex > 0)) + // if this was the first frame, + pcount = g->w * g->h; + if (first_frame && (g->bgindex > 0)) + { + // if first frame, any pixel not drawn to gets the background color + for (pi = 0; pi < pcount; ++pi) { - // if first frame, any pixel not drawn to gets the background color - for(pi = 0; pi < pcount; ++pi) + if (g->history[pi] == 0) { - if(g->history[pi] == 0) - { - g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will - // be reset next frame if need be; - memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4); - } + g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will + // be reset next frame if need be; + memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4); } } - - return o; } - case 0x21: // Comment Extension. - { - int len; - int ext = stbi__get8(s); - if(ext == 0xF9) - { // Graphic Control Extension. - len = stbi__get8(s); - if(len == 4) - { - g->eflags = stbi__get8(s); - g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths. + return o; + } - // unset old transparent - if(g->transparent >= 0) - { - g->pal[g->transparent][3] = 255; - } - if(g->eflags & 0x01) - { - g->transparent = stbi__get8(s); - if(g->transparent >= 0) - { - g->pal[g->transparent][3] = 0; - } - } - else + case 0x21: // Comment Extension. + { + int len; + int ext = stbi__get8(s); + if (ext == 0xF9) + { // Graphic Control Extension. + len = stbi__get8(s); + if (len == 4) + { + g->eflags = stbi__get8(s); + g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths. + + // unset old transparent + if (g->transparent >= 0) + { + g->pal[g->transparent][3] = 255; + } + if (g->eflags & 0x01) + { + g->transparent = stbi__get8(s); + if (g->transparent >= 0) { - // don't need transparent - stbi__skip(s, 1); - g->transparent = -1; + g->pal[g->transparent][3] = 0; } } else { - stbi__skip(s, len); - break; + // don't need transparent + stbi__skip(s, 1); + g->transparent = -1; } } - while((len = stbi__get8(s)) != 0) + else { stbi__skip(s, len); + break; } - break; } + while ((len = stbi__get8(s)) != 0) + { + stbi__skip(s, len); + } + break; + } - case 0x3B: // gif stream termination code - return ( stbi_uc* )s; // using '1' causes warning on some compilers + case 0x3B: // gif stream termination code + return (stbi_uc*)s; // using '1' causes warning on some compilers - default: - return stbi__errpuc("unknown code", "Corrupt GIF"); + default: + return stbi__errpuc("unknown code", "Corrupt GIF"); } } } static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y, int* z, int* comp, int req_comp) { - if(stbi__gif_test(s)) + if (stbi__gif_test(s)) { int layers = 0; stbi_uc* u = 0; @@ -7418,7 +7413,7 @@ static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y, stbi__gif g; int stride; memset(&g, 0, sizeof(g)); - if(delays) + if (delays) { *delays = 0; } @@ -7426,44 +7421,44 @@ static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y, do { u = stbi__gif_load_next(s, &g, comp, req_comp, two_back); - if(u == ( stbi_uc* )s) - u = 0; // end of animated gif marker + if (u == (stbi_uc*)s) + u = 0; // end of animated gif marker - if(u) + if (u) { *x = g.w; *y = g.h; ++layers; stride = g.w * g.h * 4; - if(out) + if (out) { - out = ( stbi_uc* )STBI_REALLOC(out, (size_t)layers * stride); - if(delays) + out = (stbi_uc*)STBI_REALLOC(out, (size_t)layers * stride); + if (delays) { - *delays = ( int* )STBI_REALLOC(*delays, sizeof(int) * layers); + *delays = (int*)STBI_REALLOC(*delays, sizeof(int) * layers); } } else { - out = ( stbi_uc* )stbi__malloc((size_t)layers * stride); - if(delays) + out = (stbi_uc*)stbi__malloc((size_t)layers * stride); + if (delays) { - *delays = ( int* )stbi__malloc(layers * sizeof(int)); + *delays = (int*)stbi__malloc(layers * sizeof(int)); } } memcpy(out + ((layers - 1) * stride), u, stride); - if(layers >= 2) + if (layers >= 2) { two_back = out - 2 * stride; } - if(delays) + if (delays) { (*delays)[layers - 1U] = g.delay; } } - } while(u != 0); + } while (u != 0); // free temp buffer; STBI_FREE(g.out); @@ -7471,7 +7466,7 @@ static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y, STBI_FREE(g.background); // do the final conversion after loading everything; - if(req_comp && req_comp != 4) + if (req_comp && req_comp != 4) out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h); *z = layers; @@ -7490,16 +7485,16 @@ static void* stbi__gif_load(stbi__context* s, int* x, int* y, int* comp, int req memset(&g, 0, sizeof(g)); u = stbi__gif_load_next(s, &g, comp, req_comp, 0); - if(u == ( stbi_uc* )s) - u = 0; // end of animated gif marker - if(u) + if (u == (stbi_uc*)s) + u = 0; // end of animated gif marker + if (u) { *x = g.w; *y = g.h; // moved conversion to after successful load so that the same // can be done for multiple frames. - if(req_comp && req_comp != 4) + if (req_comp && req_comp != 4) u = stbi__convert_format(u, 4, req_comp, g.w, g.h); } @@ -7523,8 +7518,8 @@ static int stbi__gif_info(stbi__context* s, int* x, int* y, int* comp) static int stbi__hdr_test_core(stbi__context* s, const char* signature) { int i; - for(i = 0; signature[i]; ++i) - if(stbi__get8(s) != signature[i]) + for (i = 0; signature[i]; ++i) + if (stbi__get8(s) != signature[i]) return 0; stbi__rewind(s); return 1; @@ -7534,7 +7529,7 @@ static int stbi__hdr_test(stbi__context* s) { int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); stbi__rewind(s); - if(!r) + if (!r) { r = stbi__hdr_test_core(s, "#?RGBE\n"); stbi__rewind(s); @@ -7548,19 +7543,19 @@ static char* stbi__hdr_gettoken(stbi__context* z, char* buffer) int len = 0; char c = '\0'; - c = ( char )stbi__get8(z); + c = (char)stbi__get8(z); - while(!stbi__at_eof(z) && c != '\n') + while (!stbi__at_eof(z) && c != '\n') { buffer[len++] = c; - if(len == STBI__HDR_BUFLEN - 1) + if (len == STBI__HDR_BUFLEN - 1) { // flush to end of line - while(!stbi__at_eof(z) && stbi__get8(z) != '\n') + while (!stbi__at_eof(z) && stbi__get8(z) != '\n') ; break; } - c = ( char )stbi__get8(z); + c = (char)stbi__get8(z); } buffer[len] = 0; @@ -7569,12 +7564,12 @@ static char* stbi__hdr_gettoken(stbi__context* z, char* buffer) static void stbi__hdr_convert(float* output, stbi_uc* input, int req_comp) { - if(input[3] != 0) + if (input[3] != 0) { float f1; // Exponent - f1 = ( float )ldexp(1.0f, input[3] - ( int )(128 + 8)); - if(req_comp <= 2) + f1 = (float)ldexp(1.0f, input[3] - (int)(128 + 8)); + if (req_comp <= 2) output[0] = (input[0] + input[1] + input[2]) * f1 / 3; else { @@ -7582,25 +7577,25 @@ static void stbi__hdr_convert(float* output, stbi_uc* input, int req_comp) output[1] = input[1] * f1; output[2] = input[2] * f1; } - if(req_comp == 2) + if (req_comp == 2) output[1] = 1; - if(req_comp == 4) + if (req_comp == 4) output[3] = 1; } else { - switch(req_comp) + switch (req_comp) { - case 4: - output[3] = 1; /* fallthrough */ - case 3: - output[0] = output[1] = output[2] = 0; - break; - case 2: - output[1] = 1; /* fallthrough */ - case 1: - output[0] = 0; - break; + case 4: + output[3] = 1; /* fallthrough */ + case 3: + output[0] = output[1] = output[2] = 0; + break; + case 2: + output[1] = 1; /* fallthrough */ + case 1: + output[0] = 0; + break; } } } @@ -7621,63 +7616,63 @@ static float* stbi__hdr_load(stbi__context* s, int* x, int* y, int* comp, int re // Check identifier headerToken = stbi__hdr_gettoken(s, buffer); - if(strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0) + if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0) return stbi__errpf("not HDR", "Corrupt HDR image"); // Parse header - for(;;) + for (;;) { token = stbi__hdr_gettoken(s, buffer); - if(token[0] == 0) + if (token[0] == 0) break; - if(strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; } - if(!valid) + if (!valid) return stbi__errpf("unsupported format", "Unsupported HDR format"); // Parse width and height // can't use sscanf() if we're not using stdio! token = stbi__hdr_gettoken(s, buffer); - if(strncmp(token, "-Y ", 3)) + if (strncmp(token, "-Y ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format"); token += 3; - height = ( int )strtol(token, &token, 10); - while(*token == ' ') + height = (int)strtol(token, &token, 10); + while (*token == ' ') ++token; - if(strncmp(token, "+X ", 3)) + if (strncmp(token, "+X ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format"); token += 3; - width = ( int )strtol(token, NULL, 10); + width = (int)strtol(token, NULL, 10); *x = width; *y = height; - if(comp) + if (comp) *comp = 3; - if(req_comp == 0) + if (req_comp == 0) req_comp = 3; - if(!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0)) + if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0)) return stbi__errpf("too large", "HDR image is too large"); // Read data - hdr_data = ( float* )stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0); - if(!hdr_data) + hdr_data = (float*)stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0); + if (!hdr_data) return stbi__errpf("outofmem", "Out of memory"); // Load image data // image data is stored as some number of sca - if(width < 8 || width >= 32768) + if (width < 8 || width >= 32768) { // Read flat data - for(j = 0; j < height; ++j) + for (j = 0; j < height; ++j) { - for(i = 0; i < width; ++i) + for (i = 0; i < width; ++i) { stbi_uc rgbe[4]; - main_decode_loop: +main_decode_loop: stbi__getn(s, rgbe, 4); stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp); } @@ -7688,83 +7683,83 @@ static float* stbi__hdr_load(stbi__context* s, int* x, int* y, int* comp, int re // Read RLE-encoded data scanline = NULL; - for(j = 0; j < height; ++j) + for (j = 0; j < height; ++j) { c1 = stbi__get8(s); c2 = stbi__get8(s); len = stbi__get8(s); - if(c1 != 2 || c2 != 2 || (len & 0x80)) + if (c1 != 2 || c2 != 2 || (len & 0x80)) { // not run-length encoded, so we have to actually use THIS data as a decoded // pixel (note this can't be a valid pixel--one of RGB must be >= 128) stbi_uc rgbe[4]; - rgbe[0] = ( stbi_uc )c1; - rgbe[1] = ( stbi_uc )c2; - rgbe[2] = ( stbi_uc )len; - rgbe[3] = ( stbi_uc )stbi__get8(s); + rgbe[0] = (stbi_uc)c1; + rgbe[1] = (stbi_uc)c2; + rgbe[2] = (stbi_uc)len; + rgbe[3] = (stbi_uc)stbi__get8(s); stbi__hdr_convert(hdr_data, rgbe, req_comp); i = 1; j = 0; STBI_FREE(scanline); - goto main_decode_loop; // yes, this makes no sense + goto main_decode_loop; // yes, this makes no sense } len <<= 8; len |= stbi__get8(s); - if(len != width) + if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); } - if(scanline == NULL) + if (scanline == NULL) { - scanline = ( stbi_uc* )stbi__malloc_mad2(width, 4, 0); - if(!scanline) + scanline = (stbi_uc*)stbi__malloc_mad2(width, 4, 0); + if (!scanline) { STBI_FREE(hdr_data); return stbi__errpf("outofmem", "Out of memory"); } } - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { int nleft; i = 0; - while((nleft = width - i) > 0) + while ((nleft = width - i) > 0) { count = stbi__get8(s); - if(count > 128) + if (count > 128) { // Run value = stbi__get8(s); count -= 128; - if(count > nleft) + if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } - for(z = 0; z < count; ++z) + for (z = 0; z < count; ++z) scanline[i++ * 4 + k] = value; } else { // Dump - if(count > nleft) + if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } - for(z = 0; z < count; ++z) + for (z = 0; z < count; ++z) scanline[i++ * 4 + k] = stbi__get8(s); } } } - for(i = 0; i < width; ++i) + for (i = 0; i < width; ++i) stbi__hdr_convert(hdr_data + (j * width + i) * req_comp, scanline + i * 4, req_comp); } - if(scanline) + if (scanline) STBI_FREE(scanline); } @@ -7778,54 +7773,54 @@ static int stbi__hdr_info(stbi__context* s, int* x, int* y, int* comp) int valid = 0; int dummy; - if(!x) + if (!x) x = &dummy; - if(!y) + if (!y) y = &dummy; - if(!comp) + if (!comp) comp = &dummy; - if(stbi__hdr_test(s) == 0) + if (stbi__hdr_test(s) == 0) { stbi__rewind(s); return 0; } - for(;;) + for (;;) { token = stbi__hdr_gettoken(s, buffer); - if(token[0] == 0) + if (token[0] == 0) break; - if(strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; } - if(!valid) + if (!valid) { stbi__rewind(s); return 0; } token = stbi__hdr_gettoken(s, buffer); - if(strncmp(token, "-Y ", 3)) + if (strncmp(token, "-Y ", 3)) { stbi__rewind(s); return 0; } token += 3; - *y = ( int )strtol(token, &token, 10); - while(*token == ' ') + *y = (int)strtol(token, &token, 10); + while (*token == ' ') ++token; - if(strncmp(token, "+X ", 3)) + if (strncmp(token, "+X ", 3)) { stbi__rewind(s); return 0; } token += 3; - *x = ( int )strtol(token, NULL, 10); + *x = (int)strtol(token, NULL, 10); *comp = 3; return 1; } -#endif // STBI_NO_HDR +#endif // STBI_NO_HDR #ifndef STBI_NO_BMP static int stbi__bmp_info(stbi__context* s, int* x, int* y, int* comp) @@ -7836,13 +7831,13 @@ static int stbi__bmp_info(stbi__context* s, int* x, int* y, int* comp) info.all_a = 255; p = stbi__bmp_parse_header(s, &info); stbi__rewind(s); - if(p == NULL) + if (p == NULL) return 0; - if(x) + if (x) *x = s->img_x; - if(y) + if (y) *y = s->img_y; - if(comp) + if (comp) *comp = info.ma ? 4 : 3; return 1; } @@ -7852,25 +7847,25 @@ static int stbi__bmp_info(stbi__context* s, int* x, int* y, int* comp) static int stbi__psd_info(stbi__context* s, int* x, int* y, int* comp) { int channelCount, dummy, depth; - if(!x) + if (!x) x = &dummy; - if(!y) + if (!y) y = &dummy; - if(!comp) + if (!comp) comp = &dummy; - if(stbi__get32be(s) != 0x38425053) + if (stbi__get32be(s) != 0x38425053) { stbi__rewind(s); return 0; } - if(stbi__get16be(s) != 1) + if (stbi__get16be(s) != 1) { stbi__rewind(s); return 0; } stbi__skip(s, 6); channelCount = stbi__get16be(s); - if(channelCount < 0 || channelCount > 16) + if (channelCount < 0 || channelCount > 16) { stbi__rewind(s); return 0; @@ -7878,12 +7873,12 @@ static int stbi__psd_info(stbi__context* s, int* x, int* y, int* comp) *y = stbi__get32be(s); *x = stbi__get32be(s); depth = stbi__get16be(s); - if(depth != 8 && depth != 16) + if (depth != 8 && depth != 16) { stbi__rewind(s); return 0; } - if(stbi__get16be(s) != 3) + if (stbi__get16be(s) != 3) { stbi__rewind(s); return 0; @@ -7895,27 +7890,27 @@ static int stbi__psd_info(stbi__context* s, int* x, int* y, int* comp) static int stbi__psd_is16(stbi__context* s) { int channelCount, depth; - if(stbi__get32be(s) != 0x38425053) + if (stbi__get32be(s) != 0x38425053) { stbi__rewind(s); return 0; } - if(stbi__get16be(s) != 1) + if (stbi__get16be(s) != 1) { stbi__rewind(s); return 0; } stbi__skip(s, 6); channelCount = stbi__get16be(s); - if(channelCount < 0 || channelCount > 16) + if (channelCount < 0 || channelCount > 16) { stbi__rewind(s); return 0; } - ( void )stbi__get32be(s); - ( void )stbi__get32be(s); + (void)stbi__get32be(s); + (void)stbi__get32be(s); depth = stbi__get16be(s); - if(depth != 16) + if (depth != 16) { stbi__rewind(s); return 0; @@ -7930,14 +7925,14 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp) int act_comp = 0, num_packets = 0, chained, dummy; stbi__pic_packet packets[10]; - if(!x) + if (!x) x = &dummy; - if(!y) + if (!y) y = &dummy; - if(!comp) + if (!comp) comp = &dummy; - if(!stbi__pic_is4(s, "\x53\x80\xF6\x34")) + if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) { stbi__rewind(s); return 0; @@ -7947,12 +7942,12 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp) *x = stbi__get16be(s); *y = stbi__get16be(s); - if(stbi__at_eof(s)) + if (stbi__at_eof(s)) { stbi__rewind(s); return 0; } - if((*x) != 0 && (1 << 28) / (*x) < (*y)) + if ((*x) != 0 && (1 << 28) / (*x) < (*y)) { stbi__rewind(s); return 0; @@ -7964,7 +7959,7 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp) { stbi__pic_packet* packet; - if(num_packets == sizeof(packets) / sizeof(packets[0])) + if (num_packets == sizeof(packets) / sizeof(packets[0])) return 0; packet = &packets[num_packets++]; @@ -7974,17 +7969,17 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp) packet->channel = stbi__get8(s); act_comp |= packet->channel; - if(stbi__at_eof(s)) + if (stbi__at_eof(s)) { stbi__rewind(s); return 0; } - if(packet->size != 8) + if (packet->size != 8) { stbi__rewind(s); return 0; } - } while(chained); + } while (chained); *comp = (act_comp & 0x10 ? 4 : 3); @@ -8009,9 +8004,9 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp) static int stbi__pnm_test(stbi__context* s) { char p, t; - p = ( char )stbi__get8(s); - t = ( char )stbi__get8(s); - if(p != 'P' || (t != '5' && t != '6')) + p = (char)stbi__get8(s); + t = (char)stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { stbi__rewind(s); return 0; @@ -8024,27 +8019,27 @@ static void* stbi__pnm_load(stbi__context* s, int* x, int* y, int* comp, int req stbi_uc* out; STBI_NOTUSED(ri); - if(!stbi__pnm_info(s, ( int* )&s->img_x, ( int* )&s->img_y, ( int* )&s->img_n)) + if (!stbi__pnm_info(s, (int*)&s->img_x, (int*)&s->img_y, (int*)&s->img_n)) return 0; *x = s->img_x; *y = s->img_y; - if(comp) + if (comp) *comp = s->img_n; - if(!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0)) + if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0)) return stbi__errpuc("too large", "PNM too large"); - out = ( stbi_uc* )stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0); - if(!out) + out = (stbi_uc*)stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); stbi__getn(s, out, s->img_n * s->img_x * s->img_y); - if(req_comp && req_comp != s->img_n) + if (req_comp && req_comp != s->img_n) { out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); - if(out == NULL) - return out; // stbi__convert_format frees input on failure + if (out == NULL) + return out; // stbi__convert_format frees input on failure } return out; } @@ -8056,16 +8051,16 @@ static int stbi__pnm_isspace(char c) static void stbi__pnm_skip_whitespace(stbi__context* s, char* c) { - for(;;) + for (;;) { - while(!stbi__at_eof(s) && stbi__pnm_isspace(*c)) - *c = ( char )stbi__get8(s); + while (!stbi__at_eof(s) && stbi__pnm_isspace(*c)) + *c = (char)stbi__get8(s); - if(stbi__at_eof(s) || *c != '#') + if (stbi__at_eof(s) || *c != '#') break; - while(!stbi__at_eof(s) && *c != '\n' && *c != '\r') - *c = ( char )stbi__get8(s); + while (!stbi__at_eof(s) && *c != '\n' && *c != '\r') + *c = (char)stbi__get8(s); } } @@ -8078,10 +8073,10 @@ static int stbi__pnm_getinteger(stbi__context* s, char* c) { int value = 0; - while(!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) + while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) { value = value * 10 + (*c - '0'); - *c = ( char )stbi__get8(s); + *c = (char)stbi__get8(s); } return value; @@ -8092,38 +8087,38 @@ static int stbi__pnm_info(stbi__context* s, int* x, int* y, int* comp) int maxv, dummy; char c, p, t; - if(!x) + if (!x) x = &dummy; - if(!y) + if (!y) y = &dummy; - if(!comp) + if (!comp) comp = &dummy; stbi__rewind(s); // Get identifier - p = ( char )stbi__get8(s); - t = ( char )stbi__get8(s); - if(p != 'P' || (t != '5' && t != '6')) + p = (char)stbi__get8(s); + t = (char)stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { stbi__rewind(s); return 0; } - *comp = (t == '6') ? 3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm + *comp = (t == '6') ? 3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm - c = ( char )stbi__get8(s); + c = (char)stbi__get8(s); stbi__pnm_skip_whitespace(s, &c); - *x = stbi__pnm_getinteger(s, &c); // read width + *x = stbi__pnm_getinteger(s, &c); // read width stbi__pnm_skip_whitespace(s, &c); - *y = stbi__pnm_getinteger(s, &c); // read height + *y = stbi__pnm_getinteger(s, &c); // read height stbi__pnm_skip_whitespace(s, &c); - maxv = stbi__pnm_getinteger(s, &c); // read max value + maxv = stbi__pnm_getinteger(s, &c); // read max value - if(maxv > 255) + if (maxv > 255) return stbi__err("max value > 255", "PPM image not 8-bit"); else return 1; @@ -8133,48 +8128,48 @@ static int stbi__pnm_info(stbi__context* s, int* x, int* y, int* comp) static int stbi__info_main(stbi__context* s, int* x, int* y, int* comp) { #ifndef STBI_NO_JPEG - if(stbi__jpeg_info(s, x, y, comp)) + if (stbi__jpeg_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_PNG - if(stbi__png_info(s, x, y, comp)) + if (stbi__png_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_GIF - if(stbi__gif_info(s, x, y, comp)) + if (stbi__gif_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_BMP - if(stbi__bmp_info(s, x, y, comp)) + if (stbi__bmp_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_PSD - if(stbi__psd_info(s, x, y, comp)) + if (stbi__psd_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_PIC - if(stbi__pic_info(s, x, y, comp)) + if (stbi__pic_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_PNM - if(stbi__pnm_info(s, x, y, comp)) + if (stbi__pnm_info(s, x, y, comp)) return 1; #endif #ifndef STBI_NO_HDR - if(stbi__hdr_info(s, x, y, comp)) + if (stbi__hdr_info(s, x, y, comp)) return 1; #endif // test tga last because it's a crappy test! #ifndef STBI_NO_TGA - if(stbi__tga_info(s, x, y, comp)) + if (stbi__tga_info(s, x, y, comp)) return 1; #endif return stbi__err("unknown image type", "Image not of any known type, or corrupt"); @@ -8183,12 +8178,12 @@ static int stbi__info_main(stbi__context* s, int* x, int* y, int* comp) static int stbi__is_16_main(stbi__context* s) { #ifndef STBI_NO_PNG - if(stbi__png_is16(s)) + if (stbi__png_is16(s)) return 1; #endif #ifndef STBI_NO_PSD - if(stbi__psd_is16(s)) + if (stbi__psd_is16(s)) return 1; #endif @@ -8200,7 +8195,7 @@ extern int stbi_info(char const* filename, int* x, int* y, int* comp) { FILE* f = stbi__fopen(filename, "rb"); int result; - if(!f) + if (!f) return stbi__err("can't fopen", "Unable to open file"); result = stbi_info_from_file(f, x, y, comp); fclose(f); @@ -8222,7 +8217,7 @@ extern int stbi_is_16_bit(char const* filename) { FILE* f = stbi__fopen(filename, "rb"); int result; - if(!f) + if (!f) return stbi__err("can't fopen", "Unable to open file"); result = stbi_is_16_bit_from_file(f); fclose(f); @@ -8239,7 +8234,7 @@ extern int stbi_is_16_bit_from_file(FILE* f) fseek(f, pos, SEEK_SET); return r; } -#endif // !STBI_NO_STDIO +#endif // !STBI_NO_STDIO extern int stbi_info_from_memory(stbi_uc const* buffer, int len, int* x, int* y, int* comp) { @@ -8251,7 +8246,7 @@ extern int stbi_info_from_memory(stbi_uc const* buffer, int len, int* x, int* y, extern int stbi_info_from_callbacks(stbi_io_callbacks const* c, void* user, int* x, int* y, int* comp) { stbi__context s; - stbi__start_callbacks(&s, ( stbi_io_callbacks* )c, user); + stbi__start_callbacks(&s, (stbi_io_callbacks*)c, user); return stbi__info_main(&s, x, y, comp); } @@ -8265,11 +8260,11 @@ extern int stbi_is_16_bit_from_memory(stbi_uc const* buffer, int len) extern int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const* c, void* user) { stbi__context s; - stbi__start_callbacks(&s, ( stbi_io_callbacks* )c, user); + stbi__start_callbacks(&s, (stbi_io_callbacks*)c, user); return stbi__is_16_main(&s); } -#endif // STB_IMAGE_IMPLEMENTATION +#endif // STB_IMAGE_IMPLEMENTATION /* revision history: diff --git a/tests/common/stb_image_write.h b/tests/common/stb_image_write.h index 42b7c1796..fe585cf94 100644 --- a/tests/common/stb_image_write.h +++ b/tests/common/stb_image_write.h @@ -14,7 +14,7 @@ #endif #endif -#ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations +#ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations extern int stbi_write_tga_with_rle; extern int stbi_write_png_compression_level; extern int stbi_write_force_png_filter; @@ -40,7 +40,7 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func* func, void* context, int x, STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean); -#endif // INCLUDE_STB_IMAGE_WRITE_H +#endif // INCLUDE_STB_IMAGE_WRITE_H #define STB_IMAGE_WRITE_IMPLEMENTATION #ifdef STB_IMAGE_WRITE_IMPLEMENTATION @@ -56,7 +56,7 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean); #ifndef STBI_WRITE_NO_STDIO #include -#endif // STBI_WRITE_NO_STDIO +#endif // STBI_WRITE_NO_STDIO #include #include @@ -72,9 +72,9 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean); #endif #ifndef STBIW_MALLOC -#define STBIW_MALLOC(sz) malloc(sz) +#define STBIW_MALLOC(sz) malloc(sz) #define STBIW_REALLOC(p, newsz) realloc(p, newsz) -#define STBIW_FREE(p) free(p) +#define STBIW_FREE(p) free(p) #endif #ifndef STBIW_REALLOC_SIZED @@ -90,7 +90,7 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean); #define STBIW_ASSERT(x) assert(x) #endif -#define STBIW_UCHAR(x) ( unsigned char )(( x )&0xff) +#define STBIW_UCHAR(x) (unsigned char)((x)&0xff) #ifdef STB_IMAGE_WRITE_STATIC static int stbi__flip_vertically_on_write = 0; @@ -126,69 +126,69 @@ static void stbi__start_write_callbacks(stbi__write_context* s, stbi_write_func* static void stbi__stdio_write(void* context, void* data, int size) { - fwrite(data, 1, size, ( FILE* )context); + fwrite(data, 1, size, (FILE*)context); } static int stbi__start_write_file(stbi__write_context* s, const char* filename) { FILE* f; #ifdef STBI_MSC_SECURE_CRT - if(fopen_s(&f, filename, "wb")) + if (fopen_s(&f, filename, "wb")) f = NULL; #else f = fopen(filename, "wb"); #endif - stbi__start_write_callbacks(s, stbi__stdio_write, ( void* )f); + stbi__start_write_callbacks(s, stbi__stdio_write, (void*)f); return f != NULL; } static void stbi__end_write_file(stbi__write_context* s) { - fclose(( FILE* )s->context); + fclose((FILE*)s->context); } -#endif // !STBI_WRITE_NO_STDIO +#endif // !STBI_WRITE_NO_STDIO typedef unsigned int stbiw_uint32; typedef int stb_image_write_test[sizeof(stbiw_uint32) == 4 ? 1 : -1]; static void stbiw__writefv(stbi__write_context* s, const char* fmt, va_list v) { - while(*fmt) + while (*fmt) { - switch(*fmt++) + switch (*fmt++) { - case ' ': - break; - case '1': - { - unsigned char x = STBIW_UCHAR(va_arg(v, int)); - s->func(s->context, &x, 1); - break; - } - case '2': - { - int x = va_arg(v, int); - unsigned char b[2]; - b[0] = STBIW_UCHAR(x); - b[1] = STBIW_UCHAR(x >> 8); - s->func(s->context, b, 2); - break; - } - case '4': - { - stbiw_uint32 x = va_arg(v, int); - unsigned char b[4]; - b[0] = STBIW_UCHAR(x); - b[1] = STBIW_UCHAR(x >> 8); - b[2] = STBIW_UCHAR(x >> 16); - b[3] = STBIW_UCHAR(x >> 24); - s->func(s->context, b, 4); - break; - } - default: - STBIW_ASSERT(0); - return; + case ' ': + break; + case '1': + { + unsigned char x = STBIW_UCHAR(va_arg(v, int)); + s->func(s->context, &x, 1); + break; + } + case '2': + { + int x = va_arg(v, int); + unsigned char b[2]; + b[0] = STBIW_UCHAR(x); + b[1] = STBIW_UCHAR(x >> 8); + s->func(s->context, b, 2); + break; + } + case '4': + { + stbiw_uint32 x = va_arg(v, int); + unsigned char b[4]; + b[0] = STBIW_UCHAR(x); + b[1] = STBIW_UCHAR(x >> 8); + b[2] = STBIW_UCHAR(x >> 16); + b[3] = STBIW_UCHAR(x >> 24); + s->func(s->context, b, 4); + break; + } + default: + STBIW_ASSERT(0); + return; } } } @@ -219,33 +219,33 @@ static void stbiw__write_pixel(stbi__write_context* s, int rgb_dir, int comp, in unsigned char bg[3] = {255, 0, 255}, px[3]; int k; - if(write_alpha < 0) + if (write_alpha < 0) s->func(s->context, &d[comp - 1], 1); - switch(comp) + switch (comp) { - case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case - case 1: - if(expand_mono) - stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp - else - s->func(s->context, d, 1); // monochrome TGA - break; - case 4: - if(!write_alpha) - { - // composite against pink background - for(k = 0; k < 3; ++k) - px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255; - stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]); - break; - } - /* FALLTHROUGH */ - case 3: - stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); + case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case + case 1: + if (expand_mono) + stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp + else + s->func(s->context, d, 1); // monochrome TGA + break; + case 4: + if (!write_alpha) + { + // composite against pink background + for (k = 0; k < 3; ++k) + px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255; + stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]); break; + } + /* FALLTHROUGH */ + case 3: + stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); + break; } - if(write_alpha > 0) + if (write_alpha > 0) s->func(s->context, &d[comp - 1], 1); } @@ -255,22 +255,22 @@ static void stbiw__write_pixels(stbi__write_context* s, int rgb_dir, int vdir, i stbiw_uint32 zero = 0; int i, j, j_end; - if(y <= 0) + if (y <= 0) return; - if(stbi__flip_vertically_on_write) + if (stbi__flip_vertically_on_write) vdir *= -1; - if(vdir < 0) + if (vdir < 0) j_end = -1, j = y - 1; else j_end = y, j = 0; - for(; j != j_end; j += vdir) + for (; j != j_end; j += vdir) { - for(i = 0; i < x; ++i) + for (i = 0; i < x; ++i) { - unsigned char* d = ( unsigned char* )data + (j * x + i) * comp; + unsigned char* d = (unsigned char*)data + (j * x + i) * comp; stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d); } s->func(s->context, &zero, scanline_pad); @@ -280,7 +280,7 @@ static void stbiw__write_pixels(stbi__write_context* s, int rgb_dir, int vdir, i static int stbiw__outfile(stbi__write_context* s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void* data, int alpha, int pad, const char* fmt, ...) { - if(y < 0 || x < 0) + if (y < 0 || x < 0) { return 0; } @@ -298,11 +298,11 @@ static int stbiw__outfile(stbi__write_context* s, int rgb_dir, int vdir, int x, static int stbi_write_bmp_core(stbi__write_context* s, int x, int y, int comp, const void* data) { int pad = (-x * 3) & 3; - return stbiw__outfile(s, -1, -1, x, y, comp, 1, ( void* )data, 0, pad, + return stbiw__outfile(s, -1, -1, x, y, comp, 1, (void*)data, 0, pad, "11 4 22 4" "4 44 22 444444", - 'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0, 14 + 40, // file header - 40, x, y, 1, 24, 0, 0, 0, 0, 0, 0); // bitmap header + 'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0, 14 + 40, // file header + 40, x, y, 1, 24, 0, 0, 0, 0, 0, 0); // bitmap header } STBIWDEF int stbi_write_bmp_to_func(stbi_write_func* func, void* context, int x, int y, int comp, const void* data) @@ -316,7 +316,7 @@ STBIWDEF int stbi_write_bmp_to_func(stbi_write_func* func, void* context, int x, STBIWDEF int stbi_write_bmp(char const* filename, int x, int y, int comp, const void* data) { stbi__write_context s; - if(stbi__start_write_file(&s, filename)) + if (stbi__start_write_file(&s, filename)) { int r = stbi_write_bmp_core(&s, x, y, comp, data); stbi__end_write_file(&s); @@ -325,20 +325,20 @@ STBIWDEF int stbi_write_bmp(char const* filename, int x, int y, int comp, const else return 0; } -#endif //! STBI_WRITE_NO_STDIO +#endif //! STBI_WRITE_NO_STDIO static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, void* data) { int has_alpha = (comp == 2 || comp == 4); int colorbytes = has_alpha ? comp - 1 : comp; - int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3 + int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3 - if(y < 0 || x < 0) + if (y < 0 || x < 0) return 0; - if(!stbi_write_tga_with_rle) + if (!stbi_write_tga_with_rle) { - return stbiw__outfile(s, -1, -1, x, y, comp, 0, ( void* )data, has_alpha, 0, "111 221 2222 11", 0, 0, format, 0, + return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void*)data, has_alpha, 0, "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8); } else @@ -349,7 +349,7 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v stbiw__writef(s, "111 221 2222 11", 0, 0, format + 8, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8); - if(stbi__flip_vertically_on_write) + if (stbi__flip_vertically_on_write) { j = 0; jend = y; @@ -361,27 +361,27 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v jend = -1; jdir = -1; } - for(; j != jend; j += jdir) + for (; j != jend; j += jdir) { - unsigned char* row = ( unsigned char* )data + j * x * comp; + unsigned char* row = (unsigned char*)data + j * x * comp; int len; - for(i = 0; i < x; i += len) + for (i = 0; i < x; i += len) { unsigned char* begin = row + i * comp; int diff = 1; len = 1; - if(i < x - 1) + if (i < x - 1) { ++len; diff = memcmp(begin, row + (i + 1) * comp, comp); - if(diff) + if (diff) { const unsigned char* prev = begin; - for(k = i + 2; k < x && len < 128; ++k) + for (k = i + 2; k < x && len < 128; ++k) { - if(memcmp(prev, row + k * comp, comp)) + if (memcmp(prev, row + k * comp, comp)) { prev += comp; ++len; @@ -395,9 +395,9 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v } else { - for(k = i + 2; k < x && len < 128; ++k) + for (k = i + 2; k < x && len < 128; ++k) { - if(!memcmp(begin, row + k * comp, comp)) + if (!memcmp(begin, row + k * comp, comp)) { ++len; } @@ -409,11 +409,11 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v } } - if(diff) + if (diff) { unsigned char header = STBIW_UCHAR(len - 1); s->func(s->context, &header, 1); - for(k = 0; k < len; ++k) + for (k = 0; k < len; ++k) { stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp); } @@ -434,16 +434,16 @@ STBIWDEF int stbi_write_tga_to_func(stbi_write_func* func, void* context, int x, { stbi__write_context s; stbi__start_write_callbacks(&s, func, context); - return stbi_write_tga_core(&s, x, y, comp, ( void* )data); + return stbi_write_tga_core(&s, x, y, comp, (void*)data); } #ifndef STBI_WRITE_NO_STDIO STBIWDEF int stbi_write_tga(char const* filename, int x, int y, int comp, const void* data) { stbi__write_context s; - if(stbi__start_write_file(&s, filename)) + if (stbi__start_write_file(&s, filename)) { - int r = stbi_write_tga_core(&s, x, y, comp, ( void* )data); + int r = stbi_write_tga_core(&s, x, y, comp, (void*)data); stbi__end_write_file(&s); return r; } @@ -463,18 +463,18 @@ void stbiw__linear_to_rgbe(unsigned char* rgbe, float* linear) int exponent; float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2])); - if(maxcomp < 1e-32f) + if (maxcomp < 1e-32f) { rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0; } else { - float normalize = ( float )frexp(maxcomp, &exponent) * 256.0f / maxcomp; + float normalize = (float)frexp(maxcomp, &exponent) * 256.0f / maxcomp; - rgbe[0] = ( unsigned char )(linear[0] * normalize); - rgbe[1] = ( unsigned char )(linear[1] * normalize); - rgbe[2] = ( unsigned char )(linear[2] * normalize); - rgbe[3] = ( unsigned char )(exponent + 128); + rgbe[0] = (unsigned char)(linear[0] * normalize); + rgbe[1] = (unsigned char)(linear[1] * normalize); + rgbe[2] = (unsigned char)(linear[2] * normalize); + rgbe[3] = (unsigned char)(exponent + 128); } } @@ -489,7 +489,7 @@ void stbiw__write_run_data(stbi__write_context* s, int length, unsigned char dat void stbiw__write_dump_data(stbi__write_context* s, int length, unsigned char* data) { unsigned char lengthbyte = STBIW_UCHAR(length); - STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code + STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code s->func(s->context, &lengthbyte, 1); s->func(s->context, data, length); } @@ -505,21 +505,21 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns scanlineheader[3] = (width & 0x00ff); /* skip RLE for images too small or large */ - if(width < 8 || width >= 32768) + if (width < 8 || width >= 32768) { - for(x = 0; x < width; x++) + for (x = 0; x < width; x++) { - switch(ncomp) + switch (ncomp) { - case 4: /* fallthrough */ - case 3: - linear[2] = scanline[x * ncomp + 2]; - linear[1] = scanline[x * ncomp + 1]; - linear[0] = scanline[x * ncomp + 0]; - break; - default: - linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0]; - break; + case 4: /* fallthrough */ + case 3: + linear[2] = scanline[x * ncomp + 2]; + linear[1] = scanline[x * ncomp + 1]; + linear[0] = scanline[x * ncomp + 0]; + break; + default: + linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0]; + break; } stbiw__linear_to_rgbe(rgbe, linear); s->func(s->context, rgbe, 4); @@ -529,19 +529,19 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns { int c, r; /* encode into scratch buffer */ - for(x = 0; x < width; x++) + for (x = 0; x < width; x++) { - switch(ncomp) + switch (ncomp) { - case 4: /* fallthrough */ - case 3: - linear[2] = scanline[x * ncomp + 2]; - linear[1] = scanline[x * ncomp + 1]; - linear[0] = scanline[x * ncomp + 0]; - break; - default: - linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0]; - break; + case 4: /* fallthrough */ + case 3: + linear[2] = scanline[x * ncomp + 2]; + linear[1] = scanline[x * ncomp + 1]; + linear[0] = scanline[x * ncomp + 0]; + break; + default: + linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0]; + break; } stbiw__linear_to_rgbe(rgbe, linear); scratch[x + width * 0] = rgbe[0]; @@ -553,43 +553,43 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns s->func(s->context, scanlineheader, 4); /* RLE each component separately */ - for(c = 0; c < 4; c++) + for (c = 0; c < 4; c++) { unsigned char* comp = &scratch[width * c]; x = 0; - while(x < width) + while (x < width) { // find first run r = x; - while(r + 2 < width) + while (r + 2 < width) { - if(comp[r] == comp[r + 1] && comp[r] == comp[r + 2]) + if (comp[r] == comp[r + 1] && comp[r] == comp[r + 2]) break; ++r; } - if(r + 2 >= width) + if (r + 2 >= width) r = width; // dump up to first run - while(x < r) + while (x < r) { int len = r - x; - if(len > 128) + if (len > 128) len = 128; stbiw__write_dump_data(s, len, &comp[x]); x += len; } // if there's a run, output it - if(r + 2 < width) - { // same test as what we break out of in search loop, so only true if we break'd + if (r + 2 < width) + { // same test as what we break out of in search loop, so only true if we break'd // find next byte after run - while(r < width && comp[r] == comp[x]) + while (r < width && comp[r] == comp[x]) ++r; // output run up to r - while(x < r) + while (x < r) { int len = r - x; - if(len > 127) + if (len > 127) len = 127; stbiw__write_run_data(s, len, comp[x]); x += len; @@ -602,12 +602,12 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns static int stbi_write_hdr_core(stbi__write_context* s, int x, int y, int comp, float* data) { - if(y <= 0 || x <= 0 || data == NULL) + if (y <= 0 || x <= 0 || data == NULL) return 0; else { // Each component is stored separately. Allocate scratch space for full output scanline. - unsigned char* scratch = ( unsigned char* )STBIW_MALLOC(x * 4); + unsigned char* scratch = (unsigned char*)STBIW_MALLOC(x * 4); int i, len; char buffer[128]; char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n"; @@ -620,7 +620,7 @@ static int stbi_write_hdr_core(stbi__write_context* s, int x, int y, int comp, f #endif s->func(s->context, buffer, len); - for(i = 0; i < y; i++) + for (i = 0; i < y; i++) stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp * x * (stbi__flip_vertically_on_write ? y - 1 - i : i) * x); STBIW_FREE(scratch); @@ -632,23 +632,23 @@ STBIWDEF int stbi_write_hdr_to_func(stbi_write_func* func, void* context, int x, { stbi__write_context s; stbi__start_write_callbacks(&s, func, context); - return stbi_write_hdr_core(&s, x, y, comp, ( float* )data); + return stbi_write_hdr_core(&s, x, y, comp, (float*)data); } #ifndef STBI_WRITE_NO_STDIO STBIWDEF int stbi_write_hdr(char const* filename, int x, int y, int comp, const float* data) { stbi__write_context s; - if(stbi__start_write_file(&s, filename)) + if (stbi__start_write_file(&s, filename)) { - int r = stbi_write_hdr_core(&s, x, y, comp, ( float* )data); + int r = stbi_write_hdr_core(&s, x, y, comp, (float*)data); stbi__end_write_file(&s); return r; } else return 0; } -#endif // STBI_WRITE_NO_STDIO +#endif // STBI_WRITE_NO_STDIO ////////////////////////////////////////////////////////////////////////////// // @@ -657,30 +657,29 @@ STBIWDEF int stbi_write_hdr(char const* filename, int x, int y, int comp, const #ifndef STBIW_ZLIB_COMPRESS // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size() -#define stbiw__sbraw(a) (( int* )( a )-2) -#define stbiw__sbm(a) stbiw__sbraw(a)[0] -#define stbiw__sbn(a) stbiw__sbraw(a)[1] +#define stbiw__sbraw(a) ((int*)(a)-2) +#define stbiw__sbm(a) stbiw__sbraw(a)[0] +#define stbiw__sbn(a) stbiw__sbraw(a)[1] -#define stbiw__sbneedgrow(a, n) ((a) == 0 || stbiw__sbn(a) + n >= stbiw__sbm(a)) +#define stbiw__sbneedgrow(a, n) ((a) == 0 || stbiw__sbn(a) + n >= stbiw__sbm(a)) #define stbiw__sbmaybegrow(a, n) (stbiw__sbneedgrow(a, (n)) ? stbiw__sbgrow(a, n) : 0) -#define stbiw__sbgrow(a, n) stbiw__sbgrowf(( void** )&(a), (n), sizeof(*(a))) +#define stbiw__sbgrow(a, n) stbiw__sbgrowf((void**)&(a), (n), sizeof(*(a))) #define stbiw__sbpush(a, v) (stbiw__sbmaybegrow(a, 1), (a)[stbiw__sbn(a)++] = (v)) -#define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0) -#define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)), 0 : 0) +#define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0) +#define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)), 0 : 0) static void* stbiw__sbgrowf(void** arr, int increment, int itemsize) { int m = *arr ? 2 * stbiw__sbm(*arr) + increment : increment + 1; - void* p = - STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr) * itemsize + sizeof(int) * 2) : 0, - (unsigned long)itemsize * m + sizeof(int) * 2); + void* p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr) * itemsize + sizeof(int) * 2) : 0, + (unsigned long)itemsize * m + sizeof(int) * 2); STBIW_ASSERT(p); - if(p) + if (p) { - if(!*arr) - (( int* )p)[1] = 0; - *arr = ( void* )(( int* )p + 2); + if (!*arr) + ((int*)p)[1] = 0; + *arr = (void*)((int*)p + 2); stbiw__sbm(*arr) = m; } return *arr; @@ -688,7 +687,7 @@ static void* stbiw__sbgrowf(void** arr, int increment, int itemsize) static unsigned char* stbiw__zlib_flushf(unsigned char* data, unsigned int* bitbuffer, int* bitcount) { - while(*bitcount >= 8) + while (*bitcount >= 8) { stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer)); *bitbuffer >>= 8; @@ -700,7 +699,7 @@ static unsigned char* stbiw__zlib_flushf(unsigned char* data, unsigned int* bitb static int stbiw__zlib_bitrev(int code, int codebits) { int res = 0; - while(codebits--) + while (codebits--) { res = (res << 1) | (code & 1); code >>= 1; @@ -711,8 +710,8 @@ static int stbiw__zlib_bitrev(int code, int codebits) static unsigned int stbiw__zlib_countm(unsigned char* a, unsigned char* b, int limit) { int i; - for(i = 0; i < limit && i < 258; ++i) - if(a[i] != b[i]) + for (i = 0; i < limit && i < 258; ++i) + if (a[i] != b[i]) break; return i; } @@ -729,93 +728,94 @@ static unsigned int stbiw__zhash(unsigned char* data) return hash; } -#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount)) +#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount)) #define stbiw__zlib_add(code, codebits) (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush()) -#define stbiw__zlib_huffa(b, c) stbiw__zlib_add(stbiw__zlib_bitrev(b, c), c) +#define stbiw__zlib_huffa(b, c) stbiw__zlib_add(stbiw__zlib_bitrev(b, c), c) // default huffman tables #define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 8) -#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + ( n )-144, 9) -#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + ( n )-256, 7) -#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + ( n )-280, 8) -#define stbiw__zlib_huff(n) \ - ((n) <= 143 ? stbiw__zlib_huff1(n) : \ - (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n)) +#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n)-144, 9) +#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n)-256, 7) +#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n)-280, 8) +#define stbiw__zlib_huff(n) \ + ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) \ + : (n) <= 279 ? stbiw__zlib_huff3(n) \ + : stbiw__zlib_huff4(n)) #define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n)) #define stbiw__ZHASH 16384 -#endif // STBIW_ZLIB_COMPRESS +#endif // STBIW_ZLIB_COMPRESS unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_len, int quality) { #ifdef STBIW_ZLIB_COMPRESS // user provided a zlib compress implementation, use that return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality); -#else // use builtin - static unsigned short lengthc[] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, +#else // use builtin + static unsigned short lengthc[] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 259}; static unsigned char lengtheb[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0}; - static unsigned short distc[] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, - 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, + static unsigned short distc[] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, + 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 32768}; - static unsigned char disteb[] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, + static unsigned char disteb[] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; unsigned int bitbuf = 0; int i, j, bitcount = 0; unsigned char* out = NULL; - unsigned char*** hash_table = ( unsigned char*** )STBIW_MALLOC(stbiw__ZHASH * sizeof(char**)); - if(hash_table == NULL) + unsigned char*** hash_table = (unsigned char***)STBIW_MALLOC(stbiw__ZHASH * sizeof(char**)); + if (hash_table == NULL) return NULL; - if(quality < 5) + if (quality < 5) quality = 5; - stbiw__sbpush(out, 0x78); // DEFLATE 32K window - stbiw__sbpush(out, 0x5e); // FLEVEL = 1 + stbiw__sbpush(out, 0x78); // DEFLATE 32K window + stbiw__sbpush(out, 0x5e); // FLEVEL = 1 stbiw__zlib_add(1, 1); // BFINAL = 1 stbiw__zlib_add(1, 2); // BTYPE = 1 -- fixed huffman - for(i = 0; i < stbiw__ZHASH; ++i) + for (i = 0; i < stbiw__ZHASH; ++i) hash_table[i] = NULL; i = 0; - while(i < data_len - 3) + while (i < data_len - 3) { // hash next 3 bytes of data to be compressed int h = stbiw__zhash(data + i) & (stbiw__ZHASH - 1), best = 3; unsigned char* bestloc = 0; unsigned char** hlist = hash_table[h]; int n = stbiw__sbcount(hlist); - for(j = 0; j < n; ++j) + for (j = 0; j < n; ++j) { - if(hlist[j] - data > i - 32768) - { // if entry lies within window + if (hlist[j] - data > i - 32768) + { // if entry lies within window int d = stbiw__zlib_countm(hlist[j], data + i, data_len - i); - if(d >= best) + if (d >= best) best = d, bestloc = hlist[j]; } } // when hash table entry is too long, delete half the entries - if(hash_table[h] && stbiw__sbn(hash_table[h]) == 2 * quality) + if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2 * quality) { STBIW_MEMMOVE(hash_table[h], hash_table[h] + quality, sizeof(hash_table[h][0]) * quality); stbiw__sbn(hash_table[h]) = quality; } stbiw__sbpush(hash_table[h], data + i); - if(bestloc) + if (bestloc) { // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal h = stbiw__zhash(data + i + 1) & (stbiw__ZHASH - 1); hlist = hash_table[h]; n = stbiw__sbcount(hlist); - for(j = 0; j < n; ++j) + for (j = 0; j < n; ++j) { - if(hlist[j] - data > i - 32767) + if (hlist[j] - data > i - 32767) { int e = stbiw__zlib_countm(hlist[j], data + i + 1, data_len - i - 1); - if(e > best) - { // if next match is better, bail on current match + if (e > best) + { // if next match is better, bail on current match bestloc = NULL; break; } @@ -823,19 +823,19 @@ unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_le } } - if(bestloc) + if (bestloc) { - int d = ( int )(data + i - bestloc); // distance back + int d = (int)(data + i - bestloc); // distance back STBIW_ASSERT(d <= 32767 && best <= 258); - for(j = 0; best > lengthc[j + 1] - 1; ++j) + for (j = 0; best > lengthc[j + 1] - 1; ++j) ; stbiw__zlib_huff(j + 257); - if(lengtheb[j]) + if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]); - for(j = 0; d > distc[j + 1] - 1; ++j) + for (j = 0; d > distc[j + 1] - 1; ++j) ; stbiw__zlib_add(stbiw__zlib_bitrev(j, 5), 5); - if(disteb[j]) + if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]); i += best; } @@ -846,25 +846,25 @@ unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_le } } // write out final bytes - for(; i < data_len; ++i) + for (; i < data_len; ++i) stbiw__zlib_huffb(data[i]); - stbiw__zlib_huff(256); // end of block + stbiw__zlib_huff(256); // end of block // pad with 0 bits to byte boundary - while(bitcount) + while (bitcount) stbiw__zlib_add(0, 1); - for(i = 0; i < stbiw__ZHASH; ++i) - ( void )stbiw__sbfree(hash_table[i]); + for (i = 0; i < stbiw__ZHASH; ++i) + (void)stbiw__sbfree(hash_table[i]); STBIW_FREE(hash_table); { // compute adler32 on input unsigned int s1 = 1, s2 = 0; - int blocklen = ( int )(data_len % 5552); + int blocklen = (int)(data_len % 5552); j = 0; - while(j < data_len) + while (j < data_len) { - for(i = 0; i < blocklen; ++i) + for (i = 0; i < blocklen; ++i) s1 += data[j + i], s2 += s1; s1 %= 65521, s2 %= 65521; j += blocklen; @@ -878,8 +878,8 @@ unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_le *out_len = stbiw__sbn(out); // make returned pointer freeable STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len); - return ( unsigned char* )stbiw__sbraw(out); -#endif // STBIW_ZLIB_COMPRESS + return (unsigned char*)stbiw__sbraw(out); +#endif // STBIW_ZLIB_COMPRESS } static unsigned int stbiw__crc32(unsigned char* buffer, int len) @@ -917,14 +917,14 @@ static unsigned int stbiw__crc32(unsigned char* buffer, int len) unsigned int crc = ~0u; int i; - for(i = 0; i < len; ++i) + for (i = 0; i < len; ++i) crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)]; return ~crc; } #define stbiw__wpng4(o, a, b, c, d) \ ((o)[0] = STBIW_UCHAR(a), (o)[1] = STBIW_UCHAR(b), (o)[2] = STBIW_UCHAR(c), (o)[3] = STBIW_UCHAR(d), (o) += 4) -#define stbiw__wp32(data, v) stbiw__wpng4(data, (v) >> 24, (v) >> 16, (v) >> 8, (v)); +#define stbiw__wp32(data, v) stbiw__wpng4(data, (v) >> 24, (v) >> 16, (v) >> 8, (v)); #define stbiw__wptag(data, s) stbiw__wpng4(data, s[0], s[1], s[2], s[3]) static void stbiw__wpcrc(unsigned char** data, int len) @@ -936,9 +936,9 @@ static void stbiw__wpcrc(unsigned char** data, int len) static unsigned char stbiw__paeth(int a, int b, int c) { int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c); - if(pa <= pb && pa <= pc) + if (pa <= pb && pa <= pc) return STBIW_UCHAR(a); - if(pb <= pc) + if (pb <= pc) return STBIW_UCHAR(b); return STBIW_UCHAR(c); } @@ -954,58 +954,58 @@ static void stbiw__encode_png_line(unsigned char* pixels, int stride_bytes, int int type = mymap[filter_type]; unsigned char* z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height - 1 - y : y); int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes; - for(i = 0; i < n; ++i) + for (i = 0; i < n; ++i) { - switch(type) + switch (type) { - case 0: - line_buffer[i] = z[i]; - break; - case 1: - line_buffer[i] = z[i]; - break; - case 2: - line_buffer[i] = z[i] - z[i - signed_stride]; - break; - case 3: - line_buffer[i] = z[i] - (z[i - signed_stride] >> 1); - break; - case 4: - line_buffer[i] = ( signed char )(z[i] - stbiw__paeth(0, z[i - signed_stride], 0)); - break; - case 5: - line_buffer[i] = z[i]; - break; - case 6: - line_buffer[i] = z[i]; - break; + case 0: + line_buffer[i] = z[i]; + break; + case 1: + line_buffer[i] = z[i]; + break; + case 2: + line_buffer[i] = z[i] - z[i - signed_stride]; + break; + case 3: + line_buffer[i] = z[i] - (z[i - signed_stride] >> 1); + break; + case 4: + line_buffer[i] = (signed char)(z[i] - stbiw__paeth(0, z[i - signed_stride], 0)); + break; + case 5: + line_buffer[i] = z[i]; + break; + case 6: + line_buffer[i] = z[i]; + break; } } - for(i = n; i < width * n; ++i) + for (i = n; i < width * n; ++i) { - switch(type) + switch (type) { - case 0: - line_buffer[i] = z[i]; - break; - case 1: - line_buffer[i] = z[i] - z[i - n]; - break; - case 2: - line_buffer[i] = z[i] - z[i - signed_stride]; - break; - case 3: - line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1); - break; - case 4: - line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride], z[i - signed_stride - n]); - break; - case 5: - line_buffer[i] = z[i] - (z[i - n] >> 1); - break; - case 6: - line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0); - break; + case 0: + line_buffer[i] = z[i]; + break; + case 1: + line_buffer[i] = z[i] - z[i - n]; + break; + case 2: + line_buffer[i] = z[i] - z[i - signed_stride]; + break; + case 3: + line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1); + break; + case 4: + line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride], z[i - signed_stride - n]); + break; + case 5: + line_buffer[i] = z[i] - (z[i - n] >> 1); + break; + case 6: + line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0); + break; } } } @@ -1019,76 +1019,76 @@ unsigned char* stbi_write_png_to_mem(unsigned char* pixels, int stride_bytes, in signed char* line_buffer; int j, zlen; - if(stride_bytes == 0) + if (stride_bytes == 0) stride_bytes = x * n; - if(force_filter >= 5) + if (force_filter >= 5) { force_filter = -1; } - filt = ( unsigned char* )STBIW_MALLOC((x * n + 1) * (size_t)y); - if(!filt) + filt = (unsigned char*)STBIW_MALLOC((x * n + 1) * (size_t)y); + if (!filt) return 0; - line_buffer = ( signed char* )STBIW_MALLOC((size_t)x * n); - if(!line_buffer) + line_buffer = (signed char*)STBIW_MALLOC((size_t)x * n); + if (!line_buffer) { STBIW_FREE(filt); return 0; } - for(j = 0; j < y; ++j) + for (j = 0; j < y; ++j) { int filter_type; - if(force_filter > -1) + if (force_filter > -1) { filter_type = force_filter; stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, force_filter, line_buffer); } else - { // Estimate the best filter by running through all of them: + { // Estimate the best filter by running through all of them: int best_filter = 0, best_filter_val = 0x7fffffff, est, i; - for(filter_type = 0; filter_type < 5; filter_type++) + for (filter_type = 0; filter_type < 5; filter_type++) { stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, filter_type, line_buffer); // Estimate the entropy of the line using this filter; the less, the better. est = 0; - for(i = 0; i < x * n; ++i) + for (i = 0; i < x * n; ++i) { - est += abs(( signed char )line_buffer[i]); + est += abs((signed char)line_buffer[i]); } - if(est < best_filter_val) + if (est < best_filter_val) { best_filter_val = est; best_filter = filter_type; } } - if(filter_type != best_filter) - { // If the last iteration already got us the best filter, don't redo it + if (filter_type != best_filter) + { // If the last iteration already got us the best filter, don't redo it stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, best_filter, line_buffer); filter_type = best_filter; } } // when we get here, filter_type contains the filter type, and line_buffer contains the data - filt[j * (x * n + 1)] = ( unsigned char )filter_type; + filt[j * (x * n + 1)] = (unsigned char)filter_type; STBIW_MEMMOVE(filt + j * (x * n + 1) + 1, line_buffer, (size_t)x * n); } STBIW_FREE(line_buffer); zlib = stbi_zlib_compress(filt, y * (x * n + 1), &zlen, stbi_write_png_compression_level); STBIW_FREE(filt); - if(!zlib) + if (!zlib) return 0; // each tag requires 12 bytes of overhead - out = ( unsigned char* )STBIW_MALLOC(8 + 12 + 13 + 12 + zlen + 12); - if(!out) + out = (unsigned char*)STBIW_MALLOC(8 + 12 + 13 + 12 + zlen + 12); + if (!out) return 0; *out_len = 8 + 12 + 13 + 12 + zlen + 12; o = out; STBIW_MEMMOVE(o, sig, 8); o += 8; - stbiw__wp32(o, 13); // header length + stbiw__wp32(o, 13); // header length stbiw__wptag(o, "IHDR"); stbiw__wp32(o, x); stbiw__wp32(o, y); @@ -1120,16 +1120,16 @@ STBIWDEF int stbi_write_png(char const* filename, int x, int y, int comp, const { FILE* f; int len; - unsigned char* png = stbi_write_png_to_mem(( unsigned char* )data, stride_bytes, x, y, comp, &len); - if(png == NULL) + unsigned char* png = stbi_write_png_to_mem((unsigned char*)data, stride_bytes, x, y, comp, &len); + if (png == NULL) return 0; #ifdef STBI_MSC_SECURE_CRT - if(fopen_s(&f, filename, "wb")) + if (fopen_s(&f, filename, "wb")) f = NULL; #else f = fopen(filename, "wb"); #endif - if(!f) + if (!f) { STBIW_FREE(png); return 0; @@ -1145,8 +1145,8 @@ STBIWDEF int stbi_write_png_to_func(stbi_write_func* func, void* context, int x, int stride_bytes) { int len; - unsigned char* png = stbi_write_png_to_mem(( unsigned char* )data, stride_bytes, x, y, comp, &len); - if(png == NULL) + unsigned char* png = stbi_write_png_to_mem((unsigned char*)data, stride_bytes, x, y, comp, &len); + if (png == NULL) return 0; func(context, png, len); STBIW_FREE(png); @@ -1161,8 +1161,8 @@ STBIWDEF int stbi_write_png_to_func(stbi_write_func* func, void* context, int x, * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html */ -static const unsigned char stbiw__jpg_ZigZag[] = {0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, - 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, +static const unsigned char stbiw__jpg_ZigZag[] = {0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, + 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63}; @@ -1171,11 +1171,11 @@ static void stbiw__jpg_writeBits(stbi__write_context* s, int* bitBufP, int* bitC int bitBuf = *bitBufP, bitCnt = *bitCntP; bitCnt += bs[1]; bitBuf |= bs[0] << (24 - bitCnt); - while(bitCnt >= 8) + while (bitCnt >= 8) { unsigned char c = (bitBuf >> 16) & 255; stbiw__putc(s, c); - if(c == 255) + if (c == 255) { stbiw__putc(s, 0); } @@ -1202,33 +1202,33 @@ static void stbiw__jpg_DCT(float* d0p, float* d1p, float* d2p, float* d3p, float float tmp4 = d3 - d4; // Even part - float tmp10 = tmp0 + tmp3; // phase 2 + float tmp10 = tmp0 + tmp3; // phase 2 float tmp13 = tmp0 - tmp3; float tmp11 = tmp1 + tmp2; float tmp12 = tmp1 - tmp2; - d0 = tmp10 + tmp11; // phase 3 + d0 = tmp10 + tmp11; // phase 3 d4 = tmp10 - tmp11; - z1 = (tmp12 + tmp13) * 0.707106781f; // c4 - d2 = tmp13 + z1; // phase 5 + z1 = (tmp12 + tmp13) * 0.707106781f; // c4 + d2 = tmp13 + z1; // phase 5 d6 = tmp13 - z1; // Odd part - tmp10 = tmp4 + tmp5; // phase 2 + tmp10 = tmp4 + tmp5; // phase 2 tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7; // The rotator is modified from fig 4-8 to avoid extra negations. - z5 = (tmp10 - tmp12) * 0.382683433f; // c6 - z2 = tmp10 * 0.541196100f + z5; // c2-c6 - z4 = tmp12 * 1.306562965f + z5; // c2+c6 - z3 = tmp11 * 0.707106781f; // c4 + z5 = (tmp10 - tmp12) * 0.382683433f; // c6 + z2 = tmp10 * 0.541196100f + z5; // c2-c6 + z4 = tmp12 * 1.306562965f + z5; // c2+c6 + z3 = tmp11 * 0.707106781f; // c4 - z11 = tmp7 + z3; // phase 5 + z11 = tmp7 + z3; // phase 5 z13 = tmp7 - z3; - *d5p = z13 + z2; // phase 6 + *d5p = z13 + z2; // phase 6 *d3p = z13 - z2; *d1p = z11 + z4; *d7p = z11 - z4; @@ -1244,7 +1244,7 @@ static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) int tmp1 = val < 0 ? -val : val; val = val < 0 ? val - 1 : val; bits[1] = 1; - while(tmp1 >>= 1) + while (tmp1 >>= 1) { ++bits[1]; } @@ -1260,29 +1260,29 @@ static int stbiw__jpg_processDU(stbi__write_context* s, int* bitBuf, int* bitCnt int DU[64]; // DCT rows - for(dataOff = 0; dataOff < 64; dataOff += 8) + for (dataOff = 0; dataOff < 64; dataOff += 8) { stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 1], &CDU[dataOff + 2], &CDU[dataOff + 3], &CDU[dataOff + 4], &CDU[dataOff + 5], &CDU[dataOff + 6], &CDU[dataOff + 7]); } // DCT columns - for(dataOff = 0; dataOff < 8; ++dataOff) + for (dataOff = 0; dataOff < 8; ++dataOff) { stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 8], &CDU[dataOff + 16], &CDU[dataOff + 24], &CDU[dataOff + 32], &CDU[dataOff + 40], &CDU[dataOff + 48], &CDU[dataOff + 56]); } // Quantize/descale/zigzag the coefficients - for(i = 0; i < 64; ++i) + for (i = 0; i < 64; ++i) { float v = CDU[i] * fdtbl[i]; // DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f)); // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway? - DU[stbiw__jpg_ZigZag[i]] = ( int )(v < 0 ? v - 0.5f : v + 0.5f); + DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? v - 0.5f : v + 0.5f); } // Encode DC diff = DU[0] - DC; - if(diff == 0) + if (diff == 0) { stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]); } @@ -1295,29 +1295,29 @@ static int stbiw__jpg_processDU(stbi__write_context* s, int* bitBuf, int* bitCnt } // Encode ACs end0pos = 63; - for(; (end0pos > 0) && (DU[end0pos] == 0); --end0pos) + for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos) { } // end0pos = first element in reverse order !=0 - if(end0pos == 0) + if (end0pos == 0) { stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB); return DU[0]; } - for(i = 1; i <= end0pos; ++i) + for (i = 1; i <= end0pos; ++i) { int startpos = i; int nrzeroes; unsigned short bits[2]; - for(; DU[i] == 0 && i <= end0pos; ++i) + for (; DU[i] == 0 && i <= end0pos; ++i) { } nrzeroes = i - startpos; - if(nrzeroes >= 16) + if (nrzeroes >= 16) { int lng = nrzeroes >> 4; int nrmarker; - for(nrmarker = 1; nrmarker <= lng; ++nrmarker) + for (nrmarker = 1; nrmarker <= lng; ++nrmarker) stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes); nrzeroes &= 15; } @@ -1325,7 +1325,7 @@ static int stbiw__jpg_processDU(stbi__write_context* s, int* bitBuf, int* bitCnt stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes << 4) + bits[1]]); stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits); } - if(end0pos != 63) + if (end0pos != 63) { stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB); } @@ -1362,111 +1362,50 @@ static int stbi_write_jpg_core(stbi__write_context* s, int width, int height, in 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa}; // Huffman tables - static const unsigned short YDC_HT[256][2] = {{0, 2}, {2, 3}, {3, 3}, {4, 3}, {5, 3}, {6, 3}, - {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}}; - static const unsigned short UVDC_HT[256][2] = {{0, 2}, {1, 2}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, - {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11}}; + static const unsigned short YDC_HT[256][2] = {{0, 2}, {2, 3}, {3, 3}, {4, 3}, {5, 3}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}}; + static const unsigned short UVDC_HT[256][2] = {{0, 2}, {1, 2}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11}}; static const unsigned short YAC_HT[256][2] = { - {10, 4}, {0, 2}, {1, 2}, {4, 3}, {11, 4}, {26, 5}, {120, 7}, {248, 8}, - {1014, 10}, {65410, 16}, {65411, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {12, 4}, {27, 5}, {121, 7}, {502, 9}, {2038, 11}, {65412, 16}, {65413, 16}, - {65414, 16}, {65415, 16}, {65416, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {28, 5}, {249, 8}, {1015, 10}, {4084, 12}, {65417, 16}, {65418, 16}, {65419, 16}, - {65420, 16}, {65421, 16}, {65422, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {58, 6}, {503, 9}, {4085, 12}, {65423, 16}, {65424, 16}, {65425, 16}, {65426, 16}, - {65427, 16}, {65428, 16}, {65429, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {59, 6}, {1016, 10}, {65430, 16}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, - {65435, 16}, {65436, 16}, {65437, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {122, 7}, {2039, 11}, {65438, 16}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, - {65443, 16}, {65444, 16}, {65445, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {123, 7}, {4086, 12}, {65446, 16}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, - {65451, 16}, {65452, 16}, {65453, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {250, 8}, {4087, 12}, {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, - {65459, 16}, {65460, 16}, {65461, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {504, 9}, {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, - {65467, 16}, {65468, 16}, {65469, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {505, 9}, {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, - {65476, 16}, {65477, 16}, {65478, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {506, 9}, {65479, 16}, {65480, 16}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, - {65485, 16}, {65486, 16}, {65487, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {1017, 10}, {65488, 16}, {65489, 16}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, - {65494, 16}, {65495, 16}, {65496, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {1018, 10}, {65497, 16}, {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, - {65503, 16}, {65504, 16}, {65505, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {2040, 11}, {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, - {65512, 16}, {65513, 16}, {65514, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, - {65522, 16}, {65523, 16}, {65524, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {2041, 11}, {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, - {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}; + {10, 4}, {0, 2}, {1, 2}, {4, 3}, {11, 4}, {26, 5}, {120, 7}, {248, 8}, {1014, 10}, {65410, 16}, {65411, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {12, 4}, {27, 5}, {121, 7}, {502, 9}, {2038, 11}, {65412, 16}, {65413, 16}, {65414, 16}, {65415, 16}, {65416, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {28, 5}, {249, 8}, {1015, 10}, {4084, 12}, {65417, 16}, {65418, 16}, {65419, 16}, {65420, 16}, {65421, 16}, {65422, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {58, 6}, {503, 9}, {4085, 12}, {65423, 16}, {65424, 16}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {59, 6}, {1016, 10}, {65430, 16}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {122, 7}, {2039, 11}, {65438, 16}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {123, 7}, {4086, 12}, {65446, 16}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {250, 8}, {4087, 12}, {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {504, 9}, {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {505, 9}, {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {506, 9}, {65479, 16}, {65480, 16}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {1017, 10}, {65488, 16}, {65489, 16}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {1018, 10}, {65497, 16}, {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2040, 11}, {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2041, 11}, {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}; static const unsigned short UVAC_HT[256][2] = { - {0, 2}, {1, 2}, {4, 3}, {10, 4}, {24, 5}, {25, 5}, {56, 6}, {120, 7}, - {500, 9}, {1014, 10}, {4084, 12}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {11, 4}, {57, 6}, {246, 8}, {501, 9}, {2038, 11}, {4085, 12}, {65416, 16}, - {65417, 16}, {65418, 16}, {65419, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {26, 5}, {247, 8}, {1015, 10}, {4086, 12}, {32706, 15}, {65420, 16}, {65421, 16}, - {65422, 16}, {65423, 16}, {65424, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {27, 5}, {248, 8}, {1016, 10}, {4087, 12}, {65425, 16}, {65426, 16}, {65427, 16}, - {65428, 16}, {65429, 16}, {65430, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {58, 6}, {502, 9}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, - {65436, 16}, {65437, 16}, {65438, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {59, 6}, {1017, 10}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, - {65444, 16}, {65445, 16}, {65446, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {121, 7}, {2039, 11}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, - {65452, 16}, {65453, 16}, {65454, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {122, 7}, {2040, 11}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, - {65460, 16}, {65461, 16}, {65462, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {249, 8}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, - {65469, 16}, {65470, 16}, {65471, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {503, 9}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, - {65478, 16}, {65479, 16}, {65480, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {504, 9}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, - {65487, 16}, {65488, 16}, {65489, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {505, 9}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, - {65496, 16}, {65497, 16}, {65498, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {506, 9}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, - {65505, 16}, {65506, 16}, {65507, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {2041, 11}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, - {65514, 16}, {65515, 16}, {65516, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, - {65523, 16}, {65524, 16}, {65525, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {1018, 10}, {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, - {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}; - static const int YQT[] = {16, 11, 10, 16, 24, 40, 51, 61, 12, 12, 14, 19, 26, 58, 60, 55, - 14, 13, 16, 24, 40, 57, 69, 56, 14, 17, 22, 29, 51, 87, 80, 62, - 18, 22, 37, 56, 68, 109, 103, 77, 24, 35, 55, 64, 81, 104, 113, 92, + {0, 2}, {1, 2}, {4, 3}, {10, 4}, {24, 5}, {25, 5}, {56, 6}, {120, 7}, {500, 9}, {1014, 10}, {4084, 12}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {11, 4}, {57, 6}, {246, 8}, {501, 9}, {2038, 11}, {4085, 12}, {65416, 16}, {65417, 16}, {65418, 16}, {65419, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {26, 5}, {247, 8}, {1015, 10}, {4086, 12}, {32706, 15}, {65420, 16}, {65421, 16}, {65422, 16}, {65423, 16}, {65424, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {27, 5}, {248, 8}, {1016, 10}, {4087, 12}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {65430, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {58, 6}, {502, 9}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {65438, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {59, 6}, {1017, 10}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {65446, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {121, 7}, {2039, 11}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {65454, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {122, 7}, {2040, 11}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {65462, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {249, 8}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {65470, 16}, {65471, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {503, 9}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {65479, 16}, {65480, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {504, 9}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {65488, 16}, {65489, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {505, 9}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {65497, 16}, {65498, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {506, 9}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {65506, 16}, {65507, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2041, 11}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {65515, 16}, {65516, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {65525, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {1018, 10}, {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}; + static const int YQT[] = {16, 11, 10, 16, 24, 40, 51, 61, 12, 12, 14, 19, 26, 58, 60, 55, + 14, 13, 16, 24, 40, 57, 69, 56, 14, 17, 22, 29, 51, 87, 80, 62, + 18, 22, 37, 56, 68, 109, 103, 77, 24, 35, 55, 64, 81, 104, 113, 92, 49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99}; static const int UVQT[] = {17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99, 99, 99, 99, 24, 26, 56, 99, 99, 99, 99, 99, 47, 66, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}; - static const float aasf[] = {1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, - 1.175875602f * 2.828427125f, 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, + static const float aasf[] = {1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, + 1.175875602f * 2.828427125f, 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f}; int row, col, i, k; float fdtbl_Y[64], fdtbl_UV[64]; unsigned char YTable[64], UVTable[64]; - if(!data || !width || !height || comp > 4 || comp < 1) + if (!data || !width || !height || comp > 4 || comp < 1) { return 0; } quality = quality ? quality : 90; - quality = quality < 1 ? 1 : quality > 100 ? 100 : quality; + quality = quality < 1 ? 1 : quality > 100 ? 100 + : quality; quality = quality < 50 ? 5000 / quality : 200 - quality * 2; - for(i = 0; i < 64; ++i) + for (i = 0; i < 64; ++i) { int uvti, yti = (YQT[i] * quality + 50) / 100; - YTable[stbiw__jpg_ZigZag[i]] = ( unsigned char )(yti < 1 ? 1 : yti > 255 ? 255 : yti); + YTable[stbiw__jpg_ZigZag[i]] = (unsigned char)(yti < 1 ? 1 : yti > 255 ? 255 + : yti); uvti = (UVQT[i] * quality + 50) / 100; - UVTable[stbiw__jpg_ZigZag[i]] = ( unsigned char )(uvti < 1 ? 1 : uvti > 255 ? 255 : uvti); + UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char)(uvti < 1 ? 1 : uvti > 255 ? 255 + : uvti); } - for(row = 0, k = 0; row < 8; ++row) + for (row = 0, k = 0; row < 8; ++row) { - for(col = 0; col < 8; ++col, ++k) + for (col = 0; col < 8; ++col, ++k) { fdtbl_Y[k] = 1 / (YTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]); fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]); @@ -1475,17 +1414,17 @@ static int stbi_write_jpg_core(stbi__write_context* s, int width, int height, in // Write Headers { - static const unsigned char head0[] = {0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F', 'I', 'F', 0, 1, 1, - 0, 0, 1, 0, 1, 0, 0, 0xFF, 0xDB, 0, 0x84, 0}; + static const unsigned char head0[] = {0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F', 'I', 'F', 0, 1, 1, + 0, 0, 1, 0, 1, 0, 0, 0xFF, 0xDB, 0, 0x84, 0}; static const unsigned char head2[] = {0xFF, 0xDA, 0, 0xC, 3, 1, 0, 2, 0x11, 3, 0x11, 0, 0x3F, 0}; const unsigned char head1[] = {0xFF, 0xC0, 0, 0x11, 8, - ( unsigned char )(height >> 8), + (unsigned char)(height >> 8), STBIW_UCHAR(height), - ( unsigned char )(width >> 8), + (unsigned char)(width >> 8), STBIW_UCHAR(width), 3, 1, @@ -1502,50 +1441,50 @@ static int stbi_write_jpg_core(stbi__write_context* s, int width, int height, in 0x01, 0xA2, 0}; - s->func(s->context, ( void* )head0, sizeof(head0)); - s->func(s->context, ( void* )YTable, sizeof(YTable)); + s->func(s->context, (void*)head0, sizeof(head0)); + s->func(s->context, (void*)YTable, sizeof(YTable)); stbiw__putc(s, 1); s->func(s->context, UVTable, sizeof(UVTable)); - s->func(s->context, ( void* )head1, sizeof(head1)); - s->func(s->context, ( void* )(std_dc_luminance_nrcodes + 1), sizeof(std_dc_luminance_nrcodes) - 1); - s->func(s->context, ( void* )std_dc_luminance_values, sizeof(std_dc_luminance_values)); - stbiw__putc(s, 0x10); // HTYACinfo - s->func(s->context, ( void* )(std_ac_luminance_nrcodes + 1), sizeof(std_ac_luminance_nrcodes) - 1); - s->func(s->context, ( void* )std_ac_luminance_values, sizeof(std_ac_luminance_values)); - stbiw__putc(s, 1); // HTUDCinfo - s->func(s->context, ( void* )(std_dc_chrominance_nrcodes + 1), sizeof(std_dc_chrominance_nrcodes) - 1); - s->func(s->context, ( void* )std_dc_chrominance_values, sizeof(std_dc_chrominance_values)); - stbiw__putc(s, 0x11); // HTUACinfo - s->func(s->context, ( void* )(std_ac_chrominance_nrcodes + 1), sizeof(std_ac_chrominance_nrcodes) - 1); - s->func(s->context, ( void* )std_ac_chrominance_values, sizeof(std_ac_chrominance_values)); - s->func(s->context, ( void* )head2, sizeof(head2)); + s->func(s->context, (void*)head1, sizeof(head1)); + s->func(s->context, (void*)(std_dc_luminance_nrcodes + 1), sizeof(std_dc_luminance_nrcodes) - 1); + s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values)); + stbiw__putc(s, 0x10); // HTYACinfo + s->func(s->context, (void*)(std_ac_luminance_nrcodes + 1), sizeof(std_ac_luminance_nrcodes) - 1); + s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values)); + stbiw__putc(s, 1); // HTUDCinfo + s->func(s->context, (void*)(std_dc_chrominance_nrcodes + 1), sizeof(std_dc_chrominance_nrcodes) - 1); + s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values)); + stbiw__putc(s, 0x11); // HTUACinfo + s->func(s->context, (void*)(std_ac_chrominance_nrcodes + 1), sizeof(std_ac_chrominance_nrcodes) - 1); + s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values)); + s->func(s->context, (void*)head2, sizeof(head2)); } // Encode 8x8 macroblocks { static const unsigned short fillBits[] = {0x7F, 7}; - const unsigned char* imageData = ( const unsigned char* )data; + const unsigned char* imageData = (const unsigned char*)data; int DCY = 0, DCU = 0, DCV = 0; int bitBuf = 0, bitCnt = 0; // comp == 2 is grey+alpha (alpha is ignored) int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0; int x, y, pos; - for(y = 0; y < height; y += 8) + for (y = 0; y < height; y += 8) { - for(x = 0; x < width; x += 8) + for (x = 0; x < width; x += 8) { float YDU[64], UDU[64], VDU[64]; - for(row = y, pos = 0; row < y + 8; ++row) + for (row = y, pos = 0; row < y + 8; ++row) { - for(col = x; col < x + 8; ++col, ++pos) + for (col = x; col < x + 8; ++col, ++pos) { int p = (stbi__flip_vertically_on_write ? height - 1 - row : row) * width * comp + col * comp; float r, g, b; - if(row >= height) + if (row >= height) { p -= width * comp * (row + 1 - height); } - if(col >= width) + if (col >= width) { p -= comp * (col + 1 - width); } @@ -1581,14 +1520,14 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func* func, void* context, int x, { stbi__write_context s; stbi__start_write_callbacks(&s, func, context); - return stbi_write_jpg_core(&s, x, y, comp, ( void* )data, quality); + return stbi_write_jpg_core(&s, x, y, comp, (void*)data, quality); } #ifndef STBI_WRITE_NO_STDIO STBIWDEF int stbi_write_jpg(char const* filename, int x, int y, int comp, const void* data, int quality) { stbi__write_context s; - if(stbi__start_write_file(&s, filename)) + if (stbi__start_write_file(&s, filename)) { int r = stbi_write_jpg_core(&s, x, y, comp, data, quality); stbi__end_write_file(&s); @@ -1599,7 +1538,7 @@ STBIWDEF int stbi_write_jpg(char const* filename, int x, int y, int comp, const } #endif -#endif // STB_IMAGE_WRITE_IMPLEMENTATION +#endif // STB_IMAGE_WRITE_IMPLEMENTATION /* Revision history 1.09 (2018-02-11) diff --git a/tests/common/tengine_operations.c b/tests/common/tengine_operations.c index ca621cb07..3f2c885bf 100644 --- a/tests/common/tengine_operations.c +++ b/tests/common/tengine_operations.c @@ -71,7 +71,7 @@ image load_image_stb(const char* filename, int channels) { int dst_index = i + w * j + w * h * k; int src_index = k + src_c * i + src_c * w * j; - im.data[dst_index] = ( float )data[src_index]; + im.data[dst_index] = (float)data[src_index]; } } } @@ -83,7 +83,7 @@ image load_image_stb(const char* filename, int channels) image make_image(int w, int h, int c) { image out = make_empty_image(w, h, c); - out.data = ( float* )calloc((size_t)h * w * c, sizeof(float)); + out.data = (float*)calloc((size_t)h * w * c, sizeof(float)); return out; } @@ -125,17 +125,17 @@ image imread_process(const char* filename, int img_w, int img_h, float* means, f switch (choice) { - case 0: - out = gray2bgr(out); - break; - case 1: - out = rgb2gray(out); - break; - case 2: - out = rgb2bgr_permute(out); - break; - default: - break; + case 0: + out = gray2bgr(out); + break; + case 1: + out = rgb2gray(out); + break; + case 2: + out = rgb2bgr_permute(out); + break; + default: + break; } image resImg = make_image(img_w, img_h, out.c); @@ -171,8 +171,8 @@ image resize_image(image im, int ow, int oh) int h = im.h; int w = im.w; float shift = 0.f; - float _scale_x = ( float )((w - shift) / (ow - shift)); - float _scale_y = ( float )((h - shift) / (oh - shift)); + float _scale_x = (float)((w - shift) / (ow - shift)); + float _scale_y = (float)((h - shift) / (oh - shift)); float32x4_t scale_x = vdupq_n_f32(_scale_x); float offset = 0.5; int in_hw = h * w; @@ -215,8 +215,7 @@ image resize_image(image im, int ow, int oh) float32x4_t fx_0 = vsubq_f32(offset_1, fx); - const int32x4_t in_idx = - vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0)); + const int32x4_t in_idx = vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0)); int32x4_t in_index0 = in_idx; int32x4_t in_index2 = vaddq_s32(in_idx, vcvtq_s32_f32(offset_1)); int32x4_t in_index1 = vaddq_s32(in_idx, w_0); @@ -290,8 +289,8 @@ image resize_image(image im, int ow, int oh) int h = im.h; int w = im.w; float shift = 0.f; - float _scale_x = ( float )((w - shift) / (ow - shift)); - float _scale_y = ( float )((h - shift) / (oh - shift)); + float _scale_x = (float)((w - shift) / (ow - shift)); + float _scale_y = (float)((h - shift) / (oh - shift)); float32x4_t scale_x = vdupq_n_f32(_scale_x); float offset = 0.5; @@ -335,8 +334,7 @@ image resize_image(image im, int ow, int oh) float32x4_t fx_0 = vsubq_f32(offset_1, fx); - const int32x4_t in_idx = - vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0)); + const int32x4_t in_idx = vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0)); int32x4_t in_index0 = in_idx; int32x4_t in_index2 = vaddq_s32(in_idx, vcvtq_s32_f32(offset_1)); @@ -408,8 +406,8 @@ image resize_image(image im, int ow, int oh) #endif #else - float scale_x = ( float )(im.w) / (ow); - float scale_y = ( float )(im.h) / (oh); + float scale_x = (float)(im.w) / (ow); + float scale_y = (float)(im.h) / (oh); int w = im.w; int h = im.h; int in_hw = h * w; @@ -481,13 +479,13 @@ image copyMaker(image im, int top, int bottom, int left, int right, float value) void save_image(image im, const char* name) { char buff[256]; - unsigned char* data = ( unsigned char* )calloc((size_t)im.w * im.h * im.c, sizeof(char)); + unsigned char* data = (unsigned char*)calloc((size_t)im.w * im.h * im.c, sizeof(char)); int i, k; for (k = 0; k < im.c; ++k) { for (i = 0; i < im.w * im.h; ++i) { - data[i * im.c + k] = ( unsigned char )(im.data[i + k * im.w * im.h]); + data[i * im.c + k] = (unsigned char)(im.data[i + k * im.w * im.h]); } } @@ -505,24 +503,24 @@ void save_image(image im, const char* name) switch (f) { - case 0: - sprintf(buff, "%s.jpg", name); - success = stbi_write_jpg(buff, im.w, im.h, im.c, data, 80); - break; - case 1: - sprintf(buff, "%s.png", name); - success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w * im.c); - break; - case 2: - sprintf(buff, "%s.tga", name); - success = stbi_write_tga(buff, im.w, im.h, im.c, data); - break; - case 3: - sprintf(buff, "%s.bmp", name); - success = stbi_write_bmp(buff, im.w, im.h, im.c, data); - break; - default: - return; + case 0: + sprintf(buff, "%s.jpg", name); + success = stbi_write_jpg(buff, im.w, im.h, im.c, data, 80); + break; + case 1: + sprintf(buff, "%s.png", name); + success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w * im.c); + break; + case 2: + sprintf(buff, "%s.tga", name); + success = stbi_write_tga(buff, im.w, im.h, im.c, data); + break; + case 3: + sprintf(buff, "%s.bmp", name); + success = stbi_write_bmp(buff, im.w, im.h, im.c, data); + break; + default: + return; } free(data); if (!success) @@ -588,7 +586,7 @@ static float get_pixelBychannel(image m, int x, int y, int c) image copy_image(image p) { image copy = p; - copy.data = ( float* )calloc((size_t)p.h * p.w * p.c, sizeof(float)); + copy.data = (float*)calloc((size_t)p.h * p.w * p.c, sizeof(float)); memcpy(copy.data, p.data, (unsigned long)p.h * p.w * p.c * sizeof(float)); return copy; } @@ -644,7 +642,8 @@ image imread2post(const char* filename) { image im = load_image_stb(filename, 0); const int len = im.c * im.h * im.w; - for (int i = 0; i < len; ++i) { + for (int i = 0; i < len; ++i) + { im.data[i] *= 255; } return im; @@ -653,20 +652,21 @@ image imread2post(const char* filename) image rgb2bgr_permute(image src) { const int len = src.c * src.h * src.w; - float* GRB = ( float* )malloc(sizeof(float) * len); + float* GRB = (float*)malloc(sizeof(float) * len); for (int c = 0; c < src.c; c++) { for (int h = 0; h < src.h; h++) { for (int w = 0; w < src.w; w++) { - int newIndex = ( c )*src.h * src.w + h * src.w + w; + int newIndex = (c)*src.h * src.w + h * src.w + w; int grbIndex = (2 - c) * src.h * src.w + h * src.w + w; GRB[grbIndex] = src.data[newIndex]; } } } - for (int i = 0; i < len; ++i) { + for (int i = 0; i < len; ++i) + { src.data[i] = GRB[i]; } free(GRB); @@ -675,14 +675,14 @@ image rgb2bgr_permute(image src) image image_permute(image src) { - float* GRB = ( float* )malloc(sizeof(float) * src.c * src.h * src.w); + float* GRB = (float*)malloc(sizeof(float) * src.c * src.h * src.w); for (int c = 0; c < src.c; c++) { for (int h = 0; h < src.h; h++) { for (int w = 0; w < src.w; w++) { - int newIndex = ( c )*src.h * src.w + h * src.w + w; + int newIndex = (c)*src.h * src.w + h * src.w + w; int grbIndex = (2 - c) * src.h * src.w + h * src.w + w; GRB[grbIndex] = src.data[newIndex]; } @@ -698,7 +698,7 @@ image gray2bgr(image src) res.c = 3; res.h = src.h; res.w = src.w; - res.data = ( float* )malloc(sizeof(float) * 3 * src.h * src.w); + res.data = (float*)malloc(sizeof(float) * 3 * src.h * src.w); for (int x = 0; x < src.h; x++) { for (int y = 0; y < src.w; y++) @@ -716,7 +716,7 @@ image gray2bgr(image src) image tranpose(image src) { int size = src.c * src.h * src.w; - float* tempData = ( float* )malloc(sizeof(float) * size); + float* tempData = (float*)malloc(sizeof(float) * size); int index = 0; for (int c = 0; c < src.c; c++) @@ -813,7 +813,7 @@ image rgb2gray(image src) res.h = src.h; res.w = src.w; res.c = 1; - res.data = ( float* )malloc(sizeof(float) * res.h * res.w); + res.data = (float*)malloc(sizeof(float) * res.h * res.w); for (int i = 0; i < res.h; i++) { for (int j = 0; j < res.w; j++) @@ -840,7 +840,7 @@ image letterbox(image im, int w, int h) { int ow = im.w; int oh = im.h; - if ((( float )w / im.w) < (( float )h / im.h)) + if (((float)w / im.w) < ((float)h / im.h)) { ow = w; oh = (im.h * w) / im.w; @@ -855,7 +855,7 @@ image letterbox(image im, int w, int h) boxed.w = w; boxed.h = h; boxed.c = im.c; - boxed.data = ( float* )malloc(sizeof(float) * im.c * h * w); + boxed.data = (float*)malloc(sizeof(float) * im.c * h * w); for (int i = 0; i < boxed.c * boxed.h * boxed.w; i++) { @@ -870,20 +870,20 @@ image letterbox(image im, int w, int h) void tengine_resize_f32(float* data, float* res, int ow, int oh, int c, int h, int w) { - float _scale_x = ( float )(w) / ( float )(ow); - float _scale_y = ( float )(h) / ( float )(oh); + float _scale_x = (float)(w) / (float)(ow); + float _scale_y = (float)(h) / (float)(oh); float offset = 0.5f; - int16_t* buf = ( int16_t* )malloc((ow + ow + ow + oh + oh + oh) * sizeof(int16_t)); - int16_t* xCoef = ( int16_t* )(buf); - int16_t* xPos = ( int16_t* )(buf + ow + ow); - int16_t* yCoef = ( int16_t* )(buf + ow + ow + ow); - int16_t* yPos = ( int16_t* )(buf + ow + ow + ow + oh + oh); + int16_t* buf = (int16_t*)malloc((ow + ow + ow + oh + oh + oh) * sizeof(int16_t)); + int16_t* xCoef = (int16_t*)(buf); + int16_t* xPos = (int16_t*)(buf + ow + ow); + int16_t* yCoef = (int16_t*)(buf + ow + ow + ow); + int16_t* yPos = (int16_t*)(buf + ow + ow + ow + oh + oh); for (int i = 0; i < ow; i++) { - float fx = ( float )((( float )i + offset) * _scale_x - offset); - int sx = ( int )fx; + float fx = (float)(((float)i + offset) * _scale_x - offset); + int sx = (int)fx; fx -= sx; if (sx < 0) { @@ -902,8 +902,8 @@ void tengine_resize_f32(float* data, float* res, int ow, int oh, int c, int h, i for (int j = 0; j < oh; j++) { - float fy = ( float )((( float )j + offset) * _scale_y - offset); - int sy = ( int )fy; + float fy = (float)(((float)j + offset) * _scale_y - offset); + int sy = (int)fy; fy -= sy; if (sy < 0) { @@ -921,7 +921,7 @@ void tengine_resize_f32(float* data, float* res, int ow, int oh, int c, int h, i } // int32_t* row = new int32_t[ow + ow]; - int32_t* row = ( int32_t* )malloc((ow + ow) * sizeof(int32_t)); + int32_t* row = (int32_t*)malloc((ow + ow) * sizeof(int32_t)); for (int k = 0; k < c; k++) { @@ -1021,7 +1021,7 @@ static void sort_cls_score(cls_score* array, int left, int right) void print_topk(float* data, int total_num, int topk) { - cls_score* cls_scores = ( cls_score* )malloc(total_num * sizeof(cls_score)); + cls_score* cls_scores = (cls_score*)malloc(total_num * sizeof(cls_score)); for (int i = 0; i < total_num; i++) { cls_scores[i].id = i; diff --git a/tests/common/util/mathp.c b/tests/common/util/mathp.c index c1e5933cd..52a78aa18 100644 --- a/tests/common/util/mathp.c +++ b/tests/common/util/mathp.c @@ -27,38 +27,32 @@ #include - int imin(int a, int b) { return a <= b ? a : b; } - int imax(int a, int b) { return a >= b ? a : b; } - int min_abs(int a, int b) { return imin(abs(a), abs(b)); } - int max_abs(int a, int b) { return imax(abs(a), abs(b)); } - static int solve_gcd(int large, int small) { int val = large % small; return 0 == val ? small : gcd(small, val); } - int gcd(int a, int b) { if (0 == a || 0 == b) @@ -67,7 +61,6 @@ int gcd(int a, int b) return solve_gcd(max_abs(a, b), min_abs(a, b)); } - int lcm(int a, int b) { if (0 == a || 0 == b) @@ -76,14 +69,12 @@ int lcm(int a, int b) return abs(a * b) / solve_gcd(max_abs(a, b), min_abs(a, b)); } - int align(int value, int step) { const int mask = ~(abs(step) - 1); return (value + step) & mask; } - void* align_address(void* address, int step) { const size_t mask = ~(abs(step) - 1); diff --git a/tests/common/util/mathp.h b/tests/common/util/mathp.h index 672ddcdc1..16a7c5d9d 100644 --- a/tests/common/util/mathp.h +++ b/tests/common/util/mathp.h @@ -25,7 +25,6 @@ #pragma once - /*! * @brief Solve min value * @@ -36,7 +35,6 @@ */ int imin(int a, int b); - /*! * @brief Solve max value * @@ -47,7 +45,6 @@ int imin(int a, int b); */ int imax(int a, int b); - /*! * @brief Solve min absolute value * @@ -58,7 +55,6 @@ int imax(int a, int b); */ int min_abs(int a, int b); - /*! * @brief Solve max absolute value * @@ -69,7 +65,6 @@ int min_abs(int a, int b); */ int max_abs(int a, int b); - /*! * @brief Solve greatest common divisor * @@ -80,7 +75,6 @@ int max_abs(int a, int b); */ int gcd(int a, int b); - /*! * @brief Solve lowest common multiple * @@ -91,7 +85,6 @@ int gcd(int a, int b); */ int lcm(int a, int b); - /*! * @brief Solve min aligned value with the step length * @@ -102,7 +95,6 @@ int lcm(int a, int b); */ int align(int value, int step); - /*! * @brief Get aligned pointer * diff --git a/tests/common/util/vector.c b/tests/common/util/vector.c index c4bfd87f6..636009936 100644 --- a/tests/common/util/vector.c +++ b/tests/common/util/vector.c @@ -31,25 +31,22 @@ #include - typedef struct vector_entry { int valid; unsigned char data[]; } vector_entry_t; - static inline vector_entry_t* get_vector_entry(vector_t* v, int idx) { return (vector_entry_t*)((char*)v->mem + v->entry_size * idx); } - static inline void free_vector_data_resource(vector_t* v, int idx) { vector_entry_t* e = get_vector_entry(v, idx); - if(e->valid && v->free_func) + if (e->valid && v->free_func) { v->free_func(e->data); } @@ -57,7 +54,6 @@ static inline void free_vector_data_resource(vector_t* v, int idx) e->valid = 0; } - static inline void remove_vector_data_not_tail(vector_t* v, int idx) { vector_entry_t* entry_ptr = NULL; @@ -78,7 +74,6 @@ static inline void remove_vector_data_not_tail(vector_t* v, int idx) entry_ptr->valid = 0; } - vector_t* create_vector(int elem_size, void (*free_data)(void*)) { vector_t* v = (vector_t*)malloc(sizeof(vector_t)); @@ -109,7 +104,6 @@ vector_t* create_vector(int elem_size, void (*free_data)(void*)) return v; } - void release_vector(vector_t* v) { for (int i = 0; i < v->elem_num; i++) @@ -121,7 +115,6 @@ void release_vector(vector_t* v) free(v); } - int get_vector_num(vector_t* v) { if (NULL != v) @@ -132,7 +125,6 @@ int get_vector_num(vector_t* v) return 0; } - int resize_vector(vector_t* v, int new_size) { void* new_mem = NULL; @@ -162,7 +154,7 @@ int resize_vector(vector_t* v, int new_size) } v->real_mem = new_mem; - v->mem = ( void* )(((size_t)(v->real_mem)) & (~(TE_VECTOR_ALIGN_SIZE - 1))); + v->mem = (void*)(((size_t)(v->real_mem)) & (~(TE_VECTOR_ALIGN_SIZE - 1))); for (int i = v->space_num; i < new_size; i++) { @@ -175,10 +167,9 @@ int resize_vector(vector_t* v, int new_size) return 0; } - int push_vector_data(vector_t* v, void* data) { - if(v->elem_num == v->space_num && resize_vector(v, v->elem_num + v->ahead_num) < 0) + if (v->elem_num == v->space_num && resize_vector(v, v->elem_num + v->ahead_num) < 0) { return -1; } @@ -189,12 +180,11 @@ int push_vector_data(vector_t* v, void* data) return 0; } - int set_vector_data(vector_t* v, int idx, void* data) { vector_entry_t* e = NULL; - if(idx >= v->elem_num) + if (idx >= v->elem_num) return -1; free_vector_data_resource(v, idx); @@ -207,10 +197,9 @@ int set_vector_data(vector_t* v, int idx, void* data) return 0; } - void* get_vector_data(vector_t* v, int index) { - if(index >= v->elem_num) + if (index >= v->elem_num) { return NULL; } @@ -220,7 +209,6 @@ void* get_vector_data(vector_t* v, int index) return e->data; } - int remove_vector_via_pointer(vector_t* v, void* data) { const int count = v->elem_num; @@ -245,11 +233,10 @@ int remove_vector_via_pointer(vector_t* v, void* data) return 0; } - void remove_vector_via_index(vector_t* v, int idx) { // the last one - if(idx == v->elem_num - 1) + if (idx == v->elem_num - 1) { free_vector_data_resource(v, idx); v->elem_num--; diff --git a/tests/common/util/vector.h b/tests/common/util/vector.h index ef7a97906..959985e11 100644 --- a/tests/common/util/vector.h +++ b/tests/common/util/vector.h @@ -25,25 +25,23 @@ #pragma once - /*! * @struct vector_t * @brief C style vector for consecutive storage. */ typedef struct vector { - int elem_size; //!< elements size which will be pushed into vector - int elem_num; //!< current counter of inserted elements - - int entry_size; //!< size of inside vector header entry - int space_num; //!< the allocated elements counter, which should greater equal to 'elem_num' - int ahead_num; //!< allocated step when vector is full - void* real_mem; //!< real aligned memory address which point to vector entry - void* mem; //!< visual aligned address which point to the very begging of elements - void (*free_func)(void*); //!< elements free function, will be called when release elements or vector + int elem_size; //!< elements size which will be pushed into vector + int elem_num; //!< current counter of inserted elements + + int entry_size; //!< size of inside vector header entry + int space_num; //!< the allocated elements counter, which should greater equal to 'elem_num' + int ahead_num; //!< allocated step when vector is full + void* real_mem; //!< real aligned memory address which point to vector entry + void* mem; //!< visual aligned address which point to the very begging of elements + void (*free_func)(void*); //!< elements free function, will be called when release elements or vector } vector_t; - /*! * @brief Create a vector for a struct(or something else). * @@ -56,7 +54,6 @@ typedef struct vector */ vector_t* create_vector(int elem_size, void (*free_func)(void*)); - /*! * @brief Release a vector. * @@ -64,7 +61,6 @@ vector_t* create_vector(int elem_size, void (*free_func)(void*)); */ void release_vector(vector_t* v); - /*! * @brief Get the count of elements. * @@ -74,7 +70,6 @@ void release_vector(vector_t* v); */ int get_vector_num(vector_t* v); - /*! * @brief Resize a vector. * @@ -85,7 +80,6 @@ int get_vector_num(vector_t* v); */ int resize_vector(vector_t* v, int new_size); - /*! * @brief Push a element into vector from its pointer. * @@ -96,7 +90,6 @@ int resize_vector(vector_t* v, int new_size); */ int push_vector_data(vector_t* v, void* data); - /*! * @brief Set a element via its index. * @@ -108,7 +101,6 @@ int push_vector_data(vector_t* v, void* data); */ int set_vector_data(vector_t* v, int index, void* data); - /*! * @brief Get a element via its index. * @@ -119,7 +111,6 @@ int set_vector_data(vector_t* v, int index, void* data); */ void* get_vector_data(vector_t* v, int index); - /*! * @brief Remove a element via its pointer. * @@ -130,7 +121,6 @@ void* get_vector_data(vector_t* v, int index); */ int remove_vector_via_pointer(vector_t* v, void* data); - /*! * @brief Remove a element via its index. * diff --git a/tests/models/test_model_alphapose.cpp b/tests/models/test_model_alphapose.cpp index 678b33bf3..02a5a84ad 100644 --- a/tests/models/test_model_alphapose.cpp +++ b/tests/models/test_model_alphapose.cpp @@ -37,24 +37,24 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 320 -#define DEFAULT_IMG_W 256 -#define DEFAULT_SCALE1 (0.0039216) -#define DEFAULT_SCALE2 (0.0039215) -#define DEFAULT_SCALE3 (0.0039215) -#define DEFAULT_MEAN1 0.406 -#define DEFAULT_MEAN2 0.457 -#define DEFAULT_MEAN3 0.480 +#define DEFAULT_IMG_H 320 +#define DEFAULT_IMG_W 256 +#define DEFAULT_SCALE1 (0.0039216) +#define DEFAULT_SCALE2 (0.0039215) +#define DEFAULT_SCALE3 (0.0039215) +#define DEFAULT_MEAN1 0.406 +#define DEFAULT_MEAN2 0.457 +#define DEFAULT_MEAN3 0.480 #define DEFAULT_REPEAT_COUNT 1 #define DEFAULT_THREAD_COUNT 1 const float s_keypoint_thresh = 0.2; int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -69,7 +69,7 @@ void show_usage() fprintf(stderr, "[Usage]: [-h]\n [-m model_file] [-r repeat_count] [-t thread_count]\n"); } -bool tengine_predict(float * input_data, graph_t graph, const int input_dims[4], const int & num_thread, const int & loop_count) +bool tengine_predict(float* input_data, graph_t graph, const int input_dims[4], const int& num_thread, const int& loop_count) { /* set runtime options */ struct options opt; @@ -144,20 +144,20 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -198,7 +198,7 @@ int main(int argc, char* argv[]) std::string model_name = "alphapose"; std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (fread(input_data1.data(), sizeof(float), img_size, fp) == 0) { @@ -219,11 +219,11 @@ int main(int argc, char* argv[]) int heatmap_dims[MAX_SHAPE_DIM_NUM] = {0}; get_tensor_shape(output_tensor, heatmap_dims, MAX_SHAPE_DIM_NUM); - float *data = (float *) (get_tensor_buffer(output_tensor)); + float* data = (float*)(get_tensor_buffer(output_tensor)); int output_size1 = get_tensor_buffer_size(output_tensor) / (sizeof(float)); std::string reference_file1 = "./data/" + model_name + "_out.bin"; std::vector reference_data1(output_size1); - FILE *fp1; + FILE* fp1; fp1 = fopen(reference_file1.c_str(), "rb"); if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0) { @@ -233,7 +233,6 @@ int main(int argc, char* argv[]) fclose(fp1); int ret1 = float_mismatch(data, reference_data1.data(), output_size1); - /* release tengine */ postrun_graph(graph); destroy_graph(graph); diff --git a/tests/models/test_model_classification.cpp b/tests/models/test_model_classification.cpp index caa348451..633e8f65b 100644 --- a/tests/models/test_model_classification.cpp +++ b/tests/models/test_model_classification.cpp @@ -36,24 +36,24 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 224 -#define DEFAULT_IMG_W 224 -#define DEFAULT_SCALE1 1.f -#define DEFAULT_SCALE2 1.f -#define DEFAULT_SCALE3 1.f -#define DEFAULT_MEAN1 104.007 -#define DEFAULT_MEAN2 116.669 -#define DEFAULT_MEAN3 122.679 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 224 +#define DEFAULT_IMG_W 224 +#define DEFAULT_SCALE1 1.f +#define DEFAULT_SCALE2 1.f +#define DEFAULT_SCALE3 1.f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 #define DEFAULT_CPU_AFFINITY 255 int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.001) + if (fabs(tmp) > 0.001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -82,7 +82,7 @@ int main(int argc, char* argv[]) int num_thread = DEFAULT_THREAD_COUNT; int cpu_affinity = DEFAULT_CPU_AFFINITY; std::string model_name; - std::string model_file; + std::string model_file; char* image_file = NULL; float img_hw[2] = {0.f}; int img_h = 0; @@ -95,37 +95,37 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_name = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_h = ( int )img_hw[0]; - img_w = ( int )img_hw[1]; - break; - case 's': - split(scale, optarg, ","); - break; - case 'w': - split(mean, optarg, ","); - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'a': - cpu_affinity = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_name = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_h = (int)img_hw[0]; + img_w = (int)img_hw[1]; + break; + case 's': + split(scale, optarg, ","); + break; + case 'w': + split(mean, optarg, ","); + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'a': + cpu_affinity = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -202,7 +202,7 @@ int main(int argc, char* argv[]) /* set the shape, data buffer of input_tensor of the graph */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw + int dims[] = {1, 3, img_h, img_w}; // nchw std::vector input_data(img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); @@ -222,7 +222,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -252,7 +252,7 @@ int main(int argc, char* argv[]) /* get the result of classification */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); print_topk(output_data, output_size, 5); @@ -261,12 +261,12 @@ int main(int argc, char* argv[]) /* check the result */ std::string reference_file = "./data/" + model_name + "_out.bin"; std::vector reference_data(output_size); - FILE *fp; + FILE* fp; fp = fopen(reference_file.c_str(), "rb"); if (!fp) { - fprintf(stderr, "read reference %s failed!\n",reference_file.c_str()); - return -1; + fprintf(stderr, "read reference %s failed!\n", reference_file.c_str()); + return -1; } if (fread(reference_data.data(), sizeof(float), output_size, fp) == 0) { @@ -282,5 +282,5 @@ int main(int argc, char* argv[]) destroy_graph(graph); release_tengine(); - return ret; + return ret; } diff --git a/tests/models/test_model_common.cpp b/tests/models/test_model_common.cpp index 26a3076f6..1f5b2ae7c 100644 --- a/tests/models/test_model_common.cpp +++ b/tests/models/test_model_common.cpp @@ -71,8 +71,8 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w) /* set the shape, data buffer of input_tensor of the graph */ int img_size = img_h * img_w * img_c; - int dims[] = {1, img_c, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, img_c, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -91,7 +91,7 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -105,7 +105,7 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w) { input_data[i] = 1.f; } - + /* run graph */ if (run_graph(graph, 1) < 0) { @@ -118,7 +118,7 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w) { /* get the result of classification */ tensor_t output_tensor = get_graph_output_tensor(graph, tensor_id, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); const char* tensor_name = get_tensor_name(output_tensor); fprintf(stderr, "test output tensor: %s begin\n", tensor_name); @@ -133,11 +133,11 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w) fprintf(stderr, "open val file %s failed.\n", val_data.c_str()); return -1; } - + std::string line_str; char* end; int onnx_out_size = 1; - while(std::getline(f, line_str)) + while (std::getline(f, line_str)) { // std::cout << line_str << std::endl; if (line_str == "shape:") @@ -157,12 +157,12 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w) float* onnx_out_data = (float*)malloc(sizeof(float) * onnx_out_size); int i = 0; - while(std::getline(f, line_str)) + while (std::getline(f, line_str)) { std::stringstream ss(line_str); std::string str; int j = 0; - while(getline(ss, str, ' ')) + while (getline(ss, str, ' ')) { float tmp = strtof32(str.c_str(), &end); onnx_out_data[i++] = tmp; @@ -182,7 +182,7 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w) } fprintf(stderr, "test model: %s pass!\n", model_file.c_str()); - + /* release tengine */ free(input_data); postrun_graph(graph); @@ -216,20 +216,20 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'g': - split(img_hw, optarg, ","); - img_c = ( int )img_hw[0]; - img_h = ( int )img_hw[1]; - img_w = ( int )img_hw[2]; - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'g': + split(img_hw, optarg, ","); + img_c = (int)img_hw[0]; + img_h = (int)img_hw[1]; + img_w = (int)img_hw[2]; + break; + case 'h': + show_usage(); + return 0; + default: + break; } } diff --git a/tests/models/test_model_crnn.cpp b/tests/models/test_model_crnn.cpp index e280054e7..9ae20d5fa 100644 --- a/tests/models/test_model_crnn.cpp +++ b/tests/models/test_model_crnn.cpp @@ -40,10 +40,10 @@ int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -93,23 +93,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } - std::string model_name="crnn_lite_dense"; + std::string model_name = "crnn_lite_dense"; /* check files */ if (model_file == nullptr) { @@ -145,7 +145,7 @@ int main(int argc, char* argv[]) int img_size = img_h * img_w * 1; int dims[] = {1, 1, img_h, img_w}; - float* input_data = ( float* )malloc(img_size * sizeof(float)); + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == nullptr) @@ -175,7 +175,7 @@ int main(int argc, char* argv[]) /* prepare process input data, set the data mem to input tensor */ std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (fread(input_data, sizeof(float), img_size, fp) == 0) { @@ -208,13 +208,13 @@ int main(int argc, char* argv[]) /* process the crnn result */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* ocr_data = ( float* )get_tensor_buffer(output_tensor); + float* ocr_data = (float*)get_tensor_buffer(output_tensor); /* check the result */ int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); std::string reference_file = "./data/" + model_name + "_out.bin"; std::vector reference_data(output_size); - FILE *fp1; + FILE* fp1; fp1 = fopen(reference_file.c_str(), "rb"); if (fread(reference_data.data(), sizeof(float), output_size, fp1) == 0) { @@ -225,7 +225,7 @@ int main(int argc, char* argv[]) int ret = float_mismatch(ocr_data, reference_data.data(), output_size); -// process_crnn_result(ocr_data, label_file); + // process_crnn_result(ocr_data, label_file); free(input_data); postrun_graph(graph); diff --git a/tests/models/test_model_efficientdet.c b/tests/models/test_model_efficientdet.c index 1a0ac9759..5ede1b1d6 100644 --- a/tests/models/test_model_efficientdet.c +++ b/tests/models/test_model_efficientdet.c @@ -31,24 +31,24 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 512 -#define DEFAULT_IMG_W 512 -#define DEFAULT_SCALE1 0.017124754f -#define DEFAULT_SCALE2 0.017507003f -#define DEFAULT_SCALE3 0.017429194f -#define DEFAULT_MEAN1 123.675 -#define DEFAULT_MEAN2 116.280 -#define DEFAULT_MEAN3 103.530 -#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_IMG_H 512 +#define DEFAULT_IMG_W 512 +#define DEFAULT_SCALE1 0.017124754f +#define DEFAULT_SCALE2 0.017507003f +#define DEFAULT_SCALE3 0.017429194f +#define DEFAULT_MEAN1 123.675 +#define DEFAULT_MEAN2 116.280 +#define DEFAULT_MEAN3 103.530 +#define DEFAULT_LOOP_COUNT 1 #define DEFAULT_THREAD_COUNT 1 #define DEFAULT_CPU_AFFINITY 255 int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.001) + if (fabs(tmp) > 0.001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -59,15 +59,18 @@ int float_mismatch(float* current, float* reference, int size) } void repeat(const float* arr, int arr_length, int times, float offset, - float* result, int arr_starts_from, int arr_stride) { + float* result, int arr_starts_from, int arr_stride) +{ int length = arr_length * times; - if (result == NULL) { + if (result == NULL) + { result = malloc(length * sizeof(float)); arr_starts_from = 0; } - for (int i = 0, j = 0; i < length; i++, j += arr_stride) { + for (int i = 0, j = 0; i < length; i++, j += arr_stride) + { result[j + arr_starts_from] = arr[i / times] + offset; } } @@ -78,9 +81,9 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in int PYRAMID_LEVELS[] = {3, 4, 5, 6, 7}; int STRIDES[] = {8, 16, 32, 64, 128}; float SCALES[] = { - (float) pow(2, 0.), - (float) pow(2, 1. / 3.), - (float) pow(2, 2. / 3.), + (float)pow(2, 0.), + (float)pow(2, 1. / 3.), + (float)pow(2, 2. / 3.), }; float RATIOS_X[] = {1.f, 1.4f, 0.7f}; float RATIOS_Y[] = {1.f, 0.7f, 1.4f}; @@ -117,8 +120,8 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in /* set the shape, data buffer of input_tensor of the graph */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -132,9 +135,7 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in fprintf(stderr, "Set input tensor shape failed\n"); return -1; } - - if (set_tensor_buffer(input_tensor, input_data, img_size * sizeof(float)) < 0) { fprintf(stderr, "Set input tensor buffer failed\n"); @@ -152,7 +153,7 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in float means[3] = {mean[0], mean[1], mean[2]}; float scales[3] = {scale[0], scale[1], scale[2]}; char* input_file = "./data/efficientdet_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file, "rb"); if (fread(input_data, sizeof(float), img_size, fp) == 0) @@ -191,19 +192,19 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in /* get the result of classification */ tensor_t output_tensor_regression = get_graph_output_tensor(graph, 0, 0); - float* output_data_regression = ( float* )get_tensor_buffer(output_tensor_regression); + float* output_data_regression = (float*)get_tensor_buffer(output_tensor_regression); int num_anchors = get_tensor_buffer_size(output_tensor_regression) / sizeof(float) / 4; tensor_t output_tensor_classification = get_graph_output_tensor(graph, 1, 0); - float* output_data_classification = ( float* )get_tensor_buffer(output_tensor_classification); + float* output_data_classification = (float*)get_tensor_buffer(output_tensor_classification); int num_classes = get_tensor_buffer_size(output_tensor_classification) / sizeof(float) / num_anchors; // postprocess char* output_file1 = "./data/efficientdet_out1.bin"; char* output_file2 = "./data/efficientdet_out2.bin"; - float* reference_data1 = (float*)malloc(num_anchors*sizeof(float)); - float* reference_data2 = (float*)malloc(num_classes*sizeof(float)); - FILE *fp1; + float* reference_data1 = (float*)malloc(num_anchors * sizeof(float)); + float* reference_data2 = (float*)malloc(num_classes * sizeof(float)); + FILE* fp1; //read fp1 = fopen(output_file1, "rb"); if (!fp1 || fread(reference_data1, sizeof(float), num_anchors, fp1) == 0) @@ -221,8 +222,8 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in fclose(fp1); int ret1 = float_mismatch(output_data_regression, reference_data1, num_anchors); int ret2 = float_mismatch(output_data_classification, reference_data2, num_classes); - - int ret = (ret1 | ret2 ); + + int ret = (ret1 | ret2); /* release tengine */ free(input_data); @@ -236,8 +237,8 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in void show_usage() { fprintf( - stderr, - "[Usage]: [-h]\n [-m model_file] \n [-r loop_count] [-t thread_count] [-a cpu_affinity]\n"); + stderr, + "[Usage]: [-h]\n [-m model_file] \n [-r loop_count] [-t thread_count] [-a cpu_affinity]\n"); } int main(int argc, char* argv[]) @@ -258,23 +259,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'a': - cpu_affinity = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'a': + cpu_affinity = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -304,7 +305,7 @@ int main(int argc, char* argv[]) scale[0] = DEFAULT_SCALE1; scale[1] = DEFAULT_SCALE2; scale[2] = DEFAULT_SCALE3; - } + } if (mean[0] == -1.0 || mean[1] == -1.0 || mean[2] == -1.0) { diff --git a/tests/models/test_model_hrnet.cpp b/tests/models/test_model_hrnet.cpp index 18f8aacfe..7d021808a 100644 --- a/tests/models/test_model_hrnet.cpp +++ b/tests/models/test_model_hrnet.cpp @@ -34,27 +34,29 @@ #define DEFAULT_REPEAT_COUNT 1 #define DEFAULT_THREAD_COUNT 1 -#define LETTERBOX_ROWS 256 -#define LETTERBOX_COLS 256 -#define MODEL_CHANNELS 3 -#define HEATMAP_CHANNEL 16 +#define LETTERBOX_ROWS 256 +#define LETTERBOX_COLS 256 +#define MODEL_CHANNELS 3 +#define HEATMAP_CHANNEL 16 -typedef struct { +typedef struct +{ float x; float y; float score; } ai_point_t; -struct skeleton { +struct skeleton +{ int connection[2]; int left_right_neutral; }; int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -69,13 +71,13 @@ void show_usage() fprintf(stderr, "[Usage]: [-h]\n [-m model_file] [-r repeat_count] [-t thread_count]\n"); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { int repeat_count = DEFAULT_REPEAT_COUNT; int num_thread = DEFAULT_THREAD_COUNT; char model_string[] = "./models/hrnet.tmfile"; - char *model_file = model_string; - char *image_file = nullptr; + char* model_file = model_string; + char* image_file = nullptr; int img_h = LETTERBOX_COLS; int img_w = LETTERBOX_ROWS; // ai_body_parts_s pose; @@ -88,20 +90,20 @@ int main(int argc, char *argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -116,7 +118,6 @@ int main(int argc, char *argv[]) if (!check_file_exist(model_file)) return -1; - /* set runtime options */ struct options opt; opt.num_thread = num_thread; @@ -142,7 +143,7 @@ int main(int argc, char *argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw + int dims[] = {1, 3, img_h, img_w}; // nchw std::vector input_data(img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); @@ -174,7 +175,7 @@ int main(int argc, char *argv[]) /* prepare process input data, set the data mem to input tensor */ std::string model_name = "hrnet"; std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (fread(input_data.data(), sizeof(float), img_size, fp) == 0) { @@ -207,12 +208,12 @@ int main(int argc, char *argv[]) /* get output tensor */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float *data = (float *) (get_tensor_buffer(output_tensor)); + float* data = (float*)(get_tensor_buffer(output_tensor)); int output_size1 = get_tensor_buffer_size(output_tensor) / (sizeof(float)); std::string reference_file1 = "./data/" + model_name + "_out.bin"; std::vector reference_data1(output_size1); - FILE *fp1; + FILE* fp1; fp1 = fopen(reference_file1.c_str(), "rb"); if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0) { @@ -221,7 +222,7 @@ int main(int argc, char *argv[]) } fclose(fp1); int ret1 = float_mismatch(data, reference_data1.data(), output_size1); - + postrun_graph(graph); destroy_graph(graph); release_tengine(); diff --git a/tests/models/test_model_landmark.cpp b/tests/models/test_model_landmark.cpp index 9ece39520..4a5f442e5 100644 --- a/tests/models/test_model_landmark.cpp +++ b/tests/models/test_model_landmark.cpp @@ -35,10 +35,10 @@ #include int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -51,7 +51,7 @@ void get_input_fp32_data(const char* image_file, float* input_data, int img_h, i { image img = imread_process(image_file, img_w, img_h, mean, scale); - float* image_data = ( float* )img.data; + float* image_data = (float*)img.data; for (int i = 0; i < img_w * img_h * 3; i++) input_data[i] = image_data[i]; @@ -80,20 +80,20 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -129,8 +129,8 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = (float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == nullptr) @@ -159,10 +159,10 @@ int main(int argc, char* argv[]) } /* prepare process input data, set the data mem to input tensor */ - std::string model_name="landmark"; + std::string model_name = "landmark"; // get_input_fp32_data(image_file, input_data, img_h, img_w, mean, scale); std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (fread(input_data, sizeof(float), img_size, fp) == 0) { @@ -197,13 +197,13 @@ int main(int argc, char* argv[]) /* get output tensor */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )(get_tensor_buffer(output_tensor)); - int data_size = get_tensor_buffer_size(output_tensor) / sizeof(float ); + float* output_data = (float*)(get_tensor_buffer(output_tensor)); + int data_size = get_tensor_buffer_size(output_tensor) / sizeof(float); // save output_data std::string reference_file1 = "./data/" + model_name + "_out.bin"; std::vector reference_data(data_size); - FILE *fp1; + FILE* fp1; //read fp1 = fopen(reference_file1.c_str(), "rb"); if (fread(reference_data.data(), sizeof(float), data_size, fp1) == 0) diff --git a/tests/models/test_model_mobilefacenet.cpp b/tests/models/test_model_mobilefacenet.cpp index 38a30050d..5ff9b88ba 100644 --- a/tests/models/test_model_mobilefacenet.cpp +++ b/tests/models/test_model_mobilefacenet.cpp @@ -36,7 +36,7 @@ #define DEFAULT_MEAN3 122.679 #define MOBILE_FACE_HEIGHT 110 -#define MOBILE_FACE_WIDTH 110 +#define MOBILE_FACE_WIDTH 110 graph_t graph; tensor_t input_tensor; @@ -45,10 +45,10 @@ int feature_len; int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -86,7 +86,7 @@ int getFeature_a(const char* imagefile, float* feature) std::vector input_data(img_size); std::string model_name = "mobilefacenet"; std::string input_file = "./data/" + model_name + "_in1.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (fread(input_data.data(), sizeof(float), img_size, fp) == 0) { @@ -94,7 +94,7 @@ int getFeature_a(const char* imagefile, float* feature) return -1; } fclose(fp); - + set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float)); if (run_graph(graph, 1) < 0) @@ -102,7 +102,7 @@ int getFeature_a(const char* imagefile, float* feature) fprintf(stderr, "run_graph fail"); return -1; } - float* data = ( float* )get_tensor_buffer(output_tensor); + float* data = (float*)get_tensor_buffer(output_tensor); int outsize; outsize = get_tensor_buffer_size(output_tensor) / sizeof(float); for (int i = 0; i < outsize; i++) @@ -111,7 +111,7 @@ int getFeature_a(const char* imagefile, float* feature) // save output_data std::string reference_file1 = "./data/" + model_name + "_out1.bin"; std::vector reference_data1(outsize); - FILE *fp1; + FILE* fp1; //read fp1 = fopen(reference_file1.c_str(), "rb"); if (fread(reference_data1.data(), sizeof(float), outsize, fp1) == 0) @@ -135,7 +135,7 @@ int getFeature_b(const char* imagefile, float* feature) std::vector input_data(img_size); std::string model_name = "mobilefacenet"; std::string input_file = "./data/" + model_name + "_in2.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (fread(input_data.data(), sizeof(float), img_size, fp) == 0) { @@ -149,7 +149,7 @@ int getFeature_b(const char* imagefile, float* feature) fprintf(stderr, "run_graph fail"); return -1; } - float* data = ( float* )get_tensor_buffer(output_tensor); + float* data = (float*)get_tensor_buffer(output_tensor); int outsize; outsize = get_tensor_buffer_size(output_tensor) / sizeof(float); for (int i = 0; i < outsize; i++) @@ -158,7 +158,7 @@ int getFeature_b(const char* imagefile, float* feature) // save output_data std::string reference_file1 = "./data/" + model_name + "_out2.bin"; std::vector reference_data1(outsize); - FILE *fp1; + FILE* fp1; //read fp1 = fopen(reference_file1.c_str(), "rb"); if (fread(reference_data1.data(), sizeof(float), outsize, fp1) == 0) @@ -196,14 +196,14 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -226,7 +226,7 @@ int main(int argc, char* argv[]) int outputsizea = getFeature_a(person_a, featurea.data()); int outputsizeb = getFeature_b(person_b, featureb.data()); - int ret = (outputsizea | outputsizeb); + int ret = (outputsizea | outputsizeb); release(); return ret; } \ No newline at end of file diff --git a/tests/models/test_model_mobilenet_ssd.c b/tests/models/test_model_mobilenet_ssd.c index 8f65a5eda..134708761 100644 --- a/tests/models/test_model_mobilenet_ssd.c +++ b/tests/models/test_model_mobilenet_ssd.c @@ -29,15 +29,15 @@ #include "tengine_operations.h" #define DEFAULT_MAX_BOX_COUNT 100 -#define DEFAULT_REPEAT_COUNT 1 -#define DEFAULT_THREAD_COUNT 1 +#define DEFAULT_REPEAT_COUNT 1 +#define DEFAULT_THREAD_COUNT 1 int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -68,20 +68,20 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -117,8 +117,8 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -137,7 +137,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -147,9 +147,9 @@ int main(int argc, char* argv[]) } /* prepare process input data, set the data mem to input tensor */ - char *model_name="mobilenet_ssd"; + char* model_name = "mobilenet_ssd"; char* input_file = "./data/mobilenet_ssd_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file, "rb"); if (fread(input_data, sizeof(float), img_size, fp) == 0) { @@ -183,15 +183,15 @@ int main(int argc, char* argv[]) fprintf(stderr, "--------------------------------------\n"); /* process the detection result */ - tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out" + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out" int out_dim[4]; get_tensor_shape(output_tensor, out_dim, 4); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size1 = get_tensor_buffer_size(output_tensor) / sizeof(float); char* reference_file1 = "./data/mobilenet_ssd_out.bin"; - float* reference_data1=(float* )malloc(output_size1*4); - FILE *fp1; + float* reference_data1 = (float*)malloc(output_size1 * 4); + FILE* fp1; //read fp1 = fopen(reference_file1, "rb"); if (fread(reference_data1, sizeof(float), output_size1, fp1) == 0) diff --git a/tests/models/test_model_nanodet_m.cpp b/tests/models/test_model_nanodet_m.cpp index eb5cc9300..8fd17dc0d 100644 --- a/tests/models/test_model_nanodet_m.cpp +++ b/tests/models/test_model_nanodet_m.cpp @@ -36,22 +36,21 @@ #include "tengine_operations.h" // tengine output tensor names -const char *cls_pred_name[] = { - "cls_pred_stride_8", "cls_pred_stride_16", "cls_pred_stride_32" -}; -const char *dis_pred_name[] = { +const char* cls_pred_name[] = { + "cls_pred_stride_8", "cls_pred_stride_16", "cls_pred_stride_32"}; +const char* dis_pred_name[] = { #ifdef TRY_POST_SOFTMAX "dis_pred_stride_8", "dis_pred_stride_16", "dis_pred_stride_32" -#else /* !TRY_POST_SOFTMAX */ +#else /* !TRY_POST_SOFTMAX */ "dis_sm_stride_8", "dis_sm_stride_16", "dis_sm_stride_32" #endif /* TRY_POST_SOFTMAX */ }; int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -61,15 +60,17 @@ int float_mismatch(float* current, float* reference, int size) return 0; } -static void show_usage() { +static void show_usage() +{ fprintf(stderr, "[Usage]: [-h]\n"); fprintf(stderr, " [-m model_file] [-r repeat_count] [-t thread_count] [-o output_file]\n"); } -int main(int argc, char* argv[]) { +int main(int argc, char* argv[]) +{ const char* model_file = "./models/nanodet.tmfile"; - const float mean[3] = { 103.53f, 116.28f, 123.675f }; // bgr - const float norm[3] = { 0.017429f, 0.017507f, 0.017125f }; + const float mean[3] = {103.53f, 116.28f, 123.675f}; // bgr + const float norm[3] = {0.017429f, 0.017507f, 0.017125f}; int repeat_count = 1; int num_thread = 1; @@ -78,32 +79,36 @@ int main(int argc, char* argv[]) { const float nms_threshold = 0.5f; int res; - while ((res = getopt(argc, argv, "m:i:o:r:t:h:")) != -1) { - switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + while ((res = getopt(argc, argv, "m:i:o:r:t:h:")) != -1) + { + switch (res) + { + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } /* check files */ - if (nullptr == model_file) { + if (nullptr == model_file) + { fprintf(stderr, "Error: Tengine model file not specified!\n"); show_usage(); return -1; } - if (!check_file_exist(model_file)) { + if (!check_file_exist(model_file)) + { return -1; } @@ -115,7 +120,8 @@ int main(int argc, char* argv[]) { opt.affinity = 0; /* inital tengine */ - if (0 != init_tengine()) { + if (0 != init_tengine()) + { fprintf(stderr, "Initial tengine failed.\n"); return -1; } @@ -123,25 +129,27 @@ int main(int argc, char* argv[]) { /* create graph, load tengine model xxx.tmfile */ graph_t graph = create_graph(nullptr, "tengine", model_file); - if (nullptr == graph) { + if (nullptr == graph) + { fprintf(stderr, "Create graph failed.\n"); return -1; } /* get input tensor of graph */ tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); - if (nullptr == input_tensor) { + if (nullptr == input_tensor) + { fprintf(stderr, "Get input tensor failed\n"); return -1; } - int img_size = 320 * 320 * 3; // lb.w * lb.h * lb.c; + int img_size = 320 * 320 * 3; // lb.w * lb.h * lb.c; std::string model_name = "nanodet"; std::string input_file = "./data/" + model_name + "_in.bin"; - std::vectorinput_data(img_size * sizeof(float )); + std::vector input_data(img_size * sizeof(float)); - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (fread(input_data.data(), sizeof(float), img_size, fp) == 0) { @@ -150,12 +158,14 @@ int main(int argc, char* argv[]) { } fclose(fp); /* set the data mem to input tensor */ - if (set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float)) < 0) { + if (set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float)) < 0) + { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; } /* prerun graph to infer shape, and set work options(num_thread, cluster, precision) */ - if (prerun_graph_multithread(graph, opt) < 0) { + if (prerun_graph_multithread(graph, opt) < 0) + { fprintf(stderr, "Prerun multithread graph failed.\n"); return -1; } @@ -164,9 +174,11 @@ int main(int argc, char* argv[]) { double min_time = DBL_MAX; double max_time = DBL_MIN; double total_time = 0.; - for (int i = 0; i < repeat_count; i++) { + for (int i = 0; i < repeat_count; i++) + { double start = get_current_time(); - if (run_graph(graph, 1) < 0) { + if (run_graph(graph, 1) < 0) + { fprintf(stderr, "Run graph failed\n"); return -1; } @@ -177,30 +189,32 @@ int main(int argc, char* argv[]) { max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); - int ret = 0; + int ret = 0; /* nanodet_m postprocess */ // std::vector proposals, objects; - for (int stride_index = 0; stride_index < 3; stride_index++) { + for (int stride_index = 0; stride_index < 3; stride_index++) + { tensor_t cls_tensor = get_graph_tensor(graph, cls_pred_name[stride_index]); tensor_t dis_tensor = get_graph_tensor(graph, dis_pred_name[stride_index]); - if (NULL == cls_tensor || NULL ==dis_tensor) { + if (NULL == cls_tensor || NULL == dis_tensor) + { fprintf(stderr, "get graph tensor failed\n"); return -1; } - float *cls_pred = (float *)get_tensor_buffer(cls_tensor); - float *dis_pred = (float *)get_tensor_buffer(dis_tensor); + float* cls_pred = (float*)get_tensor_buffer(cls_tensor); + float* dis_pred = (float*)get_tensor_buffer(dis_tensor); // save output_data int output_size1 = get_tensor_buffer_size(cls_tensor) / sizeof(float); int output_size2 = get_tensor_buffer_size(dis_tensor) / sizeof(float); - std::string reference_file1 = "./data/" + model_name + "_out" + std::to_string(stride_index*2+1) +".bin"; - std::string reference_file2 = "./data/" + model_name + "_out" + std::to_string(stride_index*2+2) +".bin"; + std::string reference_file1 = "./data/" + model_name + "_out" + std::to_string(stride_index * 2 + 1) + ".bin"; + std::string reference_file2 = "./data/" + model_name + "_out" + std::to_string(stride_index * 2 + 2) + ".bin"; std::vector reference_data1(output_size1); std::vector reference_data2(output_size2); - FILE *fp1; + FILE* fp1; //read fp1 = fopen(reference_file1.c_str(), "rb"); if (!fp || fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0) @@ -218,7 +232,7 @@ int main(int argc, char* argv[]) { fclose(fp1); int ret1 = float_mismatch(cls_pred, reference_data1.data(), output_size1); int ret2 = float_mismatch(dis_pred, reference_data2.data(), output_size2); - ret = ret | (ret1 | ret2); + ret = ret | (ret1 | ret2); } /* release tengine */ @@ -227,4 +241,3 @@ int main(int argc, char* argv[]) { release_tengine(); return ret; } - diff --git a/tests/models/test_model_openpose.cpp b/tests/models/test_model_openpose.cpp index e304e42f0..67074c78d 100644 --- a/tests/models/test_model_openpose.cpp +++ b/tests/models/test_model_openpose.cpp @@ -40,10 +40,10 @@ int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -72,20 +72,20 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -122,9 +122,9 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int channel = 3; int img_size = img_h * img_w * channel; - int dims[] = {1, channel, img_h, img_w}; // nchw + int dims[] = {1, channel, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(sizeof(float) * img_size); + float* input_data = (float*)malloc(sizeof(float) * img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == nullptr) @@ -143,7 +143,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -155,7 +155,7 @@ int main(int argc, char* argv[]) /* prepare process input data, set the data mem to input tensor */ std::string model_name = "openpose_coco"; std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (fread(input_data, sizeof(float), img_size, fp) == 0) { @@ -195,7 +195,7 @@ int main(int argc, char* argv[]) return -1; } - float* outdata = ( float* )get_tensor_buffer(out_tensor); + float* outdata = (float*)get_tensor_buffer(out_tensor); int H = out_dim[2]; int W = out_dim[3]; float show_threshold = 0.1; @@ -203,7 +203,7 @@ int main(int argc, char* argv[]) std::string reference_file1 = "./data/" + model_name + "_out.bin"; int output_size1 = get_tensor_buffer_size(out_tensor) / (sizeof(float)); std::vector reference_data1(output_size1); - FILE *fp1; + FILE* fp1; fp1 = fopen(reference_file1.c_str(), "rb"); if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0) { @@ -221,4 +221,3 @@ int main(int argc, char* argv[]) return ret1; } - diff --git a/tests/models/test_model_retinaface.cpp b/tests/models/test_model_retinaface.cpp index 6233ffa80..17f33b609 100644 --- a/tests/models/test_model_retinaface.cpp +++ b/tests/models/test_model_retinaface.cpp @@ -60,7 +60,6 @@ #define MODEL_PATH "models/retinaface.tmfile" - const float CONF_THRESH = 0.8f; const float NMS_THRESH = 0.4f; @@ -76,13 +75,12 @@ const int stride[3] = {32, 16, 8}; const float scales[3][2] = {{32.f, 16.f}, {8.f, 4.f}, {2.f, 1.f}}; - int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -92,7 +90,6 @@ int float_mismatch(float* current, float* reference, int size) return 0; } - void show_usage() { printf("[Usage]: [-h]\n [-m model_file] [-r repeat_count] [-t thread_count] [-n device_name]\n"); @@ -112,23 +109,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'n': - device_name = optarg; - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'n': + device_name = optarg; + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -140,7 +137,6 @@ int main(int argc, char* argv[]) return -1; } - if (!check_file_exist(model_file)) return -1; @@ -149,7 +145,7 @@ int main(int argc, char* argv[]) opt.num_thread = num_thread; opt.cluster = TENGINE_CLUSTER_ALL; opt.precision = TENGINE_MODE_FP32; - opt.affinity = 0; + opt.affinity = 0; /* inital tengine */ int ret = init_tengine(); @@ -175,19 +171,18 @@ int main(int argc, char* argv[]) int img_size = height * width * 3; std::vector image_data(img_size * sizeof(float)); - std::string model_name = "retinaface"; std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); - if (!fp ) + if (!fp) { - fprintf(stderr, "open input file %s failed!\n",input_file.c_str()); + fprintf(stderr, "open input file %s failed!\n", input_file.c_str()); return -1; } if (!fp || fread(image_data.data(), sizeof(float), img_size, fp) == 0) { - fprintf(stderr, "read input file %s failed!\n",input_file.c_str()); + fprintf(stderr, "read input file %s failed!\n", input_file.c_str()); return -1; } fclose(fp); @@ -214,7 +209,7 @@ int main(int argc, char* argv[]) { printf("Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (0 != prerun_graph_multithread(graph, opt)) @@ -242,7 +237,7 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } printf("Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, - num_thread, total_time / ( float )repeat_count, max_time, min_time); + num_thread, total_time / (float)repeat_count, max_time, min_time); printf("--------------------------------------\n"); /* process the detection result */ @@ -264,41 +259,41 @@ int main(int argc, char* argv[]) get_tensor_shape(bbox_blob_tensor, bbox_blob_dims, MAX_SHAPE_DIM_NUM); get_tensor_shape(landmark_blob_tensor, landmark_blob_dims, MAX_SHAPE_DIM_NUM); - float* score_blob = ( float* )get_tensor_buffer(score_blob_tensor); - float* bbox_blob = ( float* )get_tensor_buffer(bbox_blob_tensor); - float* landmark_blob = ( float* )get_tensor_buffer(landmark_blob_tensor); + float* score_blob = (float*)get_tensor_buffer(score_blob_tensor); + float* bbox_blob = (float*)get_tensor_buffer(bbox_blob_tensor); + float* landmark_blob = (float*)get_tensor_buffer(landmark_blob_tensor); // save output_data int output_size1 = get_tensor_buffer_size(score_blob_tensor) / sizeof(float); int output_size2 = get_tensor_buffer_size(bbox_blob_tensor) / sizeof(float); int output_size3 = get_tensor_buffer_size(landmark_blob_tensor) / sizeof(float); - std::string reference_file1 = "./data/" + model_name + "_out" + std::to_string(stride_index*3+1) +".bin"; - std::string reference_file2 = "./data/" + model_name + "_out" + std::to_string(stride_index*3+2) +".bin"; - std::string reference_file3 = "./data/" + model_name + "_out" + std::to_string(stride_index*3+3) +".bin"; + std::string reference_file1 = "./data/" + model_name + "_out" + std::to_string(stride_index * 3 + 1) + ".bin"; + std::string reference_file2 = "./data/" + model_name + "_out" + std::to_string(stride_index * 3 + 2) + ".bin"; + std::string reference_file3 = "./data/" + model_name + "_out" + std::to_string(stride_index * 3 + 3) + ".bin"; std::vector reference_data1(output_size1); std::vector reference_data2(output_size2); std::vector reference_data3(output_size3); - FILE *fp1; + FILE* fp1; //read fp1 = fopen(reference_file1.c_str(), "rb"); if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0) { - fprintf(stderr, "read reference %s failed!\n",reference_file1.c_str()); + fprintf(stderr, "read reference %s failed!\n", reference_file1.c_str()); return -1; } fclose(fp1); fp1 = fopen(reference_file2.c_str(), "rb"); if (fread(reference_data2.data(), sizeof(float), output_size2, fp1) == 0) { - fprintf(stderr, "read reference %s failed!\n",reference_file2.c_str()); + fprintf(stderr, "read reference %s failed!\n", reference_file2.c_str()); return -1; } fclose(fp1); fp1 = fopen(reference_file3.c_str(), "rb"); if (fread(reference_data3.data(), sizeof(float), output_size3, fp1) == 0) { - fprintf(stderr, "read reference %s failed!\n",reference_file3.c_str()); + fprintf(stderr, "read reference %s failed!\n", reference_file3.c_str()); return -1; } fclose(fp1); diff --git a/tests/models/test_model_ultraface.cpp b/tests/models/test_model_ultraface.cpp index c2b9727a4..2c7991459 100644 --- a/tests/models/test_model_ultraface.cpp +++ b/tests/models/test_model_ultraface.cpp @@ -31,12 +31,12 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_REPEAT_COUNT 1 -#define DEFAULT_THREAD_COUNT 1 -#define num_featuremap 4 -#define hard_nms 1 -#define blending_nms 2 /* mix nms was been proposaled in paper blaze face, aims to minimize the temporal jitter*/ -#define clip(x, y) (x < 0 ? 0 : (x > y ? y : x)) +#define DEFAULT_REPEAT_COUNT 1 +#define DEFAULT_THREAD_COUNT 1 +#define num_featuremap 4 +#define hard_nms 1 +#define blending_nms 2 /* mix nms was been proposaled in paper blaze face, aims to minimize the temporal jitter*/ +#define clip(x, y) (x < 0 ? 0 : (x > y ? y : x)) typedef struct FaceInfo { @@ -57,10 +57,10 @@ const float g_center_variance = 0.1f; const float g_size_variance = 0.2f; int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -88,20 +88,20 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } std::string model_name = "version-RFB-320_simplified"; @@ -114,7 +114,6 @@ int main(int argc, char* argv[]) return -1; } - if (!check_file_exist(model_file)) return -1; @@ -139,8 +138,8 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = g_tensor_in_h * g_tensor_in_w * 3; - int dims[] = {1, 3, g_tensor_in_h, g_tensor_in_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, g_tensor_in_h, g_tensor_in_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -159,7 +158,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -171,7 +170,7 @@ int main(int argc, char* argv[]) /* prepare process input data, set the data mem to input tensor */ //save input std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (fread(input_data, sizeof(float), img_size, fp) == 0) @@ -181,7 +180,6 @@ int main(int argc, char* argv[]) } fclose(fp); - /* run graph */ double min_time = DBL_MAX; double max_time = DBL_MIN; @@ -210,8 +208,8 @@ int main(int argc, char* argv[]) tensor_t boxs_tensor = get_graph_output_tensor(graph, 0, 0); tensor_t scores_tensor = get_graph_output_tensor(graph, 1, 0); - float* boxs_data = (float* )get_tensor_buffer(boxs_tensor); - float* scores_data = (float* )get_tensor_buffer(scores_tensor); + float* boxs_data = (float*)get_tensor_buffer(boxs_tensor); + float* scores_data = (float*)get_tensor_buffer(scores_tensor); // save output_data int output_size1 = get_tensor_buffer_size(boxs_tensor) / sizeof(float); @@ -220,7 +218,7 @@ int main(int argc, char* argv[]) std::string reference_file2 = "./data/" + model_name + "_out2.bin"; std::vector reference_data1(output_size1); std::vector reference_data2(output_size2); - FILE *fp1; + FILE* fp1; //write //read @@ -252,4 +250,3 @@ int main(int argc, char* argv[]) return ret; } - diff --git a/tests/models/test_model_unet.cpp b/tests/models/test_model_unet.cpp index 2380d5081..71df233c0 100644 --- a/tests/models/test_model_unet.cpp +++ b/tests/models/test_model_unet.cpp @@ -35,17 +35,17 @@ #include "tengine/c_api.h" #include "tengine_operations.h" -#define DEFAULT_IMG_H 512 -#define DEFAULT_IMG_W 512 -#define DEFAULT_SCALE1 (1.f/255.f) -#define DEFAULT_SCALE2 (1.f/255.f) -#define DEFAULT_SCALE3 (1.f/255.f) -#define DEFAULT_MEAN1 0 -#define DEFAULT_MEAN2 0 -#define DEFAULT_MEAN3 0 -#define DEFAULT_LOOP_COUNT 1 -#define DEFAULT_THREAD_COUNT 1 -#define DEFAULT_CPU_AFFINITY 255 +#define DEFAULT_IMG_H 512 +#define DEFAULT_IMG_W 512 +#define DEFAULT_SCALE1 (1.f / 255.f) +#define DEFAULT_SCALE2 (1.f / 255.f) +#define DEFAULT_SCALE3 (1.f / 255.f) +#define DEFAULT_MEAN1 0 +#define DEFAULT_MEAN2 0 +#define DEFAULT_MEAN3 0 +#define DEFAULT_LOOP_COUNT 1 +#define DEFAULT_THREAD_COUNT 1 +#define DEFAULT_CPU_AFFINITY 255 #define DEFAULT_CONF_THRESHOLD 0.5f /** @@ -56,10 +56,10 @@ */ int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -70,7 +70,7 @@ int float_mismatch(float* current, float* reference, int size) } int tengine_segment(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean, - const float* scale, int loop_count, int num_thread, int affinity, float conf_thresh) + const float* scale, int loop_count, int num_thread, int affinity, float conf_thresh) { /* set runtime options */ struct options opt; @@ -97,8 +97,8 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i /* set the shape, data buffer of input_tensor of the graph */ int img_size = img_h * img_w * 3; - int dims[] = {1, 3, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == NULL) @@ -117,7 +117,7 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -129,7 +129,7 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i /* prepare process input data, set the data mem to input tensor */ std::string model_name = "unet"; std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (!fp || fread(input_data, sizeof(float), img_size, fp) == 0) @@ -168,12 +168,12 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i /* get the result of classification */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); std::string reference_file1 = "./data/" + model_name + "_out.bin"; std::vector reference_data1(output_size); - FILE *fp1; + FILE* fp1; fp1 = fopen(reference_file1.c_str(), "rb"); if (!fp || fread(reference_data1.data(), sizeof(float), output_size, fp1) == 0) { @@ -182,9 +182,9 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i } fclose(fp1); int ret1 = float_mismatch(output_data, reference_data1.data(), output_size); - /* single class segmentation */ - /* multi-class segmentation */ - /* visualization */ + /* single class segmentation */ + /* multi-class segmentation */ + /* visualization */ /* release tengine */ free(input_data); @@ -222,26 +222,26 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - loop_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'a': - cpu_affinity = atoi(optarg); - break; - case 'c': - conf_thresh = atof(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + loop_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'a': + cpu_affinity = atoi(optarg); + break; + case 'c': + conf_thresh = atof(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -253,26 +253,23 @@ int main(int argc, char* argv[]) return -1; } - if (!check_file_exist(model_file)) return -1; - img_h = DEFAULT_IMG_H; + img_h = DEFAULT_IMG_H; - img_w = DEFAULT_IMG_W; + img_w = DEFAULT_IMG_W; - scale[0] = DEFAULT_SCALE1; - scale[1] = DEFAULT_SCALE2; - scale[2] = DEFAULT_SCALE3; - - mean[0] = DEFAULT_MEAN1; - mean[1] = DEFAULT_MEAN2; - mean[2] = DEFAULT_MEAN3; + scale[0] = DEFAULT_SCALE1; + scale[1] = DEFAULT_SCALE2; + scale[2] = DEFAULT_SCALE3; + mean[0] = DEFAULT_MEAN1; + mean[1] = DEFAULT_MEAN2; + mean[2] = DEFAULT_MEAN3; if (tengine_segment(model_file, image_file, img_h, img_w, mean, scale, loop_count, num_thread, cpu_affinity, conf_thresh) < 0) return -1; return 0; } - diff --git a/tests/models/test_model_yolact.cpp b/tests/models/test_model_yolact.cpp index 20af24355..83245f2f7 100644 --- a/tests/models/test_model_yolact.cpp +++ b/tests/models/test_model_yolact.cpp @@ -53,10 +53,10 @@ int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -83,20 +83,20 @@ int main(int argc, char** argv) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -138,7 +138,7 @@ int main(int argc, char** argv) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = target_size * target_size * 3; - int dims[] = {1, 3, target_size, target_size}; // nchw + int dims[] = {1, 3, target_size, target_size}; // nchw std::vector input_data(img_size); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); @@ -158,7 +158,7 @@ int main(int argc, char** argv) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -170,7 +170,7 @@ int main(int argc, char** argv) /* prepare process input data, set the data mem to input tensor */ std::string model_name = "yolact"; std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (fread(input_data.data(), sizeof(float), img_size, fp) == 0) { @@ -206,10 +206,10 @@ int main(int argc, char** argv) tensor_t location_tensor = get_graph_output_tensor(graph, 2, 0); tensor_t mask_tensor = get_graph_output_tensor(graph, 3, 0); tensor_t confidence_tensor = get_graph_output_tensor(graph, 4, 0); - float* maskmaps = ( float* )get_tensor_buffer(maskmaps_tensor); - float* location = ( float* )get_tensor_buffer(location_tensor); - float* mask = ( float* )get_tensor_buffer(mask_tensor); - float* confidence = ( float* )get_tensor_buffer(confidence_tensor); + float* maskmaps = (float*)get_tensor_buffer(maskmaps_tensor); + float* location = (float*)get_tensor_buffer(location_tensor); + float* mask = (float*)get_tensor_buffer(mask_tensor); + float* confidence = (float*)get_tensor_buffer(confidence_tensor); // save output_data int output_size1 = get_tensor_buffer_size(maskmaps_tensor) / sizeof(float); @@ -224,32 +224,32 @@ int main(int argc, char** argv) std::vector reference_data2(output_size2); std::vector reference_data3(output_size3); std::vector reference_data4(output_size4); - FILE *fp1; + FILE* fp1; fp1 = fopen(reference_file1.c_str(), "rb"); if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0) { - fprintf(stderr, "read %s data failed!\n",reference_file1.c_str()); + fprintf(stderr, "read %s data failed!\n", reference_file1.c_str()); return -1; } fclose(fp1); fp1 = fopen(reference_file2.c_str(), "rb"); if (fread(reference_data2.data(), sizeof(float), output_size2, fp1) == 0) { - fprintf(stderr, "read %s data failed!\n",reference_file2.c_str()); + fprintf(stderr, "read %s data failed!\n", reference_file2.c_str()); return -1; } fclose(fp1); fp1 = fopen(reference_file3.c_str(), "rb"); if (fread(reference_data3.data(), sizeof(float), output_size3, fp1) == 0) { - fprintf(stderr, "read %s data failed!\n",reference_file3.c_str()); + fprintf(stderr, "read %s data failed!\n", reference_file3.c_str()); return -1; } fclose(fp1); fp1 = fopen(reference_file4.c_str(), "rb"); if (fread(reference_data4.data(), sizeof(float), output_size4, fp1) == 0) { - fprintf(stderr, "read %s data failed!\n",reference_file4.c_str()); + fprintf(stderr, "read %s data failed!\n", reference_file4.c_str()); return -1; } fclose(fp1); @@ -267,4 +267,3 @@ int main(int argc, char** argv) return ret; } - diff --git a/tests/models/test_model_yolofastest.cpp b/tests/models/test_model_yolofastest.cpp index 1f90d05de..a67fb1e4b 100644 --- a/tests/models/test_model_yolofastest.cpp +++ b/tests/models/test_model_yolofastest.cpp @@ -23,7 +23,7 @@ * * original model: https://github.com/dog-qiuqiu/Yolo-Fastest/tree/master/ModelZoo/yolo-fastest-1.1_coco */ - + #include #include #include @@ -44,10 +44,10 @@ #define DEFAULT_THREAD_COUNT 1 int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -76,20 +76,20 @@ struct TMat return (const float*)data; } - float *row(int row) const + float* row(int row) const { - return (float *)data + w * row; + return (float*)data + w * row; } - TMat channel_range(int start, int chn_num) const + TMat channel_range(int start, int chn_num) const { - TMat mat = { 0 }; + TMat mat = {0}; mat.batch = 1; mat.c = chn_num; mat.h = h; mat.w = w; - mat.data = (float *)data + start * h * w; + mat.data = (float*)data + start * h * w; return mat; } @@ -100,7 +100,7 @@ struct TMat } int batch, c, h, w; - void *data; + void* data; }; int main(int argc, char* argv[]) @@ -143,7 +143,7 @@ int main(int argc, char* argv[]) return -1; } - if (!check_file_exist(model_file)) + if (!check_file_exist(model_file)) return -1; /* set runtime options */ @@ -171,7 +171,7 @@ int main(int argc, char* argv[]) /* set the input shape to initial the graph, and prerun graph to infer shape */ int img_size = net_h * net_w * 3; - int dims[] = { 1, 3, net_h, net_w }; // nchw + int dims[] = {1, 3, net_h, net_w}; // nchw std::vector input_data(img_size); @@ -202,11 +202,11 @@ int main(int argc, char* argv[]) } /* prepare process input data, set the data mem to input tensor */ - std::string model_name="yolo-fastest-1.1"; + std::string model_name = "yolo-fastest-1.1"; std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); - if (!fp ||fread(input_data.data(), sizeof(float), img_size, fp) == 0) + if (!fp || fread(input_data.data(), sizeof(float), img_size, fp) == 0) { fprintf(stderr, "read input data file failed!\n"); return -1; @@ -232,24 +232,25 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, - num_thread, total_time / repeat_count, max_time, min_time); + num_thread, total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* process the detection result */ int output_node_num = get_graph_output_node_number(graph); int ret1 = 0; - tensor_t out_tensor; + tensor_t out_tensor; for (int i = 0; i < output_node_num; ++i) { - out_tensor = get_graph_output_tensor(graph, i, 0); //"detection_out" + out_tensor = get_graph_output_tensor(graph, i, 0); //"detection_out" // save output_data std::string model_name = "yolo-fastest-1.1"; - int output_size1 = get_tensor_buffer_size(out_tensor) / sizeof(float);; + int output_size1 = get_tensor_buffer_size(out_tensor) / sizeof(float); + ; float* yolo_outputs = (float*)get_tensor_buffer(out_tensor); - std::string reference_file1 = "./data/" + model_name + "_out" + std::to_string(i+1)+".bin"; + std::string reference_file1 = "./data/" + model_name + "_out" + std::to_string(i + 1) + ".bin"; std::vector reference_data1(output_size1); - FILE *fp1; + FILE* fp1; //read fp1 = fopen(reference_file1.c_str(), "rb"); if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0) diff --git a/tests/models/test_model_yolov3.cpp b/tests/models/test_model_yolov3.cpp index fb4a459bd..447f32a4f 100644 --- a/tests/models/test_model_yolov3.cpp +++ b/tests/models/test_model_yolov3.cpp @@ -35,10 +35,10 @@ int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.001) + if (fabs(tmp) > 0.001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -119,11 +119,11 @@ int main(int argc, char* argv[]) /* prepare process input data, set the data mem to input tensor */ // read input_data std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (!fp || fread(input_data.data(), sizeof(float), img_size, fp) == 0) { - fprintf(stderr, "read input data file %s failed!\n",input_file.c_str()); + fprintf(stderr, "read input data file %s failed!\n", input_file.c_str()); return -1; } fclose(fp); @@ -140,13 +140,13 @@ int main(int argc, char* argv[]) fprintf(stderr, "Inference time %.2f ms\n", end - start); fprintf(stderr, "--------------------------------------\n"); - tensor_t p8_output = get_graph_output_tensor(graph, 2, 0); + tensor_t p8_output = get_graph_output_tensor(graph, 2, 0); tensor_t p16_output = get_graph_output_tensor(graph, 1, 0); tensor_t p32_output = get_graph_output_tensor(graph, 0, 0); - - float* p8_data = ( float*)get_tensor_buffer(p8_output); - float* p16_data = ( float*)get_tensor_buffer(p16_output); - float* p32_data = ( float*)get_tensor_buffer(p32_output); + + float* p8_data = (float*)get_tensor_buffer(p8_output); + float* p16_data = (float*)get_tensor_buffer(p16_output); + float* p32_data = (float*)get_tensor_buffer(p32_output); /* check the result */ int output_size1 = get_tensor_buffer_size(p8_output) / sizeof(float); @@ -155,26 +155,26 @@ int main(int argc, char* argv[]) std::string reference_file1 = "./data/" + model_name + "_out1.bin"; std::string reference_file2 = "./data/" + model_name + "_out2.bin"; std::string reference_file3 = "./data/" + model_name + "_out3.bin"; - std::vector reference_data1(output_size1),reference_data2(output_size2),reference_data3(output_size3); - FILE *fp1; + std::vector reference_data1(output_size1), reference_data2(output_size2), reference_data3(output_size3); + FILE* fp1; fp1 = fopen(reference_file1.c_str(), "rb"); if (!fp1 || fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0) { - fprintf(stderr, "read reference %s failed!\n",reference_file1.c_str()); + fprintf(stderr, "read reference %s failed!\n", reference_file1.c_str()); return -1; } fclose(fp1); fp1 = fopen(reference_file2.c_str(), "rb"); if (fread(reference_data2.data(), sizeof(float), output_size2, fp1) == 0) { - fprintf(stderr, "read reference %s failed!\n",reference_file2.c_str()); + fprintf(stderr, "read reference %s failed!\n", reference_file2.c_str()); return -1; } fclose(fp1); fp1 = fopen(reference_file3.c_str(), "rb"); if (fread(reference_data3.data(), sizeof(float), output_size3, fp1) == 0) { - fprintf(stderr, "read reference %s failed!\n",reference_file3.c_str()); + fprintf(stderr, "read reference %s failed!\n", reference_file3.c_str()); return -1; } fclose(fp1); diff --git a/tests/models/test_model_yolov3_tiny.cpp b/tests/models/test_model_yolov3_tiny.cpp index 9654b2bc2..da48cc6e5 100644 --- a/tests/models/test_model_yolov3_tiny.cpp +++ b/tests/models/test_model_yolov3_tiny.cpp @@ -35,10 +35,10 @@ int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -48,7 +48,6 @@ int float_mismatch(float* current, float* reference, int size) return 0; } - void show_usage() { fprintf( @@ -58,7 +57,7 @@ void show_usage() int main(int argc, char* argv[]) { - const char* model_file ="./models/yolov3-tiny.tmfile"; + const char* model_file = "./models/yolov3-tiny.tmfile"; int img_h = 416; int img_w = 416; int img_c = 3; @@ -73,23 +72,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } - std::string model_name="yolov3_tiny"; + std::string model_name = "yolov3_tiny"; /* check files */ if (nullptr == model_file) { @@ -98,11 +97,9 @@ int main(int argc, char* argv[]) return -1; } - if (!check_file_exist(model_file)) return -1; - /* set runtime options */ struct options opt; opt.num_thread = num_thread; @@ -128,7 +125,7 @@ int main(int argc, char* argv[]) int img_size = img_h * img_w * img_c; int dims[] = {1, 3, img_h, img_w}; - float* input_data = ( float* )malloc(img_size * sizeof(float)); + float* input_data = (float*)malloc(img_size * sizeof(float)); tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); if (input_tensor == nullptr) @@ -159,12 +156,12 @@ int main(int argc, char* argv[]) /* prepare process input data, set the data mem to input tensor */ // save input_data std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (!fp || fread(input_data, sizeof(float), img_size, fp) == 0) { - fprintf(stderr, "read input data file %s failed!\n",input_file.c_str()); + fprintf(stderr, "read input data file %s failed!\n", input_file.c_str()); return -1; } fclose(fp); @@ -188,33 +185,33 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); tensor_t p16_output = get_graph_output_tensor(graph, 1, 0); tensor_t p32_output = get_graph_output_tensor(graph, 0, 0); - - float* p16_data = ( float*)get_tensor_buffer(p16_output); - float* p32_data = ( float*)get_tensor_buffer(p32_output); + + float* p16_data = (float*)get_tensor_buffer(p16_output); + float* p32_data = (float*)get_tensor_buffer(p32_output); int output_size2 = get_tensor_buffer_size(p16_output) / sizeof(float); int output_size3 = get_tensor_buffer_size(p32_output) / sizeof(float); std::string reference_file2 = "./data/" + model_name + "_out1.bin"; std::string reference_file3 = "./data/" + model_name + "_out2.bin"; - std::vector reference_data2(output_size2),reference_data3(output_size3); - FILE *fp1; + std::vector reference_data2(output_size2), reference_data3(output_size3); + FILE* fp1; fp1 = fopen(reference_file2.c_str(), "rb"); if (fread(reference_data2.data(), sizeof(float), output_size2, fp1) == 0) { - fprintf(stderr, "read reference %s failed!\n",reference_file2.c_str()); + fprintf(stderr, "read reference %s failed!\n", reference_file2.c_str()); return -1; } fclose(fp1); fp1 = fopen(reference_file3.c_str(), "rb"); if (fread(reference_data3.data(), sizeof(float), output_size3, fp1) == 0) { - fprintf(stderr, "read reference %s failed!\n",reference_file3.c_str()); + fprintf(stderr, "read reference %s failed!\n", reference_file3.c_str()); return -1; } fclose(fp1); @@ -223,7 +220,6 @@ int main(int argc, char* argv[]) int ret3 = float_mismatch(p32_data, reference_data3.data(), output_size3); /* postprocess */ - /* release tengine */ postrun_graph(graph); diff --git a/tests/models/test_model_yolov4.cpp b/tests/models/test_model_yolov4.cpp index ddb40a513..83f776001 100644 --- a/tests/models/test_model_yolov4.cpp +++ b/tests/models/test_model_yolov4.cpp @@ -37,10 +37,10 @@ int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -50,7 +50,6 @@ int float_mismatch(float* current, float* reference, int size) return 0; } - void show_usage() { fprintf( @@ -60,7 +59,7 @@ void show_usage() int main(int argc, char* argv[]) { - const char* model_file ="./models/yolov4.tmfile"; + const char* model_file = "./models/yolov4.tmfile"; int img_h = 416; int img_w = 416; int img_c = 3; @@ -75,23 +74,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } - std::string model_name="yolov4"; + std::string model_name = "yolov4"; /* check files */ if (nullptr == model_file) { @@ -100,11 +99,9 @@ int main(int argc, char* argv[]) return -1; } - if (!check_file_exist(model_file)) return -1; - /* set runtime options */ struct options opt; opt.num_thread = num_thread; @@ -161,12 +158,12 @@ int main(int argc, char* argv[]) /* prepare process input data, set the data mem to input tensor */ // read input_data std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; - + FILE* fp; + fp = fopen(input_file.c_str(), "rb"); if (!fp || fread(input_data.data(), sizeof(float), img_size, fp) == 0) { - fprintf(stderr, "read input data file %s failed!\n",input_file.c_str()); + fprintf(stderr, "read input data file %s failed!\n", input_file.c_str()); return -1; } fclose(fp); @@ -190,30 +187,21 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); tensor_t p8_output = get_graph_output_tensor(graph, 0, 0); tensor_t p16_output = get_graph_output_tensor(graph, 1, 0); tensor_t p32_output = get_graph_output_tensor(graph, 2, 0); - - float* p8_data = ( float*)get_tensor_buffer(p8_output); - float* p16_data = ( float*)get_tensor_buffer(p16_output); - float* p32_data = ( float*)get_tensor_buffer(p32_output); - - /* postprocess */ - + float* p8_data = (float*)get_tensor_buffer(p8_output); + float* p16_data = (float*)get_tensor_buffer(p16_output); + float* p32_data = (float*)get_tensor_buffer(p32_output); + /* postprocess */ /* yolov4 tiny draw the result */ - - - - - - /* check the result */ int output_size1 = get_tensor_buffer_size(p8_output) / sizeof(float); int output_size2 = get_tensor_buffer_size(p16_output) / sizeof(float); @@ -221,27 +209,27 @@ int main(int argc, char* argv[]) std::string reference_file1 = "./data/" + model_name + "_out1.bin"; std::string reference_file2 = "./data/" + model_name + "_out2.bin"; std::string reference_file3 = "./data/" + model_name + "_out3.bin"; - std::vector reference_data1(output_size1),reference_data2(output_size2),reference_data3(output_size3); - FILE *fp1; + std::vector reference_data1(output_size1), reference_data2(output_size2), reference_data3(output_size3); + FILE* fp1; fp1 = fopen(reference_file1.c_str(), "rb"); if (!fp1 || fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0) { - fprintf(stderr, "read reference %s failed!\n",reference_file1.c_str()); + fprintf(stderr, "read reference %s failed!\n", reference_file1.c_str()); return -1; } fclose(fp1); fp1 = fopen(reference_file2.c_str(), "rb"); if (fread(reference_data2.data(), sizeof(float), output_size2, fp1) == 0) { - fprintf(stderr, "read reference %s failed!\n",reference_file2.c_str()); + fprintf(stderr, "read reference %s failed!\n", reference_file2.c_str()); return -1; } fclose(fp1); fp1 = fopen(reference_file3.c_str(), "rb"); if (fread(reference_data3.data(), sizeof(float), output_size3, fp1) == 0) { - fprintf(stderr, "read reference %s failed!\n",reference_file3.c_str()); + fprintf(stderr, "read reference %s failed!\n", reference_file3.c_str()); return -1; } fclose(fp1); diff --git a/tests/models/test_model_yolov4_tiny.cpp b/tests/models/test_model_yolov4_tiny.cpp index da07ba8bc..1679a0090 100644 --- a/tests/models/test_model_yolov4_tiny.cpp +++ b/tests/models/test_model_yolov4_tiny.cpp @@ -37,10 +37,10 @@ int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -50,7 +50,6 @@ int float_mismatch(float* current, float* reference, int size) return 0; } - void show_usage() { fprintf( @@ -60,7 +59,7 @@ void show_usage() int main(int argc, char* argv[]) { - const char* model_file ="./models/yolov4-tiny.tmfile"; + const char* model_file = "./models/yolov4-tiny.tmfile"; int img_h = 416; int img_w = 416; int img_c = 3; @@ -75,23 +74,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } - std::string model_name="yolov4_tiny"; + std::string model_name = "yolov4_tiny"; /* check files */ if (nullptr == model_file) { @@ -100,11 +99,9 @@ int main(int argc, char* argv[]) return -1; } - if (!check_file_exist(model_file)) return -1; - /* set runtime options */ struct options opt; opt.num_thread = num_thread; @@ -161,12 +158,12 @@ int main(int argc, char* argv[]) /* prepare process input data, set the data mem to input tensor */ // save input_data std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (!fp || fread(input_data.data(), sizeof(float), img_size, fp) == 0) { - fprintf(stderr, "read input data file %s failed!\n",input_file.c_str()); + fprintf(stderr, "read input data file %s failed!\n", input_file.c_str()); return -1; } fclose(fp); @@ -190,41 +187,39 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); - tensor_t p16_output = get_graph_output_tensor(graph, 1, 0); tensor_t p32_output = get_graph_output_tensor(graph, 0, 0); - float* p16_data = ( float*)get_tensor_buffer(p16_output); - float* p32_data = ( float*)get_tensor_buffer(p32_output); + float* p16_data = (float*)get_tensor_buffer(p16_output); + float* p32_data = (float*)get_tensor_buffer(p32_output); int output_size2 = get_tensor_buffer_size(p16_output) / sizeof(float); int output_size3 = get_tensor_buffer_size(p32_output) / sizeof(float); std::string reference_file2 = "./data/" + model_name + "_out1.bin"; std::string reference_file3 = "./data/" + model_name + "_out2.bin"; - std::vector reference_data2(output_size2),reference_data3(output_size3); - FILE *fp1; + std::vector reference_data2(output_size2), reference_data3(output_size3); + FILE* fp1; fp1 = fopen(reference_file2.c_str(), "rb"); if (fread(reference_data2.data(), sizeof(float), output_size2, fp1) == 0) { - fprintf(stderr, "read reference %s failed!\n",reference_file2.c_str()); + fprintf(stderr, "read reference %s failed!\n", reference_file2.c_str()); return -1; } fclose(fp1); fp1 = fopen(reference_file3.c_str(), "rb"); if (fread(reference_data3.data(), sizeof(float), output_size3, fp1) == 0) { - fprintf(stderr, "read reference %s failed!\n",reference_file3.c_str()); + fprintf(stderr, "read reference %s failed!\n", reference_file3.c_str()); return -1; } fclose(fp1); int ret2 = float_mismatch(p16_data, reference_data2.data(), output_size2); int ret3 = float_mismatch(p32_data, reference_data3.data(), output_size3); - /* postprocess */ - + /* postprocess */ /* release tengine */ postrun_graph(graph); diff --git a/tests/models/test_model_yolov5s.cpp b/tests/models/test_model_yolov5s.cpp index 288d75388..fad8f98ca 100644 --- a/tests/models/test_model_yolov5s.cpp +++ b/tests/models/test_model_yolov5s.cpp @@ -35,10 +35,10 @@ int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; @@ -51,8 +51,8 @@ int float_mismatch(float* current, float* reference, int size) void show_usage() { fprintf( - stderr, - "[Usage]: [-h]\n [-m model_file] [-r repeat_count] [-t thread_count]\n"); + stderr, + "[Usage]: [-h]\n [-m model_file] [-r repeat_count] [-t thread_count]\n"); } int main(int argc, char* argv[]) @@ -74,20 +74,20 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'r': - repeat_count = std::strtoul(optarg, nullptr, 10); - break; - case 't': - num_thread = std::strtoul(optarg, nullptr, 10); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'r': + repeat_count = std::strtoul(optarg, nullptr, 10); + break; + case 't': + num_thread = std::strtoul(optarg, nullptr, 10); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -158,7 +158,7 @@ int main(int argc, char* argv[]) /* prepare process input data, set the data mem to input tensor */ std::string model_name = "yolov5s"; std::string input_file = "./data/" + model_name + "_in.bin"; - FILE *fp; + FILE* fp; fp = fopen(input_file.c_str(), "rb"); if (fread(input_data.data(), sizeof(float), img_size, fp) == 0) { @@ -186,7 +186,7 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* yolov5 postprocess */ @@ -197,19 +197,19 @@ int main(int argc, char* argv[]) tensor_t p16_output = get_graph_output_tensor(graph, 1, 0); tensor_t p32_output = get_graph_output_tensor(graph, 2, 0); - float* p8_data = ( float*)get_tensor_buffer(p8_output); - float* p16_data = ( float*)get_tensor_buffer(p16_output); - float* p32_data = ( float*)get_tensor_buffer(p32_output); + float* p8_data = (float*)get_tensor_buffer(p8_output); + float* p16_data = (float*)get_tensor_buffer(p16_output); + float* p32_data = (float*)get_tensor_buffer(p32_output); /* postprocess */ - int output_size1 = get_tensor_buffer_size(p8_output) / sizeof(float); + int output_size1 = get_tensor_buffer_size(p8_output) / sizeof(float); int output_size2 = get_tensor_buffer_size(p16_output) / sizeof(float); int output_size3 = get_tensor_buffer_size(p32_output) / sizeof(float); std::string reference_file1 = "./data/" + model_name + "_out1.bin"; std::string reference_file2 = "./data/" + model_name + "_out2.bin"; std::string reference_file3 = "./data/" + model_name + "_out3.bin"; - std::vector reference_data1(output_size1),reference_data2(output_size2),reference_data3(output_size3); - FILE *fp1; + std::vector reference_data1(output_size1), reference_data2(output_size2), reference_data3(output_size3); + FILE* fp1; fp1 = fopen(reference_file1.c_str(), "rb"); if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0) { @@ -242,4 +242,3 @@ int main(int argc, char* argv[]) release_tengine(); return ret; } - diff --git a/tests/models/test_timvx_model_yolov5s.cpp b/tests/models/test_timvx_model_yolov5s.cpp index ecd704a67..509148612 100644 --- a/tests/models/test_timvx_model_yolov5s.cpp +++ b/tests/models/test_timvx_model_yolov5s.cpp @@ -37,7 +37,6 @@ #include "tengine/c_api.h" #include "tengine_operations.h" - struct Object { cv::Rect_ rect; @@ -135,8 +134,7 @@ static void nms_sorted_bboxes(const std::vector& faceobjects, std::vecto } } - -static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects, +static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector& objects, int letterbox_cols, int letterbox_rows) { static float anchors[18] = {10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326}; @@ -146,11 +144,11 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh int feat_h = letterbox_rows / stride; int cls_num = 80; int anchor_group; - if(stride == 8) + if (stride == 8) anchor_group = 1; - if(stride == 16) + if (stride == 16) anchor_group = 2; - if(stride == 32) + if (stride == 32) anchor_group = 3; for (int h = 0; h <= feat_h - 1; h++) { @@ -164,7 +162,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh for (int s = 0; s <= cls_num - 1; s++) { float score = feat[a * feat_w * feat_h * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5) + s + 5]; - if(score > class_score) + if (score > class_score) { class_index = s; class_score = score; @@ -172,7 +170,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh } //process box score float box_score = feat[a * feat_w * feat_h * (cls_num + 5) + (h * feat_w) * (cls_num + 5) + w * (cls_num + 5) + 4]; - float final_score = sigmoid(box_score ) * sigmoid(class_score); + float final_score = sigmoid(box_score) * sigmoid(class_score); if (final_score >= prob_threshold) { int loc_idx = a * feat_h * feat_w * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5); @@ -208,16 +206,15 @@ static void generate_proposals(int stride, const float* feat, float prob_thresh static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { - "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", - "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", - "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", - "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", - "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", - "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", - "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", - "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", - "hair drier", "toothbrush" - }; + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; cv::Mat image = bgr.clone(); @@ -256,8 +253,8 @@ static void draw_objects(const cv::Mat& bgr, const std::vector& objects) void show_usage() { fprintf( - stderr, - "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); + stderr, + "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); } void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int letterbox_rows, int letterbox_cols, const float* mean, @@ -275,9 +272,12 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int float scale_letterbox; int resize_rows; int resize_cols; - if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) { + if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) + { scale_letterbox = letterbox_rows * 1.0 / img.rows; - } else { + } + else + { scale_letterbox = letterbox_cols * 1.0 / img.cols; } resize_cols = int(scale_letterbox * img.cols); @@ -286,7 +286,7 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int cv::resize(img, img, cv::Size(resize_cols, resize_rows)); img.convertTo(img, CV_32FC3); // Generate a gray image for letterbox using opencv - cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3,cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])); + cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0.5 / scale[0] + mean[0], 0.5 / scale[1] + mean[1], 0.5 / scale[2] + mean[2])); int top = (letterbox_rows - resize_rows) / 2; int bot = (letterbox_rows - resize_rows + 1) / 2; int left = (letterbox_cols - resize_cols) / 2; @@ -295,7 +295,7 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); img_new.convertTo(img_new, CV_32FC3); - float* img_data = (float* )img_new.data; + float* img_data = (float*)img_new.data; std::vector input_temp(3 * letterbox_cols * letterbox_rows); /* nhwc to nchw */ @@ -305,7 +305,7 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int { for (int c = 0; c < 3; c++) { - int in_index = h * letterbox_cols * 3 + w * 3 + c; + int in_index = h * letterbox_cols * 3 + w * 3 + c; int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w; input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c]; } @@ -319,20 +319,15 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int { for (int c = 0; c < 3; c++) { - for (int h = 0; h < letterbox_rows/2; h++) + for (int h = 0; h < letterbox_rows / 2; h++) { - for (int w = 0; w < letterbox_cols/2; w++) + for (int w = 0; w < letterbox_cols / 2; w++) { - int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + - h * 2 * letterbox_cols + w * 2; - int out_index = i * 2 * 3 * (letterbox_cols/2) * (letterbox_rows/2) + - g * 3 * (letterbox_cols/2) * (letterbox_rows/2) + - c * (letterbox_cols/2) * (letterbox_rows/2) + - h * (letterbox_cols/2) + - w; + int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + h * 2 * letterbox_cols + w * 2; + int out_index = i * 2 * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + g * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + c * (letterbox_cols / 2) * (letterbox_rows / 2) + h * (letterbox_cols / 2) + w; /* quant to uint8 */ - int udata = (round)(input_temp[in_index] / input_scale + ( float )zero_point); + int udata = (round)(input_temp[in_index] / input_scale + (float)zero_point); if (udata > 255) udata = 255; else if (udata < 0) @@ -366,23 +361,23 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - model_file = optarg; - break; - case 'i': - image_file = optarg; - break; - case 'r': - repeat_count = atoi(optarg); - break; - case 't': - num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -498,7 +493,7 @@ int main(int argc, char* argv[]) max_time = std::max(max_time, cur); } fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread, - total_time/repeat_count, max_time, min_time); + total_time / repeat_count, max_time, min_time); fprintf(stderr, "--------------------------------------\n"); /* yolov5 postprocess */ @@ -529,23 +524,23 @@ int main(int argc, char* argv[]) std::vector p16_data(p16_count); std::vector p32_data(p32_count); - uint8_t* p8_data_u8 = ( uint8_t* )get_tensor_buffer(p8_output); - uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output); - uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output); + uint8_t* p8_data_u8 = (uint8_t*)get_tensor_buffer(p8_output); + uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output); + uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output); for (int c = 0; c < p8_count; c++) { - p8_data[c] = (( float )p8_data_u8[c] - ( float )p8_zero_point) * p8_scale; + p8_data[c] = ((float)p8_data_u8[c] - (float)p8_zero_point) * p8_scale; } for (int c = 0; c < p16_count; c++) { - p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale; + p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale; } for (int c = 0; c < p32_count; c++) { - p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale; + p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale; } /* postprocess */ @@ -562,7 +557,7 @@ int main(int argc, char* argv[]) proposals.insert(proposals.end(), objects32.begin(), objects32.end()); generate_proposals(16, p16_data.data(), prob_threshold, objects16, letterbox_cols, letterbox_rows); proposals.insert(proposals.end(), objects16.begin(), objects16.end()); - generate_proposals( 8, p8_data.data(), prob_threshold, objects8, letterbox_cols, letterbox_rows); + generate_proposals(8, p8_data.data(), prob_threshold, objects8, letterbox_cols, letterbox_rows); proposals.insert(proposals.end(), objects8.begin(), objects8.end()); qsort_descent_inplace(proposals); @@ -574,9 +569,12 @@ int main(int argc, char* argv[]) float scale_letterbox; int resize_rows; int resize_cols; - if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) { + if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) + { scale_letterbox = letterbox_rows * 1.0 / img.rows; - } else { + } + else + { scale_letterbox = letterbox_cols * 1.0 / img.cols; } resize_cols = int(scale_letterbox * img.cols); @@ -589,7 +587,7 @@ int main(int argc, char* argv[]) float ratio_y = (float)img.cols / resize_cols; int count = picked.size(); - fprintf(stderr, "detection num: %d\n",count); + fprintf(stderr, "detection num: %d\n", count); objects.resize(count); for (int i = 0; i < count; i++) @@ -623,4 +621,3 @@ int main(int argc, char* argv[]) destroy_graph(graph); release_tengine(); } - diff --git a/tests/op/test_onnx_op.h b/tests/op/test_onnx_op.h index 621baa85d..df9477dd7 100644 --- a/tests/op/test_onnx_op.h +++ b/tests/op/test_onnx_op.h @@ -39,7 +39,6 @@ #include "onnx.pb.h" - int get_pb_data(float* float_data, const std::string& filepath) { std::ifstream fs(filepath.c_str(), std::ifstream::in | std::ifstream::binary); @@ -155,16 +154,16 @@ int get_pb_data_i32(int32_t* i32_data, const std::string& filepath) int float_mismatch(float* current, float* reference, int size) { - for(int i=0;i 0.0001) + if (fabs(tmp) > 0.0001) { fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]); return -1; } } - fprintf(stderr,"test pass\n"); + fprintf(stderr, "test pass\n"); return 0; } diff --git a/tests/op/test_onnx_op_abs.cpp b/tests/op/test_onnx_op_abs.cpp index 387b2c3c2..de0a67a47 100644 --- a/tests/op/test_onnx_op_abs.cpp +++ b/tests/op/test_onnx_op_abs.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_abs"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_abs"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_acos.cpp b/tests/op/test_onnx_op_acos.cpp index 6a713fadd..26a30dc65 100644 --- a/tests/op/test_onnx_op_acos.cpp +++ b/tests/op/test_onnx_op_acos.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_acos"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_acos"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_add.cpp b/tests/op/test_onnx_op_add.cpp index ea634972f..a99c3b4f8 100644 --- a/tests/op/test_onnx_op_add.cpp +++ b/tests/op/test_onnx_op_add.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_add"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_add"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_asin.cpp b/tests/op/test_onnx_op_asin.cpp index babcb91ae..42b458d2e 100644 --- a/tests/op/test_onnx_op_asin.cpp +++ b/tests/op/test_onnx_op_asin.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_asin"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_asin"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_atan.cpp b/tests/op/test_onnx_op_atan.cpp index 6e35a9238..fd68814a9 100644 --- a/tests/op/test_onnx_op_atan.cpp +++ b/tests/op/test_onnx_op_atan.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_atan"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_atan"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_averagepool_2d_default.cpp b/tests/op/test_onnx_op_averagepool_2d_default.cpp index b6a096435..9a9e2fbbc 100644 --- a/tests/op/test_onnx_op_averagepool_2d_default.cpp +++ b/tests/op/test_onnx_op_averagepool_2d_default.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_averagepool_2d_default"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_averagepool_2d_default"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_averagepool_2d_pads.cpp b/tests/op/test_onnx_op_averagepool_2d_pads.cpp index bb3087e0b..de2f7ccee 100644 --- a/tests/op/test_onnx_op_averagepool_2d_pads.cpp +++ b/tests/op/test_onnx_op_averagepool_2d_pads.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_averagepool_2d_pads"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_averagepool_2d_pads"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_basic_conv_with_padding.cpp b/tests/op/test_onnx_op_basic_conv_with_padding.cpp index d93f99ee3..77242d234 100644 --- a/tests/op/test_onnx_op_basic_conv_with_padding.cpp +++ b/tests/op/test_onnx_op_basic_conv_with_padding.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_basic_conv_with_padding"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_basic_conv_with_padding"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_basic_conv_without_padding.cpp b/tests/op/test_onnx_op_basic_conv_without_padding.cpp index e1c4a0854..1aad544f2 100644 --- a/tests/op/test_onnx_op_basic_conv_without_padding.cpp +++ b/tests/op/test_onnx_op_basic_conv_without_padding.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_basic_conv_without_padding"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_basic_conv_without_padding"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_ceil.cpp b/tests/op/test_onnx_op_ceil.cpp index 853470b43..16112aa41 100644 --- a/tests/op/test_onnx_op_ceil.cpp +++ b/tests/op/test_onnx_op_ceil.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_ceil"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_ceil"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_clip_example.cpp b/tests/op/test_onnx_op_clip_example.cpp index 587ab840f..2d61de319 100644 --- a/tests/op/test_onnx_op_clip_example.cpp +++ b/tests/op/test_onnx_op_clip_example.cpp @@ -22,15 +22,14 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_clip_example"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; -std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; -std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string node = "test_clip_example"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; +std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -107,7 +106,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* input 2 */ int input_size_2 = w_2; @@ -154,7 +153,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_concat_1d_axis_0.cpp b/tests/op/test_onnx_op_concat_1d_axis_0.cpp index 9919d7d3d..71e6527ac 100644 --- a/tests/op/test_onnx_op_concat_1d_axis_0.cpp +++ b/tests/op/test_onnx_op_concat_1d_axis_0.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_concat_1d_axis_0"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_concat_1d_axis_0"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -87,7 +86,7 @@ int main(int argc, char* argv[]) } /* input 1 */ - int input_size_1 = h_1 * w_1; + int input_size_1 = h_1 * w_1; int dims_1[] = {h_1, w_1}; std::vector feature_in_1(input_size_1); @@ -108,7 +107,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -130,7 +129,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_concat_2d_axis_0.cpp b/tests/op/test_onnx_op_concat_2d_axis_0.cpp index 2d41bf5b8..db7586424 100644 --- a/tests/op/test_onnx_op_concat_2d_axis_0.cpp +++ b/tests/op/test_onnx_op_concat_2d_axis_0.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_concat_2d_axis_0"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_concat_2d_axis_0"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -87,7 +86,7 @@ int main(int argc, char* argv[]) } /* input 1 */ - int input_size_1 = h_1 * w_1; + int input_size_1 = h_1 * w_1; int dims_1[] = {h_1, w_1}; std::vector feature_in_1(input_size_1); @@ -108,7 +107,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -130,7 +129,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_concat_2d_axis_1.cpp b/tests/op/test_onnx_op_concat_2d_axis_1.cpp index 5afeaedc2..81fde2127 100644 --- a/tests/op/test_onnx_op_concat_2d_axis_1.cpp +++ b/tests/op/test_onnx_op_concat_2d_axis_1.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_concat_2d_axis_1"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_concat_2d_axis_1"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -87,7 +86,7 @@ int main(int argc, char* argv[]) } /* input 1 */ - int input_size_1 = h_1 * w_1; + int input_size_1 = h_1 * w_1; int dims_1[] = {h_1, w_1}; std::vector feature_in_1(input_size_1); @@ -108,7 +107,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -130,7 +129,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_concat_3d_axis_0.cpp b/tests/op/test_onnx_op_concat_3d_axis_0.cpp index 849d01b8f..ac4ea7f27 100644 --- a/tests/op/test_onnx_op_concat_3d_axis_0.cpp +++ b/tests/op/test_onnx_op_concat_3d_axis_0.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_concat_3d_axis_0"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_concat_3d_axis_0"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -110,7 +109,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -132,7 +131,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_concat_3d_axis_1.cpp b/tests/op/test_onnx_op_concat_3d_axis_1.cpp index cc3abc1e7..c1a58baa5 100644 --- a/tests/op/test_onnx_op_concat_3d_axis_1.cpp +++ b/tests/op/test_onnx_op_concat_3d_axis_1.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_concat_3d_axis_1"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_concat_3d_axis_1"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -110,7 +109,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -132,7 +131,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_concat_3d_axis_2.cpp b/tests/op/test_onnx_op_concat_3d_axis_2.cpp index 7a1167db3..5cf3306cc 100644 --- a/tests/op/test_onnx_op_concat_3d_axis_2.cpp +++ b/tests/op/test_onnx_op_concat_3d_axis_2.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_concat_3d_axis_2"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_concat_3d_axis_2"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -110,7 +109,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -132,7 +131,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_conv_with_strides_no_padding.cpp b/tests/op/test_onnx_op_conv_with_strides_no_padding.cpp index b48855cef..5adb63080 100644 --- a/tests/op/test_onnx_op_conv_with_strides_no_padding.cpp +++ b/tests/op/test_onnx_op_conv_with_strides_no_padding.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_conv_with_strides_no_padding"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_conv_with_strides_no_padding"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_conv_with_strides_padding.cpp b/tests/op/test_onnx_op_conv_with_strides_padding.cpp index f1f3407a7..c22adaf04 100644 --- a/tests/op/test_onnx_op_conv_with_strides_padding.cpp +++ b/tests/op/test_onnx_op_conv_with_strides_padding.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_conv_with_strides_padding"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_conv_with_strides_padding"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_convtranspose.cpp b/tests/op/test_onnx_op_convtranspose.cpp index 0db15b36d..71fcbf801 100644 --- a/tests/op/test_onnx_op_convtranspose.cpp +++ b/tests/op/test_onnx_op_convtranspose.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_convtranspose"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_convtranspose"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_convtranspose_dilations.cpp b/tests/op/test_onnx_op_convtranspose_dilations.cpp index 51cdad8a8..6c7a7b6b8 100644 --- a/tests/op/test_onnx_op_convtranspose_dilations.cpp +++ b/tests/op/test_onnx_op_convtranspose_dilations.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_convtranspose_dilations"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_convtranspose_dilations"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_convtranspose_pad.cpp b/tests/op/test_onnx_op_convtranspose_pad.cpp index 510c4d271..83e439c76 100644 --- a/tests/op/test_onnx_op_convtranspose_pad.cpp +++ b/tests/op/test_onnx_op_convtranspose_pad.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_convtranspose_pad"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_convtranspose_pad"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_convtranspose_pads.cpp b/tests/op/test_onnx_op_convtranspose_pads.cpp index ff97a6871..ed16a8a77 100644 --- a/tests/op/test_onnx_op_convtranspose_pads.cpp +++ b/tests/op/test_onnx_op_convtranspose_pads.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_convtranspose_pads"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_convtranspose_pads"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_cos.cpp b/tests/op/test_onnx_op_cos.cpp index d5ba50fd7..b8b8c20a5 100644 --- a/tests/op/test_onnx_op_cos.cpp +++ b/tests/op/test_onnx_op_cos.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_cos"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_cos"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_depthtospace_dcr_mode.cpp b/tests/op/test_onnx_op_depthtospace_dcr_mode.cpp index 4d7dd503f..d33409af4 100644 --- a/tests/op/test_onnx_op_depthtospace_dcr_mode.cpp +++ b/tests/op/test_onnx_op_depthtospace_dcr_mode.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_depthtospace_dcr_mode"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_depthtospace_dcr_mode"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_div.cpp b/tests/op/test_onnx_op_div.cpp index 50b13cd5e..240e73316 100644 --- a/tests/op/test_onnx_op_div.cpp +++ b/tests/op/test_onnx_op_div.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_div"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_div"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_dropout_default.cpp b/tests/op/test_onnx_op_dropout_default.cpp index 139c59fa5..266220f9a 100644 --- a/tests/op/test_onnx_op_dropout_default.cpp +++ b/tests/op/test_onnx_op_dropout_default.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_dropout_default"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_dropout_default"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -78,7 +77,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -99,7 +98,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_elu.cpp b/tests/op/test_onnx_op_elu.cpp index 9e868e435..3692035e6 100644 --- a/tests/op/test_onnx_op_elu.cpp +++ b/tests/op/test_onnx_op_elu.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_elu"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_elu"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_equal.cpp b/tests/op/test_onnx_op_equal.cpp index 5af989aad..02ee50e8f 100644 --- a/tests/op/test_onnx_op_equal.cpp +++ b/tests/op/test_onnx_op_equal.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_equal"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_equal"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_exp.cpp b/tests/op/test_onnx_op_exp.cpp index cab9a7d46..132bbf9be 100644 --- a/tests/op/test_onnx_op_exp.cpp +++ b/tests/op/test_onnx_op_exp.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_exp"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_exp"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_expand_dim_unchanged.cpp b/tests/op/test_onnx_op_expand_dim_unchanged.cpp index 964a0faa7..961254b78 100644 --- a/tests/op/test_onnx_op_expand_dim_unchanged.cpp +++ b/tests/op/test_onnx_op_expand_dim_unchanged.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_expand_dim_unchanged"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_expand_dim_unchanged"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -86,7 +85,7 @@ int main(int argc, char* argv[]) } /* input 1 */ - int input_size_1 = w_1; + int input_size_1 = w_1; int dims_1[] = {w_1}; std::vector feature_in_1(input_size_1); @@ -121,7 +120,6 @@ int main(int argc, char* argv[]) /* prepare process input data, set the data mem to input tensor */ get_pb_data(feature_in_0.data(), input_pb_0); - /* run graph */ if (run_graph(graph, 1) < 0) { @@ -131,7 +129,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_floor.cpp b/tests/op/test_onnx_op_floor.cpp index b520e2385..8957d081d 100644 --- a/tests/op/test_onnx_op_floor.cpp +++ b/tests/op/test_onnx_op_floor.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_floor"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_floor"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_globalaveragepool.cpp b/tests/op/test_onnx_op_globalaveragepool.cpp index b013fd306..9a5724dae 100644 --- a/tests/op/test_onnx_op_globalaveragepool.cpp +++ b/tests/op/test_onnx_op_globalaveragepool.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_globalaveragepool"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_globalaveragepool"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_greater.cpp b/tests/op/test_onnx_op_greater.cpp index 5eb64587d..d4323308e 100644 --- a/tests/op/test_onnx_op_greater.cpp +++ b/tests/op/test_onnx_op_greater.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_greater"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_greater"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_gru_defaults.cpp b/tests/op/test_onnx_op_gru_defaults.cpp index 8dc4620db..9e8c9ec4a 100644 --- a/tests/op/test_onnx_op_gru_defaults.cpp +++ b/tests/op/test_onnx_op_gru_defaults.cpp @@ -22,15 +22,14 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_gru_defaults"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; -std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; -std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string node = "test_gru_defaults"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; +std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -118,7 +117,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* input 2 */ int input_size_2 = c_2 * h_2 * w_2; @@ -165,7 +164,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_gru_seq_length.cpp b/tests/op/test_onnx_op_gru_seq_length.cpp index a47569ffc..938611efc 100644 --- a/tests/op/test_onnx_op_gru_seq_length.cpp +++ b/tests/op/test_onnx_op_gru_seq_length.cpp @@ -22,16 +22,15 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_gru_seq_length"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; -std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; -std::string input_pb_3 = "../onnx_node/" + node + "/test_data_set_0/input_3.pb"; -std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string node = "test_gru_seq_length"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; +std::string input_pb_3 = "../onnx_node/" + node + "/test_data_set_0/input_3.pb"; +std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -53,7 +52,7 @@ int main(int argc, char* argv[]) int n_3 = 1; int c_3 = 1; int h_3 = 1; - int w_3 = 30; + int w_3 = 30; /* set runtime options */ struct options opt; @@ -124,7 +123,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* input 2 */ int input_size_2 = c_2 * h_2 * w_2; @@ -148,7 +147,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* input 3 */ int input_size_3 = h_3 * w_3; @@ -196,7 +195,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_gru_with_initial_bias.cpp b/tests/op/test_onnx_op_gru_with_initial_bias.cpp index 3e93c3601..e516c0259 100644 --- a/tests/op/test_onnx_op_gru_with_initial_bias.cpp +++ b/tests/op/test_onnx_op_gru_with_initial_bias.cpp @@ -22,16 +22,15 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_gru_with_initial_bias"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; -std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; -std::string input_pb_3 = "../onnx_node/" + node + "/test_data_set_0/input_3.pb"; -std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string node = "test_gru_with_initial_bias"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; +std::string input_pb_3 = "../onnx_node/" + node + "/test_data_set_0/input_3.pb"; +std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -53,7 +52,7 @@ int main(int argc, char* argv[]) int n_3 = 1; int c_3 = 1; int h_3 = 1; - int w_3 = 18; + int w_3 = 18; /* set runtime options */ struct options opt; @@ -124,7 +123,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* input 2 */ int input_size_2 = c_2 * h_2 * w_2; @@ -148,7 +147,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* input 3 */ int input_size_3 = h_3 * w_3; @@ -196,7 +195,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_hardsigmoid.cpp b/tests/op/test_onnx_op_hardsigmoid.cpp index 9a55568e0..8497ed69b 100644 --- a/tests/op/test_onnx_op_hardsigmoid.cpp +++ b/tests/op/test_onnx_op_hardsigmoid.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_hardsigmoid"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_hardsigmoid"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_instancenorm_epsilon.cpp b/tests/op/test_onnx_op_instancenorm_epsilon.cpp index 35d599e63..fb59641ff 100644 --- a/tests/op/test_onnx_op_instancenorm_epsilon.cpp +++ b/tests/op/test_onnx_op_instancenorm_epsilon.cpp @@ -22,15 +22,14 @@ * Author: sqfu@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_instancenorm_epsilon"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; -std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; +std::string node = "test_instancenorm_epsilon"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -70,11 +69,11 @@ int main(int argc, char* argv[]) fprintf(stderr, "Create graph failed.\n"); return -1; } -// set_log_level(LOG_INFO); -// dump_graph(graph); + // set_log_level(LOG_INFO); + // dump_graph(graph); /* set the shape, data buffer of input_tensor of the graph */ - /* input 0 */ + /* input 0 */ int input_size_0 = n_0 * c_0 * h_0 * w_0; int dims[] = {n_0, c_0, h_0, w_0}; std::vector feature_in_0(input_size_0); @@ -97,7 +96,7 @@ int main(int argc, char* argv[]) return -1; } - /* input 1 */ + /* input 1 */ int input_size_1 = n_1 * c_1 * h_1 * w_1; int dims_1[] = {n_1, c_1, h_1, w_1}; std::vector feature_in_1(input_size_1); @@ -118,9 +117,9 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor_1 buffer failed\n"); return -1; - } + } - /* input 2 */ + /* input 2 */ int input_size_2 = n_2 * c_2 * h_2 * w_2; int dims_2[] = {n_2, c_2, h_2, w_2}; std::vector feature_in_2(input_size_2); @@ -141,7 +140,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor_2 buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -164,7 +163,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_instancenorm_example.cpp b/tests/op/test_onnx_op_instancenorm_example.cpp index 3331cc5f6..cb19cb2cf 100644 --- a/tests/op/test_onnx_op_instancenorm_example.cpp +++ b/tests/op/test_onnx_op_instancenorm_example.cpp @@ -22,15 +22,14 @@ * Author: sqfu@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_instancenorm_example"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; -std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; +std::string node = "test_instancenorm_example"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -70,11 +69,11 @@ int main(int argc, char* argv[]) fprintf(stderr, "Create graph failed.\n"); return -1; } -// set_log_level(LOG_INFO); -// dump_graph(graph); + // set_log_level(LOG_INFO); + // dump_graph(graph); /* set the shape, data buffer of input_tensor of the graph */ - /* input 0 */ + /* input 0 */ int input_size_0 = n_0 * c_0 * h_0 * w_0; int dims[] = {n_0, c_0, h_0, w_0}; std::vector feature_in_0(input_size_0); @@ -97,7 +96,7 @@ int main(int argc, char* argv[]) return -1; } - /* input 1 */ + /* input 1 */ int input_size_1 = n_1 * c_1 * h_1 * w_1; int dims_1[] = {n_1, c_1, h_1, w_1}; std::vector feature_in_1(input_size_1); @@ -118,9 +117,9 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor_1 buffer failed\n"); return -1; - } + } - /* input 2 */ + /* input 2 */ int input_size_2 = n_2 * c_2 * h_2 * w_2; int dims_2[] = {n_2, c_2, h_2, w_2}; std::vector feature_in_2(input_size_2); @@ -141,7 +140,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor_2 buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -164,7 +163,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_leakyrelu.cpp b/tests/op/test_onnx_op_leakyrelu.cpp index 89ee9ab11..1bbbb3976 100644 --- a/tests/op/test_onnx_op_leakyrelu.cpp +++ b/tests/op/test_onnx_op_leakyrelu.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_leakyrelu"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_leakyrelu"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_less.cpp b/tests/op/test_onnx_op_less.cpp index ad286141b..4976bdb66 100644 --- a/tests/op/test_onnx_op_less.cpp +++ b/tests/op/test_onnx_op_less.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_less"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_less"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_log.cpp b/tests/op/test_onnx_op_log.cpp index 9e9792a6a..b88123f1f 100644 --- a/tests/op/test_onnx_op_log.cpp +++ b/tests/op/test_onnx_op_log.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_log"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_log"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_logsoftmax_default_axis.cpp b/tests/op/test_onnx_op_logsoftmax_default_axis.cpp index b5939e428..aa2293a01 100644 --- a/tests/op/test_onnx_op_logsoftmax_default_axis.cpp +++ b/tests/op/test_onnx_op_logsoftmax_default_axis.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_logsoftmax_default_axis"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_logsoftmax_default_axis"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_lstm_defaults.cpp b/tests/op/test_onnx_op_lstm_defaults.cpp index 48d307e46..d5de60070 100644 --- a/tests/op/test_onnx_op_lstm_defaults.cpp +++ b/tests/op/test_onnx_op_lstm_defaults.cpp @@ -22,15 +22,14 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_lstm_defaults"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; -std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; -std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string node = "test_lstm_defaults"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; +std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -118,7 +117,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* input 2 */ int input_size_2 = c_2 * h_2 * w_2; @@ -165,7 +164,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_lstm_with_initial_bias.cpp b/tests/op/test_onnx_op_lstm_with_initial_bias.cpp index d76dc4b41..d5f61ad52 100644 --- a/tests/op/test_onnx_op_lstm_with_initial_bias.cpp +++ b/tests/op/test_onnx_op_lstm_with_initial_bias.cpp @@ -22,16 +22,15 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_lstm_with_initial_bias"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; -std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; -std::string input_pb_3 = "../onnx_node/" + node + "/test_data_set_0/input_3.pb"; -std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string node = "test_lstm_with_initial_bias"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb"; +std::string input_pb_3 = "../onnx_node/" + node + "/test_data_set_0/input_3.pb"; +std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -53,7 +52,7 @@ int main(int argc, char* argv[]) int n_3 = 1; int c_3 = 1; int h_3 = 1; - int w_3 = 32; + int w_3 = 32; /* set runtime options */ struct options opt; @@ -124,7 +123,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* input 2 */ int input_size_2 = c_2 * h_2 * w_2; @@ -148,7 +147,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* input 3 */ int input_size_3 = h_3 * w_3; @@ -196,7 +195,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_matmul_2d.cpp b/tests/op/test_onnx_op_matmul_2d.cpp index 866b9cd24..ef7f75c5b 100644 --- a/tests/op/test_onnx_op_matmul_2d.cpp +++ b/tests/op/test_onnx_op_matmul_2d.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_matmul_2d"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_matmul_2d"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -108,7 +107,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -130,7 +129,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_matmul_3d.cpp b/tests/op/test_onnx_op_matmul_3d.cpp index 28ed0612b..67e16bec1 100644 --- a/tests/op/test_onnx_op_matmul_3d.cpp +++ b/tests/op/test_onnx_op_matmul_3d.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_matmul_3d"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_matmul_3d"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -110,7 +109,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -132,7 +131,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_matmul_4d.cpp b/tests/op/test_onnx_op_matmul_4d.cpp index dcdc86d5c..02308791f 100644 --- a/tests/op/test_onnx_op_matmul_4d.cpp +++ b/tests/op/test_onnx_op_matmul_4d.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_matmul_4d"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_matmul_4d"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_maxpool_2d_default.cpp b/tests/op/test_onnx_op_maxpool_2d_default.cpp index a6316ffa8..6f5ef2fce 100644 --- a/tests/op/test_onnx_op_maxpool_2d_default.cpp +++ b/tests/op/test_onnx_op_maxpool_2d_default.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_maxpool_2d_default"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_maxpool_2d_default"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_maxpool_2d_dilations.cpp b/tests/op/test_onnx_op_maxpool_2d_dilations.cpp index d063e4851..475499f69 100644 --- a/tests/op/test_onnx_op_maxpool_2d_dilations.cpp +++ b/tests/op/test_onnx_op_maxpool_2d_dilations.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_maxpool_2d_dilations"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_maxpool_2d_dilations"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_maxpool_2d_pads.cpp b/tests/op/test_onnx_op_maxpool_2d_pads.cpp index 02f757e66..7de587e1a 100644 --- a/tests/op/test_onnx_op_maxpool_2d_pads.cpp +++ b/tests/op/test_onnx_op_maxpool_2d_pads.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_maxpool_2d_pads"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_maxpool_2d_pads"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_neg.cpp b/tests/op/test_onnx_op_neg.cpp index 1805caf47..b9c54fbe1 100644 --- a/tests/op/test_onnx_op_neg.cpp +++ b/tests/op/test_onnx_op_neg.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_neg"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_neg"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_pow.cpp b/tests/op/test_onnx_op_pow.cpp index b8a4dcfbd..c8a129d42 100644 --- a/tests/op/test_onnx_op_pow.cpp +++ b/tests/op/test_onnx_op_pow.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_pow"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_pow"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_reciprocal.cpp b/tests/op/test_onnx_op_reciprocal.cpp index 328e84ae9..aa8f1703d 100644 --- a/tests/op/test_onnx_op_reciprocal.cpp +++ b/tests/op/test_onnx_op_reciprocal.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_reciprocal"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_reciprocal"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_reduce_log_sum_default.cpp b/tests/op/test_onnx_op_reduce_log_sum_default.cpp index 7871749a2..967d53786 100644 --- a/tests/op/test_onnx_op_reduce_log_sum_default.cpp +++ b/tests/op/test_onnx_op_reduce_log_sum_default.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_reduce_log_sum_default"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_reduce_log_sum_default"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_reduce_max_default_axes_keepdim_example.cpp b/tests/op/test_onnx_op_reduce_max_default_axes_keepdim_example.cpp index 078f728d4..22114ea6b 100644 --- a/tests/op/test_onnx_op_reduce_max_default_axes_keepdim_example.cpp +++ b/tests/op/test_onnx_op_reduce_max_default_axes_keepdim_example.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_reduce_max_default_axes_keepdim_example"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_reduce_max_default_axes_keepdim_example"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_reduce_mean_default_axes_keepdims_example.cpp b/tests/op/test_onnx_op_reduce_mean_default_axes_keepdims_example.cpp index 7ee3e5e48..ef0a74ad2 100644 --- a/tests/op/test_onnx_op_reduce_mean_default_axes_keepdims_example.cpp +++ b/tests/op/test_onnx_op_reduce_mean_default_axes_keepdims_example.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_reduce_mean_default_axes_keepdims_example"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_reduce_mean_default_axes_keepdims_example"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_reduce_min_default_axes_keepdims_example.cpp b/tests/op/test_onnx_op_reduce_min_default_axes_keepdims_example.cpp index b3522ce9c..6eea307cf 100644 --- a/tests/op/test_onnx_op_reduce_min_default_axes_keepdims_example.cpp +++ b/tests/op/test_onnx_op_reduce_min_default_axes_keepdims_example.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_reduce_min_default_axes_keepdims_example"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_reduce_min_default_axes_keepdims_example"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_reduce_sum_square_default_axes_keepdims_example.cpp b/tests/op/test_onnx_op_reduce_sum_square_default_axes_keepdims_example.cpp index 724c300fa..e07d3fb26 100644 --- a/tests/op/test_onnx_op_reduce_sum_square_default_axes_keepdims_example.cpp +++ b/tests/op/test_onnx_op_reduce_sum_square_default_axes_keepdims_example.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_reduce_sum_square_default_axes_keepdims_example"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_reduce_sum_square_default_axes_keepdims_example"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_relu.cpp b/tests/op/test_onnx_op_relu.cpp index ac53247da..f8f6b70f9 100644 --- a/tests/op/test_onnx_op_relu.cpp +++ b/tests/op/test_onnx_op_relu.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_relu"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_relu"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_selu.cpp b/tests/op/test_onnx_op_selu.cpp index af057807f..35510186b 100644 --- a/tests/op/test_onnx_op_selu.cpp +++ b/tests/op/test_onnx_op_selu.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_selu"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_selu"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_selu_default.cpp b/tests/op/test_onnx_op_selu_default.cpp index 6225d028c..510c6fc6a 100644 --- a/tests/op/test_onnx_op_selu_default.cpp +++ b/tests/op/test_onnx_op_selu_default.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_selu_default"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_selu_default"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_softmax_default_axis.cpp b/tests/op/test_onnx_op_softmax_default_axis.cpp index 57581c1d9..81aff8276 100644 --- a/tests/op/test_onnx_op_softmax_default_axis.cpp +++ b/tests/op/test_onnx_op_softmax_default_axis.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_softmax_default_axis"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_softmax_default_axis"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_softplus.cpp b/tests/op/test_onnx_op_softplus.cpp index c6b3c1cc8..3a6a9bb15 100644 --- a/tests/op/test_onnx_op_softplus.cpp +++ b/tests/op/test_onnx_op_softplus.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_softplus"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_softplus"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_squeeze.cpp b/tests/op/test_onnx_op_squeeze.cpp index 2fdccb469..d5406e476 100644 --- a/tests/op/test_onnx_op_squeeze.cpp +++ b/tests/op/test_onnx_op_squeeze.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_squeeze"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_squeeze"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_sub.cpp b/tests/op/test_onnx_op_sub.cpp index 9a2f75db0..137afed64 100644 --- a/tests/op/test_onnx_op_sub.cpp +++ b/tests/op/test_onnx_op_sub.cpp @@ -22,14 +22,13 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_sub"; -std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; -std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; +std::string node = "test_sub"; +std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -112,7 +111,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -134,7 +133,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_tanh.cpp b/tests/op/test_onnx_op_tanh.cpp index 66c974957..78b2d1628 100644 --- a/tests/op/test_onnx_op_tanh.cpp +++ b/tests/op/test_onnx_op_tanh.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_tanh"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_tanh"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_onnx_op_unsqueeze_axis_1.cpp b/tests/op/test_onnx_op_unsqueeze_axis_1.cpp index 2bf5406e7..99105f29e 100644 --- a/tests/op/test_onnx_op_unsqueeze_axis_1.cpp +++ b/tests/op/test_onnx_op_unsqueeze_axis_1.cpp @@ -22,13 +22,12 @@ * Author: qtang@openailab.com */ - #include "test_onnx_op.h" -std::string node = "test_unsqueeze_axis_1"; -std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; +std::string node = "test_unsqueeze_axis_1"; +std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb"; std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb"; -std::string model = "../onnx_node/" + node + "/onnx.tmfile"; +std::string model = "../onnx_node/" + node + "/onnx.tmfile"; int main(int argc, char* argv[]) { @@ -81,7 +80,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Set input tensor buffer failed\n"); return -1; - } + } /* prerun graph, set work options(num_thread, cluster, precision) */ if (prerun_graph_multithread(graph, opt) < 0) @@ -102,7 +101,7 @@ int main(int argc, char* argv[]) /* get the current result of inference */ tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); + float* output_data = (float*)get_tensor_buffer(output_tensor); int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); /* get the reference result of inference */ diff --git a/tests/op/test_op.h b/tests/op/test_op.h index b08f81e44..dd9c97007 100644 --- a/tests/op/test_op.h +++ b/tests/op/test_op.h @@ -18,11 +18,10 @@ #include "graph/tensor.h" #define TENSOR_SHOW_LEADING_BLANK " " -#define TENSOR_FLOAT_EPSILON 0.0001f +#define TENSOR_FLOAT_EPSILON 0.0001f typedef int (*common_test)(graph_t, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w); - void dump_tensor_line(void* data_ptr, int offset, int data_type, int w) { if (0 >= w) @@ -33,80 +32,79 @@ void dump_tensor_line(void* data_ptr, int offset, int data_type, int w) printf("[ "); - switch(data_type) + switch (data_type) + { + case TENGINE_DT_FP32: { - case TENGINE_DT_FP32: + float* p = (float*)data_ptr; + + for (int i = 0; i < w - 1; i++) { - float* p = ( float* )data_ptr; + printf("%0.2f, ", p[offset + i]); + } + printf("%0.2f ", p[offset + w - 1]); - for(int i = 0; i < w - 1; i++) - { - printf("%0.2f, ", p[offset + i]); - } - printf("%0.2f ", p[offset + w - 1]); + break; + } + case TENGINE_DT_FP16: + { + __fp16* p = (__fp16*)data_ptr; - break; +#ifdef __ARM_ARCH + for (int i = 0; i < w - 1; i++) + { + printf("%f, ", (float)p[offset + i]); } - case TENGINE_DT_FP16: + printf("%f ", (float)p[offset + w - 1]); +#else + for (int i = 0; i < w - 1; i++) { - __fp16* p = ( __fp16* )data_ptr; + printf("%f, ", fp16_to_fp32(p[offset + i])); + } + printf("%f ", fp16_to_fp32(p[offset + w - 1])); +#endif + break; + } + case TENGINE_DT_INT8: + case TENGINE_DT_UINT8: + { + if (data_type == TENGINE_DT_INT8) + { + int8_t* p = (int8_t*)data_ptr; -#ifdef __ARM_ARCH - for(int i = 0; i < w - 1; i++) + for (int i = 0; i < w - 1; i++) { - printf("%f, ", (float)p[offset + i]); + printf("%d, ", (int)p[offset + i]); } - printf("%f ", (float)p[offset + w - 1]); -#else - for(int i = 0; i < w - 1; i++) - { - printf("%f, ", fp16_to_fp32(p[offset + i])); - } - printf("%f ", fp16_to_fp32(p[offset + w - 1])); -#endif - break; + printf("%d ", (int)p[offset + w - 1]); } - case TENGINE_DT_INT8: - case TENGINE_DT_UINT8: + else { - if(data_type == TENGINE_DT_INT8) - { - int8_t* p = ( int8_t* )data_ptr; + uint8_t* p = (uint8_t*)data_ptr; - for(int i = 0; i < w - 1; i++) - { - printf("%d, ", (int)p[offset + i]); - } - printf("%d ", (int)p[offset + w - 1]); - } - else + for (int i = 0; i < w - 1; i++) { - uint8_t* p = ( uint8_t* )data_ptr; - - for(int i = 0; i < w - 1; i++) - { - printf("%d, ", (int)p[offset + i]); - } - printf("%d ", (int)p[offset + w - 1]); + printf("%d, ", (int)p[offset + i]); } - - break; + printf("%d ", (int)p[offset + w - 1]); } - default: - // not deal with TENGINE_DT_INT16 and TENGINE_DT_INT32 - fprintf(stderr, "Unsupported data type for now. "); + + break; + } + default: + // not deal with TENGINE_DT_INT16 and TENGINE_DT_INT32 + fprintf(stderr, "Unsupported data type for now. "); } printf("]"); } - void dump_tensor(tensor_t tensor, const char* message) { int data_type = get_tensor_data_type(tensor); void* data_ptr = get_tensor_buffer(tensor); - int dim_array[MAX_SHAPE_DIM_NUM] = { 0 }; + int dim_array[MAX_SHAPE_DIM_NUM] = {0}; int dim_count = get_tensor_shape(tensor, dim_array, MAX_SHAPE_DIM_NUM); if (0 >= dim_count) fprintf(stderr, "Cannot get tensor shape."); @@ -119,34 +117,34 @@ void dump_tensor(tensor_t tensor, const char* message) switch (dim_count) { - case 4: - { - n = dim_array[0]; - c = dim_array[1]; - h = dim_array[2]; - w = dim_array[3]; - break; - } - case 3: - { - c = dim_array[0]; - h = dim_array[1]; - w = dim_array[2]; - break; - } - case 2: - { - h = dim_array[0]; - w = dim_array[1]; - break; - } - case 1: - { - w = dim_array[0]; - break; - } - default: - fprintf(stderr, "Cannot found the type of tensor.\n"); + case 4: + { + n = dim_array[0]; + c = dim_array[1]; + h = dim_array[2]; + w = dim_array[3]; + break; + } + case 3: + { + c = dim_array[0]; + h = dim_array[1]; + w = dim_array[2]; + break; + } + case 2: + { + h = dim_array[0]; + w = dim_array[1]; + break; + } + case 1: + { + w = dim_array[0]; + break; + } + default: + fprintf(stderr, "Cannot found the type of tensor.\n"); } // print leader @@ -182,11 +180,10 @@ void dump_tensor(tensor_t tensor, const char* message) printf("].\n"); } - void dump_node_input(node_t test_node, int index) { tensor_t tensor = get_node_input_tensor(test_node, index); - if(NULL == tensor) + if (NULL == tensor) { fprintf(stderr, "Get input tensor(%d) from the node failed.\n", index); return; @@ -200,11 +197,10 @@ void dump_node_input(node_t test_node, int index) release_graph_tensor(tensor); } - void dump_node_output(node_t test_node, int index) { tensor_t tensor = get_node_output_tensor(test_node, index); - if(NULL == tensor) + if (NULL == tensor) { fprintf(stderr, "Get output tensor from the node failed.\n"); return; @@ -218,7 +214,6 @@ void dump_node_output(node_t test_node, int index) release_graph_tensor(tensor); } - int create_node(graph_t graph, const char* node_name, int n, int c, int h, int w, int data_type, int layout) { node_t node = create_graph_node(graph, node_name, "InputOp"); @@ -229,7 +224,7 @@ int create_node(graph_t graph, const char* node_name, int n, int c, int h, int w } tensor_t tensor = create_graph_tensor(graph, node_name, data_type); - if(NULL == tensor) + if (NULL == tensor) { release_graph_node(node); @@ -239,13 +234,13 @@ int create_node(graph_t graph, const char* node_name, int n, int c, int h, int w set_node_output_tensor(node, 0, tensor, TENSOR_TYPE_INPUT); - if(TENGINE_LAYOUT_NCHW == layout) + if (TENGINE_LAYOUT_NCHW == layout) { int dims[4] = {n, c, h, w}; set_tensor_shape(tensor, dims, 4); } - if(TENGINE_LAYOUT_NHWC == layout) + if (TENGINE_LAYOUT_NHWC == layout) { int dims[4] = {n, h, w, c}; set_tensor_shape(tensor, dims, 4); @@ -257,7 +252,6 @@ int create_node(graph_t graph, const char* node_name, int n, int c, int h, int w return 0; } - int create_input_node(graph_t graph, const char* node_name, int data_type, int layout, int n, int c, int h, int w, int dims_count = 4) { if (0 == n) dims_count = 3; @@ -277,7 +271,7 @@ int create_input_node(graph_t graph, const char* node_name, int data_type, int l } tensor_t tensor = create_graph_tensor(graph, node_name, data_type); - if(NULL == tensor) + if (NULL == tensor) { release_graph_node(node); @@ -287,7 +281,7 @@ int create_input_node(graph_t graph, const char* node_name, int data_type, int l } int ret = set_node_output_tensor(node, 0, tensor, TENSOR_TYPE_INPUT); - if(0 != ret) + if (0 != ret) { release_graph_tensor(tensor); release_graph_node(node); @@ -297,70 +291,70 @@ int create_input_node(graph_t graph, const char* node_name, int data_type, int l return -1; } - switch(dims_count) + switch (dims_count) + { + case 1: + { + int dims_array[1] = {w}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } + case 2: { - case 1: + int dims_array[2] = {h, w}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } + case 3: + { + if (TENGINE_LAYOUT_NCHW == layout) { - int dims_array[1] = { w }; + int dims_array[3] = {c, h, w}; set_tensor_shape(tensor, dims_array, dims_count); break; } - case 2: + + if (TENGINE_LAYOUT_NHWC == layout) { - int dims_array[2] = { h, w }; + int dims_array[3] = {h, w, c}; set_tensor_shape(tensor, dims_array, dims_count); break; } - case 3: + } + case 4: + { + if (TENGINE_LAYOUT_NCHW == layout) { - if (TENGINE_LAYOUT_NCHW == layout) - { - int dims_array[3] = { c, h, w }; - set_tensor_shape(tensor, dims_array, dims_count); - break; - } - - if (TENGINE_LAYOUT_NHWC == layout) - { - int dims_array[3] = { h, w, c }; - set_tensor_shape(tensor, dims_array, dims_count); - break; - } + int dims_array[4] = {n, c, h, w}; + set_tensor_shape(tensor, dims_array, dims_count); + break; } - case 4: - { - if (TENGINE_LAYOUT_NCHW == layout) - { - int dims_array[4] = { n, c, h, w }; - set_tensor_shape(tensor, dims_array, dims_count); - break; - } - if (TENGINE_LAYOUT_NHWC == layout) - { - int dims_array[4] = { n, h, w, c }; - set_tensor_shape(tensor, dims_array, dims_count); - break; - } + if (TENGINE_LAYOUT_NHWC == layout) + { + int dims_array[4] = {n, h, w, c}; + set_tensor_shape(tensor, dims_array, dims_count); + break; } - case 5: + } + case 5: + { + if (TENGINE_LAYOUT_NCHW == layout) { - if (TENGINE_LAYOUT_NCHW == layout) - { - int dims_array[5] = {1, n, c, h, w }; - set_tensor_shape(tensor, dims_array, dims_count); - break; - } + int dims_array[5] = {1, n, c, h, w}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } - if (TENGINE_LAYOUT_NHWC == layout) - { - int dims_array[5] = {1, n, h, w, c }; - set_tensor_shape(tensor, dims_array, dims_count); - break; - } + if (TENGINE_LAYOUT_NHWC == layout) + { + int dims_array[5] = {1, n, h, w, c}; + set_tensor_shape(tensor, dims_array, dims_count); + break; } - default: - fprintf(stderr, "Cannot support %d dims tensor.\n", dims_count); + } + default: + fprintf(stderr, "Cannot support %d dims tensor.\n", dims_count); } release_graph_tensor(tensor); @@ -369,7 +363,6 @@ int create_input_node(graph_t graph, const char* node_name, int data_type, int l return 0; } - int fill_fp32_tensor(tensor_t tensor, float value) { int dims[MAX_SHAPE_DIM_NUM]; @@ -394,7 +387,6 @@ int fill_fp32_tensor(tensor_t tensor, float value) return 0; } - int fill_uint8_tensor(tensor_t tensor, float value) { int dims[MAX_SHAPE_DIM_NUM]; @@ -416,7 +408,7 @@ int fill_uint8_tensor(tensor_t tensor, float value) int input_zero_point = 0; get_tensor_quant_param(tensor, &input_scale, &input_zero_point, 1); - uint8_t * data_ptr = (uint8_t *)get_tensor_buffer(tensor); + uint8_t* data_ptr = (uint8_t*)get_tensor_buffer(tensor); for (int i = 0; i < element_count; i++) { int udata = (round)(value / input_scale + (float)input_zero_point); @@ -430,118 +422,112 @@ int fill_uint8_tensor(tensor_t tensor, float value) return 0; } - void fill_input_float_tensor_by_index(graph_t graph, int input_node_index, int tensor_index, float value) { tensor_t tensor = get_graph_input_tensor(graph, input_node_index, tensor_index); - if(NULL == tensor) + if (NULL == tensor) fprintf(stderr, "Cannot find the %dth tensor via node index(%d).\n", tensor_index, input_node_index); int buf_size = get_tensor_buffer_size(tensor); - float* data = (float* )malloc(buf_size); + float* data = (float*)malloc(buf_size); -// for(int i = 0; i < buf_size/sizeof(float); i++) -// data[i] = value; + // for(int i = 0; i < buf_size/sizeof(float); i++) + // data[i] = value; - int ret = set_tensor_buffer(tensor, (void* )data, buf_size); - if(0 != ret) + int ret = set_tensor_buffer(tensor, (void*)data, buf_size); + if (0 != ret) fprintf(stderr, "Set buffer for tensor failed.\n"); ret = fill_fp32_tensor(tensor, value); - if(0 != ret) + if (0 != ret) fprintf(stderr, "Fill buffer for tensor failed.\n"); } - void fill_input_uint8_tensor_by_index(graph_t graph, int input_node_index, int tensor_index, float value) { tensor_t tensor = get_graph_input_tensor(graph, input_node_index, tensor_index); - if(NULL == tensor) + if (NULL == tensor) fprintf(stderr, "Cannot find the %dth tensor via node index(%d).\n", tensor_index, input_node_index); int buf_size = get_tensor_buffer_size(tensor); - uint8_t* data = (uint8_t* )malloc(buf_size); + uint8_t* data = (uint8_t*)malloc(buf_size); - int ret = set_tensor_buffer(tensor, (void* )data, buf_size); - if(0 != ret) + int ret = set_tensor_buffer(tensor, (void*)data, buf_size); + if (0 != ret) fprintf(stderr, "Set buffer for tensor failed.\n"); ret = fill_uint8_tensor(tensor, value); - if(0 != ret) + if (0 != ret) fprintf(stderr, "Fill buffer for tensor failed.\n"); } - void fill_input_float_tensor_by_name(graph_t graph, const char* node_name, int tensor_index, float value) { node_t node = get_graph_node(graph, node_name); - if(NULL == node) + if (NULL == node) fprintf(stderr, "Cannot get node via node name(%s).\n", node_name); tensor_t tensor = get_node_input_tensor(node, tensor_index); - if(NULL == tensor) + if (NULL == tensor) fprintf(stderr, "Cannot find the %dth tensor via node name(%s)\n", tensor_index, node_name); int buf_size = get_tensor_buffer_size(tensor); - float* data = (float* )malloc(buf_size); + float* data = (float*)malloc(buf_size); -// for(unsigned int i = 0; i < buf_size/sizeof(float) ; i++) -// data[i] = value; + // for(unsigned int i = 0; i < buf_size/sizeof(float) ; i++) + // data[i] = value; - int ret = set_tensor_buffer(tensor, (void* )data, buf_size); - if(0 != ret) + int ret = set_tensor_buffer(tensor, (void*)data, buf_size); + if (0 != ret) fprintf(stderr, "Set buffer for tensor failed.\n"); ret = fill_fp32_tensor(tensor, value); - if(0 != ret) + if (0 != ret) fprintf(stderr, "Fill buffer for tensor failed.\n"); } - void fill_input_float_buffer_tensor_by_name(graph_t graph, const char* node_name, int tensor_index, void* value, int buf_size) { node_t node = get_graph_node(graph, node_name); - if(NULL == node) + if (NULL == node) fprintf(stderr, "Cannot get node via node name(%s).\n", node_name); tensor_t tensor = get_node_input_tensor(node, tensor_index); - if(NULL == tensor) + if (NULL == tensor) fprintf(stderr, "Cannot find the %dth tensor via node name(%s).\n", tensor_index, node_name); int ret = set_tensor_buffer(tensor, value, buf_size); - if(0 != ret) + if (0 != ret) fprintf(stderr, "Set buffer for tensor failed.\n"); } - void fill_input_integer_tensor_by_name(graph_t graph, const char* node_name, int tensor_index, int value) { node_t node = get_graph_node(graph, node_name); - if(NULL == node) + if (NULL == node) { fprintf(stderr, "Cannot get node via node name(%s).\n", node_name); return; } tensor_t tensor = get_node_input_tensor(node, tensor_index); - if(NULL == tensor) + if (NULL == tensor) { fprintf(stderr, "Cannot find the %dth tensor via node name(%s).\n", tensor_index, node_name); return; } int buf_size = get_tensor_buffer_size(tensor); - int* data = (int* )malloc(buf_size); + int* data = (int*)malloc(buf_size); - for(unsigned int i = 0; i < buf_size/sizeof(int) ; i++) + for (unsigned int i = 0; i < buf_size / sizeof(int); i++) data[i] = value; - int ret = set_tensor_buffer(tensor, (void* )data, buf_size); - if(0 != ret) + int ret = set_tensor_buffer(tensor, (void*)data, buf_size); + if (0 != ret) fprintf(stderr, "Set buffer for tensor failed.\n"); } - int test_graph_init() { // now init tengine will mask critical filed and return an error @@ -551,10 +537,9 @@ int test_graph_init() return 0; } - int test_graph_run(graph_t graph) { - if(prerun_graph(graph) < 0) + if (prerun_graph(graph) < 0) { fprintf(stderr, "Pre-run graph failed.\n"); return -1; @@ -571,7 +556,6 @@ int test_graph_run(graph_t graph) return 0; } - void test_graph_release(graph_t graph) { postrun_graph(graph); @@ -579,30 +563,29 @@ void test_graph_release(graph_t graph) release_tengine(); } - graph_t create_common_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) { graph_t graph = create_graph(NULL, NULL, NULL); - if(NULL == graph) + if (NULL == graph) { fprintf(stderr, "get graph failed.\n"); return NULL; } - if(set_graph_layout(graph, layout) < 0) + if (set_graph_layout(graph, layout) < 0) { fprintf(stderr, "set layout failed.\n"); return NULL; } const char* input_name = "input_node"; - if(create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0) + if (create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0) { fprintf(stderr, "create input node failed.\n"); return NULL; } - if(test_func(graph, input_name, test_node_name, data_type, layout, n, c, h ,w) < 0) + if (test_func(graph, input_name, test_node_name, data_type, layout, n, c, h, w) < 0) { fprintf(stderr, "create test node failed.\n"); return NULL; @@ -612,13 +595,13 @@ graph_t create_common_test_graph(const char* test_node_name, int data_type, int const char* inputs[] = {input_name}; const char* outputs[] = {test_node_name}; - if(set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) + if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) { fprintf(stderr, "set inputs failed.\n"); return NULL; } - if(set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) + if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) { fprintf(stderr, "set outputs failed.\n"); return NULL; @@ -627,7 +610,6 @@ graph_t create_common_test_graph(const char* test_node_name, int data_type, int return graph; } - graph_t create_timvx_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) { /* create VeriSilicon TIM-VX backend */ @@ -640,26 +622,26 @@ graph_t create_timvx_test_graph(const char* test_node_name, int data_type, int l } graph_t graph = create_graph(timvx_context, NULL, NULL); - if(NULL == graph) + if (NULL == graph) { fprintf(stderr, "get graph failed.\n"); return NULL; } - if(set_graph_layout(graph, layout) < 0) + if (set_graph_layout(graph, layout) < 0) { fprintf(stderr, "set layout failed.\n"); return NULL; } const char* input_name = "input_node"; - if(create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0) + if (create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0) { fprintf(stderr, "create input node failed.\n"); return NULL; } - if(test_func(graph, input_name, test_node_name, data_type, layout, n, c, h ,w) < 0) + if (test_func(graph, input_name, test_node_name, data_type, layout, n, c, h, w) < 0) { fprintf(stderr, "create test node failed.\n"); return NULL; @@ -669,13 +651,13 @@ graph_t create_timvx_test_graph(const char* test_node_name, int data_type, int l const char* inputs[] = {input_name}; const char* outputs[] = {test_node_name}; - if(set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) + if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) { fprintf(stderr, "set inputs failed.\n"); return NULL; } - if(set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) + if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) { fprintf(stderr, "set outputs failed.\n"); return NULL; @@ -696,26 +678,26 @@ graph_t create_tensorrt_test_graph(const char* test_node_name, int data_type, in } graph_t graph = create_graph(timvx_context, NULL, NULL); - if(NULL == graph) + if (NULL == graph) { fprintf(stderr, "get graph failed.\n"); return NULL; } - if(set_graph_layout(graph, layout) < 0) + if (set_graph_layout(graph, layout) < 0) { fprintf(stderr, "set layout failed.\n"); return NULL; } const char* input_name = "input_node"; - if(create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0) + if (create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0) { fprintf(stderr, "create input node failed.\n"); return NULL; } - if(test_func(graph, input_name, test_node_name, data_type, layout, n, c, h ,w) < 0) + if (test_func(graph, input_name, test_node_name, data_type, layout, n, c, h, w) < 0) { fprintf(stderr, "create test node failed.\n"); return NULL; @@ -725,13 +707,13 @@ graph_t create_tensorrt_test_graph(const char* test_node_name, int data_type, in const char* inputs[] = {input_name}; const char* outputs[] = {test_node_name}; - if(set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) + if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) { fprintf(stderr, "set inputs failed.\n"); return NULL; } - if(set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) + if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) { fprintf(stderr, "set outputs failed.\n"); return NULL; @@ -743,26 +725,26 @@ graph_t create_tensorrt_test_graph(const char* test_node_name, int data_type, in graph_t create_cpu_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) { graph_t graph = create_graph(NULL, NULL, NULL); - if(NULL == graph) + if (NULL == graph) { fprintf(stderr, "get graph failed.\n"); return NULL; } - if(set_graph_layout(graph, layout) < 0) + if (set_graph_layout(graph, layout) < 0) { fprintf(stderr, "set layout failed.\n"); return NULL; } const char* input_name = "input_node"; - if(create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0) + if (create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0) { fprintf(stderr, "create input node failed.\n"); return NULL; } - if(test_func(graph, input_name, test_node_name, data_type, layout, n, c, h ,w) < 0) + if (test_func(graph, input_name, test_node_name, data_type, layout, n, c, h, w) < 0) { fprintf(stderr, "create test node failed.\n"); return NULL; @@ -772,13 +754,13 @@ graph_t create_cpu_test_graph(const char* test_node_name, int data_type, int lay const char* inputs[] = {input_name}; const char* outputs[] = {test_node_name}; - if(set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) + if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) { fprintf(stderr, "set inputs failed.\n"); return NULL; } - if(set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) + if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) { fprintf(stderr, "set outputs failed.\n"); return NULL; @@ -818,75 +800,74 @@ int compare_tensor(tensor_t a, tensor_t b) switch (a_type) { - case TENGINE_DT_FP32: - { - float* a_data_ptr = (float*)get_tensor_buffer(a); - float* b_data_ptr = (float*)get_tensor_buffer(b); - - for (int i = 0; i < element_size; i++) - if (fabsf(a_data_ptr[i] - b_data_ptr[i]) < TENSOR_FLOAT_EPSILON) - return -1; + case TENGINE_DT_FP32: + { + float* a_data_ptr = (float*)get_tensor_buffer(a); + float* b_data_ptr = (float*)get_tensor_buffer(b); - break; - } - case TENGINE_DT_FP16: - { - __fp16* a_data_ptr = (__fp16*)get_tensor_buffer(a); - __fp16* b_data_ptr = (__fp16*)get_tensor_buffer(b); + for (int i = 0; i < element_size; i++) + if (fabsf(a_data_ptr[i] - b_data_ptr[i]) < TENSOR_FLOAT_EPSILON) + return -1; - for (int i = 0; i < element_size; i++) - { - if (fabsf((float)fp16_to_fp32(a_data_ptr[i]) - (float)fp16_to_fp32(b_data_ptr[i])) < TENSOR_FLOAT_EPSILON) - return -1; - } + break; + } + case TENGINE_DT_FP16: + { + __fp16* a_data_ptr = (__fp16*)get_tensor_buffer(a); + __fp16* b_data_ptr = (__fp16*)get_tensor_buffer(b); - break; - } - case TENGINE_DT_INT32: + for (int i = 0; i < element_size; i++) { - int32_t* a_data_ptr = (int32_t*)get_tensor_buffer(a); - int32_t* b_data_ptr = (int32_t*)get_tensor_buffer(b); + if (fabsf((float)fp16_to_fp32(a_data_ptr[i]) - (float)fp16_to_fp32(b_data_ptr[i])) < TENSOR_FLOAT_EPSILON) + return -1; + } - for (int i = 0; i < element_size; i++) - if (a_data_ptr[i] != b_data_ptr[i]) - return -1; + break; + } + case TENGINE_DT_INT32: + { + int32_t* a_data_ptr = (int32_t*)get_tensor_buffer(a); + int32_t* b_data_ptr = (int32_t*)get_tensor_buffer(b); - break; - } - case TENGINE_DT_INT16: - { - int16_t* a_data_ptr = (int16_t*)get_tensor_buffer(a); - int16_t* b_data_ptr = (int16_t*)get_tensor_buffer(b); + for (int i = 0; i < element_size; i++) + if (a_data_ptr[i] != b_data_ptr[i]) + return -1; - for (int i = 0; i < element_size; i++) - if (a_data_ptr[i] != b_data_ptr[i]) - return -1; + break; + } + case TENGINE_DT_INT16: + { + int16_t* a_data_ptr = (int16_t*)get_tensor_buffer(a); + int16_t* b_data_ptr = (int16_t*)get_tensor_buffer(b); - break; - } - case TENGINE_DT_UINT8: - case TENGINE_DT_INT8: - { - int8_t* a_data_ptr = (int8_t*)get_tensor_buffer(a); - int8_t* b_data_ptr = (int8_t*)get_tensor_buffer(b); + for (int i = 0; i < element_size; i++) + if (a_data_ptr[i] != b_data_ptr[i]) + return -1; - for (int i = 0; i < element_size; i++) - if (a_data_ptr[i] != b_data_ptr[i]) - return -1; + break; + } + case TENGINE_DT_UINT8: + case TENGINE_DT_INT8: + { + int8_t* a_data_ptr = (int8_t*)get_tensor_buffer(a); + int8_t* b_data_ptr = (int8_t*)get_tensor_buffer(b); - break; - } - default: - { - fprintf(stderr, "The type of tensor was not supported.\n"); - return -1; - } + for (int i = 0; i < element_size; i++) + if (a_data_ptr[i] != b_data_ptr[i]) + return -1; + + break; + } + default: + { + fprintf(stderr, "The type of tensor was not supported.\n"); + return -1; + } } return 0; } - static inline unsigned long get_current_time(void) { struct timespec tm; diff --git a/tests/op/test_op_conv.c b/tests/op/test_op_conv.c index c1799d471..5a8cffaa2 100644 --- a/tests/op/test_op_conv.c +++ b/tests/op/test_op_conv.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include "tengine/c_api.h" #include "tengine/c_api_ex.h" @@ -41,41 +41,41 @@ void record_allocated_buf(void* buf) void free_allocated_buf(void) { - for(int i = 0; i < allocated_num; i++) + for (int i = 0; i < allocated_num; i++) free(record_ptr[i]); - if(record_ptr) + if (record_ptr) free(record_ptr); } void init_buffer(void* buf, int elem_num, int elem_size, int val) { - for(int i = 0; i < elem_num; i++) + for (int i = 0; i < elem_num; i++) { float val0; float* fp; int16_t* i16; char* c; - if(val >= 0) + if (val >= 0) val0 = val; else - val0 = i%10; + val0 = i % 10; - switch(elem_size) + switch (elem_size) { - case 4: - fp = ( float* )buf; - fp[i] = val0; - break; - case 2: - i16 = ( int16_t* )buf; - i16[i] = val0; - break; - case 1: - c = ( char* )buf; - c[i] = val0; - break; + case 4: + fp = (float*)buf; + fp[i] = val0; + break; + case 2: + i16 = (int16_t*)buf; + i16[i] = val0; + break; + case 1: + c = (char*)buf; + c[i] = val0; + break; } } } @@ -129,7 +129,7 @@ int create_conv_node(graph_t graph, const char* node_name, const char* input_nam tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(input_tensor == NULL) + if (input_tensor == NULL) { fprintf(stderr, "errno= %d\n", get_tengine_errno()); return -1; @@ -178,7 +178,7 @@ int create_pooling_node(graph_t graph, const char* node_name, const char* input_ tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(input_tensor == NULL) + if (input_tensor == NULL) { fprintf(stderr, "ERRNO: %d\n", get_tengine_errno()); return -1; @@ -202,7 +202,7 @@ graph_t create_test_graph(int c, int h, int w, int out_c) { graph_t graph = create_graph(NULL, NULL, NULL); - if(graph == NULL) + if (graph == NULL) { fprintf(stderr, "ERRNO: %d\n", get_tengine_errno()); return NULL; @@ -211,7 +211,7 @@ graph_t create_test_graph(int c, int h, int w, int out_c) const char* input_name = "data"; const char* conv_name = "conv"; - if(create_input_node(graph, input_name, c, h, w) < 0) + if (create_input_node(graph, input_name, c, h, w) < 0) { fprintf(stderr, "create input failed\n"); return NULL; @@ -219,7 +219,7 @@ graph_t create_test_graph(int c, int h, int w, int out_c) // int out_c = 4; // k s p in_c out_c group - if(create_conv_node(graph, conv_name, input_name, 1, 1, 0, c, out_c, 1) < 0) + if (create_conv_node(graph, conv_name, input_name, 1, 1, 0, c, out_c, 1) < 0) { fprintf(stderr, "create conv node failed\n"); return NULL; @@ -243,13 +243,13 @@ graph_t create_test_graph(int c, int h, int w, int out_c) #endif - if(set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) + if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) { fprintf(stderr, "set inputs failed: ERRNO: %d\n", get_tengine_errno()); return NULL; } - if(set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) + if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) { fprintf(stderr, "set outputs failed: ERRNO: %d\n", get_tengine_errno()); return NULL; @@ -280,7 +280,7 @@ void fill_conv_node(node_t node) tensor_t bias = get_node_input_tensor(node, 2); - if(bias == NULL) + if (bias == NULL) return; get_tensor_shape(bias, dims, 1); @@ -302,13 +302,13 @@ void fill_graph_param(graph_t graph) { int node_num = get_graph_node_num(graph); - for(int i = 0; i < node_num; i++) + for (int i = 0; i < node_num; i++) { node_t node = get_graph_node_by_idx(graph, i); const char* node_op = get_node_op(node); - if(!strcmp(node_op, "Convolution")) + if (!strcmp(node_op, "Convolution")) { fill_conv_node(node); } @@ -329,8 +329,8 @@ int main(int argc, char* argv[]) init_tengine(); graph_t graph = create_test_graph(c, h, w, out_c); - - if(graph == NULL) + + if (graph == NULL) return 1; fill_graph_param(graph); @@ -344,7 +344,7 @@ int main(int argc, char* argv[]) int elem_num = 1; int elem_size = 4; - for(int i = 0; i < dim_num; i++) + for (int i = 0; i < dim_num; i++) elem_num *= dims[i]; void* input_buf = malloc(elem_num * elem_size); @@ -369,7 +369,7 @@ int main(int argc, char* argv[]) printf("output shape: ["); - for(int i = 0; i < dim_num; i++) + for (int i = 0; i < dim_num; i++) { elem_num *= dims[i]; printf(" %d", dims[i]); @@ -379,11 +379,11 @@ int main(int argc, char* argv[]) float* output = get_tensor_buffer(output_tensor); - for(int i = 0; i < elem_num; i++) + for (int i = 0; i < elem_num; i++) { int w = dims[3]; - if((i % w) == 0) + if ((i % w) == 0) printf("\n%d:\t", i); printf(" %f", output[i]); diff --git a/tests/op/test_op_prelu.c b/tests/op/test_op_prelu.c index 16f6ee3b9..dd31e4b1e 100644 --- a/tests/op/test_op_prelu.c +++ b/tests/op/test_op_prelu.c @@ -24,17 +24,20 @@ #include "test_op.h" - int create_test_prelu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ node_t test_node = create_graph_node(graph, node_name, "PReLU"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed. ERRNO: %d.\n", get_tengine_errno()); return -1; @@ -47,7 +50,7 @@ int create_test_prelu_node(graph_t graph, const char* input_name, const char* no int dims[4]; get_tensor_shape(input_tensor, dims, 4); - int slope_dims[1] = {dims[1]}; // channel num + int slope_dims[1] = {dims[1]}; // channel num set_tensor_shape(slope_tensor, slope_dims, 1); /* input tensors of test node */ @@ -78,7 +81,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_common_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_prelu_node); - if(NULL == graph) + if (NULL == graph) return -1; // set input data @@ -102,9 +105,9 @@ int main(int argc, char* argv[]) int cstep = output_tensor->dims[2] * output_tensor->dims[3]; ret = 0; - for (int i = 0; i< out_c; i++) + for (int i = 0; i < out_c; i++) { - float* output_data = (float *)output_tensor->data + i * cstep; + float* output_data = (float*)output_tensor->data + i * cstep; for (int j = 0; j < cstep; j++) { if (output_data[j] != result_value[i]) diff --git a/tests/op/test_op_relu.c b/tests/op/test_op_relu.c index fd8583023..730ab3260 100644 --- a/tests/op/test_op_relu.c +++ b/tests/op/test_op_relu.c @@ -24,10 +24,13 @@ #include "test_op.h" - int create_test_relu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ node_t test_node = create_graph_node(graph, node_name, "ReLU"); @@ -66,7 +69,6 @@ int create_test_relu_node(graph_t graph, const char* input_name, const char* nod return 0; } - int main(int argc, char* argv[]) { int n = 1, c = 3, h = 12, w = 12; @@ -81,7 +83,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_common_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_relu_node); - if(NULL == graph) + if (NULL == graph) return -1; // set input data @@ -98,7 +100,7 @@ int main(int argc, char* argv[]) // dump input node int input_node_count = get_graph_input_node_number(graph); - for(int i = 0; i < input_node_count; i++) + for (int i = 0; i < input_node_count; i++) { node_t input = get_graph_input_node(graph, i); dump_node_output(input, 0); @@ -106,7 +108,7 @@ int main(int argc, char* argv[]) // dump output node int output_node_count = get_graph_output_node_number(graph); - for(int i = 0; i < output_node_count; i++) + for (int i = 0; i < output_node_count; i++) { node_t output = get_graph_output_node(graph, i); dump_node_output(output, 0); diff --git a/tests/op/test_op_relu6.c b/tests/op/test_op_relu6.c index 1f772e97d..9315c6477 100644 --- a/tests/op/test_op_relu6.c +++ b/tests/op/test_op_relu6.c @@ -24,10 +24,13 @@ #include "test_op.h" - int create_test_relu6_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ node_t test_node = create_graph_node(graph, node_name, "ReLU6"); @@ -66,7 +69,6 @@ int create_test_relu6_node(graph_t graph, const char* input_name, const char* no return 0; } - int main(int argc, char* argv[]) { int n = 1, c = 3, h = 12, w = 12; @@ -81,7 +83,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_common_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_relu6_node); - if(NULL == graph) + if (NULL == graph) return -1; // set input data @@ -98,7 +100,7 @@ int main(int argc, char* argv[]) // dump input node int input_node_count = get_graph_input_node_number(graph); - for(int i = 0; i < input_node_count; i++) + for (int i = 0; i < input_node_count; i++) { node_t input = get_graph_input_node(graph, i); dump_node_output(input, 0); @@ -106,7 +108,7 @@ int main(int argc, char* argv[]) // dump output node int output_node_count = get_graph_output_node_number(graph); - for(int i = 0; i < output_node_count; i++) + for (int i = 0; i < output_node_count; i++) { node_t output = get_graph_output_node(graph, i); dump_node_output(output, 0); diff --git a/tests/op/test_tensorrt_op_clip.cpp b/tests/op/test_tensorrt_op_clip.cpp index d166d7dc4..fef74a3ee 100644 --- a/tests/op/test_tensorrt_op_clip.cpp +++ b/tests/op/test_tensorrt_op_clip.cpp @@ -1,119 +1,120 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - - -int create_test_clip_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Clip"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - return 0; -} - -float input_fp32[5] = {-3.0f, 3.0f, 8.0f, 1.0f, -2.0f}; - -float reference_out[5] = {0.0f, 3.0f, 6.0f, 1.0f, 0.0f}; - - -int main(int argc, char* argv[]) -{ - int n = 1, c = 1, h = 5, w = 1; - const char* test_node_name = "clip"; - int data_type = TENGINE_DT_FP32; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - graph_t graph = create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_clip_node); - if(NULL == graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_input_tensor(graph, 0, 0); - struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); - - // set input data - set_tensor_buffer(input_tensor, input_fp32, 5 * 4); - - // graph run - ret = test_graph_run(graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(graph); - return -1; - } - - // get output and dequant - float* output_data = ( float* )output_tensor->data; - int output_size = output_tensor->elem_num; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +int create_test_clip_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Clip"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + return 0; +} + +float input_fp32[5] = {-3.0f, 3.0f, 8.0f, 1.0f, -2.0f}; + +float reference_out[5] = {0.0f, 3.0f, 6.0f, 1.0f, 0.0f}; + +int main(int argc, char* argv[]) +{ + int n = 1, c = 1, h = 5, w = 1; + const char* test_node_name = "clip"; + int data_type = TENGINE_DT_FP32; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + graph_t graph = create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_clip_node); + if (NULL == graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_input_tensor(graph, 0, 0); + struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); + + // set input data + set_tensor_buffer(input_tensor, input_fp32, 5 * 4); + + // graph run + ret = test_graph_run(graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(graph); + return -1; + } + + // get output and dequant + float* output_data = (float*)output_tensor->data; + int output_size = output_tensor->elem_num; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(graph); + + return ret; +} diff --git a/tests/op/test_tensorrt_op_concat.cpp b/tests/op/test_tensorrt_op_concat.cpp index 42668a05f..185d620c6 100644 --- a/tests/op/test_tensorrt_op_concat.cpp +++ b/tests/op/test_tensorrt_op_concat.cpp @@ -1,150 +1,188 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/concat_param.h" - - -int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Concat"); - - tensor_t input0_tensor = get_graph_tensor(graph, input_name0); - - if(NULL == input0_tensor) - { - fprintf(stderr, "create test node input0 failed.\n"); - return -1; - } - - node_t input1_node = create_graph_node(graph, "input1", "InputOp"); - tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_FP32); - set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_INPUT); - int input1_dims[4] = {1, 1, 3, 3}; // channel num - set_tensor_shape(input1_tensor, input1_dims, 4); - - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input0_tensor); - set_node_input_tensor(test_node, 1, input1_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct concat_param* param = ( struct concat_param* )(struct node* )test_node->op.param_mem; - - param->axis = 1; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input0_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,}; - -float input1_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,}; - -float reference_out[18] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f, - 9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,}; - - -int main(int argc, char* argv[]) -{ - int n = 1, c = 1, h = 3, w = 3; - const char* test_node_name = "concat"; - int data_type = TENGINE_DT_FP32; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "concat"); - - // set input data - set_tensor_buffer(input0_tensor, input0_fp32, 9 * 4); - - // set input data - set_tensor_buffer(input1_tensor, input1_fp32, 9 * 4); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - float* output_data = ( float* )output_tensor->data; - int output_size = output_tensor->elem_num; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/concat_param.h" + +int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Concat"); + + tensor_t input0_tensor = get_graph_tensor(graph, input_name0); + + if (NULL == input0_tensor) + { + fprintf(stderr, "create test node input0 failed.\n"); + return -1; + } + + node_t input1_node = create_graph_node(graph, "input1", "InputOp"); + tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_FP32); + set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_INPUT); + int input1_dims[4] = {1, 1, 3, 3}; // channel num + set_tensor_shape(input1_tensor, input1_dims, 4); + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input0_tensor); + set_node_input_tensor(test_node, 1, input1_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct concat_param* param = (struct concat_param*)(struct node*)test_node->op.param_mem; + + param->axis = 1; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input0_fp32[9] = { + 3.0f, + 8.0f, + 1.0f, + 9.0f, + 5.0f, + 7.0f, + 3.0f, + 2.0f, + 3.0f, +}; + +float input1_fp32[9] = { + 9.0f, + 0.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 1.0f, + 0.0f, + 2.0f, +}; + +float reference_out[18] = { + 3.0f, + 8.0f, + 1.0f, + 9.0f, + 5.0f, + 7.0f, + 3.0f, + 2.0f, + 3.0f, + 9.0f, + 0.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 1.0f, + 0.0f, + 2.0f, +}; + +int main(int argc, char* argv[]) +{ + int n = 1, c = 1, h = 3, w = 3; + const char* test_node_name = "concat"; + int data_type = TENGINE_DT_FP32; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "concat"); + + // set input data + set_tensor_buffer(input0_tensor, input0_fp32, 9 * 4); + + // set input data + set_tensor_buffer(input1_tensor, input1_fp32, 9 * 4); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + float* output_data = (float*)output_tensor->data; + int output_size = output_tensor->elem_num; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_tensorrt_op_deconv.cpp b/tests/op/test_tensorrt_op_deconv.cpp index fa0b7576c..7db3cb1ed 100644 --- a/tests/op/test_tensorrt_op_deconv.cpp +++ b/tests/op/test_tensorrt_op_deconv.cpp @@ -1,223 +1,246 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/deconv_param.h" - - -int create_test_deconv_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Deconvolution"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */ - /* weight */ - node_t weight_node = create_graph_node(graph, "weight", "Const"); - tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_FP32); - set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST); - int weight_dims[4] = {1, 1, 3, 3}; // channel num - set_tensor_shape(weight_tensor, weight_dims, 4); - - /* bias */ - // node_t bias_node = create_graph_node(graph, "bias", "Const"); - // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32); - // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST); - // int bias_dims[1] = {1}; // channel num - // set_tensor_shape(bias_tensor, bias_dims, 1); - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - set_node_input_tensor(test_node, 1, weight_tensor); - // set_node_input_tensor(test_node, 2, bias_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct deconv_param* deconv_param = ( struct deconv_param* )(struct node* )test_node->op.param_mem; - - deconv_param->num_output = 1; - deconv_param->kernel_h = 3; - deconv_param->kernel_w = 3; - deconv_param->stride_h = 2; - deconv_param->stride_w = 2; - deconv_param->pad_h0 = 0; - deconv_param->pad_w0 = 0; - deconv_param->pad_h1 = 0; - deconv_param->pad_w1 = 0; - deconv_param->dilation_h = 1; - deconv_param->dilation_w = 1; - deconv_param->group = 1; - deconv_param->activation = -1; - deconv_param->output_pad_h0 = 0; - deconv_param->output_pad_w0 = 0; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,}; - -float weight_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,}; - -float reference_out[49] = {27.000000, - 0.000000, - 81.000000, - 0.000000, - 33.000000, - 0.000000, - 3.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 84.000000, - 0.000000, - 86.000000, - 0.000000, - 95.000000, - 0.000000, - 23.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 36.000000, - 0.000000, - 50.000000, - 0.000000, - 50.000000, - 0.000000, - 23.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 3.000000, - 0.000000, - 8.000000, - 0.000000, - 7.000000, - 0.000000, - 6.000000, }; - - -int main(int argc, char* argv[]) -{ - int n = 1, c = 1, h = 3, w = 3; - const char* test_node_name = "deconv"; - int data_type = TENGINE_DT_FP32; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_deconv_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "deconv"); - - // set input data - set_tensor_buffer(input_tensor, input_fp32, 9 * 4); - - // set weight data - set_tensor_buffer(weight_tensor, weight_fp32, 9 * 4); - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - float* output_data = ( float* )output_tensor->data; - int output_size = output_tensor->elem_num; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/deconv_param.h" + +int create_test_deconv_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Deconvolution"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */ + /* weight */ + node_t weight_node = create_graph_node(graph, "weight", "Const"); + tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_FP32); + set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST); + int weight_dims[4] = {1, 1, 3, 3}; // channel num + set_tensor_shape(weight_tensor, weight_dims, 4); + + /* bias */ + // node_t bias_node = create_graph_node(graph, "bias", "Const"); + // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32); + // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST); + // int bias_dims[1] = {1}; // channel num + // set_tensor_shape(bias_tensor, bias_dims, 1); + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + set_node_input_tensor(test_node, 1, weight_tensor); + // set_node_input_tensor(test_node, 2, bias_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct deconv_param* deconv_param = (struct deconv_param*)(struct node*)test_node->op.param_mem; + + deconv_param->num_output = 1; + deconv_param->kernel_h = 3; + deconv_param->kernel_w = 3; + deconv_param->stride_h = 2; + deconv_param->stride_w = 2; + deconv_param->pad_h0 = 0; + deconv_param->pad_w0 = 0; + deconv_param->pad_h1 = 0; + deconv_param->pad_w1 = 0; + deconv_param->dilation_h = 1; + deconv_param->dilation_w = 1; + deconv_param->group = 1; + deconv_param->activation = -1; + deconv_param->output_pad_h0 = 0; + deconv_param->output_pad_w0 = 0; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[9] = { + 3.0f, + 8.0f, + 1.0f, + 9.0f, + 5.0f, + 7.0f, + 3.0f, + 2.0f, + 3.0f, +}; + +float weight_fp32[9] = { + 9.0f, + 0.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 1.0f, + 0.0f, + 2.0f, +}; + +float reference_out[49] = { + 27.000000, + 0.000000, + 81.000000, + 0.000000, + 33.000000, + 0.000000, + 3.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 84.000000, + 0.000000, + 86.000000, + 0.000000, + 95.000000, + 0.000000, + 23.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 36.000000, + 0.000000, + 50.000000, + 0.000000, + 50.000000, + 0.000000, + 23.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 3.000000, + 0.000000, + 8.000000, + 0.000000, + 7.000000, + 0.000000, + 6.000000, +}; + +int main(int argc, char* argv[]) +{ + int n = 1, c = 1, h = 3, w = 3; + const char* test_node_name = "deconv"; + int data_type = TENGINE_DT_FP32; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_deconv_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "deconv"); + + // set input data + set_tensor_buffer(input_tensor, input_fp32, 9 * 4); + + // set weight data + set_tensor_buffer(weight_tensor, weight_fp32, 9 * 4); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + float* output_data = (float*)output_tensor->data; + int output_size = output_tensor->elem_num; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_tensorrt_op_dropout.cpp b/tests/op/test_tensorrt_op_dropout.cpp index d9c53d416..51453b2e7 100644 --- a/tests/op/test_tensorrt_op_dropout.cpp +++ b/tests/op/test_tensorrt_op_dropout.cpp @@ -1,133 +1,144 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" - - -int create_test_dropout_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Dropout"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[6] = {1.0f, 2.0f, 3.0f, - 4.0f, 5.0f, 6.0f, }; - -float reference_out[6] = {1.0f, 2.0f, 3.0f, - 4.0f, 5.0f, 6.0f, }; - - -int main(int argc, char* argv[]) -{ - int n = 1, c = 2, h = 1, w = 3; - const char* test_node_name = "dropout"; - int data_type = TENGINE_DT_FP32; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_dropout_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "dropout"); - - - // set input data - set_tensor_buffer(input_tensor, input_fp32, 6 * 4); - - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - float* output_data = ( float* )output_tensor->data; - int output_size = output_tensor->elem_num; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" + +int create_test_dropout_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Dropout"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[6] = { + 1.0f, + 2.0f, + 3.0f, + 4.0f, + 5.0f, + 6.0f, +}; + +float reference_out[6] = { + 1.0f, + 2.0f, + 3.0f, + 4.0f, + 5.0f, + 6.0f, +}; + +int main(int argc, char* argv[]) +{ + int n = 1, c = 2, h = 1, w = 3; + const char* test_node_name = "dropout"; + int data_type = TENGINE_DT_FP32; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_dropout_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "dropout"); + + // set input data + set_tensor_buffer(input_tensor, input_fp32, 6 * 4); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + float* output_data = (float*)output_tensor->data; + int output_size = output_tensor->elem_num; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_tensorrt_op_eltwise.cpp b/tests/op/test_tensorrt_op_eltwise.cpp index a59b99edd..3cf144c23 100644 --- a/tests/op/test_tensorrt_op_eltwise.cpp +++ b/tests/op/test_tensorrt_op_eltwise.cpp @@ -1,157 +1,186 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/eltwise_param.h" - - -int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Eltwise"); - - tensor_t input0_tensor = get_graph_tensor(graph, input_name0); - - if(NULL == input0_tensor) - { - fprintf(stderr, "create test node input0 failed.\n"); - return -1; - } - - node_t input1_node = create_graph_node(graph, "input1", "InputOp"); - tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_FP32); - set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_INPUT); - int input1_dims[4] = {1, 1, 3, 3}; // channel num - set_tensor_shape(input1_tensor, input1_dims, 4); - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input0_tensor); - set_node_input_tensor(test_node, 1, input1_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct eltwise_param* param = ( struct eltwise_param* )(struct node* )test_node->op.param_mem; - - param->type = 2; - param->caffe_flavor = 1; - param->shift = NULL; - param->power = NULL; - param->scale = NULL; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input0_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,}; - -float input1_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,}; - -float reference_out[9] = {12.0f, 8.0f, 4.0f, 9.0f, 5.0f, 7.0f, 4.0f, 2.0f, 5.0f,}; - - - -int main(int argc, char* argv[]) -{ - int n = 1, c = 1, h = 3, w = 3; - const char* test_node_name = "eltwise"; - int data_type = TENGINE_DT_FP32; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "eltwise"); - - - // set input data - set_tensor_buffer(input0_tensor, input0_fp32, 9 * 4); - - // set input data - set_tensor_buffer(input1_tensor, input1_fp32, 9 * 4); - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - float* output_data = ( float* )output_tensor->data; - int output_size = output_tensor->elem_num; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/eltwise_param.h" + +int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Eltwise"); + + tensor_t input0_tensor = get_graph_tensor(graph, input_name0); + + if (NULL == input0_tensor) + { + fprintf(stderr, "create test node input0 failed.\n"); + return -1; + } + + node_t input1_node = create_graph_node(graph, "input1", "InputOp"); + tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_FP32); + set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_INPUT); + int input1_dims[4] = {1, 1, 3, 3}; // channel num + set_tensor_shape(input1_tensor, input1_dims, 4); + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input0_tensor); + set_node_input_tensor(test_node, 1, input1_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct eltwise_param* param = (struct eltwise_param*)(struct node*)test_node->op.param_mem; + + param->type = 2; + param->caffe_flavor = 1; + param->shift = NULL; + param->power = NULL; + param->scale = NULL; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input0_fp32[9] = { + 3.0f, + 8.0f, + 1.0f, + 9.0f, + 5.0f, + 7.0f, + 3.0f, + 2.0f, + 3.0f, +}; + +float input1_fp32[9] = { + 9.0f, + 0.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 1.0f, + 0.0f, + 2.0f, +}; + +float reference_out[9] = { + 12.0f, + 8.0f, + 4.0f, + 9.0f, + 5.0f, + 7.0f, + 4.0f, + 2.0f, + 5.0f, +}; + +int main(int argc, char* argv[]) +{ + int n = 1, c = 1, h = 3, w = 3; + const char* test_node_name = "eltwise"; + int data_type = TENGINE_DT_FP32; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "eltwise"); + + // set input data + set_tensor_buffer(input0_tensor, input0_fp32, 9 * 4); + + // set input data + set_tensor_buffer(input1_tensor, input1_fp32, 9 * 4); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + float* output_data = (float*)output_tensor->data; + int output_size = output_tensor->elem_num; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_tensorrt_op_fc.cpp b/tests/op/test_tensorrt_op_fc.cpp index 6da5459e2..be4597531 100644 --- a/tests/op/test_tensorrt_op_fc.cpp +++ b/tests/op/test_tensorrt_op_fc.cpp @@ -1,152 +1,163 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/fc_param.h" - - -int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "FullyConnected"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */ - /* weight */ - node_t weight_node = create_graph_node(graph, "weight", "Const"); - tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_FP32); - set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST); - int weight_dims[2] = {1, 3}; // channel num - set_tensor_shape(weight_tensor, weight_dims, 2); - - /* bias */ - // node_t bias_node = create_graph_node(graph, "bias", "Const"); - // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32); - // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST); - // int bias_dims[1] = {1}; // channel num - // set_tensor_shape(bias_tensor, bias_dims, 1); - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - set_node_input_tensor(test_node, 1, weight_tensor); - // set_node_input_tensor(test_node, 2, bias_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct fc_param* param = ( struct fc_param* )(struct node* )test_node->op.param_mem; - - param->num_output = 1; - - return 0; -} - - -float input_fp32[3] = {3.0f, 8.0f, 1.0f,}; - -float weight_fp32[3] = {9.0f, 0.0f, 3.0f,}; - -float reference_out[1] = {30,}; - -int main(int argc, char* argv[]) -{ - int n = 1, c = 3, h = 1, w = 1; - const char* test_node_name = "conv"; - int data_type = TENGINE_DT_FP32; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "conv"); - - // set input data - set_tensor_buffer(input_tensor, input_fp32, 3 * 4); - - // set weight data - set_tensor_buffer(weight_tensor, weight_fp32, 3 * 4); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - float* output_data = ( float* )output_tensor->data; - int output_size = output_tensor->elem_num; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/fc_param.h" + +int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "FullyConnected"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */ + /* weight */ + node_t weight_node = create_graph_node(graph, "weight", "Const"); + tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_FP32); + set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST); + int weight_dims[2] = {1, 3}; // channel num + set_tensor_shape(weight_tensor, weight_dims, 2); + + /* bias */ + // node_t bias_node = create_graph_node(graph, "bias", "Const"); + // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32); + // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST); + // int bias_dims[1] = {1}; // channel num + // set_tensor_shape(bias_tensor, bias_dims, 1); + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + set_node_input_tensor(test_node, 1, weight_tensor); + // set_node_input_tensor(test_node, 2, bias_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct fc_param* param = (struct fc_param*)(struct node*)test_node->op.param_mem; + + param->num_output = 1; + + return 0; +} + +float input_fp32[3] = { + 3.0f, + 8.0f, + 1.0f, +}; + +float weight_fp32[3] = { + 9.0f, + 0.0f, + 3.0f, +}; + +float reference_out[1] = { + 30, +}; + +int main(int argc, char* argv[]) +{ + int n = 1, c = 3, h = 1, w = 1; + const char* test_node_name = "conv"; + int data_type = TENGINE_DT_FP32; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "conv"); + + // set input data + set_tensor_buffer(input_tensor, input_fp32, 3 * 4); + + // set weight data + set_tensor_buffer(weight_tensor, weight_fp32, 3 * 4); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + float* output_data = (float*)output_tensor->data; + int output_size = output_tensor->elem_num; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_clip.cpp b/tests/op/test_timvx_op_clip.cpp index c3ea778d2..4e82c2f71 100644 --- a/tests/op/test_timvx_op_clip.cpp +++ b/tests/op/test_timvx_op_clip.cpp @@ -22,20 +22,22 @@ * Author: qtang@openailab.com */ - #include "test_op.h" - int create_test_clip_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Clip"); + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Clip"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed.\n"); return -1; @@ -78,7 +80,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_clip_node); - if(NULL == graph) + if (NULL == graph) return -1; // set quantize params @@ -101,7 +103,7 @@ int main(int argc, char* argv[]) // get output and dequant struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; + uint8_t* output_u8 = (uint8_t*)output_tensor->data; int output_size = output_tensor->elem_num; int out_c = output_tensor->dims[1]; int cstep = output_tensor->dims[2] * output_tensor->dims[3]; @@ -109,13 +111,13 @@ int main(int argc, char* argv[]) get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); std::vector output_data(output_size); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // check the result ret = 0; - for (int i = 0; i< out_c; i++) + for (int i = 0; i < out_c; i++) { - float* output_value = (float *)output_data.data() + i * cstep; + float* output_value = (float*)output_data.data() + i * cstep; for (int j = 0; j < cstep; j++) { if (fabsf(output_value[j] - reference_out[i]) > 0.01f) diff --git a/tests/op/test_timvx_op_concat.cpp b/tests/op/test_timvx_op_concat.cpp index 05e9e7a67..2a9eeebde 100644 --- a/tests/op/test_timvx_op_concat.cpp +++ b/tests/op/test_timvx_op_concat.cpp @@ -1,187 +1,225 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/concat_param.h" - - -int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Concat"); - - tensor_t input0_tensor = get_graph_tensor(graph, input_name0); - - if(NULL == input0_tensor) - { - fprintf(stderr, "create test node input0 failed.\n"); - return -1; - } - - node_t input1_node = create_graph_node(graph, "input1", "Const"); - tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_UINT8); - set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_CONST); - int input1_dims[4] = {1, 1, 3, 3}; // channel num - set_tensor_shape(input1_tensor, input1_dims, 4); - - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input0_tensor); - set_node_input_tensor(test_node, 1, input1_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct concat_param* param = ( struct concat_param* )(struct node* )test_node->op.param_mem; - - param->axis = 1; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input0_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,}; -float input0_scale = 1; -int input0_zero_point = 0; - -float input1_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,}; -float input1_scale = 1; -int input1_zero_point = 0; - -float reference_out[18] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f, - 9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,}; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 1, h = 3, w = 3; - const char* test_node_name = "concat"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "concat"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input0_tensor, &input0_scale, &input0_zero_point, 1); - set_tensor_quant_param(input1_tensor, &input1_scale, &input1_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input0_u8[9] = {0}; - get_uint8_data(input0_fp32, input0_u8, 9, input0_scale, input0_zero_point); - set_tensor_buffer(input0_tensor, input0_u8, 9); - - // set input data - uint8_t input1_u8[9] = {0}; - get_uint8_data(input1_fp32, input1_u8, 9, input1_scale, input1_zero_point); - set_tensor_buffer(input1_tensor, input1_u8, 9); - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/concat_param.h" + +int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Concat"); + + tensor_t input0_tensor = get_graph_tensor(graph, input_name0); + + if (NULL == input0_tensor) + { + fprintf(stderr, "create test node input0 failed.\n"); + return -1; + } + + node_t input1_node = create_graph_node(graph, "input1", "Const"); + tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_UINT8); + set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_CONST); + int input1_dims[4] = {1, 1, 3, 3}; // channel num + set_tensor_shape(input1_tensor, input1_dims, 4); + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input0_tensor); + set_node_input_tensor(test_node, 1, input1_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct concat_param* param = (struct concat_param*)(struct node*)test_node->op.param_mem; + + param->axis = 1; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input0_fp32[9] = { + 3.0f, + 8.0f, + 1.0f, + 9.0f, + 5.0f, + 7.0f, + 3.0f, + 2.0f, + 3.0f, +}; +float input0_scale = 1; +int input0_zero_point = 0; + +float input1_fp32[9] = { + 9.0f, + 0.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 1.0f, + 0.0f, + 2.0f, +}; +float input1_scale = 1; +int input1_zero_point = 0; + +float reference_out[18] = { + 3.0f, + 8.0f, + 1.0f, + 9.0f, + 5.0f, + 7.0f, + 3.0f, + 2.0f, + 3.0f, + 9.0f, + 0.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 1.0f, + 0.0f, + 2.0f, +}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 1, h = 3, w = 3; + const char* test_node_name = "concat"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "concat"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input0_tensor, &input0_scale, &input0_zero_point, 1); + set_tensor_quant_param(input1_tensor, &input1_scale, &input1_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input0_u8[9] = {0}; + get_uint8_data(input0_fp32, input0_u8, 9, input0_scale, input0_zero_point); + set_tensor_buffer(input0_tensor, input0_u8, 9); + + // set input data + uint8_t input1_u8[9] = {0}; + get_uint8_data(input1_fp32, input1_u8, 9, input1_scale, input1_zero_point); + set_tensor_buffer(input1_tensor, input1_u8, 9); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_convolution.cpp b/tests/op/test_timvx_op_convolution.cpp index d5cd6f86b..da10249ab 100644 --- a/tests/op/test_timvx_op_convolution.cpp +++ b/tests/op/test_timvx_op_convolution.cpp @@ -22,7 +22,6 @@ * Author: qtang@openailab.com */ - #include "test_op.h" #include "graph/graph.h" @@ -30,17 +29,20 @@ #include "graph/tensor.h" #include "operator/prototype/convolution_param.h" - int create_test_convolution_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Convolution"); + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Convolution"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed.\n"); return -1; @@ -51,7 +53,7 @@ int create_test_convolution_node(graph_t graph, const char* input_name, const ch node_t weight_node = create_graph_node(graph, "weight", "Const"); tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_UINT8); set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST); - int weight_dims[4] = {1, 1, 3, 3}; // channel num + int weight_dims[4] = {1, 1, 3, 3}; // channel num set_tensor_shape(weight_tensor, weight_dims, 4); /* bias */ @@ -59,7 +61,7 @@ int create_test_convolution_node(graph_t graph, const char* input_name, const ch // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32); // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST); // int bias_dims[1] = {1}; // channel num - // set_tensor_shape(bias_tensor, bias_dims, 1); + // set_tensor_shape(bias_tensor, bias_dims, 1); /* input tensors of test node */ set_node_input_tensor(test_node, 0, input_tensor); @@ -71,7 +73,7 @@ int create_test_convolution_node(graph_t graph, const char* input_name, const ch set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); /* set params */ - struct conv_param* conv_param = ( struct conv_param* )(struct node* )test_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)(struct node*)test_node->op.param_mem; conv_param->kernel_h = 3; conv_param->kernel_w = 3; @@ -98,8 +100,8 @@ int create_test_convolution_node(graph_t graph, const char* input_name, const ch * float32 = (uint8 - zero_point) * scale */ float input_fp32[9] = {-3, -2, 1, - 1, 0, 2, - 1, 1, 1}; + 1, 0, 2, + 1, 1, 1}; float input_scale = 0.0196078f; int input_zero_point = 153; @@ -110,12 +112,11 @@ float weight_scale = 0.0039216f; int weight_zero_point = 0; float reference_out[9] = {-4, -1, 1, - -2, 2, 3, - 3, 6, 4}; + -2, 2, 3, + 3, 6, 4}; float output_scale = 0.03921568f; int output_zero_point = 102; - void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) { for (int i = 0; i < size; i++) @@ -143,8 +144,8 @@ int main(int argc, char* argv[]) fprintf(stderr, "Tengine init failed.\n"); // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_convolution_node); - if(NULL == ir_graph) + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_convolution_node); + if (NULL == ir_graph) return -1; set_log_level(LOG_INFO); @@ -155,7 +156,7 @@ int main(int argc, char* argv[]) struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight"); struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "conv"); -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); set_tensor_quant_param(weight_tensor, &weight_scale, &weight_zero_point, 1); set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); @@ -183,17 +184,17 @@ int main(int argc, char* argv[]) } // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; + uint8_t* output_u8 = (uint8_t*)output_tensor->data; int output_size = output_tensor->elem_num; get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); + float* output_data = (float*)malloc(output_size * sizeof(float)); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // check the result ret = 0; - for (int i = 0; i< output_size; i++) + for (int i = 0; i < output_size; i++) { if (fabsf(output_data[i] - reference_out[i]) > 0.1) { diff --git a/tests/op/test_timvx_op_deconv.cpp b/tests/op/test_timvx_op_deconv.cpp index 15f6cdfd5..2013f37ea 100644 --- a/tests/op/test_timvx_op_deconv.cpp +++ b/tests/op/test_timvx_op_deconv.cpp @@ -1,257 +1,280 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/deconv_param.h" - - -int create_test_deconv_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Deconvolution"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */ - /* weight */ - node_t weight_node = create_graph_node(graph, "weight", "Const"); - tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_UINT8); - set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST); - int weight_dims[4] = {1, 1, 3, 3}; // channel num - set_tensor_shape(weight_tensor, weight_dims, 4); - - /* bias */ - // node_t bias_node = create_graph_node(graph, "bias", "Const"); - // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32); - // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST); - // int bias_dims[1] = {1}; // channel num - // set_tensor_shape(bias_tensor, bias_dims, 1); - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - set_node_input_tensor(test_node, 1, weight_tensor); - // set_node_input_tensor(test_node, 2, bias_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct deconv_param* deconv_param = ( struct deconv_param* )(struct node* )test_node->op.param_mem; - - deconv_param->num_output = 1; - deconv_param->kernel_h = 3; - deconv_param->kernel_w = 3; - deconv_param->stride_h = 2; - deconv_param->stride_w = 2; - deconv_param->pad_h0 = 0; - deconv_param->pad_w0 = 0; - deconv_param->pad_h1 = 0; - deconv_param->pad_w1 = 0; - deconv_param->dilation_h = 1; - deconv_param->dilation_w = 1; - deconv_param->group = 1; - deconv_param->activation = -1; - deconv_param->output_pad_h0 = 0; - deconv_param->output_pad_w0 = 0; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,}; -float input_scale = 1; -int input_zero_point = 0; - -float weight_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,}; -float weight_scale = 1; -int weight_zero_point = 0; - -float reference_out[49] = {27.000000, - 0.000000, - 81.000000, - 0.000000, - 33.000000, - 0.000000, - 3.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 84.000000, - 0.000000, - 86.000000, - 0.000000, - 95.000000, - 0.000000, - 23.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 36.000000, - 0.000000, - 50.000000, - 0.000000, - 50.000000, - 0.000000, - 23.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 3.000000, - 0.000000, - 8.000000, - 0.000000, - 7.000000, - 0.000000, - 6.000000, }; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 1, h = 3, w = 3; - const char* test_node_name = "deconv"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_deconv_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "deconv"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - set_tensor_quant_param(weight_tensor, &weight_scale, &weight_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input_u8[9] = {0}; - get_uint8_data(input_fp32, input_u8, 9, input_scale, input_zero_point); - set_tensor_buffer(input_tensor, input_u8, 9); - - // set weight data - uint8_t weight_u8[9] = {0}; - get_uint8_data(weight_fp32, weight_u8, 9, weight_scale, weight_zero_point); - set_tensor_buffer(weight_tensor, weight_u8, 9); - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/deconv_param.h" + +int create_test_deconv_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Deconvolution"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */ + /* weight */ + node_t weight_node = create_graph_node(graph, "weight", "Const"); + tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_UINT8); + set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST); + int weight_dims[4] = {1, 1, 3, 3}; // channel num + set_tensor_shape(weight_tensor, weight_dims, 4); + + /* bias */ + // node_t bias_node = create_graph_node(graph, "bias", "Const"); + // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32); + // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST); + // int bias_dims[1] = {1}; // channel num + // set_tensor_shape(bias_tensor, bias_dims, 1); + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + set_node_input_tensor(test_node, 1, weight_tensor); + // set_node_input_tensor(test_node, 2, bias_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct deconv_param* deconv_param = (struct deconv_param*)(struct node*)test_node->op.param_mem; + + deconv_param->num_output = 1; + deconv_param->kernel_h = 3; + deconv_param->kernel_w = 3; + deconv_param->stride_h = 2; + deconv_param->stride_w = 2; + deconv_param->pad_h0 = 0; + deconv_param->pad_w0 = 0; + deconv_param->pad_h1 = 0; + deconv_param->pad_w1 = 0; + deconv_param->dilation_h = 1; + deconv_param->dilation_w = 1; + deconv_param->group = 1; + deconv_param->activation = -1; + deconv_param->output_pad_h0 = 0; + deconv_param->output_pad_w0 = 0; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[9] = { + 3.0f, + 8.0f, + 1.0f, + 9.0f, + 5.0f, + 7.0f, + 3.0f, + 2.0f, + 3.0f, +}; +float input_scale = 1; +int input_zero_point = 0; + +float weight_fp32[9] = { + 9.0f, + 0.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 1.0f, + 0.0f, + 2.0f, +}; +float weight_scale = 1; +int weight_zero_point = 0; + +float reference_out[49] = { + 27.000000, + 0.000000, + 81.000000, + 0.000000, + 33.000000, + 0.000000, + 3.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 84.000000, + 0.000000, + 86.000000, + 0.000000, + 95.000000, + 0.000000, + 23.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 36.000000, + 0.000000, + 50.000000, + 0.000000, + 50.000000, + 0.000000, + 23.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 3.000000, + 0.000000, + 8.000000, + 0.000000, + 7.000000, + 0.000000, + 6.000000, +}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 1, h = 3, w = 3; + const char* test_node_name = "deconv"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_deconv_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "deconv"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + set_tensor_quant_param(weight_tensor, &weight_scale, &weight_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input_u8[9] = {0}; + get_uint8_data(input_fp32, input_u8, 9, input_scale, input_zero_point); + set_tensor_buffer(input_tensor, input_u8, 9); + + // set weight data + uint8_t weight_u8[9] = {0}; + get_uint8_data(weight_fp32, weight_u8, 9, weight_scale, weight_zero_point); + set_tensor_buffer(weight_tensor, weight_u8, 9); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_dropout.cpp b/tests/op/test_timvx_op_dropout.cpp index ac991bf6b..85f7e29ad 100644 --- a/tests/op/test_timvx_op_dropout.cpp +++ b/tests/op/test_timvx_op_dropout.cpp @@ -22,20 +22,22 @@ * Author: qtang@openailab.com */ - #include "test_op.h" - int create_test_dropout_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ node_t test_node = create_graph_node(graph, node_name, "Dropout"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed.\n"); return -1; @@ -78,7 +80,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_dropout_node); - if(NULL == graph) + if (NULL == graph) return -1; // set quantize params @@ -101,21 +103,21 @@ int main(int argc, char* argv[]) // get output and dequant struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; + uint8_t* output_u8 = (uint8_t*)output_tensor->data; int output_size = output_tensor->elem_num; int out_c = output_tensor->dims[1]; int cstep = output_tensor->dims[2] * output_tensor->dims[3]; get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); + float* output_data = (float*)malloc(output_size * sizeof(float)); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // check the result ret = 0; - for (int i = 0; i< out_c; i++) + for (int i = 0; i < out_c; i++) { - float* output_value = (float *)output_data + i * cstep; + float* output_value = (float*)output_data + i * cstep; for (int j = 0; j < cstep; j++) { if (fabsf(output_value[j] - reference_out[i]) > 0.01) diff --git a/tests/op/test_timvx_op_eltwise_mul.cpp b/tests/op/test_timvx_op_eltwise_mul.cpp index 8f464d3b6..a2af5a610 100644 --- a/tests/op/test_timvx_op_eltwise_mul.cpp +++ b/tests/op/test_timvx_op_eltwise_mul.cpp @@ -1,189 +1,220 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/eltwise_param.h" - - -int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Eltwise"); - - tensor_t input0_tensor = get_graph_tensor(graph, input_name0); - - if(NULL == input0_tensor) - { - fprintf(stderr, "create test node input0 failed.\n"); - return -1; - } - - node_t input1_node = create_graph_node(graph, "input1", "Const"); - tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_UINT8); - set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_CONST); - int input1_dims[4] = {1, 1, 3, 3}; // channel num - set_tensor_shape(input1_tensor, input1_dims, 4); - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input0_tensor); - set_node_input_tensor(test_node, 1, input1_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct eltwise_param* param = ( struct eltwise_param* )(struct node* )test_node->op.param_mem; - - param->type = 0; - param->caffe_flavor = 1; - param->shift = NULL; - param->power = NULL; - param->scale = NULL; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input0_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,}; -float input0_scale = 1; -int input0_zero_point = 0; - -float input1_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,}; -float input1_scale = 1; -int input1_zero_point = 0; - -float reference_out[9] = {27.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 3.0f, 0.0f, 6.0f,}; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 1, h = 3, w = 3; - const char* test_node_name = "eltwise"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "eltwise"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input0_tensor, &input0_scale, &input0_zero_point, 1); - set_tensor_quant_param(input1_tensor, &input1_scale, &input1_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input0_u8[9] = {0}; - get_uint8_data(input0_fp32, input0_u8, 9, input0_scale, input0_zero_point); - set_tensor_buffer(input0_tensor, input0_u8, 9); - - // set input data - uint8_t input1_u8[9] = {0}; - get_uint8_data(input1_fp32, input1_u8, 9, input1_scale, input1_zero_point); - set_tensor_buffer(input1_tensor, input1_u8, 9); - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/eltwise_param.h" + +int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Eltwise"); + + tensor_t input0_tensor = get_graph_tensor(graph, input_name0); + + if (NULL == input0_tensor) + { + fprintf(stderr, "create test node input0 failed.\n"); + return -1; + } + + node_t input1_node = create_graph_node(graph, "input1", "Const"); + tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_UINT8); + set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_CONST); + int input1_dims[4] = {1, 1, 3, 3}; // channel num + set_tensor_shape(input1_tensor, input1_dims, 4); + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input0_tensor); + set_node_input_tensor(test_node, 1, input1_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct eltwise_param* param = (struct eltwise_param*)(struct node*)test_node->op.param_mem; + + param->type = 0; + param->caffe_flavor = 1; + param->shift = NULL; + param->power = NULL; + param->scale = NULL; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input0_fp32[9] = { + 3.0f, + 8.0f, + 1.0f, + 9.0f, + 5.0f, + 7.0f, + 3.0f, + 2.0f, + 3.0f, +}; +float input0_scale = 1; +int input0_zero_point = 0; + +float input1_fp32[9] = { + 9.0f, + 0.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 1.0f, + 0.0f, + 2.0f, +}; +float input1_scale = 1; +int input1_zero_point = 0; + +float reference_out[9] = { + 27.0f, + 0.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 3.0f, + 0.0f, + 6.0f, +}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 1, h = 3, w = 3; + const char* test_node_name = "eltwise"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "eltwise"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input0_tensor, &input0_scale, &input0_zero_point, 1); + set_tensor_quant_param(input1_tensor, &input1_scale, &input1_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input0_u8[9] = {0}; + get_uint8_data(input0_fp32, input0_u8, 9, input0_scale, input0_zero_point); + set_tensor_buffer(input0_tensor, input0_u8, 9); + + // set input data + uint8_t input1_u8[9] = {0}; + get_uint8_data(input1_fp32, input1_u8, 9, input1_scale, input1_zero_point); + set_tensor_buffer(input1_tensor, input1_u8, 9); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_eltwise_sum.cpp b/tests/op/test_timvx_op_eltwise_sum.cpp index 511aebb27..fb38ca449 100644 --- a/tests/op/test_timvx_op_eltwise_sum.cpp +++ b/tests/op/test_timvx_op_eltwise_sum.cpp @@ -1,189 +1,220 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/eltwise_param.h" - - -int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Eltwise"); - - tensor_t input0_tensor = get_graph_tensor(graph, input_name0); - - if(NULL == input0_tensor) - { - fprintf(stderr, "create test node input0 failed.\n"); - return -1; - } - - node_t input1_node = create_graph_node(graph, "input1", "Const"); - tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_UINT8); - set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_CONST); - int input1_dims[4] = {1, 1, 3, 3}; // channel num - set_tensor_shape(input1_tensor, input1_dims, 4); - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input0_tensor); - set_node_input_tensor(test_node, 1, input1_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct eltwise_param* param = ( struct eltwise_param* )(struct node* )test_node->op.param_mem; - - param->type = 2; - param->caffe_flavor = 1; - param->shift = NULL; - param->power = NULL; - param->scale = NULL; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input0_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,}; -float input0_scale = 1; -int input0_zero_point = 0; - -float input1_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,}; -float input1_scale = 1; -int input1_zero_point = 0; - -float reference_out[9] = {12.0f, 8.0f, 4.0f, 9.0f, 5.0f, 7.0f, 4.0f, 2.0f, 5.0f,}; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 1, h = 3, w = 3; - const char* test_node_name = "eltwise"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "eltwise"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input0_tensor, &input0_scale, &input0_zero_point, 1); - set_tensor_quant_param(input1_tensor, &input1_scale, &input1_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input0_u8[9] = {0}; - get_uint8_data(input0_fp32, input0_u8, 9, input0_scale, input0_zero_point); - set_tensor_buffer(input0_tensor, input0_u8, 9); - - // set input data - uint8_t input1_u8[9] = {0}; - get_uint8_data(input1_fp32, input1_u8, 9, input1_scale, input1_zero_point); - set_tensor_buffer(input1_tensor, input1_u8, 9); - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/eltwise_param.h" + +int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Eltwise"); + + tensor_t input0_tensor = get_graph_tensor(graph, input_name0); + + if (NULL == input0_tensor) + { + fprintf(stderr, "create test node input0 failed.\n"); + return -1; + } + + node_t input1_node = create_graph_node(graph, "input1", "Const"); + tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_UINT8); + set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_CONST); + int input1_dims[4] = {1, 1, 3, 3}; // channel num + set_tensor_shape(input1_tensor, input1_dims, 4); + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input0_tensor); + set_node_input_tensor(test_node, 1, input1_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct eltwise_param* param = (struct eltwise_param*)(struct node*)test_node->op.param_mem; + + param->type = 2; + param->caffe_flavor = 1; + param->shift = NULL; + param->power = NULL; + param->scale = NULL; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input0_fp32[9] = { + 3.0f, + 8.0f, + 1.0f, + 9.0f, + 5.0f, + 7.0f, + 3.0f, + 2.0f, + 3.0f, +}; +float input0_scale = 1; +int input0_zero_point = 0; + +float input1_fp32[9] = { + 9.0f, + 0.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 1.0f, + 0.0f, + 2.0f, +}; +float input1_scale = 1; +int input1_zero_point = 0; + +float reference_out[9] = { + 12.0f, + 8.0f, + 4.0f, + 9.0f, + 5.0f, + 7.0f, + 4.0f, + 2.0f, + 5.0f, +}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 1, h = 3, w = 3; + const char* test_node_name = "eltwise"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "eltwise"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input0_tensor, &input0_scale, &input0_zero_point, 1); + set_tensor_quant_param(input1_tensor, &input1_scale, &input1_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input0_u8[9] = {0}; + get_uint8_data(input0_fp32, input0_u8, 9, input0_scale, input0_zero_point); + set_tensor_buffer(input0_tensor, input0_u8, 9); + + // set input data + uint8_t input1_u8[9] = {0}; + get_uint8_data(input1_fp32, input1_u8, 9, input1_scale, input1_zero_point); + set_tensor_buffer(input1_tensor, input1_u8, 9); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_elu.cpp b/tests/op/test_timvx_op_elu.cpp index f421fdc53..39f82fac2 100644 --- a/tests/op/test_timvx_op_elu.cpp +++ b/tests/op/test_timvx_op_elu.cpp @@ -22,20 +22,22 @@ * Author: qtang@openailab.com */ - #include "test_op.h" - int create_test_elu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Elu"); + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Elu"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed.\n"); return -1; @@ -78,7 +80,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_elu_node); - if(NULL == graph) + if (NULL == graph) return -1; // set quantize params @@ -101,7 +103,7 @@ int main(int argc, char* argv[]) // get output and dequant struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; + uint8_t* output_u8 = (uint8_t*)output_tensor->data; int output_size = output_tensor->elem_num; int out_c = output_tensor->dims[1]; int cstep = output_tensor->dims[2] * output_tensor->dims[3]; @@ -109,13 +111,13 @@ int main(int argc, char* argv[]) get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); std::vector output_data(output_size); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // check the result ret = 0; - for (int i = 0; i< out_c; i++) + for (int i = 0; i < out_c; i++) { - float* output_value = (float *)output_data.data() + i * cstep; + float* output_value = (float*)output_data.data() + i * cstep; for (int j = 0; j < cstep; j++) { if (fabsf(output_value[j] - reference_out[i]) > 0.05f) diff --git a/tests/op/test_timvx_op_fc.cpp b/tests/op/test_timvx_op_fc.cpp index 0cb721c8c..080fd0af8 100644 --- a/tests/op/test_timvx_op_fc.cpp +++ b/tests/op/test_timvx_op_fc.cpp @@ -1,195 +1,206 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/fc_param.h" - - -int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "FullyConnected"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */ - /* weight */ - node_t weight_node = create_graph_node(graph, "weight", "Const"); - tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_UINT8); - set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST); - int weight_dims[2] = {1, 3}; // channel num - set_tensor_shape(weight_tensor, weight_dims, 2); - - /* bias */ - // node_t bias_node = create_graph_node(graph, "bias", "Const"); - // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32); - // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST); - // int bias_dims[1] = {1}; // channel num - // set_tensor_shape(bias_tensor, bias_dims, 1); - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - set_node_input_tensor(test_node, 1, weight_tensor); - // set_node_input_tensor(test_node, 2, bias_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct fc_param* param = ( struct fc_param* )(struct node* )test_node->op.param_mem; - - param->num_output = 1; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[3] = {3.0f, 8.0f, 1.0f,}; -float input_scale = 1; -int input_zero_point = 0; - -float weight_fp32[3] = {9.0f, 0.0f, 3.0f,}; -float weight_scale = 1; -int weight_zero_point = 0; - -float reference_out[1] = {30,}; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 3, h = 1, w = 1; - const char* test_node_name = "conv"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "conv"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - set_tensor_quant_param(weight_tensor, &weight_scale, &weight_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input_u8[3] = {0}; - get_uint8_data(input_fp32, input_u8, 3, input_scale, input_zero_point); - set_tensor_buffer(input_tensor, input_u8, 3); - - // set weight data - uint8_t weight_u8[3] = {0}; - get_uint8_data(weight_fp32, weight_u8, 3, weight_scale, weight_zero_point); - set_tensor_buffer(weight_tensor, weight_u8, 3); - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/fc_param.h" + +int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "FullyConnected"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */ + /* weight */ + node_t weight_node = create_graph_node(graph, "weight", "Const"); + tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_UINT8); + set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST); + int weight_dims[2] = {1, 3}; // channel num + set_tensor_shape(weight_tensor, weight_dims, 2); + + /* bias */ + // node_t bias_node = create_graph_node(graph, "bias", "Const"); + // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32); + // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST); + // int bias_dims[1] = {1}; // channel num + // set_tensor_shape(bias_tensor, bias_dims, 1); + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + set_node_input_tensor(test_node, 1, weight_tensor); + // set_node_input_tensor(test_node, 2, bias_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct fc_param* param = (struct fc_param*)(struct node*)test_node->op.param_mem; + + param->num_output = 1; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[3] = { + 3.0f, + 8.0f, + 1.0f, +}; +float input_scale = 1; +int input_zero_point = 0; + +float weight_fp32[3] = { + 9.0f, + 0.0f, + 3.0f, +}; +float weight_scale = 1; +int weight_zero_point = 0; + +float reference_out[1] = { + 30, +}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 3, h = 1, w = 1; + const char* test_node_name = "conv"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "conv"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + set_tensor_quant_param(weight_tensor, &weight_scale, &weight_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input_u8[3] = {0}; + get_uint8_data(input_fp32, input_u8, 3, input_scale, input_zero_point); + set_tensor_buffer(input_tensor, input_u8, 3); + + // set weight data + uint8_t weight_u8[3] = {0}; + get_uint8_data(weight_fp32, weight_u8, 3, weight_scale, weight_zero_point); + set_tensor_buffer(weight_tensor, weight_u8, 3); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_flatten.cpp b/tests/op/test_timvx_op_flatten.cpp index 0f960b8e1..fc0ff8e64 100644 --- a/tests/op/test_timvx_op_flatten.cpp +++ b/tests/op/test_timvx_op_flatten.cpp @@ -1,169 +1,177 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/flatten_param.h" - - -int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Flatten"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct flatten_param* param = ( struct flatten_param* )(struct node* )test_node->op.param_mem; - - param->axis = 1; - param->end_axis = 3; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[3] = {3.0f, 8.0f, 1.0f,}; -float input_scale = 1; -int input_zero_point = 0; - -float reference_out[3] = {3.0f, 8.0f, 1.0f,}; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 3, h = 1, w = 1; - const char* test_node_name = "flatten"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "flatten"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input_u8[3] = {0}; - get_uint8_data(input_fp32, input_u8, 3, input_scale, input_zero_point); - set_tensor_buffer(input_tensor, input_u8, 3); - - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/flatten_param.h" + +int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Flatten"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct flatten_param* param = (struct flatten_param*)(struct node*)test_node->op.param_mem; + + param->axis = 1; + param->end_axis = 3; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[3] = { + 3.0f, + 8.0f, + 1.0f, +}; +float input_scale = 1; +int input_zero_point = 0; + +float reference_out[3] = { + 3.0f, + 8.0f, + 1.0f, +}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 3, h = 1, w = 1; + const char* test_node_name = "flatten"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "flatten"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input_u8[3] = {0}; + get_uint8_data(input_fp32, input_u8, 3, input_scale, input_zero_point); + set_tensor_buffer(input_tensor, input_u8, 3); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_gather.cpp b/tests/op/test_timvx_op_gather.cpp index cce32b477..c4c54690d 100644 --- a/tests/op/test_timvx_op_gather.cpp +++ b/tests/op/test_timvx_op_gather.cpp @@ -1,174 +1,182 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/gather_param.h" - - -int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Gather"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct gather_param* param = ( struct gather_param* )(struct node* )test_node->op.param_mem; - - param->axis = 1; - param->is_onnx = 1; - param->indices_num = 2; - - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[6] = {3.0f, 7.0f, - 2.0f, 1.0f, - 4.0f, 6.0f,}; -float input_scale = 1; -int input_zero_point = 0; - -float reference_out[4] = {3.0f, 7.0f, - 2.0f, 1.0f,}; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 3, h = 2, w = 1; - const char* test_node_name = "gather"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "gather"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input_u8[6] = {0}; - get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point); - set_tensor_buffer(input_tensor, input_u8, 6); - - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/gather_param.h" + +int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Gather"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct gather_param* param = (struct gather_param*)(struct node*)test_node->op.param_mem; + + param->axis = 1; + param->is_onnx = 1; + param->indices_num = 2; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[6] = { + 3.0f, + 7.0f, + 2.0f, + 1.0f, + 4.0f, + 6.0f, +}; +float input_scale = 1; +int input_zero_point = 0; + +float reference_out[4] = { + 3.0f, + 7.0f, + 2.0f, + 1.0f, +}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 3, h = 2, w = 1; + const char* test_node_name = "gather"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "gather"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input_u8[6] = {0}; + get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point); + set_tensor_buffer(input_tensor, input_u8, 6); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_hardswish.cpp b/tests/op/test_timvx_op_hardswish.cpp index ae0f95fe1..e3df15ed6 100644 --- a/tests/op/test_timvx_op_hardswish.cpp +++ b/tests/op/test_timvx_op_hardswish.cpp @@ -22,20 +22,22 @@ * Author: qtang@openailab.com */ - #include "test_op.h" - int create_test_hardswish_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Hardswish"); + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Hardswish"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed.\n"); return -1; @@ -78,7 +80,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_hardswish_node); - if(NULL == graph) + if (NULL == graph) return -1; // set quantize params @@ -101,7 +103,7 @@ int main(int argc, char* argv[]) // get output and dequant struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; + uint8_t* output_u8 = (uint8_t*)output_tensor->data; int output_size = output_tensor->elem_num; int out_c = output_tensor->dims[1]; int cstep = output_tensor->dims[2] * output_tensor->dims[3]; @@ -109,13 +111,13 @@ int main(int argc, char* argv[]) get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); std::vector output_data(output_size); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // check the result ret = 0; - for (int i = 0; i< out_c; i++) + for (int i = 0; i < out_c; i++) { - float* output_value = (float *)output_data.data() + i * cstep; + float* output_value = (float*)output_data.data() + i * cstep; for (int j = 0; j < cstep; j++) { if (fabsf(output_value[j] - reference_out[i]) > 0.01f) diff --git a/tests/op/test_timvx_op_interp.cpp b/tests/op/test_timvx_op_interp.cpp index 625fe8b57..3a226fb10 100644 --- a/tests/op/test_timvx_op_interp.cpp +++ b/tests/op/test_timvx_op_interp.cpp @@ -1,175 +1,189 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/interp_param.h" - - -int create_test_interp_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Interp"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct interp_param* param = ( struct interp_param* )(struct node* )test_node->op.param_mem; - - param->resize_type = 1; - param->output_height = 2; - param->output_width = 2; - param->height_scale = 0.5; - param->width_scale = 0.5; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[16] = {1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 2.0f, 2.0f, 1.0f, - 1.0f, 2.0f, 2.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, }; -float input_scale = 1; -int input_zero_point = 0; - -float reference_out[4] = {1, 1, 1, 2}; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 1, h = 4, w = 4; - const char* test_node_name = "interp"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_interp_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "interp"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input_u8[16] = {0}; - get_uint8_data(input_fp32, input_u8, 16, input_scale, input_zero_point); - set_tensor_buffer(input_tensor, input_u8, 16); - - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/interp_param.h" + +int create_test_interp_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Interp"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct interp_param* param = (struct interp_param*)(struct node*)test_node->op.param_mem; + + param->resize_type = 1; + param->output_height = 2; + param->output_width = 2; + param->height_scale = 0.5; + param->width_scale = 0.5; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[16] = { + 1.0f, + 1.0f, + 1.0f, + 1.0f, + 1.0f, + 2.0f, + 2.0f, + 1.0f, + 1.0f, + 2.0f, + 2.0f, + 1.0f, + 1.0f, + 1.0f, + 1.0f, + 1.0f, +}; +float input_scale = 1; +int input_zero_point = 0; + +float reference_out[4] = {1, 1, 1, 2}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 1, h = 4, w = 4; + const char* test_node_name = "interp"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_interp_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "interp"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input_u8[16] = {0}; + get_uint8_data(input_fp32, input_u8, 16, input_scale, input_zero_point); + set_tensor_buffer(input_tensor, input_u8, 16); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_leakyrelu.cpp b/tests/op/test_timvx_op_leakyrelu.cpp index 3f69a25ac..6a037ef5a 100644 --- a/tests/op/test_timvx_op_leakyrelu.cpp +++ b/tests/op/test_timvx_op_leakyrelu.cpp @@ -22,21 +22,23 @@ * Author: qtang@openailab.com */ - #include "test_op.h" #include "operator/prototype/relu_param.h" - int create_test_leakyrelu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "ReLU"); + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "ReLU"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed.\n"); return -1; @@ -50,7 +52,7 @@ int create_test_leakyrelu_node(graph_t graph, const char* input_name, const char set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); /* set params */ - struct relu_param* relu_param = ( struct relu_param* )(struct node* )test_node->op.param_mem; + struct relu_param* relu_param = (struct relu_param*)(struct node*)test_node->op.param_mem; relu_param->negative_slope = 0.1f; return 0; @@ -83,7 +85,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_leakyrelu_node); - if(NULL == graph) + if (NULL == graph) return -1; // set quantize params @@ -106,7 +108,7 @@ int main(int argc, char* argv[]) // get output and dequant struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; + uint8_t* output_u8 = (uint8_t*)output_tensor->data; int output_size = output_tensor->elem_num; int out_c = output_tensor->dims[1]; int cstep = output_tensor->dims[2] * output_tensor->dims[3]; @@ -114,13 +116,13 @@ int main(int argc, char* argv[]) get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); std::vector output_data(output_size); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // check the result ret = 0; - for (int i = 0; i< out_c; i++) + for (int i = 0; i < out_c; i++) { - float* output_value = (float *)output_data.data() + i * cstep; + float* output_value = (float*)output_data.data() + i * cstep; for (int j = 0; j < cstep; j++) { if (fabsf(output_value[j] - reference_out[i]) > 0.05f) diff --git a/tests/op/test_timvx_op_mish.cpp b/tests/op/test_timvx_op_mish.cpp index bcbe67629..31aa6966a 100644 --- a/tests/op/test_timvx_op_mish.cpp +++ b/tests/op/test_timvx_op_mish.cpp @@ -25,17 +25,20 @@ #include #include "test_op.h" - int create_test_mish_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ node_t test_node = create_graph_node(graph, node_name, "Mish"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed.\n"); return -1; @@ -78,7 +81,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_mish_node); - if(NULL == graph) + if (NULL == graph) return -1; // set quantize params @@ -101,22 +104,22 @@ int main(int argc, char* argv[]) // get output and dequant struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; + uint8_t* output_u8 = (uint8_t*)output_tensor->data; int output_size = output_tensor->elem_num; int out_c = output_tensor->dims[1]; int cstep = output_tensor->dims[2] * output_tensor->dims[3]; get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); std::vector output_data(output_size); - + for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // check the result ret = 0; - for (int i = 0; i< out_c; i++) + for (int i = 0; i < out_c; i++) { - float* output_value = (float *)output_data.data() + i * cstep; + float* output_value = (float*)output_data.data() + i * cstep; for (int j = 0; j < cstep; j++) { if (fabsf(output_value[j] - reference_out[i]) > 0.01) diff --git a/tests/op/test_timvx_op_permute.cpp b/tests/op/test_timvx_op_permute.cpp index 8232f6cab..6848c1588 100644 --- a/tests/op/test_timvx_op_permute.cpp +++ b/tests/op/test_timvx_op_permute.cpp @@ -1,175 +1,186 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/permute_param.h" - - -int create_test_permute_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Permute"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct permute_param* param = ( struct permute_param* )(struct node* )test_node->op.param_mem; - - param->flag = 0; - param->order0 = 0; - param->order1 = 2; - param->order2 = 3; - param->order3 = 1; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[6] = {1.0f, 2.0f, 3.0f, - 4.0f, 5.0f, 6.0f, }; -float input_scale = 1; -int input_zero_point = 0; - -float reference_out[6] = {1.0f, 4.0f, - 2.0f, 5.0f, - 3.0f, 6.0f, }; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 2, h = 1, w = 3; - const char* test_node_name = "permute"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_permute_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "permute"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input_u8[6] = {0}; - get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point); - set_tensor_buffer(input_tensor, input_u8, 6); - - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/permute_param.h" + +int create_test_permute_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Permute"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct permute_param* param = (struct permute_param*)(struct node*)test_node->op.param_mem; + + param->flag = 0; + param->order0 = 0; + param->order1 = 2; + param->order2 = 3; + param->order3 = 1; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[6] = { + 1.0f, + 2.0f, + 3.0f, + 4.0f, + 5.0f, + 6.0f, +}; +float input_scale = 1; +int input_zero_point = 0; + +float reference_out[6] = { + 1.0f, + 4.0f, + 2.0f, + 5.0f, + 3.0f, + 6.0f, +}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 2, h = 1, w = 3; + const char* test_node_name = "permute"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_permute_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "permute"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input_u8[6] = {0}; + get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point); + set_tensor_buffer(input_tensor, input_u8, 6); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_pooling.cpp b/tests/op/test_timvx_op_pooling.cpp index 61a02c7ac..65575616f 100644 --- a/tests/op/test_timvx_op_pooling.cpp +++ b/tests/op/test_timvx_op_pooling.cpp @@ -22,21 +22,23 @@ * Author: qtang@openailab.com */ - #include "test_op.h" #include "operator/prototype/pooling_param.h" - int create_test_pool_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Pooling"); + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Pooling"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed.\n"); return -1; @@ -50,7 +52,7 @@ int create_test_pool_node(graph_t graph, const char* input_name, const char* nod set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); /* set params */ - struct pool_param* pool_param = ( struct pool_param* )(struct node* )test_node->op.param_mem; + struct pool_param* pool_param = (struct pool_param*)(struct node*)test_node->op.param_mem; pool_param->pool_method = POOL_MAX; pool_param->global = 0; @@ -99,7 +101,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_pool_node); - if(NULL == graph) + if (NULL == graph) return -1; // set quantize params @@ -122,21 +124,21 @@ int main(int argc, char* argv[]) // get output and dequant struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; + uint8_t* output_u8 = (uint8_t*)output_tensor->data; int output_size = output_tensor->elem_num; int out_c = output_tensor->dims[1]; int cstep = output_tensor->dims[2] * output_tensor->dims[3]; get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); + float* output_data = (float*)malloc(output_size * sizeof(float)); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // check the result ret = 0; - for (int i = 0; i< out_c; i++) + for (int i = 0; i < out_c; i++) { - float* output_value = (float *)output_data + i * cstep; + float* output_value = (float*)output_data + i * cstep; for (int j = 0; j < cstep; j++) { if (fabsf(output_value[j] - reference_out[i]) > 0.01) diff --git a/tests/op/test_timvx_op_prelu.cpp b/tests/op/test_timvx_op_prelu.cpp index fe3fc0a63..57b7e9bcd 100644 --- a/tests/op/test_timvx_op_prelu.cpp +++ b/tests/op/test_timvx_op_prelu.cpp @@ -22,20 +22,22 @@ * Author: qtang@openailab.com */ - #include "test_op.h" - int create_test_prelu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ node_t test_node = create_graph_node(graph, node_name, "PReLU"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed.\n"); return -1; @@ -48,7 +50,7 @@ int create_test_prelu_node(graph_t graph, const char* input_name, const char* no int dims[4]; get_tensor_shape(input_tensor, dims, 4); - int slope_dims[1] = {dims[1]}; // channel num + int slope_dims[1] = {dims[1]}; // channel num set_tensor_shape(slope_tensor, slope_dims, 1); /* input tensors of test node */ @@ -90,7 +92,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_prelu_node); - if(NULL == graph) + if (NULL == graph) return -1; // set quantize params @@ -120,21 +122,21 @@ int main(int argc, char* argv[]) // get output and dequant struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; + uint8_t* output_u8 = (uint8_t*)output_tensor->data; int output_size = output_tensor->elem_num; int out_c = output_tensor->dims[1]; int cstep = output_tensor->dims[2] * output_tensor->dims[3]; get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); + float* output_data = (float*)malloc(output_size * sizeof(float)); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // check the result ret = 0; - for (int i = 0; i< out_c; i++) + for (int i = 0; i < out_c; i++) { - float* output_value = (float *)output_data + i * cstep; + float* output_value = (float*)output_data + i * cstep; for (int j = 0; j < cstep; j++) { if (fabsf(output_value[j] - result_value[i]) > 0.01) diff --git a/tests/op/test_timvx_op_relu.cpp b/tests/op/test_timvx_op_relu.cpp index 1ed17f270..c7143ca78 100644 --- a/tests/op/test_timvx_op_relu.cpp +++ b/tests/op/test_timvx_op_relu.cpp @@ -22,20 +22,22 @@ * Author: qtang@openailab.com */ - #include "test_op.h" - int create_test_relu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ node_t test_node = create_graph_node(graph, node_name, "ReLU"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed.\n"); return -1; @@ -78,7 +80,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_relu_node); - if(NULL == graph) + if (NULL == graph) return -1; // set quantize params @@ -101,21 +103,21 @@ int main(int argc, char* argv[]) // get output and dequant struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; + uint8_t* output_u8 = (uint8_t*)output_tensor->data; int output_size = output_tensor->elem_num; int out_c = output_tensor->dims[1]; int cstep = output_tensor->dims[2] * output_tensor->dims[3]; get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); + float* output_data = (float*)malloc(output_size * sizeof(float)); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // check the result ret = 0; - for (int i = 0; i< out_c; i++) + for (int i = 0; i < out_c; i++) { - float* output_value = (float *)output_data + i * cstep; + float* output_value = (float*)output_data + i * cstep; for (int j = 0; j < cstep; j++) { if (fabsf(output_value[j] - reference_out[i]) > 0.01) diff --git a/tests/op/test_timvx_op_relu1.cpp b/tests/op/test_timvx_op_relu1.cpp index 8e3134a8d..78e60376d 100644 --- a/tests/op/test_timvx_op_relu1.cpp +++ b/tests/op/test_timvx_op_relu1.cpp @@ -22,20 +22,22 @@ * Author: qtang@openailab.com */ - #include "test_op.h" - int create_test_relu1_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ node_t test_node = create_graph_node(graph, node_name, "ReLU1"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed.\n"); return -1; @@ -78,7 +80,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_relu1_node); - if(NULL == graph) + if (NULL == graph) return -1; // set quantize params @@ -101,21 +103,21 @@ int main(int argc, char* argv[]) // get output and dequant struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; + uint8_t* output_u8 = (uint8_t*)output_tensor->data; int output_size = output_tensor->elem_num; int out_c = output_tensor->dims[1]; int cstep = output_tensor->dims[2] * output_tensor->dims[3]; get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); + float* output_data = (float*)malloc(output_size * sizeof(float)); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // check the result ret = 0; - for (int i = 0; i< out_c; i++) + for (int i = 0; i < out_c; i++) { - float* output_value = (float *)output_data + i * cstep; + float* output_value = (float*)output_data + i * cstep; for (int j = 0; j < cstep; j++) { if (fabsf(output_value[j] - reference_out[i]) > 0.01) diff --git a/tests/op/test_timvx_op_reshape.cpp b/tests/op/test_timvx_op_reshape.cpp index b9190d086..a511fe7ff 100644 --- a/tests/op/test_timvx_op_reshape.cpp +++ b/tests/op/test_timvx_op_reshape.cpp @@ -1,180 +1,192 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/reshape_param.h" - - -int create_test_permute_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Reshape"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct reshape_param* param = ( struct reshape_param* )(struct node* )test_node->op.param_mem; - - param->dim_size = 4; - - int* shape_tmp = ( int* )malloc(param->dim_size * sizeof(int)); - shape_tmp[0] = 1; - shape_tmp[1] = 1; - shape_tmp[2] = 3; - shape_tmp[3] = 2; - - param->re_shape = shape_tmp; - param->is_onnx = 1; - param->reverse = 0; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[6] = {1.0f, 2.0f, 3.0f, - 4.0f, 5.0f, 6.0f, }; -float input_scale = 1; -int input_zero_point = 0; - -float reference_out[6] = {1.0f, 2.0f, 3.0f, - 4.0f, 5.0f, 6.0f, }; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 2, h = 1, w = 3; - const char* test_node_name = "reshape"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_permute_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "reshape"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input_u8[6] = {0}; - get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point); - set_tensor_buffer(input_tensor, input_u8, 6); - - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/reshape_param.h" + +int create_test_permute_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Reshape"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct reshape_param* param = (struct reshape_param*)(struct node*)test_node->op.param_mem; + + param->dim_size = 4; + + int* shape_tmp = (int*)malloc(param->dim_size * sizeof(int)); + shape_tmp[0] = 1; + shape_tmp[1] = 1; + shape_tmp[2] = 3; + shape_tmp[3] = 2; + + param->re_shape = shape_tmp; + param->is_onnx = 1; + param->reverse = 0; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[6] = { + 1.0f, + 2.0f, + 3.0f, + 4.0f, + 5.0f, + 6.0f, +}; +float input_scale = 1; +int input_zero_point = 0; + +float reference_out[6] = { + 1.0f, + 2.0f, + 3.0f, + 4.0f, + 5.0f, + 6.0f, +}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 2, h = 1, w = 3; + const char* test_node_name = "reshape"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_permute_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "reshape"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input_u8[6] = {0}; + get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point); + set_tensor_buffer(input_tensor, input_u8, 6); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_resize.cpp b/tests/op/test_timvx_op_resize.cpp index 2c54ee129..1c64c9b8e 100644 --- a/tests/op/test_timvx_op_resize.cpp +++ b/tests/op/test_timvx_op_resize.cpp @@ -1,173 +1,187 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/resize_param.h" - - -int create_test_resize_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Resize"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct resize_param* param = ( struct resize_param* )(struct node* )test_node->op.param_mem; - - param->type = 0; - param->scale_w = 0.5; - param->scale_h = 0.5; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[16] = {1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 2.0f, 2.0f, 1.0f, - 1.0f, 2.0f, 2.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, }; -float input_scale = 1; -int input_zero_point = 0; - -float reference_out[4] = {1, 1, 1, 2}; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 1, h = 4, w = 4; - const char* test_node_name = "resize"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_resize_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "resize"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input_u8[16] = {0}; - get_uint8_data(input_fp32, input_u8, 16, input_scale, input_zero_point); - set_tensor_buffer(input_tensor, input_u8, 16); - - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/resize_param.h" + +int create_test_resize_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Resize"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct resize_param* param = (struct resize_param*)(struct node*)test_node->op.param_mem; + + param->type = 0; + param->scale_w = 0.5; + param->scale_h = 0.5; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[16] = { + 1.0f, + 1.0f, + 1.0f, + 1.0f, + 1.0f, + 2.0f, + 2.0f, + 1.0f, + 1.0f, + 2.0f, + 2.0f, + 1.0f, + 1.0f, + 1.0f, + 1.0f, + 1.0f, +}; +float input_scale = 1; +int input_zero_point = 0; + +float reference_out[4] = {1, 1, 1, 2}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 1, h = 4, w = 4; + const char* test_node_name = "resize"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_resize_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "resize"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input_u8[16] = {0}; + get_uint8_data(input_fp32, input_u8, 16, input_scale, input_zero_point); + set_tensor_buffer(input_tensor, input_u8, 16); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_sigmoid.cpp b/tests/op/test_timvx_op_sigmoid.cpp index b680ff2d5..b37411f8f 100644 --- a/tests/op/test_timvx_op_sigmoid.cpp +++ b/tests/op/test_timvx_op_sigmoid.cpp @@ -22,20 +22,22 @@ * Author: qtang@openailab.com */ - #include "test_op.h" - int create_test_sigmoid_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Sigmoid"); + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Sigmoid"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed.\n"); return -1; @@ -78,7 +80,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_sigmoid_node); - if(NULL == graph) + if (NULL == graph) return -1; // set quantize params @@ -101,7 +103,7 @@ int main(int argc, char* argv[]) // get output and dequant struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; + uint8_t* output_u8 = (uint8_t*)output_tensor->data; int output_size = output_tensor->elem_num; int out_c = output_tensor->dims[1]; int cstep = output_tensor->dims[2] * output_tensor->dims[3]; @@ -109,13 +111,13 @@ int main(int argc, char* argv[]) get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); std::vector output_data(output_size); for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // check the result ret = 0; - for (int i = 0; i< out_c; i++) + for (int i = 0; i < out_c; i++) { - float* output_value = (float *)output_data.data() + i * cstep; + float* output_value = (float*)output_data.data() + i * cstep; for (int j = 0; j < cstep; j++) { if (fabsf(output_value[j] - reference_out[i]) > 0.01f) diff --git a/tests/op/test_timvx_op_slice.cpp b/tests/op/test_timvx_op_slice.cpp index f9ce16bbf..2f3f86083 100644 --- a/tests/op/test_timvx_op_slice.cpp +++ b/tests/op/test_timvx_op_slice.cpp @@ -1,172 +1,182 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/slice_param.h" - - -int create_test_slice_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Slice"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct slice_param* param = ( struct slice_param* )(struct node* )test_node->op.param_mem; - - param->axis = 1; - param->begin = 1; - param->end = 2; - param->isonnx = 1; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[6] = {1.0f, 2.0f, 3.0f, - 4.0f, 5.0f, 6.0f, }; -float input_scale = 1; -int input_zero_point = 0; - -float reference_out[3] = {4.0f, 5.0f, 6.0f, }; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 2, h = 1, w = 3; - const char* test_node_name = "slice"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_slice_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "slice"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input_u8[6] = {0}; - get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point); - set_tensor_buffer(input_tensor, input_u8, 6); - - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/slice_param.h" + +int create_test_slice_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Slice"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct slice_param* param = (struct slice_param*)(struct node*)test_node->op.param_mem; + + param->axis = 1; + param->begin = 1; + param->end = 2; + param->isonnx = 1; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[6] = { + 1.0f, + 2.0f, + 3.0f, + 4.0f, + 5.0f, + 6.0f, +}; +float input_scale = 1; +int input_zero_point = 0; + +float reference_out[3] = { + 4.0f, + 5.0f, + 6.0f, +}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 2, h = 1, w = 3; + const char* test_node_name = "slice"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_slice_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "slice"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input_u8[6] = {0}; + get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point); + set_tensor_buffer(input_tensor, input_u8, 6); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_softmax.cpp b/tests/op/test_timvx_op_softmax.cpp index 70d535d90..4d1a577fa 100644 --- a/tests/op/test_timvx_op_softmax.cpp +++ b/tests/op/test_timvx_op_softmax.cpp @@ -1,168 +1,176 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/softmax_param.h" - - -int create_test_softmax_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Softmax"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct softmax_param* param = ( struct softmax_param* )(struct node* )test_node->op.param_mem; - - param->axis = 1; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[3] = {0.0f, 1.0f, 2.0f,}; -float input_scale = 1; -int input_zero_point = 0; - -float reference_out[3] = {0.0f, 0.243164, 0.666740,}; -float output_scale = 0.003922; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 3, h = 1, w = 1; - const char* test_node_name = "softmax"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_softmax_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "softmax"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input_u8[3] = {0}; - get_uint8_data(input_fp32, input_u8, 3, input_scale, input_zero_point); - set_tensor_buffer(input_tensor, input_u8, 3); - - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/softmax_param.h" + +int create_test_softmax_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Softmax"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct softmax_param* param = (struct softmax_param*)(struct node*)test_node->op.param_mem; + + param->axis = 1; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[3] = { + 0.0f, + 1.0f, + 2.0f, +}; +float input_scale = 1; +int input_zero_point = 0; + +float reference_out[3] = { + 0.0f, + 0.243164, + 0.666740, +}; +float output_scale = 0.003922; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 3, h = 1, w = 1; + const char* test_node_name = "softmax"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_softmax_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "softmax"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input_u8[3] = {0}; + get_uint8_data(input_fp32, input_u8, 3, input_scale, input_zero_point); + set_tensor_buffer(input_tensor, input_u8, 3); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_split.cpp b/tests/op/test_timvx_op_split.cpp index 7652cc3c4..419082e6f 100644 --- a/tests/op/test_timvx_op_split.cpp +++ b/tests/op/test_timvx_op_split.cpp @@ -1,205 +1,219 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/split_param.h" - -extern "C" { -#include "vector.h" -} - -int create_test_split_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Split"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - tensor_t output_tensor1 = create_graph_tensor(graph, "out1", data_type); - set_node_output_tensor(test_node, 1, output_tensor1, TENSOR_TYPE_VAR); - - /* set params */ - struct split_param* param = ( struct split_param* )(struct node* )test_node->op.param_mem; - - param->axis = 1; - param->split_dim = 2; - - param->split_sizes_ = create_vector(sizeof(int), nullptr); - - int tmp = 1; - push_vector_data(param->split_sizes_, &tmp); - push_vector_data(param->split_sizes_, &tmp); - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[6] = {1.0f, 2.0f, 3.0f, - 4.0f, 5.0f, 6.0f, }; -float input_scale = 1; -int input_zero_point = 0; - -float reference_out[3] = {1.0f, 2.0f, 3.0f, }; -float output_scale = 1; -int output_zero_point = 0; - -float reference_out1[3] = {4.0f, 5.0f, 6.0f, }; -float output_scale1 = 1; -int output_zero_point1 = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 2, h = 1, w = 3; - const char* test_node_name = "split"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_split_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "split"); - struct tensor* output_tensor1 = (struct tensor*)get_graph_tensor(ir_graph, "out1"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - set_tensor_quant_param(output_tensor1, &output_scale1, &output_zero_point1, 1); - - // set input data - uint8_t input_u8[6] = {0}; - get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point); - set_tensor_buffer(input_tensor, input_u8, 6); - - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - - uint8_t* output1_u8 = ( uint8_t* )output_tensor1->data; - int output_size1 = output_tensor1->elem_num; - - get_tensor_quant_param(output_tensor1, &output_scale1, &output_zero_point1, 1); - float* output_data1 = ( float* )malloc(output_size1 * sizeof(float)); - for (int i = 0; i < output_size1; i++) - output_data1[i] = (( float )output1_u8[i] - ( float )output_zero_point1) * output_scale1; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - for (int i = 0; i< output_size1; i++) - { - if (fabsf(output_data1[i] - reference_out1[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data1[i], reference_out1[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/split_param.h" + +extern "C" { +#include "vector.h" +} + +int create_test_split_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Split"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + tensor_t output_tensor1 = create_graph_tensor(graph, "out1", data_type); + set_node_output_tensor(test_node, 1, output_tensor1, TENSOR_TYPE_VAR); + + /* set params */ + struct split_param* param = (struct split_param*)(struct node*)test_node->op.param_mem; + + param->axis = 1; + param->split_dim = 2; + + param->split_sizes_ = create_vector(sizeof(int), nullptr); + + int tmp = 1; + push_vector_data(param->split_sizes_, &tmp); + push_vector_data(param->split_sizes_, &tmp); + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[6] = { + 1.0f, + 2.0f, + 3.0f, + 4.0f, + 5.0f, + 6.0f, +}; +float input_scale = 1; +int input_zero_point = 0; + +float reference_out[3] = { + 1.0f, + 2.0f, + 3.0f, +}; +float output_scale = 1; +int output_zero_point = 0; + +float reference_out1[3] = { + 4.0f, + 5.0f, + 6.0f, +}; +float output_scale1 = 1; +int output_zero_point1 = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 2, h = 1, w = 3; + const char* test_node_name = "split"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_split_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "split"); + struct tensor* output_tensor1 = (struct tensor*)get_graph_tensor(ir_graph, "out1"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + set_tensor_quant_param(output_tensor1, &output_scale1, &output_zero_point1, 1); + + // set input data + uint8_t input_u8[6] = {0}; + get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point); + set_tensor_buffer(input_tensor, input_u8, 6); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + uint8_t* output1_u8 = (uint8_t*)output_tensor1->data; + int output_size1 = output_tensor1->elem_num; + + get_tensor_quant_param(output_tensor1, &output_scale1, &output_zero_point1, 1); + float* output_data1 = (float*)malloc(output_size1 * sizeof(float)); + for (int i = 0; i < output_size1; i++) + output_data1[i] = ((float)output1_u8[i] - (float)output_zero_point1) * output_scale1; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + for (int i = 0; i < output_size1; i++) + { + if (fabsf(output_data1[i] - reference_out1[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data1[i], reference_out1[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_tanh.cpp b/tests/op/test_timvx_op_tanh.cpp index fdf93bd89..4f8310940 100644 --- a/tests/op/test_timvx_op_tanh.cpp +++ b/tests/op/test_timvx_op_tanh.cpp @@ -25,17 +25,20 @@ #include #include "test_op.h" - int create_test_tanh_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) { - (void)layout; (void)n; (void)c; (void)h; (void)w; + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; /* create the test node */ node_t test_node = create_graph_node(graph, node_name, "Tanh"); tensor_t input_tensor = get_graph_tensor(graph, input_name); - if(NULL == input_tensor) + if (NULL == input_tensor) { fprintf(stderr, "create test node failed.\n"); return -1; @@ -78,7 +81,7 @@ int main(int argc, char* argv[]) // create graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_tanh_node); - if(NULL == graph) + if (NULL == graph) return -1; // set quantize params @@ -101,22 +104,22 @@ int main(int argc, char* argv[]) // get output and dequant struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0); - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; + uint8_t* output_u8 = (uint8_t*)output_tensor->data; int output_size = output_tensor->elem_num; int out_c = output_tensor->dims[1]; int cstep = output_tensor->dims[2] * output_tensor->dims[3]; get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); std::vector output_data(output_size); - + for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // check the result ret = 0; - for (int i = 0; i< out_c; i++) + for (int i = 0; i < out_c; i++) { - float* output_value = (float *)output_data.data() + i * cstep; + float* output_value = (float*)output_data.data() + i * cstep; for (int j = 0; j < cstep; j++) { if (fabsf(output_value[j] - reference_out[i]) > 0.05f) diff --git a/tests/op/test_timvx_op_transpose.cpp b/tests/op/test_timvx_op_transpose.cpp index 1f5b4e424..9233d5009 100644 --- a/tests/op/test_timvx_op_transpose.cpp +++ b/tests/op/test_timvx_op_transpose.cpp @@ -1,183 +1,223 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/transpose_param.h" - - -int create_test_permute_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Transpose"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct transpose_param* param = ( struct transpose_param* )(struct node* )test_node->op.param_mem; - - int* t_shape = (int*) malloc(sizeof(int) * 5 ) ; - t_shape[0] = 0; - t_shape[1] = 2; - t_shape[2] = 1; - t_shape[3] = 3; - t_shape[4] = 4; - - param->tr_shape_size = 5; - param->tr_shape = t_shape; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[24] = {1.0f, 2.0f, 3.0f, 4.0f, - 5.0f, 6.0f, 7.0f, 8.0f, - 9.0f, 10.0f, 11.0f, 12.0f, - 13.0f, 14.0f, 15.0f, 16.0f, - 17.0f, 18.0f, 19.0f, 20.0f, - 21.0f, 22.0f, 23.0f, 24.0f,}; -float input_scale = 1; -int input_zero_point = 0; - -float reference_out[24] = {1.0f, 2.0f, 3.0f, 4.0f, - 13.0f, 14.0f, 15.0f, 16.0f, - 5.0f, 6.0f, 7.0f, 8.0f, - 17.0f, 18.0f, 19.0f, 20.0f, - 9.0f, 10.0f, 11.0f, 12.0f, - 21.0f, 22.0f, 23.0f, 24.0f,}; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 2, c = 3, h = 2, w = 2; - const char* test_node_name = "permute"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_permute_node, 5); - if(NULL == ir_graph) - return -1; - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "permute"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input_u8[24] = {0}; - get_uint8_data(input_fp32, input_u8, 24, input_scale, input_zero_point); - set_tensor_buffer(input_tensor, input_u8, 24); - - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/transpose_param.h" + +int create_test_permute_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Transpose"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct transpose_param* param = (struct transpose_param*)(struct node*)test_node->op.param_mem; + + int* t_shape = (int*)malloc(sizeof(int) * 5); + t_shape[0] = 0; + t_shape[1] = 2; + t_shape[2] = 1; + t_shape[3] = 3; + t_shape[4] = 4; + + param->tr_shape_size = 5; + param->tr_shape = t_shape; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[24] = { + 1.0f, + 2.0f, + 3.0f, + 4.0f, + 5.0f, + 6.0f, + 7.0f, + 8.0f, + 9.0f, + 10.0f, + 11.0f, + 12.0f, + 13.0f, + 14.0f, + 15.0f, + 16.0f, + 17.0f, + 18.0f, + 19.0f, + 20.0f, + 21.0f, + 22.0f, + 23.0f, + 24.0f, +}; +float input_scale = 1; +int input_zero_point = 0; + +float reference_out[24] = { + 1.0f, + 2.0f, + 3.0f, + 4.0f, + 13.0f, + 14.0f, + 15.0f, + 16.0f, + 5.0f, + 6.0f, + 7.0f, + 8.0f, + 17.0f, + 18.0f, + 19.0f, + 20.0f, + 9.0f, + 10.0f, + 11.0f, + 12.0f, + 21.0f, + 22.0f, + 23.0f, + 24.0f, +}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 2, c = 3, h = 2, w = 2; + const char* test_node_name = "permute"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_permute_node, 5); + if (NULL == ir_graph) + return -1; + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "permute"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input_u8[24] = {0}; + get_uint8_data(input_fp32, input_u8, 24, input_scale, input_zero_point); + set_tensor_buffer(input_tensor, input_u8, 24); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tests/op/test_timvx_op_upsampling.cpp b/tests/op/test_timvx_op_upsampling.cpp index aa30baa5d..3f8e45a88 100644 --- a/tests/op/test_timvx_op_upsampling.cpp +++ b/tests/op/test_timvx_op_upsampling.cpp @@ -1,171 +1,185 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - - -#include "test_op.h" - -#include "graph/graph.h" -#include "graph/node.h" -#include "graph/tensor.h" -#include "operator/prototype/upsample_param.h" - - -int create_test_interp_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; (void)n; (void)c; (void)h; (void)w; - - /* create the test node */ - struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Upsample"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if(NULL == input_tensor) - { - fprintf(stderr, "create test node failed.\n"); - return -1; - } - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set params */ - struct upsample_param* param = ( struct upsample_param* )(struct node* )test_node->op.param_mem; - - param->scale = 0.5; - - return 0; -} - -/* - * scale = (max - min) / 255 - * zero_point = -min / scale - * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) - * float32 = (uint8 - zero_point) * scale - */ -float input_fp32[16] = {1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 2.0f, 2.0f, 1.0f, - 1.0f, 2.0f, 2.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, }; -float input_scale = 1; -int input_zero_point = 0; - -float reference_out[4] = {1, 1, 1, 2}; -float output_scale = 1; -int output_zero_point = 0; - - -void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) -{ - for (int i = 0; i < size; i++) - { - int udata = (round)(data_fp32[i] / scale + zero_point); - if (udata > 255) - udata = 255; - else if (udata < 0) - udata = 0; - - date_u8[i] = udata; - } -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 1, h = 4, w = 4; - const char* test_node_name = "upsample"; - int data_type = TENGINE_DT_UINT8; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed.\n"); - - // create - struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_interp_node); - if(NULL == ir_graph) - return -1; - - set_log_level(LOG_INFO); - dump_graph(ir_graph); - - // set quantize params - struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); - struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "upsample"); - -// tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); - set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); - set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - - // set input data - uint8_t input_u8[16] = {0}; - get_uint8_data(input_fp32, input_u8, 16, input_scale, input_zero_point); - set_tensor_buffer(input_tensor, input_u8, 16); - - - // set bias data - // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); - - // graph run - ret = test_graph_run(ir_graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(ir_graph); - return -1; - } - - // get output and dequant - uint8_t* output_u8 = ( uint8_t* )output_tensor->data; - int output_size = output_tensor->elem_num; - - get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); - float* output_data = ( float* )malloc(output_size * sizeof(float)); - for (int i = 0; i < output_size; i++) - output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale; - - // check the result - ret = 0; - for (int i = 0; i< output_size; i++) - { - if (fabsf(output_data[i] - reference_out[i]) > 0.1) - { - fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); - ret = -1; - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(ir_graph); - - return ret; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include "test_op.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "operator/prototype/upsample_param.h" + +int create_test_interp_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) +{ + (void)layout; + (void)n; + (void)c; + (void)h; + (void)w; + + /* create the test node */ + struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Upsample"); + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + + if (NULL == input_tensor) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + /* input tensors of test node */ + set_node_input_tensor(test_node, 0, input_tensor); + + /* output tensors of test node */ + tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); + set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); + + /* set params */ + struct upsample_param* param = (struct upsample_param*)(struct node*)test_node->op.param_mem; + + param->scale = 0.5; + + return 0; +} + +/* + * scale = (max - min) / 255 + * zero_point = -min / scale + * uint8 = clip(round(float32 / scale) + zero_point, 0, 255) + * float32 = (uint8 - zero_point) * scale + */ +float input_fp32[16] = { + 1.0f, + 1.0f, + 1.0f, + 1.0f, + 1.0f, + 2.0f, + 2.0f, + 1.0f, + 1.0f, + 2.0f, + 2.0f, + 1.0f, + 1.0f, + 1.0f, + 1.0f, + 1.0f, +}; +float input_scale = 1; +int input_zero_point = 0; + +float reference_out[4] = {1, 1, 1, 2}; +float output_scale = 1; +int output_zero_point = 0; + +void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point) +{ + for (int i = 0; i < size; i++) + { + int udata = (round)(data_fp32[i] / scale + zero_point); + if (udata > 255) + udata = 255; + else if (udata < 0) + udata = 0; + + date_u8[i] = udata; + } +} + +int main(int argc, char* argv[]) +{ + int n = 1, c = 1, h = 4, w = 4; + const char* test_node_name = "upsample"; + int data_type = TENGINE_DT_UINT8; + int layout = TENGINE_LAYOUT_NCHW; + + // init + int ret = test_graph_init(); + if (0 != ret) + fprintf(stderr, "Tengine init failed.\n"); + + // create + struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_interp_node); + if (NULL == ir_graph) + return -1; + + set_log_level(LOG_INFO); + dump_graph(ir_graph); + + // set quantize params + struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); + struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "upsample"); + + // tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0); + set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1); + set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + + // set input data + uint8_t input_u8[16] = {0}; + get_uint8_data(input_fp32, input_u8, 16, input_scale, input_zero_point); + set_tensor_buffer(input_tensor, input_u8, 16); + + // set bias data + // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f); + + // graph run + ret = test_graph_run(ir_graph); + if (0 != ret) + { + fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); + test_graph_release(ir_graph); + return -1; + } + + // get output and dequant + uint8_t* output_u8 = (uint8_t*)output_tensor->data; + int output_size = output_tensor->elem_num; + + get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); + float* output_data = (float*)malloc(output_size * sizeof(float)); + for (int i = 0; i < output_size; i++) + output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; + + // check the result + ret = 0; + for (int i = 0; i < output_size; i++) + { + if (fabsf(output_data[i] - reference_out[i]) > 0.1) + { + fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]); + ret = -1; + } + } + + if (ret == 0) + fprintf(stderr, "test pass.\n"); + else + fprintf(stderr, "test failed.\n"); + + // exit + test_graph_release(ir_graph); + + return ret; +} diff --git a/tools/convert_tool/caffe/caffe2tengine.cpp b/tools/convert_tool/caffe/caffe2tengine.cpp index bf8556bb0..74f777abe 100644 --- a/tools/convert_tool/caffe/caffe2tengine.cpp +++ b/tools/convert_tool/caffe/caffe2tengine.cpp @@ -28,13 +28,12 @@ * SELF DEFINE VARIABLE * FOR CAFFE SERIALIZER */ -const int OP_VERSION=1; - +const int OP_VERSION = 1; int caffe_serializer::load_text_file(std::string model_file, te_caffe::NetParameter& caffe_net) { std::ifstream is(model_file.c_str(), std::ios::in); - + if (!is.is_open()) { TLOG_ERR("cannot open file: %s \n", model_file.c_str()); @@ -70,7 +69,6 @@ int caffe_serializer::load_binary_file(std::string model_file, te_caffe::NetPara coded_input.SetTotalBytesLimit(INT_MAX, INT_MAX / 2); #endif - bool ret = caffe_net.ParseFromCodedStream(&coded_input); is.close(); @@ -82,7 +80,7 @@ int caffe_serializer::load_binary_file(std::string model_file, te_caffe::NetPara } bool caffe_serializer::find_op_load_method(const std::string& op_name) { - if(op_load_map.count(op_name)) + if (op_load_map.count(op_name)) return true; return false; @@ -96,7 +94,7 @@ ir_tensor_t* find_caffe_tensor(ir_graph_t* graph, const std::string& tensor_name if (tensor->name == tensor_name) return tensor; } - + return nullptr; } @@ -122,13 +120,13 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara { const te_caffe::LayerParameter& layer_param = test_net.layer(i); const std::string& caffe_op_name = layer_param.type(); - if(!find_op_load_method(caffe_op_name)) + if (!find_op_load_method(caffe_op_name)) { // printf("%s \n", caffe_op_name.c_str()); - auto it = find(no_supported_op.begin(),no_supported_op.end(),caffe_op_name); - if(it == no_supported_op.end()) + auto it = find(no_supported_op.begin(), no_supported_op.end(), caffe_op_name); + if (it == no_supported_op.end()) { - if(caffe_op_name == "Constant") + if (caffe_op_name == "Constant") continue; no_supported_op.push_back(caffe_op_name); } @@ -137,7 +135,7 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara if (no_supported_op.size()) { TLOG_ERR("These %d op are not supported\n{ ", no_supported_op.size()); - for(int j = 0; j < (int) no_supported_op.size(); j++) + for (int j = 0; j < (int)no_supported_op.size(); j++) { TLOG_ERR("%s ", no_supported_op[j].c_str()); } @@ -165,12 +163,12 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara // ir_tensor_t* tensor = find_caffe_tensor(graph, orig_name); int tensor_id = get_ir_tensor_index_from_name(graph, orig_name.c_str()); - ir_tensor_t* tensor = get_ir_graph_tensor(graph, tensor_id); + ir_tensor_t* tensor = get_ir_graph_tensor(graph, tensor_id); // fprintf(stderr, "input tensor : %s \n", tensor->name); set_ir_node_input_tensor(ir_node, i, tensor); - if(train_name_map.count(layer_param.name())) + if (train_name_map.count(layer_param.name())) { // printf("train data copy in: %s \n", layer_param.name().c_str()); @@ -182,7 +180,8 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara if (p_train->blobs_size()) { blob_load_t func = blob_load_map[caffe_op_name]; - if (!func(graph, ir_node, *p_train)){ + if (!func(graph, ir_node, *p_train)) + { break; } } @@ -191,7 +190,6 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara // output_tensors.push_back(tensor); // fprintf(stderr, "output_tensors num: %d %s\n", (int)output_tensors.size(), output_tensors[(int)output_tensors.size()-1]->name); } - // fprintf(stderr, "layer_param.top_size() %d %s \n", layer_param.top_size(), caffe_op_name.c_str()); for (int i = 0; i < layer_param.top_size(); i++) @@ -224,7 +222,7 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara TLOG_ERR("load op %s func failed in node %s .\n", caffe_op_name.c_str(), ir_node->name); return -1; } - #if 0 +#if 0 if(train_name_map.count(layer_param.name())) { // fprintf(stderr, "train_name_map : %s \n", layer_param.name().c_str()); @@ -240,15 +238,14 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara } } } - #endif - +#endif } // printf("tensor \n"); - if (n < layer_number){ + if (n < layer_number) + { fprintf(stderr, "Check layer number error ! \n"); return -1; } - } int caffe_serializer::load_tensor_data(ir_graph_t* graph, const te_caffe::NetParameter test_net, const te_caffe::NetParameter train_net) { @@ -265,7 +262,7 @@ int caffe_serializer::load_tensor_data(ir_graph_t* graph, const te_caffe::NetPar layer_number = test_net.layer_size(); int size = (int)op_load_map.size(); - + int n; // printf("layer number : %d \n", layer_number); for (n = 0; n < layer_number; n++) @@ -277,16 +274,16 @@ int caffe_serializer::load_tensor_data(ir_graph_t* graph, const te_caffe::NetPar if (ir_node == NULL) return -1; - if(train_name_map.count(layer_param.name())) + if (train_name_map.count(layer_param.name())) { - const te_caffe::LayerParameter* p_train; p_train = train_name_map[layer_param.name()]; if (p_train->blobs_size()) { blob_load_t func = blob_load_map[caffe_op_name]; - if (!func(graph, ir_node, *p_train)){ + if (!func(graph, ir_node, *p_train)) + { break; } } @@ -307,7 +304,7 @@ int caffe_serializer::load_model(ir_graph_t* graph, std::string model_file, std: return -1; fprintf(stderr, "Process 2: Finish load protobuf file \n"); // if (load_tensor_data(graph, test_net, train_net) < 0) - // return -1; + // return -1; fprintf(stderr, "Process 3: Finish load graph node \n"); if (load_graph_node(graph, test_net, train_net) < 0) return -1; @@ -321,7 +318,7 @@ int caffe_serializer::load_model(ir_graph_t* graph, std::string model_file, std: // return -1; // if (set_graph_output(graph, onnx_graph) < 0) // return -1; - + return 0; } @@ -350,7 +347,6 @@ graph_t caffe_serializer::caffe2tengine(std::string model_file, std::string prot return ir_graph; } - static void LoadCaffeBlob(ir_graph_t* ir_graph, ir_node_t* ir_node, const std::vector& name_list, const std::vector& layout_list, const te_caffe::LayerParameter& layer_param) @@ -361,7 +357,7 @@ static void LoadCaffeBlob(ir_graph_t* ir_graph, ir_node_t* ir_node, const std::v { std::string node_name = ir_node->name; std::string new_tensor_name = node_name + "/" + name_list[i]; - + ir_tensor_t* ir_tensor = create_ir_tensor(ir_graph, new_tensor_name.c_str(), TENGINE_DT_FP32); /* load tensor data*/ @@ -369,12 +365,12 @@ static void LoadCaffeBlob(ir_graph_t* ir_graph, ir_node_t* ir_node, const std::v const te_caffe::BlobProto& blob = layer_param.blobs(i); int dim_num = 0; - int *dims; + int* dims; if (blob.has_shape()) { dim_num = blob.shape().dim_size(); - dims = (int*)malloc(sizeof(int)*dim_num); - memset(dims, 0, sizeof(int)*dim_num); + dims = (int*)malloc(sizeof(int) * dim_num); + memset(dims, 0, sizeof(int) * dim_num); for (int i = 0; i < dim_num; i++) { dims[i] = blob.shape().dim(i); @@ -393,17 +389,17 @@ static void LoadCaffeBlob(ir_graph_t* ir_graph, ir_node_t* ir_node, const std::v while (temp[start] == 1) start++; - dim_num = temp.size() - start; - dims = (int*)malloc(sizeof(int)*dim_num); - memset(dims, 0, sizeof(int)*dim_num); + dim_num = temp.size() - start; + dims = (int*)malloc(sizeof(int) * dim_num); + memset(dims, 0, sizeof(int) * dim_num); for (unsigned int i = start; i < temp.size(); i++) dims[i] = temp[i]; } - if ( dim_num > 0) + if (dim_num > 0) { set_ir_tensor_shape(ir_tensor, dims, dim_num); ir_tensor->tensor_type = TENSOR_TYPE_CONST; - int tensor_size = ir_tensor->elem_num * sizeof(float); + int tensor_size = ir_tensor->elem_num * sizeof(float); ir_tensor->data = sys_malloc(tensor_size); float* ptr = (float*)ir_tensor->data; @@ -418,12 +414,10 @@ static void LoadCaffeBlob(ir_graph_t* ir_graph, ir_node_t* ir_node, const std::v // int index = get_ir_node_index_from_name(ir_graph, new_tensor_name.c_str()); set_ir_node_output_tensor(new_ir_node, 0, ir_tensor); - set_ir_node_input_tensor(ir_node, i+1, ir_tensor); + set_ir_node_input_tensor(ir_node, i + 1, ir_tensor); } } - - static void CreatePresetNode(ir_graph_t* graph, ir_node_t* ir_node, const char* name, const char* layout, std::vector& temp, float val, int index) { @@ -434,8 +428,8 @@ static void CreatePresetNode(ir_graph_t* graph, ir_node_t* ir_node, const char* int dim_num = temp.size(); if (dim_num > 0) { - int *dims = (int*)malloc(sizeof(int)*dim_num); - memset(dims, 0, sizeof(int)*dim_num); + int* dims = (int*)malloc(sizeof(int) * dim_num); + memset(dims, 0, sizeof(int) * dim_num); int elem_size = 1; for (unsigned int i = 0; i < dim_num; i++) @@ -445,7 +439,7 @@ static void CreatePresetNode(ir_graph_t* graph, ir_node_t* ir_node, const char* } set_ir_tensor_shape(ir_tensor, dims, dim_num); ir_tensor->tensor_type = TENSOR_TYPE_CONST; - int tensor_size = elem_size * sizeof(float); + int tensor_size = elem_size * sizeof(float); ir_tensor->data = sys_malloc(tensor_size); float* ptr = (float*)ir_tensor->data; @@ -459,13 +453,11 @@ static void CreatePresetNode(ir_graph_t* graph, ir_node_t* ir_node, const char* set_ir_node_input_tensor(new_ir_node, 0, ir_tensor); } - bool load_batchnorm_blob(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param) { - const te_caffe::BlobProto& rescale_blob = layer_param.blobs(2); - + const te_caffe::BlobProto& rescale_blob = layer_param.blobs(2); - struct batchnorm_param* batchnorm_param = ( struct batchnorm_param* )node->op.param_mem; + struct batchnorm_param* batchnorm_param = (struct batchnorm_param*)node->op.param_mem; batchnorm_param->rescale_factor = rescale_blob.data(0); @@ -486,13 +478,13 @@ bool load_batchnorm_blob(ir_graph_t* graph, ir_node_t* node, const te_caffe::Lay std::vector layout_list = {"W", "W"}; LoadCaffeBlob(graph, node, name_list, layout_list, layer_param); - } + } return 0; } int load_batchnorm(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param) { - struct batchnorm_param* batchnorm_param = ( struct batchnorm_param* )node->op.param_mem; + struct batchnorm_param* batchnorm_param = (struct batchnorm_param*)node->op.param_mem; const te_caffe::BatchNormParameter& bn_param = layer_param.batch_norm_param(); @@ -511,7 +503,7 @@ int load_softmax(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParame { const te_caffe::SoftmaxParameter& softmax_param = layer_param.softmax_param(); - struct softmax_param* param = ( struct softmax_param* )node->op.param_mem; + struct softmax_param* param = (struct softmax_param*)node->op.param_mem; if (softmax_param.has_axis()) param->axis = softmax_param.axis(); @@ -521,13 +513,12 @@ int load_softmax(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParame return 0; } - int load_conv(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param) { const te_caffe::ConvolutionParameter& conv_param = layer_param.convolution_param(); // const te_caffe::LayerParameter& layer_param = caffe_net.layer(i); const std::string& caffe_op_name = layer_param.type(); - struct conv_param* param = ( struct conv_param* )node->op.param_mem; + struct conv_param* param = (struct conv_param*)node->op.param_mem; if (conv_param.has_kernel_h() && conv_param.has_kernel_w()) { @@ -596,7 +587,7 @@ int load_deconv(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParamet { const te_caffe::ConvolutionParameter& conv_param = layer_param.convolution_param(); - struct deconv_param* param = ( struct deconv_param* )node->op.param_mem; + struct deconv_param* param = (struct deconv_param*)node->op.param_mem; if (conv_param.has_kernel_h() && conv_param.has_kernel_w()) { @@ -667,12 +658,11 @@ PoolArg ConvertCaffePool(te_caffe::PoolingParameter_PoolMethod method) return kPoolMax; } - int load_fc(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param) { const te_caffe::InnerProductParameter& ip_param = layer_param.inner_product_param(); - struct fc_param* param = ( struct fc_param* )node->op.param_mem; + struct fc_param* param = (struct fc_param*)node->op.param_mem; param->num_output = ip_param.num_output(); /* Load weight and bias blob */ @@ -700,7 +690,7 @@ int load_normalize(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerPara { const te_caffe::NormalizeParameter& normalize_param = layer_param.norm_param(); - struct normalize_param* param = ( struct normalize_param* )node->op.param_mem; + struct normalize_param* param = (struct normalize_param*)node->op.param_mem; param->across_spatial = normalize_param.across_spatial(); param->channel_shared = normalize_param.channel_shared(); @@ -708,10 +698,9 @@ int load_normalize(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerPara return 0; } - int load_scale(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param) { - struct scale_param* param = ( struct scale_param* )node->op.param_mem; + struct scale_param* param = (struct scale_param*)node->op.param_mem; const te_caffe::ScaleParameter& scale_param = layer_param.scale_param(); @@ -724,7 +713,6 @@ int load_scale(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParamete if (scale_param.has_bias_term()) param->bias_term = scale_param.bias_term(); - if (layer_param.blobs_size()) { std::vector name_list = {"gamma", "beta"}; @@ -738,7 +726,7 @@ int load_scale(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParamete int load_relu(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param) { - struct relu_param* param = ( struct relu_param* )node->op.param_mem; + struct relu_param* param = (struct relu_param*)node->op.param_mem; const te_caffe::ReLUParameter& caffe_param = layer_param.relu_param(); @@ -752,13 +740,12 @@ int load_relu(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter int load_split(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param) { - struct split_param* param = ( struct split_param* )node->op.param_mem; + struct split_param* param = (struct split_param*)node->op.param_mem; param->is_caffe = true; return 0; } - #if 0 int load_data(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param) { @@ -786,7 +773,7 @@ int load_pool(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter { const te_caffe::PoolingParameter& pool_param = layer_param.pooling_param(); - struct pool_param* param = ( struct pool_param* )node->op.param_mem; + struct pool_param* param = (struct pool_param*)node->op.param_mem; // param.alg = ConvertCaffePool(pool_param.pool()); if (pool_param.has_kernel_size()) @@ -830,7 +817,6 @@ int load_pool(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter param->caffe_flavor = 1; - return 0; } static EltType ConvertCaffeEltwise(te_caffe::EltwiseParameter_EltwiseOp method) @@ -847,7 +833,7 @@ static EltType ConvertCaffeEltwise(te_caffe::EltwiseParameter_EltwiseOp method) int load_eltwise(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param) { const te_caffe::EltwiseParameter& eltwise_param = layer_param.eltwise_param(); - struct eltwise_param* param = ( struct eltwise_param* )node->op.param_mem; + struct eltwise_param* param = (struct eltwise_param*)node->op.param_mem; // defalt: SUM param->type = ELT_SUM; if (eltwise_param.has_operation()) @@ -889,16 +875,16 @@ int load_input(ir_graph_t* graph, ir_node_t* ir_node, const te_caffe::LayerParam if (dim_num == 0) has_shape = 0; - #if 1 +#if 1 if (has_shape) { - int* dims = (int*)malloc(sizeof(int)*dim_num); - memset(dims, 0, sizeof(int)*dim_num); - for(int i = 0; i < dim_num ; i++) + int* dims = (int*)malloc(sizeof(int) * dim_num); + memset(dims, 0, sizeof(int) * dim_num); + for (int i = 0; i < dim_num; i++) dims[i] = dim[i]; set_ir_tensor_shape(tensor, dims, dim_num); } - #endif +#endif ir_node_t* node = create_ir_node(graph, val.c_str(), OP_INPUT, OP_VERSION); set_ir_node_output_tensor(node, 0, tensor); @@ -935,7 +921,6 @@ int LoadDeconvolutionBlob(ir_graph_t* graph, ir_node_t* node, const te_caffe::La return true; } - int LoadBiasBlob(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param) { if (layer_param.blobs_size()) @@ -948,7 +933,6 @@ int LoadBiasBlob(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParame return true; } - int LoadFullyConnectedBlob(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param) { if (layer_param.blobs_size()) @@ -995,11 +979,10 @@ int LoadBatchNormBlob(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerP { const te_caffe::BlobProto& rescale_blob = layer_param.blobs(2); - struct batchnorm_param* param = ( struct batchnorm_param* )node->op.param_mem; + struct batchnorm_param* param = (struct batchnorm_param*)node->op.param_mem; param->rescale_factor = rescale_blob.data(0); - /* for compatible reason, create the two tensors: gamma (1.0) and beta (0.0) */ /* get the dim, i.e., channel size */ @@ -1025,34 +1008,33 @@ int LoadBatchNormBlob(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerP */ void caffe_serializer::register_op_load() { - op_load_map["BatchNorm"] = std::pair(OP_UNARY, load_batchnorm); - op_load_map["Convolution"] = std::pair(OP_CONV, load_conv); - op_load_map["DeConvolution"] = std::pair(OP_DECONV, load_deconv); - op_load_map["Softmax"] = std::pair(OP_SOFTMAX, load_softmax); - op_load_map["PReLU"] = std::pair(OP_PRELU, load_prelu); - op_load_map["InnerProduct"] = std::pair(OP_FC, load_fc); - op_load_map["SoftmaxWithLoss"] = std::pair(OP_SOFTMAX, load_softmax); - op_load_map["Normalize"] = std::pair(OP_NORMALIZE, load_normalize); - op_load_map["Scale"] = std::pair(OP_SCALE, load_scale); - op_load_map["ReLU"] = std::pair(OP_RELU, load_relu); - op_load_map["Split"] = std::pair(OP_SPLIT, load_split); - op_load_map["Pooling"] = std::pair(OP_POOL, load_pool); - op_load_map["Eltwise"] = std::pair(OP_ELTWISE, load_eltwise); - op_load_map["Input"] = std::pair(OP_INPUT, load_input); - op_load_map["Data"] = std::pair(OP_INPUT, load_input); - - - blob_load_map["Convolution"] = LoadConvolutionBlob; + op_load_map["BatchNorm"] = std::pair(OP_UNARY, load_batchnorm); + op_load_map["Convolution"] = std::pair(OP_CONV, load_conv); + op_load_map["DeConvolution"] = std::pair(OP_DECONV, load_deconv); + op_load_map["Softmax"] = std::pair(OP_SOFTMAX, load_softmax); + op_load_map["PReLU"] = std::pair(OP_PRELU, load_prelu); + op_load_map["InnerProduct"] = std::pair(OP_FC, load_fc); + op_load_map["SoftmaxWithLoss"] = std::pair(OP_SOFTMAX, load_softmax); + op_load_map["Normalize"] = std::pair(OP_NORMALIZE, load_normalize); + op_load_map["Scale"] = std::pair(OP_SCALE, load_scale); + op_load_map["ReLU"] = std::pair(OP_RELU, load_relu); + op_load_map["Split"] = std::pair(OP_SPLIT, load_split); + op_load_map["Pooling"] = std::pair(OP_POOL, load_pool); + op_load_map["Eltwise"] = std::pair(OP_ELTWISE, load_eltwise); + op_load_map["Input"] = std::pair(OP_INPUT, load_input); + op_load_map["Data"] = std::pair(OP_INPUT, load_input); + + blob_load_map["Convolution"] = LoadConvolutionBlob; // blob_load_map["Deconvolution"] = LoadDeconvolutionBlob; - blob_load_map["InnerProduct"] = LoadFullyConnectedBlob; - blob_load_map["BatchNorm"] = LoadBatchNormBlob; - blob_load_map["Scale"] = LoadScaleBlob; + blob_load_map["InnerProduct"] = LoadFullyConnectedBlob; + blob_load_map["BatchNorm"] = LoadBatchNormBlob; + blob_load_map["Scale"] = LoadScaleBlob; // blob_load_map["PReLU"] = LoadPReLuBlob; // blob_load_map["Normalize"] = LoadNormalizeBlob; // blob_load_map["ConvolutionDepthwise"] = LoadConvolutionBlob; // blob_load_map["DepthwiseConvolution"] = LoadConvolutionBlob; - blob_load_map["Bias"] = LoadBiasBlob; - #if 0 + blob_load_map["Bias"] = LoadBiasBlob; +#if 0 op_load_map["Data"] = std::pair(OP_INPUT, load_data); op_load_map["Slice"] = std::pair(OP_SLICE, load_slice); op_load_map["Concat"] = std::pair(OP_CONCAT, load_concat); @@ -1087,8 +1069,7 @@ void caffe_serializer::register_op_load() op_load_map["MVN"] = std::pair(OP_MVN, load_mvn); op_load_map["Reduction"] = std::pair(OP_REDUCTION, load_reduction); op_load_map["Bias"] = std::pair(OP_BIAS, load_bias); - #endif - +#endif } /* * OPERAOTR REGISTER FUNCTION DEFINE FOR ONNX SERIALIZER END diff --git a/tools/convert_tool/caffe/caffe2tengine.hpp b/tools/convert_tool/caffe/caffe2tengine.hpp index 844623bfc..1a5ca52d9 100644 --- a/tools/convert_tool/caffe/caffe2tengine.hpp +++ b/tools/convert_tool/caffe/caffe2tengine.hpp @@ -36,19 +36,18 @@ #include #include -extern "C" -{ - #include "tengine/c_api.h" - #include "graph/graph.h" - #include "graph/subgraph.h" - #include "graph/node.h" - #include "graph/tensor.h" - #include "executer/executer.h" - #include "module/module.h" - #include "utility/log.h" - #include "utility/sys_port.h" - #include "utility/vector.h" - #include "../utils/save_graph/op_include.h" +extern "C" { +#include "tengine/c_api.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "executer/executer.h" +#include "module/module.h" +#include "utility/log.h" +#include "utility/sys_port.h" +#include "utility/vector.h" +#include "../utils/save_graph/op_include.h" } enum PoolArg @@ -63,24 +62,22 @@ class caffe_serializer public: graph_t caffe2tengine(std::string model_file, std::string proto_file); typedef int (*op_load_t)(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param); - typedef int (*blob_load_t)(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param); + typedef int (*blob_load_t)(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param); using name_map_t = std::unordered_map; private: - std::unordered_map> op_load_map; + std::unordered_map > op_load_map; std::unordered_map blob_load_map; int load_model(ir_graph_t* graph, std::string model_file, std::string proto_file); int load_graph_node(ir_graph_t* graph, const te_caffe::NetParameter test_net, const te_caffe::NetParameter train_net); int load_tensor_data(ir_graph_t* graph, const te_caffe::NetParameter test_net, const te_caffe::NetParameter train_net); int load_text_file(std::string model_file, te_caffe::NetParameter& caffe_net); int load_binary_file(std::string model_file, te_caffe::NetParameter& caffe_net); - bool find_op_load_method(const std::string& op_name); + bool find_op_load_method(const std::string& op_name); void register_op_load(); std::unordered_map train_name_map; std::vector output_tensors; - }; - #endif \ No newline at end of file diff --git a/tools/convert_tool/convert_tool.cpp b/tools/convert_tool/convert_tool.cpp index 8220ff302..a3d55374a 100644 --- a/tools/convert_tool/convert_tool.cpp +++ b/tools/convert_tool/convert_tool.cpp @@ -34,11 +34,11 @@ #include "utils/graph_optimizer/graph_opt.hpp" const char* help_params = "[Convert Tools Info]: optional arguments:\n" - "\t-h help show this help message and exit\n" - "\t-f input type path to input float32 tmfile\n" - "\t-p input structure path to the network structure of input model(*.prototxt, *.symbol, *.cfg, *.pdmodel)\n" - "\t-m input params path to the network params of input model(*.caffemodel, *.params, *.weight, *.pb, *.onnx, *.tflite, *.pdiparams)\n" - "\t-o output model path to output fp32 tmfile\n"; + "\t-h help show this help message and exit\n" + "\t-f input type path to input float32 tmfile\n" + "\t-p input structure path to the network structure of input model(*.prototxt, *.symbol, *.cfg, *.pdmodel)\n" + "\t-m input params path to the network params of input model(*.caffemodel, *.params, *.weight, *.pb, *.onnx, *.tflite, *.pdiparams)\n" + "\t-o output model path to output fp32 tmfile\n"; const char* example_params = "[Convert Tools Info]: example arguments:\n" "\t./convert_tool -f caffe -p ./mobilenet.prototxt -m ./mobilenet.caffemodel -o ./mobilenet.tmfile\n"; @@ -64,24 +64,24 @@ int main(int argc, char* argv[]) { switch (res) { - case 'f': - file_format = optarg; - break; - case 'p': - proto_file = optarg; - break; - case 'm': - model_file = optarg; - break; - case 'o': - output_tmfile = optarg; - break; - case 'h': - show_usage(); - return 0; - default: - show_usage(); - break; + case 'f': + file_format = optarg; + break; + case 'p': + proto_file = optarg; + break; + case 'm': + model_file = optarg; + break; + case 'o': + output_tmfile = optarg; + break; + case 'h': + show_usage(); + return 0; + default: + show_usage(); + break; } } @@ -105,8 +105,7 @@ int main(int argc, char* argv[]) model_file_needed = true; input_file_number = 2; } - else if (file_format == "caffe_single" || file_format == "onnx" || file_format == "tensorflow" || - file_format == "tflite") + else if (file_format == "caffe_single" || file_format == "onnx" || file_format == "tensorflow" || file_format == "tflite") { model_file_needed = true; input_file_number = 1; @@ -160,7 +159,7 @@ int main(int argc, char* argv[]) return -1; } } - + init_tengine(); set_log_level(LOG_INFO); graph_t graph = NULL; @@ -176,8 +175,8 @@ int main(int argc, char* argv[]) } else if (file_format == "ncnn") { - ncnn_serializer n2t; - graph = n2t.ncnn2tengine(model_file, proto_file); + ncnn_serializer n2t; + graph = n2t.ncnn2tengine(model_file, proto_file); } else { @@ -196,7 +195,7 @@ int main(int argc, char* argv[]) fprintf(stderr, "optimize graph failed! \n"); return -1; } - + if (save_graph(graph, output_tmfile.c_str()) < 0) { fprintf(stderr, "save graph failed! \n"); diff --git a/tools/convert_tool/ncnn/ncnn2tengine.cpp b/tools/convert_tool/ncnn/ncnn2tengine.cpp index f78b67d1d..f475a7230 100644 --- a/tools/convert_tool/ncnn/ncnn2tengine.cpp +++ b/tools/convert_tool/ncnn/ncnn2tengine.cpp @@ -28,7 +28,7 @@ * SELF DEFINE VARIABLE * FOR ONNX SERIALIZER */ -const int OP_VERSION=1; +const int OP_VERSION = 1; /* * ASSIST FUNCTIONS FOR NCNN SERIALIZER START @@ -130,21 +130,30 @@ int ncnn_serializer::read(void* buf, int size) { return fread(buf, 1, size, fp); } -void remove_ncnn_split(std::vector& nodelist){ - for(auto &curr_node : nodelist){ - if(curr_node.op == "Split"){ - for(auto &in_node : nodelist){ - if(in_node.output_name[0] == curr_node.inputs_name[0]){ +void remove_ncnn_split(std::vector& nodelist) +{ + for (auto& curr_node : nodelist) + { + if (curr_node.op == "Split") + { + for (auto& in_node : nodelist) + { + if (in_node.output_name[0] == curr_node.inputs_name[0]) + { auto out_name = in_node.output_name[0]; - for(auto &out_node : nodelist){ - for(auto &out_node_inbound_name : out_node.inputs_name){ - for(auto &curr_node_outbound_name : curr_node.output_name){ - if(out_node_inbound_name == curr_node_outbound_name){ + for (auto& out_node : nodelist) + { + for (auto& out_node_inbound_name : out_node.inputs_name) + { + for (auto& curr_node_outbound_name : curr_node.output_name) + { + if (out_node_inbound_name == curr_node_outbound_name) + { out_node.inputs_name.erase(std::remove( - out_node.inputs_name.begin(), - out_node.inputs_name.end(), - out_node_inbound_name - ), out_node.inputs_name.end()); + out_node.inputs_name.begin(), + out_node.inputs_name.end(), + out_node_inbound_name), + out_node.inputs_name.end()); out_node.inputs_name.push_back(in_node.output_name[0]); } } @@ -154,7 +163,7 @@ void remove_ncnn_split(std::vector& nodelist){ } } } - nodelist.erase(std::remove_if(nodelist.begin(), nodelist.end(), [&](NcnnNode& n){return n.op == "Split";}), nodelist.end()); + nodelist.erase(std::remove_if(nodelist.begin(), nodelist.end(), [&](NcnnNode& n) { return n.op == "Split"; }), nodelist.end()); } int ncnn_serializer::load_model_file(const char* fname, std::vector& nodelist) { @@ -169,10 +178,10 @@ int ncnn_serializer::load_model_file(const char* fname, std::vector& n int res = 0; int magic = 0; res = fscanf(fp, "%d=", &magic); - fprintf(stderr, "%s magic: %d \n",fname, magic); + fprintf(stderr, "%s magic: %d \n", fname, magic); if (magic != 7767517) { - TLOG_ERR("param is too old, please regenerate \n"); + TLOG_ERR("param is too old, please regenerate \n"); } int layer_count = 0; int blob_count = 0; @@ -196,7 +205,6 @@ int ncnn_serializer::load_model_file(const char* fname, std::vector& n node.optimized = 0; node.name = layer_name; - for (int j = 0; j < bottom_count; j++) { char bottom_name[256]; @@ -213,7 +221,7 @@ int ncnn_serializer::load_model_file(const char* fname, std::vector& n if (res < 0) { - TLOG_ERR( "Read Param file data failed\n"); + TLOG_ERR("Read Param file data failed\n"); return false; } while (fscanf(fp, "%d=", &id) == 1) @@ -238,8 +246,8 @@ int ncnn_serializer::load_model_file(const char* fname, std::vector& n return false; } - params[id].f_data_array = ( float* )malloc(sizeof(float) * len); - params[id].i_data_array = ( int* )malloc(sizeof(int) * len); + params[id].f_data_array = (float*)malloc(sizeof(float) * len); + params[id].i_data_array = (int*)malloc(sizeof(int) * len); // std::vector opt_str; std::string str = ""; for (int j = 0; j < len; j++) @@ -292,8 +300,8 @@ int ncnn_serializer::load_model_file(const char* fname, std::vector& n return false; } std::string str = ""; - params[id].f_data_array = ( float* )malloc(sizeof(float) * len); - params[id].i_data_array = ( int* )malloc(sizeof(int) * len); + params[id].f_data_array = (float*)malloc(sizeof(float) * len); + params[id].i_data_array = (int*)malloc(sizeof(int) * len); for (int j = 0; j < len; j++) { char vstr[16]; @@ -376,16 +384,15 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector& fp = fopen(fname, "rb"); if (!fp) { - TLOG_ERR("Cannot open the bin file: %d\n "); + TLOG_ERR("Cannot open the bin file: %d\n "); return false; } float magic = 0; int nscan = 0; - for (int i = 0; i < ( int )nodelist.size(); i++) + for (int i = 0; i < (int)nodelist.size(); i++) { - if (nodelist[i].op == "Convolution" || nodelist[i].op == "DeconvolutionDepthWise" || - nodelist[i].op == "Deconvolution" || nodelist[i].op == "ConvolutionDepthWise") + if (nodelist[i].op == "Convolution" || nodelist[i].op == "DeconvolutionDepthWise" || nodelist[i].op == "Deconvolution" || nodelist[i].op == "ConvolutionDepthWise") { NcnnParam weight; nscan = read(&magic, sizeof(float)); @@ -398,7 +405,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector& iter = nodelist[i].attrs.find(0); int output_channel = std::atoi(iter->second.c_str()); - weight.data = ( float* )malloc(sizeof(float) * weight.data_len); + weight.data = (float*)malloc(sizeof(float) * weight.data_len); read(weight.data, sizeof(float) * weight.data_len); // printf("%f %f \n", weight.data, weight.data); iter = nodelist[i].attrs.find(1); @@ -410,8 +417,8 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector& weight.dims.push_back(kernel_size); iter = nodelist[i].attrs.find(5); int biasTerm = 0; - - if(!iter->second.empty()) + + if (!iter->second.empty()) biasTerm = std::atoi(iter->second.c_str()); paramlist.push_back(weight); @@ -420,7 +427,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector& NcnnParam bias; bias.name = nodelist[i].name + "_b"; bias.data_len = output_channel; - bias.data = ( float* )malloc(sizeof(float) * output_channel); + bias.data = (float*)malloc(sizeof(float) * output_channel); read(bias.data, sizeof(float) * output_channel); bias.dims.push_back(output_channel); paramlist.push_back(bias); @@ -442,10 +449,10 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector& variance.data_len = std::atoi(iter->second.c_str()); bias.data_len = std::atoi(iter->second.c_str()); - bias.data = ( float* )malloc(sizeof(float) * slope.data_len); - variance.data = ( float* )malloc(sizeof(float) * slope.data_len); - slope.data = ( float* )malloc(sizeof(float) * slope.data_len); - mean.data = ( float* )malloc(sizeof(float) * slope.data_len); + bias.data = (float*)malloc(sizeof(float) * slope.data_len); + variance.data = (float*)malloc(sizeof(float) * slope.data_len); + slope.data = (float*)malloc(sizeof(float) * slope.data_len); + mean.data = (float*)malloc(sizeof(float) * slope.data_len); read(slope.data, sizeof(float) * slope.data_len); read(mean.data, sizeof(float) * slope.data_len); @@ -474,8 +481,8 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector& iter = nodelist[i].attrs.find(0); bias.data_len = std::atoi(iter->second.c_str()); - weight.data = ( float* )malloc(sizeof(float) * weight.data_len); - bias.data = ( float* )malloc(sizeof(float) * bias.data_len); + weight.data = (float*)malloc(sizeof(float) * weight.data_len); + bias.data = (float*)malloc(sizeof(float) * bias.data_len); read(weight.data, sizeof(float) * weight.data_len); read(bias.data, sizeof(float) * bias.data_len); weight.dims.push_back(weight.data_len); @@ -494,7 +501,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector& iter = nodelist[i].attrs.find(2); weight.data_len = std::atoi(iter->second.c_str()); - weight.data = ( float* )malloc(sizeof(float) * weight.data_len); + weight.data = (float*)malloc(sizeof(float) * weight.data_len); read(weight.data, sizeof(float) * weight.data_len); weight.dims.push_back(output_num); weight.dims.push_back(weight.data_len / output_num); @@ -506,7 +513,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector& NcnnParam bias; bias.name = nodelist[i].name + "_b"; bias.data_len = output_num; - bias.data = ( float* )malloc(sizeof(float) * output_num); + bias.data = (float*)malloc(sizeof(float) * output_num); read(bias.data, sizeof(float) * output_num); bias.dims.push_back(output_num); paramlist.push_back(bias); @@ -520,7 +527,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector& std::map::iterator iter; iter = nodelist[i].attrs.find(3); scale.data_len = std::atoi(iter->second.c_str()); - scale.data = ( float* )malloc(sizeof(float) * scale.data_len); + scale.data = (float*)malloc(sizeof(float) * scale.data_len); read(scale.data, sizeof(float) * scale.data_len); scale.dims.push_back(scale.data_len); paramlist.push_back(scale); @@ -533,7 +540,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector& std::map::iterator iter; iter = nodelist[i].attrs.find(0); slope.data_len = std::atoi(iter->second.c_str()); - slope.data = ( float* )malloc(sizeof(float) * slope.data_len); + slope.data = (float*)malloc(sizeof(float) * slope.data_len); read(slope.data, sizeof(float) * slope.data_len); slope.dims.push_back(slope.data_len); paramlist.push_back(slope); @@ -546,7 +553,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector& std::map::iterator iter; iter = nodelist[i].attrs.find(0); scale.data_len = std::atoi(iter->second.c_str()); - scale.data = ( float* )malloc(sizeof(float) * scale.data_len); + scale.data = (float*)malloc(sizeof(float) * scale.data_len); read(scale.data, sizeof(float) * scale.data_len); scale.dims.push_back(scale.data_len); paramlist.push_back(scale); @@ -558,36 +565,37 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector& NcnnParam bias; bias.name = nodelist[i].name + "_b"; bias.data_len = scale.data_len; - bias.data = ( float* )malloc(sizeof(float) * scale.data_len); + bias.data = (float*)malloc(sizeof(float) * scale.data_len); read(bias.data, sizeof(float) * scale.data_len); bias.dims.push_back(scale.data_len); paramlist.push_back(bias); - } + } } - else if(nodelist[i].op == "MemoryData"){ + else if (nodelist[i].op == "MemoryData") + { NcnnParam const_data; std::map::iterator iter; int data_len = 1; int size = (int)nodelist[i].attrs.size(); std::vector dims(size); - for(iter = nodelist[i].attrs.begin(); iter != nodelist[i].attrs.end(); iter++) + for (iter = nodelist[i].attrs.begin(); iter != nodelist[i].attrs.end(); iter++) { std::pair pair = *iter; data_len *= atoi(pair.second.c_str()); dims[pair.first] = atoi(pair.second.c_str()); } const_data.name = nodelist[i].name; - const_data.dim_size = (int) dims.size(); + const_data.dim_size = (int)dims.size(); const_data.dims = dims; const_data.data_len = data_len; - const_data.data = (float*)malloc(sizeof(float)*data_len); - read(const_data.data, sizeof(float)* data_len); + const_data.data = (float*)malloc(sizeof(float) * data_len); + read(const_data.data, sizeof(float) * data_len); paramlist.push_back(const_data); } } if (nscan < 0) { - TLOG_ERR( "Cannot read the binary file: %s \n " , fname ); + TLOG_ERR("Cannot read the binary file: %s \n ", fname); } #if 0 printf("total size: %d \n", totalSize); @@ -619,7 +627,7 @@ int ncnn_serializer::load_constant_tensor(ir_graph_t* graph, const std::vector dims = ncnn_tensor.dims; ir_tensor_t* ir_tensor = create_ir_tensor(graph, ncnn_tensor.name.c_str(), TENGINE_DT_FP32); - int *tensor_dims = new int[(int)dims.size()]; + int* tensor_dims = new int[(int)dims.size()]; for (int j = 0; j < (int)dims.size(); j++) { tensor_dims[j] = ncnn_tensor.dims[j]; @@ -629,8 +637,8 @@ int ncnn_serializer::load_constant_tensor(ir_graph_t* graph, const std::vectordata = (float*)malloc(tensor_size); - float* mem_buf = ( float* )ir_tensor->data; - float* raw_data = ( float* )ncnn_tensor.data; + float* mem_buf = (float*)ir_tensor->data; + float* raw_data = (float*)ncnn_tensor.data; /* load data */ for (int k = 0; k < ncnn_tensor.data_len; k++) { @@ -639,7 +647,7 @@ int ncnn_serializer::load_constant_tensor(ir_graph_t* graph, const std::vector '9') @@ -775,7 +784,7 @@ int ncnn_serializer::set_graph_input(ir_graph_t* graph, const std::vector ir_dims = param.dims; - int *tensor_dims = new int[ir_dims.size()]; + int* tensor_dims = new int[ir_dims.size()]; for (int j = 0; j < ir_dims.size(); j++) { tensor_dims[j] = ir_dims[j]; } - if (ir_dims.size() > 0); - set_ir_tensor_shape(ir_tensor, tensor_dims, ir_dims.size()); + if (ir_dims.size() > 0) + ; + set_ir_tensor_shape(ir_tensor, tensor_dims, ir_dims.size()); } ir_node_t* node = create_ir_node(graph, input_name.c_str(), OP_INPUT, OP_VERSION); set_ir_node_output_tensor(node, 0, ir_tensor); @@ -820,19 +830,20 @@ int ncnn_serializer::set_graph_output(ir_graph_t* graph, const std::vectorconsumer_num == 0){ - + if (ir_tensor->consumer_num == 0) + { NcnnParam param; if (GetParam(input_name, paramlist, param)) { std::vector ir_dims = param.dims; - int *tensor_dims = new int[ir_dims.size()]; + int* tensor_dims = new int[ir_dims.size()]; for (int j = 0; j < ir_dims.size(); j++) { tensor_dims[j] = ir_dims[j]; } - if (ir_dims.size() > 0); - set_ir_tensor_shape(ir_tensor, tensor_dims, ir_dims.size()); + if (ir_dims.size() > 0) + ; + set_ir_tensor_shape(ir_tensor, tensor_dims, ir_dims.size()); } ir_node_t* node = create_ir_node(graph, input_name.c_str(), OP_INPUT, OP_VERSION); @@ -851,7 +862,7 @@ int ncnn_serializer::set_graph_output(ir_graph_t* graph, const std::vectorname == tensor_name) return tensor; } - + return nullptr; } int ncnn_serializer::load_graph_node(ir_graph_t* graph, const std::vector& nodelist, const std::vector& paramlist) @@ -875,8 +886,8 @@ int ncnn_serializer::load_graph_node(ir_graph_t* graph, const std::vector::const_iterator const_iterator; int load_conv(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct conv_param* param = ( struct conv_param* )node->op.param_mem; + struct conv_param* param = (struct conv_param*)node->op.param_mem; const_iterator iter; @@ -1082,7 +1091,7 @@ int load_conv(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) } int load_pool(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct pool_param* param = ( struct pool_param* )node->op.param_mem; + struct pool_param* param = (struct pool_param*)node->op.param_mem; const_iterator iter; iter = ncnn_node.attrs.find(0); if (iter != ncnn_node.attrs.end()) @@ -1128,13 +1137,13 @@ int load_pool(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) } int load_relu(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct relu_param* relu_param = ( struct relu_param* )node->op.param_mem; + struct relu_param* relu_param = (struct relu_param*)node->op.param_mem; return 0; } int load_concat(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct concat_param* param = ( struct concat_param* )node->op.param_mem; + struct concat_param* param = (struct concat_param*)node->op.param_mem; const_iterator iter; iter = ncnn_node.attrs.find(0); if (iter != ncnn_node.attrs.end()) @@ -1144,7 +1153,7 @@ int load_concat(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) } int load_softmax(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct softmax_param* param = ( struct softmax_param* )node->op.param_mem; + struct softmax_param* param = (struct softmax_param*)node->op.param_mem; const_iterator iter; iter = ncnn_node.attrs.find(0); @@ -1164,7 +1173,7 @@ int load_no_param(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) } int load_bn(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct batchnorm_param* param = ( struct batchnorm_param* )node->op.param_mem; + struct batchnorm_param* param = (struct batchnorm_param*)node->op.param_mem; const_iterator iter; @@ -1176,7 +1185,7 @@ int load_bn(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) } int load_scale(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct scale_param* param = ( struct scale_param* )node->op.param_mem; + struct scale_param* param = (struct scale_param*)node->op.param_mem; const_iterator iter; @@ -1188,7 +1197,7 @@ int load_scale(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) } int load_clip(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct clip_param* param = ( struct clip_param* )node->op.param_mem; + struct clip_param* param = (struct clip_param*)node->op.param_mem; const_iterator iter; iter = ncnn_node.attrs.find(1); @@ -1203,7 +1212,7 @@ int load_clip(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) } int load_fc(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct fc_param* param = ( struct fc_param* )node->op.param_mem; + struct fc_param* param = (struct fc_param*)node->op.param_mem; const_iterator iter; @@ -1215,8 +1224,7 @@ int load_fc(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) } int load_flatten(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct flatten_param* param = ( struct flatten_param* )node->op.param_mem; - + struct flatten_param* param = (struct flatten_param*)node->op.param_mem; param->axis = 1; @@ -1224,7 +1232,7 @@ int load_flatten(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) } int load_reshape(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct reshape_param* param = ( struct reshape_param* )node->op.param_mem; + struct reshape_param* param = (struct reshape_param*)node->op.param_mem; std::vector dim_shape; const_iterator iter; iter = ncnn_node.attrs.find(3); @@ -1246,8 +1254,10 @@ int load_reshape(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { dim_shape.push_back(std::atoi(iter->second.c_str())); } - }else { - dim_shape.push_back(0); + } + else + { + dim_shape.push_back(0); } iter = ncnn_node.attrs.find(1); if (iter != ncnn_node.attrs.end()) @@ -1270,7 +1280,7 @@ int load_reshape(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) param->re_shape = (int*)sys_malloc(sizeof(int) * size); param->dim_size = size; for (int i = 0; i < size; i++) - { + { param->re_shape[i] = dim_shape[i]; } @@ -1278,7 +1288,7 @@ int load_reshape(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) } int load_eltwise(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct eltwise_param* param = ( struct eltwise_param* )node->op.param_mem; + struct eltwise_param* param = (struct eltwise_param*)node->op.param_mem; const_iterator iter; std::vector coef; @@ -1309,7 +1319,7 @@ int load_eltwise(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) } int load_resize(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct interp_param* param = ( struct interp_param* )node->op.param_mem; + struct interp_param* param = (struct interp_param*)node->op.param_mem; std::vector v1, v2; const_iterator iter; @@ -1323,7 +1333,9 @@ int load_resize(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { ParseAttr_n(iter->second, v1); param->width_scale = v1.at(0); - } else { + } + else + { param->width_scale = 0; } iter = ncnn_node.attrs.find(2); @@ -1331,16 +1343,20 @@ int load_resize(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { ParseAttr_n(iter->second, v2); param->height_scale = v2.at(0); - } else { + } + else + { param->height_scale = 0; } iter = ncnn_node.attrs.find(3); - if(iter != ncnn_node.attrs.end()){ + if (iter != ncnn_node.attrs.end()) + { ParseAttr_n(iter->second, v2); param->output_width = v2.at(0); } iter = ncnn_node.attrs.find(4); - if(iter != ncnn_node.attrs.end()){ + if (iter != ncnn_node.attrs.end()) + { ParseAttr_n(iter->second, v2); param->output_height = v2.at(0); } @@ -1348,7 +1364,7 @@ int load_resize(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) } int load_slice(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct slice_param* param = ( struct slice_param* )node->op.param_mem; + struct slice_param* param = (struct slice_param*)node->op.param_mem; // param->isncnn= true; param->iscaffe = false; param->ismxnet = false; @@ -1357,75 +1373,78 @@ int load_slice(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) const_iterator iter; iter = ncnn_node.attrs.find(0); std::vector v1; - if(iter != ncnn_node.attrs.end()){ + if (iter != ncnn_node.attrs.end()) + { ParseAttr_n(iter->second, v1); std::vector slice_shape; - for(int i = 0; i < (int)v1.size(); i++){ + for (int i = 0; i < (int)v1.size(); i++) + { // param->slice_point_.push_back((int)v1.at(i)); } } iter = ncnn_node.attrs.find(1); - if(iter != ncnn_node.attrs.end()){ - param->axis = std::atoi(iter->second.c_str())+1; + if (iter != ncnn_node.attrs.end()) + { + param->axis = std::atoi(iter->second.c_str()) + 1; } return 0; } int load_unary(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct unary_param* param = ( struct unary_param* )node->op.param_mem; + struct unary_param* param = (struct unary_param*)node->op.param_mem; const_iterator iter; iter = ncnn_node.attrs.find(0); - if(iter != ncnn_node.attrs.end()) + if (iter != ncnn_node.attrs.end()) param->type = std::atoi(iter->second.c_str()); - + return 0; } int load_deconv(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) { - struct deconv_param* param = ( struct deconv_param* )node->op.param_mem; + struct deconv_param* param = (struct deconv_param*)node->op.param_mem; const_iterator iter; std::vector v1; iter = ncnn_node.attrs.find(0); - if(iter != ncnn_node.attrs.end()) + if (iter != ncnn_node.attrs.end()) { param->num_output = std::atoi(iter->second.c_str()); } iter = ncnn_node.attrs.find(1); - if(iter != ncnn_node.attrs.end()) + if (iter != ncnn_node.attrs.end()) { param->kernel_w = std::atoi(iter->second.c_str()); param->kernel_h = std::atoi(iter->second.c_str()); } iter = ncnn_node.attrs.find(11); - if(iter != ncnn_node.attrs.end()) + if (iter != ncnn_node.attrs.end()) { param->kernel_h = std::atoi(iter->second.c_str()); } iter = ncnn_node.attrs.find(2); - if(iter != ncnn_node.attrs.end()) + if (iter != ncnn_node.attrs.end()) { param->dilation_w = std::atoi(iter->second.c_str()); param->dilation_h = std::atoi(iter->second.c_str()); } iter = ncnn_node.attrs.find(12); - if(iter != ncnn_node.attrs.end()) + if (iter != ncnn_node.attrs.end()) { param->dilation_h = std::atoi(iter->second.c_str()); } iter = ncnn_node.attrs.find(3); - if(iter != ncnn_node.attrs.end()) + if (iter != ncnn_node.attrs.end()) { param->stride_h = std::atoi(iter->second.c_str()); param->stride_w = std::atoi(iter->second.c_str()); } iter = ncnn_node.attrs.find(13); - if(iter != ncnn_node.attrs.end()) + if (iter != ncnn_node.attrs.end()) { param->stride_w = std::atoi(iter->second.c_str()); - } + } iter = ncnn_node.attrs.find(4); - if(iter != ncnn_node.attrs.end()) + if (iter != ncnn_node.attrs.end()) { param->pad_w0 = std::atoi(iter->second.c_str()); param->pad_w1 = std::atoi(iter->second.c_str()); @@ -1433,22 +1452,22 @@ int load_deconv(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) param->pad_h1 = std::atoi(iter->second.c_str()); } iter = ncnn_node.attrs.find(15); - if(iter != ncnn_node.attrs.end()) + if (iter != ncnn_node.attrs.end()) { param->pad_w1 = std::atoi(iter->second.c_str()); - } + } iter = ncnn_node.attrs.find(16); - if(iter != ncnn_node.attrs.end()) + if (iter != ncnn_node.attrs.end()) { param->pad_h0 = std::atoi(iter->second.c_str()); - } + } iter = ncnn_node.attrs.find(17); - if(iter != ncnn_node.attrs.end()) + if (iter != ncnn_node.attrs.end()) { param->pad_h1 = std::atoi(iter->second.c_str()); } iter = ncnn_node.attrs.find(7); - if(iter != ncnn_node.attrs.end()) + if (iter != ncnn_node.attrs.end()) { param->group = std::atoi(iter->second.c_str()); } @@ -1460,26 +1479,26 @@ int load_deconv(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node) */ void ncnn_serializer::register_op_load() { - op_load_map["Convolution"] = std::pair(OP_CONV, load_conv); - op_load_map["ConvolutionDepthWise"] = std::pair(OP_CONV, load_conv); - op_load_map["Pooling"] = std::pair(OP_POOL, load_pool); - op_load_map["ReLU"] = std::pair(OP_RELU, load_relu); - op_load_map["Concat"] = std::pair(OP_CONCAT, load_concat); - op_load_map["Softmax"] = std::pair(OP_SOFTMAX, load_softmax); - op_load_map["Dropout"] = std::pair(OP_DROPOUT, load_no_param); - op_load_map["BatchNorm"] = std::pair(OP_BATCHNORM, load_bn); - op_load_map["Scale"] = std::pair(OP_SCALE, load_scale); - op_load_map["Clip"] = std::pair(OP_CLIP, load_clip); - op_load_map["InnerProduct"] = std::pair(OP_FC, load_fc); + op_load_map["Convolution"] = std::pair(OP_CONV, load_conv); + op_load_map["ConvolutionDepthWise"] = std::pair(OP_CONV, load_conv); + op_load_map["Pooling"] = std::pair(OP_POOL, load_pool); + op_load_map["ReLU"] = std::pair(OP_RELU, load_relu); + op_load_map["Concat"] = std::pair(OP_CONCAT, load_concat); + op_load_map["Softmax"] = std::pair(OP_SOFTMAX, load_softmax); + op_load_map["Dropout"] = std::pair(OP_DROPOUT, load_no_param); + op_load_map["BatchNorm"] = std::pair(OP_BATCHNORM, load_bn); + op_load_map["Scale"] = std::pair(OP_SCALE, load_scale); + op_load_map["Clip"] = std::pair(OP_CLIP, load_clip); + op_load_map["InnerProduct"] = std::pair(OP_FC, load_fc); // op_load_map["PriorBox"] = std::pair(); - op_load_map["Flatten"] = std::pair(OP_FLATTEN, load_flatten); - op_load_map["Reshape"] = std::pair(OP_RESHAPE, load_reshape); - op_load_map["Eltwise"] = std::pair(OP_ELTWISE, load_eltwise); - op_load_map["Interp"] = std::pair(OP_INTERP, load_resize); - op_load_map["Slice"] = std::pair(OP_SLICE, load_slice); - op_load_map["Sigmoid"] = std::pair(OP_SIGMOID, load_no_param); - op_load_map["UnaryOp"] = std::pair(OP_UNARY, load_unary); - op_load_map["DeconvolutionDepthWise"] = std::pair(OP_DECONV, load_deconv); + op_load_map["Flatten"] = std::pair(OP_FLATTEN, load_flatten); + op_load_map["Reshape"] = std::pair(OP_RESHAPE, load_reshape); + op_load_map["Eltwise"] = std::pair(OP_ELTWISE, load_eltwise); + op_load_map["Interp"] = std::pair(OP_INTERP, load_resize); + op_load_map["Slice"] = std::pair(OP_SLICE, load_slice); + op_load_map["Sigmoid"] = std::pair(OP_SIGMOID, load_no_param); + op_load_map["UnaryOp"] = std::pair(OP_UNARY, load_unary); + op_load_map["DeconvolutionDepthWise"] = std::pair(OP_DECONV, load_deconv); } /* * OPERAOTR REGISTER FUNCTION DEFINE FOR NCNN SERIALIZER END diff --git a/tools/convert_tool/ncnn/ncnn2tengine.hpp b/tools/convert_tool/ncnn/ncnn2tengine.hpp index d55149a21..e4f06baaf 100644 --- a/tools/convert_tool/ncnn/ncnn2tengine.hpp +++ b/tools/convert_tool/ncnn/ncnn2tengine.hpp @@ -36,19 +36,18 @@ #include #include -extern "C" -{ - #include "tengine/c_api.h" - #include "graph/graph.h" - #include "graph/subgraph.h" - #include "graph/node.h" - #include "graph/tensor.h" - #include "executer/executer.h" - #include "module/module.h" - #include "utility/log.h" - #include "utility/sys_port.h" - #include "utility/vector.h" - #include "../utils/save_graph/op_include.h" +extern "C" { +#include "tengine/c_api.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "executer/executer.h" +#include "module/module.h" +#include "utility/log.h" +#include "utility/sys_port.h" +#include "utility/vector.h" +#include "../utils/save_graph/op_include.h" } #define NCNN_MAX_PARAM_COUNT 32 @@ -58,7 +57,7 @@ struct NcnnNode std::string name; int optimized; std::map attrs; - std::map> opt_attrs; + std::map > opt_attrs; //std::vector inputs; std::vector inputs_name; std::vector output_name; @@ -81,13 +80,13 @@ class ncnn_serializer typedef std::map::const_iterator const_iterator; private: - std::unordered_map> op_load_map; + std::unordered_map > op_load_map; int load_model(ir_graph_t* graph, std::string params_file, std::string bin_file); int set_graph_input(ir_graph_t* graph, const std::vector& nodelist, const std::vector& paramlist); int load_constant_tensor(ir_graph_t* graph, const std::vector& nodelist, const std::vector& paramlist); int load_binary_file(const char* fname, std::vector& paramlist, std::vector& nodelist); int load_model_file(const char* fname, std::vector& nodelist); - int load_graph_node(ir_graph_t* graph,const std::vector& nodelist, const std::vector& paramlist); + int load_graph_node(ir_graph_t* graph, const std::vector& nodelist, const std::vector& paramlist); bool find_op_load_method(const std::string& op_name); int read(void* buf, int size); ir_tensor_t* find_tensor(ir_graph_t* graph, const std::string& tensor_name); @@ -98,12 +97,16 @@ class ncnn_serializer struct { int loaded; - union { int i; float f; }; + union + { + int i; + float f; + }; float* f_data; int* i_data; float* f_data_array; int* i_data_array; - } params[NCNN_MAX_PARAM_COUNT]; + } params[NCNN_MAX_PARAM_COUNT]; }; #endif \ No newline at end of file diff --git a/tools/convert_tool/onnx/onnx2tengine.cpp b/tools/convert_tool/onnx/onnx2tengine.cpp index 5c70f6f2b..aa152df69 100644 --- a/tools/convert_tool/onnx/onnx2tengine.cpp +++ b/tools/convert_tool/onnx/onnx2tengine.cpp @@ -25,12 +25,11 @@ #include "onnx2tengine.hpp" - /* * SELF DEFINE VARIABLE * FOR ONNX SERIALIZER */ -const int OP_VERSION=1; +const int OP_VERSION = 1; static int op_set; /* @@ -38,7 +37,7 @@ static int op_set; */ bool onnx_serializer::find_op_load_method(const std::string& op_name) { - if(op_load_map.count(op_name)) + if (op_load_map.count(op_name)) return true; return false; @@ -52,9 +51,9 @@ ir_tensor_t* find_tensor(ir_graph_t* graph, const std::string& tensor_name) if (tensor->name == tensor_name) { return tensor; - } + } } - + return nullptr; } @@ -76,29 +75,29 @@ const int get_onnx_tensor_data_type(const onnx::TensorProto& onnx_tensor) int tensor_data_type = -1; switch (onnx_tensor.data_type()) { - case 1: - tensor_data_type = TENGINE_DT_FP32; - break; - case 2: - tensor_data_type = TENGINE_DT_UINT8; - break; - case 3: - tensor_data_type = TENGINE_DT_INT8; - break; - case 5: - tensor_data_type = TENGINE_DT_INT16; - break; - case 6: // int 32 - case 7: // int 64 - tensor_data_type = TENGINE_DT_INT32; - break; - case 10: - tensor_data_type = TENGINE_DT_FP16; - break; - - default: - fprintf(stderr, "tensor: %s. data type unsupported in get data type: %d.\n", onnx_tensor.name().c_str(), onnx_tensor.data_type()); - return -1; + case 1: + tensor_data_type = TENGINE_DT_FP32; + break; + case 2: + tensor_data_type = TENGINE_DT_UINT8; + break; + case 3: + tensor_data_type = TENGINE_DT_INT8; + break; + case 5: + tensor_data_type = TENGINE_DT_INT16; + break; + case 6: // int 32 + case 7: // int 64 + tensor_data_type = TENGINE_DT_INT32; + break; + case 10: + tensor_data_type = TENGINE_DT_FP16; + break; + + default: + fprintf(stderr, "tensor: %s. data type unsupported in get data type: %d.\n", onnx_tensor.name().c_str(), onnx_tensor.data_type()); + return -1; } return tensor_data_type; @@ -122,11 +121,11 @@ onnx::TensorProto get_node_attr_tensor(const onnx::NodeProto& node, const char* * ASSIST FUNCTIONS FOR ONNX SERIALIZER END */ -int onnx_serializer::load_model_file(std::string model_file, onnx::ModelProto &model) +int onnx_serializer::load_model_file(std::string model_file, onnx::ModelProto& model) { std::ifstream is(model_file, std::ios::in | std::ios::binary); - if(!is.is_open()) + if (!is.is_open()) { fprintf(stderr, "cannot open file: %s \n", model_file.c_str()); return -1; @@ -145,7 +144,7 @@ int onnx_serializer::load_model_file(std::string model_file, onnx::ModelProto &m is.close(); - if(!ret) + if (!ret) { fprintf(stderr, "onnx serializer: parse file: %s \n", model_file.c_str()); return -1; @@ -190,14 +189,12 @@ int onnx_serializer::load_constant_tensor(ir_graph_t* graph, const onnx::GraphPr } for (int i = 0; i < node_count; i++) { - const onnx::NodeProto& node = onnx_graph.node(i); const std::string& op = node.op_type(); - - if ((op == "Reshape" || op == "Gather" || op == "Div" || op == "Resize") ) - { + if ((op == "Reshape" || op == "Gather" || op == "Div" || op == "Resize")) + { const onnx::TensorProto& onnx_tensor = node_tensor[node.input(1)]; std::pair t(node.input(1), 0); tensor_check.insert(t); @@ -206,10 +203,10 @@ int onnx_serializer::load_constant_tensor(ir_graph_t* graph, const onnx::GraphPr { return -1; } - + const char* name = node.input(1).c_str(); int dim_num = onnx_tensor.dims_size(); - int *dims = new int[dim_num]; + int* dims = new int[dim_num]; for (int j = 0; j < dim_num; j++) { dims[j] = onnx_tensor.dims(j); @@ -225,19 +222,19 @@ int onnx_serializer::load_constant_tensor(ir_graph_t* graph, const onnx::GraphPr set_ir_tensor_shape(ir_tensor, dims, dim_num); ir_tensor->tensor_type = TENSOR_TYPE_CONST; // set tensor data - if ( 7 == onnx_tensor.data_type()) + if (7 == onnx_tensor.data_type()) { - int tensor_size = ir_tensor->elem_num * sizeof(int64_t); + int tensor_size = ir_tensor->elem_num * sizeof(int64_t); ir_tensor->data = sys_malloc(tensor_size); int64_t* mem_buf = (int64_t*)ir_tensor->data; - if(onnx_tensor.has_raw_data()) + if (onnx_tensor.has_raw_data()) { int64_t* raw_data = (int64_t*)onnx_tensor.raw_data().data(); for (int j = 0; j < ir_tensor->elem_num; j++) { mem_buf[j] = raw_data[j]; } - } + } else { int64_t* raw_data = (int64_t*)onnx_tensor.int64_data().data(); @@ -249,10 +246,10 @@ int onnx_serializer::load_constant_tensor(ir_graph_t* graph, const onnx::GraphPr } else { - int tensor_size = ir_tensor->elem_num * sizeof(uint8_t); + int tensor_size = ir_tensor->elem_num * sizeof(uint8_t); ir_tensor->data = sys_malloc(tensor_size); uint8_t* mem_buf = (uint8_t*)ir_tensor->data; - if(onnx_tensor.has_raw_data()) + if (onnx_tensor.has_raw_data()) { uint8_t* raw_data = (uint8_t*)onnx_tensor.raw_data().data(); for (int j = 0; j < ir_tensor->elem_num; j++) @@ -272,9 +269,8 @@ int onnx_serializer::load_constant_tensor(ir_graph_t* graph, const onnx::GraphPr ir_node_t* ir_node = create_ir_node(graph, name, OP_CONST, OP_VERSION); set_ir_node_output_tensor(ir_node, 0, ir_tensor); } - } - + return 0; } @@ -284,7 +280,7 @@ int onnx_serializer::load_initializer_tensor(ir_graph_t* graph, const onnx::Grap for (int i = 0; i < const_tensor_num; i++) { const onnx::TensorProto& onnx_tensor = onnx_graph.initializer(i); - + if (onnx_tensor.data_type() != 1 && onnx_tensor.data_type() != 6 && onnx_tensor.data_type() != 7) // fp32 int32 int64 { fprintf(stderr, "const tensor data type is not fp32 or int32 or int64. \n"); @@ -300,7 +296,7 @@ int onnx_serializer::load_initializer_tensor(ir_graph_t* graph, const onnx::Grap } const char* name = onnx_tensor.name().c_str(); int dim_num = onnx_tensor.dims_size(); - int *dims = new int[dim_num]; + int* dims = new int[dim_num]; for (int j = 0; j < dim_num; j++) { dims[j] = onnx_tensor.dims(j); @@ -320,12 +316,12 @@ int onnx_serializer::load_initializer_tensor(ir_graph_t* graph, const onnx::Grap ir_tensor->dim_num = 1; ir_tensor->dims[0] = 1; } - + if (onnx_tensor.has_raw_data()) { if (onnx_tensor.data_type() == 1) //fp32 { - int tensor_size = ir_tensor->elem_num * sizeof(float); + int tensor_size = ir_tensor->elem_num * sizeof(float); ir_tensor->data = sys_malloc(tensor_size); float* mem_buf = (float*)ir_tensor->data; float* raw_data = (float*)onnx_tensor.raw_data().c_str(); @@ -336,7 +332,7 @@ int onnx_serializer::load_initializer_tensor(ir_graph_t* graph, const onnx::Grap } else if (onnx_tensor.data_type() == 6) // int32 { - int tensor_size = ir_tensor->elem_num * sizeof(int32_t); + int tensor_size = ir_tensor->elem_num * sizeof(int32_t); ir_tensor->data = sys_malloc(tensor_size); int32_t* mem_buf = (int32_t*)ir_tensor->data; int32_t* raw_data = (int32_t*)onnx_tensor.raw_data().data(); @@ -347,7 +343,7 @@ int onnx_serializer::load_initializer_tensor(ir_graph_t* graph, const onnx::Grap } else if (onnx_tensor.data_type() == 7) // int64 { - int tensor_size = ir_tensor->elem_num * sizeof(int64_t); + int tensor_size = ir_tensor->elem_num * sizeof(int64_t); ir_tensor->data = sys_malloc(tensor_size); int64_t* mem_buf = (int64_t*)ir_tensor->data; int64_t* raw_data = (int64_t*)onnx_tensor.raw_data().data(); @@ -403,7 +399,7 @@ int onnx_serializer::load_initializer_tensor(ir_graph_t* graph, const onnx::Grap return -1; } } - + ir_node_t* ir_node = create_ir_node(graph, name, OP_CONST, OP_VERSION); set_ir_node_output_tensor(ir_node, 0, ir_tensor); } @@ -416,7 +412,7 @@ int onnx_serializer::set_graph_input(ir_graph_t* graph, const onnx::GraphProto& for (int i = 0; i < onnx_graph.input_size(); i++) { const onnx::ValueInfoProto& val = onnx_graph.input(i); - if(get_ir_tensor_index_from_name(graph, val.name().c_str()) != -1) + if (get_ir_tensor_index_from_name(graph, val.name().c_str()) != -1) continue; // now, catch an input tensor @@ -424,11 +420,11 @@ int onnx_serializer::set_graph_input(ir_graph_t* graph, const onnx::GraphProto& const onnx::TypeProto::Tensor& tensor_type = type.tensor_type(); const onnx::TensorShapeProto& shape = tensor_type.shape(); int has_shape = 1; - int *dims = new int[shape.dim_size()]; - for(int j = 0; j < shape.dim_size(); j++) + int* dims = new int[shape.dim_size()]; + for (int j = 0; j < shape.dim_size(); j++) { const onnx::TensorShapeProto::Dimension& dim = shape.dim(j); - if(dim.has_dim_param()) + if (dim.has_dim_param()) { has_shape = 0; break; @@ -460,26 +456,26 @@ int onnx_serializer::load_graph_node(ir_graph_t* graph, const onnx::GraphProto& { int i; std::vector no_supported_op; - for(i = 0; i < onnx_graph.node_size(); i++) + for (i = 0; i < onnx_graph.node_size(); i++) { const onnx::NodeProto& onnx_node = onnx_graph.node(i); const std::string& onnx_op_name = onnx_node.op_type(); - if(!find_op_load_method(onnx_op_name)) + if (!find_op_load_method(onnx_op_name)) { - auto it = find(no_supported_op.begin(),no_supported_op.end(),onnx_op_name); - if(it == no_supported_op.end()) + auto it = find(no_supported_op.begin(), no_supported_op.end(), onnx_op_name); + if (it == no_supported_op.end()) { - if(onnx_op_name == "Constant") + if (onnx_op_name == "Constant") continue; no_supported_op.push_back(onnx_op_name); } } } - if(no_supported_op.size()) + if (no_supported_op.size()) { fprintf(stderr, "These %zu op are not supported\n{ ", no_supported_op.size()); - for(int j = 0; j < (int) no_supported_op.size(); j++) + for (int j = 0; j < (int)no_supported_op.size(); j++) { fprintf(stderr, "%s ", no_supported_op[j].c_str()); } @@ -487,7 +483,7 @@ int onnx_serializer::load_graph_node(ir_graph_t* graph, const onnx::GraphProto& return -1; } - for(i = 0; i < onnx_graph.node_size(); i++) + for (i = 0; i < onnx_graph.node_size(); i++) { /* create ir node*/ const onnx::NodeProto& onnx_node = onnx_graph.node(i); @@ -515,7 +511,7 @@ int onnx_serializer::load_graph_node(ir_graph_t* graph, const onnx::GraphProto& continue; } int tensor_id = get_ir_tensor_index_from_name(graph, input_name.c_str()); - ir_tensor_t* tensor = get_ir_graph_tensor(graph, tensor_id); + ir_tensor_t* tensor = get_ir_graph_tensor(graph, tensor_id); tensor_check[tensor->name] = tensor_check[tensor->name] + 1; set_ir_node_input_tensor(ir_node, j, tensor); } @@ -549,16 +545,15 @@ int onnx_serializer::set_graph_output(ir_graph_t* graph, const onnx::GraphProto& const onnx::ValueInfoProto& val = onnx_graph.output(i); int tensor_id = get_ir_tensor_index_from_name(graph, val.name().c_str()); - const onnx::TypeProto& type = val.type(); const onnx::TypeProto::Tensor& tensor_type = type.tensor_type(); const onnx::TensorShapeProto& shape = tensor_type.shape(); int has_shape = 1; - int *dims = new int[shape.dim_size()]; - for(int j = 0; j < shape.dim_size(); j++) + int* dims = new int[shape.dim_size()]; + for (int j = 0; j < shape.dim_size(); j++) { const onnx::TensorShapeProto::Dimension& dim = shape.dim(j); - if(dim.has_dim_param()) + if (dim.has_dim_param()) { has_shape = 0; break; @@ -633,7 +628,7 @@ graph_t onnx_serializer::onnx2tengine(std::string model_file) int load_conv(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct conv_param* conv_param = ( struct conv_param* )node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)node->op.param_mem; for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); @@ -697,45 +692,45 @@ int load_conv(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no int load_relu(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct relu_param* relu_param = ( struct relu_param* )node->op.param_mem; + struct relu_param* relu_param = (struct relu_param*)node->op.param_mem; relu_param->negative_slope = 0.f; return 0; } int load_pool(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct pool_param* pool_param = ( struct pool_param* )node->op.param_mem; + struct pool_param* pool_param = (struct pool_param*)node->op.param_mem; const std::string& onnx_op = onnx_node.op_type(); - if(onnx_op == "GlobalAveragePool") + if (onnx_op == "GlobalAveragePool") { pool_param->global = 1; pool_param->pool_method = POOL_AVG; } - else if(onnx_op == "MaxPool" || onnx_op == "AveragePool") + else if (onnx_op == "MaxPool" || onnx_op == "AveragePool") { pool_param->global = 0; - if(onnx_op == "AveragePool") + if (onnx_op == "AveragePool") pool_param->pool_method = POOL_AVG; else pool_param->pool_method = POOL_MAX; - for(int k = 0; k < onnx_node.attribute_size(); k++) + for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); - if(attr.name() == "kernel_shape") + if (attr.name() == "kernel_shape") { pool_param->kernel_h = attr.ints(0); pool_param->kernel_w = attr.ints(1); } - else if(attr.name() == "strides") + else if (attr.name() == "strides") { pool_param->stride_h = attr.ints(0); pool_param->stride_w = attr.ints(1); } - else if(attr.name() == "pads") /* onnx pads: x0_begin, x1_begin, ... , x0_end, x1_end, ... */ + else if (attr.name() == "pads") /* onnx pads: x0_begin, x1_begin, ... , x0_end, x1_end, ... */ { pool_param->pad_h0 = attr.ints(0); pool_param->pad_h1 = attr.ints(2); @@ -758,7 +753,7 @@ int load_pool(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no int load_flatten(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct flatten_param* flatten_param = ( struct flatten_param* )node->op.param_mem; + struct flatten_param* flatten_param = (struct flatten_param*)node->op.param_mem; flatten_param->axis = 1; if (1 == onnx_node.attribute_size()) @@ -771,7 +766,7 @@ int load_flatten(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx int load_gemm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct gemm_param* gemm_param = ( struct gemm_param* )node->op.param_mem; + struct gemm_param* gemm_param = (struct gemm_param*)node->op.param_mem; for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); @@ -813,7 +808,7 @@ int load_gemm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no // float* tmp = ( float* )sys_malloc(k * n * sizeof(float)); std::vector tmp(k * n); - float* data = ( float* )weight_tensor->data; + float* data = (float*)weight_tensor->data; for (int i = 0; i < n; i++) for (int j = 0; j < k; j++) { @@ -826,7 +821,7 @@ int load_gemm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no if (gemm_param->alpha != 1) { - float* data = ( float* )weight_tensor->data; + float* data = (float*)weight_tensor->data; int tensor_size = weight_tensor->dims[0] * weight_tensor->dims[1]; for (int i = 0; i < tensor_size; i++) @@ -835,7 +830,7 @@ int load_gemm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no if (gemm_param->beta != 1) { - float* data = ( float* )bias_tensor->data; + float* data = (float*)bias_tensor->data; int tensor_size = weight_tensor->dims[0]; for (int i = 0; i < tensor_size; i++) @@ -848,13 +843,13 @@ int load_gemm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no } struct fc_param* fc_param = (struct fc_param*)node->op.param_mem; fc_param->num_output = weight_tensor->dims[0]; - + return 0; } int load_concat(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct concat_param* concat_param = ( struct concat_param* )node->op.param_mem; + struct concat_param* concat_param = (struct concat_param*)node->op.param_mem; for (int k = 0; k < onnx_node.attribute_size(); k++) { @@ -870,7 +865,7 @@ int load_concat(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ int load_bn(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct batchnorm_param* batchnorm_param = ( struct batchnorm_param* )node->op.param_mem; + struct batchnorm_param* batchnorm_param = (struct batchnorm_param*)node->op.param_mem; // get espilon for (int k = 0; k < onnx_node.attribute_size(); k++) @@ -888,39 +883,39 @@ int load_bn(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node int load_eltwise(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct eltwise_param* eltwise_param = ( struct eltwise_param* )node->op.param_mem; + struct eltwise_param* eltwise_param = (struct eltwise_param*)node->op.param_mem; const std::string& op_name = onnx_node.op_type(); if (op_name == "Add") { - eltwise_param->type = ELT_SUM; + eltwise_param->type = ELT_SUM; } else if (op_name == "Mul") { - eltwise_param->type = ELT_PROD; + eltwise_param->type = ELT_PROD; } else if (op_name == "Div") { - eltwise_param->type = ELT_DIV; + eltwise_param->type = ELT_DIV; } else if (op_name == "Floor") { - eltwise_param->type = ELT_FLOOR; + eltwise_param->type = ELT_FLOOR; } else if (op_name == "Exp") { - eltwise_param->type = ELT_EXP; + eltwise_param->type = ELT_EXP; } else if (op_name == "Sub") { - eltwise_param->type = ELT_SUB; + eltwise_param->type = ELT_SUB; } else if (op_name == "Pow") { - eltwise_param->type = ELT_POW; + eltwise_param->type = ELT_POW; } else if (op_name == "Sqrt") { - eltwise_param->type = ELT_SQRT; + eltwise_param->type = ELT_SQRT; } return 0; @@ -928,8 +923,8 @@ int load_eltwise(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx int load_transpose(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct transpose_param* transpose_param = ( struct transpose_param* )node->op.param_mem; - + struct transpose_param* transpose_param = (struct transpose_param*)node->op.param_mem; + const onnx::AttributeProto& attr = onnx_node.attribute(0); int size = attr.ints_size(); transpose_param->tr_shape = (int*)sys_malloc(sizeof(int) * size); @@ -944,7 +939,7 @@ int load_transpose(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& on int load_clip(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct clip_param* clip_param = ( struct clip_param* )node->op.param_mem; + struct clip_param* clip_param = (struct clip_param*)node->op.param_mem; int size = onnx_node.attribute_size(); for (int i = 0; i < size; i++) @@ -975,7 +970,7 @@ int load_clip(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no int load_reshape(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct reshape_param* reshape_param = ( struct reshape_param* )node->op.param_mem; + struct reshape_param* reshape_param = (struct reshape_param*)node->op.param_mem; ir_tensor_t* shape_tensor = find_tensor(graph, onnx_node.input(1)); if (shape_tensor == nullptr) @@ -990,7 +985,7 @@ int load_reshape(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx int64_t* data = (int64_t*)shape_tensor->data; for (int i = 0; i < size; i++) - { + { reshape_param->re_shape[i] = data[i]; } return 0; @@ -1004,7 +999,7 @@ int load_no_param(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onn int load_softmax(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct softmax_param* softmax_param = ( struct softmax_param* )node->op.param_mem; + struct softmax_param* softmax_param = (struct softmax_param*)node->op.param_mem; for (int k = 0; k < onnx_node.attribute_size(); k++) { @@ -1026,7 +1021,7 @@ int load_softmax(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx int load_elu(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct elu_param* elu_param = ( struct elu_param* )node->op.param_mem; + struct elu_param* elu_param = (struct elu_param*)node->op.param_mem; for (int k = 0; k < onnx_node.attribute_size(); k++) { @@ -1053,9 +1048,9 @@ int load_interp(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ mode = attr.s(); } } - if(mode != "nearest") + if (mode != "nearest") { - struct interp_param* interp_param = ( struct interp_param* )node->op.param_mem; + struct interp_param* interp_param = (struct interp_param*)node->op.param_mem; if (onnx_node.input_size() == 1) { @@ -1085,7 +1080,7 @@ int load_interp(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ { const std::string& input_name = onnx_node.input(1); ir_tensor_t* tensor = find_tensor(graph, input_name); - float* data = ( float* )tensor->data; + float* data = (float*)tensor->data; interp_param->height_scale = data[2]; interp_param->width_scale = data[3]; @@ -1098,7 +1093,7 @@ int load_interp(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ { interp_param->resize_type = 2; } - } + } else { if (change_node_op(node, OP_RESIZE) < 0) @@ -1111,7 +1106,7 @@ int load_interp(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ { const std::string& input_name = onnx_node.input(1); ir_tensor_t* tensor = find_tensor(graph, input_name); - float* data = ( float* )tensor->data; + float* data = (float*)tensor->data; resize_param->scale_h = data[2]; resize_param->scale_w = data[3]; } @@ -1127,7 +1122,7 @@ int load_interp(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ int load_leaky_relu(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct relu_param* relu_param = ( struct relu_param* )node->op.param_mem; + struct relu_param* relu_param = (struct relu_param*)node->op.param_mem; const onnx::AttributeProto& attr = onnx_node.attribute(0); relu_param->negative_slope = attr.f(); @@ -1136,7 +1131,7 @@ int load_leaky_relu(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& o int load_slice(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct slice_param* slice_param = ( struct slice_param* )node->op.param_mem; + struct slice_param* slice_param = (struct slice_param*)node->op.param_mem; slice_param->step = 1; slice_param->axis = 0; @@ -1162,7 +1157,7 @@ int load_slice(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_n { end = INT_MAX; } - slice_param->end = ( int )end; + slice_param->end = (int)end; } else if (attr.name() == "starts") { @@ -1200,7 +1195,7 @@ int load_slice(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_n int load_split(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct split_param* split_param = ( struct split_param* )node->op.param_mem; + struct split_param* split_param = (struct split_param*)node->op.param_mem; split_param->is_onnx = true; for (int k = 0; k < onnx_node.attribute_size(); k++) { @@ -1229,7 +1224,7 @@ int load_split(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_n int load_unsqueeze(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct unsqueeze_param* unsqueeze_param = ( struct unsqueeze_param* )node->op.param_mem; + struct unsqueeze_param* unsqueeze_param = (struct unsqueeze_param*)node->op.param_mem; std::vector axises; for (int k = 0; k < onnx_node.attribute_size(); k++) @@ -1248,12 +1243,12 @@ int load_unsqueeze(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& on if (axises.empty() && node->input_num == 2) { ir_tensor_t* axes_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); - int* data = ( int* )axes_tensor->data; + int* data = (int*)axes_tensor->data; for (int i = 0; i < axes_tensor->elem_num; i++) { axises.push_back(data[i]); } - + // remove axes tensor node->input_num = 1; } @@ -1265,13 +1260,13 @@ int load_unsqueeze(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& on { unsqueeze_param->axises[i] = axises[i]; } - + return 0; } int load_squeeze(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct squeeze_param* squeeze_param = ( struct squeeze_param* )node->op.param_mem; + struct squeeze_param* squeeze_param = (struct squeeze_param*)node->op.param_mem; for (int k = 0; k < onnx_node.attribute_size(); k++) { @@ -1299,7 +1294,7 @@ int load_squeeze(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx } } } - + return 0; } @@ -1308,7 +1303,7 @@ int load_matmul(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ ir_tensor_t* input_tensor = find_tensor(graph, onnx_node.input(0)); ir_tensor_t* weight_tensor = find_tensor(graph, onnx_node.input(1)); - if(2 == input_tensor->dim_num && weight_tensor->tensor_type == TENSOR_TYPE_CONST) + if (2 == input_tensor->dim_num && weight_tensor->tensor_type == TENSOR_TYPE_CONST) { // swap shape int k = weight_tensor->dims[0]; @@ -1319,7 +1314,7 @@ int load_matmul(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ // float* tmp = ( float* )sys_malloc(k * n * sizeof(float)); std::vector tmp(k * n); - float* data = ( float* )weight_tensor->data; + float* data = (float*)weight_tensor->data; for (int i = 0; i < n; i++) { @@ -1335,36 +1330,36 @@ int load_matmul(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ { return -1; } - struct fc_param* fc_param = ( struct fc_param* )node->op.param_mem; + struct fc_param* fc_param = (struct fc_param*)node->op.param_mem; fc_param->num_output = weight_tensor->dims[0]; } - + return 0; } int load_reducel2(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct reducel2_param* reducel2_param = ( struct reducel2_param* )node->op.param_mem; + struct reducel2_param* reducel2_param = (struct reducel2_param*)node->op.param_mem; for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); if (attr.name() == "axes") { - reducel2_param->axis = attr.ints(0); // TODO:Support muti axis + reducel2_param->axis = attr.ints(0); // TODO:Support muti axis } if (attr.name() == "keepdims") { reducel2_param->keepdim = attr.i(); } } - + return 0; } int load_gather(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct gather_param* gather_param = ( struct gather_param* )node->op.param_mem; + struct gather_param* gather_param = (struct gather_param*)node->op.param_mem; ir_tensor_t* indices_tensor = find_tensor(graph, onnx_node.input(1)); for (int k = 0; k < onnx_node.attribute_size(); k++) @@ -1375,16 +1370,16 @@ int load_gather(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ gather_param->axis = attr.i(); } } - int64_t* data = ( int64_t* )indices_tensor->data; + int64_t* data = (int64_t*)indices_tensor->data; gather_param->indices_num = *data; gather_param->is_onnx = 1; - + return 0; } int load_comparison(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct comparison_param* comparison_param = ( struct comparison_param* )node->op.param_mem; + struct comparison_param* comparison_param = (struct comparison_param*)node->op.param_mem; const std::string& op_name = onnx_node.op_type(); if (op_name == "Greater") @@ -1405,13 +1400,13 @@ int load_comparison(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& o int load_LRN(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct lrn_param* lrn_param = ( struct lrn_param* )node->op.param_mem; + struct lrn_param* lrn_param = (struct lrn_param*)node->op.param_mem; for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); if (attr.name() == "alpha") { - lrn_param->alpha = attr.f(); // TODO:Support multi axis + lrn_param->alpha = attr.f(); // TODO:Support multi axis } if (attr.name() == "beta") { @@ -1426,13 +1421,13 @@ int load_LRN(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_nod lrn_param->local_size = attr.i(); } } - + return 0; } int load_unary(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct unary_param* unary_param = ( struct unary_param* )node->op.param_mem; + struct unary_param* unary_param = (struct unary_param*)node->op.param_mem; const std::string& op_name = onnx_node.op_type(); if (op_name == "Abs") @@ -1467,13 +1462,13 @@ int load_unary(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_n { unary_param->type = 14; } - + return 0; } int load_logical(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct logical_param* logical_param = ( struct logical_param* )node->op.param_mem; + struct logical_param* logical_param = (struct logical_param*)node->op.param_mem; const std::string& op_name = onnx_node.op_type(); if (op_name == "And") @@ -1484,19 +1479,19 @@ int load_logical(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx { logical_param->type = 1; } - + return 0; } int load_pad(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct pad_param* pad_param = ( struct pad_param* )node->op.param_mem; - - if (onnx_node.attribute_size() == 1) // since opset 11, 'pads' and 'value' have been moved from attributes to inputs + struct pad_param* pad_param = (struct pad_param*)node->op.param_mem; + + if (onnx_node.attribute_size() == 1) // since opset 11, 'pads' and 'value' have been moved from attributes to inputs { const std::string& input_name_pad = onnx_node.input(1); ir_tensor_t* tensor_pad = find_tensor(graph, input_name_pad); - int64_t* data_pad = ( int64_t * )tensor_pad->data; + int64_t* data_pad = (int64_t*)tensor_pad->data; pad_param->pad_0_h = data_pad[0]; pad_param->pad_0_w = data_pad[4]; pad_param->pad_1_h = data_pad[1]; @@ -1510,12 +1505,11 @@ int load_pad(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_nod { const std::string& input_name_value = onnx_node.input(2); ir_tensor_t* tensor_value = find_tensor(graph, input_name_value); - float* data_value = ( float * )tensor_value->data; + float* data_value = (float*)tensor_value->data; pad_param->value = data_value[0]; } - } - + for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); @@ -1550,20 +1544,21 @@ int load_pad(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_nod pad_param->value = attr.f(); } } - if(onnx_node.input_size() > 1){ + if (onnx_node.input_size() > 1) + { ir_tensor_t* shape_tensor = find_tensor(graph, onnx_node.input(1)); int size = shape_tensor->dims[0]; - int64_t* data = ( int64_t* )shape_tensor->data; + int64_t* data = (int64_t*)shape_tensor->data; for (int i = 0; i < size; i++) { - pad_param->pad_0_h = data[0]; - pad_param->pad_0_w = data[4]; - pad_param->pad_1_h = data[1]; - pad_param->pad_1_w = data[5]; - pad_param->pad_2_h = data[2]; - pad_param->pad_2_w = data[6]; - pad_param->pad_3_h = data[3]; - pad_param->pad_3_w = data[7]; + pad_param->pad_0_h = data[0]; + pad_param->pad_0_w = data[4]; + pad_param->pad_1_h = data[1]; + pad_param->pad_1_w = data[5]; + pad_param->pad_2_h = data[2]; + pad_param->pad_2_w = data[6]; + pad_param->pad_3_h = data[3]; + pad_param->pad_3_w = data[7]; } } return 0; @@ -1571,7 +1566,7 @@ int load_pad(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_nod int load_reduce(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct reduction_param* reduction_param = ( struct reduction_param* )node->op.param_mem; + struct reduction_param* reduction_param = (struct reduction_param*)node->op.param_mem; const std::string& op_name = onnx_node.op_type(); if (op_name == "ReduceSum") @@ -1612,7 +1607,7 @@ int load_reduce(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ reduction_param->dim_2 = -2; reduction_param->dim_3 = -2; reduction_param->keepdim = 1; - + ir_tensor_t* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); int input_dim_num = input_tensor->dim_num; int size = onnx_node.attribute_size(); @@ -1705,8 +1700,8 @@ int load_reduce(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ int load_argmax(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct argmax_param* argmax_param = ( struct argmax_param* )node->op.param_mem; - + struct argmax_param* argmax_param = (struct argmax_param*)node->op.param_mem; + int size = onnx_node.attribute_size(); argmax_param->axis = 0; for (int i = 0; i < size; i++) @@ -1721,14 +1716,14 @@ int load_argmax(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ argmax_param->keepdims = attr.i(); } } - + return 0; } int load_argmin(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct argmin_param* argmin_param = ( struct argmin_param* )node->op.param_mem; - + struct argmin_param* argmin_param = (struct argmin_param*)node->op.param_mem; + int size = onnx_node.attribute_size(); argmin_param->axis = 0; for (int i = 0; i < size; i++) @@ -1743,14 +1738,14 @@ int load_argmin(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ argmin_param->keepdims = attr.i(); } } - + return 0; } int load_log_softmax(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct logsoftmax_param* logsoftmax_param = ( struct logsoftmax_param* )node->op.param_mem; - + struct logsoftmax_param* logsoftmax_param = (struct logsoftmax_param*)node->op.param_mem; + int size = onnx_node.attribute_size(); logsoftmax_param->axis = 1; for (int i = 0; i < size; i++) @@ -1761,14 +1756,14 @@ int load_log_softmax(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& logsoftmax_param->axis = attr.i(); } } - + return 0; } int load_deconv(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct deconv_param* deconv_param = ( struct deconv_param* )node->op.param_mem; - + struct deconv_param* deconv_param = (struct deconv_param*)node->op.param_mem; + for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); @@ -1812,7 +1807,7 @@ int load_deconv(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ { const std::string& input_name = onnx_node.input(k); ir_tensor_t* tensor = find_tensor(graph, input_name); - if (k == 1) // weight + if (k == 1) // weight { int* dim = tensor->dims; /* onnx hide the output channel in weight ..*/ @@ -1821,14 +1816,14 @@ int load_deconv(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ deconv_param->kernel_w = dim[3]; } } - + return 0; } int load_scatter(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct scatter_param* scatter_param = ( struct scatter_param* )node->op.param_mem; - + struct scatter_param* scatter_param = (struct scatter_param*)node->op.param_mem; + int size = onnx_node.attribute_size(); scatter_param->axis = 0; scatter_param->is_onnx = 1; @@ -1840,14 +1835,14 @@ int load_scatter(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx scatter_param->axis = attr.i(); } } - + return 0; } int load_selu(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct selu_param* selu_param = ( struct selu_param* )node->op.param_mem; - + struct selu_param* selu_param = (struct selu_param*)node->op.param_mem; + for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); @@ -1860,14 +1855,14 @@ int load_selu(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no selu_param->lambda = attr.f(); } } - + return 0; } int load_hard_sigmoid(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct hard_sigmoid_param* hard_sigmoid_param = ( struct hard_sigmoid_param* )node->op.param_mem; - + struct hard_sigmoid_param* hard_sigmoid_param = (struct hard_sigmoid_param*)node->op.param_mem; + for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); @@ -1880,26 +1875,26 @@ int load_hard_sigmoid(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& hard_sigmoid_param->beta = attr.f(); } } - + return 0; } int load_tile(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct tile_param* tile_param = ( struct tile_param* )node->op.param_mem; + struct tile_param* tile_param = (struct tile_param*)node->op.param_mem; tile_param->frame_flag = 1; - + return 0; } int load_cast(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct cast_param* cast_param = ( struct cast_param* )node->op.param_mem; + struct cast_param* cast_param = (struct cast_param*)node->op.param_mem; - for(int k = 0; k < onnx_node.attribute_size(); k++) + for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); - if(attr.name() == "to") + if (attr.name() == "to") cast_param->type_to = attr.i(); } cast_param->type_from = 1; @@ -1909,11 +1904,13 @@ int load_cast(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no int load_depth_to_space(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct depthtospace_param* depthtospace_param = ( struct depthtospace_param* )node->op.param_mem; + struct depthtospace_param* depthtospace_param = (struct depthtospace_param*)node->op.param_mem; - for(int k = 0; k < onnx_node.attribute_size(); k++){ + for (int k = 0; k < onnx_node.attribute_size(); k++) + { const onnx::AttributeProto& attr = onnx_node.attribute(k); - if(attr.name() == "block_size"){ + if (attr.name() == "block_size") + { depthtospace_param->block_size = attr.i(); } } @@ -1923,12 +1920,12 @@ int load_depth_to_space(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProt int load_instance_norm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct instancenorm_Param* instancenorm_param = ( struct instancenorm_Param* )node->op.param_mem; + struct instancenorm_Param* instancenorm_param = (struct instancenorm_Param*)node->op.param_mem; - for(int k = 0; k < onnx_node.attribute_size(); k++) + for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); - if(attr.name() == "epsilon") + if (attr.name() == "epsilon") instancenorm_param->eps = attr.f(); } @@ -1937,34 +1934,34 @@ int load_instance_norm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto int load_resize(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct interp_param* interp_param = ( struct interp_param* )node->op.param_mem; + struct interp_param* interp_param = (struct interp_param*)node->op.param_mem; - if(onnx_node.input_size() == 1) + if (onnx_node.input_size() == 1) { - for(int k = 0; k < onnx_node.attribute_size(); k++) + for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); - if(attr.name() == "scales") + if (attr.name() == "scales") { interp_param->height_scale = attr.f(); interp_param->width_scale = attr.f(); } } } - else if(onnx_node.input_size() == 2) // opset 10 + else if (onnx_node.input_size() == 2) // opset 10 { const std::string& input_name = onnx_node.input(1); ir_tensor_t* tensor = find_tensor(graph, input_name); - float* data = ( float* )tensor->data; + float* data = (float*)tensor->data; interp_param->height_scale = data[2]; interp_param->width_scale = data[3]; } - else if(onnx_node.input_size() == 3) // opset 11 + else if (onnx_node.input_size() == 3) // opset 11 { const std::string& input_name = onnx_node.input(2); ir_tensor_t* tensor = find_tensor(graph, input_name); - float* data = ( float* )tensor->data; + float* data = (float*)tensor->data; interp_param->height_scale = data[2]; interp_param->width_scale = data[3]; @@ -1973,7 +1970,7 @@ int load_resize(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ { const std::string& input_name = onnx_node.input(3); ir_tensor_t* tensor = find_tensor(graph, input_name); - float* data = ( float* )tensor->data; + float* data = (float*)tensor->data; interp_param->height_scale = data[2]; interp_param->width_scale = data[3]; @@ -1985,10 +1982,10 @@ int load_resize(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ } std::string mode = "nearest"; - for(int k = 0; k < onnx_node.attribute_size(); k++) + for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); - if(attr.name() == "mode") + if (attr.name() == "mode") mode = attr.s(); } @@ -2006,16 +2003,16 @@ int load_resize(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ int load_LSTM(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct lstm_param* lstm_param = ( struct lstm_param* )node->op.param_mem; + struct lstm_param* lstm_param = (struct lstm_param*)node->op.param_mem; int s_size; std::string lstm_type; - for(int k = 0; k < onnx_node.attribute_size(); k++) + for (int k = 0; k < onnx_node.attribute_size(); k++) { const onnx::AttributeProto& attr = onnx_node.attribute(k); - if(attr.name() == "hidden_size") + if (attr.name() == "hidden_size") s_size = attr.i(); - if(attr.name() == "direction") + if (attr.name() == "direction") lstm_type = attr.s(); } @@ -2028,7 +2025,7 @@ int load_LSTM(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no int load_expand(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node) { - struct expand_param* expand_param = ( struct expand_param* )node->op.param_mem; + struct expand_param* expand_param = (struct expand_param*)node->op.param_mem; ir_tensor_t* shape_tensor = find_tensor(graph, onnx_node.input(1)); if (shape_tensor == nullptr) @@ -2047,92 +2044,91 @@ int load_expand(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_ return 0; } - /* * OPERAOTR REGISTER FUNCTION DEFINE FOR ONNX SERIALIZER START */ void onnx_serializer::register_op_load() { - op_load_map["Abs"] = std::pair(OP_UNARY, load_unary); - op_load_map["Acos"] = std::pair(OP_UNARY, load_unary); - op_load_map["And"] = std::pair(OP_LOGICAL, load_logical); - op_load_map["ArgMax"] = std::pair(OP_ARGMAX, load_argmax); - op_load_map["ArgMin"] = std::pair(OP_ARGMIN, load_argmin); - op_load_map["Asin"] = std::pair(OP_UNARY, load_unary); - op_load_map["Atan"] = std::pair(OP_UNARY, load_unary); - op_load_map["AveragePool"] = std::pair(OP_POOL, load_pool); - op_load_map["Add"] = std::pair(OP_ELTWISE, load_eltwise); - op_load_map["BatchNormalization"] = std::pair(OP_BATCHNORM, load_bn); - op_load_map["Conv"] = std::pair(OP_CONV, load_conv); - op_load_map["ConvTranspose"] = std::pair(OP_DECONV, load_deconv); - op_load_map["Concat"] = std::pair(OP_CONCAT, load_concat); - op_load_map["Clip"] = std::pair(OP_CLIP, load_clip); - op_load_map["Ceil"] = std::pair(OP_UNARY, load_unary); - op_load_map["Cos"] = std::pair(OP_UNARY, load_unary); - op_load_map["Cast"] = std::pair(OP_CAST, load_cast); - op_load_map["Dropout"] = std::pair(OP_DROPOUT, load_no_param); - op_load_map["DepthToSpace"] = std::pair(OP_DEPTHTOSPACE, load_depth_to_space); - op_load_map["Div"] = std::pair(OP_ELTWISE, load_eltwise); - op_load_map["Elu"] = std::pair(OP_ELU, load_elu); - op_load_map["Exp"] = std::pair(OP_ELTWISE, load_eltwise); - op_load_map["Expand"] = std::pair(OP_EXPAND, load_expand); - op_load_map["Equal"] = std::pair(OP_COMPARISON, load_comparison); - op_load_map["Flatten"] = std::pair(OP_FLATTEN, load_flatten); - op_load_map["Floor"] = std::pair(OP_ELTWISE, load_eltwise); - op_load_map["Gemm"] = std::pair(OP_GEMM, load_gemm); - op_load_map["Gather"] = std::pair(OP_GATHER, load_gather); - op_load_map["Greater"] = std::pair(OP_COMPARISON, load_comparison); - op_load_map["GlobalAveragePool"] = std::pair(OP_POOL, load_pool); - op_load_map["HardSwish"] = std::pair(OP_HARDSWISH, load_no_param); - op_load_map["HardSigmoid"] = std::pair(OP_HARDSIGMOID, load_hard_sigmoid); + op_load_map["Abs"] = std::pair(OP_UNARY, load_unary); + op_load_map["Acos"] = std::pair(OP_UNARY, load_unary); + op_load_map["And"] = std::pair(OP_LOGICAL, load_logical); + op_load_map["ArgMax"] = std::pair(OP_ARGMAX, load_argmax); + op_load_map["ArgMin"] = std::pair(OP_ARGMIN, load_argmin); + op_load_map["Asin"] = std::pair(OP_UNARY, load_unary); + op_load_map["Atan"] = std::pair(OP_UNARY, load_unary); + op_load_map["AveragePool"] = std::pair(OP_POOL, load_pool); + op_load_map["Add"] = std::pair(OP_ELTWISE, load_eltwise); + op_load_map["BatchNormalization"] = std::pair(OP_BATCHNORM, load_bn); + op_load_map["Conv"] = std::pair(OP_CONV, load_conv); + op_load_map["ConvTranspose"] = std::pair(OP_DECONV, load_deconv); + op_load_map["Concat"] = std::pair(OP_CONCAT, load_concat); + op_load_map["Clip"] = std::pair(OP_CLIP, load_clip); + op_load_map["Ceil"] = std::pair(OP_UNARY, load_unary); + op_load_map["Cos"] = std::pair(OP_UNARY, load_unary); + op_load_map["Cast"] = std::pair(OP_CAST, load_cast); + op_load_map["Dropout"] = std::pair(OP_DROPOUT, load_no_param); + op_load_map["DepthToSpace"] = std::pair(OP_DEPTHTOSPACE, load_depth_to_space); + op_load_map["Div"] = std::pair(OP_ELTWISE, load_eltwise); + op_load_map["Elu"] = std::pair(OP_ELU, load_elu); + op_load_map["Exp"] = std::pair(OP_ELTWISE, load_eltwise); + op_load_map["Expand"] = std::pair(OP_EXPAND, load_expand); + op_load_map["Equal"] = std::pair(OP_COMPARISON, load_comparison); + op_load_map["Flatten"] = std::pair(OP_FLATTEN, load_flatten); + op_load_map["Floor"] = std::pair(OP_ELTWISE, load_eltwise); + op_load_map["Gemm"] = std::pair(OP_GEMM, load_gemm); + op_load_map["Gather"] = std::pair(OP_GATHER, load_gather); + op_load_map["Greater"] = std::pair(OP_COMPARISON, load_comparison); + op_load_map["GlobalAveragePool"] = std::pair(OP_POOL, load_pool); + op_load_map["HardSwish"] = std::pair(OP_HARDSWISH, load_no_param); + op_load_map["HardSigmoid"] = std::pair(OP_HARDSIGMOID, load_hard_sigmoid); op_load_map["InstanceNormalization"] = std::pair(OP_INSTANCENORM, load_instance_norm); - op_load_map["Log"] = std::pair(OP_UNARY, load_unary); - op_load_map["LRN"] = std::pair(OP_LRN, load_LRN); - op_load_map["Less"] = std::pair(OP_COMPARISON, load_comparison); - op_load_map["LSTM"] = std::pair(OP_LSTM, load_LSTM); - op_load_map["LeakyRelu"] = std::pair(OP_RELU, load_leaky_relu); - op_load_map["LogSoftmax"] = std::pair(OP_LOGSOFTMAX, load_log_softmax); - op_load_map["Mul"] = std::pair(OP_ELTWISE, load_eltwise); - op_load_map["Max"] = std::pair(OP_MAXIMUM, load_no_param); - op_load_map["Min"] = std::pair(OP_MINIMUM, load_no_param); - op_load_map["Mean"] = std::pair(OP_MEAN, load_no_param); - op_load_map["MatMul"] = std::pair(OP_MATMUL, load_matmul); - op_load_map["MaxPool"] = std::pair(OP_POOL, load_pool); - op_load_map["Neg"] = std::pair(OP_UNARY, load_unary); - op_load_map["Or"] = std::pair(OP_LOGICAL, load_logical); - op_load_map["Pad"] = std::pair(OP_PAD, load_pad); - op_load_map["Pow"] = std::pair(OP_ELTWISE, load_eltwise); - op_load_map["PRelu"] = std::pair(OP_PRELU, load_no_param); - op_load_map["Relu"] = std::pair(OP_RELU, load_relu); - op_load_map["Resize"] = std::pair(OP_INTERP, load_resize); - op_load_map["Reshape"] = std::pair(OP_RESHAPE, load_reshape); - op_load_map["ReduceL2"] = std::pair(OP_REDUCEL2, load_reducel2); - op_load_map["ReduceMean"] = std::pair(OP_REDUCTION, load_reduce); - op_load_map["ReduceLogSumExp"] = std::pair(OP_REDUCTION, load_reduce); - op_load_map["ReduceLogSum"] = std::pair(OP_REDUCTION, load_reduce); - op_load_map["ReduceMax"] = std::pair(OP_REDUCTION, load_reduce); - op_load_map["ReduceMin"] = std::pair(OP_REDUCTION, load_reduce); - op_load_map["ReduceProd"] = std::pair(OP_REDUCTION, load_reduce); - op_load_map["ReduceSumSquare"] = std::pair(OP_REDUCTION, load_reduce); - op_load_map["ReduceSum"] = std::pair(OP_REDUCTION, load_reduce); - op_load_map["Reciprocal"] = std::pair(OP_RECIPROCAL, load_no_param); - op_load_map["Sub"] = std::pair(OP_ELTWISE, load_eltwise); - op_load_map["Selu"] = std::pair(OP_SELU, load_selu); - op_load_map["Sqrt"] = std::pair(OP_ELTWISE, load_eltwise); - op_load_map["Slice"] = std::pair(OP_SLICE, load_slice); - op_load_map["Split"] = std::pair(OP_SPLIT, load_split); - op_load_map["Shape"] = std::pair(OP_SHAPE, load_no_param); - op_load_map["Squeeze"] = std::pair(OP_SQUEEZE, load_squeeze); - op_load_map["Scatter"] = std::pair(OP_SCATTER, load_scatter); - op_load_map["Sigmoid"] = std::pair(OP_SIGMOID, load_no_param); - op_load_map["Softmax"] = std::pair(OP_SOFTMAX, load_softmax); - op_load_map["Softplus"] = std::pair(OP_SOFTPLUS, load_no_param); - op_load_map["Tanh"] = std::pair(OP_TANH, load_no_param); - op_load_map["Tile"] = std::pair(OP_TILE, load_tile); - op_load_map["Transpose"] = std::pair(OP_TRANSPOSE, load_transpose); - op_load_map["Upsample"] = std::pair(OP_INTERP, load_interp); - op_load_map["Unsqueeze"] = std::pair(OP_UNSQUEEZE, load_unsqueeze); - op_load_map["Where"] = std::pair(OP_WHERE, load_no_param); + op_load_map["Log"] = std::pair(OP_UNARY, load_unary); + op_load_map["LRN"] = std::pair(OP_LRN, load_LRN); + op_load_map["Less"] = std::pair(OP_COMPARISON, load_comparison); + op_load_map["LSTM"] = std::pair(OP_LSTM, load_LSTM); + op_load_map["LeakyRelu"] = std::pair(OP_RELU, load_leaky_relu); + op_load_map["LogSoftmax"] = std::pair(OP_LOGSOFTMAX, load_log_softmax); + op_load_map["Mul"] = std::pair(OP_ELTWISE, load_eltwise); + op_load_map["Max"] = std::pair(OP_MAXIMUM, load_no_param); + op_load_map["Min"] = std::pair(OP_MINIMUM, load_no_param); + op_load_map["Mean"] = std::pair(OP_MEAN, load_no_param); + op_load_map["MatMul"] = std::pair(OP_MATMUL, load_matmul); + op_load_map["MaxPool"] = std::pair(OP_POOL, load_pool); + op_load_map["Neg"] = std::pair(OP_UNARY, load_unary); + op_load_map["Or"] = std::pair(OP_LOGICAL, load_logical); + op_load_map["Pad"] = std::pair(OP_PAD, load_pad); + op_load_map["Pow"] = std::pair(OP_ELTWISE, load_eltwise); + op_load_map["PRelu"] = std::pair(OP_PRELU, load_no_param); + op_load_map["Relu"] = std::pair(OP_RELU, load_relu); + op_load_map["Resize"] = std::pair(OP_INTERP, load_resize); + op_load_map["Reshape"] = std::pair(OP_RESHAPE, load_reshape); + op_load_map["ReduceL2"] = std::pair(OP_REDUCEL2, load_reducel2); + op_load_map["ReduceMean"] = std::pair(OP_REDUCTION, load_reduce); + op_load_map["ReduceLogSumExp"] = std::pair(OP_REDUCTION, load_reduce); + op_load_map["ReduceLogSum"] = std::pair(OP_REDUCTION, load_reduce); + op_load_map["ReduceMax"] = std::pair(OP_REDUCTION, load_reduce); + op_load_map["ReduceMin"] = std::pair(OP_REDUCTION, load_reduce); + op_load_map["ReduceProd"] = std::pair(OP_REDUCTION, load_reduce); + op_load_map["ReduceSumSquare"] = std::pair(OP_REDUCTION, load_reduce); + op_load_map["ReduceSum"] = std::pair(OP_REDUCTION, load_reduce); + op_load_map["Reciprocal"] = std::pair(OP_RECIPROCAL, load_no_param); + op_load_map["Sub"] = std::pair(OP_ELTWISE, load_eltwise); + op_load_map["Selu"] = std::pair(OP_SELU, load_selu); + op_load_map["Sqrt"] = std::pair(OP_ELTWISE, load_eltwise); + op_load_map["Slice"] = std::pair(OP_SLICE, load_slice); + op_load_map["Split"] = std::pair(OP_SPLIT, load_split); + op_load_map["Shape"] = std::pair(OP_SHAPE, load_no_param); + op_load_map["Squeeze"] = std::pair(OP_SQUEEZE, load_squeeze); + op_load_map["Scatter"] = std::pair(OP_SCATTER, load_scatter); + op_load_map["Sigmoid"] = std::pair(OP_SIGMOID, load_no_param); + op_load_map["Softmax"] = std::pair(OP_SOFTMAX, load_softmax); + op_load_map["Softplus"] = std::pair(OP_SOFTPLUS, load_no_param); + op_load_map["Tanh"] = std::pair(OP_TANH, load_no_param); + op_load_map["Tile"] = std::pair(OP_TILE, load_tile); + op_load_map["Transpose"] = std::pair(OP_TRANSPOSE, load_transpose); + op_load_map["Upsample"] = std::pair(OP_INTERP, load_interp); + op_load_map["Unsqueeze"] = std::pair(OP_UNSQUEEZE, load_unsqueeze); + op_load_map["Where"] = std::pair(OP_WHERE, load_no_param); } /* * OPERAOTR REGISTER FUNCTION DEFINE FOR ONNX SERIALIZER END diff --git a/tools/convert_tool/onnx/onnx2tengine.hpp b/tools/convert_tool/onnx/onnx2tengine.hpp index 17e52ec50..50df1b83a 100644 --- a/tools/convert_tool/onnx/onnx2tengine.hpp +++ b/tools/convert_tool/onnx/onnx2tengine.hpp @@ -37,19 +37,18 @@ #include #include -extern "C" -{ - #include "tengine/c_api.h" - #include "graph/graph.h" - #include "graph/subgraph.h" - #include "graph/node.h" - #include "graph/tensor.h" - #include "executer/executer.h" - #include "module/module.h" - #include "utility/log.h" - #include "utility/sys_port.h" - #include "utility/vector.h" - #include "../utils/save_graph/op_include.h" +extern "C" { +#include "tengine/c_api.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "executer/executer.h" +#include "module/module.h" +#include "utility/log.h" +#include "utility/sys_port.h" +#include "utility/vector.h" +#include "../utils/save_graph/op_include.h" } class onnx_serializer @@ -59,19 +58,17 @@ class onnx_serializer typedef int (*op_load_t)(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node); private: - std::unordered_map> op_load_map; + std::unordered_map > op_load_map; int load_model(ir_graph_t* graph, std::string model_file); int set_graph_output(ir_graph_t* graph, const onnx::GraphProto& onnx_graph); int load_graph_node(ir_graph_t* graph, const onnx::GraphProto& onnx_graph); int set_graph_input(ir_graph_t* graph, const onnx::GraphProto& onnx_graph); int load_initializer_tensor(ir_graph_t* graph, const onnx::GraphProto& onnx_graph); int load_constant_tensor(ir_graph_t* graph, const onnx::GraphProto& onnx_graph); - int load_model_file(std::string model_file, onnx::ModelProto &model); + int load_model_file(std::string model_file, onnx::ModelProto& model); bool find_op_load_method(const std::string& op_name); void register_op_load(); std::unordered_map tensor_check; }; - - #endif \ No newline at end of file diff --git a/tools/convert_tool/utils/graph_optimizer/graph_opt.cpp b/tools/convert_tool/utils/graph_optimizer/graph_opt.cpp index d7ba7264d..ef46266f7 100644 --- a/tools/convert_tool/utils/graph_optimizer/graph_opt.cpp +++ b/tools/convert_tool/utils/graph_optimizer/graph_opt.cpp @@ -50,7 +50,7 @@ static int erase_tensor_id(ir_graph_t* graph, int16_t id) node->output_tensors[j] = old_new_id[node->output_tensors[j]]; } } - + ir_tensor_t** new_tensor_list = (ir_tensor_t**)sys_realloc(graph->tensor_list, sizeof(ir_tensor_t*) * (graph->tensor_num - 1)); graph->tensor_list = new_tensor_list; graph->tensor_num--; @@ -68,7 +68,7 @@ static int erase_node_id(ir_graph_t* graph, int16_t id) for (size_t i = 0; i < graph->node_num; i++) { if (i == id) continue; - + ir_node_t* node = get_ir_graph_node(graph, i); node->index = j; graph->node_list[j] = graph->node_list[i]; @@ -93,7 +93,7 @@ static int erase_node_id(ir_graph_t* graph, int16_t id) { graph->output_nodes[i] = old_new_id[graph->output_nodes[i]]; } - + ir_node_t** new_node_list = (ir_node_t**)sys_realloc(graph->node_list, sizeof(ir_node_t*) * (graph->node_num - 1)); graph->node_list = new_node_list; graph->node_num--; @@ -124,7 +124,7 @@ static int delete_node(ir_graph_t* graph, int16_t pre_node_id, int16_t del_node_ } } pre_output_tensor->consumer_num = del_output_tensor->consumer_num; - + /* delete node */ if (erase_tensor_id(graph, del_node->output_tensors[0]) < 0 || erase_node_id(graph, del_node->index) < 0) { @@ -173,7 +173,7 @@ static int insert_node_id(ir_graph_t* graph, int16_t insert_node_id, int16_t ins { graph->output_nodes[i] = old_new_id[graph->output_nodes[i]]; } - + return 0; } @@ -226,7 +226,7 @@ static int add_node(ir_graph_t* graph, int16_t down_node_id, int add_node_type, if (tensor->tensor_type == TENSOR_TYPE_VAR) up_nodes.push_back(tensor->producer); } - + /* create node and its own tensor */ ir_node_t* add_node = create_ir_node(graph, name, add_node_type, 1); if (add_node == nullptr) @@ -242,7 +242,7 @@ static int add_node(ir_graph_t* graph, int16_t down_node_id, int add_node_type, { ir_node_t* up_node = get_ir_graph_node(graph, up_nodes[i]); ir_tensor_t* up_node_output_tensor = get_ir_graph_tensor(graph, up_node->output_tensors[0]); - for (size_t i = 0; i consumer_num; i++) + for (size_t i = 0; i < up_node_output_tensor->consumer_num; i++) { if (up_node_output_tensor->consumer[i] == down_node_id) up_node_output_tensor->consumer[i] = add_node->index; @@ -252,7 +252,7 @@ static int add_node(ir_graph_t* graph, int16_t down_node_id, int add_node_type, down_node->input_tensors[0] = add_tensor->index; add_tensor->consumer[0] = down_node_id; add_tensor->consumer_num = 1; - + /* insert node id */ if (insert_node_id(graph, add_node->index, down_node_id) < 0) return -1; @@ -265,7 +265,7 @@ static int add_node(ir_graph_t* graph, int16_t down_node_id, int add_node_type, } static int weight_bn(ir_graph_t* graph, ir_node_t* conv_node, float* mean, float* var, float* gamma, float* beta, float eps, - float rescale_factor, ir_tensor_t* bias_tensor) + float rescale_factor, ir_tensor_t* bias_tensor) { ir_tensor_t* kernel_tensor = get_ir_graph_tensor(graph, conv_node->input_tensors[1]); struct conv_param* param = (struct conv_param*)conv_node->op.param_mem; @@ -278,9 +278,9 @@ static int weight_bn(ir_graph_t* graph, ir_node_t* conv_node, float* mean, float int kernel_size = input_chan * kernel_x * kernel_y; float* kernel_data = (float*)kernel_tensor->data; int channel_num = kernel_tensor->dims[0]; - - float* scale_mean = ( float* )malloc(channel_num * sizeof(float)); - float* scale_var_inv = ( float* )malloc(channel_num * sizeof(float)); + + float* scale_mean = (float*)malloc(channel_num * sizeof(float)); + float* scale_var_inv = (float*)malloc(channel_num * sizeof(float)); float rescale_factor_tmp = rescale_factor; float* bias = NULL; @@ -313,7 +313,7 @@ static int weight_bn(ir_graph_t* graph, ir_node_t* conv_node, float* mean, float insert_node_id(graph, bias_node->index, kernel_tensor->producer); insert_tensor_id(graph, bias_tensor->index, kernel_tensor->index); } - + rescale_factor_tmp = rescale_factor_tmp ? 1 / rescale_factor_tmp : 0; if (NULL == bias) @@ -385,9 +385,9 @@ static int fc_weight_bn(ir_graph_t* graph, ir_node_t* fc_node, float* mean, floa int channel_num = kernel_tensor->dims[0]; int total_size = kernel_tensor->dims[1]; int kernel_size = total_size; - - float* scale_mean = ( float* )malloc(channel_num * sizeof(float)); - float* scale_var_inv = ( float* )malloc(channel_num * sizeof(float)); + + float* scale_mean = (float*)malloc(channel_num * sizeof(float)); + float* scale_var_inv = (float*)malloc(channel_num * sizeof(float)); float rescale_factor_tmp = rescale_factor; float* bias = NULL; @@ -420,7 +420,7 @@ static int fc_weight_bn(ir_graph_t* graph, ir_node_t* fc_node, float* mean, floa insert_node_id(graph, bias_node->index, kernel_tensor->producer); insert_tensor_id(graph, bias_tensor->index, kernel_tensor->index); } - + rescale_factor_tmp = rescale_factor_tmp ? 1 / rescale_factor_tmp : 0; if (NULL == bias) @@ -460,7 +460,7 @@ static int fc_weight_bn(ir_graph_t* graph, ir_node_t* fc_node, float* mean, floa float w_scale = scale_var_inv[o_c]; for (int i = 0; i < kernel_size; i++) { - kernel_data[o_c * kernel_size + i] = kernel_data[o_c * kernel_size + i] * w_scale ; + kernel_data[o_c * kernel_size + i] = kernel_data[o_c * kernel_size + i] * w_scale; } } @@ -493,7 +493,7 @@ static int change_node_op(ir_node_t* node, int new_op_type) static int fuse_conv_relu_common(ir_graph_t* graph) { /* get all conv-relu chain */ - std::vector> conv_relu_v; + std::vector > conv_relu_v; for (size_t i = 0; i < graph->node_num; i++) { ir_node_t* relu_node = get_ir_graph_node(graph, i); @@ -501,7 +501,7 @@ static int fuse_conv_relu_common(ir_graph_t* graph) continue; if (relu_node->op.type == OP_RELU) { - struct relu_param* relu_param =(struct relu_param*)relu_node->op.param_mem; + struct relu_param* relu_param = (struct relu_param*)relu_node->op.param_mem; if (relu_param->negative_slope != 0.f) continue; } @@ -516,7 +516,7 @@ static int fuse_conv_relu_common(ir_graph_t* graph) } /* fused */ - for (auto& conv_relu:conv_relu_v) + for (auto& conv_relu : conv_relu_v) { ir_node_t* conv_node = conv_relu.first; ir_node_t* relu_node = conv_relu.second; @@ -525,7 +525,7 @@ static int fuse_conv_relu_common(ir_graph_t* graph) conv_param->activation = 0; if (relu_node->op.type == OP_RELU6) conv_param->activation = 6; - + /* delete relu node */ if (delete_node(graph, conv_node->index, relu_node->index) < 0) { @@ -533,14 +533,14 @@ static int fuse_conv_relu_common(ir_graph_t* graph) return -1; } } - + return 0; } static int fuse_relu_eltwise(ir_graph_t* graph) { /* get all relu-eltwise chain */ - std::vector> relu_eltwise_v; + std::vector > relu_eltwise_v; for (size_t i = 0; i < graph->node_num; i++) { ir_node_t* elt_node = get_ir_graph_node(graph, i); @@ -548,8 +548,8 @@ static int fuse_relu_eltwise(ir_graph_t* graph) continue; struct eltwise_param* elt_param = (struct eltwise_param*)elt_node->op.param_mem; if (elt_param->type != ELT_MIN_SCALAR) - continue; // todo: verify 6 - + continue; // todo: verify 6 + /*Check if it is a relu + minimum*/ ir_tensor_t* relu_tensor = get_ir_graph_tensor(graph, elt_node->input_tensors[0]); ir_node_t* relu_node = get_ir_graph_node(graph, relu_tensor->producer); @@ -559,12 +559,12 @@ static int fuse_relu_eltwise(ir_graph_t* graph) } /* fused */ - for (auto& relu_elt:relu_eltwise_v) + for (auto& relu_elt : relu_eltwise_v) { ir_node_t* relu_node = relu_elt.first; ir_node_t* elt_node = relu_elt.second; relu_node->op.type = OP_RELU6; - + /* delete elt node */ if (delete_node(graph, relu_node->index, elt_node->index) < 0) { @@ -572,20 +572,20 @@ static int fuse_relu_eltwise(ir_graph_t* graph) return -1; } } - + return 0; } static int fuse_bn_scale(ir_graph_t* graph) { /* get all bn-scale chain */ - std::vector> bn_scale_v; + std::vector > bn_scale_v; for (size_t i = 0; i < graph->node_num; i++) { ir_node_t* scale_node = get_ir_graph_node(graph, i); if (scale_node->op.type != OP_SCALE) continue; - + /*Check if it is a bn + scale*/ ir_tensor_t* bn_tensor = get_ir_graph_tensor(graph, scale_node->input_tensors[0]); ir_node_t* bn_node = get_ir_graph_node(graph, bn_tensor->producer); @@ -595,7 +595,7 @@ static int fuse_bn_scale(ir_graph_t* graph) } /* fused */ - for (auto& bn_scale:bn_scale_v) + for (auto& bn_scale : bn_scale_v) { ir_node_t* bn_node = bn_scale.first; ir_node_t* scale_node = bn_scale.second; @@ -610,7 +610,7 @@ static int fuse_bn_scale(ir_graph_t* graph) struct batchnorm_param* param = (struct batchnorm_param*)bn_node->op.param_mem; param->caffe_flavor = 0; - + /* delete scale node */ if (delete_node(graph, bn_node->index, scale_node->index) < 0) { @@ -618,20 +618,20 @@ static int fuse_bn_scale(ir_graph_t* graph) return -1; } } - + return 0; } static int fuse_conv_bn(ir_graph_t* graph) { /* get all conv-bn chain */ - std::vector> conv_bn_v; + std::vector > conv_bn_v; for (size_t i = 0; i < graph->node_num; i++) { ir_node_t* bn_node = get_ir_graph_node(graph, i); if (bn_node->op.type != OP_BATCHNORM) continue; - + /*Check if it is a conv + bn*/ ir_tensor_t* conv_tensor = get_ir_graph_tensor(graph, bn_node->input_tensors[0]); ir_node_t* conv_node = get_ir_graph_node(graph, conv_tensor->producer); @@ -641,23 +641,23 @@ static int fuse_conv_bn(ir_graph_t* graph) } /* fused */ - for (auto& conv_bn:conv_bn_v) + for (auto& conv_bn : conv_bn_v) { ir_node_t* conv_node = conv_bn.first; ir_node_t* bn_node = conv_bn.second; struct batchnorm_param* bn_param = (struct batchnorm_param*)bn_node->op.param_mem; ir_tensor_t* bn_mean = get_ir_graph_tensor(graph, bn_node->input_tensors[3]); - ir_tensor_t* bn_var = get_ir_graph_tensor(graph, bn_node->input_tensors[4]); + ir_tensor_t* bn_var = get_ir_graph_tensor(graph, bn_node->input_tensors[4]); float* mean = (float*)bn_mean->data; float* var = (float*)bn_var->data; float* gamma = NULL; float* beta = NULL; - if(!bn_param->caffe_flavor) + if (!bn_param->caffe_flavor) { ir_tensor_t* bn_gamma = get_ir_graph_tensor(graph, bn_node->input_tensors[1]); - ir_tensor_t* bn_beta = get_ir_graph_tensor(graph, bn_node->input_tensors[2]); + ir_tensor_t* bn_beta = get_ir_graph_tensor(graph, bn_node->input_tensors[2]); gamma = (float*)bn_gamma->data; beta = (float*)bn_beta->data; } @@ -665,9 +665,9 @@ static int fuse_conv_bn(ir_graph_t* graph) ir_tensor_t* bias_tensor = nullptr; if (conv_node->input_num > 2) bias_tensor = get_ir_graph_tensor(graph, conv_node->input_tensors[2]); - + weight_bn(graph, conv_node, mean, var, gamma, beta, bn_param->eps, bn_param->rescale_factor, bias_tensor); - + /* delete elt node */ if (delete_node(graph, conv_node->index, bn_node->index) < 0) { @@ -675,20 +675,20 @@ static int fuse_conv_bn(ir_graph_t* graph) return -1; } } - + return 0; } static int fuse_fc_bn(ir_graph_t* graph) { /* get all fc-bn chain */ - std::vector> fc_bn_v; + std::vector > fc_bn_v; for (size_t i = 0; i < graph->node_num; i++) { ir_node_t* bn_node = get_ir_graph_node(graph, i); if (bn_node->op.type != OP_BATCHNORM) continue; - + /*Check if it is a fc + bn*/ ir_tensor_t* fc_tensor = get_ir_graph_tensor(graph, bn_node->input_tensors[0]); ir_node_t* fc_node = get_ir_graph_node(graph, fc_tensor->producer); @@ -698,23 +698,23 @@ static int fuse_fc_bn(ir_graph_t* graph) } /* fused */ - for (auto& fc_bn:fc_bn_v) + for (auto& fc_bn : fc_bn_v) { ir_node_t* fc_node = fc_bn.first; ir_node_t* bn_node = fc_bn.second; struct batchnorm_param* bn_param = (struct batchnorm_param*)bn_node->op.param_mem; ir_tensor_t* bn_mean = get_ir_graph_tensor(graph, bn_node->input_tensors[3]); - ir_tensor_t* bn_var = get_ir_graph_tensor(graph, bn_node->input_tensors[4]); + ir_tensor_t* bn_var = get_ir_graph_tensor(graph, bn_node->input_tensors[4]); float* mean = (float*)bn_mean->data; float* var = (float*)bn_var->data; float* gamma = NULL; float* beta = NULL; - if(!bn_param->caffe_flavor) + if (!bn_param->caffe_flavor) { ir_tensor_t* bn_gamma = get_ir_graph_tensor(graph, bn_node->input_tensors[1]); - ir_tensor_t* bn_beta = get_ir_graph_tensor(graph, bn_node->input_tensors[2]); + ir_tensor_t* bn_beta = get_ir_graph_tensor(graph, bn_node->input_tensors[2]); gamma = (float*)bn_gamma->data; beta = (float*)bn_beta->data; } @@ -722,7 +722,7 @@ static int fuse_fc_bn(ir_graph_t* graph) ir_tensor_t* bias_tensor = nullptr; if (fc_node->input_num > 2) bias_tensor = get_ir_graph_tensor(graph, fc_node->input_tensors[2]); - + fc_weight_bn(graph, fc_node, mean, var, gamma, beta, bn_param->eps, bn_param->rescale_factor, bias_tensor); /* delete bn node */ @@ -732,14 +732,14 @@ static int fuse_fc_bn(ir_graph_t* graph) return -1; } } - + return 0; } static int fuse_conv_unsqueeze(ir_graph_t* graph) { /* get all unsqueeze conv|fc eltwise chain */ - std::vector> fused_nodes; + std::vector > fused_nodes; for (size_t i = 0; i < graph->node_num; i++) { ir_node_t* elt_node = get_ir_graph_node(graph, i); @@ -748,7 +748,7 @@ static int fuse_conv_unsqueeze(ir_graph_t* graph) struct eltwise_param* param = (struct eltwise_param*)elt_node->op.param_mem; if (elt_node->input_num != 2 || param->type != ELT_SUM) // unsqueeze and conv|fc continue; - + /* Check if it is a (unsqueeze conv|fc) + eltwise */ ir_tensor_t* conv_tensor = get_ir_graph_tensor(graph, elt_node->input_tensors[0]); ir_tensor_t* unsq_tensor = get_ir_graph_tensor(graph, elt_node->input_tensors[1]); @@ -766,7 +766,7 @@ static int fuse_conv_unsqueeze(ir_graph_t* graph) ir_node_t* conv_or_fc_node = fused_nodes[i][0]; ir_node_t* unsq_node = fused_nodes[i][1]; ir_node_t* elt_node = fused_nodes[i][2]; - + ir_tensor_t* bias_tensor = get_ir_graph_tensor(graph, unsq_node->input_tensors[0]); set_ir_node_input_tensor(conv_or_fc_node, conv_or_fc_node->input_num, bias_tensor); bias_tensor->consumer[0] = conv_or_fc_node->index; @@ -778,7 +778,7 @@ static int fuse_conv_unsqueeze(ir_graph_t* graph) fprintf(stderr, "delete node:%s failed.\n", unsq_node->name); return -1; } - + /* delete elt node */ if (delete_node(graph, conv_or_fc_node->index, elt_node->index) < 0) { @@ -786,7 +786,7 @@ static int fuse_conv_unsqueeze(ir_graph_t* graph) return -1; } } - + return 0; } diff --git a/tools/convert_tool/utils/graph_optimizer/graph_opt.hpp b/tools/convert_tool/utils/graph_optimizer/graph_opt.hpp index c867f853b..8d5c67e8d 100644 --- a/tools/convert_tool/utils/graph_optimizer/graph_opt.hpp +++ b/tools/convert_tool/utils/graph_optimizer/graph_opt.hpp @@ -7,21 +7,20 @@ #include "string.h" #include #include "math.h" -extern "C" -{ - #include "tengine/c_api.h" - #include "graph/graph.h" - #include "graph/node.h" - #include "graph/tensor.h" - #include "module/module.h" - #include "utility/log.h" - #include "utility/sys_port.h" - - #include "convolution_param.h" - #include "relu_param.h" - #include "eltwise_param.h" - #include "batchnorm_param.h" - #include "fc_param.h" +extern "C" { +#include "tengine/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "module/module.h" +#include "utility/log.h" +#include "utility/sys_port.h" + +#include "convolution_param.h" +#include "relu_param.h" +#include "eltwise_param.h" +#include "batchnorm_param.h" +#include "fc_param.h" } int graph_opt(graph_t graph); diff --git a/tools/convert_tool/utils/save_graph/save_graph.cpp b/tools/convert_tool/utils/save_graph/save_graph.cpp index 2a3ba3346..71a5efb8b 100644 --- a/tools/convert_tool/utils/save_graph/save_graph.cpp +++ b/tools/convert_tool/utils/save_graph/save_graph.cpp @@ -35,7 +35,7 @@ bool IsSaveString(void) { const char* env = std::getenv("TM_NO_STRING"); - if(env) + if (env) return false; else return true; @@ -45,7 +45,7 @@ bool IsSaveData(void) { const char* env = std::getenv("TM_FOR_BENCHMARK"); - if(env) + if (env) return false; else return true; @@ -53,7 +53,7 @@ bool IsSaveData(void) bool RegisterOpSaveMethod(const uint16_t& op_type, const op_save_t& save_func) { - if(op_save_map_.count(op_type)) + if (op_save_map_.count(op_type)) return false; op_save_map_[op_type] = save_func; @@ -61,7 +61,7 @@ bool RegisterOpSaveMethod(const uint16_t& op_type, const op_save_t& save_func) } tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tensor_t* tensor, - unsigned int tensor_id, unsigned int buffer_id) + unsigned int tensor_id, unsigned int buffer_id) { TM2_Tensor tm_tensor; tm_tensor.tensor_id = tensor_id; @@ -72,11 +72,11 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tenso bool tm_with_string = IsSaveString(); - if(tm_with_string) + if (tm_with_string) { std::string name = tensor->name; TM2_String tensor_name; - tensor_name.size = name.size() + 1; // including trailing \0 + tensor_name.size = name.size() + 1; // including trailing \0 tensor_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), tensor_name.size); tm_tensor.offset_s_tname = WriteTmObject(start_ptr, cur_pos, &tensor_name, sizeof(TM2_String)); } @@ -86,13 +86,13 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tenso /* Get the dims of the tensor */ int* dim = tensor->dims; size_t vector_size; - if(tensor->dim_num) + if (tensor->dim_num) { /* Write the vector of dims */ vector_size = sizeof(tm_size_t) + sizeof(int32_t) * tensor->dim_num; - TM2_Vector_dims* v_dims = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_dims = (TM2_Vector_dims*)malloc(vector_size); v_dims->v_num = tensor->dim_num; - for(unsigned int i = 0; i < tensor->dim_num; i++) + for (unsigned int i = 0; i < tensor->dim_num; i++) { v_dims->dims[i] = dim[i]; } @@ -103,10 +103,10 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tenso tm_tensor.offset_vd_dims = TM2_NOT_SET; /* Write the quant params */ - if(tensor->quant_param_num != 0) + if (tensor->quant_param_num != 0) { vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor->quant_param_num; - TM2_Vector_offsets* v_qtparams = ( TM2_Vector_offsets* )malloc(vector_size); + TM2_Vector_offsets* v_qtparams = (TM2_Vector_offsets*)malloc(vector_size); v_qtparams->v_num = tensor->quant_param_num; if (v_qtparams->v_num == 1) { @@ -117,7 +117,7 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tenso } else if (v_qtparams->v_num > 1) { - for(unsigned int i = 0; i < v_qtparams->v_num; i++) + for (unsigned int i = 0; i < v_qtparams->v_num; i++) { TM2_QuantParam qtparam; qtparam.zero_point = tensor->zp_list[i]; @@ -126,7 +126,6 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tenso v_qtparams->offsets[i] = WriteTmObject(start_ptr, cur_pos, &qtparam, sizeof(TM2_QuantParam)); } } - /* Write the vector of quant params */ tm_tensor.offect_vo_quantparams = WriteTmObject(start_ptr, cur_pos, v_qtparams, vector_size); @@ -139,21 +138,20 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tenso } tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, ir_graph_t* graph, ir_node_t* node, - name_map_t& tensor_name_map) + name_map_t& tensor_name_map) { - TM2_Node tm_node; - memset(&tm_node, 0 , sizeof(TM2_Node)); + memset(&tm_node, 0, sizeof(TM2_Node)); tm_node.node_id = node->index; tm_node.dynamic_shape = node->dynamic_shape; bool tm_with_string = IsSaveString(); - if(tm_with_string) + if (tm_with_string) { std::string name = node->name; TM2_String node_name; - node_name.size = name.size() + 1; // including trailing \0 + node_name.size = name.size() + 1; // including trailing \0 node_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), node_name.size); tm_node.offset_s_nname = WriteTmObject(start_ptr, cur_pos, &node_name, sizeof(TM2_String)); } @@ -163,13 +161,13 @@ tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, ir_graph_t unsigned int input_num = node->input_num; unsigned int output_num = node->output_num; - if(input_num) + if (input_num) { /* Write the vector of input indices */ size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * input_num; - TM2_Vector_indices* v_input_indices = ( TM2_Vector_indices* )malloc(vector_size); + TM2_Vector_indices* v_input_indices = (TM2_Vector_indices*)malloc(vector_size); v_input_indices->v_num = input_num; - for(unsigned int i = 0; i < input_num; i++) + for (unsigned int i = 0; i < input_num; i++) { ir_tensor_t* p_tensor = get_ir_graph_tensor(graph, node->input_tensors[i]); v_input_indices->indices[i] = tensor_name_map[p_tensor->name]; @@ -180,13 +178,13 @@ tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, ir_graph_t else tm_node.offset_vi_input_tensors = TM2_NOT_SET; - if(output_num) + if (output_num) { /* Write the vector of output indices */ size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * output_num; - TM2_Vector_indices* v_output_indices = ( TM2_Vector_indices* )malloc(vector_size); + TM2_Vector_indices* v_output_indices = (TM2_Vector_indices*)malloc(vector_size); v_output_indices->v_num = output_num; - for(unsigned int i = 0; i < output_num; i++) + for (unsigned int i = 0; i < output_num; i++) { ir_tensor_t* p_tensor = get_ir_graph_tensor(graph, node->output_tensors[i]); v_output_indices->indices[i] = tensor_name_map[p_tensor->name]; @@ -199,7 +197,7 @@ tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, ir_graph_t /* Write tm operator */ uint16_t op_type = node->op.type; - if(!op_save_map_.count(op_type)) + if (!op_save_map_.count(op_type)) { TLOG_ERR("cannot find save function for operator:%d \n", op_type); return false; @@ -230,12 +228,12 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, ir_gra bool tm_no_data = !IsSaveData(); /* Write the nodes */ size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * graph->node_num; - TM2_Vector_offsets* v_nodes = ( TM2_Vector_offsets* )malloc(vector_size); + TM2_Vector_offsets* v_nodes = (TM2_Vector_offsets*)malloc(vector_size); v_nodes->v_num = graph->node_num; - for(unsigned int i = 0; i < graph->node_num; i++) + for (unsigned int i = 0; i < graph->node_num; i++) { ir_node_t* p_node = get_ir_graph_node(graph, i); - for(unsigned int k = 0; k < p_node->output_num; k++) + for (unsigned int k = 0; k < p_node->output_num; k++) { ir_tensor_t* p_tensor = get_ir_graph_tensor(graph, p_node->output_tensors[k]); tensor_ptrs.push_back(p_tensor); @@ -249,12 +247,12 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, ir_gra /* Write the tensors */ vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor_num; - TM2_Vector_offsets* v_tensors = ( TM2_Vector_offsets* )malloc(vector_size); + TM2_Vector_offsets* v_tensors = (TM2_Vector_offsets*)malloc(vector_size); v_tensors->v_num = tensor_num; - for(unsigned int i = 0; i < tensor_num; i++) + for (unsigned int i = 0; i < tensor_num; i++) { ir_tensor_t* p_tensor = tensor_ptrs[i]; - if(p_tensor->tensor_type == TENSOR_TYPE_CONST) + if (p_tensor->tensor_type == TENSOR_TYPE_CONST) { // buf_ptrs.push_back(p_tensor->GetMemAddr()); buf_ptrs.push_back(p_tensor->data); // may cause bug @@ -269,14 +267,14 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, ir_gra /* Write the buffers */ vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * buffer_num; - TM2_Vector_offsets* v_buffers = ( TM2_Vector_offsets* )malloc(vector_size); + TM2_Vector_offsets* v_buffers = (TM2_Vector_offsets*)malloc(vector_size); v_buffers->v_num = buffer_num; - for(unsigned int i = 0; i < buffer_num; i++) + for (unsigned int i = 0; i < buffer_num; i++) { TM2_Buffer tm_buf; tm_buf.size = buf_sizes[i]; - if(tm_no_data) + if (tm_no_data) { /* TM2_FOR_BENCHMARK environment variable exists. Not write buf data into the tm file */ tm_buf.offset_data = TM2_NOT_SET; @@ -284,8 +282,7 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, ir_gra else { /* TM2_FOR_BENCHMARK environment variable does not exist */ - tm_buf.offset_data = - WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast(buf_ptrs[i]), tm_buf.size); + tm_buf.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast(buf_ptrs[i]), tm_buf.size); } v_buffers->offsets[i] = WriteTmObject(start_ptr, cur_pos, &tm_buf, sizeof(TM2_Buffer)); } @@ -294,9 +291,9 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, ir_gra /* Write the vector of input indices */ vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->input_num; - TM2_Vector_indices* v_input_indices = ( TM2_Vector_indices* )malloc(vector_size); + TM2_Vector_indices* v_input_indices = (TM2_Vector_indices*)malloc(vector_size); v_input_indices->v_num = graph->input_num; - for(unsigned int i = 0; i < graph->input_num; i++) + for (unsigned int i = 0; i < graph->input_num; i++) { v_input_indices->indices[i] = graph->input_nodes[i]; } @@ -304,9 +301,9 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, ir_gra /* Write the vector of output indices */ vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->output_num; - TM2_Vector_indices* v_output_indices = ( TM2_Vector_indices* )malloc(vector_size); + TM2_Vector_indices* v_output_indices = (TM2_Vector_indices*)malloc(vector_size); v_output_indices->v_num = graph->output_num; - for(unsigned int i = 0; i < graph->output_num; i++) + for (unsigned int i = 0; i < graph->output_num; i++) { v_output_indices->indices[i] = graph->output_nodes[i]; } @@ -356,7 +353,7 @@ bool SaveModelIntoMem(void* start_ptr, ir_graph_t* graph, uint32_t* tm_model_siz /* Write the subgraphs */ /* Only 1 subgraph is supported currently */ size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * 1; - TM2_Vector_offsets* v_subgraphs = ( TM2_Vector_offsets* )malloc(vector_size); + TM2_Vector_offsets* v_subgraphs = (TM2_Vector_offsets*)malloc(vector_size); v_subgraphs->v_num = 1; v_subgraphs->offsets[0] = SaveTmSubgraph(start_ptr, &cur_pos, graph); @@ -382,17 +379,16 @@ int save_model(std::vector& addr_list, std::vector& size_list, ir_gr uint32_t malloc_size = TM_FILE_MAX_SIZE; const char* env = std::getenv("TM_FILE_MAX_SIZE"); - if(env) + if (env) malloc_size = std::atoi(env); - void* start_ptr = ( void* )malloc(malloc_size); - if(start_ptr == nullptr) + void* start_ptr = (void*)malloc(malloc_size); + if (start_ptr == nullptr) { - TLOG_ERR("Malloc memory failed: .\n",malloc_size); + TLOG_ERR("Malloc memory failed: .\n", malloc_size); return false; } - bool ret = SaveModelIntoMem(start_ptr, graph, &tm_model_size); addr_list.push_back(start_ptr); @@ -411,16 +407,16 @@ bool save_graph(graph_t graph, const char* fname) ir_graph_t* ir_graph = (ir_graph_t*)graph; /* Open the tengine model file */ int fd = open(fname, O_RDWR | O_CREAT | O_TRUNC, 0666); - if(fd == -1) + if (fd == -1) { - TLOG_ERR("Could not open %s\n",fname); + TLOG_ERR("Could not open %s\n", fname); return false; } std::vector addr_list; std::vector size_list; - if(!save_model(addr_list, size_list, ir_graph)) + if (!save_model(addr_list, size_list, ir_graph)) { close(fd); return false; @@ -433,7 +429,7 @@ bool save_graph(graph_t graph, const char* fname) close(fd); free(buf); - if(ret != size) + if (ret != size) return false; else return true; diff --git a/tools/convert_tool/utils/save_graph/save_graph.hpp b/tools/convert_tool/utils/save_graph/save_graph.hpp index 2267e01a5..cdc46b725 100644 --- a/tools/convert_tool/utils/save_graph/save_graph.hpp +++ b/tools/convert_tool/utils/save_graph/save_graph.hpp @@ -10,8 +10,7 @@ #include #include -extern "C" -{ +extern "C" { #include "tengine/c_api.h" #include "graph/graph.h" #include "graph/subgraph.h" @@ -22,8 +21,6 @@ extern "C" #include "serializer/tmfile/tm2_format.h" } - #include "tm2_op_save.hpp" - bool save_graph(graph_t graph, const char* fname); diff --git a/tools/convert_tool/utils/save_graph/tm2_generate.c b/tools/convert_tool/utils/save_graph/tm2_generate.c index 71db31f8b..4ba97d177 100644 --- a/tools/convert_tool/utils/save_graph/tm2_generate.c +++ b/tools/convert_tool/utils/save_graph/tm2_generate.c @@ -28,7 +28,7 @@ extern "C" { #endif -#define ALIGN(pos, alignbytes) (((pos) + ( alignbytes )-1) & ~(( alignbytes )-1)) +#define ALIGN(pos, alignbytes) (((pos) + (alignbytes)-1) & ~((alignbytes)-1)) uint32_t WriteTmFileAlign1(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size) { diff --git a/tools/convert_tool/utils/save_graph/tm2_op_save.cpp b/tools/convert_tool/utils/save_graph/tm2_op_save.cpp index d328c4749..c8b1d200f 100644 --- a/tools/convert_tool/utils/save_graph/tm2_op_save.cpp +++ b/tools/convert_tool/utils/save_graph/tm2_op_save.cpp @@ -27,8 +27,6 @@ // #include "utility/log.h" // #include "tengine_ir.h" - - inline void SetTmOperator(TM2_Operator* tm_op, const uint32_t op_type, const tm_uoffset_t offset) { tm_op->op_ver = TM2_OP_VER; @@ -272,9 +270,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_n TM2_PriorBoxParam tm_param; size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->min_size_num; - TM2_Vector_floats* v_minsizes = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_minsizes = (TM2_Vector_floats*)malloc(vector_size); v_minsizes->v_num = p->min_size_num; - for(unsigned int i = 0; i < p->min_size_num; i++) + for (unsigned int i = 0; i < p->min_size_num; i++) { v_minsizes->data[i] = p->min_size[i]; } @@ -282,9 +280,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_n free(v_minsizes); vector_size = sizeof(tm_size_t) + sizeof(float) * p->max_size_num; - TM2_Vector_floats* v_maxsizes = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_maxsizes = (TM2_Vector_floats*)malloc(vector_size); v_maxsizes->v_num = p->max_size_num; - for(unsigned int i = 0; i < p->max_size_num; i++) + for (unsigned int i = 0; i < p->max_size_num; i++) { v_maxsizes->data[i] = p->max_size[i]; } @@ -293,9 +291,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_n int variance_num = 4; // tengine lite does not set the variable. vector_size = sizeof(tm_size_t) + sizeof(float) * variance_num; - TM2_Vector_floats* v_variance = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_variance = (TM2_Vector_floats*)malloc(vector_size); v_variance->v_num = variance_num; - for(unsigned int i = 0; i < variance_num; i++) + for (unsigned int i = 0; i < variance_num; i++) { v_variance->data[i] = p->variance[i]; } @@ -303,9 +301,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_n free(v_variance); vector_size = sizeof(tm_size_t) + sizeof(float) * p->aspect_ratio_size; - TM2_Vector_floats* v_ratios = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_ratios = (TM2_Vector_floats*)malloc(vector_size); v_ratios->v_num = p->aspect_ratio_size; - for(unsigned int i = 0; i < p->aspect_ratio_size; i++) + for (unsigned int i = 0; i < p->aspect_ratio_size; i++) { v_ratios->data[i] = p->aspect_ratio[i]; } @@ -340,9 +338,9 @@ tm_uoffset_t SaveTmRegionOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_nod tm_param.nms_threshold = p->nms_threshold; size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->biases_num; - TM2_Vector_floats* v_biases = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_biases = (TM2_Vector_floats*)malloc(vector_size); v_biases->v_num = p->biases_num; - for(unsigned int i = 0; i < p->biases_num; i++) + for (unsigned int i = 0; i < p->biases_num; i++) { v_biases->data[i] = p->biases[i]; } @@ -387,36 +385,35 @@ tm_uoffset_t SaveTmReshapeOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_no { struct reshape_param* p = (struct reshape_param*)node->op.param_mem; TM2_ReshapeParam tm_param; - if(p->reverse) + if (p->reverse) tm_param.reverse = 1; else tm_param.reverse = 0; - if(p->is_mxnet) + if (p->is_mxnet) tm_param.is_mxnet = 1; else tm_param.is_mxnet = 0; - if(p->dim_size) + if (p->dim_size) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->dim_size; - TM2_Vector_dims* v_re_shape = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_re_shape = (TM2_Vector_dims*)malloc(vector_size); v_re_shape->v_num = p->dim_size; - for(unsigned int i = 0; i < p->dim_size; i++) + for (unsigned int i = 0; i < p->dim_size; i++) { v_re_shape->dims[i] = p->re_shape[i]; } tm_param.offset_re_shape = WriteTmObject(start_ptr, cur_pos, v_re_shape, vector_size); free(v_re_shape); } - else{ + else + { tm_param.offset_re_shape = TM2_NOT_SET; } - TM2_Operator tm_op; SetTmOperator(&tm_op, TM2_OPTYPE_RESHAPE, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ReshapeParam))); return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); - } tm_uoffset_t SaveTmResizeOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t* node) @@ -453,9 +450,9 @@ tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t TM2_RPNParam tm_param; size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->ratios->elem_num; - TM2_Vector_floats* v_ratios = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_ratios = (TM2_Vector_floats*)malloc(vector_size); v_ratios->v_num = p->ratios->elem_num; - for(unsigned int i = 0; i < p->ratios->elem_num; i++) + for (unsigned int i = 0; i < p->ratios->elem_num; i++) { v_ratios->data[i] = *(float*)get_vector_data(p->ratios, i); } @@ -463,9 +460,9 @@ tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t free(v_ratios); vector_size = sizeof(tm_size_t) + sizeof(float) * p->anchor_scales->elem_num; - TM2_Vector_floats* v_scales = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_scales = (TM2_Vector_floats*)malloc(vector_size); v_scales->v_num = p->anchor_scales->elem_num; - for(unsigned int i = 0; i < p->anchor_scales->elem_num; i++) + for (unsigned int i = 0; i < p->anchor_scales->elem_num; i++) { v_scales->data[i] = *(float*)get_vector_data(p->anchor_scales, i); } @@ -473,9 +470,9 @@ tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t free(v_scales); vector_size = sizeof(tm_size_t) + sizeof(float) * p->anchors_->elem_num * 4; - TM2_Vector_anchors* v_anchors = ( TM2_Vector_anchors* )malloc(vector_size); + TM2_Vector_anchors* v_anchors = (TM2_Vector_anchors*)malloc(vector_size); v_anchors->v_num = p->anchors_->elem_num; - for(unsigned int i = 0; i < p->anchors_->elem_num; i++) + for (unsigned int i = 0; i < p->anchors_->elem_num; i++) { v_anchors->data[i][0] = ((Anchor_t*)get_vector_data(p->anchors_, i))->x0; v_anchors->data[i][1] = ((Anchor_t*)get_vector_data(p->anchors_, i))->y0; @@ -523,12 +520,12 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node tm_param.isonnx = p->isonnx; tm_param.ismxnet = p->ismxnet; - if(p->slice_point_ && p->slice_point_->elem_num) + if (p->slice_point_ && p->slice_point_->elem_num) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->slice_point_->elem_num; - TM2_Vector_dims* v_slice_points = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_slice_points = (TM2_Vector_dims*)malloc(vector_size); v_slice_points->v_num = p->slice_point_->elem_num; - for(unsigned int i = 0; i < p->slice_point_->elem_num; i++) + for (unsigned int i = 0; i < p->slice_point_->elem_num; i++) { v_slice_points->dims[i] = *(int32_t*)get_vector_data(p->slice_point_, i); } @@ -538,12 +535,12 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node else tm_param.offset_vi_slice_points = TM2_NOT_SET; - if(p->begin_ && p->begin_->elem_num) + if (p->begin_ && p->begin_->elem_num) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->begin_->elem_num; - TM2_Vector_dims* v_begins = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_begins = (TM2_Vector_dims*)malloc(vector_size); v_begins->v_num = p->begin_->elem_num; - for(unsigned int i = 0; i < p->begin_->elem_num; i++) + for (unsigned int i = 0; i < p->begin_->elem_num; i++) { v_begins->dims[i] = *(int32_t*)get_vector_data(p->begin_, i); } @@ -553,12 +550,12 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node else tm_param.offset_vi_begins = TM2_NOT_SET; - if(p->size_ && p->size_->elem_num) + if (p->size_ && p->size_->elem_num) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->size_->elem_num; - TM2_Vector_dims* v_sizes = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_sizes = (TM2_Vector_dims*)malloc(vector_size); v_sizes->v_num = p->size_->elem_num; - for(unsigned int i = 0; i < p->size_->elem_num; i++) + for (unsigned int i = 0; i < p->size_->elem_num; i++) { v_sizes->dims[i] = *(int32_t*)get_vector_data(p->size_, i); } @@ -568,7 +565,6 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node else tm_param.offset_vi_sizes = TM2_NOT_SET; - TM2_Operator tm_op; SetTmOperator(&tm_op, TM2_OPTYPE_SLICE, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SliceParam))); return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); @@ -589,24 +585,27 @@ tm_uoffset_t SaveTmSplitOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node { struct split_param* p = (struct split_param*)node->op.param_mem; TM2_SplitParam tm_param; - if(p->is_caffe) + if (p->is_caffe) tm_param.is_caffe = 1; else tm_param.is_caffe = 0; - if(p->is_onnx){ + if (p->is_onnx) + { tm_param.is_onnx = 1; - } else { + } + else + { tm_param.is_onnx = 0; } - if(!p->is_caffe) + if (!p->is_caffe) { - if(p->is_onnx) + if (p->is_onnx) tm_param.axis = p->axis; size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->split_sizes_->elem_num; - TM2_Vector_dims* v_split_sizes = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_split_sizes = (TM2_Vector_dims*)malloc(vector_size); v_split_sizes->v_num = p->split_sizes_->elem_num; - for(unsigned int i = 0; i < p->split_sizes_->elem_num; i++) + for (unsigned int i = 0; i < p->split_sizes_->elem_num; i++) { v_split_sizes->dims[i] = *(int32_t*)get_vector_data(p->split_sizes_, i); } @@ -633,9 +632,9 @@ tm_uoffset_t SaveTmDetectionPostProcessOp(void* const start_ptr, tm_uoffset_t* c int param_scales_num = 4; size_t vector_size = sizeof(tm_size_t) + sizeof(float) * param_scales_num; - TM2_Vector_floats* v_scales = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_scales = (TM2_Vector_floats*)malloc(vector_size); v_scales->v_num = param_scales_num; - for(unsigned int i = 0; i < param_scales_num; i++) + for (unsigned int i = 0; i < param_scales_num; i++) { v_scales->data[i] = p->scales[i]; } @@ -777,7 +776,7 @@ tm_uoffset_t SaveTmTopKV2Op(void* const start_ptr, tm_uoffset_t* cur_pos, ir_nod TM2_TopKV2Param tm_param; tm_param.k = p->k; - if(p->sorted) + if (p->sorted) tm_param.sorted = 1; else tm_param.sorted = 0; @@ -989,7 +988,7 @@ tm_uoffset_t SaveTmExpanddimsOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir struct expanddims_param* p = (struct expanddims_param*)node->op.param_mem; TM2_ExpanddimsParam tm_param; - tm_param.axis= p->axis; + tm_param.axis = p->axis; TM2_Operator tm_op; SetTmOperator(&tm_op, TM2_OPTYPE_EXPANDDIMS, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ExpanddimsParam))); @@ -1170,19 +1169,20 @@ tm_uoffset_t SaveTmTransposeOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_ { struct transpose_param* p = (struct transpose_param*)node->op.param_mem; TM2_TransposeParam tm_param; - if(p->tr_shape_size) + if (p->tr_shape_size) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->tr_shape_size; - TM2_Vector_dims* v_re_shape = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_re_shape = (TM2_Vector_dims*)malloc(vector_size); v_re_shape->v_num = p->tr_shape_size; - for(unsigned int i = 0; i < p->tr_shape_size; i++) + for (unsigned int i = 0; i < p->tr_shape_size; i++) { v_re_shape->dims[i] = p->tr_shape[i]; } tm_param.offset_tr_shape = WriteTmObject(start_ptr, cur_pos, v_re_shape, vector_size); free(v_re_shape); } - else{ + else + { tm_param.offset_tr_shape = TM2_NOT_SET; } TM2_Operator tm_op; @@ -1282,12 +1282,12 @@ tm_uoffset_t SaveTmUnsqueezeOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_ struct unsqueeze_param* p = (struct unsqueeze_param*)node->op.param_mem; TM2_UnsqueezeParam tm_param; - if(p->axises_size) + if (p->axises_size) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->axises_size; - TM2_Vector_dims* v_axises = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_axises = (TM2_Vector_dims*)malloc(vector_size); v_axises->v_num = p->axises_size; - for(unsigned int i = 0; i < p->axises_size; i++) + for (unsigned int i = 0; i < p->axises_size; i++) { v_axises->dims[i] = p->axises[i]; } @@ -1329,19 +1329,18 @@ tm_uoffset_t SaveTmMatMulOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_nod return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); } - -tm_uoffset_t SaveTmExpandOp(void* const start_ptr, tm_uoffset_t* cur_pos,ir_node_t* node) +tm_uoffset_t SaveTmExpandOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t* node) { struct expand_param* p = (struct expand_param*)node->op.param_mem; TM2_ExpandParam tm_param; memset(&tm_param, 0, sizeof(TM2_ExpandParam)); - if(p->dim_num) + if (p->dim_num) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->dim_num; - TM2_Vector_dims* v_axises = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_axises = (TM2_Vector_dims*)malloc(vector_size); v_axises->v_num = p->dim_num; - for(unsigned int i = 0; i < p->dim_num; i++) + for (unsigned int i = 0; i < p->dim_num; i++) { v_axises->dims[i] = p->ex_shape[i]; } @@ -1359,27 +1358,28 @@ tm_uoffset_t SaveTmExpandOp(void* const start_ptr, tm_uoffset_t* cur_pos,ir_node return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); } -tm_uoffset_t SaveTmSpatialTransformerOp(void* const start_ptr, tm_uoffset_t* cur_pos,ir_node_t* node) +tm_uoffset_t SaveTmSpatialTransformerOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t* node) { struct spatialtransformer_param* p = (struct spatialtransformer_param*)node->op.param_mem; TM2_SpatialTransformerParam tm_param; memset(&tm_param, 0, sizeof(TM2_SpatialTransformerParam)); tm_param.sampler_type = p->sampler_type; tm_param.transformer_type = p->transformer_type; - tm_param.shape_size = sizeof(p->target_shape)/sizeof(p->target_shape[0]); - if(tm_param.shape_size) + tm_param.shape_size = sizeof(p->target_shape) / sizeof(p->target_shape[0]); + if (tm_param.shape_size) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * tm_param.shape_size; - TM2_Vector_dims* v_ta_shape = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_ta_shape = (TM2_Vector_dims*)malloc(vector_size); v_ta_shape->v_num = tm_param.shape_size; - for(unsigned int i = 0; i < tm_param.shape_size; i++) + for (unsigned int i = 0; i < tm_param.shape_size; i++) { v_ta_shape->dims[i] = p->target_shape[i]; } tm_param.offset_ta_shape = WriteTmObject(start_ptr, cur_pos, v_ta_shape, vector_size); free(v_ta_shape); } - else{ + else + { tm_param.offset_ta_shape = TM2_NOT_SET; } @@ -1387,187 +1387,186 @@ tm_uoffset_t SaveTmSpatialTransformerOp(void* const start_ptr, tm_uoffset_t* cur memset(&tm_op, 0, sizeof(TM2_Operator)); SetTmOperator(&tm_op, TM2_OPTYPE_SPATIALTRANSFORMER, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SpatialTransformerParam))); return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); - } op_save_t SaveTmOpFunc(uint32_t op_type) { - switch(op_type) + switch (op_type) { - case OP_BATCHNORM: - return SaveTmBatchNormOp; - case OP_CONCAT: - return SaveTmConcatOp; - case OP_CONST: - return SaveTmConstOp; - case OP_CONV: - return SaveTmConvOp; - case OP_DECONV: - return SaveTmDeconvOp; - case OP_DETECTION_OUTPUT: - return SaveTmDetectionOutputOp; - case OP_DROPOUT: - return SaveTmDropoutOp; - case OP_ELTWISE: - return SaveTmEltwiseOp; - case OP_FLATTEN: - return SaveTmFlattenOp; - case OP_FC: - return SaveTmFCOp; - case OP_INPUT: - return SaveTmInputOp; - case OP_LRN: - return SaveTmLRNOp; - case OP_NORMALIZE: - return SaveTmNormalizeOp; - case OP_PERMUTE: - return SaveTmPermuteOp; - case OP_POOL: - return SaveTmPoolingOp; - case OP_PRELU: - return SaveTmPreluOp; - case OP_PRIORBOX: - return SaveTmPriorBoxOp; - case OP_REGION: - return SaveTmRegionOp; - case OP_RELU: - return SaveTmReLuOp; - case OP_RELU6: - return SaveTmRelu6Op; - case OP_REORG: - return SaveTmReorgOp; - case OP_RESHAPE: - return SaveTmReshapeOp; - case OP_ROIPOOLING: - return SaveTmROIPoolingOp; - case OP_RPN: - return SaveTmRPNOp; - case OP_SCALE: - return SaveTmScaleOp; - case OP_SLICE: - return SaveTmSliceOp; - case OP_SOFTMAX: - return SaveTmSoftmaxOp; - case OP_SPLIT: - return SaveTmSplitOp; - case OP_DETECTION_POSTPROCESS: - return SaveTmDetectionPostProcessOp; - case OP_GEMM: - return SaveTmGemmOp; - case OP_LOGISTIC: - return SaveTmLogisticOp; - case OP_LSTM: - return SaveTmLstmOp; - case OP_RNN: - return SaveTmRnnOp; - case OP_TANH: - return SaveTmTanhOp; - case OP_SIGMOID: - return SaveTmSigmoidOp; - case OP_SQUEEZE: - return SaveTmSqueezeOp; - case OP_SWAP_AXIS: - return SaveTmSwapAxisOp; - case OP_GRU: - return SaveTmGruOp; - case OP_ARGMAX: - return SaveTmArgMaxOp; - case OP_ARGMIN: - return SaveTmArgMinOp; - case OP_TOPKV2: - return SaveTmTopKV2Op; - case OP_PAD: - return SaveTmPadOp; - case OP_STRIDED_SLICE: - return SaveTmStridedSliceOp; - case OP_REDUCTION: - return SaveTmReductionOp; - case OP_UPSAMPLE: - return SaveTmUpsampleOp; - case OP_SHUFFLECHANNEL: - return SaveTmShuffleChannelOp; - case OP_SPACETOBATCHND: - return SaveTmSpaceToBatchNDOp; - case OP_BATCHTOSPACEND: - return SaveTmBatchToSpaceNDOp; - case OP_RESIZE: - return SaveTmResizeOp; - case OP_CROP: - return SaveTmCropOp; - case OP_ROIALIGN: - return SaveTmRoialignOp; - case OP_PSROIPOOLING: - return SaveTmPsroipoolingOp; - case OP_EXPANDDIMS: - return SaveTmExpanddimsOp; - case OP_UNARY: - return SaveTmUnaryOp; - case OP_NOOP: - return SaveTmNoopOp; - case OP_THRESHOLD: - return SaveTmThresholdOp; - case OP_HARDSIGMOID: - return SaveTmHardsigmoidOp; - case OP_EMBEDDING: - return SaveTmEmbedOp; - case OP_INSTANCENORM: - return SaveTmInstanceNormOp; - case OP_MVN: - return SaveTmMVNOp; - case OP_CAST: - return SaveTmCastOp; - case OP_HARDSWISH: - return SaveTmHardSwishOp; - case OP_INTERP: - return SaveTmInterpOp; - case OP_SELU: - return SaveTmSeluOp; - case OP_ELU: - return SaveTmEluOp; - case OP_BROADMUL: - return SaveTmBroadMulOp; - case OP_LOGICAL: - return SaveTmLogicalOp; - case OP_GATHER: - return SaveTmGatherOp; - case OP_TRANSPOSE: - return SaveTmTransposeOp; - case OP_COMPARISON: - return SaveTmComparisonOp; - case OP_REVERSE: - return SaveTmReverseOp; - case OP_SPACETODEPTH: - return SaveTmSpaceToDepthOp; - case OP_DEPTHTOSPACE: - return SaveTmDepthToSpaceOp; - case OP_SQUAREDDIFFERENCE: - return SaveTmSquaredDifferenceOp; - case OP_SPARSETODENSE: - return SaveTmSparseToDenseOp; - case OP_CEIL: - return SaveTmCeilOp; - case OP_ROUND: - return SaveTmRoundOp; - case OP_ZEROSLIKE: - return SaveTmZerosLikeOp; - case OP_CLIP: - return SaveTmClipOp; - case OP_REDUCEL2: - return SaveTmReduceL2Op; - case OP_UNSQUEEZE: - return SaveTmUnsqueezeOp; - case OP_MEAN: - return SaveTmMeanOp; - case OP_MATMUL: - return SaveTmMatMulOp; - case OP_MISH: - return SaveTmMishOp; - case OP_SPATIALTRANSFORMER: - return SaveTmSpatialTransformerOp; - case OP_EXPAND: - return SaveTmExpandOp; - default: - // fprintf(stderr, "Operator #%d not supported in tengine model yet\n",op_type); - return nullptr; + case OP_BATCHNORM: + return SaveTmBatchNormOp; + case OP_CONCAT: + return SaveTmConcatOp; + case OP_CONST: + return SaveTmConstOp; + case OP_CONV: + return SaveTmConvOp; + case OP_DECONV: + return SaveTmDeconvOp; + case OP_DETECTION_OUTPUT: + return SaveTmDetectionOutputOp; + case OP_DROPOUT: + return SaveTmDropoutOp; + case OP_ELTWISE: + return SaveTmEltwiseOp; + case OP_FLATTEN: + return SaveTmFlattenOp; + case OP_FC: + return SaveTmFCOp; + case OP_INPUT: + return SaveTmInputOp; + case OP_LRN: + return SaveTmLRNOp; + case OP_NORMALIZE: + return SaveTmNormalizeOp; + case OP_PERMUTE: + return SaveTmPermuteOp; + case OP_POOL: + return SaveTmPoolingOp; + case OP_PRELU: + return SaveTmPreluOp; + case OP_PRIORBOX: + return SaveTmPriorBoxOp; + case OP_REGION: + return SaveTmRegionOp; + case OP_RELU: + return SaveTmReLuOp; + case OP_RELU6: + return SaveTmRelu6Op; + case OP_REORG: + return SaveTmReorgOp; + case OP_RESHAPE: + return SaveTmReshapeOp; + case OP_ROIPOOLING: + return SaveTmROIPoolingOp; + case OP_RPN: + return SaveTmRPNOp; + case OP_SCALE: + return SaveTmScaleOp; + case OP_SLICE: + return SaveTmSliceOp; + case OP_SOFTMAX: + return SaveTmSoftmaxOp; + case OP_SPLIT: + return SaveTmSplitOp; + case OP_DETECTION_POSTPROCESS: + return SaveTmDetectionPostProcessOp; + case OP_GEMM: + return SaveTmGemmOp; + case OP_LOGISTIC: + return SaveTmLogisticOp; + case OP_LSTM: + return SaveTmLstmOp; + case OP_RNN: + return SaveTmRnnOp; + case OP_TANH: + return SaveTmTanhOp; + case OP_SIGMOID: + return SaveTmSigmoidOp; + case OP_SQUEEZE: + return SaveTmSqueezeOp; + case OP_SWAP_AXIS: + return SaveTmSwapAxisOp; + case OP_GRU: + return SaveTmGruOp; + case OP_ARGMAX: + return SaveTmArgMaxOp; + case OP_ARGMIN: + return SaveTmArgMinOp; + case OP_TOPKV2: + return SaveTmTopKV2Op; + case OP_PAD: + return SaveTmPadOp; + case OP_STRIDED_SLICE: + return SaveTmStridedSliceOp; + case OP_REDUCTION: + return SaveTmReductionOp; + case OP_UPSAMPLE: + return SaveTmUpsampleOp; + case OP_SHUFFLECHANNEL: + return SaveTmShuffleChannelOp; + case OP_SPACETOBATCHND: + return SaveTmSpaceToBatchNDOp; + case OP_BATCHTOSPACEND: + return SaveTmBatchToSpaceNDOp; + case OP_RESIZE: + return SaveTmResizeOp; + case OP_CROP: + return SaveTmCropOp; + case OP_ROIALIGN: + return SaveTmRoialignOp; + case OP_PSROIPOOLING: + return SaveTmPsroipoolingOp; + case OP_EXPANDDIMS: + return SaveTmExpanddimsOp; + case OP_UNARY: + return SaveTmUnaryOp; + case OP_NOOP: + return SaveTmNoopOp; + case OP_THRESHOLD: + return SaveTmThresholdOp; + case OP_HARDSIGMOID: + return SaveTmHardsigmoidOp; + case OP_EMBEDDING: + return SaveTmEmbedOp; + case OP_INSTANCENORM: + return SaveTmInstanceNormOp; + case OP_MVN: + return SaveTmMVNOp; + case OP_CAST: + return SaveTmCastOp; + case OP_HARDSWISH: + return SaveTmHardSwishOp; + case OP_INTERP: + return SaveTmInterpOp; + case OP_SELU: + return SaveTmSeluOp; + case OP_ELU: + return SaveTmEluOp; + case OP_BROADMUL: + return SaveTmBroadMulOp; + case OP_LOGICAL: + return SaveTmLogicalOp; + case OP_GATHER: + return SaveTmGatherOp; + case OP_TRANSPOSE: + return SaveTmTransposeOp; + case OP_COMPARISON: + return SaveTmComparisonOp; + case OP_REVERSE: + return SaveTmReverseOp; + case OP_SPACETODEPTH: + return SaveTmSpaceToDepthOp; + case OP_DEPTHTOSPACE: + return SaveTmDepthToSpaceOp; + case OP_SQUAREDDIFFERENCE: + return SaveTmSquaredDifferenceOp; + case OP_SPARSETODENSE: + return SaveTmSparseToDenseOp; + case OP_CEIL: + return SaveTmCeilOp; + case OP_ROUND: + return SaveTmRoundOp; + case OP_ZEROSLIKE: + return SaveTmZerosLikeOp; + case OP_CLIP: + return SaveTmClipOp; + case OP_REDUCEL2: + return SaveTmReduceL2Op; + case OP_UNSQUEEZE: + return SaveTmUnsqueezeOp; + case OP_MEAN: + return SaveTmMeanOp; + case OP_MATMUL: + return SaveTmMatMulOp; + case OP_MISH: + return SaveTmMishOp; + case OP_SPATIALTRANSFORMER: + return SaveTmSpatialTransformerOp; + case OP_EXPAND: + return SaveTmExpandOp; + default: + // fprintf(stderr, "Operator #%d not supported in tengine model yet\n",op_type); + return nullptr; } } diff --git a/tools/convert_tool/utils/save_graph/tm2_op_save.hpp b/tools/convert_tool/utils/save_graph/tm2_op_save.hpp index 7eafbe566..79456b646 100644 --- a/tools/convert_tool/utils/save_graph/tm2_op_save.hpp +++ b/tools/convert_tool/utils/save_graph/tm2_op_save.hpp @@ -3,15 +3,13 @@ #include extern "C" { - #include "utility/vector.h" - #include "serializer/tmfile/tm2_format.h" - #include "tm2_generate.h" - #include "graph/node.h" - - #include "op_include.h" -} - +#include "utility/vector.h" +#include "serializer/tmfile/tm2_format.h" +#include "tm2_generate.h" +#include "graph/node.h" +#include "op_include.h" +} using op_save_t = std::function; op_save_t SaveTmOpFunc(uint32_t op_type); diff --git a/tools/quantize/compiler_fp16.h b/tools/quantize/compiler_fp16.h index 1857d7eec..d770707c2 100644 --- a/tools/quantize/compiler_fp16.h +++ b/tools/quantize/compiler_fp16.h @@ -48,7 +48,7 @@ extern "C" { #else #ifdef _MSC_VER -#pragma pack (push,1) +#pragma pack(push, 1) struct fp16_pack { unsigned short frac : 10; @@ -84,12 +84,12 @@ typedef struct fp16_pack __fp16; static inline float fp16_to_fp32(__fp16 data) { float f; - struct fp32_pack* fp32 = ( struct fp32_pack* )&f; + struct fp32_pack* fp32 = (struct fp32_pack*)&f; struct fp16_pack* fp16 = &data; int exp = fp16->exp; - if(exp == 31 && fp16->frac != 0) + if (exp == 31 && fp16->frac != 0) { // return __builtin_inf()-__builtin_inf(); fp32->sign = fp16->sign; @@ -99,28 +99,28 @@ static inline float fp16_to_fp32(__fp16 data) return f; } - if(exp == 31) + if (exp == 31) exp = 255; - if(exp == 0) + if (exp == 0) exp = 0; else exp = (exp - 15) + 127; fp32->exp = exp; fp32->sign = fp16->sign; - fp32->frac = (( int )fp16->frac) << 13; + fp32->frac = ((int)fp16->frac) << 13; return f; } static inline __fp16 fp32_to_fp16(float data) { - struct fp32_pack* fp32 = ( struct fp32_pack* )&data; + struct fp32_pack* fp32 = (struct fp32_pack*)&data; struct fp16_pack fp16; int exp = fp32->exp; - if(fp32->exp == 255 && fp32->frac != 0) + if (fp32->exp == 255 && fp32->frac != 0) { // NaN fp16.exp = 31; @@ -130,9 +130,9 @@ static inline __fp16 fp32_to_fp16(float data) return fp16; } - if((exp - 127) < -14) + if ((exp - 127) < -14) exp = 0; - else if((exp - 127) > 15) + else if ((exp - 127) > 15) exp = 31; else exp = exp - 127 + 15; diff --git a/tools/quantize/quant_save_graph.cpp b/tools/quantize/quant_save_graph.cpp index 0f8918fda..f53705076 100644 --- a/tools/quantize/quant_save_graph.cpp +++ b/tools/quantize/quant_save_graph.cpp @@ -1,1025 +1,1019 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: hhchen@openailab.com - */ - - -#include - -#include "quant_save_graph.hpp" -#include "compiler_fp16.h" - -#include "operator/prototype/convolution_param.h" -#include "operator/prototype/pooling_param.h" -#include "operator/prototype/relu_param.h" - - -void recursion_pass_through(struct graph* ir_graph, const char* layer_name, struct tensor* t, - std::tr1::unordered_map &layer_used, std::tr1::unordered_map &layer_scale, - std::tr1::unordered_map &layer_zeropoint, std::tr1::unordered_map &layer_pass) -{ - if (layer_pass[t->name] == false && layer_used[t->name] < 2) - { - t->scale = layer_scale[layer_name]; - t->zero_point = layer_zeropoint[layer_name]; - layer_scale[t->name] = layer_scale[layer_name]; - layer_zeropoint[t->name] = layer_zeropoint[layer_name]; - - uint32_t ir_node_idx = t->producer; - struct node* t_node = ir_graph->node_list[ir_node_idx]; - - std::string op_name = get_op_name_from_type(t_node->op.type); - bool poolTrue = false; - bool reluTrue = false; - if (op_name == "Pooling") - { - struct pool_param* pool_param = ( struct pool_param* )t_node->op.param_mem; - if (pool_param->pool_method == 0) - poolTrue = true; - } - else if (op_name == "ReLU") - { - struct relu_param* relu_param = ( struct relu_param* )t_node->op.param_mem; - if (relu_param->negative_slope == 0.f) - reluTrue = true; - } - if (op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" || - poolTrue || reluTrue) - { - struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]]; - if (layer_scale[t->name] != 0) - { - if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - recursion_pass_through(ir_graph, t->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass); - } - } - } - layer_pass[t->name] = true; - } -} - -int save_graph_u8_perlayer(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal) -{ - fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again\n"); - - /* Step 1 : create graph, load tengine model xxx.tmfile */ - struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file); - if (nullptr == ir_graph) - { - fprintf(stderr, "Create graph failed.\n"); - return -1; - } - fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again done.\n"); - - std::tr1::unordered_map layer_scale; - std::tr1::unordered_map layer_zeropoint; - - fprintf(stderr, "[Quant Tools Info]: Step 3, load calibration table file %s.\n", scale_file); - /* Step 2 : set activation quant scale value into ir_tensor */ - if (nullptr != scale_file) - { - std::ifstream scales(scale_file); - std::string line; - while (std::getline(scales, line)) - { - std::string layer_name; - float scale_val = 0.f; - float zero_point = 0.f; - size_t last = 0; - size_t index = line.find_first_of(' ', last); - size_t idx = line.find_last_of(' ', line.size()); - layer_name = line.substr(last, index - last); - last = index + 1; - scale_val = atof((line.substr(last, line.size() - last)).c_str()); - zero_point = atof((line.substr(idx + 1, line.size())).c_str()); - - layer_scale[layer_name] = scale_val; - layer_zeropoint[layer_name] = zero_point; - -// fprintf(stderr, "[%s] \tscale final %8.4f, zero point %8.4f\n", layer_name.c_str(), scale_val, zero_point); - } - } - - std::tr1::unordered_map layer_used; - for (int i = 0; i < ir_graph->node_num; i++) - { - struct node* ir_node = ir_graph->node_list[i]; - for (int j = 0; j < ir_node->input_num; j++ ) - { - std::string layern = ir_graph->tensor_list[ir_node->input_tensors[j]]->name; - layer_used[layern] ++; - } - } - - fprintf(stderr, "[Quant Tools Info]: Step 4, optimize the calibration table.\n"); - /* process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip .... */ - if (inplace == 0) - { - for (int i = 0; i < ir_graph->tensor_num; i++) - { - struct tensor* ir_tensor = ir_graph->tensor_list[i]; - if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - ir_tensor->scale = layer_scale[ir_tensor->name]; - ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; - } - } - } - else - { - std::tr1::unordered_map layer_pass; - for (int i = ir_graph->tensor_num-1; i >= 0; i--) - { - struct tensor* ir_tensor = ir_graph->tensor_list[i]; - if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - if (layer_pass[ir_tensor->name] == false) - { - uint32_t ir_node_idx = ir_tensor->producer; - struct node* t_node = ir_graph->node_list[ir_node_idx]; - - std::string op_name = get_op_name_from_type(t_node->op.type); - - bool poolTrue = false; - bool reluTrue = false; - if (op_name == "Pooling") - { - struct pool_param* pool_param = ( struct pool_param* )t_node->op.param_mem; - if (pool_param->pool_method == 0) - poolTrue = true; - } - else if (op_name == "ReLU") - { - struct relu_param* relu_param = ( struct relu_param* )t_node->op.param_mem; - if (relu_param->negative_slope == 0.f) - reluTrue = true; - } - - if (op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" || - op_name == "Slice" || poolTrue || reluTrue) - { - struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]]; - if (layer_scale[ir_tensor->name] != 0) - { - ir_tensor->scale = layer_scale[ir_tensor->name]; - ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; - - if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - recursion_pass_through(ir_graph, ir_tensor->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass); - } - } - } - else - { - ir_tensor->scale = layer_scale[ir_tensor->name]; - ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; - } - layer_pass[ir_tensor->name] = true; - } - } - } - } - - fprintf(stderr, "[Quant Tools Info]: Step 4, quantize activation tensor done.\n"); - - /* Set the params of acitvation ir_tensor */ - for (int i = 0; i < ir_graph->tensor_num; i++) - { - struct tensor* ir_tensor = ir_graph->tensor_list[i]; - if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - ir_tensor->data_type = TENGINE_DT_UINT8; - ir_tensor->elem_size = sizeof(uint8_t); - } - ir_tensor->quant_param_num = 1; - } - - /* Step 3 : set weight/bias quant scale value into ir_tensor, quant the weight params from Float32 to Int8 */ - for (int i = 0; i < ir_graph->node_num; i++) - { - struct node* noden = ir_graph->node_list[i]; - std::string op_name = get_op_name_from_type(noden->op.type); - - /* quantize the tensor data from fp32 to uint8 */ - if (op_name == "Convolution" || op_name == "FullyConnected" || op_name == "Deconvolution") - { - /* Step 3.1 : quant weight */ - struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]]; - - uint8_t * u8_weight_data = (uint8_t*)sys_malloc(weight_tensor->elem_num * sizeof(uint8_t)); - float* weight_data = (float*)weight_tensor->data; - - /* calculate the quant scale value of weight perchannel, scale = (min-max / 255) */ - float weight_max = 0; - float weight_min = 0; - float weight_scale = 0; - int weight_zero_point = 0; - - if (internal) - { - weight_scale = weight_tensor->scale; - weight_zero_point = weight_tensor->zero_point; - } - else - { - weight_max = *std::max_element(weight_data, weight_data + weight_tensor->elem_num); - weight_min = *std::min_element(weight_data, weight_data + weight_tensor->elem_num); - weight_scale = (weight_max - weight_min) / 255.f; - weight_zero_point = int(-weight_min/weight_scale); - } -// fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point); - - /* quantize the value of weight from Float32 to UInt8, value_u8 = (value_fp32 / scale).round().clip(0, 255) */ - for (int wi = 0; wi < weight_tensor->elem_num; wi++) - { - weight_data[wi] = roundf(weight_data[wi] / weight_scale + (float )weight_zero_point); - weight_data[wi] = weight_data[wi] > 255.f ? 255.f : weight_data[wi]; - weight_data[wi] = weight_data[wi] < 0.f ? 0.f : weight_data[wi]; - u8_weight_data[wi] = uint8_t(weight_data[wi]); - } - - weight_tensor->scale = weight_scale; - weight_tensor->zero_point = weight_zero_point; - weight_tensor->data_type = TENGINE_DT_UINT8; - weight_tensor->elem_size = sizeof(uint8_t); - weight_tensor->data = u8_weight_data; - - /* step 3.2 : quant bias */ - if (noden->input_num > 2) - { - struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]]; - struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]]; - - int* int32_bias_data = (int*)sys_malloc(bias_tensor->elem_num * bias_tensor->elem_size); - float* bias_data = (float*)bias_tensor->data; - - /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */ - float bias_scale = input_tensor->scale * weight_tensor->scale; - - /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */ - for (int bi = 0; bi < bias_tensor->elem_num; bi++) - { - if (bias_scale == 0) - int32_bias_data[bi] = 0; - else - { - bias_data[bi] = roundf(bias_data[bi] / bias_scale); - int32_bias_data[bi] = int(bias_data[bi]); - } - } - - bias_tensor->scale = bias_scale; - bias_tensor->data_type = TENGINE_DT_INT32; - bias_tensor->data = int32_bias_data; - -// fprintf(stderr, "[bias] scale final %8.4f\n", bias_scale); - } - } - /* quantize the tensor data from fp32 to fp16, for TIM-VX NPU IP */ - else if (op_name == "PReLU") - { - for (int j = 0; j < noden->input_num; j++) - { - struct tensor* in_tensor = ir_graph->tensor_list[noden->input_tensors[j]]; - if (in_tensor->tensor_type == TENSOR_TYPE_CONST) - { - float* fp32_data = (float*) in_tensor->data; - int data_elem = in_tensor->elem_num; - - __fp16* fp16_data = (__fp16*)sys_malloc(data_elem * sizeof(__fp16)); - - for (int k = 0; k < data_elem; k++) - { - fp16_data[k] = fp32_to_fp16(fp32_data[k]); - } - - in_tensor->data_type = TENGINE_DT_FP16; - in_tensor->data = fp16_data; - in_tensor->quant_param_num = 0; - } - } - } - else if (op_name == "Slice") - { - struct tensor* slice_input_tensor = get_ir_graph_tensor(ir_graph, noden->input_tensors[0]); - struct tensor* slice_output_tensor = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); - slice_output_tensor->scale = slice_input_tensor->scale; - slice_output_tensor->zero_point = slice_input_tensor->zero_point; - } - } - - fprintf(stderr, "[Quant Tools Info]: Step 5, quantize weight tensor done.\n"); - - if (!save_graph(ir_graph, output_file.c_str())) - { - fprintf(stderr, "save graph failed.\n"); - return -1; - } - - fprintf(stderr, "[Quant Tools Info]: Step 6, save UInt8 tmfile done, %s\n", output_file.c_str()); - - return 0; -} - -int save_graph_i8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal) -{ - fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again\n"); - - /* Step 1 : create graph, load tengine model xxx.tmfile */ - struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file); - if (nullptr == ir_graph) - { - fprintf(stderr, "Create graph failed.\n"); - return -1; - } - fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again done.\n"); - - std::tr1::unordered_map layer_scale; - std::tr1::unordered_map layer_zeropoint; - - fprintf(stderr, "[Quant Tools Info]: Step 3, load calibration table file %s.\n", scale_file); - /* Step 2 : set activation quant scale value into ir_tensor */ - if (nullptr != scale_file) - { - std::ifstream scales(scale_file); - std::string line; - while (std::getline(scales, line)) - { - std::string layer_name; - float scale_val = 0.f; - float zero_point = 0.f; - size_t last = 0; - size_t index = line.find_first_of(' ', last); - size_t idx = line.find_last_of(' ', line.size()); - layer_name = line.substr(last, index - last); - last = index + 1; - scale_val = atof((line.substr(last, line.size() - last)).c_str()); - zero_point = atof((line.substr(idx + 1, line.size())).c_str()); - - layer_scale[layer_name] = scale_val; - layer_zeropoint[layer_name] = zero_point; - -// fprintf(stderr, "[%s] \tscale final %8.4f, zero point %8.4f\n", layer_name.c_str(), scale_val, zero_point); - } - } - - std::tr1::unordered_map layer_used; - for (int i = 0; i < ir_graph->node_num; i++) - { - struct node* ir_node = ir_graph->node_list[i]; - for (int j = 0; j < ir_node->input_num; j++ ) - { - std::string layern = ir_graph->tensor_list[ir_node->input_tensors[j]]->name; - layer_used[layern] ++; - } - } - - fprintf(stderr, "[Quant Tools Info]: Step 4, optimize the calibration table.\n"); - /* process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip .... */ - if (inplace == 0) - { - for (int i = 0; i < ir_graph->tensor_num; i++) - { - struct tensor* ir_tensor = ir_graph->tensor_list[i]; - if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - ir_tensor->scale = layer_scale[ir_tensor->name]; - ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; - } - } - } - else - { - std::tr1::unordered_map layer_pass; - for (int i = ir_graph->tensor_num-1; i >= 0; i--) - { - struct tensor* ir_tensor = ir_graph->tensor_list[i]; - if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - if (layer_pass[ir_tensor->name] == false) - { - uint32_t ir_node_idx = ir_tensor->producer; - struct node* t_node = ir_graph->node_list[ir_node_idx]; - - std::string op_name = get_op_name_from_type(t_node->op.type); - - bool poolTrue = false; - bool reluTrue = false; - if (op_name == "Pooling") - { - struct pool_param* pool_param = ( struct pool_param* )t_node->op.param_mem; - if (pool_param->pool_method == 0) - poolTrue = true; - } - else if (op_name == "ReLU") - { - struct relu_param* relu_param = ( struct relu_param* )t_node->op.param_mem; - if (relu_param->negative_slope == 0.f) - reluTrue = true; - } - - if (op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" || - op_name == "Slice" || poolTrue || reluTrue) - { - struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]]; - if (layer_scale[ir_tensor->name] != 0) - { - ir_tensor->scale = layer_scale[ir_tensor->name]; - ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; - - if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - recursion_pass_through(ir_graph, ir_tensor->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass); - } - } - } - else - { - ir_tensor->scale = layer_scale[ir_tensor->name]; - ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; - } - layer_pass[ir_tensor->name] = true; - } - } - } - } - - fprintf(stderr, "[Quant Tools Info]: Step 4, quantize activation tensor done.\n"); - - /* Set the params of acitvation ir_tensor */ - for (int i = 0; i < ir_graph->tensor_num; i++) - { - struct tensor* ir_tensor = ir_graph->tensor_list[i]; - if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - ir_tensor->data_type = TENGINE_DT_INT8; - ir_tensor->elem_size = sizeof(int8_t); - } - ir_tensor->quant_param_num = 1; - } - - /* Step 3 : set weight/bias quant scale value into ir_tensor, quant the weight params from Float32 to Int8 */ - FILE* fp_weight = fopen("scale_weight.txt", "wb"); - FILE* fp_bias = fopen("scale_bias.txt", "wb"); - for (int i = 0; i < ir_graph->node_num; i++) - { - struct node* noden = ir_graph->node_list[i]; - std::string op_name = get_op_name_from_type(noden->op.type); - - /* quantize the tensor data from fp32 to uint8 */ - if (op_name == "Convolution" || op_name == "FullyConnected" || op_name == "Deconvolution") - { - /* Step 3.1 : quant weight */ - struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]]; - - int channel_num = weight_tensor->dims[0]; - int cstep = int(weight_tensor->elem_num / channel_num); - float* weight_data = ( float* )weight_tensor->data; - int8_t* i8_weight_data = ( int8_t* )sys_malloc(weight_tensor->elem_num * sizeof(int8_t)); - - float* weight_scale_list = ( float* )sys_malloc(channel_num * sizeof(float)); - int* weight_zp_list = ( int* )sys_malloc(channel_num * sizeof(int)); - - fprintf(fp_weight, "%s ", weight_tensor->name); - /* calculate the quant scale value of weight perchannel, scale = abs(min, max) / 127 */ - if (internal) - { - // TODO - } - else - { - for (int ch = 0; ch < channel_num; ch++) - { - float* weight_data_ch_start = weight_data + ch * cstep; - float* weight_data_ch_end = weight_data + (ch + 1) * cstep; - float weight_max = *std::max_element(weight_data_ch_start, weight_data_ch_end); - float weight_min = *std::min_element(weight_data_ch_start, weight_data_ch_end); - - weight_scale_list[ch] = std::max(abs(weight_max), abs(weight_min)) / 127.f; - weight_zp_list[ch] = 0; - fprintf(fp_weight, "%8.8f ", weight_scale_list[ch]); - } - fprintf(fp_weight, "\n"); - } -// fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point); - - /* quantize the value of weight from Float32 to Int8, value_i8 = (value_fp32 / scale).round().clip(-127, 127) */ - for (int ch = 0; ch < channel_num; ch++) - { - for (int j = 0; j < cstep; j++) - { - if (weight_data[ch * cstep + j] == 0 || weight_scale_list[ch] == 0) - i8_weight_data[ch * cstep + j] = 0; - else - { - float int8_data = round(weight_data[ch * cstep + j] / weight_scale_list[ch]); - int8_data = int8_data > 127.f ? 127.f : int8_data; - int8_data = int8_data < -127.f ? -127.f : int8_data; - i8_weight_data[ch * cstep + j] = int8_t(int8_data); - } - } - } - - weight_tensor->scale_list = weight_scale_list; - weight_tensor->zp_list = weight_zp_list; - weight_tensor->data_type = TENGINE_DT_INT8; - weight_tensor->elem_size = sizeof(int8_t); // int8, signed char - weight_tensor->data = i8_weight_data; - weight_tensor->quant_param_num = channel_num; - - /* step 3.2 : quant bias */ - if (noden->input_num > 2) - { - struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]]; - struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]]; - - float* bias_scale_list = ( float* )sys_malloc(bias_tensor->dims[0] * sizeof(float)); - int* bias_zp_list = ( int* )sys_malloc(bias_tensor->dims[0] * sizeof(int32_t)); - - float* bias_data = ( float* )bias_tensor->data; - int* int32_bias_data = ( int* )sys_malloc(bias_tensor->elem_num * sizeof(int32_t)); - - int bstep = int(bias_tensor->elem_num / channel_num); - - fprintf(fp_bias, "%s ", bias_tensor->name); - - /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */ - for (int ch = 0; ch < channel_num; ch++) - { - bias_scale_list[ch] = weight_scale_list[ch] * input_tensor->scale; - bias_zp_list[ch] = 0; - - fprintf(fp_bias, "%8.8f ", bias_scale_list[ch]); - } - fprintf(fp_bias, "\n"); - - /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */ - for (int ch = 0; ch < channel_num; ch++) - { - for (int bi = 0; bi < bstep; bi++) - { - if (bias_data[ch * bstep + bi] == 0 || bias_scale_list[ch] == 0) - int32_bias_data[ch * bstep + bi] = 0; - else - int32_bias_data[ch * bstep + bi] = int(round(bias_data[ch * bstep + bi] / bias_scale_list[ch])); - } - } - - bias_tensor->scale_list = bias_scale_list; - bias_tensor->zp_list = bias_zp_list; - bias_tensor->data_type = TENGINE_DT_INT32; - bias_tensor->elem_size = sizeof(int32_t); // int32, signed int - bias_tensor->data = int32_bias_data; - bias_tensor->quant_param_num = channel_num; - - // fprintf(stderr, "bias %8.8f \t%s\n", bias_scale_list[0], bias_tensor->name); - } - // fprintf(stderr, "\n"); - } - /* quantize the tensor data from fp32 to fp16, for TIM-VX NPU IP */ - else if (op_name == "PReLU") - { - for (int j = 0; j < noden->input_num; j++) - { - struct tensor* in_tensor = ir_graph->tensor_list[noden->input_tensors[j]]; - if (in_tensor->tensor_type == TENSOR_TYPE_CONST) - { - float* fp32_data = (float*) in_tensor->data; - int data_elem = in_tensor->elem_num; - - __fp16* fp16_data = (__fp16*)sys_malloc(data_elem * sizeof(__fp16)); - - for (int k = 0; k < data_elem; k++) - { - fp16_data[k] = fp32_to_fp16(fp32_data[k]); - } - - in_tensor->data_type = TENGINE_DT_FP16; - in_tensor->data = fp16_data; - in_tensor->quant_param_num = 0; - } - } - } - else if (op_name == "Slice") - { - struct tensor* slice_input_tensor = get_ir_graph_tensor(ir_graph, noden->input_tensors[0]); - struct tensor* slice_output_tensor = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); - slice_output_tensor->scale = slice_input_tensor->scale; - slice_output_tensor->zero_point = slice_input_tensor->zero_point; - } - } - - fclose(fp_weight); - fclose(fp_bias); - - fprintf(stderr, "[Quant Tools Info]: Step 5, quantize weight tensor done.\n"); - - if (!save_graph(ir_graph, output_file.c_str())) - { - fprintf(stderr, "save graph failed.\n"); - return -1; - } - - fprintf(stderr, "[Quant Tools Info]: Step 6, save Int8 tmfile done, %s\n", output_file.c_str()); - - return 0; -} - -int save_graph_u8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal) -{ - fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again\n"); - - /* Step 1 : create graph, load tengine model xxx.tmfile */ - struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file); - if (nullptr == ir_graph) - { - fprintf(stderr, "Create graph failed.\n"); - return -1; - } - fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again done.\n"); - - std::tr1::unordered_map layer_scale; - std::tr1::unordered_map layer_zeropoint; - - fprintf(stderr, "[Quant Tools Info]: Step 3, load calibration table file %s.\n", scale_file); - /* Step 2 : set activation quant scale value into ir_tensor */ - if (nullptr != scale_file) - { - std::ifstream scales(scale_file); - std::string line; - while (std::getline(scales, line)) - { - std::string layer_name; - float scale_val = 0.f; - float zero_point = 0.f; - size_t last = 0; - size_t index = line.find_first_of(' ', last); - size_t idx = line.find_last_of(' ', line.size()); - layer_name = line.substr(last, index - last); - last = index + 1; - scale_val = atof((line.substr(last, line.size() - last)).c_str()); - zero_point = atof((line.substr(idx + 1, line.size())).c_str()); - - layer_scale[layer_name] = scale_val; - layer_zeropoint[layer_name] = zero_point; - -// fprintf(stderr, "[%s] \tscale final %8.4f, zero point %8.4f\n", layer_name.c_str(), scale_val, zero_point); - } - } - - std::tr1::unordered_map layer_used; - for (int i = 0; i < ir_graph->node_num; i++) - { - struct node* ir_node = ir_graph->node_list[i]; - for (int j = 0; j < ir_node->input_num; j++ ) - { - std::string layern = ir_graph->tensor_list[ir_node->input_tensors[j]]->name; - layer_used[layern] ++; - } - } - - fprintf(stderr, "[Quant Tools Info]: Step 4, optimize the calibration table.\n"); - /* process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip .... */ - if (inplace == 0) - { - for (int i = 0; i < ir_graph->tensor_num; i++) - { - struct tensor* ir_tensor = ir_graph->tensor_list[i]; - if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - ir_tensor->scale = layer_scale[ir_tensor->name]; - ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; - } - } - } - else - { - std::tr1::unordered_map layer_pass; - for (int i = ir_graph->tensor_num-1; i >= 0; i--) - { - struct tensor* ir_tensor = ir_graph->tensor_list[i]; - if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - if (layer_pass[ir_tensor->name] == false) - { - uint32_t ir_node_idx = ir_tensor->producer; - struct node* t_node = ir_graph->node_list[ir_node_idx]; - - std::string op_name = get_op_name_from_type(t_node->op.type); - - bool poolTrue = false; - bool reluTrue = false; - if (op_name == "Pooling") - { - struct pool_param* pool_param = ( struct pool_param* )t_node->op.param_mem; - if (pool_param->pool_method == 0) - poolTrue = true; - } - else if (op_name == "ReLU") - { - struct relu_param* relu_param = ( struct relu_param* )t_node->op.param_mem; - if (relu_param->negative_slope == 0.f) - reluTrue = true; - } - - if (op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" || - op_name == "Slice" || poolTrue || reluTrue) - { - struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]]; - if (layer_scale[ir_tensor->name] != 0) - { - ir_tensor->scale = layer_scale[ir_tensor->name]; - ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; - - if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - recursion_pass_through(ir_graph, ir_tensor->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass); - } - } - } - else - { - ir_tensor->scale = layer_scale[ir_tensor->name]; - ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; - } - layer_pass[ir_tensor->name] = true; - } - } - } - } - - fprintf(stderr, "[Quant Tools Info]: Step 4, quantize activation tensor done.\n"); - - /* Set the params of acitvation ir_tensor */ - for (int i = 0; i < ir_graph->tensor_num; i++) - { - struct tensor* ir_tensor = ir_graph->tensor_list[i]; - if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - ir_tensor->data_type = TENGINE_DT_UINT8; - ir_tensor->elem_size = sizeof(uint8_t); - } - ir_tensor->quant_param_num = 1; - } - - /* Step 3 : set weight/bias quant scale value into ir_tensor, quant the weight params from Float32 to Int8 */ - FILE* fp_weight = fopen("scale_weight.txt", "wb"); - FILE* fp_bias = fopen("scale_bias.txt", "wb"); - for (int i = 0; i < ir_graph->node_num; i++) - { - struct node* noden = ir_graph->node_list[i]; - std::string op_name = get_op_name_from_type(noden->op.type); - - /* quantize the tensor data from fp32 to uint8 */ - if (op_name == "Convolution" ) - { - /* Step 3.1 : quant weight */ - struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]]; - - int channel_num = weight_tensor->dims[0]; - int cstep = int(weight_tensor->elem_num / channel_num); - float* weight_data = ( float* )weight_tensor->data; - int8_t* i8_weight_data = ( int8_t* )sys_malloc(weight_tensor->elem_num * sizeof(int8_t)); - - float* weight_scale_list = ( float* )sys_malloc(channel_num * sizeof(float)); - int* weight_zp_list = ( int* )sys_malloc(channel_num * sizeof(int)); - - fprintf(fp_weight, "%s ", weight_tensor->name); - /* calculate the quant scale value of weight perchannel, scale = abs(min, max) / 127 */ - if (internal) - { - // TODO - } - else - { - for (int ch = 0; ch < channel_num; ch++) - { - float* weight_data_ch_start = weight_data + ch * cstep; - float* weight_data_ch_end = weight_data + (ch + 1) * cstep; - float weight_max = *std::max_element(weight_data_ch_start, weight_data_ch_end); - float weight_min = *std::min_element(weight_data_ch_start, weight_data_ch_end); - - weight_scale_list[ch] = std::max(abs(weight_max), abs(weight_min)) / 127.f; - weight_zp_list[ch] = 0; - fprintf(fp_weight, "%8.8f ", weight_scale_list[ch]); - } - fprintf(fp_weight, "\n"); - } -// fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point); - - /* quantize the value of weight from Float32 to Int8, value_i8 = (value_fp32 / scale).round().clip(-127, 127) */ - for (int ch = 0; ch < channel_num; ch++) - { - for (int j = 0; j < cstep; j++) - { - if (weight_data[ch * cstep + j] == 0 || weight_scale_list[ch] == 0) - i8_weight_data[ch * cstep + j] = 0; - else - { - float int8_data = round(weight_data[ch * cstep + j] / weight_scale_list[ch]); - int8_data = int8_data > 127.f ? 127.f : int8_data; - int8_data = int8_data < -127.f ? -127.f : int8_data; - i8_weight_data[ch * cstep + j] = int8_t(int8_data); - } - } - } - - weight_tensor->scale_list = weight_scale_list; - weight_tensor->zp_list = weight_zp_list; - weight_tensor->data_type = TENGINE_DT_INT8; - weight_tensor->elem_size = sizeof(int8_t); // int8, signed char - weight_tensor->data = i8_weight_data; - weight_tensor->quant_param_num = channel_num; - - /* step 3.2 : quant bias */ - if (noden->input_num > 2) - { - struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]]; - struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]]; - - float* bias_scale_list = ( float* )sys_malloc(bias_tensor->dims[0] * sizeof(float)); - int* bias_zp_list = ( int* )sys_malloc(bias_tensor->dims[0] * sizeof(int32_t)); - - float* bias_data = ( float* )bias_tensor->data; - int* int32_bias_data = ( int* )sys_malloc(bias_tensor->elem_num * sizeof(int32_t)); - - int bstep = int(bias_tensor->elem_num / channel_num); - - fprintf(fp_bias, "%s ", bias_tensor->name); - - /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */ - for (int ch = 0; ch < channel_num; ch++) - { - bias_scale_list[ch] = weight_scale_list[ch] * input_tensor->scale; - bias_zp_list[ch] = 0; - - fprintf(fp_bias, "%8.8f ", bias_scale_list[ch]); - } - fprintf(fp_bias, "\n"); - - /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */ - for (int ch = 0; ch < channel_num; ch++) - { - for (int bi = 0; bi < bstep; bi++) - { - if (bias_data[ch * bstep + bi] == 0 || bias_scale_list[ch] == 0) - int32_bias_data[ch * bstep + bi] = 0; - else - int32_bias_data[ch * bstep + bi] = int(round(bias_data[ch * bstep + bi] / bias_scale_list[ch])); - } - } - - bias_tensor->scale_list = bias_scale_list; - bias_tensor->zp_list = bias_zp_list; - bias_tensor->data_type = TENGINE_DT_INT32; - bias_tensor->elem_size = sizeof(int32_t); // int32, signed int - bias_tensor->data = int32_bias_data; - bias_tensor->quant_param_num = channel_num; - - // fprintf(stderr, "bias %8.8f \t%s\n", bias_scale_list[0], bias_tensor->name); - } - // fprintf(stderr, "\n"); - } - else if (op_name == "FullyConnected" || op_name == "Deconvolution") - { - /* Step 3.1 : quant weight */ - struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]]; - - uint8_t * u8_weight_data = (uint8_t*)sys_malloc(weight_tensor->elem_num * sizeof(uint8_t)); - float* weight_data = (float*)weight_tensor->data; - - /* calculate the quant scale value of weight perchannel, scale = (min-max / 255) */ - float weight_max = 0; - float weight_min = 0; - float weight_scale = 0; - int weight_zero_point = 0; - - if (internal) - { - weight_scale = weight_tensor->scale; - weight_zero_point = weight_tensor->zero_point; - } - else - { - weight_max = *std::max_element(weight_data, weight_data + weight_tensor->elem_num); - weight_min = *std::min_element(weight_data, weight_data + weight_tensor->elem_num); - weight_scale = (weight_max - weight_min) / 255.f; - weight_zero_point = int(-weight_min/weight_scale); - } -// fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point); - - /* quantize the value of weight from Float32 to UInt8, value_u8 = (value_fp32 / scale).round().clip(0, 255) */ - for (int wi = 0; wi < weight_tensor->elem_num; wi++) - { - weight_data[wi] = roundf(weight_data[wi] / weight_scale + (float )weight_zero_point); - weight_data[wi] = weight_data[wi] > 255.f ? 255.f : weight_data[wi]; - weight_data[wi] = weight_data[wi] < 0.f ? 0.f : weight_data[wi]; - u8_weight_data[wi] = uint8_t(weight_data[wi]); - } - - weight_tensor->scale = weight_scale; - weight_tensor->zero_point = weight_zero_point; - weight_tensor->data_type = TENGINE_DT_UINT8; - weight_tensor->elem_size = sizeof(uint8_t); - weight_tensor->data = u8_weight_data; - - /* step 3.2 : quant bias */ - if (noden->input_num > 2) - { - struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]]; - struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]]; - - int* int32_bias_data = (int*)sys_malloc(bias_tensor->elem_num * bias_tensor->elem_size); - float* bias_data = (float*)bias_tensor->data; - - /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */ - float bias_scale = input_tensor->scale * weight_tensor->scale; - - /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */ - for (int bi = 0; bi < bias_tensor->elem_num; bi++) - { - if (bias_scale == 0) - int32_bias_data[bi] = 0; - else - { - bias_data[bi] = roundf(bias_data[bi] / bias_scale); - int32_bias_data[bi] = int(bias_data[bi]); - } - } - - bias_tensor->scale = bias_scale; - bias_tensor->data_type = TENGINE_DT_INT32; - bias_tensor->data = int32_bias_data; - -// fprintf(stderr, "[bias] scale final %8.4f\n", bias_scale); - } - } - /* quantize the tensor data from fp32 to fp16, for TIM-VX NPU IP */ - else if (op_name == "PReLU") - { - for (int j = 0; j < noden->input_num; j++) - { - struct tensor* in_tensor = ir_graph->tensor_list[noden->input_tensors[j]]; - if (in_tensor->tensor_type == TENSOR_TYPE_CONST) - { - float* fp32_data = (float*) in_tensor->data; - int data_elem = in_tensor->elem_num; - - __fp16* fp16_data = (__fp16*)sys_malloc(data_elem * sizeof(__fp16)); - - for (int k = 0; k < data_elem; k++) - { - fp16_data[k] = fp32_to_fp16(fp32_data[k]); - } - - in_tensor->data_type = TENGINE_DT_FP16; - in_tensor->data = fp16_data; - in_tensor->quant_param_num = 0; - } - } - } - else if (op_name == "Slice") - { - struct tensor* slice_input_tensor = get_ir_graph_tensor(ir_graph, noden->input_tensors[0]); - struct tensor* slice_output_tensor = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); - slice_output_tensor->scale = slice_input_tensor->scale; - slice_output_tensor->zero_point = slice_input_tensor->zero_point; - } - } - - fclose(fp_weight); - fclose(fp_bias); - - fprintf(stderr, "[Quant Tools Info]: Step 5, quantize weight tensor done.\n"); - - if (!save_graph(ir_graph, output_file.c_str())) - { - fprintf(stderr, "save graph failed.\n"); - return -1; - } - - fprintf(stderr, "[Quant Tools Info]: Step 6, save Int8 tmfile done, %s\n", output_file.c_str()); - - return 0; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include + +#include "quant_save_graph.hpp" +#include "compiler_fp16.h" + +#include "operator/prototype/convolution_param.h" +#include "operator/prototype/pooling_param.h" +#include "operator/prototype/relu_param.h" + +void recursion_pass_through(struct graph* ir_graph, const char* layer_name, struct tensor* t, + std::tr1::unordered_map& layer_used, std::tr1::unordered_map& layer_scale, + std::tr1::unordered_map& layer_zeropoint, std::tr1::unordered_map& layer_pass) +{ + if (layer_pass[t->name] == false && layer_used[t->name] < 2) + { + t->scale = layer_scale[layer_name]; + t->zero_point = layer_zeropoint[layer_name]; + layer_scale[t->name] = layer_scale[layer_name]; + layer_zeropoint[t->name] = layer_zeropoint[layer_name]; + + uint32_t ir_node_idx = t->producer; + struct node* t_node = ir_graph->node_list[ir_node_idx]; + + std::string op_name = get_op_name_from_type(t_node->op.type); + bool poolTrue = false; + bool reluTrue = false; + if (op_name == "Pooling") + { + struct pool_param* pool_param = (struct pool_param*)t_node->op.param_mem; + if (pool_param->pool_method == 0) + poolTrue = true; + } + else if (op_name == "ReLU") + { + struct relu_param* relu_param = (struct relu_param*)t_node->op.param_mem; + if (relu_param->negative_slope == 0.f) + reluTrue = true; + } + if (op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" || poolTrue || reluTrue) + { + struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]]; + if (layer_scale[t->name] != 0) + { + if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + recursion_pass_through(ir_graph, t->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass); + } + } + } + layer_pass[t->name] = true; + } +} + +int save_graph_u8_perlayer(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal) +{ + fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again\n"); + + /* Step 1 : create graph, load tengine model xxx.tmfile */ + struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file); + if (nullptr == ir_graph) + { + fprintf(stderr, "Create graph failed.\n"); + return -1; + } + fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again done.\n"); + + std::tr1::unordered_map layer_scale; + std::tr1::unordered_map layer_zeropoint; + + fprintf(stderr, "[Quant Tools Info]: Step 3, load calibration table file %s.\n", scale_file); + /* Step 2 : set activation quant scale value into ir_tensor */ + if (nullptr != scale_file) + { + std::ifstream scales(scale_file); + std::string line; + while (std::getline(scales, line)) + { + std::string layer_name; + float scale_val = 0.f; + float zero_point = 0.f; + size_t last = 0; + size_t index = line.find_first_of(' ', last); + size_t idx = line.find_last_of(' ', line.size()); + layer_name = line.substr(last, index - last); + last = index + 1; + scale_val = atof((line.substr(last, line.size() - last)).c_str()); + zero_point = atof((line.substr(idx + 1, line.size())).c_str()); + + layer_scale[layer_name] = scale_val; + layer_zeropoint[layer_name] = zero_point; + + // fprintf(stderr, "[%s] \tscale final %8.4f, zero point %8.4f\n", layer_name.c_str(), scale_val, zero_point); + } + } + + std::tr1::unordered_map layer_used; + for (int i = 0; i < ir_graph->node_num; i++) + { + struct node* ir_node = ir_graph->node_list[i]; + for (int j = 0; j < ir_node->input_num; j++) + { + std::string layern = ir_graph->tensor_list[ir_node->input_tensors[j]]->name; + layer_used[layern]++; + } + } + + fprintf(stderr, "[Quant Tools Info]: Step 4, optimize the calibration table.\n"); + /* process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip .... */ + if (inplace == 0) + { + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* ir_tensor = ir_graph->tensor_list[i]; + if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + ir_tensor->scale = layer_scale[ir_tensor->name]; + ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; + } + } + } + else + { + std::tr1::unordered_map layer_pass; + for (int i = ir_graph->tensor_num - 1; i >= 0; i--) + { + struct tensor* ir_tensor = ir_graph->tensor_list[i]; + if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + if (layer_pass[ir_tensor->name] == false) + { + uint32_t ir_node_idx = ir_tensor->producer; + struct node* t_node = ir_graph->node_list[ir_node_idx]; + + std::string op_name = get_op_name_from_type(t_node->op.type); + + bool poolTrue = false; + bool reluTrue = false; + if (op_name == "Pooling") + { + struct pool_param* pool_param = (struct pool_param*)t_node->op.param_mem; + if (pool_param->pool_method == 0) + poolTrue = true; + } + else if (op_name == "ReLU") + { + struct relu_param* relu_param = (struct relu_param*)t_node->op.param_mem; + if (relu_param->negative_slope == 0.f) + reluTrue = true; + } + + if (op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" || op_name == "Slice" || poolTrue || reluTrue) + { + struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]]; + if (layer_scale[ir_tensor->name] != 0) + { + ir_tensor->scale = layer_scale[ir_tensor->name]; + ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; + + if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + recursion_pass_through(ir_graph, ir_tensor->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass); + } + } + } + else + { + ir_tensor->scale = layer_scale[ir_tensor->name]; + ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; + } + layer_pass[ir_tensor->name] = true; + } + } + } + } + + fprintf(stderr, "[Quant Tools Info]: Step 4, quantize activation tensor done.\n"); + + /* Set the params of acitvation ir_tensor */ + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* ir_tensor = ir_graph->tensor_list[i]; + if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + ir_tensor->data_type = TENGINE_DT_UINT8; + ir_tensor->elem_size = sizeof(uint8_t); + } + ir_tensor->quant_param_num = 1; + } + + /* Step 3 : set weight/bias quant scale value into ir_tensor, quant the weight params from Float32 to Int8 */ + for (int i = 0; i < ir_graph->node_num; i++) + { + struct node* noden = ir_graph->node_list[i]; + std::string op_name = get_op_name_from_type(noden->op.type); + + /* quantize the tensor data from fp32 to uint8 */ + if (op_name == "Convolution" || op_name == "FullyConnected" || op_name == "Deconvolution") + { + /* Step 3.1 : quant weight */ + struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]]; + + uint8_t* u8_weight_data = (uint8_t*)sys_malloc(weight_tensor->elem_num * sizeof(uint8_t)); + float* weight_data = (float*)weight_tensor->data; + + /* calculate the quant scale value of weight perchannel, scale = (min-max / 255) */ + float weight_max = 0; + float weight_min = 0; + float weight_scale = 0; + int weight_zero_point = 0; + + if (internal) + { + weight_scale = weight_tensor->scale; + weight_zero_point = weight_tensor->zero_point; + } + else + { + weight_max = *std::max_element(weight_data, weight_data + weight_tensor->elem_num); + weight_min = *std::min_element(weight_data, weight_data + weight_tensor->elem_num); + weight_scale = (weight_max - weight_min) / 255.f; + weight_zero_point = int(-weight_min / weight_scale); + } + // fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point); + + /* quantize the value of weight from Float32 to UInt8, value_u8 = (value_fp32 / scale).round().clip(0, 255) */ + for (int wi = 0; wi < weight_tensor->elem_num; wi++) + { + weight_data[wi] = roundf(weight_data[wi] / weight_scale + (float)weight_zero_point); + weight_data[wi] = weight_data[wi] > 255.f ? 255.f : weight_data[wi]; + weight_data[wi] = weight_data[wi] < 0.f ? 0.f : weight_data[wi]; + u8_weight_data[wi] = uint8_t(weight_data[wi]); + } + + weight_tensor->scale = weight_scale; + weight_tensor->zero_point = weight_zero_point; + weight_tensor->data_type = TENGINE_DT_UINT8; + weight_tensor->elem_size = sizeof(uint8_t); + weight_tensor->data = u8_weight_data; + + /* step 3.2 : quant bias */ + if (noden->input_num > 2) + { + struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]]; + struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]]; + + int* int32_bias_data = (int*)sys_malloc(bias_tensor->elem_num * bias_tensor->elem_size); + float* bias_data = (float*)bias_tensor->data; + + /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */ + float bias_scale = input_tensor->scale * weight_tensor->scale; + + /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */ + for (int bi = 0; bi < bias_tensor->elem_num; bi++) + { + if (bias_scale == 0) + int32_bias_data[bi] = 0; + else + { + bias_data[bi] = roundf(bias_data[bi] / bias_scale); + int32_bias_data[bi] = int(bias_data[bi]); + } + } + + bias_tensor->scale = bias_scale; + bias_tensor->data_type = TENGINE_DT_INT32; + bias_tensor->data = int32_bias_data; + + // fprintf(stderr, "[bias] scale final %8.4f\n", bias_scale); + } + } + /* quantize the tensor data from fp32 to fp16, for TIM-VX NPU IP */ + else if (op_name == "PReLU") + { + for (int j = 0; j < noden->input_num; j++) + { + struct tensor* in_tensor = ir_graph->tensor_list[noden->input_tensors[j]]; + if (in_tensor->tensor_type == TENSOR_TYPE_CONST) + { + float* fp32_data = (float*)in_tensor->data; + int data_elem = in_tensor->elem_num; + + __fp16* fp16_data = (__fp16*)sys_malloc(data_elem * sizeof(__fp16)); + + for (int k = 0; k < data_elem; k++) + { + fp16_data[k] = fp32_to_fp16(fp32_data[k]); + } + + in_tensor->data_type = TENGINE_DT_FP16; + in_tensor->data = fp16_data; + in_tensor->quant_param_num = 0; + } + } + } + else if (op_name == "Slice") + { + struct tensor* slice_input_tensor = get_ir_graph_tensor(ir_graph, noden->input_tensors[0]); + struct tensor* slice_output_tensor = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); + slice_output_tensor->scale = slice_input_tensor->scale; + slice_output_tensor->zero_point = slice_input_tensor->zero_point; + } + } + + fprintf(stderr, "[Quant Tools Info]: Step 5, quantize weight tensor done.\n"); + + if (!save_graph(ir_graph, output_file.c_str())) + { + fprintf(stderr, "save graph failed.\n"); + return -1; + } + + fprintf(stderr, "[Quant Tools Info]: Step 6, save UInt8 tmfile done, %s\n", output_file.c_str()); + + return 0; +} + +int save_graph_i8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal) +{ + fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again\n"); + + /* Step 1 : create graph, load tengine model xxx.tmfile */ + struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file); + if (nullptr == ir_graph) + { + fprintf(stderr, "Create graph failed.\n"); + return -1; + } + fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again done.\n"); + + std::tr1::unordered_map layer_scale; + std::tr1::unordered_map layer_zeropoint; + + fprintf(stderr, "[Quant Tools Info]: Step 3, load calibration table file %s.\n", scale_file); + /* Step 2 : set activation quant scale value into ir_tensor */ + if (nullptr != scale_file) + { + std::ifstream scales(scale_file); + std::string line; + while (std::getline(scales, line)) + { + std::string layer_name; + float scale_val = 0.f; + float zero_point = 0.f; + size_t last = 0; + size_t index = line.find_first_of(' ', last); + size_t idx = line.find_last_of(' ', line.size()); + layer_name = line.substr(last, index - last); + last = index + 1; + scale_val = atof((line.substr(last, line.size() - last)).c_str()); + zero_point = atof((line.substr(idx + 1, line.size())).c_str()); + + layer_scale[layer_name] = scale_val; + layer_zeropoint[layer_name] = zero_point; + + // fprintf(stderr, "[%s] \tscale final %8.4f, zero point %8.4f\n", layer_name.c_str(), scale_val, zero_point); + } + } + + std::tr1::unordered_map layer_used; + for (int i = 0; i < ir_graph->node_num; i++) + { + struct node* ir_node = ir_graph->node_list[i]; + for (int j = 0; j < ir_node->input_num; j++) + { + std::string layern = ir_graph->tensor_list[ir_node->input_tensors[j]]->name; + layer_used[layern]++; + } + } + + fprintf(stderr, "[Quant Tools Info]: Step 4, optimize the calibration table.\n"); + /* process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip .... */ + if (inplace == 0) + { + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* ir_tensor = ir_graph->tensor_list[i]; + if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + ir_tensor->scale = layer_scale[ir_tensor->name]; + ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; + } + } + } + else + { + std::tr1::unordered_map layer_pass; + for (int i = ir_graph->tensor_num - 1; i >= 0; i--) + { + struct tensor* ir_tensor = ir_graph->tensor_list[i]; + if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + if (layer_pass[ir_tensor->name] == false) + { + uint32_t ir_node_idx = ir_tensor->producer; + struct node* t_node = ir_graph->node_list[ir_node_idx]; + + std::string op_name = get_op_name_from_type(t_node->op.type); + + bool poolTrue = false; + bool reluTrue = false; + if (op_name == "Pooling") + { + struct pool_param* pool_param = (struct pool_param*)t_node->op.param_mem; + if (pool_param->pool_method == 0) + poolTrue = true; + } + else if (op_name == "ReLU") + { + struct relu_param* relu_param = (struct relu_param*)t_node->op.param_mem; + if (relu_param->negative_slope == 0.f) + reluTrue = true; + } + + if (op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" || op_name == "Slice" || poolTrue || reluTrue) + { + struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]]; + if (layer_scale[ir_tensor->name] != 0) + { + ir_tensor->scale = layer_scale[ir_tensor->name]; + ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; + + if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + recursion_pass_through(ir_graph, ir_tensor->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass); + } + } + } + else + { + ir_tensor->scale = layer_scale[ir_tensor->name]; + ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; + } + layer_pass[ir_tensor->name] = true; + } + } + } + } + + fprintf(stderr, "[Quant Tools Info]: Step 4, quantize activation tensor done.\n"); + + /* Set the params of acitvation ir_tensor */ + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* ir_tensor = ir_graph->tensor_list[i]; + if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + ir_tensor->data_type = TENGINE_DT_INT8; + ir_tensor->elem_size = sizeof(int8_t); + } + ir_tensor->quant_param_num = 1; + } + + /* Step 3 : set weight/bias quant scale value into ir_tensor, quant the weight params from Float32 to Int8 */ + FILE* fp_weight = fopen("scale_weight.txt", "wb"); + FILE* fp_bias = fopen("scale_bias.txt", "wb"); + for (int i = 0; i < ir_graph->node_num; i++) + { + struct node* noden = ir_graph->node_list[i]; + std::string op_name = get_op_name_from_type(noden->op.type); + + /* quantize the tensor data from fp32 to uint8 */ + if (op_name == "Convolution" || op_name == "FullyConnected" || op_name == "Deconvolution") + { + /* Step 3.1 : quant weight */ + struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]]; + + int channel_num = weight_tensor->dims[0]; + int cstep = int(weight_tensor->elem_num / channel_num); + float* weight_data = (float*)weight_tensor->data; + int8_t* i8_weight_data = (int8_t*)sys_malloc(weight_tensor->elem_num * sizeof(int8_t)); + + float* weight_scale_list = (float*)sys_malloc(channel_num * sizeof(float)); + int* weight_zp_list = (int*)sys_malloc(channel_num * sizeof(int)); + + fprintf(fp_weight, "%s ", weight_tensor->name); + /* calculate the quant scale value of weight perchannel, scale = abs(min, max) / 127 */ + if (internal) + { + // TODO + } + else + { + for (int ch = 0; ch < channel_num; ch++) + { + float* weight_data_ch_start = weight_data + ch * cstep; + float* weight_data_ch_end = weight_data + (ch + 1) * cstep; + float weight_max = *std::max_element(weight_data_ch_start, weight_data_ch_end); + float weight_min = *std::min_element(weight_data_ch_start, weight_data_ch_end); + + weight_scale_list[ch] = std::max(abs(weight_max), abs(weight_min)) / 127.f; + weight_zp_list[ch] = 0; + fprintf(fp_weight, "%8.8f ", weight_scale_list[ch]); + } + fprintf(fp_weight, "\n"); + } + // fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point); + + /* quantize the value of weight from Float32 to Int8, value_i8 = (value_fp32 / scale).round().clip(-127, 127) */ + for (int ch = 0; ch < channel_num; ch++) + { + for (int j = 0; j < cstep; j++) + { + if (weight_data[ch * cstep + j] == 0 || weight_scale_list[ch] == 0) + i8_weight_data[ch * cstep + j] = 0; + else + { + float int8_data = round(weight_data[ch * cstep + j] / weight_scale_list[ch]); + int8_data = int8_data > 127.f ? 127.f : int8_data; + int8_data = int8_data < -127.f ? -127.f : int8_data; + i8_weight_data[ch * cstep + j] = int8_t(int8_data); + } + } + } + + weight_tensor->scale_list = weight_scale_list; + weight_tensor->zp_list = weight_zp_list; + weight_tensor->data_type = TENGINE_DT_INT8; + weight_tensor->elem_size = sizeof(int8_t); // int8, signed char + weight_tensor->data = i8_weight_data; + weight_tensor->quant_param_num = channel_num; + + /* step 3.2 : quant bias */ + if (noden->input_num > 2) + { + struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]]; + struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]]; + + float* bias_scale_list = (float*)sys_malloc(bias_tensor->dims[0] * sizeof(float)); + int* bias_zp_list = (int*)sys_malloc(bias_tensor->dims[0] * sizeof(int32_t)); + + float* bias_data = (float*)bias_tensor->data; + int* int32_bias_data = (int*)sys_malloc(bias_tensor->elem_num * sizeof(int32_t)); + + int bstep = int(bias_tensor->elem_num / channel_num); + + fprintf(fp_bias, "%s ", bias_tensor->name); + + /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */ + for (int ch = 0; ch < channel_num; ch++) + { + bias_scale_list[ch] = weight_scale_list[ch] * input_tensor->scale; + bias_zp_list[ch] = 0; + + fprintf(fp_bias, "%8.8f ", bias_scale_list[ch]); + } + fprintf(fp_bias, "\n"); + + /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */ + for (int ch = 0; ch < channel_num; ch++) + { + for (int bi = 0; bi < bstep; bi++) + { + if (bias_data[ch * bstep + bi] == 0 || bias_scale_list[ch] == 0) + int32_bias_data[ch * bstep + bi] = 0; + else + int32_bias_data[ch * bstep + bi] = int(round(bias_data[ch * bstep + bi] / bias_scale_list[ch])); + } + } + + bias_tensor->scale_list = bias_scale_list; + bias_tensor->zp_list = bias_zp_list; + bias_tensor->data_type = TENGINE_DT_INT32; + bias_tensor->elem_size = sizeof(int32_t); // int32, signed int + bias_tensor->data = int32_bias_data; + bias_tensor->quant_param_num = channel_num; + + // fprintf(stderr, "bias %8.8f \t%s\n", bias_scale_list[0], bias_tensor->name); + } + // fprintf(stderr, "\n"); + } + /* quantize the tensor data from fp32 to fp16, for TIM-VX NPU IP */ + else if (op_name == "PReLU") + { + for (int j = 0; j < noden->input_num; j++) + { + struct tensor* in_tensor = ir_graph->tensor_list[noden->input_tensors[j]]; + if (in_tensor->tensor_type == TENSOR_TYPE_CONST) + { + float* fp32_data = (float*)in_tensor->data; + int data_elem = in_tensor->elem_num; + + __fp16* fp16_data = (__fp16*)sys_malloc(data_elem * sizeof(__fp16)); + + for (int k = 0; k < data_elem; k++) + { + fp16_data[k] = fp32_to_fp16(fp32_data[k]); + } + + in_tensor->data_type = TENGINE_DT_FP16; + in_tensor->data = fp16_data; + in_tensor->quant_param_num = 0; + } + } + } + else if (op_name == "Slice") + { + struct tensor* slice_input_tensor = get_ir_graph_tensor(ir_graph, noden->input_tensors[0]); + struct tensor* slice_output_tensor = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); + slice_output_tensor->scale = slice_input_tensor->scale; + slice_output_tensor->zero_point = slice_input_tensor->zero_point; + } + } + + fclose(fp_weight); + fclose(fp_bias); + + fprintf(stderr, "[Quant Tools Info]: Step 5, quantize weight tensor done.\n"); + + if (!save_graph(ir_graph, output_file.c_str())) + { + fprintf(stderr, "save graph failed.\n"); + return -1; + } + + fprintf(stderr, "[Quant Tools Info]: Step 6, save Int8 tmfile done, %s\n", output_file.c_str()); + + return 0; +} + +int save_graph_u8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal) +{ + fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again\n"); + + /* Step 1 : create graph, load tengine model xxx.tmfile */ + struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file); + if (nullptr == ir_graph) + { + fprintf(stderr, "Create graph failed.\n"); + return -1; + } + fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again done.\n"); + + std::tr1::unordered_map layer_scale; + std::tr1::unordered_map layer_zeropoint; + + fprintf(stderr, "[Quant Tools Info]: Step 3, load calibration table file %s.\n", scale_file); + /* Step 2 : set activation quant scale value into ir_tensor */ + if (nullptr != scale_file) + { + std::ifstream scales(scale_file); + std::string line; + while (std::getline(scales, line)) + { + std::string layer_name; + float scale_val = 0.f; + float zero_point = 0.f; + size_t last = 0; + size_t index = line.find_first_of(' ', last); + size_t idx = line.find_last_of(' ', line.size()); + layer_name = line.substr(last, index - last); + last = index + 1; + scale_val = atof((line.substr(last, line.size() - last)).c_str()); + zero_point = atof((line.substr(idx + 1, line.size())).c_str()); + + layer_scale[layer_name] = scale_val; + layer_zeropoint[layer_name] = zero_point; + + // fprintf(stderr, "[%s] \tscale final %8.4f, zero point %8.4f\n", layer_name.c_str(), scale_val, zero_point); + } + } + + std::tr1::unordered_map layer_used; + for (int i = 0; i < ir_graph->node_num; i++) + { + struct node* ir_node = ir_graph->node_list[i]; + for (int j = 0; j < ir_node->input_num; j++) + { + std::string layern = ir_graph->tensor_list[ir_node->input_tensors[j]]->name; + layer_used[layern]++; + } + } + + fprintf(stderr, "[Quant Tools Info]: Step 4, optimize the calibration table.\n"); + /* process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip .... */ + if (inplace == 0) + { + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* ir_tensor = ir_graph->tensor_list[i]; + if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + ir_tensor->scale = layer_scale[ir_tensor->name]; + ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; + } + } + } + else + { + std::tr1::unordered_map layer_pass; + for (int i = ir_graph->tensor_num - 1; i >= 0; i--) + { + struct tensor* ir_tensor = ir_graph->tensor_list[i]; + if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + if (layer_pass[ir_tensor->name] == false) + { + uint32_t ir_node_idx = ir_tensor->producer; + struct node* t_node = ir_graph->node_list[ir_node_idx]; + + std::string op_name = get_op_name_from_type(t_node->op.type); + + bool poolTrue = false; + bool reluTrue = false; + if (op_name == "Pooling") + { + struct pool_param* pool_param = (struct pool_param*)t_node->op.param_mem; + if (pool_param->pool_method == 0) + poolTrue = true; + } + else if (op_name == "ReLU") + { + struct relu_param* relu_param = (struct relu_param*)t_node->op.param_mem; + if (relu_param->negative_slope == 0.f) + reluTrue = true; + } + + if (op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" || op_name == "Slice" || poolTrue || reluTrue) + { + struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]]; + if (layer_scale[ir_tensor->name] != 0) + { + ir_tensor->scale = layer_scale[ir_tensor->name]; + ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; + + if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + recursion_pass_through(ir_graph, ir_tensor->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass); + } + } + } + else + { + ir_tensor->scale = layer_scale[ir_tensor->name]; + ir_tensor->zero_point = layer_zeropoint[ir_tensor->name]; + } + layer_pass[ir_tensor->name] = true; + } + } + } + } + + fprintf(stderr, "[Quant Tools Info]: Step 4, quantize activation tensor done.\n"); + + /* Set the params of acitvation ir_tensor */ + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* ir_tensor = ir_graph->tensor_list[i]; + if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + ir_tensor->data_type = TENGINE_DT_UINT8; + ir_tensor->elem_size = sizeof(uint8_t); + } + ir_tensor->quant_param_num = 1; + } + + /* Step 3 : set weight/bias quant scale value into ir_tensor, quant the weight params from Float32 to Int8 */ + FILE* fp_weight = fopen("scale_weight.txt", "wb"); + FILE* fp_bias = fopen("scale_bias.txt", "wb"); + for (int i = 0; i < ir_graph->node_num; i++) + { + struct node* noden = ir_graph->node_list[i]; + std::string op_name = get_op_name_from_type(noden->op.type); + + /* quantize the tensor data from fp32 to uint8 */ + if (op_name == "Convolution") + { + /* Step 3.1 : quant weight */ + struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]]; + + int channel_num = weight_tensor->dims[0]; + int cstep = int(weight_tensor->elem_num / channel_num); + float* weight_data = (float*)weight_tensor->data; + int8_t* i8_weight_data = (int8_t*)sys_malloc(weight_tensor->elem_num * sizeof(int8_t)); + + float* weight_scale_list = (float*)sys_malloc(channel_num * sizeof(float)); + int* weight_zp_list = (int*)sys_malloc(channel_num * sizeof(int)); + + fprintf(fp_weight, "%s ", weight_tensor->name); + /* calculate the quant scale value of weight perchannel, scale = abs(min, max) / 127 */ + if (internal) + { + // TODO + } + else + { + for (int ch = 0; ch < channel_num; ch++) + { + float* weight_data_ch_start = weight_data + ch * cstep; + float* weight_data_ch_end = weight_data + (ch + 1) * cstep; + float weight_max = *std::max_element(weight_data_ch_start, weight_data_ch_end); + float weight_min = *std::min_element(weight_data_ch_start, weight_data_ch_end); + + weight_scale_list[ch] = std::max(abs(weight_max), abs(weight_min)) / 127.f; + weight_zp_list[ch] = 0; + fprintf(fp_weight, "%8.8f ", weight_scale_list[ch]); + } + fprintf(fp_weight, "\n"); + } + // fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point); + + /* quantize the value of weight from Float32 to Int8, value_i8 = (value_fp32 / scale).round().clip(-127, 127) */ + for (int ch = 0; ch < channel_num; ch++) + { + for (int j = 0; j < cstep; j++) + { + if (weight_data[ch * cstep + j] == 0 || weight_scale_list[ch] == 0) + i8_weight_data[ch * cstep + j] = 0; + else + { + float int8_data = round(weight_data[ch * cstep + j] / weight_scale_list[ch]); + int8_data = int8_data > 127.f ? 127.f : int8_data; + int8_data = int8_data < -127.f ? -127.f : int8_data; + i8_weight_data[ch * cstep + j] = int8_t(int8_data); + } + } + } + + weight_tensor->scale_list = weight_scale_list; + weight_tensor->zp_list = weight_zp_list; + weight_tensor->data_type = TENGINE_DT_INT8; + weight_tensor->elem_size = sizeof(int8_t); // int8, signed char + weight_tensor->data = i8_weight_data; + weight_tensor->quant_param_num = channel_num; + + /* step 3.2 : quant bias */ + if (noden->input_num > 2) + { + struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]]; + struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]]; + + float* bias_scale_list = (float*)sys_malloc(bias_tensor->dims[0] * sizeof(float)); + int* bias_zp_list = (int*)sys_malloc(bias_tensor->dims[0] * sizeof(int32_t)); + + float* bias_data = (float*)bias_tensor->data; + int* int32_bias_data = (int*)sys_malloc(bias_tensor->elem_num * sizeof(int32_t)); + + int bstep = int(bias_tensor->elem_num / channel_num); + + fprintf(fp_bias, "%s ", bias_tensor->name); + + /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */ + for (int ch = 0; ch < channel_num; ch++) + { + bias_scale_list[ch] = weight_scale_list[ch] * input_tensor->scale; + bias_zp_list[ch] = 0; + + fprintf(fp_bias, "%8.8f ", bias_scale_list[ch]); + } + fprintf(fp_bias, "\n"); + + /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */ + for (int ch = 0; ch < channel_num; ch++) + { + for (int bi = 0; bi < bstep; bi++) + { + if (bias_data[ch * bstep + bi] == 0 || bias_scale_list[ch] == 0) + int32_bias_data[ch * bstep + bi] = 0; + else + int32_bias_data[ch * bstep + bi] = int(round(bias_data[ch * bstep + bi] / bias_scale_list[ch])); + } + } + + bias_tensor->scale_list = bias_scale_list; + bias_tensor->zp_list = bias_zp_list; + bias_tensor->data_type = TENGINE_DT_INT32; + bias_tensor->elem_size = sizeof(int32_t); // int32, signed int + bias_tensor->data = int32_bias_data; + bias_tensor->quant_param_num = channel_num; + + // fprintf(stderr, "bias %8.8f \t%s\n", bias_scale_list[0], bias_tensor->name); + } + // fprintf(stderr, "\n"); + } + else if (op_name == "FullyConnected" || op_name == "Deconvolution") + { + /* Step 3.1 : quant weight */ + struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]]; + + uint8_t* u8_weight_data = (uint8_t*)sys_malloc(weight_tensor->elem_num * sizeof(uint8_t)); + float* weight_data = (float*)weight_tensor->data; + + /* calculate the quant scale value of weight perchannel, scale = (min-max / 255) */ + float weight_max = 0; + float weight_min = 0; + float weight_scale = 0; + int weight_zero_point = 0; + + if (internal) + { + weight_scale = weight_tensor->scale; + weight_zero_point = weight_tensor->zero_point; + } + else + { + weight_max = *std::max_element(weight_data, weight_data + weight_tensor->elem_num); + weight_min = *std::min_element(weight_data, weight_data + weight_tensor->elem_num); + weight_scale = (weight_max - weight_min) / 255.f; + weight_zero_point = int(-weight_min / weight_scale); + } + // fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point); + + /* quantize the value of weight from Float32 to UInt8, value_u8 = (value_fp32 / scale).round().clip(0, 255) */ + for (int wi = 0; wi < weight_tensor->elem_num; wi++) + { + weight_data[wi] = roundf(weight_data[wi] / weight_scale + (float)weight_zero_point); + weight_data[wi] = weight_data[wi] > 255.f ? 255.f : weight_data[wi]; + weight_data[wi] = weight_data[wi] < 0.f ? 0.f : weight_data[wi]; + u8_weight_data[wi] = uint8_t(weight_data[wi]); + } + + weight_tensor->scale = weight_scale; + weight_tensor->zero_point = weight_zero_point; + weight_tensor->data_type = TENGINE_DT_UINT8; + weight_tensor->elem_size = sizeof(uint8_t); + weight_tensor->data = u8_weight_data; + + /* step 3.2 : quant bias */ + if (noden->input_num > 2) + { + struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]]; + struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]]; + + int* int32_bias_data = (int*)sys_malloc(bias_tensor->elem_num * bias_tensor->elem_size); + float* bias_data = (float*)bias_tensor->data; + + /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */ + float bias_scale = input_tensor->scale * weight_tensor->scale; + + /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */ + for (int bi = 0; bi < bias_tensor->elem_num; bi++) + { + if (bias_scale == 0) + int32_bias_data[bi] = 0; + else + { + bias_data[bi] = roundf(bias_data[bi] / bias_scale); + int32_bias_data[bi] = int(bias_data[bi]); + } + } + + bias_tensor->scale = bias_scale; + bias_tensor->data_type = TENGINE_DT_INT32; + bias_tensor->data = int32_bias_data; + + // fprintf(stderr, "[bias] scale final %8.4f\n", bias_scale); + } + } + /* quantize the tensor data from fp32 to fp16, for TIM-VX NPU IP */ + else if (op_name == "PReLU") + { + for (int j = 0; j < noden->input_num; j++) + { + struct tensor* in_tensor = ir_graph->tensor_list[noden->input_tensors[j]]; + if (in_tensor->tensor_type == TENSOR_TYPE_CONST) + { + float* fp32_data = (float*)in_tensor->data; + int data_elem = in_tensor->elem_num; + + __fp16* fp16_data = (__fp16*)sys_malloc(data_elem * sizeof(__fp16)); + + for (int k = 0; k < data_elem; k++) + { + fp16_data[k] = fp32_to_fp16(fp32_data[k]); + } + + in_tensor->data_type = TENGINE_DT_FP16; + in_tensor->data = fp16_data; + in_tensor->quant_param_num = 0; + } + } + } + else if (op_name == "Slice") + { + struct tensor* slice_input_tensor = get_ir_graph_tensor(ir_graph, noden->input_tensors[0]); + struct tensor* slice_output_tensor = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); + slice_output_tensor->scale = slice_input_tensor->scale; + slice_output_tensor->zero_point = slice_input_tensor->zero_point; + } + } + + fclose(fp_weight); + fclose(fp_bias); + + fprintf(stderr, "[Quant Tools Info]: Step 5, quantize weight tensor done.\n"); + + if (!save_graph(ir_graph, output_file.c_str())) + { + fprintf(stderr, "save graph failed.\n"); + return -1; + } + + fprintf(stderr, "[Quant Tools Info]: Step 6, save Int8 tmfile done, %s\n", output_file.c_str()); + + return 0; +} diff --git a/tools/quantize/quant_save_graph.hpp b/tools/quantize/quant_save_graph.hpp index e23f385df..ad6fed617 100644 --- a/tools/quantize/quant_save_graph.hpp +++ b/tools/quantize/quant_save_graph.hpp @@ -1,53 +1,53 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: hhchen@openailab.com - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "quant_utils.hpp" -#include "save_graph.hpp" - -#include "tengine/c_api.h" - -extern "C" { - #include "graph/graph.h" - #include "graph/subgraph.h" - #include "graph/node.h" - #include "graph/tensor.h" - #include "utility/sys_port.h" - #include "utility/utils.h" -} - -int save_graph_u8_perlayer(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal); - -int save_graph_i8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal); - -int save_graph_u8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal); +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "quant_utils.hpp" +#include "save_graph.hpp" + +#include "tengine/c_api.h" + +extern "C" { +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "utility/sys_port.h" +#include "utility/utils.h" +} + +int save_graph_u8_perlayer(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal); + +int save_graph_i8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal); + +int save_graph_u8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal); diff --git a/tools/quantize/quant_tool.hpp b/tools/quantize/quant_tool.hpp index b07ec275e..35213bfa2 100644 --- a/tools/quantize/quant_tool.hpp +++ b/tools/quantize/quant_tool.hpp @@ -22,13 +22,11 @@ * Author: hhchen@openailab.com */ - #include #include #include -extern "C" -{ +extern "C" { #include "tengine/c_api.h" #include "graph/graph.h" #include "graph/subgraph.h" @@ -41,7 +39,6 @@ extern "C" #define ALGORITHM_MIN_MAX 0 #define ALGORITHM_KL 1 - class QuantTool { public: @@ -49,26 +46,27 @@ class QuantTool ~QuantTool(); int activation_quant_tool(); -public: + +public: struct options opt; - std::string model_file; // path to input float32 tmfile - std::string scale_file; // path to calibration scale file - std::string output_file;// path to output int8/uint8 tmfile - std::string image_dir; // path to calibration images folder + std::string model_file; // path to input float32 tmfile + std::string scale_file; // path to calibration scale file + std::string output_file; // path to output int8/uint8 tmfile + std::string image_dir; // path to calibration images folder int num_thread; - + int img_c; int img_h; int img_w; - float mean[3]; // value of mean (mean value, default is 104.0,117.0,123.0) - float scale[3]; // value of normalize (scale value, default is 1.0,1.0,1.0) - int center_crop; // flag which indicates that center crop process image is necessary(0:OFF, 1:ON, default is 0) + float mean[3]; // value of mean (mean value, default is 104.0,117.0,123.0) + float scale[3]; // value of normalize (scale value, default is 1.0,1.0,1.0) + int center_crop; // flag which indicates that center crop process image is necessary(0:OFF, 1:ON, default is 0) int letterbox_rows; int letterbox_cols; - int sw_RGB; // flag which indicates that swap first and last channels in 3-channel image is necessary(0:OFF, 1:ON, default is 1) - int focus; // flag which indicates that focus process image is necessary(maybe using for YOLOv5, 0:OFF, 1:ON, default is 0) - int inplace; // process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip - int algorithm_type; // the type of quant algorithm(0:min-max, 1:kl, default is 0) + int sw_RGB; // flag which indicates that swap first and last channels in 3-channel image is necessary(0:OFF, 1:ON, default is 1) + int focus; // flag which indicates that focus process image is necessary(maybe using for YOLOv5, 0:OFF, 1:ON, default is 0) + int inplace; // process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip + int algorithm_type; // the type of quant algorithm(0:min-max, 1:kl, default is 0) }; diff --git a/tools/quantize/quant_tool_int8.cpp b/tools/quantize/quant_tool_int8.cpp index 70bd39b07..009c6dd31 100644 --- a/tools/quantize/quant_tool_int8.cpp +++ b/tools/quantize/quant_tool_int8.cpp @@ -22,14 +22,12 @@ * Author: hhchen@openailab.com */ - #include #include #include "quant_tool.hpp" #include "quant_save_graph.hpp" - QuantTool::QuantTool() { // initial tengine @@ -86,7 +84,7 @@ int QuantTool::activation_quant_tool() /* set the shape, data buffer of input_tensor of the graph */ int img_size = img_h * img_w * img_c; - int dims[] = {1, img_c, img_h, img_w}; // nchw + int dims[] = {1, img_c, img_h, img_w}; // nchw std::vector input_data(img_size); tensor_t input_tensor = get_graph_input_tensor(ir_graph, 0, 0); @@ -114,7 +112,7 @@ int QuantTool::activation_quant_tool() struct tensor* var_tensor = ir_graph->tensor_list[i]; if (var_tensor->tensor_type == TENSOR_TYPE_VAR) { - var_tensor->data = ( float* )malloc(sizeof(float)); + var_tensor->data = (float*)malloc(sizeof(float)); } } @@ -168,7 +166,7 @@ int QuantTool::activation_quant_tool() double total_time = 0.; for (int nums = 0; nums < img_num; nums++) { - fprintf(stderr, "\r[Quant Tools Info]: Step 1, images %.5d / %.5d", nums+1, img_num); + fprintf(stderr, "\r[Quant Tools Info]: Step 1, images %.5d / %.5d", nums + 1, img_num); get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); /* run graph */ @@ -191,8 +189,8 @@ int QuantTool::activation_quant_tool() struct tensor* act_tensor = ir_graph->tensor_list[i]; if (act_tensor->tensor_type == TENSOR_TYPE_VAR || act_tensor->tensor_type == TENSOR_TYPE_INPUT) { - float* start_addr = ( float* )act_tensor->data; - float* end_addr = ( float* )act_tensor->data + act_tensor->elem_num; + float* start_addr = (float*)act_tensor->data; + float* end_addr = (float*)act_tensor->data + act_tensor->elem_num; max_activation[i] = std::max(max_activation[i], *std::max_element(start_addr, end_addr)); min_activation[i] = std::min(min_activation[i], *std::min_element(start_addr, end_addr)); } @@ -231,14 +229,14 @@ int QuantTool::activation_quant_tool() } } - fprintf(fp_minmax,"%s %f %d\n",ir_graph->tensor_list[i]->name, act_scale, act_zero_point); + fprintf(fp_minmax, "%s %f %d\n", ir_graph->tensor_list[i]->name, act_scale, act_zero_point); } } fclose(fp_minmax); fprintf(stderr, "\r\n[Quant Tools Info]: Step 1, find original calibration table done, output ./table_minmax.scale\n"); if (this->algorithm_type == ALGORITHM_KL) - { + { /* todo support */ } @@ -285,58 +283,58 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - quant_tool.model_file = optarg; - break; - case 'a': - quant_tool.algorithm_type = atoi(optarg); - break; - case 'f': - quant_tool.scale_file = optarg; - break; - case 'o': - quant_tool.output_file = optarg; - break; - case 'i': - quant_tool.image_dir = optarg; - break; - case 'g': - float img_chw[3]; - split(img_chw, optarg, ","); - quant_tool.img_c = (int)img_chw[0]; - quant_tool.img_h = (int)img_chw[1]; - quant_tool.img_w = (int)img_chw[2]; - break; - case 'w': - split(quant_tool.mean, optarg, ","); - break; - case 's': - split(quant_tool.scale, optarg, ","); - break; - case 'b': - quant_tool.sw_RGB = atoi(optarg); - break; - case 'c': - quant_tool.center_crop = atoi(optarg); - break; - case 'y': - float letterboxs[2]; - split(letterboxs, optarg, ","); - quant_tool.letterbox_rows = (int)letterboxs[0]; - quant_tool.letterbox_cols = (int)letterboxs[1]; - break; - case 'k': - quant_tool.focus = atoi(optarg); - break; - case 't': - quant_tool.num_thread = atoi(optarg); - quant_tool.opt.num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + quant_tool.model_file = optarg; + break; + case 'a': + quant_tool.algorithm_type = atoi(optarg); + break; + case 'f': + quant_tool.scale_file = optarg; + break; + case 'o': + quant_tool.output_file = optarg; + break; + case 'i': + quant_tool.image_dir = optarg; + break; + case 'g': + float img_chw[3]; + split(img_chw, optarg, ","); + quant_tool.img_c = (int)img_chw[0]; + quant_tool.img_h = (int)img_chw[1]; + quant_tool.img_w = (int)img_chw[2]; + break; + case 'w': + split(quant_tool.mean, optarg, ","); + break; + case 's': + split(quant_tool.scale, optarg, ","); + break; + case 'b': + quant_tool.sw_RGB = atoi(optarg); + break; + case 'c': + quant_tool.center_crop = atoi(optarg); + break; + case 'y': + float letterboxs[2]; + split(letterboxs, optarg, ","); + quant_tool.letterbox_rows = (int)letterboxs[0]; + quant_tool.letterbox_cols = (int)letterboxs[1]; + break; + case 'k': + quant_tool.focus = atoi(optarg); + break; + case 't': + quant_tool.num_thread = atoi(optarg); + quant_tool.opt.num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -348,21 +346,21 @@ int main(int argc, char* argv[]) /* check input params */ if (quant_tool.model_file.empty()) { - fprintf(stderr,"[Quant Tools Info]: The input file of Float32 tmfile file not specified!\n"); + fprintf(stderr, "[Quant Tools Info]: The input file of Float32 tmfile file not specified!\n"); show_usage(); return -1; } if (quant_tool.image_dir.empty()) { - fprintf(stderr,"[Quant Tools Info]: The input dir of Calibration image not specified!\n"); + fprintf(stderr, "[Quant Tools Info]: The input dir of Calibration image not specified!\n"); show_usage(); return -1; } if (quant_tool.output_file.empty()) { - fprintf(stderr,"[Quant Tools Info]: The output file of Int8 tmfile not specified!\n"); + fprintf(stderr, "[Quant Tools Info]: The output file of Int8 tmfile not specified!\n"); show_usage(); return -1; } @@ -371,15 +369,15 @@ int main(int argc, char* argv[]) fprintf(stderr, "Input model : %s\n", quant_tool.model_file.c_str()); fprintf(stderr, "Output model: %s\n", quant_tool.output_file.c_str()); fprintf(stderr, "Calib images: %s\n", quant_tool.image_dir.c_str()); - fprintf(stderr, "Scale file : %s\n", quant_tool.scale_file.empty()?"NULL":quant_tool.scale_file.c_str()); - fprintf(stderr, "Algorithm : %s\n", quant_tool.algorithm_type?"KL":"MIN MAX"); + fprintf(stderr, "Scale file : %s\n", quant_tool.scale_file.empty() ? "NULL" : quant_tool.scale_file.c_str()); + fprintf(stderr, "Algorithm : %s\n", quant_tool.algorithm_type ? "KL" : "MIN MAX"); fprintf(stderr, "Dims : %d %d %d\n", quant_tool.img_c, quant_tool.img_h, quant_tool.img_w); fprintf(stderr, "Mean : %.3f %.3f %.3f\n", quant_tool.mean[0], quant_tool.mean[1], quant_tool.mean[2]); fprintf(stderr, "Scale : %.3f %.3f %.3f\n", quant_tool.scale[0], quant_tool.scale[1], quant_tool.scale[2]); - fprintf(stderr, "BGR2RGB : %s\n", quant_tool.sw_RGB?"ON":"OFF"); - fprintf(stderr, "Center crop : %s\n", quant_tool.center_crop?"ON":"OFF"); + fprintf(stderr, "BGR2RGB : %s\n", quant_tool.sw_RGB ? "ON" : "OFF"); + fprintf(stderr, "Center crop : %s\n", quant_tool.center_crop ? "ON" : "OFF"); fprintf(stderr, "Letter box : %d %d\n", quant_tool.letterbox_rows, quant_tool.letterbox_cols); - fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus?"ON":"OFF"); + fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus ? "ON" : "OFF"); fprintf(stderr, "Thread num : %d\n\n", quant_tool.num_thread); /* using 3rd calibration table file */ @@ -387,19 +385,19 @@ int main(int argc, char* argv[]) { /* quantize activation */ quant_tool.activation_quant_tool(); - + /* select algorithm */ if (quant_tool.algorithm_type == ALGORITHM_MIN_MAX) - quant_tool.scale_file = "table_minmax.scale"; + quant_tool.scale_file = "table_minmax.scale"; else { - fprintf(stderr,"[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n"); + fprintf(stderr, "[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n"); quant_tool.scale_file = "table_minmax.scale"; } } /* quantize weight/bias and save into int8 tmfile */ - fprintf(stderr,"[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str()); + fprintf(stderr, "[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str()); save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); fprintf(stderr, "\n---- Tengine Int8 tmfile create success, best wish for your INT8 inference has a low accuracy loss...\\(^0^)/ ----\n"); diff --git a/tools/quantize/quant_tool_uint8.cpp b/tools/quantize/quant_tool_uint8.cpp index 733086283..660981e1f 100644 --- a/tools/quantize/quant_tool_uint8.cpp +++ b/tools/quantize/quant_tool_uint8.cpp @@ -22,14 +22,12 @@ * Author: hhchen@openailab.com */ - #include #include #include "quant_tool.hpp" #include "quant_save_graph.hpp" - QuantTool::QuantTool() { // initial tengine @@ -86,7 +84,7 @@ int QuantTool::activation_quant_tool() /* set the shape, data buffer of input_tensor of the graph */ int img_size = img_h * img_w * img_c; - int dims[] = {1, img_c, img_h, img_w}; // nchw + int dims[] = {1, img_c, img_h, img_w}; // nchw std::vector input_data(img_size); tensor_t input_tensor = get_graph_input_tensor(ir_graph, 0, 0); @@ -114,7 +112,7 @@ int QuantTool::activation_quant_tool() struct tensor* var_tensor = ir_graph->tensor_list[i]; if (var_tensor->tensor_type == TENSOR_TYPE_VAR) { - var_tensor->data = ( float* )malloc(sizeof(float)); + var_tensor->data = (float*)malloc(sizeof(float)); } } @@ -168,7 +166,7 @@ int QuantTool::activation_quant_tool() double total_time = 0.; for (int nums = 0; nums < img_num; nums++) { - fprintf(stderr, "\r[Quant Tools Info]: Step 1, images %.5d / %.5d", nums+1, img_num); + fprintf(stderr, "\r[Quant Tools Info]: Step 1, images %.5d / %.5d", nums + 1, img_num); get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); /* run graph */ @@ -191,8 +189,8 @@ int QuantTool::activation_quant_tool() struct tensor* act_tensor = ir_graph->tensor_list[i]; if (act_tensor->tensor_type == TENSOR_TYPE_VAR || act_tensor->tensor_type == TENSOR_TYPE_INPUT) { - float* start_addr = ( float* )act_tensor->data; - float* end_addr = ( float* )act_tensor->data + act_tensor->elem_num; + float* start_addr = (float*)act_tensor->data; + float* end_addr = (float*)act_tensor->data + act_tensor->elem_num; max_activation[i] = std::max(max_activation[i], *std::max_element(start_addr, end_addr)); min_activation[i] = std::min(min_activation[i], *std::min_element(start_addr, end_addr)); } @@ -216,14 +214,14 @@ int QuantTool::activation_quant_tool() else if (min_activation[i] > 0) { act_scale = (max_activation[i] - 0) / 255; - act_zero_point = 0; + act_zero_point = 0; } else { act_scale = (max_activation[i] - min_activation[i]) / 255; act_zero_point = int(-min_activation[i] / act_scale); } - + if (act_scale == 0) act_zero_point = 0; @@ -260,13 +258,13 @@ int QuantTool::activation_quant_tool() fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table.\n"); std::tr1::unordered_map tensor_hist; std::tr1::unordered_map hist_tensor; - std::vector> hist_edge; - std::vector> hist_gram; + std::vector > hist_edge; + std::vector > hist_gram; /* second loop, create histgram */ - for (int nums = imgs_list.size()-1; nums >= 0; nums--) + for (int nums = imgs_list.size() - 1; nums >= 0; nums--) { - fprintf(stderr, "\r[Quant Tools Info]: Step 2, images %.5d / %.5d", nums+1, img_num); + fprintf(stderr, "\r[Quant Tools Info]: Step 2, images %.5d / %.5d", nums + 1, img_num); get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); @@ -296,12 +294,12 @@ int QuantTool::activation_quant_tool() every_edge.push_back(edge_float); } hist_edge.push_back(every_edge); - hist_gram.push_back(histCount(( float* )ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i])); + hist_gram.push_back(histCount((float*)ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i])); } else { std::vector hist_tmp; - hist_tmp = histCount(( float* )ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i]); + hist_tmp = histCount((float*)ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i]); for (int j = 0; j < 2048; j++) { hist_gram[inum][j] += hist_tmp[j]; @@ -322,15 +320,15 @@ int QuantTool::activation_quant_tool() for (int i = 0; i < act_tensor_num; i++) { int threshold_bin = threshold_distribution(hist_gram[i], 256); - // fprintf(stderr, " threshold_bin %d \n", threshold_bin); + // fprintf(stderr, " threshold_bin %d \n", threshold_bin); std::vector hist_gram_F(threshold_bin + 1); - for (int j = 0; j < threshold_bin+1; j++) + for (int j = 0; j < threshold_bin + 1; j++) { hist_gram_F[j] = hist_gram[i][threshold_bin - j]; } int threshold_bin_F = threshold_distribution(hist_gram_F, 256); - int threshold_bin_min = threshold_bin - threshold_bin_F + 1; + int threshold_bin_min = threshold_bin - threshold_bin_F + 1; // fprintf(stderr, "### %s : %d %f %f & %f %f\n",ir_graph->tensor_list[hist_tensor[i]]->name, threshold_bin, min_activation[hist_tensor[i]],\ // hist_edge[i][threshold_bin_min], hist_edge[i][threshold_bin], max_activation[hist_tensor[i]]); @@ -383,7 +381,7 @@ int QuantTool::activation_quant_tool() fprintf(fp_kl, "%s %f %d\n", ir_graph->tensor_list[hist_tensor[i]]->name, act_scale, act_zero_point); } fclose(fp_kl); - fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table done, output ./table_kl.scale\n"); + fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table done, output ./table_kl.scale\n"); } fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / img_num); @@ -429,58 +427,58 @@ int main(int argc, char* argv[]) { switch (res) { - case 'm': - quant_tool.model_file = optarg; - break; - case 'a': - quant_tool.algorithm_type = atoi(optarg); - break; - case 'f': - quant_tool.scale_file = optarg; - break; - case 'o': - quant_tool.output_file = optarg; - break; - case 'i': - quant_tool.image_dir = optarg; - break; - case 'g': - float img_chw[3]; - split(img_chw, optarg, ","); - quant_tool.img_c = (int)img_chw[0]; - quant_tool.img_h = (int)img_chw[1]; - quant_tool.img_w = (int)img_chw[2]; - break; - case 'w': - split(quant_tool.mean, optarg, ","); - break; - case 's': - split(quant_tool.scale, optarg, ","); - break; - case 'b': - quant_tool.sw_RGB = atoi(optarg); - break; - case 'c': - quant_tool.center_crop = atoi(optarg); - break; - case 'y': - float letterboxs[2]; - split(letterboxs, optarg, ","); - quant_tool.letterbox_rows = (int)letterboxs[0]; - quant_tool.letterbox_cols = (int)letterboxs[1]; - break; - case 'k': - quant_tool.focus = atoi(optarg); - break; - case 't': - quant_tool.num_thread = atoi(optarg); - quant_tool.opt.num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; + case 'm': + quant_tool.model_file = optarg; + break; + case 'a': + quant_tool.algorithm_type = atoi(optarg); + break; + case 'f': + quant_tool.scale_file = optarg; + break; + case 'o': + quant_tool.output_file = optarg; + break; + case 'i': + quant_tool.image_dir = optarg; + break; + case 'g': + float img_chw[3]; + split(img_chw, optarg, ","); + quant_tool.img_c = (int)img_chw[0]; + quant_tool.img_h = (int)img_chw[1]; + quant_tool.img_w = (int)img_chw[2]; + break; + case 'w': + split(quant_tool.mean, optarg, ","); + break; + case 's': + split(quant_tool.scale, optarg, ","); + break; + case 'b': + quant_tool.sw_RGB = atoi(optarg); + break; + case 'c': + quant_tool.center_crop = atoi(optarg); + break; + case 'y': + float letterboxs[2]; + split(letterboxs, optarg, ","); + quant_tool.letterbox_rows = (int)letterboxs[0]; + quant_tool.letterbox_cols = (int)letterboxs[1]; + break; + case 'k': + quant_tool.focus = atoi(optarg); + break; + case 't': + quant_tool.num_thread = atoi(optarg); + quant_tool.opt.num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; } } @@ -492,21 +490,21 @@ int main(int argc, char* argv[]) /* check input params */ if (quant_tool.model_file.empty()) { - fprintf(stderr,"[Quant Tools Info]: The input file of Float32 tmfile file not specified!\n"); + fprintf(stderr, "[Quant Tools Info]: The input file of Float32 tmfile file not specified!\n"); show_usage(); return -1; } if (quant_tool.image_dir.empty()) { - fprintf(stderr,"[Quant Tools Info]: The input dir of Calibration image not specified!\n"); + fprintf(stderr, "[Quant Tools Info]: The input dir of Calibration image not specified!\n"); show_usage(); return -1; } if (quant_tool.output_file.empty()) { - fprintf(stderr,"[Quant Tools Info]: The output file of Int8 tmfile not specified!\n"); + fprintf(stderr, "[Quant Tools Info]: The output file of Int8 tmfile not specified!\n"); show_usage(); return -1; } @@ -515,15 +513,15 @@ int main(int argc, char* argv[]) fprintf(stderr, "Input model : %s\n", quant_tool.model_file.c_str()); fprintf(stderr, "Output model: %s\n", quant_tool.output_file.c_str()); fprintf(stderr, "Calib images: %s\n", quant_tool.image_dir.c_str()); - fprintf(stderr, "Scale file : %s\n", quant_tool.scale_file.empty()?"NULL":quant_tool.scale_file.c_str()); - fprintf(stderr, "Algorithm : %s\n", quant_tool.algorithm_type?"KL":"MIN MAX"); + fprintf(stderr, "Scale file : %s\n", quant_tool.scale_file.empty() ? "NULL" : quant_tool.scale_file.c_str()); + fprintf(stderr, "Algorithm : %s\n", quant_tool.algorithm_type ? "KL" : "MIN MAX"); fprintf(stderr, "Dims : %d %d %d\n", quant_tool.img_c, quant_tool.img_h, quant_tool.img_w); fprintf(stderr, "Mean : %.3f %.3f %.3f\n", quant_tool.mean[0], quant_tool.mean[1], quant_tool.mean[2]); fprintf(stderr, "Scale : %.3f %.3f %.3f\n", quant_tool.scale[0], quant_tool.scale[1], quant_tool.scale[2]); - fprintf(stderr, "BGR2RGB : %s\n", quant_tool.sw_RGB?"ON":"OFF"); - fprintf(stderr, "Center crop : %s\n", quant_tool.center_crop?"ON":"OFF"); + fprintf(stderr, "BGR2RGB : %s\n", quant_tool.sw_RGB ? "ON" : "OFF"); + fprintf(stderr, "Center crop : %s\n", quant_tool.center_crop ? "ON" : "OFF"); fprintf(stderr, "Letter box : %d %d\n", quant_tool.letterbox_rows, quant_tool.letterbox_cols); - fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus?"ON":"OFF"); + fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus ? "ON" : "OFF"); fprintf(stderr, "Thread num : %d\n\n", quant_tool.num_thread); /* using 3rd calibration table file */ @@ -531,21 +529,21 @@ int main(int argc, char* argv[]) { /* quantize activation */ quant_tool.activation_quant_tool(); - + /* select algorithm */ if (quant_tool.algorithm_type == ALGORITHM_MIN_MAX) quant_tool.scale_file = "table_minmax.scale"; - else if (quant_tool.algorithm_type == ALGORITHM_KL) - quant_tool.scale_file = "table_kl.scale"; + else if (quant_tool.algorithm_type == ALGORITHM_KL) + quant_tool.scale_file = "table_kl.scale"; else { - fprintf(stderr,"[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n"); + fprintf(stderr, "[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n"); quant_tool.scale_file = "table_minmax.scale"; } } /* quantize weight/bias and save into uint8 tmfile */ - fprintf(stderr,"[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str()); + fprintf(stderr, "[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str()); save_graph_u8_perlayer(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); fprintf(stderr, "\n---- Tengine Int8 tmfile create success, best wish for your UInt8 inference has a low accuracy loss...\\(^0^)/ ----\n"); diff --git a/tools/quantize/quant_tool_uint8_perchannel.cpp b/tools/quantize/quant_tool_uint8_perchannel.cpp index 14ece0fb9..a944cad9c 100644 --- a/tools/quantize/quant_tool_uint8_perchannel.cpp +++ b/tools/quantize/quant_tool_uint8_perchannel.cpp @@ -1,555 +1,552 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: hhchen@openailab.com - */ - - -#include -#include - -#include "quant_tool.hpp" -#include "quant_save_graph.hpp" - - -QuantTool::QuantTool() -{ - // initial tengine - if (init_tengine() != 0) - { - fprintf(stderr, "Initial tengine failed.\n"); - } - - // system variable - this->opt.num_thread = 4; - this->opt.cluster = TENGINE_CLUSTER_ALL; - this->opt.precision = TENGINE_MODE_FP32; - this->opt.affinity = 0; - this->num_thread = 4; - - // input variable - this->sw_RGB = 1; - this->img_c = 3; - this->img_h = 224; - this->img_w = 224; - this->mean[0] = 104.f; - this->mean[1] = 117.f; - this->mean[2] = 123.f; - this->scale[0] = 1.f; - this->scale[1] = 1.f; - this->scale[2] = 1.f; - this->center_crop = 0; - this->letterbox_rows = 0; - this->letterbox_cols = 0; - this->focus = 0; - this->inplace = true; - this->algorithm_type = ALGORITHM_MIN_MAX; -} - -QuantTool::~QuantTool() -{ - /* release tengine */ - release_tengine(); -} - -int QuantTool::activation_quant_tool() -{ - fprintf(stderr, "[Quant Tools Info]: Step 0, load FP32 tmfile.\n"); - - /* create graph, load tengine model xxx.tmfile */ - struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file.c_str()); - if (nullptr == ir_graph) - { - fprintf(stderr, "Create graph failed.\n"); - return -1; - } - - fprintf(stderr, "[Quant Tools Info]: Step 0, load FP32 tmfile done.\n"); - - /* set the shape, data buffer of input_tensor of the graph */ - int img_size = img_h * img_w * img_c; - int dims[] = {1, img_c, img_h, img_w}; // nchw - std::vector input_data(img_size); - - tensor_t input_tensor = get_graph_input_tensor(ir_graph, 0, 0); - if (input_tensor == nullptr) - { - fprintf(stderr, "Get input tensor failed\n"); - return -1; - } - - if (set_tensor_shape(input_tensor, dims, 4) < 0) - { - fprintf(stderr, "Set input tensor shape failed\n"); - return -1; - } - - if (set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float)) < 0) - { - fprintf(stderr, "Set input tensor buffer failed\n"); - return -1; - } - - /* initial malloc the output tesnors date buffer of nodes in the graph, to disable the mem pool, before prerun */ - for (int i = 0; i < ir_graph->tensor_num; i++) - { - struct tensor* var_tensor = ir_graph->tensor_list[i]; - if (var_tensor->tensor_type == TENSOR_TYPE_VAR) - { - var_tensor->data = ( float* )malloc(sizeof(float)); - } - } - - /* prerun graph, set work options(num_thread, cluster, precision) */ - if (prerun_graph_multithread(ir_graph, this->opt) < 0) - { - fprintf(stderr, "Prerun multithread graph failed.\n"); - return -1; - } - - fprintf(stderr, "[Quant Tools Info]: Step 0, load calibration image files.\n"); - - /* really malloc the output tesnors date buffer of nodes in the graph */ - for (int i = 0; i < ir_graph->tensor_num; i++) - { - struct tensor* var_tensor = ir_graph->tensor_list[i]; - if (var_tensor->tensor_type == TENSOR_TYPE_VAR) - { - var_tensor->data = realloc(var_tensor->data, sizeof(float) * var_tensor->elem_num); - memset(var_tensor->data, 0, sizeof(float) * var_tensor->elem_num); - } - } - - /* read image list */ - std::vector imgs_list; - readFileList(image_dir, imgs_list); - uint32_t img_num = imgs_list.size(); - - fprintf(stderr, "[Quant Tools Info]: Step 0, load calibration image files done, image num is %d.\n", img_num); - - /* init minmax */ - std::unordered_map max_activation; - std::unordered_map min_activation; - uint32_t act_tensor_num = 0; - for (int i = 0; i < ir_graph->tensor_num; i++) - { - struct tensor* act_tensor = ir_graph->tensor_list[i]; - if (act_tensor->tensor_type == TENSOR_TYPE_VAR || act_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - act_tensor_num++; - max_activation[i] = -FLT_MAX; - min_activation[i] = FLT_MAX; - } - } - - fprintf(stderr, "[Quant Tools Info]: Step 1, find original calibration table.\n"); - - /* first loop, find the min/max value of every activation tensor of the graph */ - double min_time = DBL_MAX; - double max_time = DBL_MIN; - double total_time = 0.; - for (int nums = 0; nums < img_num; nums++) - { - fprintf(stderr, "\r[Quant Tools Info]: Step 1, images %.5d / %.5d", nums+1, img_num); - get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); - - /* run graph */ - double start = get_current_time(); - if (run_graph(ir_graph, 1) < 0) - { - fprintf(stderr, "Run graph failed\n"); - return -1; - } - - double end = get_current_time(); - double cur = end - start; - total_time += cur; - min_time = std::min(min_time, cur); - max_time = std::max(max_time, cur); - - /* get the min/max value of activation tensor */ - for (int i = 0; i < ir_graph->tensor_num; i++) - { - struct tensor* act_tensor = ir_graph->tensor_list[i]; - if (act_tensor->tensor_type == TENSOR_TYPE_VAR || act_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - float* start_addr = ( float* )act_tensor->data; - float* end_addr = ( float* )act_tensor->data + act_tensor->elem_num; - max_activation[i] = std::max(max_activation[i], *std::max_element(start_addr, end_addr)); - min_activation[i] = std::min(min_activation[i], *std::min_element(start_addr, end_addr)); - } - } - } - - /* save the calibration file with min-max algorithm */ - FILE* fp_minmax = fopen("table_minmax.scale", "wb"); - for (int i = 0; i < ir_graph->tensor_num; i++) - { - struct tensor* t = ir_graph->tensor_list[i]; - if (t->tensor_type == TENSOR_TYPE_VAR || t->tensor_type == TENSOR_TYPE_INPUT) - { - float act_scale; - int act_zero_point; - if (max_activation[i] < 0) - { - act_scale = (0 - min_activation[i]) / 255; - act_zero_point = int(-min_activation[i] / act_scale); - } - else if (min_activation[i] > 0) - { - act_scale = (max_activation[i] - 0) / 255; - act_zero_point = 0; - } - else - { - act_scale = (max_activation[i] - min_activation[i]) / 255; - act_zero_point = int(-min_activation[i] / act_scale); - } - - if (act_scale == 0) - act_zero_point = 0; - - /* the scale of softmax always is scale = 1 / 127.f */ - for (int j = 0; j < ir_graph->node_num; j++) - { - struct node* noden = ir_graph->node_list[j]; - struct tensor* tensor_tmp = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); - - if (!(tensor_tmp->tensor_type == TENSOR_TYPE_INPUT || tensor_tmp->tensor_type == TENSOR_TYPE_VAR)) - continue; - - std::string tmp_op_name = get_op_name_from_type(noden->op.type); - std::string cur_name = t->name; - std::string tmp_name = tensor_tmp->name; - - if ((cur_name == tmp_name) && tmp_op_name == "Softmax") - { - act_scale = 1 / 255.f; - act_zero_point = 0; - break; - } - } - - fprintf(fp_minmax, "%s %f %d\n", ir_graph->tensor_list[i]->name, act_scale, act_zero_point); - } - } - fclose(fp_minmax); - fprintf(stderr, "\r\n[Quant Tools Info]: Step 1, find original calibration table done, output ./table_minmax.scale\n"); - - if (this->algorithm_type == ALGORITHM_KL) - { - /* kl process divergence */ - fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table.\n"); - std::tr1::unordered_map tensor_hist; - std::tr1::unordered_map hist_tensor; - std::vector> hist_edge; - std::vector> hist_gram; - - /* second loop, create histgram */ - for (int nums = imgs_list.size()-1; nums >= 0; nums--) - { - fprintf(stderr, "\r[Quant Tools Info]: Step 2, images %.5d / %.5d", nums+1, img_num); - - get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); - - /* run graph */ - if (run_graph(ir_graph, 1) < 0) - { - fprintf(stderr, "Run graph failed\n"); - return -1; - } - - /* calculate hist */ - uint32_t inum = 0; - for (int i = 0; i < ir_graph->tensor_num; i++) - { - struct tensor* ir_tensor = ir_graph->tensor_list[i]; - if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) - { - float step_max = max_activation[i] - min_activation[i]; - float step_bin = step_max / 2048.0f; - - std::vector every_edge; - if (nums == imgs_list.size() - 1) - { - for (int j = 0; j < 2048; j++) - { - float edge_float = (step_bin * (j + 0.5f)) + min_activation[i]; - every_edge.push_back(edge_float); - } - hist_edge.push_back(every_edge); - hist_gram.push_back(histCount(( float* )ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i])); - } - else - { - std::vector hist_tmp; - hist_tmp = histCount(( float* )ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i]); - for (int j = 0; j < 2048; j++) - { - hist_gram[inum][j] += hist_tmp[j]; - } - } - - tensor_hist[i] = inum; - hist_tensor[inum] = i; - inum++; - } - } - } - - fprintf(stderr, "\n"); - - /* save the calibration file with min-max algorithm with kl divergence */ - FILE* fp_kl = fopen("table_kl.scale", "wb"); - for (int i = 0; i < act_tensor_num; i++) - { - int threshold_bin = threshold_distribution(hist_gram[i], 256); - // fprintf(stderr, " threshold_bin %d \n", threshold_bin); - - std::vector hist_gram_F(threshold_bin + 1); - for (int j = 0; j < threshold_bin+1; j++) - { - hist_gram_F[j] = hist_gram[i][threshold_bin - j]; - } - int threshold_bin_F = threshold_distribution(hist_gram_F, 256); - int threshold_bin_min = threshold_bin - threshold_bin_F + 1; - - // fprintf(stderr, "### %s : %d %f %f & %f %f\n",ir_graph->tensor_list[hist_tensor[i]]->name, threshold_bin, min_activation[hist_tensor[i]],\ - // hist_edge[i][threshold_bin_min], hist_edge[i][threshold_bin], max_activation[hist_tensor[i]]); - - float kl_min = hist_edge[i][threshold_bin_min]; - float kl_max = hist_edge[i][threshold_bin]; - - float act_scale = 1.0f; - int act_zero_point = 0; - if (kl_max < 0) - { - act_scale = (0 - kl_min) / 255.f; - act_zero_point = int(-kl_min / act_scale); - } - else if (kl_min > 0) - { - act_scale = (kl_max - 0) / 255.f; - act_zero_point = 0; - } - else - { - act_scale = (kl_max - kl_min) / 255.f; - act_zero_point = int(-kl_min / act_scale); - } - - if (act_scale == 0) - act_zero_point = 0; - - /* the scale of softmax always is scale = 1 / 255.f */ - for (int j = 0; j < ir_graph->node_num; j++) - { - struct node* ir_node = ir_graph->node_list[j]; - struct tensor* ir_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - - if (!(ir_tensor->tensor_type == TENSOR_TYPE_INPUT || ir_tensor->tensor_type == TENSOR_TYPE_VAR)) - continue; - - std::string tmp_op_name = get_op_name_from_type(ir_node->op.type); - std::string cur_name = ir_graph->tensor_list[hist_tensor[i]]->name; - std::string tmp_name = ir_tensor->name; - - if ((cur_name == tmp_name) && tmp_op_name == "Softmax") - { - act_scale = 1 / 255.f; - act_zero_point = 0; - break; - } - } - - fprintf(fp_kl, "%s %f %d\n", ir_graph->tensor_list[hist_tensor[i]]->name, act_scale, act_zero_point); - } - fclose(fp_kl); - fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table done, output ./table_kl.scale\n"); - } - - fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / img_num); - - /* release tengine */ - postrun_graph(ir_graph); - destroy_graph(ir_graph); - - return 0; -} - -const char* help_params = "[Quant Tools Info]: optional arguments:\n" - "\t-h help show this help message and exit\n" - "\t-m input model path to input float32 tmfile\n" - "\t-i image dir path to calibration images folder\n" - "\t-f scale file path to calibration scale file\n" - "\t-o output model path to output uint8 tmfile\n" - "\t-a algorithm the type of quant algorithm(0:min-max, 1:kl, default is 0)\n" - "\t-g size the size of input image(using the resize the original image,default is 3,224,224)\n" - "\t-w mean value of mean (mean value, default is 104.0,117.0,123.0)\n" - "\t-s scale value of normalize (scale value, default is 1.0,1.0,1.0)\n" - "\t-b swapRB flag which indicates that swap first and last channels in 3-channel image is necessary(0:OFF, 1:ON, default is 1)\n" - "\t-c center crop flag which indicates that center crop process image is necessary(0:OFF, 1:ON, default is 0)\n" - "\t-y letter box the size of letter box process image is necessary([rows, cols], default is [0, 0])\n" - "\t-k focus flag which indicates that focus process image is necessary(maybe using for YOLOv5, 0:OFF, 1:ON, default is 0)\n" - "\t-t num thread count of processing threads(default is 1)\n"; - -const char* example_params = "[Quant Tools Info]: example arguments:\n" - "\t./quant_tool_uint8 -m ./mobilenet_fp32.tmfile -i ./dataset -o ./mobilenet_uint8.tmfile -g 3,224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017\n"; - -void show_usage() -{ - fprintf(stderr, "%s\n", help_params); - fprintf(stderr, "%s\n", example_params); -} - -int main(int argc, char* argv[]) -{ - QuantTool quant_tool; - - int res; - while ((res = getopt(argc, argv, "m:a:f:o:i:g:s:w:b:c:y:k:t:h")) != -1) - { - switch (res) - { - case 'm': - quant_tool.model_file = optarg; - break; - case 'a': - quant_tool.algorithm_type = atoi(optarg); - break; - case 'f': - quant_tool.scale_file = optarg; - break; - case 'o': - quant_tool.output_file = optarg; - break; - case 'i': - quant_tool.image_dir = optarg; - break; - case 'g': - float img_chw[3]; - split(img_chw, optarg, ","); - quant_tool.img_c = (int)img_chw[0]; - quant_tool.img_h = (int)img_chw[1]; - quant_tool.img_w = (int)img_chw[2]; - break; - case 'w': - split(quant_tool.mean, optarg, ","); - break; - case 's': - split(quant_tool.scale, optarg, ","); - break; - case 'b': - quant_tool.sw_RGB = atoi(optarg); - break; - case 'c': - quant_tool.center_crop = atoi(optarg); - break; - case 'y': - float letterboxs[2]; - split(letterboxs, optarg, ","); - quant_tool.letterbox_rows = (int)letterboxs[0]; - quant_tool.letterbox_cols = (int)letterboxs[1]; - break; - case 'k': - quant_tool.focus = atoi(optarg); - break; - case 't': - quant_tool.num_thread = atoi(optarg); - quant_tool.opt.num_thread = atoi(optarg); - break; - case 'h': - show_usage(); - return 0; - default: - break; - } - } - - /* version */ - fprintf(stderr, "\n---- Tengine Post Training Quantization Tool ---- \n"); - fprintf(stderr, "\nVersion : v1.2, %s %s\n", __TIME__, __DATE__); - fprintf(stderr, "Status : uint8, per-channel, asymmetric\n"); - - /* check input params */ - if (quant_tool.model_file.empty()) - { - fprintf(stderr,"[Quant Tools Info]: The input file of Float32 tmfile file not specified!\n"); - show_usage(); - return -1; - } - - if (quant_tool.image_dir.empty()) - { - fprintf(stderr,"[Quant Tools Info]: The input dir of Calibration image not specified!\n"); - show_usage(); - return -1; - } - - if (quant_tool.output_file.empty()) - { - fprintf(stderr,"[Quant Tools Info]: The output file of Int8 tmfile not specified!\n"); - show_usage(); - return -1; - } - - /* debug info : input params */ - fprintf(stderr, "Input model : %s\n", quant_tool.model_file.c_str()); - fprintf(stderr, "Output model: %s\n", quant_tool.output_file.c_str()); - fprintf(stderr, "Calib images: %s\n", quant_tool.image_dir.c_str()); - fprintf(stderr, "Scale file : %s\n", quant_tool.scale_file.empty()?"NULL":quant_tool.scale_file.c_str()); - fprintf(stderr, "Algorithm : %s\n", quant_tool.algorithm_type?"KL":"MIN MAX"); - fprintf(stderr, "Dims : %d %d %d\n", quant_tool.img_c, quant_tool.img_h, quant_tool.img_w); - fprintf(stderr, "Mean : %.3f %.3f %.3f\n", quant_tool.mean[0], quant_tool.mean[1], quant_tool.mean[2]); - fprintf(stderr, "Scale : %.3f %.3f %.3f\n", quant_tool.scale[0], quant_tool.scale[1], quant_tool.scale[2]); - fprintf(stderr, "BGR2RGB : %s\n", quant_tool.sw_RGB?"ON":"OFF"); - fprintf(stderr, "Center crop : %s\n", quant_tool.center_crop?"ON":"OFF"); - fprintf(stderr, "Letter box : %d %d\n", quant_tool.letterbox_rows, quant_tool.letterbox_cols); - fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus?"ON":"OFF"); - fprintf(stderr, "Thread num : %d\n\n", quant_tool.num_thread); - - - /* using 3rd calibration table file */ - if (quant_tool.scale_file.empty()) - { - /* quantize activation */ - quant_tool.activation_quant_tool(); - - /* select algorithm */ - if (quant_tool.algorithm_type == ALGORITHM_MIN_MAX) - quant_tool.scale_file = "table_minmax.scale"; - else if (quant_tool.algorithm_type == ALGORITHM_KL) - quant_tool.scale_file = "table_kl.scale"; - else - { - fprintf(stderr,"[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n"); - quant_tool.scale_file = "table_minmax.scale"; - } - } - - /* quantize weight/bias and save into uint8 tmfile */ - fprintf(stderr,"[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str()); - save_graph_u8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); - - fprintf(stderr, "\n---- Tengine Int8 tmfile create success, best wish for your INT8 inference has a low accuracy loss...\\(^0^)/ ----\n"); - - return 0; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include +#include + +#include "quant_tool.hpp" +#include "quant_save_graph.hpp" + +QuantTool::QuantTool() +{ + // initial tengine + if (init_tengine() != 0) + { + fprintf(stderr, "Initial tengine failed.\n"); + } + + // system variable + this->opt.num_thread = 4; + this->opt.cluster = TENGINE_CLUSTER_ALL; + this->opt.precision = TENGINE_MODE_FP32; + this->opt.affinity = 0; + this->num_thread = 4; + + // input variable + this->sw_RGB = 1; + this->img_c = 3; + this->img_h = 224; + this->img_w = 224; + this->mean[0] = 104.f; + this->mean[1] = 117.f; + this->mean[2] = 123.f; + this->scale[0] = 1.f; + this->scale[1] = 1.f; + this->scale[2] = 1.f; + this->center_crop = 0; + this->letterbox_rows = 0; + this->letterbox_cols = 0; + this->focus = 0; + this->inplace = true; + this->algorithm_type = ALGORITHM_MIN_MAX; +} + +QuantTool::~QuantTool() +{ + /* release tengine */ + release_tengine(); +} + +int QuantTool::activation_quant_tool() +{ + fprintf(stderr, "[Quant Tools Info]: Step 0, load FP32 tmfile.\n"); + + /* create graph, load tengine model xxx.tmfile */ + struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file.c_str()); + if (nullptr == ir_graph) + { + fprintf(stderr, "Create graph failed.\n"); + return -1; + } + + fprintf(stderr, "[Quant Tools Info]: Step 0, load FP32 tmfile done.\n"); + + /* set the shape, data buffer of input_tensor of the graph */ + int img_size = img_h * img_w * img_c; + int dims[] = {1, img_c, img_h, img_w}; // nchw + std::vector input_data(img_size); + + tensor_t input_tensor = get_graph_input_tensor(ir_graph, 0, 0); + if (input_tensor == nullptr) + { + fprintf(stderr, "Get input tensor failed\n"); + return -1; + } + + if (set_tensor_shape(input_tensor, dims, 4) < 0) + { + fprintf(stderr, "Set input tensor shape failed\n"); + return -1; + } + + if (set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float)) < 0) + { + fprintf(stderr, "Set input tensor buffer failed\n"); + return -1; + } + + /* initial malloc the output tesnors date buffer of nodes in the graph, to disable the mem pool, before prerun */ + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* var_tensor = ir_graph->tensor_list[i]; + if (var_tensor->tensor_type == TENSOR_TYPE_VAR) + { + var_tensor->data = (float*)malloc(sizeof(float)); + } + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (prerun_graph_multithread(ir_graph, this->opt) < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + + fprintf(stderr, "[Quant Tools Info]: Step 0, load calibration image files.\n"); + + /* really malloc the output tesnors date buffer of nodes in the graph */ + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* var_tensor = ir_graph->tensor_list[i]; + if (var_tensor->tensor_type == TENSOR_TYPE_VAR) + { + var_tensor->data = realloc(var_tensor->data, sizeof(float) * var_tensor->elem_num); + memset(var_tensor->data, 0, sizeof(float) * var_tensor->elem_num); + } + } + + /* read image list */ + std::vector imgs_list; + readFileList(image_dir, imgs_list); + uint32_t img_num = imgs_list.size(); + + fprintf(stderr, "[Quant Tools Info]: Step 0, load calibration image files done, image num is %d.\n", img_num); + + /* init minmax */ + std::unordered_map max_activation; + std::unordered_map min_activation; + uint32_t act_tensor_num = 0; + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* act_tensor = ir_graph->tensor_list[i]; + if (act_tensor->tensor_type == TENSOR_TYPE_VAR || act_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + act_tensor_num++; + max_activation[i] = -FLT_MAX; + min_activation[i] = FLT_MAX; + } + } + + fprintf(stderr, "[Quant Tools Info]: Step 1, find original calibration table.\n"); + + /* first loop, find the min/max value of every activation tensor of the graph */ + double min_time = DBL_MAX; + double max_time = DBL_MIN; + double total_time = 0.; + for (int nums = 0; nums < img_num; nums++) + { + fprintf(stderr, "\r[Quant Tools Info]: Step 1, images %.5d / %.5d", nums + 1, img_num); + get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); + + /* run graph */ + double start = get_current_time(); + if (run_graph(ir_graph, 1) < 0) + { + fprintf(stderr, "Run graph failed\n"); + return -1; + } + + double end = get_current_time(); + double cur = end - start; + total_time += cur; + min_time = std::min(min_time, cur); + max_time = std::max(max_time, cur); + + /* get the min/max value of activation tensor */ + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* act_tensor = ir_graph->tensor_list[i]; + if (act_tensor->tensor_type == TENSOR_TYPE_VAR || act_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + float* start_addr = (float*)act_tensor->data; + float* end_addr = (float*)act_tensor->data + act_tensor->elem_num; + max_activation[i] = std::max(max_activation[i], *std::max_element(start_addr, end_addr)); + min_activation[i] = std::min(min_activation[i], *std::min_element(start_addr, end_addr)); + } + } + } + + /* save the calibration file with min-max algorithm */ + FILE* fp_minmax = fopen("table_minmax.scale", "wb"); + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* t = ir_graph->tensor_list[i]; + if (t->tensor_type == TENSOR_TYPE_VAR || t->tensor_type == TENSOR_TYPE_INPUT) + { + float act_scale; + int act_zero_point; + if (max_activation[i] < 0) + { + act_scale = (0 - min_activation[i]) / 255; + act_zero_point = int(-min_activation[i] / act_scale); + } + else if (min_activation[i] > 0) + { + act_scale = (max_activation[i] - 0) / 255; + act_zero_point = 0; + } + else + { + act_scale = (max_activation[i] - min_activation[i]) / 255; + act_zero_point = int(-min_activation[i] / act_scale); + } + + if (act_scale == 0) + act_zero_point = 0; + + /* the scale of softmax always is scale = 1 / 127.f */ + for (int j = 0; j < ir_graph->node_num; j++) + { + struct node* noden = ir_graph->node_list[j]; + struct tensor* tensor_tmp = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); + + if (!(tensor_tmp->tensor_type == TENSOR_TYPE_INPUT || tensor_tmp->tensor_type == TENSOR_TYPE_VAR)) + continue; + + std::string tmp_op_name = get_op_name_from_type(noden->op.type); + std::string cur_name = t->name; + std::string tmp_name = tensor_tmp->name; + + if ((cur_name == tmp_name) && tmp_op_name == "Softmax") + { + act_scale = 1 / 255.f; + act_zero_point = 0; + break; + } + } + + fprintf(fp_minmax, "%s %f %d\n", ir_graph->tensor_list[i]->name, act_scale, act_zero_point); + } + } + fclose(fp_minmax); + fprintf(stderr, "\r\n[Quant Tools Info]: Step 1, find original calibration table done, output ./table_minmax.scale\n"); + + if (this->algorithm_type == ALGORITHM_KL) + { + /* kl process divergence */ + fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table.\n"); + std::tr1::unordered_map tensor_hist; + std::tr1::unordered_map hist_tensor; + std::vector > hist_edge; + std::vector > hist_gram; + + /* second loop, create histgram */ + for (int nums = imgs_list.size() - 1; nums >= 0; nums--) + { + fprintf(stderr, "\r[Quant Tools Info]: Step 2, images %.5d / %.5d", nums + 1, img_num); + + get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); + + /* run graph */ + if (run_graph(ir_graph, 1) < 0) + { + fprintf(stderr, "Run graph failed\n"); + return -1; + } + + /* calculate hist */ + uint32_t inum = 0; + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* ir_tensor = ir_graph->tensor_list[i]; + if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + float step_max = max_activation[i] - min_activation[i]; + float step_bin = step_max / 2048.0f; + + std::vector every_edge; + if (nums == imgs_list.size() - 1) + { + for (int j = 0; j < 2048; j++) + { + float edge_float = (step_bin * (j + 0.5f)) + min_activation[i]; + every_edge.push_back(edge_float); + } + hist_edge.push_back(every_edge); + hist_gram.push_back(histCount((float*)ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i])); + } + else + { + std::vector hist_tmp; + hist_tmp = histCount((float*)ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i]); + for (int j = 0; j < 2048; j++) + { + hist_gram[inum][j] += hist_tmp[j]; + } + } + + tensor_hist[i] = inum; + hist_tensor[inum] = i; + inum++; + } + } + } + + fprintf(stderr, "\n"); + + /* save the calibration file with min-max algorithm with kl divergence */ + FILE* fp_kl = fopen("table_kl.scale", "wb"); + for (int i = 0; i < act_tensor_num; i++) + { + int threshold_bin = threshold_distribution(hist_gram[i], 256); + // fprintf(stderr, " threshold_bin %d \n", threshold_bin); + + std::vector hist_gram_F(threshold_bin + 1); + for (int j = 0; j < threshold_bin + 1; j++) + { + hist_gram_F[j] = hist_gram[i][threshold_bin - j]; + } + int threshold_bin_F = threshold_distribution(hist_gram_F, 256); + int threshold_bin_min = threshold_bin - threshold_bin_F + 1; + + // fprintf(stderr, "### %s : %d %f %f & %f %f\n",ir_graph->tensor_list[hist_tensor[i]]->name, threshold_bin, min_activation[hist_tensor[i]],\ + // hist_edge[i][threshold_bin_min], hist_edge[i][threshold_bin], max_activation[hist_tensor[i]]); + + float kl_min = hist_edge[i][threshold_bin_min]; + float kl_max = hist_edge[i][threshold_bin]; + + float act_scale = 1.0f; + int act_zero_point = 0; + if (kl_max < 0) + { + act_scale = (0 - kl_min) / 255.f; + act_zero_point = int(-kl_min / act_scale); + } + else if (kl_min > 0) + { + act_scale = (kl_max - 0) / 255.f; + act_zero_point = 0; + } + else + { + act_scale = (kl_max - kl_min) / 255.f; + act_zero_point = int(-kl_min / act_scale); + } + + if (act_scale == 0) + act_zero_point = 0; + + /* the scale of softmax always is scale = 1 / 255.f */ + for (int j = 0; j < ir_graph->node_num; j++) + { + struct node* ir_node = ir_graph->node_list[j]; + struct tensor* ir_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + + if (!(ir_tensor->tensor_type == TENSOR_TYPE_INPUT || ir_tensor->tensor_type == TENSOR_TYPE_VAR)) + continue; + + std::string tmp_op_name = get_op_name_from_type(ir_node->op.type); + std::string cur_name = ir_graph->tensor_list[hist_tensor[i]]->name; + std::string tmp_name = ir_tensor->name; + + if ((cur_name == tmp_name) && tmp_op_name == "Softmax") + { + act_scale = 1 / 255.f; + act_zero_point = 0; + break; + } + } + + fprintf(fp_kl, "%s %f %d\n", ir_graph->tensor_list[hist_tensor[i]]->name, act_scale, act_zero_point); + } + fclose(fp_kl); + fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table done, output ./table_kl.scale\n"); + } + + fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / img_num); + + /* release tengine */ + postrun_graph(ir_graph); + destroy_graph(ir_graph); + + return 0; +} + +const char* help_params = "[Quant Tools Info]: optional arguments:\n" + "\t-h help show this help message and exit\n" + "\t-m input model path to input float32 tmfile\n" + "\t-i image dir path to calibration images folder\n" + "\t-f scale file path to calibration scale file\n" + "\t-o output model path to output uint8 tmfile\n" + "\t-a algorithm the type of quant algorithm(0:min-max, 1:kl, default is 0)\n" + "\t-g size the size of input image(using the resize the original image,default is 3,224,224)\n" + "\t-w mean value of mean (mean value, default is 104.0,117.0,123.0)\n" + "\t-s scale value of normalize (scale value, default is 1.0,1.0,1.0)\n" + "\t-b swapRB flag which indicates that swap first and last channels in 3-channel image is necessary(0:OFF, 1:ON, default is 1)\n" + "\t-c center crop flag which indicates that center crop process image is necessary(0:OFF, 1:ON, default is 0)\n" + "\t-y letter box the size of letter box process image is necessary([rows, cols], default is [0, 0])\n" + "\t-k focus flag which indicates that focus process image is necessary(maybe using for YOLOv5, 0:OFF, 1:ON, default is 0)\n" + "\t-t num thread count of processing threads(default is 1)\n"; + +const char* example_params = "[Quant Tools Info]: example arguments:\n" + "\t./quant_tool_uint8 -m ./mobilenet_fp32.tmfile -i ./dataset -o ./mobilenet_uint8.tmfile -g 3,224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017\n"; + +void show_usage() +{ + fprintf(stderr, "%s\n", help_params); + fprintf(stderr, "%s\n", example_params); +} + +int main(int argc, char* argv[]) +{ + QuantTool quant_tool; + + int res; + while ((res = getopt(argc, argv, "m:a:f:o:i:g:s:w:b:c:y:k:t:h")) != -1) + { + switch (res) + { + case 'm': + quant_tool.model_file = optarg; + break; + case 'a': + quant_tool.algorithm_type = atoi(optarg); + break; + case 'f': + quant_tool.scale_file = optarg; + break; + case 'o': + quant_tool.output_file = optarg; + break; + case 'i': + quant_tool.image_dir = optarg; + break; + case 'g': + float img_chw[3]; + split(img_chw, optarg, ","); + quant_tool.img_c = (int)img_chw[0]; + quant_tool.img_h = (int)img_chw[1]; + quant_tool.img_w = (int)img_chw[2]; + break; + case 'w': + split(quant_tool.mean, optarg, ","); + break; + case 's': + split(quant_tool.scale, optarg, ","); + break; + case 'b': + quant_tool.sw_RGB = atoi(optarg); + break; + case 'c': + quant_tool.center_crop = atoi(optarg); + break; + case 'y': + float letterboxs[2]; + split(letterboxs, optarg, ","); + quant_tool.letterbox_rows = (int)letterboxs[0]; + quant_tool.letterbox_cols = (int)letterboxs[1]; + break; + case 'k': + quant_tool.focus = atoi(optarg); + break; + case 't': + quant_tool.num_thread = atoi(optarg); + quant_tool.opt.num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; + } + } + + /* version */ + fprintf(stderr, "\n---- Tengine Post Training Quantization Tool ---- \n"); + fprintf(stderr, "\nVersion : v1.2, %s %s\n", __TIME__, __DATE__); + fprintf(stderr, "Status : uint8, per-channel, asymmetric\n"); + + /* check input params */ + if (quant_tool.model_file.empty()) + { + fprintf(stderr, "[Quant Tools Info]: The input file of Float32 tmfile file not specified!\n"); + show_usage(); + return -1; + } + + if (quant_tool.image_dir.empty()) + { + fprintf(stderr, "[Quant Tools Info]: The input dir of Calibration image not specified!\n"); + show_usage(); + return -1; + } + + if (quant_tool.output_file.empty()) + { + fprintf(stderr, "[Quant Tools Info]: The output file of Int8 tmfile not specified!\n"); + show_usage(); + return -1; + } + + /* debug info : input params */ + fprintf(stderr, "Input model : %s\n", quant_tool.model_file.c_str()); + fprintf(stderr, "Output model: %s\n", quant_tool.output_file.c_str()); + fprintf(stderr, "Calib images: %s\n", quant_tool.image_dir.c_str()); + fprintf(stderr, "Scale file : %s\n", quant_tool.scale_file.empty() ? "NULL" : quant_tool.scale_file.c_str()); + fprintf(stderr, "Algorithm : %s\n", quant_tool.algorithm_type ? "KL" : "MIN MAX"); + fprintf(stderr, "Dims : %d %d %d\n", quant_tool.img_c, quant_tool.img_h, quant_tool.img_w); + fprintf(stderr, "Mean : %.3f %.3f %.3f\n", quant_tool.mean[0], quant_tool.mean[1], quant_tool.mean[2]); + fprintf(stderr, "Scale : %.3f %.3f %.3f\n", quant_tool.scale[0], quant_tool.scale[1], quant_tool.scale[2]); + fprintf(stderr, "BGR2RGB : %s\n", quant_tool.sw_RGB ? "ON" : "OFF"); + fprintf(stderr, "Center crop : %s\n", quant_tool.center_crop ? "ON" : "OFF"); + fprintf(stderr, "Letter box : %d %d\n", quant_tool.letterbox_rows, quant_tool.letterbox_cols); + fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus ? "ON" : "OFF"); + fprintf(stderr, "Thread num : %d\n\n", quant_tool.num_thread); + + /* using 3rd calibration table file */ + if (quant_tool.scale_file.empty()) + { + /* quantize activation */ + quant_tool.activation_quant_tool(); + + /* select algorithm */ + if (quant_tool.algorithm_type == ALGORITHM_MIN_MAX) + quant_tool.scale_file = "table_minmax.scale"; + else if (quant_tool.algorithm_type == ALGORITHM_KL) + quant_tool.scale_file = "table_kl.scale"; + else + { + fprintf(stderr, "[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n"); + quant_tool.scale_file = "table_minmax.scale"; + } + } + + /* quantize weight/bias and save into uint8 tmfile */ + fprintf(stderr, "[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str()); + save_graph_u8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + + fprintf(stderr, "\n---- Tengine Int8 tmfile create success, best wish for your INT8 inference has a low accuracy loss...\\(^0^)/ ----\n"); + + return 0; +} diff --git a/tools/quantize/quant_utils.cpp b/tools/quantize/quant_utils.cpp index ff4c72662..7cf67daf9 100644 --- a/tools/quantize/quant_utils.cpp +++ b/tools/quantize/quant_utils.cpp @@ -1,550 +1,543 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: hhchen@openailab.com - */ - - -#include -#include - -#include -#include -#include - -#ifdef _MSC_VER -#include "getopt.h" -#else -#include -#endif - -#ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#include -#else // _WIN32 -#include -#endif // _WIN32 - -#include "quant_utils.hpp" - - -#ifdef _WIN32 -double get_current_time() -{ - LARGE_INTEGER freq; - LARGE_INTEGER pc; - QueryPerformanceFrequency(&freq); - QueryPerformanceCounter(&pc); - - return pc.QuadPart * 1000.0 / freq.QuadPart; -} -#else // _WIN32 - -double get_current_time() -{ - struct timeval tv; - gettimeofday(&tv, nullptr); - - return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0; -} -#endif // _WIN32 - -void split(float* array, char* str, const char* del) -{ - char* s = nullptr; - s = strtok(str, del); - while (s != nullptr) - { - *array++ = atof(s); - s = strtok(nullptr, del); - } -} - -void get_input_data_cv(const char* image_file, float* input_data, int img_c, int img_h, int img_w, const float* mean, - const float* scale, int sw_RGB = 0, int center_crop = 0, int letterbox_rows = 0, int letterbox_cols = 0, int focus = 0) -{ - /* only for yolov5s */ - if (focus == 1 && letterbox_rows > 0 && letterbox_cols > 0) - { - cv::Mat sample = cv::imread(image_file, 1); - cv::Mat img; - - if (sample.channels() == 4) - { - cv::cvtColor(sample, img, cv::COLOR_BGRA2BGR); - } - else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 0) - { - cv::cvtColor(sample, img, cv::COLOR_GRAY2BGR); - } - else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 1) - { - cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB); - } - else if (sample.channels() == 3 && sw_RGB == 1 && img_c != 1) - { - cv::cvtColor(sample, img, cv::COLOR_BGR2RGB); - } - else if (sample.channels() == 3 && img_c == 1) - { - cv::cvtColor(sample, img, cv::COLOR_BGR2GRAY); - } - else - { - img = sample; - } - - /* letterbox process to support different letterbox size */ - float scale_letterbox; - int resize_rows; - int resize_cols; - if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) - { - scale_letterbox = letterbox_rows * 1.0 / img.rows; - } - else - { - scale_letterbox = letterbox_cols * 1.0 / img.cols; - } - resize_cols = int(scale_letterbox * img.cols); - resize_rows = int(scale_letterbox * img.rows); - - cv::resize(img, img, cv::Size(resize_cols, resize_rows)); - img.convertTo(img, CV_32FC3); - - // Generate a gray image for letterbox using opencv - cv::Mat resize_img(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])); - int top = (letterbox_rows - resize_rows) / 2; - int bot = (letterbox_rows - resize_rows + 1) / 2; - int left = (letterbox_cols - resize_cols) / 2; - int right = (letterbox_cols - resize_cols + 1) / 2; - - // Letterbox filling - cv::copyMakeBorder(img, resize_img, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])); - - resize_img.convertTo(resize_img, CV_32FC3); - float* img_data = (float* )resize_img.data; - float* input_temp = (float* )malloc(3 * letterbox_rows * letterbox_cols * sizeof(float)); - - /* nhwc to nchw */ - for (int h = 0; h < letterbox_rows; h++) - { - for (int w = 0; w < letterbox_cols; w++) - { - for (int c = 0; c < 3; c++) - { - int in_index = h * letterbox_cols * 3 + w * 3 + c; - int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w; - input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c]; - } - } - } - - /* focus process */ - for (int i = 0; i < 2; i++) // corresponding to rows - { - for (int g = 0; g < 2; g++) // corresponding to cols - { - for (int c = 0; c < 3; c++) - { - for (int h = 0; h < letterbox_rows/2; h++) - { - for (int w = 0; w < letterbox_cols/2; w++) - { - int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + - h * 2 * letterbox_cols + w * 2; - int out_index = i * 2 * 3 * (letterbox_cols/2) * (letterbox_rows/2) + - g * 3 * (letterbox_cols/2) * (letterbox_rows/2) + - c * (letterbox_cols/2) * (letterbox_rows/2) + - h * (letterbox_cols/2) + - w; - - input_data[out_index] = input_temp[in_index]; - } - } - } - } - } - - free(input_temp); - - return; - } - - cv::Mat sample = cv::imread(image_file, 1); - cv::Mat img; - - if (sample.channels() == 4) - { - cv::cvtColor(sample, img, cv::COLOR_BGRA2BGR); - } - else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 0) - { - cv::cvtColor(sample, img, cv::COLOR_GRAY2BGR); - } - else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 1) - { - cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB); - } - else if (sample.channels() == 3 && sw_RGB == 1 && img_c != 1) - { - cv::cvtColor(sample, img, cv::COLOR_BGR2RGB); - } - else if (sample.channels() == 3 && img_c == 1) - { - cv::cvtColor(sample, img, cv::COLOR_BGR2GRAY); - } - else - { - img = sample; - } - - if (center_crop == 1) - { - int h0 = 0; - int w0 = 0; - if ( img.rows < img.cols) - { - h0 = 256; - w0 = int(img.cols*(256.0/img.rows)); - } - else - { - h0 = int(img.rows*(256.0/img.cols)); - w0 = 256; - } - int center_h = int(h0/2); - int center_w = int(w0/2); - - float* img_data = nullptr; - - cv::resize(img, img, cv::Size(w0, h0)); - cv::Rect img_roi_box(center_w - 112, center_h - 112, 224, 224); - cv::Mat img_crop = img(img_roi_box).clone(); - - if (img_c == 3) - img_crop.convertTo(img_crop, CV_32FC3); - else if (img_c == 1) - img_crop.convertTo(img_crop, CV_32FC1); - img_data = ( float* )img_crop.data; - - int hw = img_h * img_w; - for (int h = 0; h < img_h; h++) - { - for (int w = 0; w < img_w; w++) - { - for (int c = 0; c < img_c; c++) - { - input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale[c]; - img_data++; - } - } - } - } - else if (letterbox_rows > 0) - { - float letterbox_size = (float)letterbox_rows; - int resize_h = 0; - int resize_w = 0; - if (img.rows > img.cols) - { - resize_h = letterbox_size; - resize_w = int(img.cols * (letterbox_size / img.rows)); - } - else - { - resize_h = int(img.rows * (letterbox_size / img.cols)); - resize_w = letterbox_size; - } - - float* img_data = nullptr; - - cv::resize(img, img, cv::Size(resize_w, resize_h)); - img.convertTo(img, CV_32FC3); - cv::Mat img_new(letterbox_size, letterbox_size, CV_32FC3, - cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])); - int dh = int((letterbox_size - resize_h) / 2); - int dw = int((letterbox_size - resize_w) / 2); - - for (int h = 0; h < resize_h; h++) - { - for (int w = 0; w < resize_w; w++) - { - for (int c = 0; c < 3; ++c) - { - int in_index = h * resize_w * 3 + w * 3 + c; - int out_index = (dh + h) * letterbox_size * 3 + (dw + w) * 3 + c; - - (( float* )img_new.data)[out_index] = (( float* )img.data)[in_index]; - } - } - } - - if (img_c == 3) - img_new.convertTo(img_new, CV_32FC3); - else if (img_c == 1) - img_new.convertTo(img_new, CV_32FC1); - img_data = ( float* )img_new.data; - - int hw = img_h * img_w; - for (int h = 0; h < img_h; h++) - { - for (int w = 0; w < img_w; w++) - { - for (int c = 0; c < img_c; c++) - { - input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale[c]; - img_data++; - } - } - } - } - else - { - cv::resize(img, img, cv::Size(img_w, img_h)); - if (img_c == 3) - img.convertTo(img, CV_32FC3); - else if (img_c == 1) - img.convertTo(img, CV_32FC1); - float* img_data = ( float* )img.data; - int hw = img_h * img_w; - for (int h = 0; h < img_h; h++) - { - for (int w = 0; w < img_w; w++) - { - for (int c = 0; c < img_c; c++) - { - input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale[c]; - img_data++; - } - } - } - } -} - -void readFileList(std::string basePath, std::vector& imgs) -{ - DIR *dir; - struct dirent *ptr; - std::string base; - - if ((dir=opendir(basePath.c_str())) == NULL) - { - perror("Open dir error..."); - exit(1); - } - - while ((ptr=readdir(dir)) != NULL) - { - if(strcmp(ptr->d_name,".")==0 || strcmp(ptr->d_name,"..")==0) ///current dir OR parrent dir - continue; - else if(ptr->d_type == 8) ///file - { - base = basePath + "/" + ptr->d_name; - imgs.push_back(base); - } - else if(ptr->d_type == 4) ///dir - { - readFileList(basePath + "/" + ptr->d_name, imgs); - } - } - closedir(dir); -} - -std::vector histCount(float *data, uint32_t elem_num, float max_val, float min_val) -{ - float bin_scale = (max_val - min_val) / 2047.f; - int bin_zp = int(-min_val / bin_scale); - std::vector hist(2048); - for (int i = 0; i < elem_num; i++) - if (data[i] != 0) - hist[uint32_t(data[i] / bin_scale + bin_zp)] ++; - return hist; -} - -float compute_kl_divergence(std::vector &dist_a, std::vector &dist_b) -{ - const size_t length = dist_a.size(); - float result = 0; - - for (size_t i = 0; i < length; i++) - { - if (dist_a[i] != 0) - { - if (dist_b[i] == 0) - { - result += 1; - } - else - { - result += dist_a[i] * log(dist_a[i] / dist_b[i]); - } - } - } - - return result; -} - -std::vector normalize_histogram(std::vector &histogram) -{ - std::vector histogram_out(histogram.size()); - const size_t length = histogram.size(); - float sum = 0; - - for (size_t i = 1; i < length; i++) - sum += histogram[i]; - - for (size_t i = 1; i < length; i++) - histogram_out[i] = float(histogram[i] / sum); - - return histogram_out; -} - -int threshold_distribution(std::vector &distribution_in, const int target_bin) -{ - int target_threshold = target_bin; - float min_kl_divergence = FLT_MAX; - const int length = static_cast(distribution_in.size()); - - std::vector distribution(distribution_in.size()); - std::vector quantize_distribution(target_bin); - distribution = normalize_histogram(distribution_in); - - float threshold_sum = 0; - for (int threshold = target_bin; threshold < length; threshold++) - { - threshold_sum += distribution[threshold]; - } - - for (int threshold = target_bin; threshold < length; threshold++) - { - std::vector t_distribution(distribution.begin(), distribution.begin() + threshold); - - t_distribution[threshold - 1] += threshold_sum; - threshold_sum -= distribution[threshold]; - - // get P - fill(quantize_distribution.begin(), quantize_distribution.end(), 0.0f); - - const float num_per_bin = static_cast(threshold) / static_cast(target_bin); - - for (int i = 0; i < target_bin; i++) - { - const float start = static_cast(i) * num_per_bin; - const float end = start + num_per_bin; - - const int left_upper = static_cast(ceil(start)); - if (static_cast(left_upper) > start) - { - const float left_scale = static_cast(left_upper) - start; - quantize_distribution[i] += left_scale * distribution[left_upper - 1]; - } - - const int right_lower = static_cast(floor(end)); - - if (static_cast(right_lower) < end) - { - const float right_scale = end - static_cast(right_lower); - quantize_distribution[i] += right_scale * distribution[right_lower]; - } - - for (int j = left_upper; j < right_lower; j++) - { - quantize_distribution[i] += distribution[j]; - } - } - - // get Q - std::vector expand_distribution(threshold, 0); - for (int i = 0; i < target_bin; i++) - { - const float start = static_cast(i) * num_per_bin; - const float end = start + num_per_bin; - - float count = 0; - - const int left_upper = static_cast(ceil(start)); - float left_scale = 0; - if (static_cast(left_upper) > start) - { - left_scale = static_cast(left_upper) - start; - if (distribution[left_upper - 1] != 0) - { - count += left_scale; - } - } - - const int right_lower = static_cast(floor(end)); - float right_scale = 0; - if (static_cast(right_lower) < end) - { - right_scale = end - static_cast(right_lower); - if (distribution[right_lower] != 0) - { - count += right_scale; - } - } - - for (int j = left_upper; j < right_lower; j++) - { - if (distribution[j] != 0) - { - count++; - } - } - - const float expand_value = quantize_distribution[i] / count; - - if (static_cast(left_upper) > start) - { - if (distribution[left_upper - 1] != 0) - { - expand_distribution[left_upper - 1] += expand_value * left_scale; - } - } - if (static_cast(right_lower) < end) - { - if (distribution[right_lower] != 0) - { - expand_distribution[right_lower] += expand_value * right_scale; - } - } - for (int j = left_upper; j < right_lower; j++) - { - if (distribution[j] != 0) - { - expand_distribution[j] += expand_value; - } - } - } - - const float kl_divergence = compute_kl_divergence(t_distribution, expand_distribution); - - // the best num of bin - if (kl_divergence < min_kl_divergence) - { - min_kl_divergence = kl_divergence; - target_threshold = threshold; - } - } - - return target_threshold; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include +#include + +#include +#include +#include + +#ifdef _MSC_VER +#include "getopt.h" +#else +#include +#endif + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include +#else // _WIN32 +#include +#endif // _WIN32 + +#include "quant_utils.hpp" + +#ifdef _WIN32 +double get_current_time() +{ + LARGE_INTEGER freq; + LARGE_INTEGER pc; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&pc); + + return pc.QuadPart * 1000.0 / freq.QuadPart; +} +#else // _WIN32 + +double get_current_time() +{ + struct timeval tv; + gettimeofday(&tv, nullptr); + + return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0; +} +#endif // _WIN32 + +void split(float* array, char* str, const char* del) +{ + char* s = nullptr; + s = strtok(str, del); + while (s != nullptr) + { + *array++ = atof(s); + s = strtok(nullptr, del); + } +} + +void get_input_data_cv(const char* image_file, float* input_data, int img_c, int img_h, int img_w, const float* mean, + const float* scale, int sw_RGB = 0, int center_crop = 0, int letterbox_rows = 0, int letterbox_cols = 0, int focus = 0) +{ + /* only for yolov5s */ + if (focus == 1 && letterbox_rows > 0 && letterbox_cols > 0) + { + cv::Mat sample = cv::imread(image_file, 1); + cv::Mat img; + + if (sample.channels() == 4) + { + cv::cvtColor(sample, img, cv::COLOR_BGRA2BGR); + } + else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 0) + { + cv::cvtColor(sample, img, cv::COLOR_GRAY2BGR); + } + else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 1) + { + cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB); + } + else if (sample.channels() == 3 && sw_RGB == 1 && img_c != 1) + { + cv::cvtColor(sample, img, cv::COLOR_BGR2RGB); + } + else if (sample.channels() == 3 && img_c == 1) + { + cv::cvtColor(sample, img, cv::COLOR_BGR2GRAY); + } + else + { + img = sample; + } + + /* letterbox process to support different letterbox size */ + float scale_letterbox; + int resize_rows; + int resize_cols; + if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) + { + scale_letterbox = letterbox_rows * 1.0 / img.rows; + } + else + { + scale_letterbox = letterbox_cols * 1.0 / img.cols; + } + resize_cols = int(scale_letterbox * img.cols); + resize_rows = int(scale_letterbox * img.rows); + + cv::resize(img, img, cv::Size(resize_cols, resize_rows)); + img.convertTo(img, CV_32FC3); + + // Generate a gray image for letterbox using opencv + cv::Mat resize_img(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0.5 / scale[0] + mean[0], 0.5 / scale[1] + mean[1], 0.5 / scale[2] + mean[2])); + int top = (letterbox_rows - resize_rows) / 2; + int bot = (letterbox_rows - resize_rows + 1) / 2; + int left = (letterbox_cols - resize_cols) / 2; + int right = (letterbox_cols - resize_cols + 1) / 2; + + // Letterbox filling + cv::copyMakeBorder(img, resize_img, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0.5 / scale[0] + mean[0], 0.5 / scale[1] + mean[1], 0.5 / scale[2] + mean[2])); + + resize_img.convertTo(resize_img, CV_32FC3); + float* img_data = (float*)resize_img.data; + float* input_temp = (float*)malloc(3 * letterbox_rows * letterbox_cols * sizeof(float)); + + /* nhwc to nchw */ + for (int h = 0; h < letterbox_rows; h++) + { + for (int w = 0; w < letterbox_cols; w++) + { + for (int c = 0; c < 3; c++) + { + int in_index = h * letterbox_cols * 3 + w * 3 + c; + int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w; + input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c]; + } + } + } + + /* focus process */ + for (int i = 0; i < 2; i++) // corresponding to rows + { + for (int g = 0; g < 2; g++) // corresponding to cols + { + for (int c = 0; c < 3; c++) + { + for (int h = 0; h < letterbox_rows / 2; h++) + { + for (int w = 0; w < letterbox_cols / 2; w++) + { + int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + h * 2 * letterbox_cols + w * 2; + int out_index = i * 2 * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + g * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + c * (letterbox_cols / 2) * (letterbox_rows / 2) + h * (letterbox_cols / 2) + w; + + input_data[out_index] = input_temp[in_index]; + } + } + } + } + } + + free(input_temp); + + return; + } + + cv::Mat sample = cv::imread(image_file, 1); + cv::Mat img; + + if (sample.channels() == 4) + { + cv::cvtColor(sample, img, cv::COLOR_BGRA2BGR); + } + else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 0) + { + cv::cvtColor(sample, img, cv::COLOR_GRAY2BGR); + } + else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 1) + { + cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB); + } + else if (sample.channels() == 3 && sw_RGB == 1 && img_c != 1) + { + cv::cvtColor(sample, img, cv::COLOR_BGR2RGB); + } + else if (sample.channels() == 3 && img_c == 1) + { + cv::cvtColor(sample, img, cv::COLOR_BGR2GRAY); + } + else + { + img = sample; + } + + if (center_crop == 1) + { + int h0 = 0; + int w0 = 0; + if (img.rows < img.cols) + { + h0 = 256; + w0 = int(img.cols * (256.0 / img.rows)); + } + else + { + h0 = int(img.rows * (256.0 / img.cols)); + w0 = 256; + } + int center_h = int(h0 / 2); + int center_w = int(w0 / 2); + + float* img_data = nullptr; + + cv::resize(img, img, cv::Size(w0, h0)); + cv::Rect img_roi_box(center_w - 112, center_h - 112, 224, 224); + cv::Mat img_crop = img(img_roi_box).clone(); + + if (img_c == 3) + img_crop.convertTo(img_crop, CV_32FC3); + else if (img_c == 1) + img_crop.convertTo(img_crop, CV_32FC1); + img_data = (float*)img_crop.data; + + int hw = img_h * img_w; + for (int h = 0; h < img_h; h++) + { + for (int w = 0; w < img_w; w++) + { + for (int c = 0; c < img_c; c++) + { + input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale[c]; + img_data++; + } + } + } + } + else if (letterbox_rows > 0) + { + float letterbox_size = (float)letterbox_rows; + int resize_h = 0; + int resize_w = 0; + if (img.rows > img.cols) + { + resize_h = letterbox_size; + resize_w = int(img.cols * (letterbox_size / img.rows)); + } + else + { + resize_h = int(img.rows * (letterbox_size / img.cols)); + resize_w = letterbox_size; + } + + float* img_data = nullptr; + + cv::resize(img, img, cv::Size(resize_w, resize_h)); + img.convertTo(img, CV_32FC3); + cv::Mat img_new(letterbox_size, letterbox_size, CV_32FC3, + cv::Scalar(0.5 / scale[0] + mean[0], 0.5 / scale[1] + mean[1], 0.5 / scale[2] + mean[2])); + int dh = int((letterbox_size - resize_h) / 2); + int dw = int((letterbox_size - resize_w) / 2); + + for (int h = 0; h < resize_h; h++) + { + for (int w = 0; w < resize_w; w++) + { + for (int c = 0; c < 3; ++c) + { + int in_index = h * resize_w * 3 + w * 3 + c; + int out_index = (dh + h) * letterbox_size * 3 + (dw + w) * 3 + c; + + ((float*)img_new.data)[out_index] = ((float*)img.data)[in_index]; + } + } + } + + if (img_c == 3) + img_new.convertTo(img_new, CV_32FC3); + else if (img_c == 1) + img_new.convertTo(img_new, CV_32FC1); + img_data = (float*)img_new.data; + + int hw = img_h * img_w; + for (int h = 0; h < img_h; h++) + { + for (int w = 0; w < img_w; w++) + { + for (int c = 0; c < img_c; c++) + { + input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale[c]; + img_data++; + } + } + } + } + else + { + cv::resize(img, img, cv::Size(img_w, img_h)); + if (img_c == 3) + img.convertTo(img, CV_32FC3); + else if (img_c == 1) + img.convertTo(img, CV_32FC1); + float* img_data = (float*)img.data; + int hw = img_h * img_w; + for (int h = 0; h < img_h; h++) + { + for (int w = 0; w < img_w; w++) + { + for (int c = 0; c < img_c; c++) + { + input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale[c]; + img_data++; + } + } + } + } +} + +void readFileList(std::string basePath, std::vector& imgs) +{ + DIR* dir; + struct dirent* ptr; + std::string base; + + if ((dir = opendir(basePath.c_str())) == NULL) + { + perror("Open dir error..."); + exit(1); + } + + while ((ptr = readdir(dir)) != NULL) + { + if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) ///current dir OR parrent dir + continue; + else if (ptr->d_type == 8) ///file + { + base = basePath + "/" + ptr->d_name; + imgs.push_back(base); + } + else if (ptr->d_type == 4) ///dir + { + readFileList(basePath + "/" + ptr->d_name, imgs); + } + } + closedir(dir); +} + +std::vector histCount(float* data, uint32_t elem_num, float max_val, float min_val) +{ + float bin_scale = (max_val - min_val) / 2047.f; + int bin_zp = int(-min_val / bin_scale); + std::vector hist(2048); + for (int i = 0; i < elem_num; i++) + if (data[i] != 0) + hist[uint32_t(data[i] / bin_scale + bin_zp)]++; + return hist; +} + +float compute_kl_divergence(std::vector& dist_a, std::vector& dist_b) +{ + const size_t length = dist_a.size(); + float result = 0; + + for (size_t i = 0; i < length; i++) + { + if (dist_a[i] != 0) + { + if (dist_b[i] == 0) + { + result += 1; + } + else + { + result += dist_a[i] * log(dist_a[i] / dist_b[i]); + } + } + } + + return result; +} + +std::vector normalize_histogram(std::vector& histogram) +{ + std::vector histogram_out(histogram.size()); + const size_t length = histogram.size(); + float sum = 0; + + for (size_t i = 1; i < length; i++) + sum += histogram[i]; + + for (size_t i = 1; i < length; i++) + histogram_out[i] = float(histogram[i] / sum); + + return histogram_out; +} + +int threshold_distribution(std::vector& distribution_in, const int target_bin) +{ + int target_threshold = target_bin; + float min_kl_divergence = FLT_MAX; + const int length = static_cast(distribution_in.size()); + + std::vector distribution(distribution_in.size()); + std::vector quantize_distribution(target_bin); + distribution = normalize_histogram(distribution_in); + + float threshold_sum = 0; + for (int threshold = target_bin; threshold < length; threshold++) + { + threshold_sum += distribution[threshold]; + } + + for (int threshold = target_bin; threshold < length; threshold++) + { + std::vector t_distribution(distribution.begin(), distribution.begin() + threshold); + + t_distribution[threshold - 1] += threshold_sum; + threshold_sum -= distribution[threshold]; + + // get P + fill(quantize_distribution.begin(), quantize_distribution.end(), 0.0f); + + const float num_per_bin = static_cast(threshold) / static_cast(target_bin); + + for (int i = 0; i < target_bin; i++) + { + const float start = static_cast(i) * num_per_bin; + const float end = start + num_per_bin; + + const int left_upper = static_cast(ceil(start)); + if (static_cast(left_upper) > start) + { + const float left_scale = static_cast(left_upper) - start; + quantize_distribution[i] += left_scale * distribution[left_upper - 1]; + } + + const int right_lower = static_cast(floor(end)); + + if (static_cast(right_lower) < end) + { + const float right_scale = end - static_cast(right_lower); + quantize_distribution[i] += right_scale * distribution[right_lower]; + } + + for (int j = left_upper; j < right_lower; j++) + { + quantize_distribution[i] += distribution[j]; + } + } + + // get Q + std::vector expand_distribution(threshold, 0); + for (int i = 0; i < target_bin; i++) + { + const float start = static_cast(i) * num_per_bin; + const float end = start + num_per_bin; + + float count = 0; + + const int left_upper = static_cast(ceil(start)); + float left_scale = 0; + if (static_cast(left_upper) > start) + { + left_scale = static_cast(left_upper) - start; + if (distribution[left_upper - 1] != 0) + { + count += left_scale; + } + } + + const int right_lower = static_cast(floor(end)); + float right_scale = 0; + if (static_cast(right_lower) < end) + { + right_scale = end - static_cast(right_lower); + if (distribution[right_lower] != 0) + { + count += right_scale; + } + } + + for (int j = left_upper; j < right_lower; j++) + { + if (distribution[j] != 0) + { + count++; + } + } + + const float expand_value = quantize_distribution[i] / count; + + if (static_cast(left_upper) > start) + { + if (distribution[left_upper - 1] != 0) + { + expand_distribution[left_upper - 1] += expand_value * left_scale; + } + } + if (static_cast(right_lower) < end) + { + if (distribution[right_lower] != 0) + { + expand_distribution[right_lower] += expand_value * right_scale; + } + } + for (int j = left_upper; j < right_lower; j++) + { + if (distribution[j] != 0) + { + expand_distribution[j] += expand_value; + } + } + } + + const float kl_divergence = compute_kl_divergence(t_distribution, expand_distribution); + + // the best num of bin + if (kl_divergence < min_kl_divergence) + { + min_kl_divergence = kl_divergence; + target_threshold = threshold; + } + } + + return target_threshold; +} diff --git a/tools/quantize/quant_utils.hpp b/tools/quantize/quant_utils.hpp index 6440a8708..4ad636763 100644 --- a/tools/quantize/quant_utils.hpp +++ b/tools/quantize/quant_utils.hpp @@ -1,49 +1,48 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: hhchen@openailab.com - */ -#pragma once - -#include -#include -#include -#include -#include -#include - - -double get_current_time(); - -void split(float* array, char* str, const char* del); - -void get_input_data_cv(const char* image_file, float* input_data, int img_c, int img_h, int img_w, const float* mean, - const float* scale, int sw_RGB, int center_crop, int letterbox_rows, int letterbox_cols, int focus); - -void readFileList(std::string basePath, std::vector& imgs); - -std::vector histCount(float *data, uint32_t elem_num, float max_val, float min_val); - -float compute_kl_divergence(std::vector &dist_a, std::vector &dist_b); - -std::vector normalize_histogram(std::vector &histogram); - -int threshold_distribution(std::vector &distribution_in, const int target_bin); +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: hhchen@openailab.com + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +double get_current_time(); + +void split(float* array, char* str, const char* del); + +void get_input_data_cv(const char* image_file, float* input_data, int img_c, int img_h, int img_w, const float* mean, + const float* scale, int sw_RGB, int center_crop, int letterbox_rows, int letterbox_cols, int focus); + +void readFileList(std::string basePath, std::vector& imgs); + +std::vector histCount(float* data, uint32_t elem_num, float max_val, float min_val); + +float compute_kl_divergence(std::vector& dist_a, std::vector& dist_b); + +std::vector normalize_histogram(std::vector& histogram); + +int threshold_distribution(std::vector& distribution_in, const int target_bin); diff --git a/tools/quantize/savegraph/save_graph.cpp b/tools/quantize/savegraph/save_graph.cpp index 92b24b443..b528ca2fc 100644 --- a/tools/quantize/savegraph/save_graph.cpp +++ b/tools/quantize/savegraph/save_graph.cpp @@ -35,7 +35,7 @@ bool IsSaveString(void) { const char* env = std::getenv("TM_NO_STRING"); - if(env) + if (env) return false; else return true; @@ -45,7 +45,7 @@ bool IsSaveData(void) { const char* env = std::getenv("TM_FOR_BENCHMARK"); - if(env) + if (env) return false; else return true; @@ -53,7 +53,7 @@ bool IsSaveData(void) bool RegisterOpSaveMethod(const uint16_t& op_type, const op_save_t& save_func) { - if(op_save_map_.count(op_type)) + if (op_save_map_.count(op_type)) return false; op_save_map_[op_type] = save_func; @@ -61,7 +61,7 @@ bool RegisterOpSaveMethod(const uint16_t& op_type, const op_save_t& save_func) } tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct tensor* tensor, - unsigned int tensor_id, unsigned int buffer_id) + unsigned int tensor_id, unsigned int buffer_id) { TM2_Tensor tm_tensor; tm_tensor.tensor_id = tensor_id; @@ -72,11 +72,11 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct t bool tm_with_string = IsSaveString(); - if(tm_with_string) + if (tm_with_string) { std::string name = tensor->name; TM2_String tensor_name; - tensor_name.size = name.size() + 1; // including trailing \0 + tensor_name.size = name.size() + 1; // including trailing \0 tensor_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), tensor_name.size); tm_tensor.offset_s_tname = WriteTmObject(start_ptr, cur_pos, &tensor_name, sizeof(TM2_String)); } @@ -86,13 +86,13 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct t /* Get the dims of the tensor */ int* dim = tensor->dims; size_t vector_size; - if(tensor->dim_num) + if (tensor->dim_num) { /* Write the vector of dims */ vector_size = sizeof(tm_size_t) + sizeof(int32_t) * tensor->dim_num; - TM2_Vector_dims* v_dims = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_dims = (TM2_Vector_dims*)malloc(vector_size); v_dims->v_num = tensor->dim_num; - for(unsigned int i = 0; i < tensor->dim_num; i++) + for (unsigned int i = 0; i < tensor->dim_num; i++) { v_dims->dims[i] = dim[i]; } @@ -103,10 +103,10 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct t tm_tensor.offset_vd_dims = TM2_NOT_SET; /* Write the quant params */ - if(tensor->quant_param_num != 0) + if (tensor->quant_param_num != 0) { vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor->quant_param_num; - TM2_Vector_offsets* v_qtparams = ( TM2_Vector_offsets* )malloc(vector_size); + TM2_Vector_offsets* v_qtparams = (TM2_Vector_offsets*)malloc(vector_size); v_qtparams->v_num = tensor->quant_param_num; if (v_qtparams->v_num == 1) { @@ -117,7 +117,7 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct t } else if (v_qtparams->v_num > 1) { - for(unsigned int i = 0; i < v_qtparams->v_num; i++) + for (unsigned int i = 0; i < v_qtparams->v_num; i++) { TM2_QuantParam qtparam; qtparam.zero_point = tensor->zp_list[i]; @@ -126,7 +126,6 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct t v_qtparams->offsets[i] = WriteTmObject(start_ptr, cur_pos, &qtparam, sizeof(TM2_QuantParam)); } } - /* Write the vector of quant params */ tm_tensor.offect_vo_quantparams = WriteTmObject(start_ptr, cur_pos, v_qtparams, vector_size); @@ -139,20 +138,20 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct t } tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, struct graph* graph, struct node* node, - name_map_t& tensor_name_map) + name_map_t& tensor_name_map) { TM2_Node tm_node; - memset(&tm_node, 0 , sizeof(TM2_Node)); + memset(&tm_node, 0, sizeof(TM2_Node)); tm_node.node_id = node->index; tm_node.dynamic_shape = node->dynamic_shape; bool tm_with_string = IsSaveString(); - if(tm_with_string) + if (tm_with_string) { std::string name = node->name; TM2_String node_name; - node_name.size = name.size() + 1; // including trailing \0 + node_name.size = name.size() + 1; // including trailing \0 node_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), node_name.size); tm_node.offset_s_nname = WriteTmObject(start_ptr, cur_pos, &node_name, sizeof(TM2_String)); } @@ -162,13 +161,13 @@ tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, struct gra unsigned int input_num = node->input_num; unsigned int output_num = node->output_num; - if(input_num) + if (input_num) { /* Write the vector of input indices */ size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * input_num; - TM2_Vector_indices* v_input_indices = ( TM2_Vector_indices* )malloc(vector_size); + TM2_Vector_indices* v_input_indices = (TM2_Vector_indices*)malloc(vector_size); v_input_indices->v_num = input_num; - for(unsigned int i = 0; i < input_num; i++) + for (unsigned int i = 0; i < input_num; i++) { struct tensor* p_tensor = get_ir_graph_tensor(graph, node->input_tensors[i]); v_input_indices->indices[i] = tensor_name_map[p_tensor->name]; @@ -179,13 +178,13 @@ tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, struct gra else tm_node.offset_vi_input_tensors = TM2_NOT_SET; - if(output_num) + if (output_num) { /* Write the vector of output indices */ size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * output_num; - TM2_Vector_indices* v_output_indices = ( TM2_Vector_indices* )malloc(vector_size); + TM2_Vector_indices* v_output_indices = (TM2_Vector_indices*)malloc(vector_size); v_output_indices->v_num = output_num; - for(unsigned int i = 0; i < output_num; i++) + for (unsigned int i = 0; i < output_num; i++) { struct tensor* p_tensor = get_ir_graph_tensor(graph, node->output_tensors[i]); v_output_indices->indices[i] = tensor_name_map[p_tensor->name]; @@ -198,7 +197,7 @@ tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, struct gra /* Write tm operator */ uint16_t op_type = node->op.type; - if(!op_save_map_.count(op_type)) + if (!op_save_map_.count(op_type)) { TLOG_ERR("cannot find save function for operator:%d \n", op_type); return false; @@ -230,12 +229,12 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, struct /* Write the nodes */ size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * graph->node_num; - TM2_Vector_offsets* v_nodes = ( TM2_Vector_offsets* )malloc(vector_size); + TM2_Vector_offsets* v_nodes = (TM2_Vector_offsets*)malloc(vector_size); v_nodes->v_num = graph->node_num; - for(unsigned int i = 0; i < graph->node_num; i++) + for (unsigned int i = 0; i < graph->node_num; i++) { struct node* p_node = get_ir_graph_node(graph, i); - for(unsigned int k = 0; k < p_node->output_num; k++) + for (unsigned int k = 0; k < p_node->output_num; k++) { struct tensor* p_tensor = get_ir_graph_tensor(graph, p_node->output_tensors[k]); tensor_ptrs.push_back(p_tensor); @@ -249,12 +248,12 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, struct /* Write the tensors */ vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor_num; - TM2_Vector_offsets* v_tensors = ( TM2_Vector_offsets* )malloc(vector_size); + TM2_Vector_offsets* v_tensors = (TM2_Vector_offsets*)malloc(vector_size); v_tensors->v_num = tensor_num; - for(unsigned int i = 0; i < tensor_num; i++) + for (unsigned int i = 0; i < tensor_num; i++) { struct tensor* p_tensor = tensor_ptrs[i]; - if(p_tensor->tensor_type == TENSOR_TYPE_CONST) + if (p_tensor->tensor_type == TENSOR_TYPE_CONST) { // buf_ptrs.push_back(p_tensor->GetMemAddr()); buf_ptrs.push_back(p_tensor->data); // may cause bug @@ -269,14 +268,14 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, struct /* Write the buffers */ vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * buffer_num; - TM2_Vector_offsets* v_buffers = ( TM2_Vector_offsets* )malloc(vector_size); + TM2_Vector_offsets* v_buffers = (TM2_Vector_offsets*)malloc(vector_size); v_buffers->v_num = buffer_num; - for(unsigned int i = 0; i < buffer_num; i++) + for (unsigned int i = 0; i < buffer_num; i++) { TM2_Buffer tm_buf; tm_buf.size = buf_sizes[i]; - if(tm_no_data) + if (tm_no_data) { /* TM2_FOR_BENCHMARK environment variable exists. Not write buf data into the tm file */ tm_buf.offset_data = TM2_NOT_SET; @@ -284,8 +283,7 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, struct else { /* TM2_FOR_BENCHMARK environment variable does not exist */ - tm_buf.offset_data = - WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast(buf_ptrs[i]), tm_buf.size); + tm_buf.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast(buf_ptrs[i]), tm_buf.size); } v_buffers->offsets[i] = WriteTmObject(start_ptr, cur_pos, &tm_buf, sizeof(TM2_Buffer)); } @@ -294,9 +292,9 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, struct /* Write the vector of input indices */ vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->input_num; - TM2_Vector_indices* v_input_indices = ( TM2_Vector_indices* )malloc(vector_size); + TM2_Vector_indices* v_input_indices = (TM2_Vector_indices*)malloc(vector_size); v_input_indices->v_num = graph->input_num; - for(unsigned int i = 0; i < graph->input_num; i++) + for (unsigned int i = 0; i < graph->input_num; i++) { v_input_indices->indices[i] = graph->input_nodes[i]; } @@ -304,9 +302,9 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, struct /* Write the vector of output indices */ vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->output_num; - TM2_Vector_indices* v_output_indices = ( TM2_Vector_indices* )malloc(vector_size); + TM2_Vector_indices* v_output_indices = (TM2_Vector_indices*)malloc(vector_size); v_output_indices->v_num = graph->output_num; - for(unsigned int i = 0; i < graph->output_num; i++) + for (unsigned int i = 0; i < graph->output_num; i++) { v_output_indices->indices[i] = graph->output_nodes[i]; } @@ -356,7 +354,7 @@ bool SaveModelIntoMem(void* start_ptr, struct graph* graph, uint32_t* tm_model_s /* Write the subgraphs */ /* Only 1 subgraph is supported currently */ size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * 1; - TM2_Vector_offsets* v_subgraphs = ( TM2_Vector_offsets* )malloc(vector_size); + TM2_Vector_offsets* v_subgraphs = (TM2_Vector_offsets*)malloc(vector_size); v_subgraphs->v_num = 1; v_subgraphs->offsets[0] = SaveTmSubgraph(start_ptr, &cur_pos, graph); @@ -382,17 +380,16 @@ int save_model(std::vector& addr_list, std::vector& size_list, struc uint32_t malloc_size = TM_FILE_MAX_SIZE; const char* env = std::getenv("TM_FILE_MAX_SIZE"); - if(env) + if (env) malloc_size = std::atoi(env); - void* start_ptr = ( void* )malloc(malloc_size); - if(start_ptr == nullptr) + void* start_ptr = (void*)malloc(malloc_size); + if (start_ptr == nullptr) { - TLOG_ERR("Malloc memory failed: .\n",malloc_size); + TLOG_ERR("Malloc memory failed: .\n", malloc_size); return false; } - bool ret = SaveModelIntoMem(start_ptr, graph, &tm_model_size); addr_list.push_back(start_ptr); @@ -411,16 +408,16 @@ bool save_graph(graph_t graph, const char* fname) struct graph* ir_graph = (struct graph*)graph; /* Open the tengine model file */ int fd = open(fname, O_RDWR | O_CREAT | O_TRUNC, 0666); - if(fd == -1) + if (fd == -1) { - TLOG_ERR("Could not open %s\n",fname); + TLOG_ERR("Could not open %s\n", fname); return false; } std::vector addr_list; std::vector size_list; - if(!save_model(addr_list, size_list, ir_graph)) + if (!save_model(addr_list, size_list, ir_graph)) { close(fd); return false; @@ -433,7 +430,7 @@ bool save_graph(graph_t graph, const char* fname) close(fd); free(buf); - if(ret != size) + if (ret != size) return false; else return true; diff --git a/tools/quantize/savegraph/save_graph.hpp b/tools/quantize/savegraph/save_graph.hpp index 5ed1757f4..fe4fb2d0f 100644 --- a/tools/quantize/savegraph/save_graph.hpp +++ b/tools/quantize/savegraph/save_graph.hpp @@ -9,8 +9,7 @@ #include #include -extern "C" -{ +extern "C" { #include "tengine/c_api.h" #include "graph/graph.h" #include "graph/subgraph.h" @@ -21,8 +20,6 @@ extern "C" #include "tm2_format.h" } - #include "tm2_op_save.hpp" - bool save_graph(graph_t graph, const char* fname); diff --git a/tools/quantize/savegraph/tm2_format.h b/tools/quantize/savegraph/tm2_format.h index 5fb2aea3b..fc4fa32a4 100644 --- a/tools/quantize/savegraph/tm2_format.h +++ b/tools/quantize/savegraph/tm2_format.h @@ -32,8 +32,8 @@ extern "C" { #endif -#define TM2_FILE_VER_MAIN 2 -#define TM2_FILE_VER_SUB 0 +#define TM2_FILE_VER_MAIN 2 +#define TM2_FILE_VER_SUB 0 #define TM2_FILE_VER_COMPILE 0 #define TM2_OP_VER 1 @@ -42,247 +42,247 @@ extern "C" { /* Type define */ typedef uint32_t tm_uoffset_t; /* offset is 4-byte unsigned integer */ -typedef uint32_t tm_size_t; /* size is 4-byte unsigned integer */ -typedef uint8_t tm_bool_t; /* bool is 1-byte unsigned integer */ +typedef uint32_t tm_size_t; /* size is 4-byte unsigned integer */ +typedef uint8_t tm_bool_t; /* bool is 1-byte unsigned integer */ /* Operator strings */ -#define TM2_OPSTR_ACCURACY "Accuracy" -#define TM2_OPSTR_BATCHNORMALIZATION "BatchNormalization" -#define TM2_OPSTR_BILINEARRESIZE "Resize" -#define TM2_OPSTR_CONCAT "Concat" -#define TM2_OPSTR_CONST "Const" -#define TM2_OPSTR_CONVOLUTION "Convolution" -#define TM2_OPSTR_DECONVOLUTION "Deconvolution" -#define TM2_OPSTR_DETECTIONOUTPUT "DetectionOutput" -#define TM2_OPSTR_DROPOUT "Dropout" -#define TM2_OPSTR_ELTWISE "Eltwise" -#define TM2_OPSTR_FLATTEN "Flatten" -#define TM2_OPSTR_FULLYCONNECTED "FullyConnected" -#define TM2_OPSTR_INPUTOP "InputOp" -#define TM2_OPSTR_LRN "LRN" -#define TM2_OPSTR_NORMALIZE "Normalize" -#define TM2_OPSTR_PERMUTE "Permute" -#define TM2_OPSTR_POOLING "Pooling" -#define TM2_OPSTR_PRELU "PReLU" -#define TM2_OPSTR_PRIORBOX "PriorBox" -#define TM2_OPSTR_REGION "Region" -#define TM2_OPSTR_RELU "ReLu" -#define TM2_OPSTR_RELU6 "ReLu6" -#define TM2_OPSTR_REORG "Reorg" -#define TM2_OPSTR_RESHAPE "Reshape" -#define TM2_OPSTR_ROIPOOLING "ROIPooling" -#define TM2_OPSTR_RPN "RPN" -#define TM2_OPSTR_SCALE "Scale" -#define TM2_OPSTR_SLICE "Slice" -#define TM2_OPSTR_SOFTMAX "Softmax" -#define TM2_OPSTR_SPLIT "Split" +#define TM2_OPSTR_ACCURACY "Accuracy" +#define TM2_OPSTR_BATCHNORMALIZATION "BatchNormalization" +#define TM2_OPSTR_BILINEARRESIZE "Resize" +#define TM2_OPSTR_CONCAT "Concat" +#define TM2_OPSTR_CONST "Const" +#define TM2_OPSTR_CONVOLUTION "Convolution" +#define TM2_OPSTR_DECONVOLUTION "Deconvolution" +#define TM2_OPSTR_DETECTIONOUTPUT "DetectionOutput" +#define TM2_OPSTR_DROPOUT "Dropout" +#define TM2_OPSTR_ELTWISE "Eltwise" +#define TM2_OPSTR_FLATTEN "Flatten" +#define TM2_OPSTR_FULLYCONNECTED "FullyConnected" +#define TM2_OPSTR_INPUTOP "InputOp" +#define TM2_OPSTR_LRN "LRN" +#define TM2_OPSTR_NORMALIZE "Normalize" +#define TM2_OPSTR_PERMUTE "Permute" +#define TM2_OPSTR_POOLING "Pooling" +#define TM2_OPSTR_PRELU "PReLU" +#define TM2_OPSTR_PRIORBOX "PriorBox" +#define TM2_OPSTR_REGION "Region" +#define TM2_OPSTR_RELU "ReLu" +#define TM2_OPSTR_RELU6 "ReLu6" +#define TM2_OPSTR_REORG "Reorg" +#define TM2_OPSTR_RESHAPE "Reshape" +#define TM2_OPSTR_ROIPOOLING "ROIPooling" +#define TM2_OPSTR_RPN "RPN" +#define TM2_OPSTR_SCALE "Scale" +#define TM2_OPSTR_SLICE "Slice" +#define TM2_OPSTR_SOFTMAX "Softmax" +#define TM2_OPSTR_SPLIT "Split" #define TM2_OPSTR_DETECTIONPOSTPROCESS "DetectionPostProcess" -#define TM2_OPSTR_GEMM "Gemm" -#define TM2_OPSTR_GENERIC "Generic" -#define TM2_OPSTR_LOGISTIC "Logistic" -#define TM2_OPSTR_LSTM "LSTM" -#define TM2_OPSTR_RNN "RNN" -#define TM2_OPSTR_TANH "Tanh" -#define TM2_OPSTR_SIGMOID "Sigmoid" -#define TM2_OPSTR_SQUEEZE "Squeeze" -#define TM2_OPSTR_PAD "Pad" -#define TM2_OPSTR_STRIDEDSLICE "StridedSlice" -#define TM2_OPSTR_REDUCTION "Reduction" -#define TM2_OPSTR_ARGMAX "ArgMax" -#define TM2_OPSTR_ARGMIN "ArgMin" -#define TM2_OPSTR_TOPKV2 "TopKV2" -#define TM2_OPSTR_MAX "Maximum" -#define TM2_OPSTR_MIN "Minimum" -#define TM2_OPSTR_ADDN "Addn" -#define TM2_OPSTR_SWAPAXIS "SwapAxis" -#define TM2_OPSTR_GRU "GRU" -#define TM2_OPSTR_FUSEDBNSCALERELU "Fused.BNScaleReLu" -#define TM2_OPSTR_UPSAMPLE "Upsample" -#define TM2_OPSTR_SHUFFLECHANNEL "ShuffleChannel" -#define TM2_OPSTR_RESIZE "Resize" -#define TM2_OPSTR_SPACETOBATCHND "SpaceToBatchND" -#define TM2_OPSTR_BATCHTOSPACEND "BatchToSpaceND" -#define TM2_OPSTR_CROP "Crop" -#define TM2_OPSTR_PSROIPOOLING "Psroipooling" -#define TM2_OPSTR_ROIALIGN "Roialign" -#define TM2_OPSTR_EXPANDDIMS "Expanddims" -#define TM2_OPSTR_UNARY "Unary" -#define TM2_OPSTR_BIAS "Bias" -#define TM2_OPSTR_NOOP "Noop" -#define TM2_OPSTR_THRESHOLD "Threshold" -#define TM2_OPSTR_HARDSIGMOID "Hardsigmoid" -#define TM2_OPSTR_EMBED "Embedding" -#define TM2_OPSTR_INSTANCENORM "InstanceNorm" -#define TM2_OPSTR_MVN "MVN" -#define TM2_OPSTR_ABSVAL "Absval" -#define TM2_OPSTR_CAST "Cast" -#define TM2_OPSTR_HARDSWISH "HardSwish" -#define TM2_OPSTR_INTERP "Interp" -#define TM2_OPSTR_SELU "Selu" -#define TM2_OPSTR_ELU "Elu" -#define TM2_OPSTR_BROADMUL "BroadMul" -#define TM2_OPSTR_LOGICAL "Logical" -#define TM2_OPSTR_GATHER "Gather" -#define TM2_OPSTR_TRANSPOSE "Transpose" -#define TM2_OPSTR_REVERSE "Reverse" -#define TM2_OPSTR_COMPARISON "Comparison" -#define TM2_OPSTR_SPACETODEPTH "SpaceToDepth" -#define TM2_OPSTR_DEPTHTOSPACE "DepthToSpace" -#define TM2_OPSTR_SQUAREDDIFFERENCE "SquaredDifference" -#define TM2_OPSTR_SPARSETODENSE "SparseToDense" -#define TM2_OPSTR_CEIL "Ceil" -#define TM2_OPSTR_ROUND "Round" -#define TM2_OPSTR_ZEROSLIKE "ZerosLike" -#define TM2_OPSTR_CLIP "Clip" -#define TM2_OPSTR_UNSQUEEZE "Unsqueeze" -#define TM2_OPSTR_REDUCEL2 "ReduceL2" -#define TM2_OPSTR_MEAN "Mean" -#define TM2_OPSTR_EXPAND "Expand" -#define TM2_OPSTR_MATMUL "MatMul" -#define TM2_OPSTR_SCATTER "Scatter" -#define TM2_OPSTR_SHAPE "Shape" -#define TM2_OPSTR_WHERE "Where" -#define TM2_OPSTR_TILE "Tile" +#define TM2_OPSTR_GEMM "Gemm" +#define TM2_OPSTR_GENERIC "Generic" +#define TM2_OPSTR_LOGISTIC "Logistic" +#define TM2_OPSTR_LSTM "LSTM" +#define TM2_OPSTR_RNN "RNN" +#define TM2_OPSTR_TANH "Tanh" +#define TM2_OPSTR_SIGMOID "Sigmoid" +#define TM2_OPSTR_SQUEEZE "Squeeze" +#define TM2_OPSTR_PAD "Pad" +#define TM2_OPSTR_STRIDEDSLICE "StridedSlice" +#define TM2_OPSTR_REDUCTION "Reduction" +#define TM2_OPSTR_ARGMAX "ArgMax" +#define TM2_OPSTR_ARGMIN "ArgMin" +#define TM2_OPSTR_TOPKV2 "TopKV2" +#define TM2_OPSTR_MAX "Maximum" +#define TM2_OPSTR_MIN "Minimum" +#define TM2_OPSTR_ADDN "Addn" +#define TM2_OPSTR_SWAPAXIS "SwapAxis" +#define TM2_OPSTR_GRU "GRU" +#define TM2_OPSTR_FUSEDBNSCALERELU "Fused.BNScaleReLu" +#define TM2_OPSTR_UPSAMPLE "Upsample" +#define TM2_OPSTR_SHUFFLECHANNEL "ShuffleChannel" +#define TM2_OPSTR_RESIZE "Resize" +#define TM2_OPSTR_SPACETOBATCHND "SpaceToBatchND" +#define TM2_OPSTR_BATCHTOSPACEND "BatchToSpaceND" +#define TM2_OPSTR_CROP "Crop" +#define TM2_OPSTR_PSROIPOOLING "Psroipooling" +#define TM2_OPSTR_ROIALIGN "Roialign" +#define TM2_OPSTR_EXPANDDIMS "Expanddims" +#define TM2_OPSTR_UNARY "Unary" +#define TM2_OPSTR_BIAS "Bias" +#define TM2_OPSTR_NOOP "Noop" +#define TM2_OPSTR_THRESHOLD "Threshold" +#define TM2_OPSTR_HARDSIGMOID "Hardsigmoid" +#define TM2_OPSTR_EMBED "Embedding" +#define TM2_OPSTR_INSTANCENORM "InstanceNorm" +#define TM2_OPSTR_MVN "MVN" +#define TM2_OPSTR_ABSVAL "Absval" +#define TM2_OPSTR_CAST "Cast" +#define TM2_OPSTR_HARDSWISH "HardSwish" +#define TM2_OPSTR_INTERP "Interp" +#define TM2_OPSTR_SELU "Selu" +#define TM2_OPSTR_ELU "Elu" +#define TM2_OPSTR_BROADMUL "BroadMul" +#define TM2_OPSTR_LOGICAL "Logical" +#define TM2_OPSTR_GATHER "Gather" +#define TM2_OPSTR_TRANSPOSE "Transpose" +#define TM2_OPSTR_REVERSE "Reverse" +#define TM2_OPSTR_COMPARISON "Comparison" +#define TM2_OPSTR_SPACETODEPTH "SpaceToDepth" +#define TM2_OPSTR_DEPTHTOSPACE "DepthToSpace" +#define TM2_OPSTR_SQUAREDDIFFERENCE "SquaredDifference" +#define TM2_OPSTR_SPARSETODENSE "SparseToDense" +#define TM2_OPSTR_CEIL "Ceil" +#define TM2_OPSTR_ROUND "Round" +#define TM2_OPSTR_ZEROSLIKE "ZerosLike" +#define TM2_OPSTR_CLIP "Clip" +#define TM2_OPSTR_UNSQUEEZE "Unsqueeze" +#define TM2_OPSTR_REDUCEL2 "ReduceL2" +#define TM2_OPSTR_MEAN "Mean" +#define TM2_OPSTR_EXPAND "Expand" +#define TM2_OPSTR_MATMUL "MatMul" +#define TM2_OPSTR_SCATTER "Scatter" +#define TM2_OPSTR_SHAPE "Shape" +#define TM2_OPSTR_WHERE "Where" +#define TM2_OPSTR_TILE "Tile" /* Operator types */ -#define TM2_OPTYPE_ACCURACY 0 /* No Param */ -#define TM2_OPTYPE_BATCHNORMALIZATION 1 /* TM2_BatchNormParam */ -#define TM2_OPTYPE_BILINEARRESIZE 2 /* TM2_ResizeParam */ -#define TM2_OPTYPE_CONCAT 3 /* TM2_ConcatParam */ -#define TM2_OPTYPE_CONST 4 /* No Param */ -#define TM2_OPTYPE_CONVOLUTION 5 /* TM2_ConvParam */ -#define TM2_OPTYPE_DECONVOLUTION 6 /* TM2_DeconvParam */ -#define TM2_OPTYPE_DETECTIONOUTPUT 7 /* TM2_DetectionOutputParam */ -#define TM2_OPTYPE_DROPOUT 8 /* No Param */ -#define TM2_OPTYPE_ELTWISE 9 /* TM2_EltwiseParam */ -#define TM2_OPTYPE_FLATTEN 10 /* TM2_FlattenParam */ -#define TM2_OPTYPE_FULLYCONNECTED 11 /* TM2_FCParam */ -#define TM2_OPTYPE_INPUTOP 12 /* No Param */ -#define TM2_OPTYPE_LRN 13 /* TM2_LRNParam */ -#define TM2_OPTYPE_NORMALIZE 14 /* TM2_NormalizeParam */ -#define TM2_OPTYPE_PERMUTE 15 /* TM2_PermuteParam */ -#define TM2_OPTYPE_POOLING 16 /* TM2_PoolParam */ -#define TM2_OPTYPE_PRELU 17 /* No Param */ -#define TM2_OPTYPE_PRIORBOX 18 /* TM2_PriorBoxParam */ -#define TM2_OPTYPE_REGION 19 /* TM2_RegionParam */ -#define TM2_OPTYPE_RELU 20 /* TM2_ReLuParam */ -#define TM2_OPTYPE_RELU6 21 /* No Param */ -#define TM2_OPTYPE_REORG 22 /* TM2_ReorgParam */ -#define TM2_OPTYPE_RESHAPE 23 /* TM2_ReshapeParam */ -#define TM2_OPTYPE_ROIPOOLING 24 /* TM2_ROIPoolingParam */ -#define TM2_OPTYPE_RPN 25 /* TM2_RPNParam */ -#define TM2_OPTYPE_SCALE 26 /* TM2_ScaleParam */ -#define TM2_OPTYPE_SLICE 27 /* TM2_SliceParam */ -#define TM2_OPTYPE_SOFTMAX 28 /* TM2_SoftmaxParam */ -#define TM2_OPTYPE_SPLIT 29 /* No Param */ +#define TM2_OPTYPE_ACCURACY 0 /* No Param */ +#define TM2_OPTYPE_BATCHNORMALIZATION 1 /* TM2_BatchNormParam */ +#define TM2_OPTYPE_BILINEARRESIZE 2 /* TM2_ResizeParam */ +#define TM2_OPTYPE_CONCAT 3 /* TM2_ConcatParam */ +#define TM2_OPTYPE_CONST 4 /* No Param */ +#define TM2_OPTYPE_CONVOLUTION 5 /* TM2_ConvParam */ +#define TM2_OPTYPE_DECONVOLUTION 6 /* TM2_DeconvParam */ +#define TM2_OPTYPE_DETECTIONOUTPUT 7 /* TM2_DetectionOutputParam */ +#define TM2_OPTYPE_DROPOUT 8 /* No Param */ +#define TM2_OPTYPE_ELTWISE 9 /* TM2_EltwiseParam */ +#define TM2_OPTYPE_FLATTEN 10 /* TM2_FlattenParam */ +#define TM2_OPTYPE_FULLYCONNECTED 11 /* TM2_FCParam */ +#define TM2_OPTYPE_INPUTOP 12 /* No Param */ +#define TM2_OPTYPE_LRN 13 /* TM2_LRNParam */ +#define TM2_OPTYPE_NORMALIZE 14 /* TM2_NormalizeParam */ +#define TM2_OPTYPE_PERMUTE 15 /* TM2_PermuteParam */ +#define TM2_OPTYPE_POOLING 16 /* TM2_PoolParam */ +#define TM2_OPTYPE_PRELU 17 /* No Param */ +#define TM2_OPTYPE_PRIORBOX 18 /* TM2_PriorBoxParam */ +#define TM2_OPTYPE_REGION 19 /* TM2_RegionParam */ +#define TM2_OPTYPE_RELU 20 /* TM2_ReLuParam */ +#define TM2_OPTYPE_RELU6 21 /* No Param */ +#define TM2_OPTYPE_REORG 22 /* TM2_ReorgParam */ +#define TM2_OPTYPE_RESHAPE 23 /* TM2_ReshapeParam */ +#define TM2_OPTYPE_ROIPOOLING 24 /* TM2_ROIPoolingParam */ +#define TM2_OPTYPE_RPN 25 /* TM2_RPNParam */ +#define TM2_OPTYPE_SCALE 26 /* TM2_ScaleParam */ +#define TM2_OPTYPE_SLICE 27 /* TM2_SliceParam */ +#define TM2_OPTYPE_SOFTMAX 28 /* TM2_SoftmaxParam */ +#define TM2_OPTYPE_SPLIT 29 /* No Param */ #define TM2_OPTYPE_DETECTIONPOSTPROCESS 30 /* TM2_DetectionPostProcessParam */ -#define TM2_OPTYPE_GEMM 31 /* TM2_GemmParam */ -#define TM2_OPTYPE_GENERIC 32 /* TM2_GenericParam */ -#define TM2_OPTYPE_LOGISTIC 33 /* No Param */ -#define TM2_OPTYPE_LSTM 34 /* TM2_LstmParam */ -#define TM2_OPTYPE_RNN 35 /* TM2_RnnParam */ -#define TM2_OPTYPE_TANH 36 /* No Param */ -#define TM2_OPTYPE_SIGMOID 37 /* No Param */ -#define TM2_OPTYPE_SQUEEZE 38 /* TM2_SqueezeParam */ -#define TM2_OPTYPE_FUSEDBNSCALERELU 39 /* No Param */ -#define TM2_OPTYPE_PAD 40 /* TM2_PadParam */ -#define TM2_OPTYPE_STRIDEDSLICE 41 /* TM2_StrideSliceParam */ -#define TM2_OPTYPE_ARGMAX 42 /* TM2_ArgmaxParam */ -#define TM2_OPTYPE_ARGMIN 43 /* TM2_ArgminParam */ -#define TM2_OPTYPE_TOPKV2 44 /* TM2_TopkV2Param */ -#define TM2_OPTYPE_REDUCTION 45 /* TM2_ReductionParam */ -#define TM2_OPTYPE_MAX 46 /* No Param */ -#define TM2_OPTYPE_MIN 47 /* No Param */ -#define TM2_OPTYPE_GRU 48 /* TM2_GruParam */ -#define TM2_OPTYPE_ADDN 49 /* TM2_AddNParam */ -#define TM2_OPTYPE_SWAPAXIS 50 /* TM2_SwapAixsParam */ -#define TM2_OPTYPE_UPSAMPLE 51 /* TM2_UpsampleParam */ -#define TM2_OPTYPE_SPACETOBATCHND 52 -#define TM2_OPTYPE_BATCHTOSPACEND 53 -#define TM2_OPTYPE_RESIZE 54 -#define TM2_OPTYPE_SHUFFLECHANNEL 55 /* TM2_ShuffleChannelParam */ -#define TM2_OPTYPE_CROP 56 /* TM2_CropParam */ -#define TM2_OPTYPE_ROIALIGN 57 -#define TM2_OPTYPE_PSROIPOOLING 58 -#define TM2_OPTYPE_UNARY 59 -#define TM2_OPTYPE_EXPANDDIMS 60 -#define TM2_OPTYPE_BIAS 61 -#define TM2_OPTYPE_NOOP 62 -#define TM2_OPTYPE_THRESHOLD 63 -#define TM2_OPTYPE_HARDSIGMOID 64 -#define TM2_OPTYPE_EMBED 65 -#define TM2_OPTYPE_INSTANCENORM 66 -#define TM2_OPTYPE_MVN 67 -#define TM2_OPTYPE_ABSVAL 68 -#define TM2_OPTYPE_CAST 69 -#define TM2_OPTYPE_HARDSWISH 70 -#define TM2_OPTYPE_INTERP 71 -#define TM2_OPTYPE_SELU 72 -#define TM2_OPTYPE_ELU 73 -#define TM2_OPTYPE_BROADMUL 74 -#define TM2_OPTYPE_LOGICAL 75 -#define TM2_OPTYPE_GATHER 76 -#define TM2_OPTYPE_TRANSPOSE 77 -#define TM2_OPTYPE_COMPARISON 78 -#define TM2_OPTYPE_SPACETODEPTH 79 -#define TM2_OPTYPE_DEPTHTOSPACE 80 -#define TM2_OPTYPE_REVERSE 81 -#define TM2_OPTYPE_SPARSETODENSE 82 -#define TM2_OPTYPE_CEIL 83 -#define TM2_OPTYPE_SQUAREDDIFFERENCE 84 -#define TM2_OPTYPE_ROUND 85 -#define TM2_OPTYPE_ZEROSLIKE 86 -#define TM2_OPTYPE_CLIP 87 -#define TM2_OPTYPE_UNSQUEEZE 88 -#define TM2_OPTYPE_REDUCEL2 89 -#define TM2_OPTYPE_MEAN 90 -#define TM2_OPTYPE_EXPAND 91 -#define TM2_OPTYPE_MATMUL 92 -#define TM2_OPTYPE_SCATTER 93 -#define TM2_OPTYPE_SHAPE 94 -#define TM2_OPTYPE_WHERE 95 -#define TM2_OPTYPE_TILE 96 -#define TM2_OPTYPE_MISH 97 /* No param*/ -#define TM2_OPTYPE_NUM 98 +#define TM2_OPTYPE_GEMM 31 /* TM2_GemmParam */ +#define TM2_OPTYPE_GENERIC 32 /* TM2_GenericParam */ +#define TM2_OPTYPE_LOGISTIC 33 /* No Param */ +#define TM2_OPTYPE_LSTM 34 /* TM2_LstmParam */ +#define TM2_OPTYPE_RNN 35 /* TM2_RnnParam */ +#define TM2_OPTYPE_TANH 36 /* No Param */ +#define TM2_OPTYPE_SIGMOID 37 /* No Param */ +#define TM2_OPTYPE_SQUEEZE 38 /* TM2_SqueezeParam */ +#define TM2_OPTYPE_FUSEDBNSCALERELU 39 /* No Param */ +#define TM2_OPTYPE_PAD 40 /* TM2_PadParam */ +#define TM2_OPTYPE_STRIDEDSLICE 41 /* TM2_StrideSliceParam */ +#define TM2_OPTYPE_ARGMAX 42 /* TM2_ArgmaxParam */ +#define TM2_OPTYPE_ARGMIN 43 /* TM2_ArgminParam */ +#define TM2_OPTYPE_TOPKV2 44 /* TM2_TopkV2Param */ +#define TM2_OPTYPE_REDUCTION 45 /* TM2_ReductionParam */ +#define TM2_OPTYPE_MAX 46 /* No Param */ +#define TM2_OPTYPE_MIN 47 /* No Param */ +#define TM2_OPTYPE_GRU 48 /* TM2_GruParam */ +#define TM2_OPTYPE_ADDN 49 /* TM2_AddNParam */ +#define TM2_OPTYPE_SWAPAXIS 50 /* TM2_SwapAixsParam */ +#define TM2_OPTYPE_UPSAMPLE 51 /* TM2_UpsampleParam */ +#define TM2_OPTYPE_SPACETOBATCHND 52 +#define TM2_OPTYPE_BATCHTOSPACEND 53 +#define TM2_OPTYPE_RESIZE 54 +#define TM2_OPTYPE_SHUFFLECHANNEL 55 /* TM2_ShuffleChannelParam */ +#define TM2_OPTYPE_CROP 56 /* TM2_CropParam */ +#define TM2_OPTYPE_ROIALIGN 57 +#define TM2_OPTYPE_PSROIPOOLING 58 +#define TM2_OPTYPE_UNARY 59 +#define TM2_OPTYPE_EXPANDDIMS 60 +#define TM2_OPTYPE_BIAS 61 +#define TM2_OPTYPE_NOOP 62 +#define TM2_OPTYPE_THRESHOLD 63 +#define TM2_OPTYPE_HARDSIGMOID 64 +#define TM2_OPTYPE_EMBED 65 +#define TM2_OPTYPE_INSTANCENORM 66 +#define TM2_OPTYPE_MVN 67 +#define TM2_OPTYPE_ABSVAL 68 +#define TM2_OPTYPE_CAST 69 +#define TM2_OPTYPE_HARDSWISH 70 +#define TM2_OPTYPE_INTERP 71 +#define TM2_OPTYPE_SELU 72 +#define TM2_OPTYPE_ELU 73 +#define TM2_OPTYPE_BROADMUL 74 +#define TM2_OPTYPE_LOGICAL 75 +#define TM2_OPTYPE_GATHER 76 +#define TM2_OPTYPE_TRANSPOSE 77 +#define TM2_OPTYPE_COMPARISON 78 +#define TM2_OPTYPE_SPACETODEPTH 79 +#define TM2_OPTYPE_DEPTHTOSPACE 80 +#define TM2_OPTYPE_REVERSE 81 +#define TM2_OPTYPE_SPARSETODENSE 82 +#define TM2_OPTYPE_CEIL 83 +#define TM2_OPTYPE_SQUAREDDIFFERENCE 84 +#define TM2_OPTYPE_ROUND 85 +#define TM2_OPTYPE_ZEROSLIKE 86 +#define TM2_OPTYPE_CLIP 87 +#define TM2_OPTYPE_UNSQUEEZE 88 +#define TM2_OPTYPE_REDUCEL2 89 +#define TM2_OPTYPE_MEAN 90 +#define TM2_OPTYPE_EXPAND 91 +#define TM2_OPTYPE_MATMUL 92 +#define TM2_OPTYPE_SCATTER 93 +#define TM2_OPTYPE_SHAPE 94 +#define TM2_OPTYPE_WHERE 95 +#define TM2_OPTYPE_TILE 96 +#define TM2_OPTYPE_MISH 97 /* No param*/ +#define TM2_OPTYPE_NUM 98 /* --------------------- -------- TM objects -------------------------------- */ typedef struct { - uint16_t ver_main; /* main version of Tengine model file format */ - uint16_t ver_sub; /* sub version of Tengine model file format */ - uint16_t ver_compile; /* compile version of Tengine model file format */ + uint16_t ver_main; /* main version of Tengine model file format */ + uint16_t ver_sub; /* sub version of Tengine model file format */ + uint16_t ver_compile; /* compile version of Tengine model file format */ tm_uoffset_t offset_root; /* offset of root table (TM2_Model) */ } TM2_Header; /* Root table of Tengine model */ typedef struct { - int32_t orig_format; /* format of original model */ - int32_t sub_format; /* sub format for DLA model */ + int32_t orig_format; /* format of original model */ + int32_t sub_format; /* sub format for DLA model */ tm_uoffset_t offset_vo_subgraphs; /* offset of TM2_Vector_offsets */ - tm_uoffset_t offset_s_mname; /* offset of string */ + tm_uoffset_t offset_s_mname; /* offset of string */ } TM2_Model; /* Only 1 subgraph is supported currently */ typedef struct { - uint32_t subgraph_id; /* subgraph id */ - int32_t graph_layout; /* actual data layout */ - int32_t model_layout; /* data layout of original model */ - tm_uoffset_t offset_vi_input_indices; /* offset of TM2_Vector_indices */ + uint32_t subgraph_id; /* subgraph id */ + int32_t graph_layout; /* actual data layout */ + int32_t model_layout; /* data layout of original model */ + tm_uoffset_t offset_vi_input_indices; /* offset of TM2_Vector_indices */ tm_uoffset_t offset_vi_output_indices; /* offset of TM2_Vector_indices */ - tm_uoffset_t offset_vo_seq_nodes; /* offset of TM2_Vector_offsets */ - tm_uoffset_t offset_vo_tensors; /* offset of TM2_Vector_offsets */ - tm_uoffset_t offset_vo_buffers; /* offset of TM2_Vector_offsets */ - tm_uoffset_t offset_s_sname; /* offset of string */ - tm_uoffset_t offset_vo_sub_info; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_vo_seq_nodes; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_vo_tensors; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_vo_buffers; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_s_sname; /* offset of string */ + tm_uoffset_t offset_vo_sub_info; /* offset of TM2_Vector_offsets */ } TM2_Subgraph; typedef struct { - uint32_t subgraph_id; /* sub graph idx */ - uint32_t input_wait_count; /* input wait count */ - int32_t data_type; /* FP32 FP16 U8 INT8 */ + uint32_t subgraph_id; /* sub graph idx */ + uint32_t input_wait_count; /* input wait count */ + int32_t data_type; /* FP32 FP16 U8 INT8 */ tm_uoffset_t offset_vi_node_list; /* offset of TM2_Vector_indices */ tm_uoffset_t offset_vi_input_tensor; /* offset of TM2_Vector_indices */ tm_uoffset_t offset_vi_output_tensor; /* offset of TM2_Vector_indices */ @@ -292,25 +292,25 @@ typedef struct typedef struct { tm_uoffset_t offset_s_attrname; /* offset of string */ - tm_uoffset_t offset_s_attrval; /* offset of string */ + tm_uoffset_t offset_s_attrval; /* offset of string */ int32_t attr_type; } TM2_Attr; typedef struct { - uint32_t node_id; /* node id */ - tm_uoffset_t offset_vi_input_tensors; /* offset of TM2_Vector_indices */ + uint32_t node_id; /* node id */ + tm_uoffset_t offset_vi_input_tensors; /* offset of TM2_Vector_indices */ tm_uoffset_t offset_vi_output_tensors; /* offset of TM2_Vector_indices */ - tm_uoffset_t offset_t_operator; /* offset of table */ - tm_uoffset_t offset_s_nname; /* offset of string */ - tm_uoffset_t offset_vo_attrs; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_t_operator; /* offset of table */ + tm_uoffset_t offset_s_nname; /* offset of string */ + tm_uoffset_t offset_vo_attrs; /* offset of TM2_Vector_offsets */ tm_bool_t dynamic_shape; } TM2_Node; typedef struct { - uint32_t op_ver; /* version of operator */ - uint32_t operator_type; /* operator type */ + uint32_t op_ver; /* version of operator */ + uint32_t operator_type; /* operator type */ tm_uoffset_t offset_t_param; /* offset of table */ } TM2_Operator; @@ -325,8 +325,8 @@ typedef struct { uint32_t tensor_id; uint32_t buffer_id; - tm_uoffset_t offset_vd_dims; /* offset of TM2_Vector_dims */ - tm_uoffset_t offset_s_tname; /* offset of string */ + tm_uoffset_t offset_vd_dims; /* offset of TM2_Vector_dims */ + tm_uoffset_t offset_s_tname; /* offset of string */ tm_uoffset_t offect_vo_quantparams; /* offset of TM2_Vector_offsets */ int32_t layout; int32_t type; @@ -335,13 +335,13 @@ typedef struct typedef struct { - tm_size_t size; /* buffer size */ + tm_size_t size; /* buffer size */ tm_uoffset_t offset_data; /* offset of buffer data */ } TM2_Buffer; typedef struct { - tm_size_t size; /* string size */ + tm_size_t size; /* string size */ tm_uoffset_t offset_data; /* offset of string data */ } TM2_String; @@ -373,7 +373,7 @@ typedef struct typedef struct { - tm_size_t v_num; /* number of vector elements */ + tm_size_t v_num; /* number of vector elements */ float data[0][4]; /* x0, y0, x1, y1 */ } TM2_Vector_anchors; @@ -504,9 +504,9 @@ typedef struct typedef struct { - tm_uoffset_t offset_vf_min_size; /* offset of TM2_Vector_floats */ - tm_uoffset_t offset_vf_max_size; /* offset of TM2_Vector_floats */ - tm_uoffset_t offset_vf_variance; /* offset of TM2_Vector_floats */ + tm_uoffset_t offset_vf_min_size; /* offset of TM2_Vector_floats */ + tm_uoffset_t offset_vf_max_size; /* offset of TM2_Vector_floats */ + tm_uoffset_t offset_vf_variance; /* offset of TM2_Vector_floats */ tm_uoffset_t offset_vf_aspect_ratio; /* offset of TM2_Vector_floats */ int32_t flip; int32_t clip; @@ -564,7 +564,7 @@ typedef struct typedef struct { - tm_uoffset_t offset_vf_ratios; /* pointer to TM2_Vector_floats */ + tm_uoffset_t offset_vf_ratios; /* pointer to TM2_Vector_floats */ tm_uoffset_t offset_vf_anchor_scales; /* pointer to TM2_Vector_floats */ int32_t feat_stride; int32_t basesize; @@ -586,8 +586,8 @@ typedef struct { int32_t axis; tm_uoffset_t offset_vi_slice_points; /* offset of TM2_Vector_dims */ - tm_uoffset_t offset_vi_begins; /* offset of TM2_Vector_dims */ - tm_uoffset_t offset_vi_sizes; /* offset of TM2_Vector_dims */ + tm_uoffset_t offset_vi_begins; /* offset of TM2_Vector_dims */ + tm_uoffset_t offset_vi_sizes; /* offset of TM2_Vector_dims */ int32_t iscaffe; int32_t ismxnet; int32_t isonnx; @@ -766,42 +766,40 @@ typedef struct typedef struct { - int32_t dilation_x; - int32_t dilation_y; - int32_t pad_top; - int32_t pad_bottom; - int32_t pad_left; - int32_t pad_right; + int32_t dilation_x; + int32_t dilation_y; + int32_t pad_top; + int32_t pad_bottom; + int32_t pad_left; + int32_t pad_right; } TM2_SpaceToBatchNDParam; - typedef struct { - int32_t dilation_x; - int32_t dilation_y; - int32_t crop_top; - int32_t crop_bottom; - int32_t crop_left; - int32_t crop_right; + int32_t dilation_x; + int32_t dilation_y; + int32_t crop_top; + int32_t crop_bottom; + int32_t crop_left; + int32_t crop_right; } TM2_BatchToSpaceNDParam; typedef struct { - int32_t num_args; - int32_t offset_c; - int32_t offset_h; - int32_t offset_w; - int32_t crop_h; - int32_t crop_w; - bool center_crop; - int32_t axis; - int32_t flag; + int32_t num_args; + int32_t offset_c; + int32_t offset_h; + int32_t offset_w; + int32_t crop_h; + int32_t crop_w; + bool center_crop; + int32_t axis; + int32_t flag; } TM2_CropParam; - -typedef struct +typedef struct { int32_t pooled_width; int32_t pooled_height; @@ -826,34 +824,33 @@ typedef struct int32_t type; } TM2_UnaryParam; - typedef struct { - int32_t bias_size; + int32_t bias_size; } TM2_BiasParam; typedef struct { - float threshold; + float threshold; } TM2_ThresholdParam; typedef struct { - float alpha; - float beta; + float alpha; + float beta; } TM2_HardsigmoidParam; typedef struct { - int32_t num_output; - int32_t input_dim; - int32_t bias_term; - int32_t weight_data_size; + int32_t num_output; + int32_t input_dim; + int32_t bias_term; + int32_t weight_data_size; } TM2_EmbedParam; typedef struct { - float eps; + float eps; } TM2_InstanceNormParam; typedef struct @@ -863,33 +860,37 @@ typedef struct float eps; } TM2_MVNParam; - -typedef struct{ +typedef struct +{ int32_t type_from; int32_t type_to; -}TM2_CastParam; +} TM2_CastParam; -typedef struct{ +typedef struct +{ float alpha; float beta; -}TM2_HardSwishParam; +} TM2_HardSwishParam; -typedef struct{ - int32_t resize_type;//1=nearest 2=bilinear 3=bicubic +typedef struct +{ + int32_t resize_type; //1=nearest 2=bilinear 3=bicubic float width_scale; float height_scale; int32_t output_width; int32_t output_height; -}TM2_InterpParam; +} TM2_InterpParam; -typedef struct{ +typedef struct +{ float alpha; float gamma; -}TM2_SeluParam; +} TM2_SeluParam; -typedef struct{ +typedef struct +{ float alpha; -}TM2_EluParam; +} TM2_EluParam; typedef struct { @@ -902,21 +903,22 @@ typedef struct int32_t indices_num; tm_bool_t is_onnx; } TM2_GatherParam; -typedef struct{ +typedef struct +{ tm_uoffset_t offset_tr_shape; -}TM2_TransposeParam; +} TM2_TransposeParam; typedef struct { int32_t type; } TM2_ComparisonParam; typedef struct { - int block_size; + int block_size; } TM2_SpaceToDepthParam; typedef struct { - int block_size; + int block_size; } TM2_DepthToSpaceParam; typedef struct @@ -934,31 +936,31 @@ typedef struct typedef struct { - tm_uoffset_t offset_vi_axises; -}TM2_UnsqueezeParam; + tm_uoffset_t offset_vi_axises; +} TM2_UnsqueezeParam; typedef struct { int axis; int keepdim; -}TM2_ReduceL2Param; +} TM2_ReduceL2Param; typedef struct { - tm_uoffset_t offset_v_shape; -}TM2_ExpandParam; + tm_uoffset_t offset_v_shape; +} TM2_ExpandParam; typedef struct { - int axis; - tm_bool_t is_onnx; -}TM2_ScatterParam; + int axis; + tm_bool_t is_onnx; +} TM2_ScatterParam; typedef struct { - tm_uoffset_t offset_vi_flag; // caffe: 0, onnx: 1 - tm_uoffset_t offset_vi_reps; -}TM2_TileParam; + tm_uoffset_t offset_vi_flag; // caffe: 0, onnx: 1 + tm_uoffset_t offset_vi_reps; +} TM2_TileParam; #ifdef __cplusplus } diff --git a/tools/quantize/savegraph/tm2_generate.c b/tools/quantize/savegraph/tm2_generate.c index 71db31f8b..4ba97d177 100644 --- a/tools/quantize/savegraph/tm2_generate.c +++ b/tools/quantize/savegraph/tm2_generate.c @@ -28,7 +28,7 @@ extern "C" { #endif -#define ALIGN(pos, alignbytes) (((pos) + ( alignbytes )-1) & ~(( alignbytes )-1)) +#define ALIGN(pos, alignbytes) (((pos) + (alignbytes)-1) & ~((alignbytes)-1)) uint32_t WriteTmFileAlign1(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size) { diff --git a/tools/quantize/savegraph/tm2_op_save.cpp b/tools/quantize/savegraph/tm2_op_save.cpp index 29e052471..a964f84b4 100644 --- a/tools/quantize/savegraph/tm2_op_save.cpp +++ b/tools/quantize/savegraph/tm2_op_save.cpp @@ -31,7 +31,6 @@ #include "graph/tensor.h" #include "utility/log.h" - inline void SetTmOperator(TM2_Operator* tm_op, const uint32_t op_type, const tm_uoffset_t offset) { tm_op->op_ver = TM2_OP_VER; @@ -275,9 +274,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, stru TM2_PriorBoxParam tm_param; size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->min_size_num; - TM2_Vector_floats* v_minsizes = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_minsizes = (TM2_Vector_floats*)malloc(vector_size); v_minsizes->v_num = p->min_size_num; - for(unsigned int i = 0; i < p->min_size_num; i++) + for (unsigned int i = 0; i < p->min_size_num; i++) { v_minsizes->data[i] = p->min_size[i]; } @@ -285,9 +284,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, stru free(v_minsizes); vector_size = sizeof(tm_size_t) + sizeof(float) * p->max_size_num; - TM2_Vector_floats* v_maxsizes = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_maxsizes = (TM2_Vector_floats*)malloc(vector_size); v_maxsizes->v_num = p->max_size_num; - for(unsigned int i = 0; i < p->max_size_num; i++) + for (unsigned int i = 0; i < p->max_size_num; i++) { v_maxsizes->data[i] = p->max_size[i]; } @@ -296,9 +295,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, stru int variance_num = 4; // tengine lite does not set the variable. vector_size = sizeof(tm_size_t) + sizeof(float) * variance_num; - TM2_Vector_floats* v_variance = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_variance = (TM2_Vector_floats*)malloc(vector_size); v_variance->v_num = variance_num; - for(unsigned int i = 0; i < variance_num; i++) + for (unsigned int i = 0; i < variance_num; i++) { v_variance->data[i] = p->variance[i]; } @@ -306,9 +305,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, stru free(v_variance); vector_size = sizeof(tm_size_t) + sizeof(float) * p->aspect_ratio_size; - TM2_Vector_floats* v_ratios = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_ratios = (TM2_Vector_floats*)malloc(vector_size); v_ratios->v_num = p->aspect_ratio_size; - for(unsigned int i = 0; i < p->aspect_ratio_size; i++) + for (unsigned int i = 0; i < p->aspect_ratio_size; i++) { v_ratios->data[i] = p->aspect_ratio[i]; } @@ -343,9 +342,9 @@ tm_uoffset_t SaveTmRegionOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct tm_param.nms_threshold = p->nms_threshold; size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->biases_num; - TM2_Vector_floats* v_biases = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_biases = (TM2_Vector_floats*)malloc(vector_size); v_biases->v_num = p->biases_num; - for(unsigned int i = 0; i < p->biases_num; i++) + for (unsigned int i = 0; i < p->biases_num; i++) { v_biases->data[i] = p->biases[i]; } @@ -390,36 +389,35 @@ tm_uoffset_t SaveTmReshapeOp(void* const start_ptr, tm_uoffset_t* cur_pos, struc { struct reshape_param* p = (struct reshape_param*)node->op.param_mem; TM2_ReshapeParam tm_param; - if(p->reverse) + if (p->reverse) tm_param.reverse = 1; else tm_param.reverse = 0; - if(p->is_mxnet) + if (p->is_mxnet) tm_param.is_mxnet = 1; else tm_param.is_mxnet = 0; - if(p->dim_size) + if (p->dim_size) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->dim_size; - TM2_Vector_dims* v_re_shape = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_re_shape = (TM2_Vector_dims*)malloc(vector_size); v_re_shape->v_num = p->dim_size; - for(unsigned int i = 0; i < p->dim_size; i++) + for (unsigned int i = 0; i < p->dim_size; i++) { v_re_shape->dims[i] = p->re_shape[i]; } tm_param.offset_re_shape = WriteTmObject(start_ptr, cur_pos, v_re_shape, vector_size); free(v_re_shape); } - else{ + else + { tm_param.offset_re_shape = TM2_NOT_SET; } - TM2_Operator tm_op; SetTmOperator(&tm_op, TM2_OPTYPE_RESHAPE, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ReshapeParam))); return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); - } tm_uoffset_t SaveTmResizeOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct node* node) @@ -456,9 +454,9 @@ tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct no TM2_RPNParam tm_param; size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->ratios->elem_num; - TM2_Vector_floats* v_ratios = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_ratios = (TM2_Vector_floats*)malloc(vector_size); v_ratios->v_num = p->ratios->elem_num; - for(unsigned int i = 0; i < p->ratios->elem_num; i++) + for (unsigned int i = 0; i < p->ratios->elem_num; i++) { v_ratios->data[i] = *(float*)get_vector_data(p->ratios, i); } @@ -466,9 +464,9 @@ tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct no free(v_ratios); vector_size = sizeof(tm_size_t) + sizeof(float) * p->anchor_scales->elem_num; - TM2_Vector_floats* v_scales = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_scales = (TM2_Vector_floats*)malloc(vector_size); v_scales->v_num = p->anchor_scales->elem_num; - for(unsigned int i = 0; i < p->anchor_scales->elem_num; i++) + for (unsigned int i = 0; i < p->anchor_scales->elem_num; i++) { v_scales->data[i] = *(float*)get_vector_data(p->anchor_scales, i); } @@ -476,9 +474,9 @@ tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct no free(v_scales); vector_size = sizeof(tm_size_t) + sizeof(float) * p->anchors_->elem_num * 4; - TM2_Vector_anchors* v_anchors = ( TM2_Vector_anchors* )malloc(vector_size); + TM2_Vector_anchors* v_anchors = (TM2_Vector_anchors*)malloc(vector_size); v_anchors->v_num = p->anchors_->elem_num; - for(unsigned int i = 0; i < p->anchors_->elem_num; i++) + for (unsigned int i = 0; i < p->anchors_->elem_num; i++) { v_anchors->data[i][0] = ((Anchor_t*)get_vector_data(p->anchors_, i))->x0; v_anchors->data[i][1] = ((Anchor_t*)get_vector_data(p->anchors_, i))->y0; @@ -522,16 +520,17 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct tm_param.iscaffe = p->iscaffe; tm_param.isonnx = p->isonnx; tm_param.ismxnet = p->ismxnet; - if(!tm_param.iscaffe){ + if (!tm_param.iscaffe) + { tm_param.begin = p->begin; tm_param.end = p->end; } - if(p->slice_point_->elem_num) + if (p->slice_point_->elem_num) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->slice_point_->elem_num; - TM2_Vector_dims* v_slice_points = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_slice_points = (TM2_Vector_dims*)malloc(vector_size); v_slice_points->v_num = p->slice_point_->elem_num; - for(unsigned int i = 0; i < p->slice_point_->elem_num; i++) + for (unsigned int i = 0; i < p->slice_point_->elem_num; i++) { v_slice_points->dims[i] = *(int32_t*)get_vector_data(p->slice_point_, i); } @@ -541,12 +540,12 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct else tm_param.offset_vi_slice_points = TM2_NOT_SET; - if(p->begin_->elem_num) + if (p->begin_->elem_num) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->begin_->elem_num; - TM2_Vector_dims* v_begins = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_begins = (TM2_Vector_dims*)malloc(vector_size); v_begins->v_num = p->begin_->elem_num; - for(unsigned int i = 0; i < p->begin_->elem_num; i++) + for (unsigned int i = 0; i < p->begin_->elem_num; i++) { v_begins->dims[i] = *(int32_t*)get_vector_data(p->begin_, i); } @@ -556,12 +555,12 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct else tm_param.offset_vi_begins = TM2_NOT_SET; - if(p->size_->elem_num) + if (p->size_->elem_num) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->size_->elem_num; - TM2_Vector_dims* v_sizes = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_sizes = (TM2_Vector_dims*)malloc(vector_size); v_sizes->v_num = p->size_->elem_num; - for(unsigned int i = 0; i < p->size_->elem_num; i++) + for (unsigned int i = 0; i < p->size_->elem_num; i++) { v_sizes->dims[i] = *(int32_t*)get_vector_data(p->size_, i); } @@ -571,7 +570,6 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct else tm_param.offset_vi_sizes = TM2_NOT_SET; - TM2_Operator tm_op; SetTmOperator(&tm_op, TM2_OPTYPE_SLICE, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SliceParam))); return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); @@ -592,24 +590,27 @@ tm_uoffset_t SaveTmSplitOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct { struct split_param* p = (struct split_param*)node->op.param_mem; TM2_SplitParam tm_param; - if(p->is_caffe) + if (p->is_caffe) tm_param.is_caffe = 1; else tm_param.is_caffe = 0; - if(p->is_onnx){ + if (p->is_onnx) + { tm_param.is_onnx = 1; - } else { + } + else + { tm_param.is_onnx = 0; } - if(!p->is_caffe) + if (!p->is_caffe) { - if(p->is_onnx) + if (p->is_onnx) tm_param.axis = p->axis; size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->split_sizes_->elem_num; - TM2_Vector_dims* v_split_sizes = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_split_sizes = (TM2_Vector_dims*)malloc(vector_size); v_split_sizes->v_num = p->split_sizes_->elem_num; - for(unsigned int i = 0; i < p->split_sizes_->elem_num; i++) + for (unsigned int i = 0; i < p->split_sizes_->elem_num; i++) { v_split_sizes->dims[i] = *(int32_t*)get_vector_data(p->split_sizes_, i); } @@ -636,9 +637,9 @@ tm_uoffset_t SaveTmDetectionPostProcessOp(void* const start_ptr, tm_uoffset_t* c int param_scales_num = 4; size_t vector_size = sizeof(tm_size_t) + sizeof(float) * param_scales_num; - TM2_Vector_floats* v_scales = ( TM2_Vector_floats* )malloc(vector_size); + TM2_Vector_floats* v_scales = (TM2_Vector_floats*)malloc(vector_size); v_scales->v_num = param_scales_num; - for(unsigned int i = 0; i < param_scales_num; i++) + for (unsigned int i = 0; i < param_scales_num; i++) { v_scales->data[i] = p->scales[i]; } @@ -780,7 +781,7 @@ tm_uoffset_t SaveTmTopKV2Op(void* const start_ptr, tm_uoffset_t* cur_pos, struct TM2_TopKV2Param tm_param; tm_param.k = p->k; - if(p->sorted) + if (p->sorted) tm_param.sorted = 1; else tm_param.sorted = 0; @@ -992,7 +993,7 @@ tm_uoffset_t SaveTmExpanddimsOp(void* const start_ptr, tm_uoffset_t* cur_pos, st struct expanddims_param* p = (struct expanddims_param*)node->op.param_mem; TM2_ExpanddimsParam tm_param; - tm_param.axis= p->axis; + tm_param.axis = p->axis; TM2_Operator tm_op; SetTmOperator(&tm_op, TM2_OPTYPE_EXPANDDIMS, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ExpanddimsParam))); @@ -1116,7 +1117,7 @@ tm_uoffset_t SaveTmSeluOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct n struct selu_param* p = (struct selu_param*)node->op.param_mem; TM2_SeluParam tm_param; tm_param.alpha = p->alpha; - tm_param.gamma = p->lambda;//gamma + tm_param.gamma = p->lambda; //gamma TM2_Operator tm_op; SetTmOperator(&tm_op, TM2_OPTYPE_SELU, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SeluParam))); @@ -1169,19 +1170,20 @@ tm_uoffset_t SaveTmTransposeOp(void* const start_ptr, tm_uoffset_t* cur_pos, str { struct transpose_param* p = (struct transpose_param*)node->op.param_mem; TM2_TransposeParam tm_param; - if(p->tr_shape_size) + if (p->tr_shape_size) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->tr_shape_size; - TM2_Vector_dims* v_re_shape = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_re_shape = (TM2_Vector_dims*)malloc(vector_size); v_re_shape->v_num = p->tr_shape_size; - for(unsigned int i = 0; i < p->tr_shape_size; i++) + for (unsigned int i = 0; i < p->tr_shape_size; i++) { v_re_shape->dims[i] = p->tr_shape[i]; } tm_param.offset_tr_shape = WriteTmObject(start_ptr, cur_pos, v_re_shape, vector_size); free(v_re_shape); } - else{ + else + { tm_param.offset_tr_shape = TM2_NOT_SET; } TM2_Operator tm_op; @@ -1281,12 +1283,12 @@ tm_uoffset_t SaveTmUnsqueezeOp(void* const start_ptr, tm_uoffset_t* cur_pos, str struct unsqueeze_param* p = (struct unsqueeze_param*)node->op.param_mem; TM2_UnsqueezeParam tm_param; - if(p->axises_size) + if (p->axises_size) { size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->axises_size; - TM2_Vector_dims* v_axises = ( TM2_Vector_dims* )malloc(vector_size); + TM2_Vector_dims* v_axises = (TM2_Vector_dims*)malloc(vector_size); v_axises->v_num = p->axises_size; - for(unsigned int i = 0; i < p->axises_size; i++) + for (unsigned int i = 0; i < p->axises_size; i++) { v_axises->dims[i] = p->axises[i]; } @@ -1330,178 +1332,178 @@ tm_uoffset_t SaveTmMatMulOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct op_save_t SaveTmOpFunc(uint32_t op_type) { - switch(op_type) + switch (op_type) { - case OP_BATCHNORM: - return SaveTmBatchNormOp; - case OP_CONCAT: - return SaveTmConcatOp; - case OP_CONST: - return SaveTmConstOp; - case OP_CONV: - return SaveTmConvOp; - case OP_DECONV: - return SaveTmDeconvOp; - case OP_DETECTION_OUTPUT: - return SaveTmDetectionOutputOp; - case OP_DROPOUT: - return SaveTmDropoutOp; - case OP_ELTWISE: - return SaveTmEltwiseOp; - case OP_FLATTEN: - return SaveTmFlattenOp; - case OP_FC: - return SaveTmFCOp; - case OP_INPUT: - return SaveTmInputOp; - case OP_LRN: - return SaveTmLRNOp; - case OP_NORMALIZE: - return SaveTmNormalizeOp; - case OP_PERMUTE: - return SaveTmPermuteOp; - case OP_POOL: - return SaveTmPoolingOp; - case OP_PRELU: - return SaveTmPreluOp; - case OP_PRIORBOX: - return SaveTmPriorBoxOp; - case OP_REGION: - return SaveTmRegionOp; - case OP_RELU: - return SaveTmReLuOp; - case OP_RELU6: - return SaveTmRelu6Op; - case OP_REORG: - return SaveTmReorgOp; - case OP_RESHAPE: - return SaveTmReshapeOp; - case OP_ROIPOOLING: - return SaveTmROIPoolingOp; - case OP_RPN: - return SaveTmRPNOp; - case OP_SCALE: - return SaveTmScaleOp; - case OP_SLICE: - return SaveTmSliceOp; - case OP_SOFTMAX: - return SaveTmSoftmaxOp; - case OP_SPLIT: - return SaveTmSplitOp; - case OP_DETECTION_POSTPROCESS: - return SaveTmDetectionPostProcessOp; - case OP_GEMM: - return SaveTmGemmOp; - case OP_LOGISTIC: - return SaveTmLogisticOp; - case OP_LSTM: - return SaveTmLstmOp; - case OP_RNN: - return SaveTmRnnOp; - case OP_TANH: - return SaveTmTanhOp; - case OP_SIGMOID: - return SaveTmSigmoidOp; - case OP_SQUEEZE: - return SaveTmSqueezeOp; - case OP_SWAP_AXIS: - return SaveTmSwapAxisOp; - case OP_GRU: - return SaveTmGruOp; - case OP_ARGMAX: - return SaveTmArgMaxOp; - case OP_ARGMIN: - return SaveTmArgMinOp; - case OP_TOPKV2: - return SaveTmTopKV2Op; - case OP_PAD: - return SaveTmPadOp; - case OP_STRIDED_SLICE: - return SaveTmStridedSliceOp; - case OP_REDUCTION: - return SaveTmReductionOp; - case OP_UPSAMPLE: - return SaveTmUpsampleOp; - case OP_SHUFFLECHANNEL: - return SaveTmShuffleChannelOp; - case OP_SPACETOBATCHND: - return SaveTmSpaceToBatchNDOp; - case OP_BATCHTOSPACEND: - return SaveTmBatchToSpaceNDOp; - case OP_RESIZE: - return SaveTmResizeOp; - case OP_CROP: - return SaveTmCropOp; - case OP_ROIALIGN: - return SaveTmRoialignOp; - case OP_PSROIPOOLING: - return SaveTmPsroipoolingOp; - case OP_EXPANDDIMS: - return SaveTmExpanddimsOp; - case OP_UNARY: - return SaveTmUnaryOp; - case OP_NOOP: - return SaveTmNoopOp; - case OP_THRESHOLD: - return SaveTmThresholdOp; - case OP_HARDSIGMOID: - return SaveTmHardsigmoidOp; - case OP_EMBEDDING: - return SaveTmEmbedOp; - case OP_INSTANCENORM: - return SaveTmInstanceNormOp; - case OP_MVN: - return SaveTmMVNOp; - case OP_CAST: - return SaveTmCastOp; - case OP_HARDSWISH: - return SaveTmHardSwishOp; - case OP_INTERP: - return SaveTmInterpOp; - case OP_SELU: - return SaveTmSeluOp; - case OP_ELU: - return SaveTmEluOp; - case OP_BROADMUL: - return SaveTmBroadMulOp; - case OP_LOGICAL: - return SaveTmLogicalOp; - case OP_GATHER: - return SaveTmGatherOp; - case OP_TRANSPOSE: - return SaveTmTransposeOp; - case OP_COMPARISON: - return SaveTmComparisonOp; - case OP_REVERSE: - return SaveTmReverseOp; - case OP_SPACETODEPTH: - return SaveTmSpaceToDepthOp; - case OP_DEPTHTOSPACE: - return SaveTmDepthToSpaceOp; - case OP_SQUAREDDIFFERENCE: - return SaveTmSquaredDifferenceOp; - case OP_SPARSETODENSE: - return SaveTmSparseToDenseOp; - case OP_CEIL: - return SaveTmCeilOp; - case OP_ROUND: - return SaveTmRoundOp; - case OP_ZEROSLIKE: - return SaveTmZerosLikeOp; - case OP_CLIP: - return SaveTmClipOp; - case OP_REDUCEL2: - return SaveTmReduceL2Op; - case OP_UNSQUEEZE: - return SaveTmUnsqueezeOp; - case OP_MEAN: - return SaveTmMeanOp; - case OP_MATMUL: - return SaveTmMatMulOp; - case OP_MISH: - return SaveTmMishOp; - default: - // fprintf(stderr, "Operator #%d not supported in tengine model yet\n",op_type); - return nullptr; + case OP_BATCHNORM: + return SaveTmBatchNormOp; + case OP_CONCAT: + return SaveTmConcatOp; + case OP_CONST: + return SaveTmConstOp; + case OP_CONV: + return SaveTmConvOp; + case OP_DECONV: + return SaveTmDeconvOp; + case OP_DETECTION_OUTPUT: + return SaveTmDetectionOutputOp; + case OP_DROPOUT: + return SaveTmDropoutOp; + case OP_ELTWISE: + return SaveTmEltwiseOp; + case OP_FLATTEN: + return SaveTmFlattenOp; + case OP_FC: + return SaveTmFCOp; + case OP_INPUT: + return SaveTmInputOp; + case OP_LRN: + return SaveTmLRNOp; + case OP_NORMALIZE: + return SaveTmNormalizeOp; + case OP_PERMUTE: + return SaveTmPermuteOp; + case OP_POOL: + return SaveTmPoolingOp; + case OP_PRELU: + return SaveTmPreluOp; + case OP_PRIORBOX: + return SaveTmPriorBoxOp; + case OP_REGION: + return SaveTmRegionOp; + case OP_RELU: + return SaveTmReLuOp; + case OP_RELU6: + return SaveTmRelu6Op; + case OP_REORG: + return SaveTmReorgOp; + case OP_RESHAPE: + return SaveTmReshapeOp; + case OP_ROIPOOLING: + return SaveTmROIPoolingOp; + case OP_RPN: + return SaveTmRPNOp; + case OP_SCALE: + return SaveTmScaleOp; + case OP_SLICE: + return SaveTmSliceOp; + case OP_SOFTMAX: + return SaveTmSoftmaxOp; + case OP_SPLIT: + return SaveTmSplitOp; + case OP_DETECTION_POSTPROCESS: + return SaveTmDetectionPostProcessOp; + case OP_GEMM: + return SaveTmGemmOp; + case OP_LOGISTIC: + return SaveTmLogisticOp; + case OP_LSTM: + return SaveTmLstmOp; + case OP_RNN: + return SaveTmRnnOp; + case OP_TANH: + return SaveTmTanhOp; + case OP_SIGMOID: + return SaveTmSigmoidOp; + case OP_SQUEEZE: + return SaveTmSqueezeOp; + case OP_SWAP_AXIS: + return SaveTmSwapAxisOp; + case OP_GRU: + return SaveTmGruOp; + case OP_ARGMAX: + return SaveTmArgMaxOp; + case OP_ARGMIN: + return SaveTmArgMinOp; + case OP_TOPKV2: + return SaveTmTopKV2Op; + case OP_PAD: + return SaveTmPadOp; + case OP_STRIDED_SLICE: + return SaveTmStridedSliceOp; + case OP_REDUCTION: + return SaveTmReductionOp; + case OP_UPSAMPLE: + return SaveTmUpsampleOp; + case OP_SHUFFLECHANNEL: + return SaveTmShuffleChannelOp; + case OP_SPACETOBATCHND: + return SaveTmSpaceToBatchNDOp; + case OP_BATCHTOSPACEND: + return SaveTmBatchToSpaceNDOp; + case OP_RESIZE: + return SaveTmResizeOp; + case OP_CROP: + return SaveTmCropOp; + case OP_ROIALIGN: + return SaveTmRoialignOp; + case OP_PSROIPOOLING: + return SaveTmPsroipoolingOp; + case OP_EXPANDDIMS: + return SaveTmExpanddimsOp; + case OP_UNARY: + return SaveTmUnaryOp; + case OP_NOOP: + return SaveTmNoopOp; + case OP_THRESHOLD: + return SaveTmThresholdOp; + case OP_HARDSIGMOID: + return SaveTmHardsigmoidOp; + case OP_EMBEDDING: + return SaveTmEmbedOp; + case OP_INSTANCENORM: + return SaveTmInstanceNormOp; + case OP_MVN: + return SaveTmMVNOp; + case OP_CAST: + return SaveTmCastOp; + case OP_HARDSWISH: + return SaveTmHardSwishOp; + case OP_INTERP: + return SaveTmInterpOp; + case OP_SELU: + return SaveTmSeluOp; + case OP_ELU: + return SaveTmEluOp; + case OP_BROADMUL: + return SaveTmBroadMulOp; + case OP_LOGICAL: + return SaveTmLogicalOp; + case OP_GATHER: + return SaveTmGatherOp; + case OP_TRANSPOSE: + return SaveTmTransposeOp; + case OP_COMPARISON: + return SaveTmComparisonOp; + case OP_REVERSE: + return SaveTmReverseOp; + case OP_SPACETODEPTH: + return SaveTmSpaceToDepthOp; + case OP_DEPTHTOSPACE: + return SaveTmDepthToSpaceOp; + case OP_SQUAREDDIFFERENCE: + return SaveTmSquaredDifferenceOp; + case OP_SPARSETODENSE: + return SaveTmSparseToDenseOp; + case OP_CEIL: + return SaveTmCeilOp; + case OP_ROUND: + return SaveTmRoundOp; + case OP_ZEROSLIKE: + return SaveTmZerosLikeOp; + case OP_CLIP: + return SaveTmClipOp; + case OP_REDUCEL2: + return SaveTmReduceL2Op; + case OP_UNSQUEEZE: + return SaveTmUnsqueezeOp; + case OP_MEAN: + return SaveTmMeanOp; + case OP_MATMUL: + return SaveTmMatMulOp; + case OP_MISH: + return SaveTmMishOp; + default: + // fprintf(stderr, "Operator #%d not supported in tengine model yet\n",op_type); + return nullptr; } } \ No newline at end of file diff --git a/tools/quantize/savegraph/tm2_op_save.hpp b/tools/quantize/savegraph/tm2_op_save.hpp index 78ad4e40e..08d63ec4d 100644 --- a/tools/quantize/savegraph/tm2_op_save.hpp +++ b/tools/quantize/savegraph/tm2_op_save.hpp @@ -3,14 +3,13 @@ #include extern "C" { - #include "utility/vector.h" - #include "tm2_format.h" - #include "tm2_generate.h" - #include "graph/node.h" - - #include "op_include.h" -} +#include "utility/vector.h" +#include "tm2_format.h" +#include "tm2_generate.h" +#include "graph/node.h" +#include "op_include.h" +} using op_save_t = std::function; op_save_t SaveTmOpFunc(uint32_t op_type);